Merge tag 'drm-fixes-v4.7-rc1' of git://people.freedesktop.org/~airlied/linux
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2016 21:08:56 +0000 (14:08 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2016 21:08:56 +0000 (14:08 -0700)
Pull drm fixes from Dave Airlie:

 - one IMX built-in regression fix

 - a set of amdgpu fixes, mostly powerplay and polaris GPU stuff

 - a set of i915 fixes all over, many cc'ed to stable.

   The i915 batch contains support for DP++ dongle detection, which is
   used to fix some regressions in the HDMI color depth area

* tag 'drm-fixes-v4.7-rc1' of git://people.freedesktop.org/~airlied/linux: (44 commits)
  drm/amd: add Kconfig dependency for ACP on DRM_AMDGPU
  drm/amdgpu: Fix hdmi deep color support.
  drm/amdgpu: fix bug in fence driver fini
  drm/i915: Stop automatically retiring requests after a GPU hang
  drm/i915: Unify intel_ring_begin()
  drm/i915: Ignore stale wm register values on resume on ilk-bdw (v2)
  drm/i915/psr: Try to program link training times correctly
  drm/imx: Match imx-ipuv3-crtc components using device node in platform data
  drm/i915/bxt: Adjusting the error in horizontal timings retrieval
  drm/i915: Don't leave old junk in ilk active watermarks on readout
  drm/i915: s/DPPL/DPLL/ for SKL DPLLs
  drm/i915: Fix gen8 semaphores id for legacy mode
  drm/i915: Set crtc_state->lane_count for HDMI
  drm/i915/BXT: Retrieving the horizontal timing for DSI
  drm/i915: Protect gen7 irq_seqno_barrier with uncore lock
  drm/i915: Re-enable GGTT earlier during resume on pre-gen6 platforms
  drm/i915: Determine DP++ type 1 DVI adaptor presence based on VBT
  drm/i915: Enable/disable TMDS output buffers in DP++ adaptor as needed
  drm/i915: Respect DP++ adaptor TMDS clock limit
  drm: Add helper for DP++ adaptors
  ...

668 files changed:
Documentation/devicetree/bindings/bus/ti-gpmc.txt [deleted file]
Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
Documentation/devicetree/bindings/mtd/gpmc-nand.txt
Documentation/devicetree/bindings/mtd/nand.txt
Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt [new file with mode: 0644]
Documentation/devicetree/bindings/spi/spi-fsl-dspi.txt
Documentation/devicetree/bindings/spi/sqi-pic32.txt [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
Documentation/devicetree/bindings/thermal/rcar-thermal.txt
Documentation/devicetree/bindings/thermal/tango-thermal.txt [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt
Documentation/devicetree/bindings/watchdog/renesas-wdt.txt [new file with mode: 0644]
Documentation/filesystems/dax.txt
Documentation/hwmon/max34440
Documentation/kbuild/kconfig-language.txt
Documentation/pwm.txt
Documentation/sysctl/kernel.txt
Documentation/thermal/sysfs-api.txt
Documentation/watchdog/hpwdt.txt
Documentation/watchdog/watchdog-parameters.txt
MAINTAINERS
Makefile
arch/arc/include/uapi/asm/unistd.h
arch/arc/kernel/perf_event.c
arch/arm/boot/Makefile
arch/arm/boot/bootp/Makefile
arch/arm/boot/dts/Makefile
arch/arm/boot/dts/exynos3250-monk.dts
arch/arm/boot/dts/exynos3250-rinato.dts
arch/arm/boot/dts/exynos3250.dtsi
arch/arm/boot/dts/exynos4210.dtsi
arch/arm/boot/dts/exynos4412-odroid-common.dtsi
arch/arm/boot/dts/exynos4412-ppmu-common.dtsi [new file with mode: 0644]
arch/arm/boot/dts/exynos4412-trats2.dts
arch/arm/boot/dts/exynos4x12.dtsi
arch/arm/boot/dts/exynos5420.dtsi
arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
arch/arm/boot/dts/imx7d-nitrogen7.dts [new file with mode: 0644]
arch/arm/boot/dts/imx7d.dtsi
arch/arm/boot/dts/r8a7779.dtsi
arch/arm/boot/dts/r8a7790.dtsi
arch/arm/boot/dts/r8a7791.dtsi
arch/arm/boot/dts/r8a7793.dtsi
arch/arm/boot/dts/r8a7794.dtsi
arch/arm/boot/dts/tegra124-jetson-tk1.dts
arch/arm/boot/dts/tegra124-nyan.dtsi
arch/arm/boot/dts/tegra124-venice2.dts
arch/arm/boot/dts/tegra124.dtsi
arch/arm/boot/dts/vf-colibri-eval-v3.dtsi
arch/arm/boot/dts/vf-colibri.dtsi
arch/arm/boot/dts/vfxxx.dtsi
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmio.h
arch/arm/kernel/perf_callchain.c
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arm.c
arch/arm/kvm/mmio.c
arch/arm/mach-lpc32xx/Makefile
arch/arm/mach-lpc32xx/include/mach/irqs.h
arch/arm/mach-lpc32xx/irq.c [deleted file]
arch/arm/mach-omap2/gpmc-nand.c
arch/arm/mach-pxa/Kconfig
arch/arm/mach-pxa/eseries.c
arch/arm/mach-pxa/spitz.c
arch/arm/mach-s3c24xx/mach-rx1950.c
arch/arm/vdso/Makefile
arch/arm64/boot/dts/renesas/r8a7795.dtsi
arch/arm64/configs/defconfig
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmio.h
arch/arm64/include/uapi/asm/unistd.h
arch/arm64/kernel/perf_callchain.c
arch/arm64/kvm/Kconfig
arch/arm64/kvm/Makefile
arch/arm64/kvm/inject_fault.c
arch/c6x/include/uapi/asm/unistd.h
arch/cris/arch-v32/drivers/mach-a3/nandflash.c
arch/cris/arch-v32/drivers/mach-fs/nandflash.c
arch/h8300/boot/compressed/Makefile
arch/h8300/include/uapi/asm/unistd.h
arch/hexagon/include/uapi/asm/unistd.h
arch/ia64/Makefile
arch/m32r/boot/compressed/Makefile
arch/metag/include/uapi/asm/unistd.h
arch/metag/kernel/perf_callchain.c
arch/microblaze/include/asm/unistd.h
arch/microblaze/include/uapi/asm/unistd.h
arch/microblaze/kernel/syscall_table.S
arch/microblaze/pci/pci-common.c
arch/mips/include/asm/mach-jz4740/jz4740_nand.h
arch/mips/jz4740/board-qi_lb60.c
arch/mips/kernel/perf_event.c
arch/mn10300/boot/compressed/Makefile
arch/nios2/Makefile
arch/nios2/boot/compressed/Makefile
arch/nios2/include/uapi/asm/unistd.h
arch/openrisc/include/uapi/asm/unistd.h
arch/parisc/Kconfig
arch/parisc/include/asm/cmpxchg.h
arch/parisc/include/asm/eisa_eeprom.h
arch/parisc/include/asm/ftrace.h
arch/parisc/include/asm/futex.h
arch/parisc/include/asm/ldcw.h
arch/parisc/include/asm/syscall.h
arch/parisc/include/asm/thread_info.h
arch/parisc/include/asm/uaccess.h
arch/parisc/include/uapi/asm/pdc.h
arch/parisc/include/uapi/asm/ptrace.h
arch/parisc/include/uapi/asm/unistd.h
arch/parisc/kernel/entry.S
arch/parisc/kernel/ftrace.c
arch/parisc/kernel/ptrace.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/time.c
arch/parisc/lib/bitops.c
arch/parisc/math-emu/fpudispatch.c
arch/powerpc/perf/callchain.c
arch/powerpc/sysdev/axonram.c
arch/s390/boot/compressed/Makefile
arch/s390/kernel/perf_event.c
arch/score/include/uapi/asm/unistd.h
arch/sh/boot/compressed/Makefile
arch/sh/boot/romimage/Makefile
arch/sh/kernel/perf_callchain.c
arch/sparc/kernel/perf_event.c
arch/tile/include/uapi/asm/unistd.h
arch/tile/kernel/perf_event.c
arch/unicore32/boot/Makefile
arch/unicore32/boot/compressed/Makefile
arch/unicore32/include/uapi/asm/unistd.h
arch/x86/boot/compressed/Makefile
arch/x86/entry/thunk_64.S
arch/x86/entry/vdso/Makefile
arch/x86/events/core.c
arch/x86/events/intel/p4.c
arch/x86/events/intel/uncore.c
arch/x86/include/asm/bugs.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/disabled-features.h
arch/x86/include/asm/intel_telemetry.h
arch/x86/include/asm/pmc_core.h [new file with mode: 0644]
arch/x86/include/uapi/asm/svm.h
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/tsc_msr.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/mm/fault.c
arch/x86/pci/xen.c
arch/x86/platform/efi/efi_stub_64.S
arch/x86/purgatory/Makefile
arch/x86/realmode/rm/Makefile
arch/x86/xen/setup.c
arch/x86/xen/time.c
arch/xtensa/kernel/perf_event.c
block/ioctl.c
drivers/acpi/battery.c
drivers/acpi/device_pm.c
drivers/base/power/main.c
drivers/bcma/driver_chipcommon_sflash.c
drivers/block/brd.c
drivers/block/rbd.c
drivers/clk/clk-pwm.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/intel_pstate.c
drivers/cpufreq/mt8173-cpufreq.c
drivers/cpuidle/cpuidle.c
drivers/gpu/drm/i915/intel_panel.c
drivers/hwmon/emc2103.c
drivers/hwmon/lm75.c
drivers/hwmon/ntc_thermistor.c
drivers/hwmon/pwm-fan.c
drivers/hwmon/scpi-hwmon.c
drivers/hwmon/tmp102.c
drivers/input/misc/max77693-haptic.c
drivers/input/misc/max8997_haptic.c
drivers/input/misc/pwm-beeper.c
drivers/input/touchscreen/sun4i-ts.c
drivers/iommu/intel-iommu.c
drivers/iommu/iova.c
drivers/leds/leds-pwm.c
drivers/memory/Kconfig
drivers/memory/fsl_ifc.c
drivers/memory/omap-gpmc.c
drivers/mmc/card/block.c
drivers/mmc/core/core.c
drivers/mmc/core/host.c
drivers/mmc/host/dw_mmc-rockchip.c
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/sdhci-acpi.c
drivers/mmc/host/sdhci-pci-core.c
drivers/mtd/chips/Kconfig
drivers/mtd/devices/bcm47xxsflash.c
drivers/mtd/devices/bcm47xxsflash.h
drivers/mtd/devices/docg3.c
drivers/mtd/devices/m25p80.c
drivers/mtd/devices/pmc551.c
drivers/mtd/maps/ck804xrom.c
drivers/mtd/maps/esb2rom.c
drivers/mtd/maps/ichxrom.c
drivers/mtd/maps/uclinux.c
drivers/mtd/mtdchar.c
drivers/mtd/mtdconcat.c
drivers/mtd/mtdcore.c
drivers/mtd/mtdpart.c
drivers/mtd/nand/ams-delta.c
drivers/mtd/nand/atmel_nand.c
drivers/mtd/nand/au1550nd.c
drivers/mtd/nand/bf5xx_nand.c
drivers/mtd/nand/brcmnand/brcmnand.c
drivers/mtd/nand/cafe_nand.c
drivers/mtd/nand/cmx270_nand.c
drivers/mtd/nand/davinci_nand.c
drivers/mtd/nand/denali.c
drivers/mtd/nand/diskonchip.c
drivers/mtd/nand/docg4.c
drivers/mtd/nand/fsl_elbc_nand.c
drivers/mtd/nand/fsl_ifc_nand.c
drivers/mtd/nand/fsl_upm.c
drivers/mtd/nand/fsmc_nand.c
drivers/mtd/nand/gpio.c
drivers/mtd/nand/gpmi-nand/gpmi-nand.c
drivers/mtd/nand/hisi504_nand.c
drivers/mtd/nand/jz4740_nand.c
drivers/mtd/nand/jz4780_bch.c
drivers/mtd/nand/jz4780_nand.c
drivers/mtd/nand/lpc32xx_mlc.c
drivers/mtd/nand/lpc32xx_slc.c
drivers/mtd/nand/mpc5121_nfc.c
drivers/mtd/nand/mxc_nand.c
drivers/mtd/nand/nand_base.c
drivers/mtd/nand/nand_bch.c
drivers/mtd/nand/nandsim.c
drivers/mtd/nand/nuc900_nand.c
drivers/mtd/nand/omap2.c
drivers/mtd/nand/orion_nand.c
drivers/mtd/nand/pasemi_nand.c
drivers/mtd/nand/plat_nand.c
drivers/mtd/nand/pxa3xx_nand.c
drivers/mtd/nand/qcom_nandc.c
drivers/mtd/nand/s3c2410.c
drivers/mtd/nand/sh_flctl.c
drivers/mtd/nand/sharpsl.c
drivers/mtd/nand/sm_common.c
drivers/mtd/nand/socrates_nand.c
drivers/mtd/nand/sunxi_nand.c
drivers/mtd/nand/vf610_nfc.c
drivers/mtd/onenand/onenand_base.c
drivers/mtd/spi-nor/spi-nor.c
drivers/nvdimm/pmem.c
drivers/of/Makefile
drivers/of/of_mtd.c [deleted file]
drivers/pinctrl/intel/pinctrl-baytrail.c
drivers/platform/x86/Kconfig
drivers/platform/x86/Makefile
drivers/platform/x86/asus-laptop.c
drivers/platform/x86/asus-wmi.c
drivers/platform/x86/dell-rbtn.c
drivers/platform/x86/fujitsu-laptop.c
drivers/platform/x86/ideapad-laptop.c
drivers/platform/x86/intel_menlow.c
drivers/platform/x86/intel_pmc_core.c [new file with mode: 0644]
drivers/platform/x86/intel_pmc_core.h [new file with mode: 0644]
drivers/platform/x86/intel_telemetry_core.c
drivers/platform/x86/intel_telemetry_pltdrv.c
drivers/platform/x86/sony-laptop.c
drivers/platform/x86/surfacepro3_button.c
drivers/platform/x86/thinkpad_acpi.c
drivers/pwm/core.c
drivers/pwm/pwm-crc.c
drivers/pwm/pwm-lpc18xx-sct.c
drivers/pwm/pwm-omap-dmtimer.c
drivers/pwm/pwm-rcar.c
drivers/pwm/pwm-sun4i.c
drivers/pwm/sysfs.c
drivers/s390/block/dcssblk.c
drivers/soc/mediatek/mtk-pmic-wrap.c
drivers/spi/Kconfig
drivers/spi/Makefile
drivers/spi/spi-axi-spi-engine.c
drivers/spi/spi-bcm53xx.c
drivers/spi/spi-cadence.c
drivers/spi/spi-davinci.c
drivers/spi/spi-dln2.c
drivers/spi/spi-dw-pci.c
drivers/spi/spi-fsl-dspi.c
drivers/spi/spi-fsl-espi.c
drivers/spi/spi-octeon.c
drivers/spi/spi-omap2-mcspi.c
drivers/spi/spi-pic32-sqi.c [new file with mode: 0644]
drivers/spi/spi-pic32.c [new file with mode: 0644]
drivers/spi/spi-pxa2xx-dma.c
drivers/spi/spi-pxa2xx-pci.c
drivers/spi/spi-pxa2xx.c
drivers/spi/spi-pxa2xx.h
drivers/spi/spi-qup.c
drivers/spi/spi-rockchip.c
drivers/spi/spi-st-ssc4.c
drivers/spi/spi-zynqmp-gqspi.c
drivers/spi/spi.c
drivers/staging/mt29f_spinand/mt29f_spinand.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/gov_bang_bang.c
drivers/thermal/hisi_thermal.c
drivers/thermal/int340x_thermal/processor_thermal_device.c
drivers/thermal/intel_powerclamp.c
drivers/thermal/mtk_thermal.c
drivers/thermal/of-thermal.c
drivers/thermal/qcom-spmi-temp-alarm.c
drivers/thermal/rcar_thermal.c
drivers/thermal/rockchip_thermal.c
drivers/thermal/tango_thermal.c [new file with mode: 0644]
drivers/thermal/tegra/Kconfig [new file with mode: 0644]
drivers/thermal/tegra/Makefile [new file with mode: 0644]
drivers/thermal/tegra/soctherm-fuse.c [new file with mode: 0644]
drivers/thermal/tegra/soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/soctherm.h [new file with mode: 0644]
drivers/thermal/tegra/tegra124-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/tegra132-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra/tegra210-soctherm.c [new file with mode: 0644]
drivers/thermal/tegra_soctherm.c [deleted file]
drivers/thermal/thermal-generic-adc.c [new file with mode: 0644]
drivers/thermal/ti-soc-thermal/ti-thermal-common.c
drivers/thermal/x86_pkg_temp_thermal.c
drivers/vfio/pci/vfio_pci.c
drivers/vfio/pci/vfio_pci_config.c
drivers/vfio/pci/vfio_pci_private.h
drivers/vfio/vfio_iommu_spapr_tce.c
drivers/video/backlight/lm3630a_bl.c
drivers/video/backlight/lp855x_bl.c
drivers/video/backlight/lp8788_bl.c
drivers/video/backlight/pwm_bl.c
drivers/video/fbdev/ssd1307fb.c
drivers/virtio/virtio_balloon.c
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/cpwd.c
drivers/watchdog/f71808e_wdt.c
drivers/watchdog/imx2_wdt.c
drivers/watchdog/jz4740_wdt.c
drivers/watchdog/octeon-wdt-main.c
drivers/watchdog/qcom-wdt.c
drivers/watchdog/renesas_wdt.c [new file with mode: 0644]
drivers/watchdog/shwdt.c
drivers/watchdog/sp5100_tco.c
drivers/watchdog/watchdog_core.c
drivers/watchdog/watchdog_dev.c
drivers/xen/Makefile
drivers/xen/events/events_base.c
drivers/xen/gntdev.c
fs/Kconfig
fs/block_dev.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/mdsmap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/compat.c
fs/dax.c
fs/ext2/file.c
fs/ext2/inode.c
fs/ext2/super.c
fs/ext4/balloc.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mmp.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/namei.c
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/direct.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/internal.h
fs/nfs/nfs42.h
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4file.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/nfs3xdr.c
fs/nfsd/nfs4layouts.c
fs/nfsd/nfs4state.c
fs/nfsd/state.h
fs/ocfs2/inode.c
fs/ocfs2/journal.h
fs/readdir.c
fs/xattr.c
fs/xfs/kmem.c
fs/xfs/kmem.h
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_dir2_sf.c
fs/xfs/libxfs/xfs_inode_fork.c
fs/xfs/libxfs/xfs_inode_fork.h
fs/xfs/libxfs/xfs_log_format.h
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_shared.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr.h
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_sysfs.h
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_xattr.c
include/asm-generic/preempt.h
include/dt-bindings/thermal/tegra124-soctherm.h
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/kvm/vgic/vgic.h [new file with mode: 0644]
include/linux/bcma/bcma_driver_chipcommon.h
include/linux/blkdev.h
include/linux/ceph/ceph_frag.h
include/linux/ceph/ceph_fs.h
include/linux/ceph/decode.h
include/linux/ceph/libceph.h
include/linux/ceph/mon_client.h
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/dax.h
include/linux/errno.h
include/linux/export.h
include/linux/fs.h
include/linux/fsl_ifc.h
include/linux/iova.h
include/linux/irqchip/arm-gic-v3.h
include/linux/irqchip/arm-gic.h
include/linux/jbd2.h
include/linux/kvm_host.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmc/host.h
include/linux/mtd/fsmc.h
include/linux/mtd/map.h
include/linux/mtd/mtd.h
include/linux/mtd/nand.h
include/linux/mtd/onenand.h
include/linux/mtd/sharpsl.h
include/linux/mtd/spi-nor.h
include/linux/nfs4.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/of_mtd.h [deleted file]
include/linux/omap-gpmc.h
include/linux/perf_event.h
include/linux/platform_data/gpmc-omap.h [new file with mode: 0644]
include/linux/platform_data/mtd-nand-omap2.h
include/linux/pwm.h
include/linux/sched.h
include/linux/seqlock.h
include/linux/slub_def.h
include/linux/spi/spi.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/msg_prot.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtrdma.h
include/linux/thermal.h
include/trace/events/kvm.h
include/uapi/asm-generic/unistd.h
include/uapi/linux/perf_event.h
include/uapi/mtd/mtd-abi.h
init/Kconfig
kernel/bpf/stackmap.c
kernel/events/callchain.c
kernel/fork.c
kernel/gcov/Kconfig
kernel/locking/percpu-rwsem.c
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/sysctl.c
lib/dma-debug.c
lib/iov_iter.c
mm/Kconfig
mm/filemap.c
mm/kasan/kasan.h
mm/memcontrol.c
mm/memory.c
mm/truncate.c
mm/zsmalloc.c
net/ceph/ceph_common.c
net/ceph/ceph_strings.c
net/ceph/debugfs.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/auth_unix.c
net/sunrpc/clnt.c
net/sunrpc/svc_xprt.c
net/sunrpc/xdr.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_marshal.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
scripts/Kbuild.include
scripts/Makefile.build
scripts/Makefile.extrawarn
scripts/Makefile.lib
scripts/adjust_autoksyms.sh [new file with mode: 0755]
scripts/basic/fixdep.c
scripts/coccicheck
scripts/coccinelle/api/setup_timer.cocci
scripts/coccinelle/misc/compare_const_fl.cocci [deleted file]
scripts/genksyms/genksyms.c
scripts/kconfig/confdata.c
scripts/kconfig/symbol.c
scripts/package/Makefile
scripts/package/builddeb
scripts/package/mkspec
security/yama/yama_lsm.c
tools/Makefile
tools/build/Makefile.build
tools/kvm/kvm_stat/Makefile [new file with mode: 0644]
tools/kvm/kvm_stat/kvm_stat [new file with mode: 0755]
tools/kvm/kvm_stat/kvm_stat.txt [new file with mode: 0644]
tools/objtool/Makefile
tools/objtool/elf.h
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-script.txt
tools/perf/Documentation/perf-trace.txt
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-diff.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/perf.c
tools/perf/util/annotate.c
tools/perf/util/build-id.c
tools/perf/util/db-export.c
tools/perf/util/dso.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/machine.c
tools/perf/util/machine.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/stat-shadow.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/top.h
tools/perf/util/util.c
tools/perf/util/util.h
tools/testing/selftests/seccomp/seccomp_bpf.c
tools/testing/selftests/vm/thuge-gen.c
tools/virtio/ringtest/Makefile
tools/virtio/ringtest/main.c
tools/virtio/ringtest/virtio_ring_0_9.c
tools/virtio/ringtest/virtio_ring_inorder.c [new file with mode: 0644]
virt/kvm/arm/arch_timer.c
virt/kvm/arm/hyp/timer-sr.c
virt/kvm/arm/hyp/vgic-v2-sr.c
virt/kvm/arm/pmu.c
virt/kvm/arm/vgic-v2.c
virt/kvm/arm/vgic-v3.c
virt/kvm/arm/vgic.c
virt/kvm/arm/vgic/vgic-init.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-irqfd.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-kvm-device.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio-v2.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio-v3.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-mmio.h [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-v2.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic-v3.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic.c [new file with mode: 0644]
virt/kvm/arm/vgic/vgic.h [new file with mode: 0644]
virt/kvm/kvm_main.c

diff --git a/Documentation/devicetree/bindings/bus/ti-gpmc.txt b/Documentation/devicetree/bindings/bus/ti-gpmc.txt
deleted file mode 100644 (file)
index 0168370..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-Device tree bindings for OMAP general purpose memory controllers (GPMC)
-
-The actual devices are instantiated from the child nodes of a GPMC node.
-
-Required properties:
-
- - compatible:         Should be set to one of the following:
-
-                       ti,omap2420-gpmc (omap2420)
-                       ti,omap2430-gpmc (omap2430)
-                       ti,omap3430-gpmc (omap3430 & omap3630)
-                       ti,omap4430-gpmc (omap4430 & omap4460 & omap543x)
-                       ti,am3352-gpmc   (am335x devices)
-
- - reg:                        A resource specifier for the register space
-                       (see the example below)
- - ti,hwmods:          Should be set to "ti,gpmc" until the DT transition is
-                       completed.
- - #address-cells:     Must be set to 2 to allow memory address translation
- - #size-cells:                Must be set to 1 to allow CS address passing
- - gpmc,num-cs:                The maximum number of chip-select lines that controller
-                       can support.
- - gpmc,num-waitpins:  The maximum number of wait pins that controller can
-                       support.
- - ranges:             Must be set up to reflect the memory layout with four
-                       integer values for each chip-select line in use:
-
-                          <cs-number> 0 <physical address of mapping> <size>
-
-                       Currently, calculated values derived from the contents
-                       of the per-CS register GPMC_CONFIG7 (as set up by the
-                       bootloader) are used for the physical address decoding.
-                       As this will change in the future, filling correct
-                       values here is a requirement.
-
-Timing properties for child nodes. All are optional and default to 0.
-
- - gpmc,sync-clk-ps:   Minimum clock period for synchronous mode, in picoseconds
-
- Chip-select signal timings (in nanoseconds) corresponding to GPMC_CONFIG2:
- - gpmc,cs-on-ns:      Assertion time
- - gpmc,cs-rd-off-ns:  Read deassertion time
- - gpmc,cs-wr-off-ns:  Write deassertion time
-
- ADV signal timings (in nanoseconds) corresponding to GPMC_CONFIG3:
- - gpmc,adv-on-ns:     Assertion time
- - gpmc,adv-rd-off-ns: Read deassertion time
- - gpmc,adv-wr-off-ns: Write deassertion time
- - gpmc,adv-aad-mux-on-ns:     Assertion time for AAD
- - gpmc,adv-aad-mux-rd-off-ns: Read deassertion time for AAD
- - gpmc,adv-aad-mux-wr-off-ns: Write deassertion time for AAD
-
- WE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
- - gpmc,we-on-ns       Assertion time
- - gpmc,we-off-ns:     Deassertion time
-
- OE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
- - gpmc,oe-on-ns:      Assertion time
- - gpmc,oe-off-ns:     Deassertion time
- - gpmc,oe-aad-mux-on-ns:      Assertion time for AAD
- - gpmc,oe-aad-mux-off-ns:     Deassertion time for AAD
-
- Access time and cycle time timings (in nanoseconds) corresponding to
- GPMC_CONFIG5:
- - gpmc,page-burst-access-ns:  Multiple access word delay
- - gpmc,access-ns:             Start-cycle to first data valid delay
- - gpmc,rd-cycle-ns:           Total read cycle time
- - gpmc,wr-cycle-ns:           Total write cycle time
- - gpmc,bus-turnaround-ns:     Turn-around time between successive accesses
- - gpmc,cycle2cycle-delay-ns:  Delay between chip-select pulses
- - gpmc,clk-activation-ns:     GPMC clock activation time
- - gpmc,wait-monitoring-ns:    Start of wait monitoring with regard to valid
-                               data
-
-Boolean timing parameters. If property is present parameter enabled and
-disabled if omitted:
- - gpmc,adv-extra-delay:       ADV signal is delayed by half GPMC clock
- - gpmc,cs-extra-delay:                CS signal is delayed by half GPMC clock
- - gpmc,cycle2cycle-diffcsen:  Add "cycle2cycle-delay" between successive
-                               accesses to a different CS
- - gpmc,cycle2cycle-samecsen:  Add "cycle2cycle-delay" between successive
-                               accesses to the same CS
- - gpmc,oe-extra-delay:                OE signal is delayed by half GPMC clock
- - gpmc,we-extra-delay:                WE signal is delayed by half GPMC clock
- - gpmc,time-para-granularity: Multiply all access times by 2
-
-The following are only applicable to OMAP3+ and AM335x:
- - gpmc,wr-access-ns:          In synchronous write mode, for single or
-                               burst accesses, defines the number of
-                               GPMC_FCLK cycles from start access time
-                               to the GPMC_CLK rising edge used by the
-                               memory device for the first data capture.
- - gpmc,wr-data-mux-bus-ns:    In address-data multiplex mode, specifies
-                               the time when the first data is driven on
-                               the address-data bus.
-
-GPMC chip-select settings properties for child nodes. All are optional.
-
-- gpmc,burst-length    Page/burst length. Must be 4, 8 or 16.
-- gpmc,burst-wrap      Enables wrap bursting
-- gpmc,burst-read      Enables read page/burst mode
-- gpmc,burst-write     Enables write page/burst mode
-- gpmc,device-width    Total width of device(s) connected to a GPMC
-                       chip-select in bytes. The GPMC supports 8-bit
-                       and 16-bit devices and so this property must be
-                       1 or 2.
-- gpmc,mux-add-data    Address and data multiplexing configuration.
-                       Valid values are 1 for address-address-data
-                       multiplexing mode and 2 for address-data
-                       multiplexing mode.
-- gpmc,sync-read       Enables synchronous read. Defaults to asynchronous
-                       is this is not set.
-- gpmc,sync-write      Enables synchronous writes. Defaults to asynchronous
-                       is this is not set.
-- gpmc,wait-pin                Wait-pin used by client. Must be less than
-                       "gpmc,num-waitpins".
-- gpmc,wait-on-read    Enables wait monitoring on reads.
-- gpmc,wait-on-write   Enables wait monitoring on writes.
-
-Example for an AM33xx board:
-
-       gpmc: gpmc@50000000 {
-               compatible = "ti,am3352-gpmc";
-               ti,hwmods = "gpmc";
-               reg = <0x50000000 0x2000>;
-               interrupts = <100>;
-
-               gpmc,num-cs = <8>;
-               gpmc,num-waitpins = <2>;
-               #address-cells = <2>;
-               #size-cells = <1>;
-               ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */
-
-               /* child nodes go here */
-       };
diff --git a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
new file mode 100644 (file)
index 0000000..21055e2
--- /dev/null
@@ -0,0 +1,152 @@
+Device tree bindings for OMAP general purpose memory controllers (GPMC)
+
+The actual devices are instantiated from the child nodes of a GPMC node.
+
+Required properties:
+
+ - compatible:         Should be set to one of the following:
+
+                       ti,omap2420-gpmc (omap2420)
+                       ti,omap2430-gpmc (omap2430)
+                       ti,omap3430-gpmc (omap3430 & omap3630)
+                       ti,omap4430-gpmc (omap4430 & omap4460 & omap543x)
+                       ti,am3352-gpmc   (am335x devices)
+
+ - reg:                        A resource specifier for the register space
+                       (see the example below)
+ - ti,hwmods:          Should be set to "gpmc" until the DT transition is
+                       completed.
+ - #address-cells:     Must be set to 2 to allow memory address translation
+ - #size-cells:                Must be set to 1 to allow CS address passing
+ - gpmc,num-cs:                The maximum number of chip-select lines that controller
+                       can support.
+ - gpmc,num-waitpins:  The maximum number of wait pins that controller can
+                       support.
+ - ranges:             Must be set up to reflect the memory layout with four
+                       integer values for each chip-select line in use:
+
+                          <cs-number> 0 <physical address of mapping> <size>
+
+                       Currently, calculated values derived from the contents
+                       of the per-CS register GPMC_CONFIG7 (as set up by the
+                       bootloader) are used for the physical address decoding.
+                       As this will change in the future, filling correct
+                       values here is a requirement.
+ - interrupt-controller: The GPMC driver implements an interrupt controller for
+                       the NAND events "fifoevent" and "termcount" plus the
+                       rising/falling edges on the GPMC_WAIT pins.
+                       The interrupt number mapping is as follows
+                       0 - NAND_fifoevent
+                       1 - NAND_termcount
+                       2 - GPMC_WAIT0 pin edge
+                       3 - GPMC_WAIT1 pin edge, and so on.
+ - #interrupt-cells:   Must be set to 2
+ - gpio-controller:    The GPMC driver implements a GPIO controller for the
+                       GPMC WAIT pins that can be used as general purpose inputs.
+                       0 maps to GPMC_WAIT0 pin.
+ - #gpio-cells:                Must be set to 2
+
+Timing properties for child nodes. All are optional and default to 0.
+
+ - gpmc,sync-clk-ps:   Minimum clock period for synchronous mode, in picoseconds
+
+ Chip-select signal timings (in nanoseconds) corresponding to GPMC_CONFIG2:
+ - gpmc,cs-on-ns:      Assertion time
+ - gpmc,cs-rd-off-ns:  Read deassertion time
+ - gpmc,cs-wr-off-ns:  Write deassertion time
+
+ ADV signal timings (in nanoseconds) corresponding to GPMC_CONFIG3:
+ - gpmc,adv-on-ns:     Assertion time
+ - gpmc,adv-rd-off-ns: Read deassertion time
+ - gpmc,adv-wr-off-ns: Write deassertion time
+ - gpmc,adv-aad-mux-on-ns:     Assertion time for AAD
+ - gpmc,adv-aad-mux-rd-off-ns: Read deassertion time for AAD
+ - gpmc,adv-aad-mux-wr-off-ns: Write deassertion time for AAD
+
+ WE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
+ - gpmc,we-on-ns:      Assertion time
+ - gpmc,we-off-ns:     Deassertion time
+
+ OE signals timings (in nanoseconds) corresponding to GPMC_CONFIG4:
+ - gpmc,oe-on-ns:      Assertion time
+ - gpmc,oe-off-ns:     Deassertion time
+ - gpmc,oe-aad-mux-on-ns:      Assertion time for AAD
+ - gpmc,oe-aad-mux-off-ns:     Deassertion time for AAD
+
+ Access time and cycle time timings (in nanoseconds) corresponding to
+ GPMC_CONFIG5:
+ - gpmc,page-burst-access-ns:  Multiple access word delay
+ - gpmc,access-ns:             Start-cycle to first data valid delay
+ - gpmc,rd-cycle-ns:           Total read cycle time
+ - gpmc,wr-cycle-ns:           Total write cycle time
+ - gpmc,bus-turnaround-ns:     Turn-around time between successive accesses
+ - gpmc,cycle2cycle-delay-ns:  Delay between chip-select pulses
+ - gpmc,clk-activation-ns:     GPMC clock activation time
+ - gpmc,wait-monitoring-ns:    Start of wait monitoring with regard to valid
+                               data
+
+Boolean timing parameters. If the property is present the parameter is
+enabled; if it is omitted the parameter is disabled:
+ - gpmc,adv-extra-delay:       ADV signal is delayed by half GPMC clock
+ - gpmc,cs-extra-delay:                CS signal is delayed by half GPMC clock
+ - gpmc,cycle2cycle-diffcsen:  Add "cycle2cycle-delay" between successive
+                               accesses to a different CS
+ - gpmc,cycle2cycle-samecsen:  Add "cycle2cycle-delay" between successive
+                               accesses to the same CS
+ - gpmc,oe-extra-delay:                OE signal is delayed by half GPMC clock
+ - gpmc,we-extra-delay:                WE signal is delayed by half GPMC clock
+ - gpmc,time-para-granularity: Multiply all access times by 2
+
+The following are only applicable to OMAP3+ and AM335x:
+ - gpmc,wr-access-ns:          In synchronous write mode, for single or
+                               burst accesses, defines the number of
+                               GPMC_FCLK cycles from start access time
+                               to the GPMC_CLK rising edge used by the
+                               memory device for the first data capture.
+ - gpmc,wr-data-mux-bus-ns:    In address-data multiplex mode, specifies
+                               the time when the first data is driven on
+                               the address-data bus.
+
+GPMC chip-select settings properties for child nodes. All are optional.
+
+- gpmc,burst-length    Page/burst length. Must be 4, 8 or 16.
+- gpmc,burst-wrap      Enables wrap bursting
+- gpmc,burst-read      Enables read page/burst mode
+- gpmc,burst-write     Enables write page/burst mode
+- gpmc,device-width    Total width of device(s) connected to a GPMC
+                       chip-select in bytes. The GPMC supports 8-bit
+                       and 16-bit devices and so this property must be
+                       1 or 2.
+- gpmc,mux-add-data    Address and data multiplexing configuration.
+                       Valid values are 1 for address-address-data
+                       multiplexing mode and 2 for address-data
+                       multiplexing mode.
+- gpmc,sync-read       Enables synchronous read. Defaults to asynchronous
+                       if this is not set.
+- gpmc,sync-write      Enables synchronous writes. Defaults to asynchronous
+                       if this is not set.
+- gpmc,wait-pin                Wait-pin used by client. Must be less than
+                       "gpmc,num-waitpins".
+- gpmc,wait-on-read    Enables wait monitoring on reads.
+- gpmc,wait-on-write   Enables wait monitoring on writes.
+
+Example for an AM33xx board:
+
+       gpmc: gpmc@50000000 {
+               compatible = "ti,am3352-gpmc";
+               ti,hwmods = "gpmc";
+               reg = <0x50000000 0x2000>;
+               interrupts = <100>;
+
+               gpmc,num-cs = <8>;
+               gpmc,num-waitpins = <2>;
+               #address-cells = <2>;
+               #size-cells = <1>;
+               ranges = <0 0 0x08000000 0x10000000>; /* CS0 @addr 0x8000000, size 0x10000000 */
+               interrupt-controller;
+               #interrupt-cells = <2>;
+               gpio-controller;
+               #gpio-cells = <2>;
+
+               /* child nodes go here */
+       };
index 0f6985b5de49afb7eb38991d86197ba407cb87cf..7066597c9a81850af6db19424b4c7449baf828b1 100644 (file)
@@ -24,6 +24,7 @@ Required properties:
                          brcm,brcmnand-v5.0
                          brcm,brcmnand-v6.0
                          brcm,brcmnand-v6.1
+                         brcm,brcmnand-v6.2
                          brcm,brcmnand-v7.0
                          brcm,brcmnand-v7.1
                          brcm,brcmnand
index fb733c4e1c116e76bea05e690bbd968fd66c8395..3ee7e202657cdb83f7e430ff7b00a29ba69ed097 100644 (file)
@@ -13,7 +13,11 @@ Documentation/devicetree/bindings/mtd/nand.txt
 
 Required properties:
 
- - reg:                The CS line the peripheral is connected to
+ - compatible: "ti,omap2-nand"
+ - reg:                range id (CS number), base offset and length of the
+               NAND I/O space
+ - interrupt-parent: must point to gpmc node
+ - interrupts: Two interrupt specifiers, one for fifoevent, one for termcount.
 
 Optional properties:
 
@@ -44,6 +48,7 @@ Optional properties:
                locating ECC errors for BCHx algorithms. SoC devices which have
                ELM hardware engines should specify this device node in .dtsi
                Using ELM for ECC error correction frees some CPU cycles.
+ - rb-gpios:   GPIO specifier for the ready/busy# pin.
 
 For inline partition table parsing (optional):
 
@@ -55,20 +60,26 @@ Example for an AM33xx board:
        gpmc: gpmc@50000000 {
                compatible = "ti,am3352-gpmc";
                ti,hwmods = "gpmc";
-               reg = <0x50000000 0x1000000>;
+               reg = <0x50000000 0x36c>;
                interrupts = <100>;
                gpmc,num-cs = <8>;
                gpmc,num-waitpins = <2>;
                #address-cells = <2>;
                #size-cells = <1>;
-               ranges = <0 0 0x08000000 0x2000>;       /* CS0: NAND */
+               ranges = <0 0 0x08000000 0x1000000>;    /* CS0 space, 16MB */
                elm_id = <&elm>;
+               interrupt-controller;
+               #interrupt-cells = <2>;
 
                nand@0,0 {
-                       reg = <0 0 0>; /* CS0, offset 0 */
+                       compatible = "ti,omap2-nand";
+                       reg = <0 0 4>;          /* CS0, offset 0, NAND I/O window 4 */
+                       interrupt-parent = <&gpmc>;
+                       interrupts = <0 IRQ_TYPE_NONE>, <1 IRQ_TYPE_NONE>;
                        nand-bus-width = <16>;
                        ti,nand-ecc-opt = "bch8";
                        ti,nand-xfer-type = "polled";
+                       rb-gpios = <&gpmc 0 GPIO_ACTIVE_HIGH>; /* gpmc_wait0 */
 
                        gpmc,sync-clk-ps = <0>;
                        gpmc,cs-on-ns = <0>;
index b53f92e252d4a7f48427ca1e123381dc04e91e41..68342eac23833951c272b5153874cdd6f774db5f 100644 (file)
@@ -1,8 +1,31 @@
-* MTD generic binding
+* NAND chip and NAND controller generic binding
+
+NAND controller/NAND chip representation:
+
+The NAND controller should be represented with its own DT node, and all
+NAND chips attached to this controller should be defined as children nodes
+of the NAND controller. This representation should be enforced even for
+simple controllers supporting only one chip.
+
+Mandatory NAND controller properties:
+- #address-cells: depends on your controller. Should at least be 1 to
+                 encode the CS line id.
+- #size-cells: depends on your controller. Put zero unless you need a
+              mapping between CS lines and dedicated memory regions
+
+Optional NAND controller properties
+- ranges: only needed if you need to define a mapping between CS lines and
+         memory regions
+
+Optional NAND chip properties:
 
 - nand-ecc-mode : String, operation mode of the NAND ecc mode.
-  Supported values are: "none", "soft", "hw", "hw_syndrome", "hw_oob_first",
-  "soft_bch".
+                 Supported values are: "none", "soft", "hw", "hw_syndrome",
+                 "hw_oob_first".
+                 Deprecated values:
+                 "soft_bch": use "soft" and nand-ecc-algo instead
+- nand-ecc-algo: string, algorithm of NAND ECC.
+                Supported values are: "hamming", "bch".
 - nand-bus-width : 8 or 16 bus width if not present 8
 - nand-on-flash-bbt: boolean to enable on flash bbt option if not present false
 
@@ -19,3 +42,19 @@ errors per {size} bytes".
 The interpretation of these parameters is implementation-defined, so not all
 implementations must support all possible combinations. However, implementations
 are encouraged to further specify the value(s) they support.
+
+Example:
+
+       nand-controller {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               /* controller specific properties */
+
+               nand@0 {
+                       reg = <0>;
+                       nand-ecc-mode = "soft_bch";
+
+                       /* controller specific properties */
+               };
+       };
diff --git a/Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt b/Documentation/devicetree/bindings/spi/microchip,spi-pic32.txt
new file mode 100644 (file)
index 0000000..79de379
--- /dev/null
@@ -0,0 +1,34 @@
+Microchip PIC32 SPI Master controller
+
+Required properties:
+- compatible: Should be "microchip,pic32mzda-spi".
+- reg: Address and length of register space for the device.
+- interrupts: Should contain all three spi interrupts in sequence
+              of <fault-irq>, <receive-irq>, <transmit-irq>.
+- interrupt-names: Should be "fault", "rx", "tx" in order.
+- clocks: Phandle of the clock generating SPI clock on the bus.
+- clock-names: Should be "mck0".
+- cs-gpios: Specifies the gpio pins to be used for chipselects.
+            See: Documentation/devicetree/bindings/spi/spi-bus.txt
+
+Optional properties:
+- dmas: Two or more DMA channel specifiers following the convention outlined
+        in Documentation/devicetree/bindings/dma/dma.txt
+- dma-names: Names for the dma channels. There must be at least one channel
+             named "spi-tx" for transmit and named "spi-rx" for receive.
+
+Example:
+
+spi1: spi@1f821000 {
+        compatible = "microchip,pic32mzda-spi";
+        reg = <0x1f821000 0x200>;
+        interrupts = <109 IRQ_TYPE_LEVEL_HIGH>,
+                     <110 IRQ_TYPE_LEVEL_HIGH>,
+                     <111 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "fault", "rx", "tx";
+        clocks = <&PBCLK2>;
+        clock-names = "mck0";
+        cs-gpios = <&gpio3 4 GPIO_ACTIVE_LOW>;
+        dmas = <&dma 134>, <&dma 135>;
+        dma-names = "spi-rx", "spi-tx";
+};
index 1ad0fe310ff990966cb18ae64c1e992793086c52..ff5893d275a2132e63f5f824f9a074d18e904865 100644 (file)
@@ -16,8 +16,7 @@ Required properties:
 
 Optional property:
 - big-endian: If present the dspi device's registers are implemented
-  in big endian mode, otherwise in native mode(same with CPU), for more
-  detail please see: Documentation/devicetree/bindings/regmap/regmap.txt.
+  in big endian mode.
 
 Optional SPI slave node properties:
 - fsl,spi-cs-sck-delay: a delay in nanoseconds between activating chip
diff --git a/Documentation/devicetree/bindings/spi/sqi-pic32.txt b/Documentation/devicetree/bindings/spi/sqi-pic32.txt
new file mode 100644 (file)
index 0000000..c82d021
--- /dev/null
@@ -0,0 +1,18 @@
+Microchip PIC32 Quad SPI controller
+-----------------------------------
+Required properties:
+- compatible: Should be "microchip,pic32mzda-sqi".
+- reg: Address and length of SQI controller register space.
+- interrupts: Should contain SQI interrupt.
+- clocks: Should contain phandle of two clocks in sequence, one that drives
+          clock on SPI bus and other that drives SQI controller.
+- clock-names: Should be "spi_ck" and "reg_ck" in order.
+
+Example:
+       sqi1: spi@1f8e2000 {
+               compatible = "microchip,pic32mzda-sqi";
+               reg = <0x1f8e2000 0x200>;
+               clocks = <&rootclk REF2CLK>, <&rootclk PB5CLK>;
+               clock-names = "spi_ck", "reg_ck";
+               interrupts = <169 IRQ_TYPE_LEVEL_HIGH>;
+       };
index 6908d3aca59807fbbbb7a926fb10232275645aaa..edebfa0a985ef03f4557d440a8e41a0c2284e033 100644 (file)
@@ -26,6 +26,10 @@ Required properties :
     of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
     list of valid values when referring to thermal sensors.
 
+Note:
+- the "critical" type trip points will be set to SOC_THERM hardware as the
+shutdown temperature. Once the temperature of this thermal zone is higher
+than it, the system will be shut down or reset by hardware.
 
 Example :
 
@@ -51,5 +55,13 @@ Example: referring to thermal sensors :
 
                         thermal-sensors =
                                 <&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+                       trips {
+                               cpu_shutdown_trip: shutdown-trip {
+                                       temperature = <102500>;
+                                       hysteresis = <1000>;
+                                       type = "critical";
+                               };
+                       };
                 };
        };
index e5ee3f15989337f37b9d5e43036c31a2b4c9ff11..a8e52c8ccfcca623040a38ebd8f27063c3f96c72 100644 (file)
@@ -11,7 +11,6 @@ Required properties:
                            - "renesas,thermal-r8a7791" (R-Car M2-W)
                            - "renesas,thermal-r8a7792" (R-Car V2H)
                            - "renesas,thermal-r8a7793" (R-Car M2-N)
-                           - "renesas,thermal-r8a7794" (R-Car E2)
 - reg                  : Address range of the thermal registers.
                          The 1st reg will be recognized as common register
                          if it has "interrupts".
diff --git a/Documentation/devicetree/bindings/thermal/tango-thermal.txt b/Documentation/devicetree/bindings/thermal/tango-thermal.txt
new file mode 100644 (file)
index 0000000..212198d
--- /dev/null
@@ -0,0 +1,17 @@
+* Tango Thermal
+
+The SMP8758 SoC includes 3 instances of this temperature sensor
+(in the CPU, video decoder, and PCIe controller).
+
+Required properties:
+- #thermal-sensor-cells: Should be 0 (see thermal.txt)
+- compatible: "sigma,smp8758-thermal"
+- reg: Address range of the thermal registers
+
+Example:
+
+       cpu_temp: thermal@920100 {
+               #thermal-sensor-cells = <0>;
+               compatible = "sigma,smp8758-thermal";
+               reg = <0x920100 12>;
+       };
diff --git a/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt b/Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt
new file mode 100644 (file)
index 0000000..d723555
--- /dev/null
@@ -0,0 +1,89 @@
+General Purpose Analog To Digital Converter (ADC) based thermal sensor.
+
+On some platforms, thermal sensors such as thermistors are connected to
+one of the ADC channels and the sensor resistance is read via the voltage
+across the sensor resistor. The voltage read across the sensor is mapped
+to a temperature using a voltage-temperature lookup table.
+
+Required properties:
+===================
+- compatible:               Must be "generic-adc-thermal".
+- temperature-lookup-table:  Two dimensional array of Integer; lookup table
+                            to map the relation between ADC value and
+                            temperature. When ADC is read, the value is
+                            looked up on the table to get the equivalent
+                            temperature.
+                            The first value of each row of the array is the
+                            temperature in milliCelsius and the second value
+                            of each row is the ADC read value.
+- #thermal-sensor-cells:     Should be 0. See ./thermal.txt for a description
+                            of this property.
+
+Example :
+#include <dt-bindings/thermal/thermal.h>
+
+i2c@7000c400 {
+       ads1015: ads1015@4a {
+               reg = <0x4a>;
+               compatible = "ads1015";
+               sampling-frequency = <3300>;
+               #io-channel-cells = <1>;
+       };
+};
+
+tboard_thermistor: thermal-sensor {
+       compatible = "generic-adc-thermal";
+       #thermal-sensor-cells = <0>;
+       io-channels = <&ads1015 1>;
+       io-channel-names = "sensor-channel";
+       temperature-lookup-table = <    (-40000) 2578
+                                       (-39000) 2577
+                                       (-38000) 2576
+                                       (-37000) 2575
+                                       (-36000) 2574
+                                       (-35000) 2573
+                                       (-34000) 2572
+                                       (-33000) 2571
+                                       (-32000) 2569
+                                       (-31000) 2568
+                                       (-30000) 2567
+                                       ::::::::::
+                                       118000 254
+                                       119000 247
+                                       120000 240
+                                       121000 233
+                                       122000 226
+                                       123000 220
+                                       124000 214
+                                       125000 208>;
+};
+
+dummy_cool_dev: dummy-cool-dev {
+       compatible = "dummy-cooling-dev";
+       #cooling-cells = <2>; /* min followed by max */
+};
+
+thermal-zones {
+       Tboard {
+               polling-delay = <15000>; /* milliseconds */
+               polling-delay-passive = <0>; /* milliseconds */
+               thermal-sensors = <&tboard_thermistor>;
+
+               trips {
+                       therm_est_trip: therm_est_trip {
+                               temperature = <40000>;
+                               type = "active";
+                               hysteresis = <1000>;
+                       };
+               };
+
+               cooling-maps {
+                       map0 {
+                               trip = <&therm_est_trip>;
+                               cooling-device = <&dummy_cool_dev THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+                               contribution = <100>;
+                       };
+
+               };
+       };
+};
index 8dab6fd024aa4edb889b880db02225e13cb44c61..107280ef0025401fe8ba003e5c7f8972f2443445 100644 (file)
@@ -5,10 +5,12 @@ Required properties:
 - reg : Should contain WDT registers location and length
 - interrupts : Should contain WDT interrupt
 
-Optional property:
+Optional properties:
 - big-endian: If present the watchdog device's registers are implemented
   in big endian mode, otherwise in native mode(same with CPU), for more
   detail please see: Documentation/devicetree/bindings/regmap/regmap.txt.
+- fsl,ext-reset-output: If present the watchdog device is configured to
+  assert its external reset (WDOG_B) instead of issuing a software reset.
 
 Examples:
 
diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt
new file mode 100644 (file)
index 0000000..b9512f1
--- /dev/null
@@ -0,0 +1,25 @@
+Renesas Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible : Should be "renesas,r8a7795-wdt", or "renesas,rcar-gen3-wdt"
+
+  When compatible with the generic version, nodes must list the SoC-specific
+  version corresponding to the platform first, followed by the generic
+  version.
+
+- reg : Should contain WDT registers location and length
+- clocks : the clock feeding the watchdog timer.
+
+Optional properties:
+- timeout-sec : Contains the watchdog timeout in seconds
+- power-domains : the power domain the WDT belongs to
+
+Examples:
+
+       wdt0: watchdog@e6020000 {
+               compatible = "renesas,r8a7795-wdt", "renesas,rcar-gen3-wdt";
+               reg = <0 0xe6020000 0 0x0c>;
+               clocks = <&cpg CPG_MOD 402>;
+               power-domains = <&cpg>;
+               timeout-sec = <60>;
+       };
index 7bde64014a89716a5242b413e0536cb207864802..ce4587d257d20b607fd08e80204dc09f0cc31862 100644 (file)
@@ -79,6 +79,38 @@ These filesystems may be used for inspiration:
 - ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt
 
 
+Handling Media Errors
+---------------------
+
+The libnvdimm subsystem stores a record of known media error locations for
+each pmem block device (in gendisk->badblocks). If we fault at such a location,
+or one with a latent error not yet discovered, the application can expect
+to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply
+writing the affected sectors (through the pmem driver, and if the underlying
+NVDIMM supports the clear_poison DSM defined by ACPI).
+
+Since DAX IO normally doesn't go through the driver/bio path, applications or
+sysadmins have an option to restore the lost data from a prior backup/inbuilt
+redundancy in the following ways:
+
+1. Delete the affected file, and restore from a backup (sysadmin route):
+   This will free the file system blocks that were being used by the file,
+   and the next time they're allocated, they will be zeroed first, which
+   happens through the driver, and will clear bad sectors.
+
+2. Truncate or hole-punch the part of the file that has a bad-block (at least
+   an entire aligned sector has to be hole-punched, but not necessarily an
+   entire filesystem block).
+
+These are the two basic paths that allow DAX filesystems to continue operating
+in the presence of media errors. More robust error recovery mechanisms can be
+built on top of this in the future, for example, involving redundancy/mirroring
+provided at the block layer through DM, or additionally, at the filesystem
+level. These would have to rely on the above two tenets, that error clearing
+can happen either by sending an IO through the driver, or zeroing (also through
+the driver).
+
+
 Shortcomings
 ------------
 
index f5b1fcaa9e4e41ae76d050ff41fe9c8b1d59ab3d..9ba6587b76573e5cfd0d0fa4c05dcf2c27e95e05 100644 (file)
@@ -5,17 +5,17 @@ Supported chips:
   * Maxim MAX34440
     Prefixes: 'max34440'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34440.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34440.pdf
   * Maxim MAX34441
     PMBus 5-Channel Power-Supply Manager and Intelligent Fan Controller
     Prefixes: 'max34441'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34441.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34441.pdf
   * Maxim MAX34446
     PMBus Power-Supply Data Logger
     Prefixes: 'max34446'
     Addresses scanned: -
-    Datasheet: http://datasheets.maxim-ic.com/en/ds/MAX34446.pdf
+    Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34446.pdf
   * Maxim MAX34460
     PMBus 12-Channel Voltage Monitor & Sequencer
     Prefix: 'max34460'
index c52856da0cad555c7eeecd90c9738fccf941adb2..db101857b2c98c8e1f306620be41287b41f43a25 100644 (file)
@@ -241,9 +241,8 @@ comment "module support disabled"
        depends on !MODULES
 
 MODVERSIONS directly depends on MODULES, this means it's only visible if
-MODULES is different from 'n'. The comment on the other hand is always
-visible when MODULES is visible (the (empty) dependency of MODULES is
-also part of the comment dependencies).
+MODULES is different from 'n'. The comment on the other hand is only
+visible when MODULES is set to 'n'.
 
 
 Kconfig syntax
@@ -285,12 +284,17 @@ choices:
        "endchoice"
 
 This defines a choice group and accepts any of the above attributes as
-options. A choice can only be of type bool or tristate, while a boolean
-choice only allows a single config entry to be selected, a tristate
-choice also allows any number of config entries to be set to 'm'. This
-can be used if multiple drivers for a single hardware exists and only a
-single driver can be compiled/loaded into the kernel, but all drivers
-can be compiled as modules.
+options. A choice can only be of type bool or tristate.  If no type is
+specified for a choice, its type will be determined by the type of
+the first choice element in the group or remain unknown if none of the
+choice elements have a type specified, as well.
+
+While a boolean choice only allows a single config entry to be
+selected, a tristate choice also allows any number of config entries
+to be set to 'm'. This can be used if multiple drivers for a single
+hardware exists and only a single driver can be compiled/loaded into
+the kernel, but all drivers can be compiled as modules.
+
 A choice accepts another option "optional", which allows to set the
 choice to 'n' and no entry needs to be selected.
 If no [symbol] is associated with a choice, then you can not have multiple
index ca895fd211e4e9f5f6bd0fc6a13bf60d9a0c14b2..789b27c6ec996735932131315ee7a5b9aebd0c7d 100644 (file)
@@ -42,9 +42,26 @@ variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
 
 After being requested, a PWM has to be configured using:
 
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
 
-To start/stop toggling the PWM output use pwm_enable()/pwm_disable().
+This API controls both the PWM period/duty_cycle config and the
+enable/disable state.
+
+The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
+around pwm_apply_state() and should not be used if the user wants to change
+several parameters at once. For example, if you see pwm_config() and
+pwm_{enable,disable}() calls in the same function, this probably means you
+should switch to pwm_apply_state().
+
+The PWM user API also allows one to query the PWM state with pwm_get_state().
+
+In addition to the PWM state, the PWM API also exposes PWM arguments, which
+are the reference PWM config one should use on this PWM.
+PWM arguments are usually platform-specific and allow the PWM user to only
+care about the duty cycle relative to the full period (like, duty = 50% of the
+period). struct pwm_args contains 2 fields (period and polarity) and should
+be used to set the initial PWM config (usually done in the probe function
+of the PWM user). PWM arguments are retrieved with pwm_get_args().
 
 Using PWMs with the sysfs interface
 -----------------------------------
@@ -105,6 +122,15 @@ goes low for the remainder of the period. Conversely, a signal with inversed
 polarity starts low for the duration of the duty cycle and goes high for the
 remainder of the period.
 
+Drivers are encouraged to implement ->apply() instead of the legacy
+->enable(), ->disable() and ->config() methods. Doing that should provide
+atomicity in the PWM config workflow, which is required when the PWM controls
+a critical device (like a regulator).
+
+The implementation of ->get_state() (a method used to retrieve initial PWM
+state) is also encouraged for the same reason: letting the PWM user know
+about the current PWM state would allow them to avoid glitches.
+
 Locking
 -------
 
index daabdd7ee543ea7f8bfe61274869ded73f353a94..a3683ce2a2f3ca99f32b3c2b120d3bd5d2d81683 100644 (file)
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
+- perf_event_max_contexts_per_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -668,6 +669,19 @@ The default value is 127.
 
 ==============================================================
 
+perf_event_max_contexts_per_stack:
+
+Controls maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
index ed419d6c8dec8d4db73d4cf432c937f9b0bc3126..efc3f3d293c499d479e0c558837157caca5d0352 100644 (file)
@@ -69,8 +69,8 @@ temperature) and throttle appropriate devices.
 1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 
     This interface function removes the thermal zone device.
-    It deletes the corresponding entry form /sys/class/thermal folder and
-    unbind all the thermal cooling devices it uses.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds all the thermal cooling devices it uses.
 
 1.1.3 struct thermal_zone_device *thermal_zone_of_sensor_register(
                struct device *dev, int sensor_id, void *data,
@@ -146,32 +146,32 @@ temperature) and throttle appropriate devices.
 
     This interface function adds a new thermal cooling device (fan/processor/...)
     to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
-    to all the thermal zone devices register at the same time.
+    to all the thermal zone devices registered at the same time.
     name: the cooling device name.
     devdata: device private data.
     ops: thermal cooling devices call-backs.
        .get_max_state: get the Maximum throttle state of the cooling device.
-       .get_cur_state: get the Current throttle state of the cooling device.
+       .get_cur_state: get the Currently requested throttle state of the cooling device.
        .set_cur_state: set the Current throttle state of the cooling device.
 
 1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 
-    This interface function remove the thermal cooling device.
-    It deletes the corresponding entry form /sys/class/thermal folder and
-    unbind itself from all the thermal zone devices using it.
+    This interface function removes the thermal cooling device.
+    It deletes the corresponding entry from /sys/class/thermal folder and
+    unbinds itself from all the thermal zone devices using it.
 
 1.3 interface for binding a thermal zone device with a thermal cooling device
 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
        int trip, struct thermal_cooling_device *cdev,
        unsigned long upper, unsigned long lower, unsigned int weight);
 
-    This interface function bind a thermal cooling device to the certain trip
+    This interface function binds a thermal cooling device to a particular trip
     point of a thermal zone device.
     This function is usually called in the thermal zone device .bind callback.
     tz: the thermal zone device
     cdev: thermal cooling device
-    trip: indicates which trip point the cooling devices is associated with
-         in this thermal zone.
+    trip: indicates which trip point in this thermal zone the cooling device
+          is associated with.
     upper:the Maximum cooling state for this trip point.
           THERMAL_NO_LIMIT means no upper limit,
          and the cooling device can be in max_state.
@@ -184,13 +184,13 @@ temperature) and throttle appropriate devices.
 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
                int trip, struct thermal_cooling_device *cdev);
 
-    This interface function unbind a thermal cooling device from the certain
+    This interface function unbinds a thermal cooling device from a particular
     trip point of a thermal zone device. This function is usually called in
     the thermal zone device .unbind callback.
     tz: the thermal zone device
     cdev: thermal cooling device
-    trip: indicates which trip point the cooling devices is associated with
-         in this thermal zone.
+    trip: indicates which trip point in this thermal zone the cooling device
+          is associated with.
 
 1.4 Thermal Zone Parameters
 1.4.1 struct thermal_bind_params
@@ -210,13 +210,13 @@ temperature) and throttle appropriate devices.
                this thermal zone and cdev, for a particular trip point.
                If nth bit is set, then the cdev and thermal zone are bound
                for trip point n.
-    .limits: This is an array of cooling state limits. Must have exactly
-         2 * thermal_zone.number_of_trip_points. It is an array consisting
-         of tuples <lower-state upper-state> of state limits. Each trip
-         will be associated with one state limit tuple when binding.
-         A NULL pointer means <THERMAL_NO_LIMITS THERMAL_NO_LIMITS>
-         on all trips. These limits are used when binding a cdev to a
-         trip point.
+    .binding_limits: This is an array of cooling state limits. Must have
+                     exactly 2 * thermal_zone.number_of_trip_points elements.
+                     It is an array consisting of tuples <lower-state
+                     upper-state> of state limits. Each trip will be associated
+                     with one state limit tuple when binding. A NULL pointer
+                     means <THERMAL_NO_LIMIT THERMAL_NO_LIMIT> on all trips.
+                     These limits are used when binding a cdev to a trip point.
     .match: This call back returns success(0) if the 'tz and cdev' need to
            be bound, as per platform data.
 1.4.2 struct thermal_zone_params
@@ -351,8 +351,8 @@ cdev[0-*]
        RO, Optional
 
 cdev[0-*]_trip_point
-       The trip point with which cdev[0-*] is associated in this thermal
-       zone; -1 means the cooling device is not associated with any trip
+       The trip point in this thermal zone which cdev[0-*] is associated
+       with; -1 means the cooling device is not associated with any trip
        point.
        RO, Optional
 
index 9488078900e0c44bdf90ae502e4a114e5919b98d..a40398cce9d1557c5be11b5dca185d4c79c70641 100644 (file)
@@ -1,64 +1,67 @@
-Last reviewed: 06/02/2009
+Last reviewed: 04/04/2016
 
-                     HP iLO2 NMI Watchdog Driver
-              NMI sourcing for iLO2 based ProLiant Servers
+                     HPE iLO NMI Watchdog Driver
+              NMI sourcing for iLO based ProLiant Servers
                      Documentation and Driver by
-              Thomas Mingarelli <thomas.mingarelli@hp.com>
+              Thomas Mingarelli <thomas.mingarelli@hpe.com>
 
- The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
+ The HPE iLO NMI Watchdog driver is a kernel module that provides basic
  watchdog functionality and the added benefit of NMI sourcing. Both the
  watchdog functionality and the NMI sourcing capability need to be enabled
  by the user. Remember that the two modes are not dependent on one another.
  A user can have the NMI sourcing without the watchdog timer and vice-versa.
+ All references to iLO in this document imply it also works on iLO2 and all
+ subsequent generations.
 
  Watchdog functionality is enabled like any other common watchdog driver. That
  is, an application needs to be started that kicks off the watchdog timer. A
  basic application exists in the Documentation/watchdog/src directory called
  watchdog-test.c. Simply compile the C file and kick it off. If the system
- gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
+ gets into a bad state and hangs, the HPE ProLiant iLO timer register will
  not be updated in a timely fashion and a hardware system reset (also known as
  an Automatic Server Recovery (ASR)) event will occur.
 
- The hpwdt driver also has four (4) module parameters. They are the following:
+ The hpwdt driver also has three (3) module parameters. They are the following:
 
- soft_margin - allows the user to set the watchdog timer value
- allow_kdump - allows the user to save off a kernel dump image after an NMI
+ soft_margin - allows the user to set the watchdog timer value.
+               Default value is 30 seconds.
+ allow_kdump - allows the user to save off a kernel dump image after an NMI.
+               Default value is 1/ON.
  nowayout    - basic watchdog parameter that does not allow the timer to
                be restarted or an impending ASR to be escaped.
- priority    - determines whether or not the hpwdt driver is first on the
-               die_notify list to handle NMIs or last. The default value
-               for this module parameter is 0 or LAST. If the user wants to
-               enable NMI sourcing then reload the hpwdt driver with
-               priority=1 (and boot with nmi_watchdog=0).
+               Default value is set when compiling the kernel. If it is set
+               to "Y", then there is no way of disabling the watchdog once
+               it has been started.
 
  NOTE: More information about watchdog drivers in general, including the ioctl
        interface to /dev/watchdog can be found in
        Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
 
- The priority parameter was introduced due to other kernel software that relied
- on handling NMIs (like oprofile). Keeping hpwdt's priority at 0 (or LAST)
- enables the users of NMIs for non critical events to be work as expected.
-
  The NMI sourcing capability is disabled by default due to the inability to
  distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
  Linux kernel. What this means is that the hpwdt nmi handler code is called
  each time the NMI signal fires off. This could amount to several thousands of
  NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
  confused" message in the logs or if the system gets into a hung state, then
- the hpwdt driver can be reloaded with the "priority" module parameter set
- (priority=1).
+ the hpwdt driver can be reloaded.
 
  1. If the kernel has not been booted with nmi_watchdog turned off then
-    edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
-    currently booting kernel line.
+    edit the boot loader configuration file and place nmi_watchdog=0 at the
+    end of the currently booting kernel line. Depending on your Linux
+    distribution and platform setup, the file to edit is:
+    For non-UEFI systems
+       /boot/grub/grub.conf   or
+       /boot/grub/menu.lst
+    For UEFI systems
+      /boot/efi/EFI/distroname/grub.conf   or
+      /boot/efi/efi/distroname/elilo.conf
 2. reboot the server
- 3. Once the system comes up perform a rmmod hpwdt
- 4. insmod /lib/modules/`uname -r`/kernel/drivers/char/watchdog/hpwdt.ko priority=1
+ 3. Once the system comes up perform a modprobe -r hpwdt
+ 4. insmod /lib/modules/`uname -r`/kernel/drivers/watchdog/hpwdt.ko
 
  Now, the hpwdt can successfully receive and source the NMI and provide a log
- message that details the reason for the NMI (as determined by the HP BIOS).
+ message that details the reason for the NMI (as determined by the HPE BIOS).
 
- Below is a list of NMIs the HP BIOS understands along with the associated
+ Below is a list of NMIs the HPE BIOS understands along with the associated
  code (reason):
 
        No source found                00h
@@ -92,4 +95,4 @@ Last reviewed: 06/02/2009
 
 
  -- Tom Mingarelli
-    (thomas.mingarelli@hp.com)
+    (thomas.mingarelli@hpe.com)
index c161399a6b5c134932512b6ae74e6e1676f5051c..a8d364227a7704af5119d71d40bfc81f4f25a570 100644 (file)
@@ -86,6 +86,10 @@ nowayout: Watchdog cannot be stopped once started
 davinci_wdt:
 heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60
 -------------------------------------------------
+ebc-c384_wdt:
+timeout: Watchdog timeout in seconds. (1<=timeout<=15300, default=60)
+nowayout: Watchdog cannot be stopped once started
+-------------------------------------------------
 ep93xx_wdt:
 nowayout: Watchdog cannot be stopped once started
 timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD)
index 33020068822807625aa8873116267df7d7138036..c68eaeb5a5c0ba2c791afbf0e500bede4bb5e4af 100644 (file)
@@ -6096,6 +6096,14 @@ S:       Maintained
 F:     arch/x86/include/asm/intel_telemetry.h
 F:     drivers/platform/x86/intel_telemetry*
 
+INTEL PMC CORE DRIVER
+M:     Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+M:     Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+L:     platform-driver-x86@vger.kernel.org
+S:     Maintained
+F:     arch/x86/include/asm/pmc_core.h
+F:     drivers/platform/x86/intel_pmc_core*
+
 IOC3 ETHERNET DRIVER
 M:     Ralf Baechle <ralf@linux-mips.org>
 L:     linux-mips@linux-mips.org
@@ -6491,6 +6499,7 @@ F:        arch/*/include/asm/kvm*
 F:     include/linux/kvm*
 F:     include/uapi/linux/kvm*
 F:     virt/kvm/
+F:     tools/kvm/
 
 KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
 M:     Joerg Roedel <joro@8bytes.org>
@@ -8881,6 +8890,7 @@ F:        arch/*/kernel/*/perf_event*.c
 F:     arch/*/kernel/*/*/perf_event*.c
 F:     arch/*/include/asm/perf_event.h
 F:     arch/*/kernel/perf_callchain.c
+F:     arch/*/events/*
 F:     tools/perf/
 
 PERSONALITY HANDLING
@@ -11295,6 +11305,7 @@ F:      drivers/platform/x86/thinkpad_acpi.c
 
 TI BANDGAP AND THERMAL DRIVER
 M:     Eduardo Valentin <edubezval@gmail.com>
+M:     Keerthy <j-keerthy@ti.com>
 L:     linux-pm@vger.kernel.org
 L:     linux-omap@vger.kernel.org
 S:     Maintained
@@ -12345,6 +12356,7 @@ L:      linux-watchdog@vger.kernel.org
 W:     http://www.linux-watchdog.org/
 T:     git git://www.linux-watchdog.org/linux-watchdog.git
 S:     Maintained
+F:     Documentation/devicetree/bindings/watchdog/
 F:     Documentation/watchdog/
 F:     drivers/watchdog/
 F:     include/linux/watchdog.h
index 0f9cb36d45c2c59a589670679e7fa1d25ff9ee59..9ee5863dae2333826d8a98444e9c6cff8a55d7e9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -128,6 +128,10 @@ _all:
 # Cancel implicit rules on top Makefile
 $(CURDIR)/Makefile Makefile: ;
 
+ifneq ($(words $(subst :, ,$(CURDIR))), 1)
+  $(error main directory cannot contain spaces nor colons)
+endif
+
 ifneq ($(KBUILD_OUTPUT),)
 # Invoke a second make in the output directory, passing relevant variables
 # check that the output directory actually exists
@@ -142,7 +146,7 @@ PHONY += $(MAKECMDGOALS) sub-make
 $(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
        @:
 
-sub-make: FORCE
+sub-make:
        $(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \
        -f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))
 
@@ -364,7 +368,7 @@ AFLAGS_MODULE   =
 LDFLAGS_MODULE  =
 CFLAGS_KERNEL  =
 AFLAGS_KERNEL  =
-CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage
+CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage -fno-tree-loop-im -Wno-maybe-uninitialized
 CFLAGS_KCOV    = -fsanitize-coverage=trace-pc
 
 
@@ -617,7 +621,11 @@ KBUILD_CFLAGS      += $(call cc-option,-fno-delete-null-pointer-checks,)
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS  += -Os $(call cc-disable-warning,maybe-uninitialized,)
 else
-KBUILD_CFLAGS  += -O2
+ifdef CONFIG_PROFILE_ALL_BRANCHES
+KBUILD_CFLAGS  += -O2 $(call cc-disable-warning,maybe-uninitialized,)
+else
+KBUILD_CFLAGS   += -O2
+endif
 endif
 
 # Tell gcc to never replace conditional load with a non-conditional one
@@ -697,9 +705,10 @@ KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
 KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
 else
 
-# This warning generated too much noise in a regular build.
-# Use make W=1 to enable this warning (see scripts/Makefile.build)
+# These warnings generated too much noise in a regular build.
+# Use make W=1 to enable them (see scripts/Makefile.build)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 endif
 
 ifdef CONFIG_FRAME_POINTER
@@ -926,27 +935,41 @@ export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Doc
 
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
 
-# Final link of vmlinux
-      cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
-quiet_cmd_link-vmlinux = LINK    $@
-
-# Include targets which we want to
-# execute if the rest of the kernel build went well.
-vmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE
+# Include targets which we want to execute sequentially if the rest of the
+# kernel build went well. If CONFIG_TRIM_UNUSED_KSYMS is set, this might be
+# evaluated more than once.
+PHONY += vmlinux_prereq
+vmlinux_prereq: $(vmlinux-deps) FORCE
 ifdef CONFIG_HEADERS_CHECK
        $(Q)$(MAKE) -f $(srctree)/Makefile headers_check
 endif
-ifdef CONFIG_SAMPLES
-       $(Q)$(MAKE) $(build)=samples
-endif
 ifdef CONFIG_BUILD_DOCSRC
        $(Q)$(MAKE) $(build)=Documentation
 endif
 ifdef CONFIG_GDB_SCRIPTS
        $(Q)ln -fsn `cd $(srctree) && /bin/pwd`/scripts/gdb/vmlinux-gdb.py
 endif
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+       $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \
+         "$(MAKE) KBUILD_MODULES=1 -f $(srctree)/Makefile vmlinux_prereq"
+endif
+
+# standalone target for easier testing
+include/generated/autoksyms.h: FORCE
+       $(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh true
+
+# Final link of vmlinux
+      cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
+quiet_cmd_link-vmlinux = LINK    $@
+
+vmlinux: scripts/link-vmlinux.sh vmlinux_prereq $(vmlinux-deps) FORCE
        +$(call if_changed,link-vmlinux)
 
+# Build samples along with the rest of the kernel
+ifdef CONFIG_SAMPLES
+vmlinux-dirs += samples
+endif
+
 # The actual objects are generated when descending,
 # make sure no implicit rule kicks in
 $(sort $(vmlinux-deps)): $(vmlinux-dirs) ;
@@ -998,10 +1021,12 @@ prepare2: prepare3 outputmakefile asm-generic
 prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
                    include/config/auto.conf
        $(cmd_crmodverdir)
+       $(Q)test -e include/generated/autoksyms.h || \
+           touch   include/generated/autoksyms.h
 
 archprepare: archheaders archscripts prepare1 scripts_basic
 
-prepare0: archprepare FORCE
+prepare0: archprepare
        $(Q)$(MAKE) $(build)=.
 
 # All the preparing..
@@ -1061,7 +1086,7 @@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
 export INSTALL_FW_PATH
 
 PHONY += firmware_install
-firmware_install: FORCE
+firmware_install:
        @mkdir -p $(objtree)/firmware
        $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
 
@@ -1081,7 +1106,7 @@ PHONY += archscripts
 archscripts:
 
 PHONY += __headers
-__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE
+__headers: $(version_h) scripts_basic asm-generic archheaders archscripts
        $(Q)$(MAKE) $(build)=scripts build_unifdef
 
 PHONY += headers_install_all
@@ -1192,7 +1217,8 @@ else # CONFIG_MODULES
 # Modules not configured
 # ---------------------------------------------------------------------------
 
-modules modules_install: FORCE
+PHONY += modules modules_install
+modules modules_install:
        @echo >&2
        @echo >&2 "The present kernel configuration has modules disabled."
        @echo >&2 "Type 'make config' and enable loadable module support."
@@ -1283,6 +1309,7 @@ boards := $(sort $(notdir $(boards)))
 board-dirs := $(dir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*/*_defconfig))
 board-dirs := $(sort $(notdir $(board-dirs:/=)))
 
+PHONY += help
 help:
        @echo  'Cleaning targets:'
        @echo  '  clean           - Remove most generated files but keep the config and'
@@ -1453,6 +1480,7 @@ $(clean-dirs):
 clean: rm-dirs := $(MODVERDIR)
 clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers
 
+PHONY += help
 help:
        @echo  '  Building external modules.'
        @echo  '  Syntax: make -C path/to/kernel/src M=$$PWD target'
index 39e58d1cdf90b7d8f84d108d8024d8974c3b800c..41fa2ec9e02c7721717e5c513bc9703ebed5bed4 100644 (file)
@@ -15,6 +15,7 @@
 #if !defined(_UAPI_ASM_ARC_UNISTD_H) || defined(__SYSCALL)
 #define _UAPI_ASM_ARC_UNISTD_H
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
index 8b134cfe5e1f11023b559639497f7e1a35d2ee79..6fd48021324b50acd326a4ce9a17031f05db7609 100644 (file)
@@ -48,7 +48,7 @@ struct arc_callchain_trace {
 static int callchain_trace(unsigned int addr, void *data)
 {
        struct arc_callchain_trace *ctrl = data;
-       struct perf_callchain_entry *entry = ctrl->perf_stuff;
+       struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff;
        perf_callchain_store(entry, addr);
 
        if (ctrl->depth++ < 3)
@@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct arc_callchain_trace ctrl = {
                .depth = 0,
@@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        /*
         * User stack can't be unwound trivially with kernel dwarf unwinder
index 446705a4325aafb9126484100090a69c71b0aff1..5be33a2d59a9d8c685d40e9a4084de8bf374bdb5 100644 (file)
@@ -82,7 +82,6 @@ $(obj)/uImage:        $(obj)/zImage FORCE
 
 $(obj)/bootp/bootp: $(obj)/zImage initrd FORCE
        $(Q)$(MAKE) $(build)=$(obj)/bootp $@
-       @:
 
 $(obj)/bootpImage: $(obj)/bootp/bootp FORCE
        $(call if_changed,objcopy)
index 5761f0039133ff33dc8bd96a61e2d2c355f59682..5e4acd253b300db5ed40af0f9a424b0493f4de6c 100644 (file)
@@ -17,7 +17,6 @@ targets       := bootp init.o kernel.o initrd.o
 # Note that bootp.lds picks up kernel.o and initrd.o
 $(obj)/bootp:  $(src)/bootp.lds $(addprefix $(obj)/,init.o kernel.o initrd.o) FORCE
        $(call if_changed,ld)
-       @:
 
 # kernel.o and initrd.o includes a binary image using
 # .incbin, a dependency which is not tracked automatically
@@ -26,4 +25,4 @@ $(obj)/kernel.o: arch/arm/boot/zImage FORCE
 
 $(obj)/initrd.o: $(INITRD) FORCE
 
-PHONY += $(INITRD) FORCE
+PHONY += $(INITRD)
index 0f89d87cb2a0ab99f801471a24e17eb5391a13bb..06b6c2d695bfb6bfbb0a61bbfffd958a58bf20d3 100644 (file)
@@ -399,6 +399,7 @@ dtb-$(CONFIG_SOC_IMX6UL) += \
        imx6ul-tx6ul-mainboard.dtb
 dtb-$(CONFIG_SOC_IMX7D) += \
        imx7d-cl-som-imx7.dtb \
+       imx7d-nitrogen7.dtb \
        imx7d-sbc-imx7.dtb \
        imx7d-sdb.dtb
 dtb-$(CONFIG_SOC_LS1021A) += \
index 267f81adb42fbc01e4361fc6ea9014bfa2bc5a9a..8c89062663108a7f4e9579f911778f5954cba33d 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos3250.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/clock/samsung,s2mps11.h>
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        status = "okay";
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &xusbxti {
        clock-frequency = <24000000>;
 };
index 31eb09bae0a2f9b10f7cd7b6f984d7a8ef57916d..e422819591dcb32e33b27a4f2ef6a94057961e4b 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos3250.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/clock/samsung,s2mps11.h>
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_lcd0 {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mcuisp {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_isp {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peril {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        status = "okay";
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &xusbxti {
        clock-frequency = <24000000>;
 };
index 094782b207ee138362d294f4770438a175a77730..62f3dcd9e046c3dc526d81a8bd70a575be99487f 100644 (file)
                        clock-names = "ppmu";
                        status = "disabled";
                };
+
+               bus_dmc: bus_dmc {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu_dmc CLK_DIV_DMC>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_dmc_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_dmc_opp_table: opp_table1 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@134000000 {
+                               opp-hz = /bits/ 64 <134000000>;
+                               opp-microvolt = <800000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                               opp-microvolt = <825000>;
+                       };
+                       opp@400000000 {
+                               opp-hz = /bits/ 64 <400000000>;
+                               opp-microvolt = <875000>;
+                       };
+               };
+
+               bus_leftbus: bus_leftbus {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_GDL>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_rightbus: bus_rightbus {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_GDR>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_lcd0: bus_lcd0 {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_160>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_fsys: bus_fsys {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_200>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_mcuisp: bus_mcuisp {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_400_MCUISP>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_mcuisp_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_isp: bus_isp {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_266>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_isp_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_peril: bus_peril {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_DIV_ACLK_100>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_peril_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_mfc: bus_mfc {
+                       compatible = "samsung,exynos-bus";
+                       clocks = <&cmu CLK_SCLK_MFC>;
+                       clock-names = "bus";
+                       operating-points-v2 = <&bus_leftbus_opp_table>;
+                       status = "disabled";
+               };
+
+               bus_leftbus_opp_table: opp_table2 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                               opp-microvolt = <900000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                               opp-microvolt = <900000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                               opp-microvolt = <1000000>;
+                       };
+                       opp@134000000 {
+                               opp-hz = /bits/ 64 <134000000>;
+                               opp-microvolt = <1000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                               opp-microvolt = <1000000>;
+                       };
+               };
+
+               bus_mcuisp_opp_table: opp_table3 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                       };
+                       opp@400000000 {
+                               opp-hz = /bits/ 64 <400000000>;
+                       };
+               };
+
+               bus_isp_opp_table: opp_table4 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+                       opp@200000000 {
+                               opp-hz = /bits/ 64 <200000000>;
+                       };
+                       opp@300000000 {
+                               opp-hz = /bits/ 64 <300000000>;
+                       };
+               };
+
+               bus_peril_opp_table: opp_table5 {
+                       compatible = "operating-points-v2";
+                       opp-shared;
+
+                       opp@50000000 {
+                               opp-hz = /bits/ 64 <50000000>;
+                       };
+                       opp@80000000 {
+                               opp-hz = /bits/ 64 <80000000>;
+                       };
+                       opp@100000000 {
+                               opp-hz = /bits/ 64 <100000000>;
+                       };
+               };
        };
 };
 
index c1cb8df6da0771f90d06280babf3ca202b702c99..2d9b02967105fd7e9320adba5625d4e5bb2a0be4 100644 (file)
                power-domains = <&pd_lcd1>;
                #iommu-cells = <0>;
        };
+
+       bus_dmc: bus_dmc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_DMC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_acp: bus_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_ACP>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK100>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK133>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_opp_table>;
+               status = "disabled";
+       };
+
+       bus_display: bus_display {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK160>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_display_opp_table>;
+               status = "disabled";
+       };
+
+       bus_lcd0: bus_lcd0 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK200>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_leftbus: bus_leftbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_rightbus: bus_rightbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDR>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_SCLK_MFC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_dmc_opp_table: opp_table1 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <1025000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+                       opp-microvolt = <1050000>;
+               };
+               opp@400000000 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <1150000>;
+               };
+       };
+
+       bus_acp_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@5000000 {
+                       opp-hz = /bits/ 64 <5000000>;
+               };
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
+
+       bus_fsys_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@10000000 {
+                       opp-hz = /bits/ 64 <10000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+       };
+
+       bus_display_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+       };
+
+       bus_leftbus_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
 };
 
 &gic {
index cab0f07d7d282a168df14516426951106aecf3bc..ec7619a384a2b31943ba61f624c893341df90e38 100644 (file)
@@ -11,6 +11,7 @@
 #include <dt-bindings/input/input.h>
 #include <dt-bindings/clock/maxim,max77686.h>
 #include "exynos4412.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/gpio/gpio.h>
 
 / {
        };
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_acp {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_c2c {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_display {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
 
                        buck1_reg: BUCK1 {
                                regulator-name = "vdd_mif";
-                               regulator-min-microvolt = <1000000>;
-                               regulator-max-microvolt = <1000000>;
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1100000>;
                                regulator-always-on;
                                regulator-boot-on;
                        };
 
                        buck3_reg: BUCK3 {
                                regulator-name = "vdd_int";
-                               regulator-min-microvolt = <1000000>;
-                               regulator-max-microvolt = <1000000>;
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1050000>;
                                regulator-always-on;
                                regulator-boot-on;
                        };
diff --git a/arch/arm/boot/dts/exynos4412-ppmu-common.dtsi b/arch/arm/boot/dts/exynos4412-ppmu-common.dtsi
new file mode 100644 (file)
index 0000000..16e4b77
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Device tree sources for Exynos4412 PPMU common device tree
+ *
+ * Copyright (C) 2015 Samsung Electronics
+ * Author: Chanwoo Choi <cw00.choi@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+&ppmu_dmc0 {
+       status = "okay";
+
+       events {
+              ppmu_dmc0_3: ppmu-event3-dmc0 {
+                      event-name = "ppmu-event3-dmc0";
+              };
+       };
+};
+
+&ppmu_dmc1 {
+       status = "okay";
+
+       events {
+              ppmu_dmc1_3: ppmu-event3-dmc1 {
+                      event-name = "ppmu-event3-dmc1";
+              };
+       };
+};
+
+&ppmu_leftbus {
+       status = "okay";
+
+       events {
+              ppmu_leftbus_3: ppmu-event3-leftbus {
+                      event-name = "ppmu-event3-leftbus";
+              };
+       };
+};
+
+&ppmu_rightbus {
+       status = "okay";
+
+       events {
+              ppmu_rightbus_3: ppmu-event3-rightbus {
+                      event-name = "ppmu-event3-rightbus";
+              };
+       };
+};
index 5d1eaea3f77806ae8c4d6a9b26893d3437ecdf9c..9336fd4824d9949db63f3b91b59e8dfb67e81d14 100644 (file)
@@ -14,6 +14,7 @@
 
 /dts-v1/;
 #include "exynos4412.dtsi"
+#include "exynos4412-ppmu-common.dtsi"
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/interrupt-controller/irq.h>
 #include <dt-bindings/clock/maxim,max77686.h>
        status = "okay";
 };
 
+&bus_dmc {
+       devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
+       vdd-supply = <&buck1_reg>;
+       status = "okay";
+};
+
+&bus_acp {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_c2c {
+       devfreq = <&bus_dmc>;
+       status = "okay";
+};
+
+&bus_leftbus {
+       devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
+       vdd-supply = <&buck3_reg>;
+       status = "okay";
+};
+
+&bus_rightbus {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_display {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_leftbus>;
+       status = "okay";
+};
+
 &cpu0 {
        cpu0-supply = <&buck2_reg>;
 };
        assigned-clock-parents =  <&clock CLK_XUSBXTI>;
 };
 
-&ppmu_dmc0 {
-       status = "okay";
-
-       events {
-               ppmu_dmc0_3: ppmu-event3-dmc0 {
-                       event-name = "ppmu-event3-dmc0";
-               };
-       };
-};
-
-&ppmu_dmc1 {
-       status = "okay";
-
-       events {
-               ppmu_dmc1_3: ppmu-event3-dmc1 {
-                       event-name = "ppmu-event3-dmc1";
-               };
-       };
-};
-
-&ppmu_leftbus {
-       status = "okay";
-
-       events {
-               ppmu_leftbus_3: ppmu-event3-leftbus {
-                       event-name = "ppmu-event3-leftbus";
-               };
-       };
-};
-
-&ppmu_rightbus {
-       status = "okay";
-
-       events {
-               ppmu_rightbus_3: ppmu-event3-rightbus {
-                       event-name = "ppmu-event3-rightbus";
-               };
-       };
-};
-
 &pinctrl_0 {
        pinctrl-names = "default";
        pinctrl-0 = <&sleep0>;
index b7490ea0c75cc536c89f5d8f014b6248d50ab0e6..c452499ae8c9a45a128e15dd80f9e29a439d3155 100644 (file)
                clocks = <&clock CLK_SMMU_LITE1>, <&clock CLK_FIMC_LITE1>;
                #iommu-cells = <0>;
        };
+
+       bus_dmc: bus_dmc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_DMC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_acp: bus_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_ACP>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_c2c: bus_c2c {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_C2C>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_dmc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_dmc_opp_table: opp_table1 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp@400000000 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <1050000>;
+               };
+       };
+
+       bus_acp_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@267000000 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_leftbus: bus_leftbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_rightbus: bus_rightbus {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DIV_GDR>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_display: bus_display {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK160>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_display_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK133>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_ACLK100>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_SCLK_MFC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_leftbus_opp_table>;
+               status = "disabled";
+       };
+
+       bus_leftbus_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+                       opp-microvolt = <900000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+                       opp-microvolt = <925000>;
+               };
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <1000000>;
+               };
+       };
+
+       bus_display_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@160000000 {
+                       opp-hz = /bits/ 64 <160000000>;
+               };
+               opp@200000000 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_fsys_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp@134000000 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp@50000000 {
+                       opp-hz = /bits/ 64 <50000000>;
+               };
+               opp@100000000 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
 };
 
 &combiner {
index 4c8523471c65e5701ac5ccd34bb748a029caa83e..c6e05eb88937d0c59bf09a2b546435937f723c1c 100644 (file)
                };
        };
 
+       nocp_mem0_0: nocp@10CA1000 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1000 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem0_1: nocp@10CA1400 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1400 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem1_0: nocp@10CA1800 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1800 0x200>;
+               status = "disabled";
+       };
+
+       nocp_mem1_1: nocp@10CA1C00 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x10CA1C00 0x200>;
+               status = "disabled";
+       };
+
+       nocp_g3d_0: nocp@11A51000 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x11A51000 0x200>;
+               status = "disabled";
+       };
+
+       nocp_g3d_1: nocp@11A51400 {
+               compatible = "samsung,exynos5420-nocp";
+               reg = <0x11A51400 0x200>;
+               status = "disabled";
+       };
+
        gsc_pd: power-domain@10044000 {
                compatible = "samsung,exynos4210-pd";
                reg = <0x10044000 0x20>;
                power-domains = <&disp_pd>;
                #iommu-cells = <0>;
        };
+
+       bus_wcore: bus_wcore {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_WCORE>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_wcore_opp_table>;
+               status = "disabled";
+       };
+
+       bus_noc: bus_noc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK100_NOC>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_noc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys_apb: bus_fsys_apb {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_PCLK200_FSYS>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys: bus_fsys {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK200_FSYS>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_fsys2: bus_fsys2 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK200_FSYS2>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_fsys2_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mfc: bus_mfc {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK333>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_mfc_opp_table>;
+               status = "disabled";
+       };
+
+       bus_gen: bus_gen {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK266>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_gen_opp_table>;
+               status = "disabled";
+       };
+
+       bus_peri: bus_peri {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK66>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_peri_opp_table>;
+               status = "disabled";
+       };
+
+       bus_g2d: bus_g2d {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK333_G2D>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_g2d_opp_table>;
+               status = "disabled";
+       };
+
+       bus_g2d_acp: bus_g2d_acp {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK266_G2D>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_g2d_acp_opp_table>;
+               status = "disabled";
+       };
+
+       bus_jpeg: bus_jpeg {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_JPEG>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_jpeg_opp_table>;
+               status = "disabled";
+       };
+
+       bus_jpeg_apb: bus_jpeg_apb {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK166>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_jpeg_apb_opp_table>;
+               status = "disabled";
+       };
+
+       bus_disp1_fimd: bus_disp1_fimd {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_DISP1>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_disp1_fimd_opp_table>;
+               status = "disabled";
+       };
+
+       bus_disp1: bus_disp1 {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_DISP1>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_disp1_opp_table>;
+               status = "disabled";
+       };
+
+       bus_gscl_scaler: bus_gscl_scaler {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK300_GSCL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_gscl_opp_table>;
+               status = "disabled";
+       };
+
+       bus_mscl: bus_mscl {
+               compatible = "samsung,exynos-bus";
+               clocks = <&clock CLK_DOUT_ACLK400_MSCL>;
+               clock-names = "bus";
+               operating-points-v2 = <&bus_mscl_opp_table>;
+               status = "disabled";
+       };
+
+       bus_wcore_opp_table: opp_table2 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+                       opp-microvolt = <925000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <333000000>;
+                       opp-microvolt = <950000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <987500>;
+               };
+       };
+
+       bus_noc_opp_table: opp_table3 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <86000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+       };
+
+       bus_fsys_apb_opp_table: opp_table4 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_fsys2_opp_table: opp_table5 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <100000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+       };
+
+       bus_mfc_opp_table: opp_table6 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <96000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+       };
+
+       bus_gen_opp_table: opp_table7 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <89000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <133000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <178000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_peri_opp_table: opp_table8 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+       };
+
+       bus_g2d_opp_table: opp_table9 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+       };
+
+       bus_g2d_acp_opp_table: opp_table10 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <67000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <133000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <178000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <267000000>;
+               };
+       };
+
+       bus_jpeg_opp_table: opp_table11 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <75000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_jpeg_apb_opp_table: opp_table12 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <111000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <134000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+       };
+
+       bus_disp1_fimd_opp_table: opp_table13 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <120000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+       };
+
+       bus_disp1_opp_table: opp_table14 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <120000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_gscl_opp_table: opp_table15 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <150000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <200000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <300000000>;
+               };
+       };
+
+       bus_mscl_opp_table: opp_table16 {
+               compatible = "operating-points-v2";
+
+               opp00 {
+                       opp-hz = /bits/ 64 <84000000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <167000000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <222000000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <333000000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <400000000>;
+               };
+       };
 };
 
 &dp {
index 20fa7612080d9f065685a9fde7aa140e90afa185..2a4e10bc88012cfd976a205f6c8f0b81f9c788ee 100644 (file)
        };
 };
 
+&bus_wcore {
+       devfreq-events = <&nocp_mem0_0>, <&nocp_mem0_1>,
+                       <&nocp_mem1_0>, <&nocp_mem1_1>;
+       vdd-supply = <&buck3_reg>;
+       exynos,saturation-ratio = <100>;
+       status = "okay";
+};
+
+&bus_noc {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys_apb {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_fsys2 {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_mfc {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_gen {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_peri {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_g2d {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_g2d_acp {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_jpeg {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_jpeg_apb {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_disp1_fimd {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_disp1 {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_gscl_scaler {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
+&bus_mscl {
+       devfreq = <&bus_wcore>;
+       status = "okay";
+};
+
 &clock_audss {
        assigned-clocks = <&clock_audss EXYNOS_MOUT_AUDSS>,
                        <&clock_audss EXYNOS_MOUT_I2S>,
        vqmmc-supply = <&ldo13_reg>;
 };
 
+&nocp_mem0_0 {
+       status = "okay";
+};
+
+&nocp_mem0_1 {
+       status = "okay";
+};
+
+&nocp_mem1_0 {
+       status = "okay";
+};
+
+&nocp_mem1_1 {
+       status = "okay";
+};
+
 &pinctrl_0 {
        hdmi_hpd_irq: hdmi-hpd-irq {
                samsung,pins = "gpx3-7";
diff --git a/arch/arm/boot/dts/imx7d-nitrogen7.dts b/arch/arm/boot/dts/imx7d-nitrogen7.dts
new file mode 100644 (file)
index 0000000..1ce9780
--- /dev/null
@@ -0,0 +1,745 @@
+/*
+ * Copyright 2016 Boundary Devices, Inc.
+ *
+ * This file is dual-licensed: you can use it either under the terms
+ * of the GPL or the X11 license, at your option. Note that this dual
+ * licensing only applies to this file, and not this project as a
+ * whole.
+ *
+ *  a) This file is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of the
+ *     License, or (at your option) any later version.
+ *
+ *     This file is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *     GNU General Public License for more details.
+ *
+ * Or, alternatively,
+ *
+ *  b) Permission is hereby granted, free of charge, to any person
+ *     obtaining a copy of this software and associated documentation
+ *     files (the "Software"), to deal in the Software without
+ *     restriction, including without limitation the rights to use,
+ *     copy, modify, merge, publish, distribute, sublicense, and/or
+ *     sell copies of the Software, and to permit persons to whom the
+ *     Software is furnished to do so, subject to the following
+ *     conditions:
+ *
+ *     The above copyright notice and this permission notice shall be
+ *     included in all copies or substantial portions of the Software.
+ *
+ *     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ *     OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *     HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ *     WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ *     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ *     OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/dts-v1/;
+
+#include <dt-bindings/input/input.h>
+#include "imx7d.dtsi"
+
+/ { /* board root node: identity, memory, backlights and fixed regulators */
+       model = "Boundary Devices i.MX7 Nitrogen7 Board";
+       compatible = "boundary,imx7d-nitrogen7", "fsl,imx7d";
+
+       aliases {
+               fb_lcd = &lcdif;
+               t_lcd = &t_lcd; /* the display timing node defined under &lcdif */
+       };
+
+       memory {
+               reg = <0x80000000 0x40000000>; /* base 0x80000000, size 0x40000000 - assumes one address cell and one size cell, TODO confirm in the dtsi */
+       };
+
+       backlight-j9 { /* GPIO-switched backlight, on by default */
+               compatible = "gpio-backlight";
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_backlight_j9>; /* group is defined under &iomuxc_lpsr */
+               gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>;
+               default-on;
+       };
+
+       backlight-j20 { /* PWM backlight driven by pwm1 */
+               compatible = "pwm-backlight";
+               pwms = <&pwm1 0 5000000>;
+               brightness-levels = <0 4 8 16 32 64 128 255>;
+               default-brightness-level = <6>; /* NOTE(review): if this is an index into brightness-levels it selects 128 - confirm against the pwm-backlight binding */
+               status = "okay";
+       };
+
+       reg_usb_otg1_vbus: regulator-usb-otg1-vbus { /* 5 V VBUS switch for usbotg1, active-high enable on gpio1 5 */
+               compatible = "regulator-fixed";
+               regulator-name = "usb_otg1_vbus";
+               regulator-min-microvolt = <5000000>;
+               regulator-max-microvolt = <5000000>;
+               gpio = <&gpio1 5 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+
+       reg_usb_otg2_vbus: regulator-usb-otg2-vbus { /* 5 V VBUS switch for usbotg2, active-high enable on gpio4 7 */
+               compatible = "regulator-fixed";
+               regulator-name = "usb_otg2_vbus";
+               regulator-min-microvolt = <5000000>;
+               regulator-max-microvolt = <5000000>;
+               gpio = <&gpio4 7 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+
+       reg_can2_3v3: regulator-can2-3v3 { /* 3.3 V supply referenced by flexcan2's xceiver-supply */
+               compatible = "regulator-fixed";
+               regulator-name = "can2-3v3";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+               gpio = <&gpio2 14 GPIO_ACTIVE_LOW>; /* no enable-active-high here, unlike the other GPIO-switched regulators in this node */
+       };
+
+       reg_vref_1v8: regulator-vref-1v8 { /* fixed 1.8 V with no control GPIO; vref-supply of adc1/adc2 */
+               compatible = "regulator-fixed";
+               regulator-name = "vref-1v8";
+               regulator-min-microvolt = <1800000>;
+               regulator-max-microvolt = <1800000>;
+       };
+
+       reg_vref_3v3: regulator-vref-3v3 { /* fixed 3.3 V with no control GPIO; lcd-supply of lcdif */
+               compatible = "regulator-fixed";
+               regulator-name = "vref-3v3";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+       };
+
+       reg_wlan: regulator-wlan { /* 3.3 V supply used as vmmc-supply of usdhc2 (the wl1271 slot) */
+               compatible = "regulator-fixed";
+               regulator-min-microvolt = <3300000>;
+               regulator-max-microvolt = <3300000>;
+               clocks = <&clks IMX7D_CLKO2_ROOT_DIV>; /* the clock that &clks assigns a 32768 rate to */
+               clock-names = "slow";
+               regulator-name = "reg_wlan";
+               startup-delay-us = <70000>; /* 70 ms */
+               gpio = <&gpio4 21 GPIO_ACTIVE_HIGH>;
+               enable-active-high;
+       };
+};
+
+&adc1 { /* enable ADC1 with the fixed 1.8 V reference from the root node */
+       vref-supply = <&reg_vref_1v8>;
+       status = "okay";
+};
+
+&adc2 { /* enable ADC2 with the same 1.8 V reference */
+       vref-supply = <&reg_vref_1v8>;
+       status = "okay";
+};
+
+&clks { /* board-level CLKO2 setup; CLKO2_ROOT_DIV is also the "slow" clock of reg_wlan */
+       assigned-clocks = <&clks IMX7D_CLKO2_ROOT_SRC>,
+                         <&clks IMX7D_CLKO2_ROOT_DIV>;
+       assigned-clock-parents = <&clks IMX7D_CKIL>; /* single entry: pairs with the first assigned clock (CLKO2_ROOT_SRC) */
+       assigned-clock-rates = <0>, <32768>; /* positional: 0 for CLKO2_ROOT_SRC, 32768 for CLKO2_ROOT_DIV */
+};
+
+&cpu0 { /* CPU core supply comes from the PMIC's sw1a regulator (defined under &i2c1) */
+       arm-supply = <&sw1a_reg>;
+};
+
+&fec1 { /* Ethernet MAC 1: RGMII link to the PHY declared in the mdio child below */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_enet1>;
+       assigned-clocks = <&clks IMX7D_ENET1_TIME_ROOT_SRC>,
+                         <&clks IMX7D_ENET1_TIME_ROOT_CLK>;
+       assigned-clock-parents = <&clks IMX7D_PLL_ENET_MAIN_100M_CLK>; /* single entry: pairs with ENET1_TIME_ROOT_SRC */
+       assigned-clock-rates = <0>, <100000000>; /* positional: 0 for the SRC entry, 100000000 for ENET1_TIME_ROOT_CLK */
+       phy-mode = "rgmii";
+       phy-handle = <&ethphy0>;
+       fsl,magic-packet;
+       status = "okay";
+
+       mdio {
+               #address-cells = <1>; /* children are addressed by a single cell ... */
+               #size-cells = <0>; /* ... with no size */
+
+               ethphy0: ethernet-phy@4 { /* PHY at MDIO address 4 */
+                       reg = <4>;
+               };
+       };
+};
+
+&flexcan2 { /* enable CAN2; its supply GPIO (gpio2 14) is muxed in pinctrl_flexcan2 */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_flexcan2>;
+       xceiver-supply = <&reg_can2_3v3>; /* GPIO-switched fixed regulator from the root node */
+       status = "okay";
+};
+
+&i2c1 { /* I2C1 carries the PFUZE3000 PMIC; labels defined here are referenced elsewhere in this file */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c1>;
+       status = "okay";
+
+       pmic: pfuze3000@8 { /* unit address written without a leading zero; it names the same address as reg below */
+               compatible = "fsl,pfuze3000";
+               reg = <0x08>;
+
+               regulators {
+                       sw1a_reg: sw1a { /* referenced as arm-supply by &cpu0 */
+                               regulator-min-microvolt = <700000>;
+                               regulator-max-microvolt = <1475000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                               regulator-ramp-delay = <6250>;
+                       };
+
+                       /* use sw1c_reg to align with pfuze100/pfuze200 */
+                       sw1c_reg: sw1b { /* label and node name differ on purpose, see the comment above */
+                               regulator-min-microvolt = <700000>;
+                               regulator-max-microvolt = <1475000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                               regulator-ramp-delay = <6250>;
+                       };
+
+                       sw2_reg: sw2 {
+                               regulator-min-microvolt = <1500000>;
+                               regulator-max-microvolt = <1850000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       sw3a_reg: sw3 {
+                               regulator-min-microvolt = <900000>;
+                               regulator-max-microvolt = <1650000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       swbst_reg: swbst { /* not marked boot-on or always-on */
+                               regulator-min-microvolt = <5000000>;
+                               regulator-max-microvolt = <5150000>;
+                       };
+
+                       snvs_reg: vsnvs {
+                               regulator-min-microvolt = <1000000>;
+                               regulator-max-microvolt = <3000000>;
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       vref_reg: vrefddr { /* no voltage range given, only boot-on/always-on */
+                               regulator-boot-on;
+                               regulator-always-on;
+                       };
+
+                       vgen1_reg: vldo1 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen2_reg: vldo2 {
+                               regulator-min-microvolt = <800000>;
+                               regulator-max-microvolt = <1550000>;
+                               regulator-always-on;
+                       };
+
+                       vgen3_reg: vccsd { /* referenced as vmmc-supply by &usdhc1 */
+                               regulator-min-microvolt = <2850000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen4_reg: v33 {
+                               regulator-min-microvolt = <2850000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen5_reg: vldo3 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+
+                       vgen6_reg: vldo4 {
+                               regulator-min-microvolt = <1800000>;
+                               regulator-max-microvolt = <3300000>;
+                               regulator-always-on;
+                       };
+               };
+       };
+};
+
+&i2c2 { /* I2C2 carries the RTC */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c2>;
+       status = "okay";
+
+       rtc@68 {
+               compatible = "microcrystal,rv4162"; /* vendor-prefixed; the part after the comma is still "rv4162" */
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_i2c2_rv4162>; /* muxes GPIO2_IO15, the interrupt line below */
+               reg = <0x68>;
+               interrupts-extended = <&gpio2 15 IRQ_TYPE_LEVEL_LOW>;
+       };
+};
+
+&i2c3 { /* I2C3 carries the TSC2004 touch controller */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c3>;
+       status = "okay";
+
+       touch@48 {
+               compatible = "ti,tsc2004";
+               reg = <0x48>;
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_i2c3_tsc2004>; /* muxes GPIO3_IO4 and GPIO5_IO10 */
+               interrupts-extended = <&gpio3 4 IRQ_TYPE_EDGE_FALLING>;
+               wakeup-gpios = <&gpio3 4 GPIO_ACTIVE_LOW>; /* same line (gpio3 4) as the interrupt above */
+       };
+};
+
+&i2c4 { /* I2C4 carries the WM8960 audio codec */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_i2c4>;
+       status = "okay";
+
+       codec: wm8960@1a {
+               compatible = "wlf,wm8960";
+               reg = <0x1a>;
+               clocks = <&clks IMX7D_AUDIO_MCLK_ROOT_CLK>; /* provided to the codec under the name "mclk" */
+               clock-names = "mclk";
+               wlf,shared-lrclk;
+       };
+};
+
+&lcdif { /* enable the LCD interface with one built-in 800x480 timing */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_lcdif_dat
+                    &pinctrl_lcdif_ctrl>;
+       lcd-supply = <&reg_vref_3v3>;
+       display = <&display0>; /* the child node below */
+       status = "okay";
+
+       display0: lcd-display {
+               bits-per-pixel = <16>;
+               bus-width = <18>; /* pinctrl_lcdif_dat muxes LCD_DATA0..23, more lines than this width */
+
+               display-timings {
+                       native-mode = <&t_lcd>; /* the only timing defined; also exported through the t_lcd alias */
+                       t_lcd: t_lcd_default {
+                               /* default to Okaya display */
+                               clock-frequency = <30000000>; /* NOTE(review): 30 MHz if the unit is Hz - confirm against the display-timing binding */
+                               hactive = <800>;
+                               vactive = <480>;
+                               hfront-porch = <40>;
+                               hback-porch = <40>;
+                               hsync-len = <48>;
+                               vback-porch = <29>;
+                               vfront-porch = <13>;
+                               vsync-len = <3>;
+                               hsync-active = <0>;
+                               vsync-active = <0>;
+                               de-active = <1>;
+                               pixelclk-active = <0>;
+                       };
+               };
+       };
+};
+
+&pwm1 { /* enabled; referenced by backlight-j20; pin group is defined under &iomuxc_lpsr */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_pwm1>;
+       status = "okay";
+};
+
+&pwm2 { /* enabled; pin group is defined under &iomuxc */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_pwm2>;
+       status = "okay";
+};
+
+&uart1 { /* enabled; clock source parented to the 24 MHz oscillator clock */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart1>;
+       assigned-clocks = <&clks IMX7D_UART1_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart2 { /* same setup as uart1 */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart2>;
+       assigned-clocks = <&clks IMX7D_UART2_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart3 { /* same clock setup; pinctrl_uart3 additionally muxes GPIO2_IO4 */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart3>;
+       assigned-clocks = <&clks IMX7D_UART3_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_OSC_24M_CLK>;
+       status = "okay";
+};
+
+&uart6 { /* differs from the others: 240M PLL parent and RTS/CTS flow control */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_uart6>; /* includes the CTS and RTS pads */
+       assigned-clocks = <&clks IMX7D_UART6_ROOT_SRC>;
+       assigned-clock-parents = <&clks IMX7D_PLL_SYS_MAIN_240M_CLK>;
+       fsl,uart-has-rtscts;
+       status = "okay";
+};
+
+&usbotg1 { /* enabled; no dr_mode is set here */
+       vbus-supply = <&reg_usb_otg1_vbus>; /* GPIO-switched 5 V regulator from the root node */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usbotg1>; /* group is defined under &iomuxc_lpsr */
+       status = "okay";
+};
+
+&usbotg2 { /* enabled and fixed to host mode */
+       vbus-supply = <&reg_usb_otg2_vbus>; /* GPIO-switched 5 V regulator from the root node */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usbotg2>;
+       dr_mode = "host";
+       status = "okay";
+};
+
+&usdhc1 { /* 4-bit slot with GPIO card detect */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc1>;
+       cd-gpios = <&gpio5 0 GPIO_ACTIVE_LOW>; /* GPIO5_IO0, muxed in pinctrl_usdhc1 */
+       vmmc-supply = <&vgen3_reg>; /* the PMIC's "vccsd" regulator */
+       bus-width = <4>;
+       fsl,tuning-step = <2>;
+       wakeup-source;
+       keep-power-in-suspend;
+       status = "okay";
+};
+
+&usdhc2 { /* non-removable 4-bit slot hosting the wl1271 child below */
+       #address-cells = <1>; /* needed for the child's single-cell reg ... */
+       #size-cells = <0>; /* ... which has no size */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc2>;
+       bus-width = <4>;
+       non-removable;
+       vmmc-supply = <&reg_wlan>; /* GPIO-switched regulator from the root node */
+       cap-power-off-card;
+       keep-power-in-suspend;
+       status = "okay";
+
+       wlcore: wlcore@2 {
+               compatible = "ti,wl1271";
+               reg = <2>;
+               interrupt-parent = <&gpio4>;
+               interrupts = <20 IRQ_TYPE_LEVEL_HIGH>; /* gpio4 line 20, muxed in pinctrl_usdhc2 */
+               ref-clock-frequency = <38400000>;
+       };
+};
+
+&usdhc3 { /* non-removable 8-bit device; NOTE(review): presumably the on-board eMMC - confirm */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_usdhc3>;
+       assigned-clocks = <&clks IMX7D_USDHC3_ROOT_CLK>;
+       assigned-clock-rates = <400000000>; /* rate requested for USDHC3_ROOT_CLK */
+       bus-width = <8>;
+       fsl,tuning-step = <2>;
+       non-removable;
+       status = "okay";
+};
+
+&wdog1 { /* enabled; its pin group is defined under &iomuxc_lpsr */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_wdog1>;
+       status = "okay";
+};
+
+&iomuxc { /* main pin controller: hog groups plus one group per peripheral enabled above */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_hog_1 &pinctrl_j2>; /* both groups are claimed by the pin controller itself */
+
+       pinctrl_hog_1: hoggrp-1 {
+               fsl,pins = <
+                       MX7D_PAD_SD3_RESET_B__GPIO6_IO11        0x5d
+                       MX7D_PAD_GPIO1_IO13__GPIO1_IO13         0x7d
+                       MX7D_PAD_ECSPI2_MISO__GPIO4_IO22        0x7d
+               >;
+       };
+
+       pinctrl_enet1: enet1grp { /* used by &fec1 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO10__ENET1_MDIO                 0x3
+                       MX7D_PAD_GPIO1_IO11__ENET1_MDC                  0x3
+                       MX7D_PAD_GPIO1_IO12__CCM_ENET_REF_CLK1          0x3
+                       MX7D_PAD_ENET1_RGMII_TXC__ENET1_RGMII_TXC       0x71
+                       MX7D_PAD_ENET1_RGMII_TD0__ENET1_RGMII_TD0       0x71
+                       MX7D_PAD_ENET1_RGMII_TD1__ENET1_RGMII_TD1       0x71
+                       MX7D_PAD_ENET1_RGMII_TD2__ENET1_RGMII_TD2       0x71
+                       MX7D_PAD_ENET1_RGMII_TD3__ENET1_RGMII_TD3       0x71
+                       MX7D_PAD_ENET1_RGMII_TX_CTL__ENET1_RGMII_TX_CTL 0x71
+                       MX7D_PAD_ENET1_RGMII_RXC__ENET1_RGMII_RXC       0x71
+                       MX7D_PAD_ENET1_RGMII_RD0__ENET1_RGMII_RD0       0x11
+                       MX7D_PAD_ENET1_RGMII_RD1__ENET1_RGMII_RD1       0x11
+                       MX7D_PAD_ENET1_RGMII_RD2__ENET1_RGMII_RD2       0x11
+                       MX7D_PAD_ENET1_RGMII_RD3__ENET1_RGMII_RD3       0x71
+                       MX7D_PAD_ENET1_RGMII_RX_CTL__ENET1_RGMII_RX_CTL 0x11
+                       MX7D_PAD_SD3_STROBE__GPIO6_IO10                 0x75 /* plain GPIO; no property in &fec1 references it */
+               >;
+       };
+
+       pinctrl_flexcan2: flexcan2grp { /* used by &flexcan2 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO14__FLEXCAN2_RX        0x7d
+                       MX7D_PAD_GPIO1_IO15__FLEXCAN2_TX        0x7d
+                       MX7D_PAD_EPDC_DATA14__GPIO2_IO14        0x7d /* control GPIO of reg_can2_3v3 */
+               >;
+       };
+
+       pinctrl_i2c1: i2c1grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C1_SDA__I2C1_SDA             0x4000007f
+                       MX7D_PAD_I2C1_SCL__I2C1_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c2: i2c2grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C2_SDA__I2C2_SDA             0x4000007f
+                       MX7D_PAD_I2C2_SCL__I2C2_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c2_rv4162: i2c2-rv4162grp { /* used by the rtc@68 node */
+               fsl,pins = <
+                       MX7D_PAD_EPDC_DATA15__GPIO2_IO15        0x7d /* RTC interrupt line (gpio2 15) */
+               >;
+       };
+
+       pinctrl_i2c3: i2c3grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C3_SDA__I2C3_SDA             0x4000007f
+                       MX7D_PAD_I2C3_SCL__I2C3_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_i2c3_tsc2004: i2c3tsc2004grp { /* used by the touch@48 node */
+               fsl,pins = <
+                       MX7D_PAD_LCD_RESET__GPIO3_IO4           0x79 /* touch interrupt / wakeup line (gpio3 4) */
+                       MX7D_PAD_SD2_WP__GPIO5_IO10             0x7d /* plain GPIO; no property in touch@48 references it */
+               >;
+       };
+
+       pinctrl_i2c4: i2c4grp {
+               fsl,pins = <
+                       MX7D_PAD_I2C4_SDA__I2C4_SDA             0x4000007f
+                       MX7D_PAD_I2C4_SCL__I2C4_SCL             0x4000007f
+               >;
+       };
+
+       pinctrl_j2: j2grp { /* hogged via pinctrl-0 above; every pad here is muxed as plain GPIO */
+               fsl,pins = <
+                       MX7D_PAD_SAI1_TX_DATA__GPIO6_IO15       0x7d
+                       MX7D_PAD_EPDC_BDR0__GPIO2_IO28          0x7d
+                       MX7D_PAD_SAI1_RX_DATA__GPIO6_IO12       0x7d
+                       MX7D_PAD_EPDC_BDR1__GPIO2_IO29          0x7d
+                       MX7D_PAD_SD1_WP__GPIO5_IO1              0x7d
+                       MX7D_PAD_EPDC_SDSHR__GPIO2_IO19         0x7d
+                       MX7D_PAD_SD1_RESET_B__GPIO5_IO2         0x7d
+                       MX7D_PAD_SD2_RESET_B__GPIO5_IO11        0x7d
+                       MX7D_PAD_EPDC_DATA07__GPIO2_IO7         0x7d
+                       MX7D_PAD_EPDC_DATA08__GPIO2_IO8         0x7d
+                       MX7D_PAD_EPDC_DATA09__GPIO2_IO9         0x7d
+                       MX7D_PAD_EPDC_DATA10__GPIO2_IO10        0x7d
+                       MX7D_PAD_EPDC_DATA11__GPIO2_IO11        0x7d
+                       MX7D_PAD_EPDC_DATA12__GPIO2_IO12        0x7d
+                       MX7D_PAD_SAI1_TX_SYNC__GPIO6_IO14       0x7d
+                       MX7D_PAD_EPDC_DATA13__GPIO2_IO13        0x7d
+                       MX7D_PAD_SAI1_TX_BCLK__GPIO6_IO13       0x7d
+                       MX7D_PAD_SD2_CD_B__GPIO5_IO9            0x7d
+                       MX7D_PAD_EPDC_GDCLK__GPIO2_IO24         0x7d
+                       MX7D_PAD_SAI2_RX_DATA__GPIO6_IO21       0x7d
+                       MX7D_PAD_EPDC_GDOE__GPIO2_IO25          0x7d
+                       MX7D_PAD_EPDC_GDRL__GPIO2_IO26          0x7d
+                       MX7D_PAD_SAI2_TX_DATA__GPIO6_IO22       0x7d
+                       MX7D_PAD_EPDC_SDCE0__GPIO2_IO20         0x7d
+                       MX7D_PAD_SAI2_TX_BCLK__GPIO6_IO20       0x7d
+                       MX7D_PAD_EPDC_SDCE1__GPIO2_IO21         0x7d
+                       MX7D_PAD_SAI2_TX_SYNC__GPIO6_IO19       0x7d
+                       MX7D_PAD_EPDC_SDCE2__GPIO2_IO22         0x7d
+                       MX7D_PAD_EPDC_SDCE3__GPIO2_IO23         0x7d
+                       MX7D_PAD_EPDC_GDSP__GPIO2_IO27          0x7d
+                       MX7D_PAD_EPDC_SDCLK__GPIO2_IO16         0x7d
+                       MX7D_PAD_EPDC_SDLE__GPIO2_IO17          0x7d
+                       MX7D_PAD_EPDC_SDOE__GPIO2_IO18          0x7d
+                       MX7D_PAD_EPDC_PWR_COM__GPIO2_IO30       0x7d
+                       MX7D_PAD_EPDC_PWR_STAT__GPIO2_IO31      0x7d
+               >;
+       };
+
+       pinctrl_lcdif_dat: lcdifdatgrp { /* used by &lcdif; all 24 data lines */
+               fsl,pins = <
+                       MX7D_PAD_LCD_DATA00__LCD_DATA0          0x79
+                       MX7D_PAD_LCD_DATA01__LCD_DATA1          0x79
+                       MX7D_PAD_LCD_DATA02__LCD_DATA2          0x79
+                       MX7D_PAD_LCD_DATA03__LCD_DATA3          0x79
+                       MX7D_PAD_LCD_DATA04__LCD_DATA4          0x79
+                       MX7D_PAD_LCD_DATA05__LCD_DATA5          0x79
+                       MX7D_PAD_LCD_DATA06__LCD_DATA6          0x79
+                       MX7D_PAD_LCD_DATA07__LCD_DATA7          0x79
+                       MX7D_PAD_LCD_DATA08__LCD_DATA8          0x79
+                       MX7D_PAD_LCD_DATA09__LCD_DATA9          0x79
+                       MX7D_PAD_LCD_DATA10__LCD_DATA10         0x79
+                       MX7D_PAD_LCD_DATA11__LCD_DATA11         0x79
+                       MX7D_PAD_LCD_DATA12__LCD_DATA12         0x79
+                       MX7D_PAD_LCD_DATA13__LCD_DATA13         0x79
+                       MX7D_PAD_LCD_DATA14__LCD_DATA14         0x79
+                       MX7D_PAD_LCD_DATA15__LCD_DATA15         0x79
+                       MX7D_PAD_LCD_DATA16__LCD_DATA16         0x79
+                       MX7D_PAD_LCD_DATA17__LCD_DATA17         0x79
+                       MX7D_PAD_LCD_DATA18__LCD_DATA18         0x79
+                       MX7D_PAD_LCD_DATA19__LCD_DATA19         0x79
+                       MX7D_PAD_LCD_DATA20__LCD_DATA20         0x79
+                       MX7D_PAD_LCD_DATA21__LCD_DATA21         0x79
+                       MX7D_PAD_LCD_DATA22__LCD_DATA22         0x79
+                       MX7D_PAD_LCD_DATA23__LCD_DATA23         0x79
+               >;
+       };
+
+       pinctrl_lcdif_ctrl: lcdifctrlgrp { /* used by &lcdif; clock, enable and sync lines */
+               fsl,pins = <
+                       MX7D_PAD_LCD_CLK__LCD_CLK               0x79
+                       MX7D_PAD_LCD_ENABLE__LCD_ENABLE         0x79
+                       MX7D_PAD_LCD_VSYNC__LCD_VSYNC           0x79
+                       MX7D_PAD_LCD_HSYNC__LCD_HSYNC           0x79
+               >;
+       };
+
+       pinctrl_pwm2: pwm2grp { /* used by &pwm2 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO09__PWM2_OUT           0x7d
+               >;
+       };
+
+       pinctrl_uart1: uart1grp {
+               fsl,pins = <
+                       MX7D_PAD_UART1_TX_DATA__UART1_DCE_TX    0x79
+                       MX7D_PAD_UART1_RX_DATA__UART1_DCE_RX    0x79
+               >;
+       };
+
+       pinctrl_uart2: uart2grp {
+               fsl,pins = <
+                       MX7D_PAD_UART2_TX_DATA__UART2_DCE_TX    0x79
+                       MX7D_PAD_UART2_RX_DATA__UART2_DCE_RX    0x79
+               >;
+       };
+
+       pinctrl_uart3: uart3grp {
+               fsl,pins = <
+                       MX7D_PAD_UART3_TX_DATA__UART3_DCE_TX    0x79
+                       MX7D_PAD_UART3_RX_DATA__UART3_DCE_RX    0x79
+                       MX7D_PAD_EPDC_DATA04__GPIO2_IO4         0x7d /* plain GPIO; no property in &uart3 references it */
+               >;
+       };
+
+       pinctrl_uart6: uart6grp { /* includes CTS/RTS, matching fsl,uart-has-rtscts in &uart6 */
+               fsl,pins = <
+                       MX7D_PAD_ECSPI1_MOSI__UART6_DCE_TX      0x79
+                       MX7D_PAD_ECSPI1_SCLK__UART6_DCE_RX      0x79
+                       MX7D_PAD_ECSPI1_SS0__UART6_DCE_CTS      0x79
+                       MX7D_PAD_ECSPI1_MISO__UART6_DCE_RTS     0x79
+               >;
+       };
+
+       pinctrl_usbotg2: usbotg2grp { /* used by &usbotg2 */
+               fsl,pins = <
+                       MX7D_PAD_UART3_RTS_B__USB_OTG2_OC       0x7d
+                       MX7D_PAD_UART3_CTS_B__GPIO4_IO7         0x14 /* enable GPIO of reg_usb_otg2_vbus */
+               >;
+       };
+
+       pinctrl_usdhc1: usdhc1grp { /* used by &usdhc1 */
+               fsl,pins = <
+                       MX7D_PAD_SD1_CMD__SD1_CMD               0x59
+                       MX7D_PAD_SD1_CLK__SD1_CLK               0x19
+                       MX7D_PAD_SD1_DATA0__SD1_DATA0           0x59
+                       MX7D_PAD_SD1_DATA1__SD1_DATA1           0x59
+                       MX7D_PAD_SD1_DATA2__SD1_DATA2           0x59
+                       MX7D_PAD_SD1_DATA3__SD1_DATA3           0x59
+                       MX7D_PAD_GPIO1_IO08__SD1_VSELECT        0x75
+                       MX7D_PAD_SD1_CD_B__GPIO5_IO0            0x75 /* card-detect GPIO (cd-gpios = gpio5 0) */
+               >;
+       };
+
+       pinctrl_usdhc2: usdhc2grp { /* used by &usdhc2 */
+               fsl,pins = <
+                       MX7D_PAD_SD2_CMD__SD2_CMD               0x59
+                       MX7D_PAD_SD2_CLK__SD2_CLK               0x19
+                       MX7D_PAD_SD2_DATA0__SD2_DATA0           0x59
+                       MX7D_PAD_SD2_DATA1__SD2_DATA1           0x59
+                       MX7D_PAD_SD2_DATA2__SD2_DATA2           0x59
+                       MX7D_PAD_SD2_DATA3__SD2_DATA3           0x59
+                       MX7D_PAD_ECSPI2_SCLK__GPIO4_IO20        0x59 /* wlcore interrupt line (gpio4 20) */
+                       MX7D_PAD_ECSPI2_MOSI__GPIO4_IO21        0x59 /* enable GPIO of reg_wlan (gpio4 21) */
+               >;
+       };
+
+       pinctrl_usdhc3: usdhc3grp { /* used by &usdhc3; eight data lines for bus-width = 8 */
+               fsl,pins = <
+                       MX7D_PAD_SD3_CMD__SD3_CMD               0x59
+                       MX7D_PAD_SD3_CLK__SD3_CLK               0x19
+                       MX7D_PAD_SD3_DATA0__SD3_DATA0           0x59
+                       MX7D_PAD_SD3_DATA1__SD3_DATA1           0x59
+                       MX7D_PAD_SD3_DATA2__SD3_DATA2           0x59
+                       MX7D_PAD_SD3_DATA3__SD3_DATA3           0x59
+                       MX7D_PAD_SD3_DATA4__SD3_DATA4           0x59
+                       MX7D_PAD_SD3_DATA5__SD3_DATA5           0x59
+                       MX7D_PAD_SD3_DATA6__SD3_DATA6           0x59
+                       MX7D_PAD_SD3_DATA7__SD3_DATA7           0x59
+               >;
+       };
+};
+
+&iomuxc_lpsr { /* second pin controller: holds the groups for the GPIO1_IO00..IO07 pads */
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_hog_2>; /* claimed by the pin controller itself */
+
+       pinctrl_hog_2: hoggrp-2 {
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO02__GPIO1_IO2          0x7d
+                       MX7D_PAD_GPIO1_IO03__CCM_CLKO2          0x7d /* CLKO2 output; &clks assigns 32768 to CLKO2_ROOT_DIV */
+               >;
+       };
+
+       pinctrl_backlight_j9: backlightj9grp { /* used by the backlight-j9 node */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO07__GPIO1_IO7          0x7d /* backlight GPIO (gpio1 7) */
+               >;
+       };
+
+       pinctrl_pwm1: pwm1grp { /* used by &pwm1 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO01__PWM1_OUT           0x7d
+               >;
+       };
+
+       pinctrl_usbotg1: usbotg1grp { /* used by &usbotg1 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO04__USB_OTG1_OC        0x7d
+                       MX7D_PAD_GPIO1_IO05__GPIO1_IO5          0x14 /* enable GPIO of reg_usb_otg1_vbus */
+               >;
+       };
+
+       pinctrl_wdog1: wdog1grp { /* used by &wdog1 */
+               fsl,pins = <
+                       MX7D_PAD_GPIO1_IO00__WDOD1_WDOG_B       0x75 /* NOTE(review): "WDOD1" spelling must match the macro in the pinfunc header - do not rename here without checking it */
+               >;
+       };
+};
index b5a50e0e7ff1980523e6567934135b7aad7a73d9..6b3faa298417dcf4e084279f204e6ae97f0019bc 100644 (file)
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+
+                       lcdif: lcdif@30730000 { /* LCD interface, disabled unless a board overrides status */
+                               compatible = "fsl,imx7d-lcdif", "fsl,imx28-lcdif";
+                               reg = <0x30730000 0x10000>;
+                               interrupts = <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>,
+                                       <&clks IMX7D_CLK_DUMMY>, /* "axi" is wired to the dummy clock */
+                                       <&clks IMX7D_CLK_DUMMY>; /* "disp_axi" is wired to the dummy clock */
+                               clock-names = "pix", "axi", "disp_axi";
+                               status = "disabled";
+                       };
                };
 
                aips3: aips-bus@30800000 {
                                status = "disabled";
                        };
 
+                       flexcan1: can@30a00000 { /* CAN controller 1, disabled unless a board overrides status */
+                               compatible = "fsl,imx7d-flexcan", "fsl,imx6q-flexcan";
+                               reg = <0x30a00000 0x10000>;
+                               interrupts = <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_CLK_DUMMY>, /* "ipg" is wired to the dummy clock */
+                                       <&clks IMX7D_CAN1_ROOT_CLK>; /* "per" */
+                               clock-names = "ipg", "per";
+                               status = "disabled";
+                       };
+
+                       flexcan2: can@30a10000 { /* CAN controller 2, disabled unless a board overrides status */
+                               compatible = "fsl,imx7d-flexcan", "fsl,imx6q-flexcan";
+                               reg = <0x30a10000 0x10000>;
+                               interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks IMX7D_CLK_DUMMY>, /* "ipg" is wired to the dummy clock */
+                                       <&clks IMX7D_CAN2_ROOT_CLK>; /* "per" */
+                               clock-names = "ipg", "per";
+                               status = "disabled";
+                       };
+
                        i2c1: i2c@30a20000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
index 0c82097daffcdd8bd3ac5c0840c7cae0a039d277..b9bbcce69dfbd5b9efda7ae20a83752cf459c925 100644 (file)
@@ -14,6 +14,7 @@
 #include <dt-bindings/clock/r8a7779-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7779-sysc.h>
 
 / {
        compatible = "renesas,r8a7779";
                        compatible = "arm,cortex-a9";
                        reg = <1>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM1>;
                };
                cpu@2 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <2>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM2>;
                };
                cpu@3 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <3>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7779_PD_ARM3>;
                };
        };
 
                reg = <0xffc70000 0x1000>;
                interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc71000 0x1000>;
                interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc72000 0x1000>;
                interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffc73000 0x1000>;
                interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF0>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF1>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF2>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF3>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF4>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp0_clks R8A7779_CLK_SCIF5>,
                         <&cpg_clocks R8A7779_CLK_S1>, <&scif_clk>;
                clock-names = "fck", "brg_int", "scif_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                             <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                             <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                             <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp0_clks R8A7779_CLK_TMU2>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
 
                #renesas,channels = <3>;
 
                reg = <0xfc600000 0x2000>;
                interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7779_CLK_SATA>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
        };
 
        sdhi0: sd@ffe4c000 {
                reg = <0xffe4c000 0x100>;
                interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4d000 0x100>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4e000 0x100>;
                interrupts = <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0xffe4f000 0x100>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7779_CLK_SDHI3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7779_CLK_HSPI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfff80000 0 0x40000>;
                interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7779_CLK_DU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7779_PD_ALWAYS_ON>;
                status = "disabled";
 
                ports {
                                "mmc1", "mmc0";
                };
        };
+
+       sysc: system-controller@ffd85000 {
+               compatible = "renesas,r8a7779-sysc";
+               reg = <0xffd85000 0x0200>;
+               #power-domain-cells = <1>;
+       };
 };
index 935064fe7b13497bcc2a7e5f59e3831d02d2dfcd..83cf23cd26bb51e6632cffb9e87a12cdc2ea442f 100644 (file)
@@ -13,6 +13,7 @@
 #include <dt-bindings/clock/r8a7790-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7790-sysc.h>
 
 / {
        compatible = "renesas,r8a7790";
@@ -52,6 +53,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7790_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU0>;
                        next-level-cache = <&L2_CA15>;
 
                        /* kHz - uV - OPPs unknown yet */
@@ -68,6 +70,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <1>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU1>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -76,6 +79,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <2>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU2>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -84,6 +88,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <3>;
                        clock-frequency = <1300000000>;
+                       power-domains = <&sysc R8A7790_PD_CA15_CPU3>;
                        next-level-cache = <&L2_CA15>;
                };
 
@@ -92,6 +97,7 @@
                        compatible = "arm,cortex-a7";
                        reg = <0x100>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU0>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x101>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU1>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x102>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU2>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <0x103>;
                        clock-frequency = <780000000>;
+                       power-domains = <&sysc R8A7790_PD_CA7_CPU3>;
                        next-level-cache = <&L2_CA7>;
                };
        };
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7790_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
 
        L2_CA7: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7790_PD_CA7_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7790_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7790_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7790_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7790_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7790_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7790_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                              GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7790_CLK_USBDMAC0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                              GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7790_CLK_USBDMAC1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7790_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp3_clks R8A7790_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_IIC2>;
                dmas = <&dmac0 0x69>, <&dmac0 0x6a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7790_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7790_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                clocks = <&mstp3_clks R8A7790_CLK_MMCIF1>;
                dmas = <&dmac0 0xe1>, <&dmac0 0xe2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                dmas = <&dmac1 0xcd>, <&dmac1 0xce>;
                dma-names = "tx", "rx";
                max-frequency = <195000000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xc9>, <&dmac1 0xca>;
                dma-names = "tx", "rx";
                max-frequency = <195000000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xc1>, <&dmac1 0xc2>;
                dma-names = "tx", "rx";
                max-frequency = <97500000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&dmac1 0xd3>, <&dmac1 0xd4>;
                dma-names = "tx", "rx";
                max-frequency = <97500000>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee300000 0 0x2000>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_SATA0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee500000 0 0x2000>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_SATA1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&usb_dmac0 0>, <&usb_dmac0 1>,
                       <&usb_dmac1 0>, <&usb_dmac1 1>;
                dma-names = "ch0", "ch1", "ch2", "ch3";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef2000 0 0x1000>;
                interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef3000 0 0x1000>;
                interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7790_CLK_VIN3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe920000 0 0x8000>;
                interrupts = <GIC_SPI 266 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_R>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-sru;
                renesas,#rpf = <5>;
                reg = <0 0xfe928000 0 0x8000>;
                interrupts = <GIC_SPI 267 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_S>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lut;
                renesas,has-sru;
                reg = <0 0xfe930000 0 0x8000>;
                interrupts = <GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_DU0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                reg = <0 0xfe938000 0 0x8000>;
                interrupts = <GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_VSP1_DU1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                clocks = <&mstp9_clks R8A7790_CLK_RCAN0>,
                         <&cpg_clocks R8A7790_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7790_CLK_RCAN1>,
                         <&cpg_clocks R8A7790_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe980000 0 0x10300>;
                interrupts = <GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7790_CLK_JPU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
        };
 
        clocks {
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7790-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        qspi: spi@e6b10000 {
                compatible = "renesas,qspi-r8a7790", "renesas,qspi";
                reg = <0 0xe6b10000 0 0x2c>;
                clocks = <&mstp9_clks R8A7790_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7790_CLK_MSIOF0>;
                dmas = <&dmac0 0x51>, <&dmac0 0x52>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF1>;
                dmas = <&dmac0 0x55>, <&dmac0 0x56>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF2>;
                dmas = <&dmac0 0x41>, <&dmac0 0x42>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7790_CLK_MSIOF3>;
                dmas = <&dmac0 0x45>, <&dmac0 0x46>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee000000 0 0xc00>;
                interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_SSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                phys = <&usb2 1>;
                phy-names = "usb";
                status = "disabled";
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0a0000 0 0x1100>;
                interrupts = <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                compatible = "renesas,pci-r8a7790", "renesas,pci-rcar-gen2";
                device_type = "pci";
                clocks = <&mstp7_clks R8A7790_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                reg = <0 0xee0d0000 0 0xc00>,
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7790_CLK_PCIEC>, <&pcie_bus_clk>;
                clock-names = "pcie", "pcie_bus";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                                "mix.0", "mix.1",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index 565c270e549d2bb603d5f87384cdcb417161ab30..db67e342c58566fac210e1ccaa849259ec35090a 100644 (file)
@@ -13,6 +13,7 @@
 #include <dt-bindings/clock/r8a7791-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7791-sysc.h>
 
 / {
        compatible = "renesas,r8a7791";
@@ -51,6 +52,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7791_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7791_PD_CA15_CPU0>;
                        next-level-cache = <&L2_CA15>;
 
                        /* kHz - uV - OPPs unknown yet */
@@ -67,6 +69,7 @@
                        compatible = "arm,cortex-a15";
                        reg = <1>;
                        clock-frequency = <1500000000>;
+                       power-domains = <&sysc R8A7791_PD_CA15_CPU1>;
                        next-level-cache = <&L2_CA15>;
                };
        };
@@ -92,6 +95,7 @@
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7791_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        gpio7: gpio@e6055800 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7791_CLK_GPIO7>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7791_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7791_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7791_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7791_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7791_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7791_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                              GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7791_CLK_USBDMAC0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                              GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                interrupt-names = "ch0", "ch1";
                clocks = <&mstp3_clks R8A7791_CLK_USBDMAC1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <2>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7791_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp9_clks R8A7791_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
                max-frequency = <97500000>;
                clocks = <&mstp3_clks R8A7791_CLK_SDHI0>;
                dmas = <&dmac1 0xcd>, <&dmac1 0xce>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_SDHI1>;
                dmas = <&dmac1 0xc1>, <&dmac1 0xc2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7791_CLK_SDHI2>;
                dmas = <&dmac1 0xd3>, <&dmac1 0xd4>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee300000 0 0x2000>;
                interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_SATA0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee500000 0 0x2000>;
                interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_SATA1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                dmas = <&usb_dmac0 0>, <&usb_dmac0 1>,
                       <&usb_dmac1 0>, <&usb_dmac1 1>;
                dma-names = "ch0", "ch1", "ch2", "ch3";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7791_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef2000 0 0x1000>;
                interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7791_CLK_VIN2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe928000 0 0x8000>;
                interrupts = <GIC_SPI 267 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_S>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lut;
                renesas,has-sru;
                reg = <0 0xfe930000 0 0x8000>;
                interrupts = <GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_DU0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                reg = <0 0xfe938000 0 0x8000>;
                interrupts = <GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_VSP1_DU1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                renesas,has-lif;
                renesas,has-lut;
                clocks = <&mstp9_clks R8A7791_CLK_RCAN0>,
                         <&cpg_clocks R8A7791_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7791_CLK_RCAN1>,
                         <&cpg_clocks R8A7791_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xfe980000 0 0x10300>;
                interrupts = <GIC_SPI 272 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7791_CLK_JPU>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
        };
 
        clocks {
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7791-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        qspi: spi@e6b10000 {
                compatible = "renesas,qspi-r8a7791", "renesas,qspi";
                reg = <0 0xe6b10000 0 0x2c>;
                clocks = <&mstp9_clks R8A7791_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp0_clks R8A7791_CLK_MSIOF0>;
                dmas = <&dmac0 0x51>, <&dmac0 0x52>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7791_CLK_MSIOF1>;
                dmas = <&dmac0 0x55>, <&dmac0 0x56>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp2_clks R8A7791_CLK_MSIOF2>;
                dmas = <&dmac0 0x41>, <&dmac0 0x42>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xee000000 0 0xc00>;
                interrupts = <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_SSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                phys = <&usb2 1>;
                phy-names = "usb";
                status = "disabled";
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7791_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7791_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7791_CLK_PCIEC>, <&pcie_bus_clk>;
                clock-names = "pcie", "pcie_bus";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                                "mix.0", "mix.1",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index cf6dc2aeef201ab69b9ce589bae18bcd7184ed68..1dd6d202cd4ce54579edf79f90971dd3f91f08bd 100644 (file)
@@ -11,6 +11,7 @@
 #include <dt-bindings/clock/r8a7793-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7793-sysc.h>
 
 / {
        compatible = "renesas,r8a7793";
@@ -43,6 +44,7 @@
                        voltage-tolerance = <1>; /* 1% */
                        clocks = <&cpg_clocks R8A7793_CLK_Z>;
                        clock-latency = <300000>; /* 300 us */
+                       power-domains = <&sysc R8A7793_PD_CA15_CPU0>;
 
                        /* kHz - uV - OPPs unknown yet */
                        operating-points = <1500000 1000000>,
@@ -76,6 +78,7 @@
 
        L2_CA15: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7793_PD_CA15_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        gpio7: gpio@e6055800 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7793_CLK_GPIO7>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        thermal: thermal@e61f0000 {
                reg = <0 0xe61f0000 0 0x14>, <0 0xe61f0100 0 0x38>;
                interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp5_clks R8A7793_CLK_THERMAL>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #thermal-sensor-cells = <0>;
        };
 
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7793_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7793_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7793_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
        };
 
        dmac0: dma-controller@e6700000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7793_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7793_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7793_CLK_AUDIO_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                                "ch12";
                clocks = <&mstp5_clks R8A7793_CLK_AUDIO_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <13>;
        };
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <6>;
                status = "disabled";
        };
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7793_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                i2c-scl-internal-delay-ns = <110>;
                status = "disabled";
        };
                clocks = <&mstp9_clks R8A7793_CLK_IICDVFS>;
                dmas = <&dmac0 0x77>, <&dmac0 0x78>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI0>;
                dmas = <&dmac0 0xcd>, <&dmac0 0xce>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI1>;
                dmas = <&dmac0 0xc1>, <&dmac0 0xc2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp3_clks R8A7793_CLK_SDHI2>;
                dmas = <&dmac0 0xd3>, <&dmac0 0xd4>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7793_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp9_clks R8A7793_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                clocks = <&mstp9_clks R8A7793_CLK_RCAN0>,
                         <&cpg_clocks R8A7793_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7793_CLK_RCAN1>,
                         <&cpg_clocks R8A7793_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7793-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        ipmmu_sy0: mmu@e6280000 {
                compatible = "renesas,ipmmu-r8a7793", "renesas,ipmmu-vmsa";
                reg = <0 0xe6280000 0 0x1000>;
                                "src.4", "src.3", "src.2", "src.1", "src.0",
                                "dvc.0", "dvc.1",
                                "clk_a", "clk_b", "clk_c", "clk_i";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7793_PD_ALWAYS_ON>;
 
                status = "disabled";
 
index e45b23f3114954fff02d6ca6baa9764b49420597..f334a3a715f27d127aa4ebbe7c6ef8281bb30b1f 100644 (file)
@@ -12,6 +12,7 @@
 #include <dt-bindings/clock/r8a7794-clock.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/power/r8a7794-sysc.h>
 
 / {
        compatible = "renesas,r8a7794";
@@ -42,6 +43,7 @@
                        compatible = "arm,cortex-a7";
                        reg = <0>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7794_PD_CA7_CPU0>;
                        next-level-cache = <&L2_CA7>;
                };
 
                        compatible = "arm,cortex-a7";
                        reg = <1>;
                        clock-frequency = <1000000000>;
+                       power-domains = <&sysc R8A7794_PD_CA7_CPU1>;
                        next-level-cache = <&L2_CA7>;
                };
        };
 
        L2_CA7: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7794_PD_CA7_SCU>;
                cache-unified;
                cache-level = <2>;
        };
@@ -82,7 +86,7 @@
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio1: gpio@e6051000 {
@@ -95,7 +99,7 @@
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio2: gpio@e6052000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio3: gpio@e6053000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio4: gpio@e6054000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio5: gpio@e6055000 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        gpio6: gpio@e6055400 {
                #interrupt-cells = <2>;
                interrupt-controller;
                clocks = <&mstp9_clks R8A7794_CLK_GPIO6>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        cmt0: timer@ffca0000 {
                             <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp1_clks R8A7794_CLK_CMT0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0x60>;
 
                             <GIC_SPI 127 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_CMT1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
 
                renesas,channels-mask = <0xff>;
 
                             <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>,
                             <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp4_clks R8A7794_CLK_IRQC>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
        };
 
        pfc: pin-controller@e6060000 {
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7794_CLK_SYS_DMAC0>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                                "ch12", "ch13", "ch14";
                clocks = <&mstp2_clks R8A7794_CLK_SYS_DMAC1>;
                clock-names = "fck";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #dma-cells = <1>;
                dma-channels = <15>;
        };
                clock-names = "fck";
                dmas = <&dmac0 0x21>, <&dmac0 0x22>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x25>, <&dmac0 0x26>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x27>, <&dmac0 0x28>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1b>, <&dmac0 0x1c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1f>, <&dmac0 0x20>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x23>, <&dmac0 0x24>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x3d>, <&dmac0 0x3e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x19>, <&dmac0 0x1a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck";
                dmas = <&dmac0 0x1d>, <&dmac0 0x1e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x29>, <&dmac0 0x2a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2d>, <&dmac0 0x2e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2b>, <&dmac0 0x2c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x2f>, <&dmac0 0x30>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfb>, <&dmac0 0xfc>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0xfd>, <&dmac0 0xfe>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x39>, <&dmac0 0x3a>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x4d>, <&dmac0 0x4e>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clock-names = "fck", "brg_int", "scif_clk";
                dmas = <&dmac0 0x3b>, <&dmac0 0x3c>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee700000 0 0x400>;
                interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_ETHER>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                phy-mode = "rmii";
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6800000 0 0x800>, <0 0xee0e8000 0 0x4000>;
                interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_ETHERAVB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                reg = <0 0xe6508000 0 0x40>;
                interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6518000 0 0x40>;
                interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6530000 0 0x40>;
                interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6540000 0 0x40>;
                interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C3>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6520000 0 0x40>;
                interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C4>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                reg = <0 0xe6528000 0 0x40>;
                interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp9_clks R8A7794_CLK_I2C5>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                i2c-scl-internal-delay-ns = <6>;
                clocks = <&mstp3_clks R8A7794_CLK_IIC0>;
                dmas = <&dmac0 0x61>, <&dmac0 0x62>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp3_clks R8A7794_CLK_IIC1>;
                dmas = <&dmac0 0x65>, <&dmac0 0x66>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                #address-cells = <1>;
                #size-cells = <0>;
                status = "disabled";
                clocks = <&mstp3_clks R8A7794_CLK_MMCIF0>;
                dmas = <&dmac0 0xd1>, <&dmac0 0xd2>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                reg-io-width = <4>;
                status = "disabled";
        };
                reg = <0 0xee100000 0 0x200>;
                interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee140000 0 0x100>;
                interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xee160000 0 0x100>;
                interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp3_clks R8A7794_CLK_SDHI2>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7794_CLK_QSPI_MOD>;
                dmas = <&dmac0 0x17>, <&dmac0 0x18>;
                dma-names = "tx", "rx";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                num-cs = <1>;
                #address-cells = <1>;
                #size-cells = <0>;
                reg = <0 0xe6ef0000 0 0x1000>;
                interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_VIN0>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                reg = <0 0xe6ef1000 0 0x1000>;
                interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp8_clks R8A7794_CLK_VIN1>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                      <0 0xee080000 0 0x1100>;
                interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <0 0>;
                      <0 0xee0c0000 0 0x1100>;
                interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_EHCI>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                bus-range = <1 1>;
                reg = <0 0xe6590000 0 0x100>;
                interrupts = <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&mstp7_clks R8A7794_CLK_HSUSB>;
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                renesas,buswait = <4>;
                phys = <&usb0 1>;
                phy-names = "usb";
                #size-cells = <0>;
                clocks = <&mstp7_clks R8A7794_CLK_HSUSB>;
                clock-names = "usbhs";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
 
                usb0: usb-channel@0 {
                clocks = <&mstp9_clks R8A7794_CLK_RCAN0>,
                         <&cpg_clocks R8A7794_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                clocks = <&mstp9_clks R8A7794_CLK_RCAN1>,
                         <&cpg_clocks R8A7794_CLK_RCAN>, <&can_clk>;
                clock-names = "clkp1", "clkp2", "can_clk";
-               power-domains = <&cpg_clocks>;
+               power-domains = <&sysc R8A7794_PD_ALWAYS_ON>;
                status = "disabled";
        };
 
                };
        };
 
+       sysc: system-controller@e6180000 {
+               compatible = "renesas,r8a7794-sysc";
+               reg = <0 0xe6180000 0 0x0200>;
+               #power-domain-cells = <1>;
+       };
+
        ipmmu_sy0: mmu@e6280000 {
                compatible = "renesas,ipmmu-r8a7794", "renesas,ipmmu-vmsa";
                reg = <0 0xe6280000 0 0x1000>;
index a99f07ad6312d62682fe0893acaaaf57f6e6bcc2..941f36263c8ff97f21a1703ac58fcf24b0599896 100644 (file)
                vddio-pex-ctl-supply = <&vdd_3v3_lp0>;
                avdd-pll-erefe-supply = <&avdd_1v05_run>;
 
+               /* Mini PCIe */
                pci@1,0 {
+                       phys = <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-4}>;
+                       phy-names = "pcie-0";
                        status = "okay";
                };
 
+               /* Gigabit Ethernet */
                pci@2,0 {
+                       phys = <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-2}>;
+                       phy-names = "pcie-0";
                        status = "okay";
                };
        };
        sata@0,70020000 {
                status = "okay";
 
+               phys = <&{/padctl@0,7009f000/pads/sata/lanes/sata-0}>;
+               phy-names = "sata-0";
+
                hvdd-supply = <&vdd_3v3_lp0>;
                vddio-supply = <&vdd_1v05_run>;
                avdd-supply = <&vdd_1v05_run>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* Micro A/B */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Mini PCIe */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* USB3 */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>; /* USB3 */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
        padctl@0,7009f000 {
-               pinctrl-0 = <&padctl_default>;
-               pinctrl-names = "default";
+               status = "okay";
 
-               padctl_default: pinmux {
-                       usb3 {
-                               nvidia,lanes = "pcie-0", "pcie-1";
-                               nvidia,function = "usb3";
-                               nvidia,iddq = <0>;
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
                        };
 
                        pcie {
-                               nvidia,lanes = "pcie-2", "pcie-3",
-                                              "pcie-4";
-                               nvidia,function = "pcie";
-                               nvidia,iddq = <0>;
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-2 {
+                                               nvidia,function = "pcie";
+                                               status = "okay";
+                                       };
+
+                                       pcie-4 {
+                                               nvidia,function = "pcie";
+                                               status = "okay";
+                                       };
+                               };
                        };
 
                        sata {
-                               nvidia,lanes = "sata-0";
-                               nvidia,function = "sata";
-                               nvidia,iddq = <0>;
+                               status = "okay";
+
+                               lanes {
+                                       sata-0 {
+                                               nvidia,function = "sata";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       /* Micro A/B */
+                       usb2-0 {
+                               status = "okay";
+                               mode = "otg";
+                       };
+
+                       /* Mini PCIe */
+                       usb2-1 {
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       /* USB3 */
+                       usb2-2 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_usb3_vbus>;
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <2>;
+                               status = "okay";
                        };
                };
        };
index 5f1fc1410bd0dc4461caff4f7ff976cc81f8e22a..0710a600cc69306ab1dc4619f0b8237851eba343 100644 (file)
                                        regulator-always-on;
                                };
 
-                               ldo0 {
+                               avdd_1v05_run: ldo0 {
                                        regulator-name = "+1.05V_RUN_AVDD";
                                        regulator-min-microvolt = <1050000>;
                                        regulator-max-microvolt = <1050000>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Internal USB */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* 2nd USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-1}>; /* 2nd USB A */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0", "usb3-1";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
+       padctl@0,7009f000 {
+               status = "okay";
+
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+
+                       pcie {
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-1 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               vbus-supply = <&vdd_usb1_vbus>;
+                               status = "okay";
+                               mode = "otg";
+                       };
+
+                       usb2-1 {
+                               vbus-supply = <&vdd_run_cam>;
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       usb2-2 {
+                               vbus-supply = <&vdd_usb3_vbus>;
+                               status = "okay";
+                               mode = "host";
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <0>;
+                               status = "okay";
+                       };
+
+                       usb3-1 {
+                               nvidia,usb2-companion = <1>;
+                               status = "okay";
+                       };
+               };
+       };
+
        sdhci0_pwrseq: sdhci0_pwrseq {
                compatible = "mmc-pwrseq-simple";
 
                };
        };
 
-       usb@0,7d000000 { /* Rear external USB port. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d000000 {
-               status = "okay";
-               vbus-supply = <&vdd_usb1_vbus>;
-       };
-
-       usb@0,7d004000 { /* Internal webcam. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d004000 {
-               status = "okay";
-               vbus-supply = <&vdd_run_cam>;
-       };
-
-       usb@0,7d008000 { /* Left external USB port. */
-               status = "okay";
-       };
-
-       usb-phy@0,7d008000 {
-               status = "okay";
-               vbus-supply = <&vdd_usb3_vbus>;
-       };
-
        backlight: backlight {
                compatible = "pwm-backlight";
 
index 0318258dde3e098c788bcf4fc7cfca9b08914f21..973446d07182cd8958cddd2a7dd33e2f9f08815f 100644 (file)
                                        regulator-always-on;
                                };
 
-                               ldo0 {
+                               avdd_1v05_run: ldo0 {
                                        regulator-name = "+1.05V_RUN_AVDD";
                                        regulator-min-microvolt = <1050000>;
                                        regulator-max-microvolt = <1050000>;
                status = "okay";
        };
 
+       usb@0,70090000 {
+               phys = <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-1}>, /* Internal USB */
+                      <&{/padctl@0,7009f000/pads/usb2/lanes/usb2-2}>, /* 2nd USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-0}>, /* 1st USB A */
+                      <&{/padctl@0,7009f000/pads/pcie/lanes/pcie-1}>; /* 2nd USB A */
+               phy-names = "usb2-0", "usb2-1", "usb2-2", "usb3-0", "usb3-1";
+
+               avddio-pex-supply = <&vdd_1v05_run>;
+               dvddio-pex-supply = <&vdd_1v05_run>;
+               avdd-usb-supply = <&vdd_3v3_lp0>;
+               avdd-pll-utmip-supply = <&vddio_1v8>;
+               avdd-pll-erefe-supply = <&avdd_1v05_run>;
+               avdd-usb-ss-pll-supply = <&vdd_1v05_run>;
+               hvdd-usb-ss-supply = <&vdd_3v3_lp0>;
+               hvdd-usb-ss-pll-e-supply = <&vdd_3v3_lp0>;
+
+               status = "okay";
+       };
+
+       padctl@0,7009f000 {
+               pads {
+                       usb2 {
+                               status = "okay";
+
+                               lanes {
+                                       usb2-0 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-1 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+
+                                       usb2-2 {
+                                               nvidia,function = "xusb";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+
+                       /* Both enabled lanes are muxed to the "usb3-ss" function. */
+                       pcie {
+                               status = "okay";
+
+                               lanes {
+                                       pcie-0 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+
+                                       pcie-1 {
+                                               nvidia,function = "usb3-ss";
+                                               status = "okay";
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               status = "okay";
+                               mode = "otg";
+
+                               vbus-supply = <&vdd_usb1_vbus>;
+                       };
+
+                       usb2-1 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_run_cam>;
+                       };
+
+                       usb2-2 {
+                               status = "okay";
+                               mode = "host";
+
+                               vbus-supply = <&vdd_usb3_vbus>;
+                       };
+
+                       usb3-0 {
+                               nvidia,usb2-companion = <0>;
+                               status = "okay";
+                       };
+
+                       usb3-1 {
+                               nvidia,usb2-companion = <2>;
+                               status = "okay";
+                       };
+               };
+       };
+
        sdhci@0,700b0400 {
                cd-gpios = <&gpio TEGRA_GPIO(V, 2) GPIO_ACTIVE_HIGH>;
                power-gpios = <&gpio TEGRA_GPIO(R, 0) GPIO_ACTIVE_HIGH>;
index e4eac1f01e645f608e16483a0e37086ad1a2df91..ea4811870de271b0cd74f0caa8173535d86d7d3d 100644 (file)
@@ -2,7 +2,6 @@
 #include <dt-bindings/gpio/tegra-gpio.h>
 #include <dt-bindings/memory/tegra124-mc.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
-#include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/reset/tegra124-car.h>
 #include <dt-bindings/thermal/tegra124-soctherm.h>
@@ -51,9 +50,6 @@
                reset-names = "pex", "afi", "pcie_x";
                status = "disabled";
 
-               phys = <&padctl TEGRA_XUSB_PADCTL_PCIE>;
-               phy-names = "pcie";
-
                pci@1,0 {
                        device_type = "pci";
                        assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>;
                         <&tegra_car 123>,
                         <&tegra_car 129>;
                reset-names = "sata", "sata-oob", "sata-cold";
-               phys = <&padctl TEGRA_XUSB_PADCTL_SATA>;
-               phy-names = "sata-phy";
                status = "disabled";
        };
 
                status = "disabled";
        };
 
+       usb@0,70090000 {
+               compatible = "nvidia,tegra124-xusb";
+               reg = <0x0 0x70090000 0x0 0x8000>,
+                     <0x0 0x70098000 0x0 0x1000>,
+                     <0x0 0x70099000 0x0 0x1000>;
+               reg-names = "hcd", "fpci", "ipfs";
+
+               interrupts = <GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>;
+
+               clocks = <&tegra_car TEGRA124_CLK_XUSB_HOST>,
+                        <&tegra_car TEGRA124_CLK_XUSB_HOST_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_FALCON_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS_DIV2>,
+                        <&tegra_car TEGRA124_CLK_XUSB_SS_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_HS_SRC>,
+                        <&tegra_car TEGRA124_CLK_XUSB_FS_SRC>,
+                        <&tegra_car TEGRA124_CLK_PLL_U_480M>,
+                        <&tegra_car TEGRA124_CLK_CLK_M>,
+                        <&tegra_car TEGRA124_CLK_PLL_E>;
+               clock-names = "xusb_host", "xusb_host_src",
+                             "xusb_falcon_src", "xusb_ss",
+                             "xusb_ss_div2", "xusb_ss_src",
+                             "xusb_hs_src", "xusb_fs_src",
+                             "pll_u_480m", "clk_m", "pll_e";
+               resets = <&tegra_car 89>, <&tegra_car 156>,
+                        <&tegra_car 143>;
+               reset-names = "xusb_host", "xusb_ss", "xusb_src";
+
+               nvidia,xusb-padctl = <&padctl>;
+
+               status = "disabled";
+       };
+
        padctl: padctl@0,7009f000 {
                compatible = "nvidia,tegra124-xusb-padctl";
                reg = <0x0 0x7009f000 0x0 0x1000>;
                resets = <&tegra_car 142>;
                reset-names = "padctl";
 
-               #phy-cells = <1>;
+               pads {
+                       usb2 {
+                               status = "disabled";
+
+                               lanes {
+                                       usb2-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       usb2-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       usb2-2 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       ulpi {
+                               status = "disabled";
+
+                               lanes {
+                                       ulpi-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       hsic {
+                               status = "disabled";
+
+                               lanes {
+                                       hsic-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       hsic-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       pcie {
+                               status = "disabled";
+
+                               lanes {
+                                       pcie-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-1 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-2 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-3 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+
+                                       pcie-4 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+
+                       sata {
+                               status = "disabled";
+
+                               lanes {
+                                       sata-0 {
+                                               status = "disabled";
+                                               #phy-cells = <0>;
+                                       };
+                               };
+                       };
+               };
+
+               ports {
+                       usb2-0 {
+                               status = "disabled";
+                       };
+
+                       usb2-1 {
+                               status = "disabled";
+                       };
+
+                       usb2-2 {
+                               status = "disabled";
+                       };
+
+                       ulpi-0 {
+                               status = "disabled";
+                       };
+
+                       hsic-0 {
+                               status = "disabled";
+                       };
+
+                       hsic-1 {
+                               status = "disabled";
+                       };
+
+                       usb3-0 {
+                               status = "disabled";
+                       };
+
+                       usb3-1 {
+                               status = "disabled";
+                       };
+               };
        };
 
        sdhci@0,700b0000 {
index 4d8b7f69353551b1c065d4a27d5609bf6266fff3..a8a8e434fb2717560bf80959794560015c5cc3fb 100644 (file)
                clock-frequency = <16000000>;
        };
 
+       panel: panel {
+               compatible = "edt,et057090dhu";
+               backlight = <&bl>;
+       };
+
        reg_3v3: regulator-3v3 {
                compatible = "regulator-fixed";
                regulator-name = "3.3V";
        status  = "okay";
 };
 
+&dcu0 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_dcu0_1>;
+       fsl,panel = <&panel>;
+       status = "okay";
+};
+
 &dspi1 {
        status = "okay";
 
        vin-supply = <&reg_3v3>;
 };
 
+&tcon0 {
+       status = "okay";
+};
+
 &uart0 {
        status = "okay";
 };
index 226a86ffd3c9c87da92cc6428237183bd393c635..b7417094dc11c32578856090b2bfc22f4df6ddb8 100644 (file)
                        >;
                };
 
+               pinctrl_dcu0_1: dcu0grp_1 {
+                       fsl,pins = <
+                               VF610_PAD_PTE0__DCU0_HSYNC      0x1902
+                               VF610_PAD_PTE1__DCU0_VSYNC      0x1902
+                               VF610_PAD_PTE2__DCU0_PCLK       0x1902
+                               VF610_PAD_PTE4__DCU0_DE         0x1902
+                               VF610_PAD_PTE5__DCU0_R0         0x1902
+                               VF610_PAD_PTE6__DCU0_R1         0x1902
+                               VF610_PAD_PTE7__DCU0_R2         0x1902
+                               VF610_PAD_PTE8__DCU0_R3         0x1902
+                               VF610_PAD_PTE9__DCU0_R4         0x1902
+                               VF610_PAD_PTE10__DCU0_R5        0x1902
+                               VF610_PAD_PTE11__DCU0_R6        0x1902
+                               VF610_PAD_PTE12__DCU0_R7        0x1902
+                               VF610_PAD_PTE13__DCU0_G0        0x1902
+                               VF610_PAD_PTE14__DCU0_G1        0x1902
+                               VF610_PAD_PTE15__DCU0_G2        0x1902
+                               VF610_PAD_PTE16__DCU0_G3        0x1902
+                               VF610_PAD_PTE17__DCU0_G4        0x1902
+                               VF610_PAD_PTE18__DCU0_G5        0x1902
+                               VF610_PAD_PTE19__DCU0_G6        0x1902
+                               VF610_PAD_PTE20__DCU0_G7        0x1902
+                               VF610_PAD_PTE21__DCU0_B0        0x1902
+                               VF610_PAD_PTE22__DCU0_B1        0x1902
+                               VF610_PAD_PTE23__DCU0_B2        0x1902
+                               VF610_PAD_PTE24__DCU0_B3        0x1902
+                               VF610_PAD_PTE25__DCU0_B4        0x1902
+                               VF610_PAD_PTE26__DCU0_B5        0x1902
+                               VF610_PAD_PTE27__DCU0_B6        0x1902
+                               VF610_PAD_PTE28__DCU0_B7        0x1902
+                       >;
+               };
+
                pinctrl_dspi1: dspi1grp {
                        fsl,pins = <
                                VF610_PAD_PTD5__DSPI1_CS0               0x33e2
index 04ef54d45a917059b8ec3d3578420a93aa5d9f60..2c13ec696ac541f96d781c5a94c251bad36e481a 100644 (file)
                                                        <20000000>;
                        };
 
+                       tcon0: timing-controller@4003d000 {
+                               compatible = "fsl,vf610-tcon";
+                               reg = <0x4003d000 0x1000>;
+                               clocks = <&clks VF610_CLK_TCON0>;
+                               clock-names = "ipg";
+                               status = "disabled";
+                       };
+
                        wdoga5: wdog@4003e000 {
                                compatible = "fsl,vf610-wdt", "fsl,imx21-wdt";
                                reg = <0x4003e000 0x1000>;
                                status = "disabled";
                        };
 
+                       dcu0: dcu@40058000 {
+                               compatible = "fsl,vf610-dcu";
+                               reg = <0x40058000 0x1200>;
+                               interrupts = <30 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&clks VF610_CLK_DCU0>,
+                                       <&clks VF610_CLK_DCU0_DIV>;
+                               clock-names = "dcu", "pix";
+                               fsl,tcon = <&tcon0>;
+                               status = "disabled";
+                       };
+
                        i2c0: i2c@40066000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
index 0df6b1fc965571116ed4ae2366aff451873888e5..96387d477e91c8a4b82b500f58f8e05a694a1096 100644 (file)
@@ -41,6 +41,8 @@
 
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
 
+#define KVM_REQ_VCPU_EXIT      8
+
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -226,6 +228,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
+void kvm_arm_halt_guest(struct kvm *kvm);
+void kvm_arm_resume_guest(struct kvm *kvm);
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
 unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu);
index d8e90c8cb5fa0ab4c0486cb6d9fd53d576456291..f3a7de71f515016bf9ab0b2525bfe622794975a7 100644 (file)
@@ -28,6 +28,9 @@ struct kvm_decode {
        bool sign_extend;
 };
 
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 27563befa8a2df5b27ea39b56bde584771945dfd..22bf1f64d99a44291bd037a74b61ec1d249ec01e 100644 (file)
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct frame_tail buftail;
        unsigned long err;
@@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct frame_tail __user *tail;
 
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-       while ((entry->nr < sysctl_perf_event_max_stack) &&
+       while ((entry->nr < entry->max_stack) &&
               tail && !((unsigned long)tail & 0x3))
                tail = user_backtrace(tail, entry);
 }
@@ -89,13 +89,13 @@ static int
 callchain_trace(struct stackframe *fr,
                void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, fr->pc);
        return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stackframe fr;
 
index 95a000515e43286f822f541fe411902df42bdeb8..02abfff68ee542c7674707e6d88f80e9578be2e1 100644 (file)
@@ -46,6 +46,13 @@ config KVM_ARM_HOST
        ---help---
          Provides host support for ARM processors.
 
+config KVM_NEW_VGIC
+       bool "New VGIC implementation"
+       depends on KVM
+       default y
+       ---help---
+         uses the new VGIC implementation
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index eb1bf4309c13a8c517288b0aa72e86f0fc376b06..a596b58f6d37d052e3e3ad2de155444b570797c3 100644 (file)
@@ -21,7 +21,18 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+
+ifeq ($(CONFIG_KVM_NEW_VGIC),y)
+obj-y += $(KVM)/arm/vgic/vgic.o
+obj-y += $(KVM)/arm/vgic/vgic-init.o
+obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
+obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
+else
 obj-y += $(KVM)/arm/vgic.o
 obj-y += $(KVM)/arm/vgic-v2.o
 obj-y += $(KVM)/arm/vgic-v2-emul.o
+endif
 obj-y += $(KVM)/arm/arch_timer.o
index 237d5d82f0afd6f1749a12f46872dc5170b2c3c6..893941ec98dc6da787629068359275a863516056 100644 (file)
@@ -455,7 +455,7 @@ static void update_vttbr(struct kvm *kvm)
 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 {
        struct kvm *kvm = vcpu->kvm;
-       int ret;
+       int ret = 0;
 
        if (likely(vcpu->arch.has_run_once))
                return 0;
@@ -478,9 +478,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
         * interrupts from the virtual timer with a userspace gic.
         */
        if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
-               kvm_timer_enable(kvm);
+               ret = kvm_timer_enable(vcpu);
 
-       return 0;
+       return ret;
 }
 
 bool kvm_arch_intc_initialized(struct kvm *kvm)
@@ -488,30 +488,37 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
        return vgic_initialized(kvm);
 }
 
-static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
-static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
-
-static void kvm_arm_halt_guest(struct kvm *kvm)
+void kvm_arm_halt_guest(struct kvm *kvm)
 {
        int i;
        struct kvm_vcpu *vcpu;
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                vcpu->arch.pause = true;
-       force_vm_exit(cpu_all_mask);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT);
+}
+
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu) /* pause a single VCPU: mark it paused, then kick it */
+{
+       vcpu->arch.pause = true; /* the flag is set before the kick below */
+       kvm_vcpu_kick(vcpu); /* NOTE(review): presumably makes the VCPU notice the pause flag -- confirm */
 }
 
-static void kvm_arm_resume_guest(struct kvm *kvm)
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu) /* un-pause a single VCPU and wake its wait queue */
+{
+       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); /* wait queue belonging to this VCPU */
+
+       vcpu->arch.pause = false; /* cleared before the wakeup below */
+       swake_up(wq); /* NOTE(review): presumably wakes a VCPU sleeping on the pause flag -- confirm against vcpu_sleep() */
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
 {
        int i;
        struct kvm_vcpu *vcpu;
 
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-               vcpu->arch.pause = false;
-               swake_up(wq);
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arm_resume_vcpu(vcpu);
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
index 0f6600f05137921b61fe5c9a80b87302e7d03234..10f80a6c797a2b6b67363b6f163ef5b85b28465e 100644 (file)
@@ -23,7 +23,7 @@
 
 #include "trace.h"
 
-static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
 {
        void *datap = NULL;
        union {
@@ -55,7 +55,7 @@ static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
        memcpy(buf, datap, len);
 }
 
-static unsigned long mmio_read_buf(char *buf, unsigned int len)
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
 {
        unsigned long data = 0;
        union {
@@ -66,7 +66,7 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
        switch (len) {
        case 1:
-               data = buf[0];
+               data = *(u8 *)buf;
                break;
        case 2:
                memcpy(&tmp.hword, buf, len);
@@ -87,11 +87,10 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
 /**
  * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ *                          or in-kernel IO emulation
+ *
  * @vcpu: The VCPU pointer
  * @run:  The VCPU run struct containing the mmio data
- *
- * This should only be called after returning from userspace for MMIO load
- * emulation.
  */
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
@@ -104,7 +103,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
                if (len > sizeof(unsigned long))
                        return -EINVAL;
 
-               data = mmio_read_buf(run->mmio.data, len);
+               data = kvm_mmio_read_buf(run->mmio.data, len);
 
                if (vcpu->arch.mmio_decode.sign_extend &&
                    len < sizeof(unsigned long)) {
@@ -190,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                                               len);
 
                trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
-               mmio_write_buf(data_buf, len, data);
+               kvm_mmio_write_buf(data_buf, len, data);
 
                ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
                                       data_buf);
@@ -206,18 +205,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
        run->mmio.is_write      = is_write;
        run->mmio.phys_addr     = fault_ipa;
        run->mmio.len           = len;
-       if (is_write)
-               memcpy(run->mmio.data, data_buf, len);
 
        if (!ret) {
                /* We handled the access successfully in the kernel. */
+               if (!is_write)
+                       memcpy(run->mmio.data, data_buf, len);
                vcpu->stat.mmio_exit_kernel++;
                kvm_handle_mmio_return(vcpu, run);
                return 1;
-       } else {
-               vcpu->stat.mmio_exit_user++;
        }
 
+       if (is_write)
+               memcpy(run->mmio.data, data_buf, len);
+       vcpu->stat.mmio_exit_user++;
        run->exit_reason        = KVM_EXIT_MMIO;
        return 0;
 }
index c70709ada692ed05a001131e36e63b4d15103a95..79b6b07e115d55b1be05c676ed0326f2bba3c22a 100644 (file)
@@ -2,6 +2,6 @@
 # Makefile for the linux kernel.
 #
 
-obj-y  := irq.o common.o serial.o
+obj-y  := common.o serial.o
 obj-y  += pm.o suspend.o
 obj-y  += phy3250.o
index 9e3b90df32e1626c3858367910a2b446369d279d..00190535df9061ad6750bcac1f1a07ce65173574 100644 (file)
 #define IRQ_LPC32XX_GPI_06             LPC32XX_SIC2_IRQ(28)
 #define IRQ_LPC32XX_SYSCLK             LPC32XX_SIC2_IRQ(31)
 
-#define NR_IRQS                                96
+#define LPC32XX_NR_IRQS                        96
 
 #endif
diff --git a/arch/arm/mach-lpc32xx/irq.c b/arch/arm/mach-lpc32xx/irq.c
deleted file mode 100644 (file)
index 2ae431e..0000000
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * arch/arm/mach-lpc32xx/irq.c
- *
- * Author: Kevin Wells <kevin.wells@nxp.com>
- *
- * Copyright (C) 2010 NXP Semiconductors
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/irqdomain.h>
-#include <linux/module.h>
-
-#include <mach/irqs.h>
-#include <mach/hardware.h>
-#include <mach/platform.h>
-#include "common.h"
-
-/*
- * Default value representing the Activation polarity of all internal
- * interrupt sources
- */
-#define MIC_APR_DEFAULT                0x3FF0EFE0
-#define SIC1_APR_DEFAULT       0xFBD27186
-#define SIC2_APR_DEFAULT       0x801810C0
-
-/*
- * Default value representing the Activation Type of all internal
- * interrupt sources. All are level sensitive.
- */
-#define MIC_ATR_DEFAULT                0x00000000
-#define SIC1_ATR_DEFAULT       0x00026000
-#define SIC2_ATR_DEFAULT       0x00000000
-
-static struct irq_domain *lpc32xx_mic_domain;
-static struct device_node *lpc32xx_mic_np;
-
-struct lpc32xx_event_group_regs {
-       void __iomem *enab_reg;
-       void __iomem *edge_reg;
-       void __iomem *maskstat_reg;
-       void __iomem *rawstat_reg;
-};
-
-static const struct lpc32xx_event_group_regs lpc32xx_event_int_regs = {
-       .enab_reg = LPC32XX_CLKPWR_INT_ER,
-       .edge_reg = LPC32XX_CLKPWR_INT_AP,
-       .maskstat_reg = LPC32XX_CLKPWR_INT_SR,
-       .rawstat_reg = LPC32XX_CLKPWR_INT_RS,
-};
-
-static const struct lpc32xx_event_group_regs lpc32xx_event_pin_regs = {
-       .enab_reg = LPC32XX_CLKPWR_PIN_ER,
-       .edge_reg = LPC32XX_CLKPWR_PIN_AP,
-       .maskstat_reg = LPC32XX_CLKPWR_PIN_SR,
-       .rawstat_reg = LPC32XX_CLKPWR_PIN_RS,
-};
-
-struct lpc32xx_event_info {
-       const struct lpc32xx_event_group_regs *event_group;
-       u32 mask;
-};
-
-/*
- * Maps an IRQ number to and event mask and register
- */
-static const struct lpc32xx_event_info lpc32xx_events[NR_IRQS] = {
-       [IRQ_LPC32XX_GPI_08] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_08_BIT,
-       },
-       [IRQ_LPC32XX_GPI_09] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_09_BIT,
-       },
-       [IRQ_LPC32XX_GPI_19] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_19_BIT,
-       },
-       [IRQ_LPC32XX_GPI_07] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_07_BIT,
-       },
-       [IRQ_LPC32XX_GPI_00] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_00_BIT,
-       },
-       [IRQ_LPC32XX_GPI_01] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_01_BIT,
-       },
-       [IRQ_LPC32XX_GPI_02] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_02_BIT,
-       },
-       [IRQ_LPC32XX_GPI_03] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_03_BIT,
-       },
-       [IRQ_LPC32XX_GPI_04] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_04_BIT,
-       },
-       [IRQ_LPC32XX_GPI_05] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_05_BIT,
-       },
-       [IRQ_LPC32XX_GPI_06] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_06_BIT,
-       },
-       [IRQ_LPC32XX_GPI_28] = {
-               .event_group = &lpc32xx_event_pin_regs,
-               .mask = LPC32XX_CLKPWR_EXTSRC_GPI_28_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_00] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_00_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_01] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_01_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_02] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_02_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_03] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_03_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_04] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_04_BIT,
-       },
-       [IRQ_LPC32XX_GPIO_05] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_GPIO_05_BIT,
-       },
-       [IRQ_LPC32XX_KEY] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_KEY_BIT,
-       },
-       [IRQ_LPC32XX_ETHERNET] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_MAC_BIT,
-       },
-       [IRQ_LPC32XX_USB_OTG_ATX] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_USBATXINT_BIT,
-       },
-       [IRQ_LPC32XX_USB_HOST] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_USB_BIT,
-       },
-       [IRQ_LPC32XX_RTC] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_RTC_BIT,
-       },
-       [IRQ_LPC32XX_MSTIMER] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_MSTIMER_BIT,
-       },
-       [IRQ_LPC32XX_TS_AUX] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_TS_AUX_BIT,
-       },
-       [IRQ_LPC32XX_TS_P] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_TS_P_BIT,
-       },
-       [IRQ_LPC32XX_TS_IRQ] = {
-               .event_group = &lpc32xx_event_int_regs,
-               .mask = LPC32XX_CLKPWR_INTSRC_ADC_BIT,
-       },
-};
-
-static void get_controller(unsigned int irq, unsigned int *base,
-       unsigned int *irqbit)
-{
-       if (irq < 32) {
-               *base = LPC32XX_MIC_BASE;
-               *irqbit = 1 << irq;
-       } else if (irq < 64) {
-               *base = LPC32XX_SIC1_BASE;
-               *irqbit = 1 << (irq - 32);
-       } else {
-               *base = LPC32XX_SIC2_BASE;
-               *irqbit = 1 << (irq - 64);
-       }
-}
-
-static void lpc32xx_mask_irq(struct irq_data *d)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       reg = __raw_readl(LPC32XX_INTC_MASK(ctrl)) & ~mask;
-       __raw_writel(reg, LPC32XX_INTC_MASK(ctrl));
-}
-
-static void lpc32xx_unmask_irq(struct irq_data *d)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       reg = __raw_readl(LPC32XX_INTC_MASK(ctrl)) | mask;
-       __raw_writel(reg, LPC32XX_INTC_MASK(ctrl));
-}
-
-static void lpc32xx_ack_irq(struct irq_data *d)
-{
-       unsigned int ctrl, mask;
-
-       get_controller(d->hwirq, &ctrl, &mask);
-
-       __raw_writel(mask, LPC32XX_INTC_RAW_STAT(ctrl));
-
-       /* Also need to clear pending wake event */
-       if (lpc32xx_events[d->hwirq].mask != 0)
-               __raw_writel(lpc32xx_events[d->hwirq].mask,
-                       lpc32xx_events[d->hwirq].event_group->rawstat_reg);
-}
-
-static void __lpc32xx_set_irq_type(unsigned int irq, int use_high_level,
-       int use_edge)
-{
-       unsigned int reg, ctrl, mask;
-
-       get_controller(irq, &ctrl, &mask);
-
-       /* Activation level, high or low */
-       reg = __raw_readl(LPC32XX_INTC_POLAR(ctrl));
-       if (use_high_level)
-               reg |= mask;
-       else
-               reg &= ~mask;
-       __raw_writel(reg, LPC32XX_INTC_POLAR(ctrl));
-
-       /* Activation type, edge or level */
-       reg = __raw_readl(LPC32XX_INTC_ACT_TYPE(ctrl));
-       if (use_edge)
-               reg |= mask;
-       else
-               reg &= ~mask;
-       __raw_writel(reg, LPC32XX_INTC_ACT_TYPE(ctrl));
-
-       /* Use same polarity for the wake events */
-       if (lpc32xx_events[irq].mask != 0) {
-               reg = __raw_readl(lpc32xx_events[irq].event_group->edge_reg);
-
-               if (use_high_level)
-                       reg |= lpc32xx_events[irq].mask;
-               else
-                       reg &= ~lpc32xx_events[irq].mask;
-
-               __raw_writel(reg, lpc32xx_events[irq].event_group->edge_reg);
-       }
-}
-
-static int lpc32xx_set_irq_type(struct irq_data *d, unsigned int type)
-{
-       switch (type) {
-       case IRQ_TYPE_EDGE_RISING:
-               /* Rising edge sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 1, 1);
-               irq_set_handler_locked(d, handle_edge_irq);
-               break;
-
-       case IRQ_TYPE_EDGE_FALLING:
-               /* Falling edge sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 0, 1);
-               irq_set_handler_locked(d, handle_edge_irq);
-               break;
-
-       case IRQ_TYPE_LEVEL_LOW:
-               /* Low level sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 0, 0);
-               irq_set_handler_locked(d, handle_level_irq);
-               break;
-
-       case IRQ_TYPE_LEVEL_HIGH:
-               /* High level sensitive */
-               __lpc32xx_set_irq_type(d->hwirq, 1, 0);
-               irq_set_handler_locked(d, handle_level_irq);
-               break;
-
-       /* Other modes are not supported */
-       default:
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int lpc32xx_irq_wake(struct irq_data *d, unsigned int state)
-{
-       unsigned long eventreg;
-
-       if (lpc32xx_events[d->hwirq].mask != 0) {
-               eventreg = __raw_readl(lpc32xx_events[d->hwirq].
-                       event_group->enab_reg);
-
-               if (state)
-                       eventreg |= lpc32xx_events[d->hwirq].mask;
-               else {
-                       eventreg &= ~lpc32xx_events[d->hwirq].mask;
-
-                       /*
-                        * When disabling the wakeup, clear the latched
-                        * event
-                        */
-                       __raw_writel(lpc32xx_events[d->hwirq].mask,
-                               lpc32xx_events[d->hwirq].
-                               event_group->rawstat_reg);
-               }
-
-               __raw_writel(eventreg,
-                       lpc32xx_events[d->hwirq].event_group->enab_reg);
-
-               return 0;
-       }
-
-       /* Clear event */
-       __raw_writel(lpc32xx_events[d->hwirq].mask,
-               lpc32xx_events[d->hwirq].event_group->rawstat_reg);
-
-       return -ENODEV;
-}
-
-static void __init lpc32xx_set_default_mappings(unsigned int apr,
-       unsigned int atr, unsigned int offset)
-{
-       unsigned int i;
-
-       /* Set activation levels for each interrupt */
-       i = 0;
-       while (i < 32) {
-               __lpc32xx_set_irq_type(offset + i, ((apr >> i) & 0x1),
-                       ((atr >> i) & 0x1));
-               i++;
-       }
-}
-
-static struct irq_chip lpc32xx_irq_chip = {
-       .name = "MIC",
-       .irq_ack = lpc32xx_ack_irq,
-       .irq_mask = lpc32xx_mask_irq,
-       .irq_unmask = lpc32xx_unmask_irq,
-       .irq_set_type = lpc32xx_set_irq_type,
-       .irq_set_wake = lpc32xx_irq_wake
-};
-
-static void lpc32xx_sic1_handler(struct irq_desc *desc)
-{
-       unsigned long ints = __raw_readl(LPC32XX_INTC_STAT(LPC32XX_SIC1_BASE));
-
-       while (ints != 0) {
-               int irqno = fls(ints) - 1;
-
-               ints &= ~(1 << irqno);
-
-               generic_handle_irq(LPC32XX_SIC1_IRQ(irqno));
-       }
-}
-
-static void lpc32xx_sic2_handler(struct irq_desc *desc)
-{
-       unsigned long ints = __raw_readl(LPC32XX_INTC_STAT(LPC32XX_SIC2_BASE));
-
-       while (ints != 0) {
-               int irqno = fls(ints) - 1;
-
-               ints &= ~(1 << irqno);
-
-               generic_handle_irq(LPC32XX_SIC2_IRQ(irqno));
-       }
-}
-
-static int __init __lpc32xx_mic_of_init(struct device_node *node,
-                                       struct device_node *parent)
-{
-       lpc32xx_mic_np = node;
-
-       return 0;
-}
-
-static const struct of_device_id mic_of_match[] __initconst = {
-       { .compatible = "nxp,lpc3220-mic", .data = __lpc32xx_mic_of_init },
-       { }
-};
-
-void __init lpc32xx_init_irq(void)
-{
-       unsigned int i;
-
-       /* Setup MIC */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_MIC_BASE));
-       __raw_writel(MIC_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_MIC_BASE));
-       __raw_writel(MIC_ATR_DEFAULT, LPC32XX_INTC_ACT_TYPE(LPC32XX_MIC_BASE));
-
-       /* Setup SIC1 */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_SIC1_BASE));
-       __raw_writel(SIC1_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_SIC1_BASE));
-       __raw_writel(SIC1_ATR_DEFAULT,
-                               LPC32XX_INTC_ACT_TYPE(LPC32XX_SIC1_BASE));
-
-       /* Setup SIC2 */
-       __raw_writel(0, LPC32XX_INTC_MASK(LPC32XX_SIC2_BASE));
-       __raw_writel(SIC2_APR_DEFAULT, LPC32XX_INTC_POLAR(LPC32XX_SIC2_BASE));
-       __raw_writel(SIC2_ATR_DEFAULT,
-                               LPC32XX_INTC_ACT_TYPE(LPC32XX_SIC2_BASE));
-
-       /* Configure supported IRQ's */
-       for (i = 0; i < NR_IRQS; i++) {
-               irq_set_chip_and_handler(i, &lpc32xx_irq_chip,
-                                        handle_level_irq);
-               irq_clear_status_flags(i, IRQ_NOREQUEST);
-       }
-
-       /* Set default mappings */
-       lpc32xx_set_default_mappings(MIC_APR_DEFAULT, MIC_ATR_DEFAULT, 0);
-       lpc32xx_set_default_mappings(SIC1_APR_DEFAULT, SIC1_ATR_DEFAULT, 32);
-       lpc32xx_set_default_mappings(SIC2_APR_DEFAULT, SIC2_ATR_DEFAULT, 64);
-
-       /* Initially disable all wake events */
-       __raw_writel(0, LPC32XX_CLKPWR_P01_ER);
-       __raw_writel(0, LPC32XX_CLKPWR_INT_ER);
-       __raw_writel(0, LPC32XX_CLKPWR_PIN_ER);
-
-       /*
-        * Default wake activation polarities, all pin sources are low edge
-        * triggered
-        */
-       __raw_writel(LPC32XX_CLKPWR_INTSRC_TS_P_BIT |
-               LPC32XX_CLKPWR_INTSRC_MSTIMER_BIT |
-               LPC32XX_CLKPWR_INTSRC_RTC_BIT,
-               LPC32XX_CLKPWR_INT_AP);
-       __raw_writel(0, LPC32XX_CLKPWR_PIN_AP);
-
-       /* Clear latched wake event states */
-       __raw_writel(__raw_readl(LPC32XX_CLKPWR_PIN_RS),
-               LPC32XX_CLKPWR_PIN_RS);
-       __raw_writel(__raw_readl(LPC32XX_CLKPWR_INT_RS),
-               LPC32XX_CLKPWR_INT_RS);
-
-       of_irq_init(mic_of_match);
-
-       lpc32xx_mic_domain = irq_domain_add_legacy(lpc32xx_mic_np, NR_IRQS,
-                                                  0, 0, &irq_domain_simple_ops,
-                                                  NULL);
-       if (!lpc32xx_mic_domain)
-               panic("Unable to add MIC irq domain\n");
-
-       /* MIC SUBIRQx interrupts will route handling to the chain handlers */
-       irq_set_chained_handler(IRQ_LPC32XX_SUB1IRQ, lpc32xx_sic1_handler);
-       irq_set_chained_handler(IRQ_LPC32XX_SUB2IRQ, lpc32xx_sic2_handler);
-}
index 72918c4973ea7f0cc86da080916c9f72bd327fcd..f6ac027f3c3bf2d5bb41b436fd85fa40a9a2a670 100644 (file)
@@ -97,10 +97,7 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
        gpmc_nand_res[2].start = gpmc_get_client_irq(GPMC_IRQ_COUNT_EVENT);
 
        memset(&s, 0, sizeof(struct gpmc_settings));
-       if (gpmc_nand_data->of_node)
-               gpmc_read_settings_dt(gpmc_nand_data->of_node, &s);
-       else
-               gpmc_set_legacy(gpmc_nand_data, &s);
+       gpmc_set_legacy(gpmc_nand_data, &s);
 
        s.device_nand = true;
 
@@ -121,8 +118,6 @@ int gpmc_nand_init(struct omap_nand_platform_data *gpmc_nand_data,
        if (err < 0)
                goto out_free_cs;
 
-       gpmc_update_nand_reg(&gpmc_nand_data->reg, gpmc_nand_data->cs);
-
        if (!gpmc_hwecc_bch_capable(gpmc_nand_data->ecc_opt)) {
                pr_err("omap2-nand: Unsupported NAND ECC scheme selected\n");
                err = -EINVAL;
index 7ee4652b4c61d9a3a921d52de13d3e46dd356d78..cd894d69e7663a8abf187ed77a3b7dff84752fc9 100644 (file)
@@ -6,6 +6,7 @@ comment "Intel/Marvell Dev Platforms (sorted by hardware release time)"
 
 config MACH_PXA27X_DT
        bool "Support PXA27x platforms from device tree"
+       select PINCTRL
        select POWER_SUPPLY
        select PXA27x
        select USE_OF
@@ -17,6 +18,7 @@ config MACH_PXA27X_DT
 config MACH_PXA3XX_DT
        bool "Support PXA3xx platforms from device tree"
        select CPU_PXA300
+       select PINCTRL
        select POWER_SUPPLY
        select PXA3xx
        select USE_OF
index e838b11fb8c7d3992f88d247522064a83e2ea08f..fa9d71d194f01f44fb5bccf065a8a362838f6b3a 100644 (file)
@@ -128,7 +128,7 @@ struct resource eseries_tmio_resources[] = {
 /* Some e-series hardware cannot control the 32K clock */
 static void __init __maybe_unused eseries_register_clks(void)
 {
-       clk_register_fixed_rate(NULL, "CLK_CK32K", NULL, CLK_IS_ROOT, 32768);
+       clk_register_fixed_rate(NULL, "CLK_CK32K", NULL, 0, 32768);
 }
 
 #ifdef CONFIG_MACH_E330
index d9578bc49fdc61766c430e55e1ab21857a6ebf03..bd7cd8b6a286ec3713145c3713e861c7419bd25e 100644 (file)
@@ -763,14 +763,49 @@ static struct nand_bbt_descr spitz_nand_bbt = {
        .pattern        = scan_ff_pattern
 };
 
-static struct nand_ecclayout akita_oobinfo = {
-       .oobfree        = { {0x08, 0x09} },
-       .eccbytes       = 24,
-       .eccpos         = {
-                       0x05, 0x01, 0x02, 0x03, 0x06, 0x07, 0x15, 0x11,
-                       0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23,
-                       0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37,
-       },
+/*
+ * akita_ooblayout_ecc - describe one ECC region of the Akita OOB layout
+ * @mtd:       MTD device (unused)
+ * @section:   index of the ECC region to describe
+ * @oobregion: filled with the offset and length of that region
+ *
+ * The 24 ECC bytes are spread over four 0x10-byte OOB chunks.  Inside each
+ * chunk they are reported as three regions, in this order: offset 5
+ * (1 byte), offset 1 (3 bytes) and offset 6 (2 bytes).  That makes
+ * 4 * 3 = 12 sections, numbered 0..11.
+ *
+ * Returns 0 on success, or -ERANGE when @section is outside 0..11.
+ */
+static int akita_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       /*
+        * Section 12 would yield offset 0x45, which is not part of the
+        * layout, and a negative section would skip every case below and
+        * leave @oobregion unset, so reject both.
+        */
+       if (section < 0 || section >= 12)
+               return -ERANGE;
+
+       /* Position of the region inside its 0x10-byte chunk. */
+       switch (section % 3) {
+       case 0:
+               oobregion->offset = 5;
+               oobregion->length = 1;
+               break;
+
+       case 1:
+               oobregion->offset = 1;
+               oobregion->length = 3;
+               break;
+
+       case 2:
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       }
+
+       /* Move to the chunk this section belongs to. */
+       oobregion->offset += (section / 3) * 0x10;
+
+       return 0;
+}
+
+/*
+ * akita_ooblayout_free - describe the free OOB region of the Akita layout
+ * @mtd:       MTD device (unused)
+ * @section:   index of the free region; only section 0 exists
+ * @oobregion: filled with the offset and length of the free region
+ *
+ * Returns 0 for section 0 (9 bytes starting at offset 8), -ERANGE for any
+ * other section.
+ */
+static int akita_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section != 0)
+               return -ERANGE;
+
+       oobregion->length = 9;
+       oobregion->offset = 8;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops akita_ooblayout_ops = {
+       .ecc = akita_ooblayout_ecc,
+       .free = akita_ooblayout_free,
 };
 
 static struct sharpsl_nand_platform_data spitz_nand_pdata = {
@@ -804,11 +839,11 @@ static void __init spitz_nand_init(void)
        } else if (machine_is_akita()) {
                spitz_nand_partitions[1].size = 58 * 1024 * 1024;
                spitz_nand_bbt.len = 1;
-               spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+               spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
        } else if (machine_is_borzoi()) {
                spitz_nand_partitions[1].size = 32 * 1024 * 1024;
                spitz_nand_bbt.len = 1;
-               spitz_nand_pdata.ecc_layout = &akita_oobinfo;
+               spitz_nand_pdata.ecc_layout = &akita_ooblayout_ops;
        }
 
        platform_device_register(&spitz_nand_device);
index 774c982a7b7ed4260a49ca01d7d9714b188f65d4..25a139bb9826dadfafd327bb1f2e1e4163993566 100644 (file)
@@ -496,6 +496,12 @@ static int rx1950_backlight_init(struct device *dev)
                return PTR_ERR(lcd_pwm);
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(lcd_pwm);
+
        rx1950_lcd_power(1);
        rx1950_bl_power(1);
 
index 1160434eece0509c3797733b49e8fcb1262e42e7..59a8fa7b8a3bd5541e58a7a1c6e811852aad30df 100644 (file)
@@ -74,5 +74,5 @@ $(MODLIB)/vdso: FORCE
        @mkdir -p $(MODLIB)/vdso
 
 PHONY += vdso_install
-vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso FORCE
+vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso
        $(call cmd,vdso_install)
index 7cb2d72e7378301a83b6e493701daa5ff9799868..3285a9286786c48cce2ba1cba1b3f1be3d4d4f2f 100644 (file)
@@ -10,6 +10,7 @@
 
 #include <dt-bindings/clock/r8a7795-cpg-mssr.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/power/r8a7795-sysc.h>
 
 / {
        compatible = "renesas,r8a7795";
@@ -39,6 +40,7 @@
                        compatible = "arm,cortex-a57", "arm,armv8";
                        reg = <0x0>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU0>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -47,6 +49,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x1>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU1>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -54,6 +57,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x2>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU2>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
@@ -61,6 +65,7 @@
                        compatible = "arm,cortex-a57","arm,armv8";
                        reg = <0x3>;
                        device_type = "cpu";
+                       power-domains = <&sysc R8A7795_PD_CA57_CPU3>;
                        next-level-cache = <&L2_CA57>;
                        enable-method = "psci";
                };
 
        L2_CA57: cache-controller@0 {
                compatible = "cache";
+               power-domains = <&sysc R8A7795_PD_CA57_SCU>;
                cache-unified;
                cache-level = <2>;
        };
 
        L2_CA53: cache-controller@1 {
                compatible = "cache";
+               power-domains = <&sysc R8A7795_PD_CA53_SCU>;
                cache-unified;
                cache-level = <2>;
        };
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 912>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio1: gpio@e6051000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 911>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio2: gpio@e6052000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 910>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio3: gpio@e6053000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 909>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio4: gpio@e6054000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 908>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio5: gpio@e6055000 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 907>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio6: gpio@e6055400 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 906>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                gpio7: gpio@e6055800 {
                        #interrupt-cells = <2>;
                        interrupt-controller;
                        clocks = <&cpg CPG_MOD 905>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                pmu_a57 {
                        #power-domain-cells = <0>;
                };
 
+               sysc: system-controller@e6180000 {
+                       compatible = "renesas,r8a7795-sysc";
+                       reg = <0 0xe6180000 0 0x0400>;
+                       #power-domain-cells = <1>;
+               };
+
                audma0: dma-controller@ec700000 {
                        compatible = "renesas,rcar-dmac";
                        reg = <0 0xec700000 0 0x10000>;
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 502>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 501>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                      GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH
                                      GIC_SPI 161 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 407>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                };
 
                dmac0: dma-controller@e6700000 {
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 219>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 218>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                        "ch12", "ch13", "ch14", "ch15";
                        clocks = <&cpg CPG_MOD 217>;
                        clock-names = "fck";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <16>;
                };
                                          "ch20", "ch21", "ch22", "ch23",
                                          "ch24";
                        clocks = <&cpg CPG_MOD 812>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        phy-mode = "rgmii-id";
                        #address-cells = <1>;
                        #size-cells = <0>;
                        clock-names = "clkp1", "clkp2", "can_clk";
                        assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
                        assigned-clock-rates = <40000000>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "clkp1", "clkp2", "can_clk";
                        assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
                        assigned-clock-rates = <40000000>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x31>, <&dmac1 0x30>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x33>, <&dmac1 0x32>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x35>, <&dmac1 0x34>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x37>, <&dmac0 0x36>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x39>, <&dmac0 0x38>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x51>, <&dmac1 0x50>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x53>, <&dmac1 0x52>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x13>, <&dmac1 0x12>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x57>, <&dmac0 0x56>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac0 0x59>, <&dmac0 0x58>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clock-names = "fck", "brg_int", "scif_clk";
                        dmas = <&dmac1 0x5b>, <&dmac1 0x5a>;
                        dma-names = "tx", "rx";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xe6500000 0 0x40>;
                        interrupts = <GIC_SPI 287 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 931>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe6508000 0 0x40>;
                        interrupts = <GIC_SPI 288 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 930>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                        reg = <0 0xe6510000 0 0x40>;
                        interrupts = <GIC_SPI 286 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 929>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                        reg = <0 0xe66d0000 0 0x40>;
                        interrupts = <GIC_SPI 290 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 928>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66d8000 0 0x40>;
                        interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 927>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66e0000 0 0x40>;
                        interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 919>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <110>;
                        status = "disabled";
                };
                        reg = <0 0xe66e8000 0 0x40>;
                        interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 918>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        i2c-scl-internal-delay-ns = <6>;
                        status = "disabled";
                };
                                      "src.1", "src.0",
                                      "dvc.0", "dvc.1",
                                      "clk_a", "clk_b", "clk_c", "clk_i";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
 
                        rcar_sound,dvc {
                        reg = <0 0xee000000 0 0xc00>;
                        interrupts = <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 328>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee040000 0 0xc00>;
                        interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 327>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                                      GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-names = "ch0", "ch1";
                        clocks = <&cpg CPG_MOD 330>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <2>;
                };
                                      GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-names = "ch0", "ch1";
                        clocks = <&cpg CPG_MOD 331>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #dma-cells = <1>;
                        dma-channels = <2>;
                };
                        reg = <0 0xee100000 0 0x2000>;
                        interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 314>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee120000 0 0x2000>;
                        interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 313>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        reg = <0 0xee140000 0 0x2000>;
                        interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 312>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        cap-mmc-highspeed;
                        status = "disabled";
                };
                        reg = <0 0xee160000 0 0x2000>;
                        interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 311>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        cap-mmc-highspeed;
                        status = "disabled";
                };
                        reg = <0 0xee080200 0 0x700>;
                        interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 703>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        compatible = "renesas,usb2-phy-r8a7795";
                        reg = <0 0xee0a0200 0 0x700>;
                        clocks = <&cpg CPG_MOD 702>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        compatible = "renesas,usb2-phy-r8a7795";
                        reg = <0 0xee0c0200 0 0x700>;
                        clocks = <&cpg CPG_MOD 701>;
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        #phy-cells = <0>;
                        status = "disabled";
                };
                        clocks = <&cpg CPG_MOD 703>;
                        phys = <&usb2_phy0>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 702>;
                        phys = <&usb2_phy1>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 701>;
                        phys = <&usb2_phy2>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 703>;
                        phys = <&usb2_phy0>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 702>;
                        phys = <&usb2_phy1>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        clocks = <&cpg CPG_MOD 701>;
                        phys = <&usb2_phy2>;
                        phy-names = "usb";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
                pciec0: pcie@fe000000 {
                        interrupt-map = <0 0 0 0 &gic GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 319>, <&pcie_bus_clk>;
                        clock-names = "pcie", "pcie_bus";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
 
                        interrupt-map = <0 0 0 0 &gic GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
                        clocks = <&cpg CPG_MOD 318>, <&pcie_bus_clk>;
                        clock-names = "pcie", "pcie_bus";
-                       power-domains = <&cpg>;
+                       power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
                        status = "disabled";
                };
        };
index 89171505e750c3597177fe0fbc3a1b65afe567da..fd2d74d0491ece760f65a551acd3f5304fc792fb 100644 (file)
@@ -200,6 +200,8 @@ CONFIG_SENSORS_INA2XX=m
 CONFIG_THERMAL=y
 CONFIG_THERMAL_EMULATION=y
 CONFIG_EXYNOS_THERMAL=y
+CONFIG_WATCHDOG=y
+CONFIG_RENESAS_WDT=y
 CONFIG_MFD_SPMI_PMIC=y
 CONFIG_MFD_SEC_CORE=y
 CONFIG_MFD_HI655X_PMIC=y
index e63d23bad36ea2a932723449b1cee6eedd05da77..49095fc4b482d3f455aa4d7d0fe48aa6f0252abc 100644 (file)
@@ -43,6 +43,8 @@
 
 #define KVM_VCPU_MAX_FEATURES 4
 
+#define KVM_REQ_VCPU_EXIT      8
+
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_dev_ioctl_check_extension(long ext);
@@ -327,6 +329,10 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
+void kvm_arm_halt_guest(struct kvm *kvm);
+void kvm_arm_resume_guest(struct kvm *kvm);
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
 
 u64 __kvm_call_hyp(void *hypfn, ...);
 #define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
index fe612a9625766b5fff3698e2d1014d7376744e4f..75ea4207975760f92a1b8d091a7e04b8c7dd0e7a 100644 (file)
@@ -30,6 +30,9 @@ struct kvm_decode {
        bool sign_extend;
 };
 
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
+
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 phys_addr_t fault_ipa);
index 1caadc24e3fe99c1ed3169cfbb238ab279ad0ff4..043d17a21342e8168ca2e71933b72316cfd6eb5f 100644 (file)
@@ -13,4 +13,7 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
+
+#define __ARCH_WANT_RENAMEAT
+
 #include <asm-generic/unistd.h>
index 32c3c6e70119f4e123498b85f1bc28398e333b13..713ca824f266a7fa771b3609ec8dec5264eef3ed 100644 (file)
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct frame_tail buftail;
        unsigned long err;
@@ -76,7 +76,7 @@ struct compat_frame_tail {
 
 static struct compat_frame_tail __user *
 compat_user_backtrace(struct compat_frame_tail __user *tail,
-                     struct perf_callchain_entry *entry)
+                     struct perf_callchain_entry_ctx *entry)
 {
        struct compat_frame_tail buftail;
        unsigned long err;
@@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 }
 #endif /* CONFIG_COMPAT */
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                         struct pt_regs *regs)
 {
        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct frame_tail __user *)regs->regs[29];
 
-               while (entry->nr < sysctl_perf_event_max_stack &&
+               while (entry->nr < entry->max_stack &&
                       tail && !((unsigned long)tail & 0xf))
                        tail = user_backtrace(tail, entry);
        } else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
                tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-               while ((entry->nr < sysctl_perf_event_max_stack) &&
+               while ((entry->nr < entry->max_stack) &&
                        tail && !((unsigned long)tail & 0x3))
                        tail = compat_user_backtrace(tail, entry);
 #endif
@@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
  */
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, frame->pc);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        struct stackframe frame;
index aa2e34e99582dfd1bd308d0a39829f3ae33a99c5..c4f26ef91e772f7af71b79c35d13a11f5f063e03 100644 (file)
@@ -54,6 +54,13 @@ config KVM_ARM_PMU
          Adds support for a virtual Performance Monitoring Unit (PMU) in
          virtual machines.
 
+config KVM_NEW_VGIC
+       bool "New VGIC implementation"
+       depends on KVM
+       default y
+        ---help---
+          uses the new VGIC implementation
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index 122cff482ac459b44346dc3eeca06739338ad8ef..a7a958ca29d56ab11e2e267f34baa3012940fc53 100644 (file)
@@ -20,10 +20,22 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
+ifeq ($(CONFIG_KVM_NEW_VGIC),y)
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
+else
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
+endif
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
index 4d1ac81870d27e6f272abde088e0f5e8290c80d1..e9e0e6db73f6bdad6fbe77b7dda51ba04bf6734b 100644 (file)
@@ -162,7 +162,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
                esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT);
 
        if (!is_iabt)
-               esr |= ESR_ELx_EC_DABT_LOW;
+               esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
 
        vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_ELx_FSC_EXTABT;
 }
index e7d09a614d1028b4e0673f73f05d3fa0388e23f5..12d73d9d81f55ceba344eb91ca95130a308a35f9 100644 (file)
@@ -14,6 +14,7 @@
  *   more details.
  */
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_CLONE
 
 /* Use the standard ABI for syscalls. */
index 5aa3f516231081ccc92e29999e9826b8d61a85b4..3f646c787e584e40a097208abd9fdb05221cf755 100644 (file)
@@ -157,6 +157,7 @@ struct mtd_info *__init crisv32_nand_flash_probe(void)
        /* 20 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        /* this->bbt_options = NAND_BBT_USE_FLASH; */
index a7c17b0f172a360e4b57465ad7c19b8c5196ac2f..a74540514bdbdcde086259df8f08a9beec340cd0 100644 (file)
@@ -148,6 +148,7 @@ struct mtd_info *__init crisv32_nand_flash_probe(void)
        /* 20 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        /* this->bbt_options = NAND_BBT_USE_FLASH; */
index 7643633f1330fd148b1e702973e15fb166137735..613bfe6f52727d5739e25a6a8102ac1b5b041ac6 100644 (file)
@@ -23,7 +23,6 @@ LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -estartup -T $(obj)/vmlinux.lds \
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 7a2eb698def355b6c418782ae47e3653289394a8..7dd20ef7625adeeba03fba80cde6aaa70d91e3c2 100644 (file)
@@ -1,3 +1,5 @@
 #define __ARCH_NOMMU
 
+#define __ARCH_WANT_RENAMEAT
+
 #include <asm-generic/unistd.h>
index ffee405d68034a004168e0763e26fd146273d14f..21517600432b42e529ea9a1bc2fb68771e97b16a 100644 (file)
@@ -27,6 +27,7 @@
  */
 
 #define sys_mmap2 sys_mmap_pgoff
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_VFORK
index 970d0bd99621b32eef163debccd8c2d41a970250..c100d780f1eb21f8b5c38ad33c3c562dc626eca5 100644 (file)
@@ -95,8 +95,8 @@ define archhelp
   echo '* unwcheck     - Check vmlinux for invalid unwind info'
 endef
 
-archprepare: make_nr_irqs_h FORCE
-PHONY += make_nr_irqs_h FORCE
+archprepare: make_nr_irqs_h
+PHONY += make_nr_irqs_h
 
-make_nr_irqs_h: FORCE
+make_nr_irqs_h:
        $(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h
index 01729c2979ba2634f49f7c4403aab4e95652fead..0606a727aab234382f6ea790db35f434f108a500 100644 (file)
@@ -19,7 +19,6 @@ LDFLAGS_vmlinux := -T
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index b80b8e899d22dfd202bbf8b9403a48b342d3462e..459b6ec1584862e20212a34c56b7ed7ce57082d3 100644 (file)
@@ -7,6 +7,8 @@
  * (at your option) any later version.
  */
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
 
index 252abc12a5a31f6221b106b060ffd6926a8a6b5b..3e8e048040dfc96c0f73fbd48fb7c75d036937d9 100644 (file)
@@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr)
 
 static struct metag_frame __user *
 user_backtrace(struct metag_frame __user *user_frame,
-              struct perf_callchain_entry *entry)
+              struct perf_callchain_entry_ctx *entry)
 {
        struct metag_frame frame;
        unsigned long calladdr;
@@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        unsigned long sp = regs->ctx.AX[0].U0;
        struct metag_frame __user *frame;
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        --frame;
 
-       while ((entry->nr < sysctl_perf_event_max_stack) && frame)
+       while ((entry->nr < entry->max_stack) && frame)
                frame = user_backtrace(frame, entry);
 }
 
@@ -78,13 +78,13 @@ static int
 callchain_trace(struct stackframe *fr,
                void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
        perf_callchain_store(entry, fr->pc);
        return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stackframe fr;
 
index 76ed17b56fead0092462c7f5498767b281c0339a..805ae5d712e8baa63095199f1601ce548c26d606 100644 (file)
@@ -38,6 +38,6 @@
 
 #endif /* __ASSEMBLY__ */
 
-#define __NR_syscalls         389
+#define __NR_syscalls         392
 
 #endif /* _ASM_MICROBLAZE_UNISTD_H */
index 32850c73be09b915309fe892486c0d5187eed0a5..a8bd3fa28bc7f4e97158fff49d25443524647b0d 100644 (file)
 #define __NR_memfd_create      386
 #define __NR_bpf               387
 #define __NR_execveat          388
+#define __NR_userfaultfd       389
+#define __NR_membarrier                390
+#define __NR_mlock2            391
 
 #endif /* _UAPI_ASM_MICROBLAZE_UNISTD_H */
index 29c8568ec55c32776139cd79e1e441d23638078d..6b3dd99126d753a22a9ed270ec92761c2f936e27 100644 (file)
@@ -389,3 +389,6 @@ ENTRY(sys_call_table)
        .long sys_memfd_create
        .long sys_bpf
        .long sys_execveat
+       .long sys_userfaultfd
+       .long sys_membarrier            /* 390 */
+       .long sys_mlock2
index 35654be3f1c03289bda596b184b64c2d734d7655..14cba600da7ae4fff9573cfcb27465e8f322e73d 100644 (file)
@@ -48,6 +48,8 @@ static int global_phb_number;         /* Global phb counter */
 resource_size_t isa_mem_base;
 
 unsigned long isa_io_base;
+EXPORT_SYMBOL(isa_io_base);
+
 static int pci_bus_count;
 
 struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
index 398733e3e2cf65006541cf5c3eff6c68b06b0eb0..7f7b0fc554da5c5a706768f73cfd730d7ed9ae85 100644 (file)
@@ -27,7 +27,7 @@ struct jz_nand_platform_data {
 
        unsigned char banks[JZ_NAND_NUM_BANKS];
 
-       void (*ident_callback)(struct platform_device *, struct nand_chip *,
+       void (*ident_callback)(struct platform_device *, struct mtd_info *,
                                struct mtd_partition **, int *num_partitions);
 };
 
index 4e3f9b7a02e4997101843a9283c45cb97fa81866..258fd03c9ef5aa98145cb69b699f39f5dd92e81a 100644 (file)
 #define QI_LB60_GPIO_KEYIN8            JZ_GPIO_PORTD(26)
 
 /* NAND */
-static struct nand_ecclayout qi_lb60_ecclayout_1gb = {
-       .eccbytes = 36,
-       .eccpos = {
-               6,  7,  8,  9,  10, 11, 12, 13,
-               14, 15, 16, 17, 18, 19, 20, 21,
-               22, 23, 24, 25, 26, 27, 28, 29,
-               30, 31, 32, 33, 34, 35, 36, 37,
-               38, 39, 40, 41
-       },
-       .oobfree = {
-               { .offset = 2, .length = 4 },
-               { .offset = 42, .length = 22 }
-       },
-};
 
 /* Early prototypes of the QI LB60 had only 1GB of NAND.
  * In order to support these devices as well the partition and ecc layout is
@@ -84,25 +70,6 @@ static struct mtd_partition qi_lb60_partitions_1gb[] = {
        },
 };
 
-static struct nand_ecclayout qi_lb60_ecclayout_2gb = {
-       .eccbytes = 72,
-       .eccpos = {
-               12, 13, 14, 15, 16, 17, 18, 19,
-               20, 21, 22, 23, 24, 25, 26, 27,
-               28, 29, 30, 31, 32, 33, 34, 35,
-               36, 37, 38, 39, 40, 41, 42, 43,
-               44, 45, 46, 47, 48, 49, 50, 51,
-               52, 53, 54, 55, 56, 57, 58, 59,
-               60, 61, 62, 63, 64, 65, 66, 67,
-               68, 69, 70, 71, 72, 73, 74, 75,
-               76, 77, 78, 79, 80, 81, 82, 83
-       },
-       .oobfree = {
-               { .offset = 2, .length = 10 },
-               { .offset = 84, .length = 44 },
-       },
-};
-
 static struct mtd_partition qi_lb60_partitions_2gb[] = {
        {
                .name = "NAND BOOT partition",
@@ -121,19 +88,67 @@ static struct mtd_partition qi_lb60_partitions_2gb[] = {
        },
 };
 
+/*
+ * qi_lb60_ooblayout_ecc - describe the ECC byte region of the OOB area
+ *
+ * Only section 0 exists; any other section yields -ERANGE.  The region is
+ * 36 bytes at offset 6, and both values are doubled (72 bytes at offset 12)
+ * when mtd->oobsize is 128.  Returns 0 after filling in *oobregion.
+ */
+static int qi_lb60_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 36;
+       oobregion->offset = 6;
+
+       if (mtd->oobsize == 128) {
+               oobregion->length *= 2;
+               oobregion->offset *= 2;
+       }
+
+       return 0;
+}
+
+/*
+ * qi_lb60_ooblayout_free - describe the free (non-ECC) regions of the OOB area
+ *
+ * Section 0 runs from offset 2 up to the first ECC byte; section 1 runs from
+ * the byte after the ECC region to the end of the OOB area (mtd->oobsize).
+ * Sections greater than 1 yield -ERANGE.  The ECC offset/length used here are
+ * 6/36, doubled when mtd->oobsize is 128.  Returns 0 after filling in
+ * *oobregion.
+ *
+ * NOTE(review): a negative section is not rejected and would take the
+ * "section 1" branch - presumably callers only pass section >= 0; confirm.
+ */
+static int qi_lb60_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       int eccbytes = 36, eccoff = 6;
+
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->oobsize == 128) {
+               eccbytes *= 2;
+               eccoff *= 2;
+       }
+
+       if (!section) {
+               oobregion->offset = 2;
+               oobregion->length = eccoff - 2;
+       } else {
+               oobregion->offset = eccoff + eccbytes;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+/* OOB layout callbacks: .ecc reports the ECC region, .free the free regions. */
+static const struct mtd_ooblayout_ops qi_lb60_ooblayout_ops = {
+       .ecc = qi_lb60_ooblayout_ecc,
+       .free = qi_lb60_ooblayout_free,
+};
+
 static void qi_lb60_nand_ident(struct platform_device *pdev,
-               struct nand_chip *chip, struct mtd_partition **partitions,
+               struct mtd_info *mtd, struct mtd_partition **partitions,
                int *num_partitions)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
        if (chip->page_shift == 12) {
-               chip->ecc.layout = &qi_lb60_ecclayout_2gb;
                *partitions = qi_lb60_partitions_2gb;
                *num_partitions = ARRAY_SIZE(qi_lb60_partitions_2gb);
        } else {
-               chip->ecc.layout = &qi_lb60_ecclayout_1gb;
                *partitions = qi_lb60_partitions_1gb;
                *num_partitions = ARRAY_SIZE(qi_lb60_partitions_1gb);
        }
+
+       mtd_set_ooblayout(mtd, &qi_lb60_ooblayout_ops);
 }
 
 static struct jz_nand_platform_data qi_lb60_nand_pdata = {
index 5021c546ad07d3e28b7d0ac1969448c32b9a93e2..d64056e0bb567ad9c18174b2cc6aa584579caa40 100644 (file)
@@ -25,8 +25,8 @@
  * the user stack callchains, we will add it here.
  */
 
-static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
-       unsigned long reg29)
+static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
+                                   unsigned long reg29)
 {
        unsigned long *sp = (unsigned long *)reg29;
        unsigned long addr;
@@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
                addr = *sp++;
                if (__kernel_text_address(addr)) {
                        perf_callchain_store(entry, addr);
-                       if (entry->nr >= sysctl_perf_event_max_stack)
+                       if (entry->nr >= entry->max_stack)
                                break;
                }
        }
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
-                     struct pt_regs *regs)
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
+                          struct pt_regs *regs)
 {
        unsigned long sp = regs->regs[29];
 #ifdef CONFIG_KALLSYMS
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
        }
        do {
                perf_callchain_store(entry, pc);
-               if (entry->nr >= sysctl_perf_event_max_stack)
+               if (entry->nr >= entry->max_stack)
                        break;
                pc = unwind_stack(current, &sp, pc, &ra);
        } while (pc);
index 08a95e171685979eae6cbb5bfef89d7043e57f17..5f56f9de10616d0954a58d4e27b60aa6d966e4a1 100644 (file)
@@ -8,7 +8,6 @@ LDFLAGS_vmlinux := -Ttext $(CONFIG_KERNEL_ZIMAGE_BASE_ADDRESS) -e startup_32
 
 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 2328f82ba2a8a47527c45dc99ba69c97e8bf6f0a..e74afc12d5163259c200e0280a382f64355d0196 100644 (file)
@@ -20,7 +20,7 @@ UTS_SYSNAME = Linux
 
 export MMU
 
-LIBGCC         := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+LIBGCC         := $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)
 
 KBUILD_CFLAGS += -pipe -D__linux__ -D__ELF__
 KBUILD_CFLAGS += $(if $(CONFIG_NIOS2_HW_MUL_SUPPORT),-mhw-mul,-mno-hw-mul)
@@ -53,7 +53,7 @@ all: vmImage
 archclean:
        $(Q)$(MAKE) $(clean)=$(nios2-boot)
 
-%.dtb:
+%.dtb: | scripts
        $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
 
 dtbs:
index 5b0fb346d88818022acbd4789a33207688731e6c..d5921c9a9726050be73fff222d0520502a849773 100644 (file)
@@ -11,7 +11,6 @@ LDFLAGS_vmlinux := -T
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 LDFLAGS_piggy.o := -r --format binary --oformat elf32-littlenios2 -T
 
index c4bf7951046153b3a39f5c11a21dadfe3de0f74a..51a32c71ce2bd0a5950701f402a1b963d54169df 100644 (file)
@@ -17,6 +17,8 @@
 
  #define sys_mmap2 sys_mmap_pgoff
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls */
 #include <asm-generic/unistd.h>
 
index ce40b71df0069d8516a94141556cb37ab32819ee..471905bd77452b06ad8bca2143038ef31da41f81 100644 (file)
@@ -20,6 +20,7 @@
 
 #define sys_mmap2 sys_mmap_pgoff
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYS_FORK
 #define __ARCH_WANT_SYS_CLONE
 
index 3d498a676551d0fa778f0bafe0cbced3fe44765d..dc117385ce2e17480f15771fdb96969f7e1971bf 100644 (file)
@@ -6,6 +6,7 @@ config PARISC
        select HAVE_OPROFILE
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_SYSCALL_TRACEPOINTS
        select ARCH_WANT_FRAME_POINTERS
        select RTC_CLASS
        select RTC_DRV_GENERIC
@@ -31,6 +32,8 @@ config PARISC
        select HAVE_DEBUG_STACKOVERFLOW
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_SECCOMP_FILTER
+       select HAVE_ARCH_TRACEHOOK
+       select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT)
        select ARCH_NO_COHERENT_DMA_MMAP
        select CPU_NO_EFFICIENT_FFS
 
index 0a90b965cccbefe172be5dde879b5035b4589685..7ada309008073ac62021942709107f526a76a11f 100644 (file)
@@ -52,8 +52,7 @@ extern void __cmpxchg_called_with_bad_pointer(void);
 /* __cmpxchg_u32/u64 defined in arch/parisc/lib/bitops.c */
 extern unsigned long __cmpxchg_u32(volatile unsigned int *m, unsigned int old,
                                   unsigned int new_);
-extern unsigned long __cmpxchg_u64(volatile unsigned long *ptr,
-                                  unsigned long old, unsigned long new_);
+extern u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new_);
 
 /* don't worry...optimizer will get rid of most of this */
 static inline unsigned long
@@ -61,7 +60,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size)
 {
        switch (size) {
 #ifdef CONFIG_64BIT
-       case 8: return __cmpxchg_u64((unsigned long *)ptr, old, new_);
+       case 8: return __cmpxchg_u64((u64 *)ptr, old, new_);
 #endif
        case 4: return __cmpxchg_u32((unsigned int *)ptr,
                                     (unsigned int)old, (unsigned int)new_);
@@ -86,7 +85,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 {
        switch (size) {
 #ifdef CONFIG_64BIT
-       case 8: return __cmpxchg_u64((unsigned long *)ptr, old, new_);
+       case 8: return __cmpxchg_u64((u64 *)ptr, old, new_);
 #endif
        case 4: return __cmpxchg_u32(ptr, old, new_);
        default:
@@ -111,4 +110,6 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 #endif
 
+#define cmpxchg64(ptr, o, n) __cmpxchg_u64(ptr, o, n)
+
 #endif /* _ASM_PARISC_CMPXCHG_H_ */
index 8ce8b85ca588411c469ebb4527db83bfe0a48f35..5637ac962f8ec9b43551847a4d3a9202bf45726e 100644 (file)
@@ -99,7 +99,7 @@ struct eeprom_eisa_slot_info
 #define HPEE_MEMORY_DECODE_24BITS 0x04
 #define HPEE_MEMORY_DECODE_32BITS 0x08
 /* byte 2 and 3 are a 16bit LE value
- * containging the memory size in kilobytes */
+ * containing the memory size in kilobytes */
 /* byte 4,5,6 are a 24bit LE value
  * containing the memory base address */
 
@@ -135,7 +135,7 @@ struct eeprom_eisa_slot_info
 #define HPEE_PORT_SHARED    0x40
 #define HPEE_PORT_MORE      0x80
 /* byte 1 and 2 is a 16bit LE value
- * conating the start port number */
+ * containing the start port number */
 
 #define HPEE_PORT_INIT_MAX_LEN     60 /* in bytes here */
 /* port init entry byte 0 */
index 24cd81d58d706faafe469c10a85f1182a28b958a..d635c6b0269df84337cf0abc23f87603aad99296 100644 (file)
@@ -6,6 +6,8 @@ extern void mcount(void);
 
 #define MCOUNT_INSN_SIZE 4
 
+extern unsigned long sys_call_table[];
+
 extern unsigned long return_address(unsigned int);
 
 #define ftrace_return_address(n) return_address(n)
index 49df14805a9b44bba6857ec4138fe62164e935ab..ac8bd586ace8d17329e2536244b7833c0db46c83 100644 (file)
@@ -35,70 +35,57 @@ static inline int
 futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
        unsigned long int flags;
-       u32 val;
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
        int oparg = (encoded_op << 8) >> 20;
        int cmparg = (encoded_op << 20) >> 20;
-       int oldval = 0, ret;
+       int oldval, ret;
+       u32 tmp;
+
        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
                oparg = 1 << oparg;
 
        if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
                return -EFAULT;
 
+       _futex_spin_lock_irqsave(uaddr, &flags);
        pagefault_disable();
 
-       _futex_spin_lock_irqsave(uaddr, &flags);
+       ret = -EFAULT;
+       if (unlikely(get_user(oldval, uaddr) != 0))
+               goto out_pagefault_enable;
+
+       ret = 0;
+       tmp = oldval;
 
        switch (op) {
        case FUTEX_OP_SET:
-               /* *(int *)UADDR2 = OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret)
-                       ret = put_user(oparg, uaddr);
+               tmp = oparg;
                break;
        case FUTEX_OP_ADD:
-               /* *(int *)UADDR2 += OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval + oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp += oparg;
                break;
        case FUTEX_OP_OR:
-               /* *(int *)UADDR2 |= OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval | oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp |= oparg;
                break;
        case FUTEX_OP_ANDN:
-               /* *(int *)UADDR2 &= ~OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval & ~oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp &= ~oparg;
                break;
        case FUTEX_OP_XOR:
-               /* *(int *)UADDR2 ^= OPARG; */
-               ret = get_user(oldval, uaddr);
-               if (!ret) {
-                       val = oldval ^ oparg;
-                       ret = put_user(val, uaddr);
-               }
+               tmp ^= oparg;
                break;
        default:
                ret = -ENOSYS;
        }
 
-       _futex_spin_unlock_irqrestore(uaddr, &flags);
+       if (ret == 0 && unlikely(put_user(tmp, uaddr) != 0))
+               ret = -EFAULT;
 
+out_pagefault_enable:
        pagefault_enable();
+       _futex_spin_unlock_irqrestore(uaddr, &flags);
 
-       if (!ret) {
+       if (ret == 0) {
                switch (cmp) {
                case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
                case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
@@ -112,12 +99,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
        return ret;
 }
 
-/* Non-atomic version */
 static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
                              u32 oldval, u32 newval)
 {
-       int ret;
        u32 val;
        unsigned long flags;
 
@@ -137,17 +122,20 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
         */
 
        _futex_spin_lock_irqsave(uaddr, &flags);
+       if (unlikely(get_user(val, uaddr) != 0)) {
+               _futex_spin_unlock_irqrestore(uaddr, &flags);
+               return -EFAULT;
+       }
 
-       ret = get_user(val, uaddr);
-
-       if (!ret && val == oldval)
-               ret = put_user(newval, uaddr);
+       if (val == oldval && unlikely(put_user(newval, uaddr) != 0)) {
+               _futex_spin_unlock_irqrestore(uaddr, &flags);
+               return -EFAULT;
+       }
 
        *uval = val;
-
        _futex_spin_unlock_irqrestore(uaddr, &flags);
 
-       return ret;
+       return 0;
 }
 
 #endif /*__KERNEL__*/
index 8121aa6db2ff21ad37510879dd82a3fe7ba7fa29..8be707e1b6c77f291a722a0f2b4b1cdb62a1f273 100644 (file)
@@ -40,7 +40,7 @@
    memory to indicate to the compiler that the assembly code reads
    or writes to items other than those listed in the input and output
    operands.  This may pessimize the code somewhat but __ldcw is
-   usually used within code blocks surrounded by memory barriors.  */
+   usually used within code blocks surrounded by memory barriers.  */
 #define __ldcw(a) ({                                           \
        unsigned __ret;                                         \
        __asm__ __volatile__(__LDCW " 0(%1),%0"                 \
index 637ce8d6f3752425371acee122b207d8e5af0b50..5e0b4e6bd99d14c94437ae409faca2ae413d9cde 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/err.h>
 #include <asm/ptrace.h>
 
+#define NR_syscalls (__NR_Linux_syscalls)
+
 static inline long syscall_get_nr(struct task_struct *tsk,
                                  struct pt_regs *regs)
 {
@@ -33,12 +35,19 @@ static inline void syscall_get_arguments(struct task_struct *tsk,
                args[1] = regs->gr[25];
        case 1:
                args[0] = regs->gr[26];
+       case 0:
                break;
        default:
                BUG();
        }
 }
 
+/*
+ * Return the syscall return value held in the saved register set, which is
+ * read from general register gr[28].  The task argument is not used.
+ */
+static inline long syscall_get_return_value(struct task_struct *task,
+                                               struct pt_regs *regs)
+{
+       return regs->gr[28];
+}
+
 static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
index e96e693fd58ca0df4a231a35a6f753e447dac1cb..7581330ea35be1e15498cf5cef9bbcbd3889aab9 100644 (file)
@@ -55,6 +55,7 @@ struct thread_info {
 #define TIF_SINGLESTEP         9       /* single stepping? */
 #define TIF_BLOCKSTEP          10      /* branch stepping? */
 #define TIF_SECCOMP            11      /* secure computing */
+#define TIF_SYSCALL_TRACEPOINT 12      /* syscall tracepoint instrumentation */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
@@ -66,12 +67,13 @@ struct thread_info {
 #define _TIF_SINGLESTEP                (1 << TIF_SINGLESTEP)
 #define _TIF_BLOCKSTEP         (1 << TIF_BLOCKSTEP)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
+#define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
 
 #define _TIF_USER_WORK_MASK     (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
                                  _TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP |        \
                                 _TIF_BLOCKSTEP | _TIF_SYSCALL_AUDIT | \
-                                _TIF_SECCOMP)
+                                _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT)
 
 #ifdef CONFIG_64BIT
 # ifdef CONFIG_COMPAT
index 7955e43f3f3f27558da65b100b9c3f64ebe26b02..0f59fd9ca20526d8a27066c724ab2499965ec62c 100644 (file)
@@ -40,14 +40,10 @@ static inline long access_ok(int type, const void __user * addr,
 #define get_user __get_user
 
 #if !defined(CONFIG_64BIT)
-#define LDD_KERNEL(ptr)                BUILD_BUG()
-#define LDD_USER(ptr)          BUILD_BUG()
-#define STD_KERNEL(x, ptr)     __put_kernel_asm64(x, ptr)
+#define LDD_USER(ptr)          __get_user_asm64(ptr)
 #define STD_USER(x, ptr)       __put_user_asm64(x, ptr)
 #else
-#define LDD_KERNEL(ptr)                __get_kernel_asm("ldd", ptr)
 #define LDD_USER(ptr)          __get_user_asm("ldd", ptr)
-#define STD_KERNEL(x, ptr)     __put_kernel_asm("std", x, ptr)
 #define STD_USER(x, ptr)       __put_user_asm("std", x, ptr)
 #endif
 
@@ -80,70 +76,70 @@ struct exception_data {
        unsigned long fault_addr;
 };
 
+/*
+ * load_sr2() preloads the space register %sr2 - based on the value of
+ * get_fs() - with either a value of 0 to access kernel space (KERNEL_DS which
+ * is 0), or with the current value of %sr3 to access user space (USER_DS)
+ * memory. The following __get_user_asm() and __put_user_asm() functions have
+ * %sr2 hard-coded to access the requested memory.
+ *
+ * NOTE(review): the mfsp instruction writes operand %0, which is declared
+ * only as an input ("r"(get_fs())) - confirm the compiler cannot rely on
+ * that register still holding get_fs() after this asm statement.
+ */
+#define load_sr2() \
+       __asm__(" or,=  %0,%%r0,%%r0\n\t"       \
+               " mfsp %%sr3,%0\n\t"            \
+               " mtsp %0,%%sr2\n\t"            \
+               : : "r"(get_fs()) : )
+
 #define __get_user(x, ptr)                               \
 ({                                                       \
        register long __gu_err __asm__ ("r8") = 0;       \
        register long __gu_val __asm__ ("r9") = 0;       \
                                                         \
-       if (segment_eq(get_fs(), KERNEL_DS)) {           \
-           switch (sizeof(*(ptr))) {                    \
-           case 1: __get_kernel_asm("ldb", ptr); break; \
-           case 2: __get_kernel_asm("ldh", ptr); break; \
-           case 4: __get_kernel_asm("ldw", ptr); break; \
-           case 8: LDD_KERNEL(ptr); break;              \
-           default: BUILD_BUG(); break;                 \
-           }                                            \
-       }                                                \
-       else {                                           \
-           switch (sizeof(*(ptr))) {                    \
+       load_sr2();                                      \
+       switch (sizeof(*(ptr))) {                        \
            case 1: __get_user_asm("ldb", ptr); break;   \
            case 2: __get_user_asm("ldh", ptr); break;   \
            case 4: __get_user_asm("ldw", ptr); break;   \
            case 8: LDD_USER(ptr);  break;               \
            default: BUILD_BUG(); break;                 \
-           }                                            \
        }                                                \
                                                         \
        (x) = (__force __typeof__(*(ptr))) __gu_val;     \
        __gu_err;                                        \
 })
 
-#define __get_kernel_asm(ldx, ptr)                      \
-       __asm__("\n1:\t" ldx "\t0(%2),%0\n\t"           \
+#define __get_user_asm(ldx, ptr)                        \
+       __asm__("\n1:\t" ldx "\t0(%%sr2,%2),%0\n\t"     \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_1)\
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "r"(ptr), "1"(__gu_err)               \
                : "r1");
 
-#define __get_user_asm(ldx, ptr)                        \
-       __asm__("\n1:\t" ldx "\t0(%%sr3,%2),%0\n\t"     \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_1)\
-               : "=r"(__gu_val), "=r"(__gu_err)        \
+#if !defined(CONFIG_64BIT)
+
+/*
+ * 64-bit load for kernels built without CONFIG_64BIT: two ldw instructions
+ * read the words at offsets 0 and 4 from ptr through %sr2 into %0 and %R0.
+ * Each load has its own exception table entry (fixup_get_user_skip_2 for
+ * the first, fixup_get_user_skip_1 for the second).  Uses the __gu_val and
+ * __gu_err variables of the enclosing __get_user().
+ *
+ * NOTE(review): %R0 names the second half of operand 0, but __get_user()
+ * declares __gu_val as "register long" - confirm that operand is wide
+ * enough to hold both words on a 32-bit kernel.
+ */
+#define __get_user_asm64(ptr)                          \
+       __asm__("\n1:\tldw 0(%%sr2,%2),%0"              \
+               "\n2:\tldw 4(%%sr2,%2),%R0\n\t"         \
+               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_get_user_skip_2)\
+               ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_get_user_skip_1)\
+               : "=r"(__gu_val), "=r"(__gu_err)        \
+               : "r"(ptr), "1"(__gu_err)               \
+               : "r1");
 
+#endif /* !defined(CONFIG_64BIT) */
+
+
 #define __put_user(x, ptr)                                      \
 ({                                                             \
        register long __pu_err __asm__ ("r8") = 0;              \
         __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x);      \
                                                                \
-       if (segment_eq(get_fs(), KERNEL_DS)) {                  \
-           switch (sizeof(*(ptr))) {                           \
-           case 1: __put_kernel_asm("stb", __x, ptr); break;   \
-           case 2: __put_kernel_asm("sth", __x, ptr); break;   \
-           case 4: __put_kernel_asm("stw", __x, ptr); break;   \
-           case 8: STD_KERNEL(__x, ptr); break;                \
-           default: BUILD_BUG(); break;                        \
-           }                                                   \
-       }                                                       \
-       else {                                                  \
-           switch (sizeof(*(ptr))) {                           \
+       load_sr2();                                             \
+       switch (sizeof(*(ptr))) {                               \
            case 1: __put_user_asm("stb", __x, ptr); break;     \
            case 2: __put_user_asm("sth", __x, ptr); break;     \
            case 4: __put_user_asm("stw", __x, ptr); break;     \
            case 8: STD_USER(__x, ptr); break;                  \
            default: BUILD_BUG(); break;                        \
-           }                                                   \
        }                                                       \
                                                                \
        __pu_err;                                               \
@@ -159,17 +155,9 @@ struct exception_data {
  * r8/r9 are already listed as err/val.
  */
 
-#define __put_kernel_asm(stx, x, ptr)                       \
-       __asm__ __volatile__ (                              \
-               "\n1:\t" stx "\t%2,0(%1)\n\t"               \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_1)\
-               : "=r"(__pu_err)                            \
-               : "r"(ptr), "r"(x), "0"(__pu_err)           \
-               : "r1")
-
 #define __put_user_asm(stx, x, ptr)                         \
        __asm__ __volatile__ (                              \
-               "\n1:\t" stx "\t%2,0(%%sr3,%1)\n\t"         \
+               "\n1:\t" stx "\t%2,0(%%sr2,%1)\n\t"         \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_1)\
                : "=r"(__pu_err)                            \
                : "r"(ptr), "r"(x), "0"(__pu_err)           \
@@ -178,21 +166,10 @@ struct exception_data {
 
 #if !defined(CONFIG_64BIT)
 
-#define __put_kernel_asm64(__val, ptr) do {                \
-       __asm__ __volatile__ (                              \
-               "\n1:\tstw %2,0(%1)"                        \
-               "\n2:\tstw %R2,4(%1)\n\t"                   \
-               ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_2)\
-               ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_put_user_skip_1)\
-               : "=r"(__pu_err)                            \
-               : "r"(ptr), "r"(__val), "0"(__pu_err) \
-               : "r1");                                    \
-} while (0)
-
 #define __put_user_asm64(__val, ptr) do {                  \
        __asm__ __volatile__ (                              \
-               "\n1:\tstw %2,0(%%sr3,%1)"                  \
-               "\n2:\tstw %R2,4(%%sr3,%1)\n\t"             \
+               "\n1:\tstw %2,0(%%sr2,%1)"                  \
+               "\n2:\tstw %R2,4(%%sr2,%1)\n\t"             \
                ASM_EXCEPTIONTABLE_ENTRY(1b, fixup_put_user_skip_2)\
                ASM_EXCEPTIONTABLE_ENTRY(2b, fixup_put_user_skip_1)\
                : "=r"(__pu_err)                            \
index 702498f7705bf29c4f08507852031ea91def5a03..0609ff117f67e56c7d12bd3c1f312d97ebc9f286 100644 (file)
@@ -59,7 +59,7 @@
 #define PDC_MODEL_GET_BOOT__OP 8       /* returns boot test options    */
 #define PDC_MODEL_SET_BOOT__OP 9       /* set boot test options        */
 
-#define PA89_INSTRUCTION_SET   0x4     /* capatibilies returned        */
+#define PA89_INSTRUCTION_SET   0x4     /* capabilities returned        */
 #define PA90_INSTRUCTION_SET   0x8
 
 #define PDC_CACHE      5               /* return/set cache (& TLB) info*/
index c4fa6c8b9ad9fa865e7766bb774d7e4bfcada3c5..02ce2eb99a7f91ef66d0cdb783b9e3d87e493102 100644 (file)
  * N.B. gdb/strace care about the size and offsets within this
  * structure. If you change things, you may break object compatibility
  * for those applications.
+ *
+ * Please do NOT use this structure for future programs, but use
+ * user_regs_struct (see below) instead.
+ *
+ * It can be accessed through PTRACE_PEEKUSR/PTRACE_POKEUSR only.
  */
 
 struct pt_regs {
@@ -33,6 +38,45 @@ struct pt_regs {
        unsigned long ipsw;     /* CR22 */
 };
 
+/**
+ * struct user_regs_struct - User general purpose registers
+ *
+ * This is the user-visible general purpose register state structure
+ * which is used to define the elf_gregset_t.
+ *
+ * It can be accessed through PTRACE_GETREGSET with NT_PRSTATUS
+ * and through PTRACE_GETREGS.
+ */
+struct user_regs_struct {
+       unsigned long gr[32];   /* PSW is in gr[0] */
+       unsigned long sr[8];
+       unsigned long iaoq[2];
+       unsigned long iasq[2];
+       unsigned long sar;      /* CR11 */
+       unsigned long iir;      /* CR19 */
+       unsigned long isr;      /* CR20 */
+       unsigned long ior;      /* CR21 */
+       unsigned long ipsw;     /* CR22 */
+       unsigned long cr0;
+       unsigned long cr24, cr25, cr26, cr27, cr28, cr29, cr30, cr31;
+       unsigned long cr8, cr9, cr12, cr13, cr10, cr15;
+       unsigned long _pad[80-64];      /* the members above total 64 words; pad to ELF_NGREG (80) */
+};
+
+/**
+ * struct user_fp_struct - User floating point registers
+ *
+ * This is the user-visible floating point register state structure.
+ * It uses the same layout and size as elf_fpregset_t.
+ *
+ * It can be accessed through PTRACE_GETREGSET with NT_PRFPREG
+ * and through PTRACE_GETFPREGS.
+ */
+struct user_fp_struct {
+       __u64 fr[32];
+};
+
+
 /*
  * The numbers chosen here are somewhat arbitrary but absolutely MUST
  * not overlap with any of the number assigned in <linux/ptrace.h>.
@@ -43,5 +87,9 @@ struct pt_regs {
  */
 #define PTRACE_SINGLEBLOCK     12      /* resume execution until next branch */
 
+#define PTRACE_GETREGS         18      /* read a struct user_regs_struct from the child */
+#define PTRACE_SETREGS         19      /* write a struct user_regs_struct to the child */
+#define PTRACE_GETFPREGS       14      /* read a struct user_fp_struct from the child */
+#define PTRACE_SETFPREGS       15      /* write a struct user_fp_struct to the child */
 
 #endif /* _UAPI_PARISC_PTRACE_H */
index cc0ce92c93c78905c01ce06caaed92da41588db2..a9b9407f38f7c63a3ad9f42b0dcd7d11d1fb8d92 100644 (file)
 #define __NR_uselib              (__NR_Linux + 86)
 #define __NR_swapon              (__NR_Linux + 87)
 #define __NR_reboot              (__NR_Linux + 88)
-#define __NR_mmap2             (__NR_Linux + 89)
+#define __NR_mmap2               (__NR_Linux + 89)
 #define __NR_mmap                (__NR_Linux + 90)
 #define __NR_munmap              (__NR_Linux + 91)
 #define __NR_truncate            (__NR_Linux + 92)
 #define __NR_recv                (__NR_Linux + 98)
 #define __NR_statfs              (__NR_Linux + 99)
 #define __NR_fstatfs            (__NR_Linux + 100)
-#define __NR_stat64           (__NR_Linux + 101)
+#define __NR_stat64             (__NR_Linux + 101)
 /* #define __NR_socketcall         (__NR_Linux + 102) */
 #define __NR_syslog             (__NR_Linux + 103)
 #define __NR_setitimer          (__NR_Linux + 104)
 #define __NR_adjtimex           (__NR_Linux + 124)
 #define __NR_mprotect           (__NR_Linux + 125)
 #define __NR_sigprocmask        (__NR_Linux + 126)
-#define __NR_create_module      (__NR_Linux + 127)
+#define __NR_create_module      (__NR_Linux + 127) /* not used */
 #define __NR_init_module        (__NR_Linux + 128)
 #define __NR_delete_module      (__NR_Linux + 129)
-#define __NR_get_kernel_syms    (__NR_Linux + 130)
+#define __NR_get_kernel_syms    (__NR_Linux + 130) /* not used */
 #define __NR_quotactl           (__NR_Linux + 131)
 #define __NR_getpgid            (__NR_Linux + 132)
 #define __NR_fchdir             (__NR_Linux + 133)
 #define __NR_bdflush            (__NR_Linux + 134)
 #define __NR_sysfs              (__NR_Linux + 135)
 #define __NR_personality        (__NR_Linux + 136)
-#define __NR_afs_syscall        (__NR_Linux + 137) /* Syscall for Andrew File System */
+#define __NR_afs_syscall        (__NR_Linux + 137) /* not used */
 #define __NR_setfsuid           (__NR_Linux + 138)
 #define __NR_setfsgid           (__NR_Linux + 139)
 #define __NR__llseek            (__NR_Linux + 140)
 #define __NR_setresuid          (__NR_Linux + 164)
 #define __NR_getresuid          (__NR_Linux + 165)
 #define __NR_sigaltstack        (__NR_Linux + 166)
-#define __NR_query_module       (__NR_Linux + 167)
+#define __NR_query_module       (__NR_Linux + 167) /* not used */
 #define __NR_poll               (__NR_Linux + 168)
-#define __NR_nfsservctl         (__NR_Linux + 169)
+#define __NR_nfsservctl         (__NR_Linux + 169) /* not used */
 #define __NR_setresgid          (__NR_Linux + 170)
 #define __NR_getresgid          (__NR_Linux + 171)
 #define __NR_prctl              (__NR_Linux + 172)
 #define __NR_shmdt              (__NR_Linux + 193)
 #define __NR_shmget             (__NR_Linux + 194)
 #define __NR_shmctl             (__NR_Linux + 195)
-
-#define __NR_getpmsg           (__NR_Linux + 196) /* Somebody *wants* streams? */
-#define __NR_putpmsg           (__NR_Linux + 197)
-
+#define __NR_getpmsg            (__NR_Linux + 196) /* not used */
+#define __NR_putpmsg            (__NR_Linux + 197) /* not used */
 #define __NR_lstat64            (__NR_Linux + 198)
 #define __NR_truncate64         (__NR_Linux + 199)
 #define __NR_ftruncate64        (__NR_Linux + 200)
 #define __NR_getdents64         (__NR_Linux + 201)
 #define __NR_fcntl64            (__NR_Linux + 202)
-#define __NR_attrctl            (__NR_Linux + 203)
-#define __NR_acl_get            (__NR_Linux + 204)
-#define __NR_acl_set            (__NR_Linux + 205)
+#define __NR_attrctl            (__NR_Linux + 203) /* not used */
+#define __NR_acl_get            (__NR_Linux + 204) /* not used */
+#define __NR_acl_set            (__NR_Linux + 205) /* not used */
 #define __NR_gettid             (__NR_Linux + 206)
 #define __NR_readahead          (__NR_Linux + 207)
 #define __NR_tkill              (__NR_Linux + 208)
 #define __NR_futex              (__NR_Linux + 210)
 #define __NR_sched_setaffinity  (__NR_Linux + 211)
 #define __NR_sched_getaffinity  (__NR_Linux + 212)
-#define __NR_set_thread_area    (__NR_Linux + 213)
-#define __NR_get_thread_area    (__NR_Linux + 214)
+#define __NR_set_thread_area    (__NR_Linux + 213) /* not used */
+#define __NR_get_thread_area    (__NR_Linux + 214) /* not used */
 #define __NR_io_setup           (__NR_Linux + 215)
 #define __NR_io_destroy         (__NR_Linux + 216)
 #define __NR_io_getevents       (__NR_Linux + 217)
 #define __NR_mbind             (__NR_Linux + 260)
 #define __NR_get_mempolicy     (__NR_Linux + 261)
 #define __NR_set_mempolicy     (__NR_Linux + 262)
-#define __NR_vserver           (__NR_Linux + 263)
+#define __NR_vserver           (__NR_Linux + 263) /* not used */
 #define __NR_add_key           (__NR_Linux + 264)
 #define __NR_request_key       (__NR_Linux + 265)
 #define __NR_keyctl            (__NR_Linux + 266)
 #define __NR_kexec_load                (__NR_Linux + 300)
 #define __NR_utimensat         (__NR_Linux + 301)
 #define __NR_signalfd          (__NR_Linux + 302)
-#define __NR_timerfd           (__NR_Linux + 303)
+#define __NR_timerfd           (__NR_Linux + 303) /* not used */
 #define __NR_eventfd           (__NR_Linux + 304)
 #define __NR_fallocate         (__NR_Linux + 305)
 #define __NR_timerfd_create    (__NR_Linux + 306)
index 39127d3e70e56f2295b6e288f6642ed9908bcfbd..baa3d9d6e971f597326324220cfd3b091b2eec60 100644 (file)
         * boundary
         */
 
-       .text
+       .section .text.hot
        .align 2048
 
 ENTRY(fault_vector_20)
@@ -2019,6 +2019,7 @@ ftrace_stub:
        .procend
 ENDPROC(mcount)
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        .align 8
        .globl return_to_handler
        .type  return_to_handler, @function
@@ -2040,11 +2041,17 @@ parisc_return_to_handler:
 #endif
 
        /* call ftrace_return_to_handler(0) */
+       .import ftrace_return_to_handler,code
+       load32 ftrace_return_to_handler,%ret0
+       load32 .Lftrace_ret,%r2
 #ifdef CONFIG_64BIT
        ldo -16(%sp),%ret1              /* Reference param save area */
+       bve     (%ret0)
+#else
+       bv      %r0(%ret0)
 #endif
-       BL ftrace_return_to_handler,%r2
        ldi 0,%r26
+.Lftrace_ret:
        copy %ret0,%rp
 
        /* restore original return values */
@@ -2062,6 +2069,8 @@ parisc_return_to_handler:
        .procend
 ENDPROC(return_to_handler)
 
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_IRQSTACKS
index b13f9ec6f2946506c2b42ef4748b447652db81c1..a828a0adf52c0b19d14eb12219bf718c3cf37704 100644 (file)
 #include <asm/ftrace.h>
 
 
+#define __hot __attribute__ ((__section__ (".text.hot")))
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 /*
  * Hook the return address and push it in the stack of return addrs
  * in current thread info.
  */
-static void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+static void __hot prepare_ftrace_return(unsigned long *parent,
+                                       unsigned long self_addr)
 {
        unsigned long old;
        struct ftrace_graph_ent trace;
@@ -53,7 +56,7 @@ static void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-void notrace ftrace_function_trampoline(unsigned long parent,
+void notrace __hot ftrace_function_trampoline(unsigned long parent,
                                unsigned long self_addr,
                                unsigned long org_sp_gr3)
 {
index 8fb81a3915990dc741324b560986895f5025a9d5..b5458b37fc5be2463f77a86cdddf35e4c88ab3ef 100644 (file)
@@ -4,18 +4,20 @@
  * Copyright (C) 2000 Hewlett-Packard Co, Linuxcare Inc.
  * Copyright (C) 2000 Matthew Wilcox <matthew@wil.cx>
  * Copyright (C) 2000 David Huggins-Daines <dhd@debian.org>
- * Copyright (C) 2008 Helge Deller <deller@gmx.de>
+ * Copyright (C) 2008-2016 Helge Deller <deller@gmx.de>
  */
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/elf.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/personality.h>
+#include <linux/regset.h>
 #include <linux/security.h>
 #include <linux/seccomp.h>
 #include <linux/compat.h>
 /* PSW bits we allow the debugger to modify */
 #define USER_PSW_BITS  (PSW_N | PSW_B | PSW_V | PSW_CB)
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+/*
+ * These are our native regset flavors; the values index the regset tables.
+ */
+enum parisc_regset {
+       REGSET_GENERAL, /* general registers, served by gpr_get()/gpr_set() */
+       REGSET_FP       /* floating point registers, served by fpr_get()/fpr_set() */
+};
+
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -114,6 +127,7 @@ void user_enable_block_step(struct task_struct *task)
 long arch_ptrace(struct task_struct *child, long request,
                 unsigned long addr, unsigned long data)
 {
+       unsigned long __user *datap = (unsigned long __user *)data;
        unsigned long tmp;
        long ret = -EIO;
 
@@ -126,7 +140,7 @@ long arch_ptrace(struct task_struct *child, long request,
                     addr >= sizeof(struct pt_regs))
                        break;
                tmp = *(unsigned long *) ((char *) task_regs(child) + addr);
-               ret = put_user(tmp, (unsigned long __user *) data);
+               ret = put_user(tmp, datap);
                break;
 
        /* Write the word at location addr in the USER area.  This will need
@@ -165,6 +179,34 @@ long arch_ptrace(struct task_struct *child, long request,
                }
                break;
 
+       case PTRACE_GETREGS:    /* Get all gp regs from the child. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_GENERAL,
+                                          0, sizeof(struct user_regs_struct),
+                                          datap);
+
+       case PTRACE_SETREGS:    /* Set all gp regs in the child. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_GENERAL,
+                                            0, sizeof(struct user_regs_struct),
+                                            datap);
+
+       case PTRACE_GETFPREGS:  /* Get the child FPU state. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_FP,
+                                          0, sizeof(struct user_fp_struct),
+                                          datap);
+
+       case PTRACE_SETFPREGS:  /* Set the child FPU state. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_FP,
+                                            0, sizeof(struct user_fp_struct),
+                                            datap);
+
        default:
                ret = ptrace_request(child, request, addr, data);
                break;
@@ -283,6 +325,10 @@ long do_syscall_trace_enter(struct pt_regs *regs)
                regs->gr[20] = -1UL;
                goto out;
        }
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_enter(regs, regs->gr[20]);
+#endif
 
 #ifdef CONFIG_64BIT
        if (!is_compat_task())
@@ -311,6 +357,324 @@ void do_syscall_trace_exit(struct pt_regs *regs)
 
        audit_syscall_exit(regs);
 
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_exit(regs, regs->gr[20]);
+#endif
+
        if (stepping || test_thread_flag(TIF_SYSCALL_TRACE))
                tracehook_report_syscall_exit(regs, stepping);
 }
+
+
+/*
+ * regset functions.
+ */
+
+static int fpr_get(struct task_struct *target, /* .get handler of the REGSET_FP regsets */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    void *kbuf, void __user *ubuf)     /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       __u64 *k = kbuf;
+       __u64 __user *u = ubuf;
+       __u64 reg;      /* only used for sizeof() here */
+
+       pos /= sizeof(reg);     /* byte offset -> index of first register */
+       count /= sizeof(reg);   /* byte count -> number of registers */
+
+       if (kbuf)       /* kernel buffer: plain stores */
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       *k++ = regs->fr[pos++];
+       else            /* user buffer: stop with -EFAULT on the first failed store */
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       if (__put_user(regs->fr[pos++], u++))
+                               return -EFAULT;
+
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,     /* presumably zero-fills the rest of the request -- confirm */
+                                       ELF_NFPREG * sizeof(reg), -1);
+}
+
+static int fpr_set(struct task_struct *target, /* .set handler of the REGSET_FP regsets */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    const void *kbuf, const void __user *ubuf) /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       const __u64 *k = kbuf;
+       const __u64 __user *u = ubuf;
+       __u64 reg;      /* bounce variable for values fetched from user space */
+
+       pos /= sizeof(reg);     /* byte offset -> index of first register */
+       count /= sizeof(reg);   /* byte count -> number of registers */
+
+       if (kbuf)       /* kernel buffer: plain loads */
+               for (; count > 0 && pos < ELF_NFPREG; --count)
+                       regs->fr[pos++] = *k++;
+       else            /* user buffer: registers written before a fault stay modified */
+               for (; count > 0 && pos < ELF_NFPREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       regs->fr[pos++] = reg;
+               }
+
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,    /* presumably skips the rest of the input -- confirm */
+                                        ELF_NFPREG * sizeof(reg), -1);
+}
+
+#define RI(reg) (offsetof(struct user_regs_struct,reg) / sizeof(long)) /* slot index of member reg, in longs */
+
+static unsigned long get_reg(struct pt_regs *regs, int num)    /* value reported for user_regs_struct slot num */
+{
+       switch (num) {
+       case RI(gr[0]) ... RI(gr[31]):  return regs->gr[num - RI(gr[0])];
+       case RI(sr[0]) ... RI(sr[7]):   return regs->sr[num - RI(sr[0])];
+       case RI(iasq[0]):               return regs->iasq[0];
+       case RI(iasq[1]):               return regs->iasq[1];
+       case RI(iaoq[0]):               return regs->iaoq[0];
+       case RI(iaoq[1]):               return regs->iaoq[1];
+       case RI(sar):                   return regs->sar;
+       case RI(iir):                   return regs->iir;
+       case RI(isr):                   return regs->isr;
+       case RI(ior):                   return regs->ior;
+       case RI(ipsw):                  return regs->ipsw;
+       case RI(cr27):                  return regs->cr27;      /* last slot served from the saved pt_regs */
+       case RI(cr0):                   return mfctl(0);        /* NOTE(review): mfctl() presumably reads the executing CPU, not a value saved for the target -- confirm */
+       case RI(cr24):                  return mfctl(24);
+       case RI(cr25):                  return mfctl(25);
+       case RI(cr26):                  return mfctl(26);
+       case RI(cr28):                  return mfctl(28);
+       case RI(cr29):                  return mfctl(29);
+       case RI(cr30):                  return mfctl(30);
+       case RI(cr31):                  return mfctl(31);
+       case RI(cr8):                   return mfctl(8);
+       case RI(cr9):                   return mfctl(9);
+       case RI(cr12):                  return mfctl(12);
+       case RI(cr13):                  return mfctl(13);
+       case RI(cr10):                  return mfctl(10);
+       case RI(cr15):                  return mfctl(15);
+       default:                        return 0;       /* any other slot (e.g. _pad) reads as zero */
+       }
+}
+
+static void set_reg(struct pt_regs *regs, int num, unsigned long val)  /* store val in slot num if that slot is writable */
+{
+       switch (num) {
+       case RI(gr[0]): /*
+                        * PSW is in gr[0].
+                        * Allow writing to Nullify, Divide-step-correction,
+                        * and carry/borrow bits.
+                        * BEWARE, if you set N, and then single step, it won't
+                        * stop on the nullified instruction.
+                        */
+                       val &= USER_PSW_BITS;   /* drop every bit the debugger may not modify */
+                       regs->gr[0] &= ~USER_PSW_BITS;
+                       regs->gr[0] |= val;
+                       return;
+       case RI(gr[1]) ... RI(gr[31]):
+                       regs->gr[num - RI(gr[0])] = val;
+                       return;
+       case RI(iaoq[0]):
+       case RI(iaoq[1]):
+                       regs->iaoq[num - RI(iaoq[0])] = val;
+                       return;
+       case RI(sar):   regs->sar = val;
+                       return;
+       default:        return; /* writes to every other slot are silently dropped */
+#if 0  /* never compiled: only lists the slots that stay read-only */
+       /* changing any of the following registers is not allowed (yet) */
+       case RI(sr[0]) ... RI(sr[7]):   return regs->sr[num - RI(sr[0])];
+       case RI(iasq[0]):               return regs->iasq[0];
+       case RI(iasq[1]):               return regs->iasq[1];
+       case RI(iir):                   return regs->iir;
+       case RI(isr):                   return regs->isr;
+       case RI(ior):                   return regs->ior;
+       case RI(ipsw):                  return regs->ipsw;
+       case RI(cr27):                  return regs->cr27;
+        case cr0, cr24, cr25, cr26, cr27, cr28, cr29, cr30, cr31;
+        case cr8, cr9, cr12, cr13, cr10, cr15;
+#endif
+       }
+}
+
+static int gpr_get(struct task_struct *target, /* .get handler of the native REGSET_GENERAL regset */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    void *kbuf, void __user *ubuf)     /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       unsigned long *k = kbuf;
+       unsigned long __user *u = ubuf;
+       unsigned long reg;      /* only used for sizeof() here */
+
+       pos /= sizeof(reg);     /* byte offset -> slot index understood by get_reg() */
+       count /= sizeof(reg);   /* byte count -> number of slots */
+
+       if (kbuf)       /* kernel buffer: plain stores */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       *k++ = get_reg(regs, pos++);
+       else            /* user buffer: stop with -EFAULT on the first failed store */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       if (__put_user(get_reg(regs, pos++), u++))
+                               return -EFAULT;
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,     /* presumably zero-fills the rest of the request -- confirm */
+                                       ELF_NGREG * sizeof(reg), -1);
+}
+
+static int gpr_set(struct task_struct *target, /* .set handler of the native REGSET_GENERAL regset */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    const void *kbuf, const void __user *ubuf) /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       const unsigned long *k = kbuf;
+       const unsigned long __user *u = ubuf;
+       unsigned long reg;      /* bounce variable for values fetched from user space */
+
+       pos /= sizeof(reg);     /* byte offset -> slot index understood by set_reg() */
+       count /= sizeof(reg);   /* byte count -> number of slots */
+
+       if (kbuf)       /* kernel buffer: set_reg() decides which slots are really written */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       set_reg(regs, pos++, *k++);
+       else            /* user buffer: slots written before a fault stay modified */
+               for (; count > 0 && pos < ELF_NGREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       set_reg(regs, pos++, reg);
+               }
+
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,    /* presumably skips the rest of the input -- confirm */
+                                        ELF_NGREG * sizeof(reg), -1);
+}
+
+static const struct user_regset native_regsets[] = {   /* regsets of the native view, indexed by enum parisc_regset */
+       [REGSET_GENERAL] = {    /* ELF_NGREG slots of sizeof(long) each */
+               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+               .size = sizeof(long), .align = sizeof(long),
+               .get = gpr_get, .set = gpr_set
+       },
+       [REGSET_FP] = {         /* ELF_NFPREG slots of sizeof(__u64) each */
+               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+               .size = sizeof(__u64), .align = sizeof(__u64),
+               .get = fpr_get, .set = fpr_set
+       }
+};
+
+static const struct user_regset_view user_parisc_native_view = {       /* default result of task_user_regset_view() */
+       .name = "parisc", .e_machine = ELF_ARCH, .ei_osabi = ELFOSABI_LINUX,
+       .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
+};
+
+#ifdef CONFIG_64BIT
+#include <linux/compat.h>
+
+static int gpr32_get(struct task_struct *target,       /* .get handler of the compat REGSET_GENERAL regset */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    void *kbuf, void __user *ubuf)     /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       compat_ulong_t *k = kbuf;
+       compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;     /* only used for sizeof() here */
+
+       pos /= sizeof(reg);     /* byte offset -> slot index, in compat_ulong_t units */
+       count /= sizeof(reg);   /* byte count -> number of slots */
+
+       if (kbuf)       /* kernel buffer: the assignment converts to compat_ulong_t */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       *k++ = get_reg(regs, pos++);
+       else            /* user buffer: explicit conversion, -EFAULT on the first failed store */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       if (__put_user((compat_ulong_t) get_reg(regs, pos++), u++))
+                               return -EFAULT;
+
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,     /* presumably zero-fills the rest of the request -- confirm */
+                                       ELF_NGREG * sizeof(reg), -1);
+}
+
+static int gpr32_set(struct task_struct *target,       /* .set handler of the compat REGSET_GENERAL regset */
+                    const struct user_regset *regset,
+                    unsigned int pos, unsigned int count,      /* both given in bytes */
+                    const void *kbuf, const void __user *ubuf) /* kbuf is used when non-NULL, else ubuf */
+{
+       struct pt_regs *regs = task_regs(target);       /* register frame of target */
+       const compat_ulong_t *k = kbuf;
+       const compat_ulong_t __user *u = ubuf;
+       compat_ulong_t reg;     /* bounce variable for values fetched from user space */
+
+       pos /= sizeof(reg);     /* byte offset -> slot index, in compat_ulong_t units */
+       count /= sizeof(reg);   /* byte count -> number of slots */
+
+       if (kbuf)       /* kernel buffer: each compat value is passed to set_reg() as unsigned long */
+               for (; count > 0 && pos < ELF_NGREG; --count)
+                       set_reg(regs, pos++, *k++);
+       else            /* user buffer: slots written before a fault stay modified */
+               for (; count > 0 && pos < ELF_NGREG; --count) {
+                       if (__get_user(reg, u++))
+                               return -EFAULT;
+                       set_reg(regs, pos++, reg);
+               }
+
+       kbuf = k;       /* hand the advanced cursors ... */
+       ubuf = u;
+       pos *= sizeof(reg);     /* ... and byte-based position/count to the helper */
+       count *= sizeof(reg);
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,    /* presumably skips the rest of the input -- confirm */
+                                        ELF_NGREG * sizeof(reg), -1);
+}
+
+/*
+ * These are the regset flavors matching the 32bit native set.
+ * The table is indexed by enum parisc_regset, like native_regsets[].
+ */
+static const struct user_regset compat_regsets[] = {
+       [REGSET_GENERAL] = {    /* ELF_NGREG slots of sizeof(compat_long_t) each */
+               .core_note_type = NT_PRSTATUS, .n = ELF_NGREG,
+               .size = sizeof(compat_long_t), .align = sizeof(compat_long_t),
+               .get = gpr32_get, .set = gpr32_set
+       },
+       [REGSET_FP] = {         /* same handlers and slot size as the native FP regset */
+               .core_note_type = NT_PRFPREG, .n = ELF_NFPREG,
+               .size = sizeof(__u64), .align = sizeof(__u64),
+               .get = fpr_get, .set = fpr_set
+       }
+};
+
+static const struct user_regset_view user_parisc_compat_view = {       /* returned by task_user_regset_view() when is_compat_task() is true */
+       .name = "parisc", .e_machine = EM_PARISC, .ei_osabi = ELFOSABI_LINUX,
+       .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets)
+};
+#endif /* CONFIG_64BIT */
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task) /* NOTE(review): task is never consulted below -- confirm intended */
+{
+       BUILD_BUG_ON(sizeof(struct user_regs_struct)/sizeof(long) != ELF_NGREG);        /* struct must hold exactly ELF_NGREG longs */
+       BUILD_BUG_ON(sizeof(struct user_fp_struct)/sizeof(__u64) != ELF_NFPREG);        /* struct must hold exactly ELF_NFPREG __u64s */
+#ifdef CONFIG_64BIT
+       if (is_compat_task())   /* called without an argument, so the choice does not depend on task */
+               return &user_parisc_compat_view;
+#endif
+       return &user_parisc_native_view;        /* the only view when CONFIG_64BIT is not set */
+}
index 57b4836b7ecd898e10197aa0d473ea6107b9fb42..d03422e5f188368f8df5283cedfd4e32845e64df 100644 (file)
@@ -912,6 +912,7 @@ END(lws_table)
 
        .align 8
 ENTRY(sys_call_table)
+       .export sys_call_table,data
 #include "syscall_table.S"
 END(sys_call_table)
 
index 400acac0a304d12b235e05773d9c9f53a87f55b2..58dd6801f5bece511f16603b7db47906637582b0 100644 (file)
 
 static unsigned long clocktick __read_mostly;  /* timer cycles per tick */
 
+#ifndef CONFIG_64BIT
+/*
+ * The processor-internal cycle counter (Control Register 16) is used as the
+ * time source for the sched_clock() function.  This register is 64 bits wide
+ * on a 64-bit kernel and 32 bits wide on a 32-bit kernel.  Since sched_clock()
+ * always requires a 64-bit counter, on the 32-bit kernel we emulate the upper
+ * 32 bits with a per-cpu variable which we increment every time the counter
+ * wraps around (which happens every ~4 seconds).
+ */
+static DEFINE_PER_CPU(unsigned long, cr16_high_32_bits);
+#endif
+
 /*
  * We keep time on PA-RISC Linux by using the Interval Timer which is
  * a pair of registers; one is read-only and one is write-only; both
@@ -108,6 +120,12 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
         */
        mtctl(next_tick, 16);
 
+#if !defined(CONFIG_64BIT)
+       /* check for overflow on a 32bit kernel (every ~4 seconds). */
+       if (unlikely(next_tick < now))
+               this_cpu_inc(cr16_high_32_bits);
+#endif
+
        /* Skip one clocktick on purpose if we missed next_tick.
         * The new CR16 must be "later" than current CR16 otherwise
         * itimer would not fire until CR16 wrapped - e.g 4 seconds
@@ -219,6 +237,12 @@ void __init start_cpu_itimer(void)
        unsigned int cpu = smp_processor_id();
        unsigned long next_tick = mfctl(16) + clocktick;
 
+#if defined(CONFIG_HAVE_UNSTABLE_SCHED_CLOCK) && defined(CONFIG_64BIT)
+       /* With multiple 64bit CPUs online, the cr16's are not synchronized. */
+       if (cpu != 0)
+               clear_sched_clock_stable();
+#endif
+
        mtctl(next_tick, 16);           /* kick off Interval Timer (CR16) */
 
        per_cpu(cpu_data, cpu).it_value = next_tick;
@@ -246,15 +270,52 @@ void read_persistent_clock(struct timespec *ts)
        }
 }
 
+
+/*
+ * sched_clock() framework
+ */
+
+static u32 cyc2ns_mul __read_mostly;
+static u32 cyc2ns_shift __read_mostly;
+
+u64 sched_clock(void)  /* CR16 cycle count converted to nanoseconds */
+{
+       u64 now;
+
+       /* Get current cycle counter (Control Register 16). */
+#ifdef CONFIG_64BIT
+       now = mfctl(16);
+#else
+       now = mfctl(16) + (((u64) this_cpu_read(cr16_high_32_bits)) << 32);     /* upper half is the per-cpu wrap counter */
+#endif
+
+       /* return the value in ns (cycles_2_ns) */
+       return mul_u64_u32_shr(now, cyc2ns_mul, cyc2ns_shift);  /* mult/shift pair is computed in time_init() */
+}
+
+
+/*
+ * timer interrupt and sched_clock() initialization
+ */
+
 void __init time_init(void)
 {
        unsigned long current_cr16_khz;
 
+       current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
        clocktick = (100 * PAGE0->mem_10msec) / HZ;
 
+       /* calculate mult/shift values for cr16 */
+       clocks_calc_mult_shift(&cyc2ns_mul, &cyc2ns_shift, current_cr16_khz,
+                               NSEC_PER_MSEC, 0);
+
+#if defined(CONFIG_HAVE_UNSTABLE_SCHED_CLOCK) && defined(CONFIG_64BIT)
+       /* At bootup only one 64bit CPU is online and cr16 is "stable" */
+       set_sched_clock_stable();
+#endif
+
        start_cpu_itimer();     /* get CPU 0 started */
 
        /* register at clocksource framework */
-       current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
        clocksource_register_khz(&clocksource_cr16, current_cr16_khz);
 }
index 187118841af193adf92025351bad2d29b253c38c..8e45b0a97abf67bfc7621c5a964502890038cf2e 100644 (file)
@@ -55,11 +55,10 @@ unsigned long __xchg8(char x, char *ptr)
 }
 
 
-#ifdef CONFIG_64BIT
-unsigned long __cmpxchg_u64(volatile unsigned long *ptr, unsigned long old, unsigned long new)
+u64 __cmpxchg_u64(volatile u64 *ptr, u64 old, u64 new)
 {
        unsigned long flags;
-       unsigned long prev;
+       u64 prev;
 
        _atomic_spin_lock_irqsave(ptr, flags);
        if ((prev = *ptr) == old)
@@ -67,7 +66,6 @@ unsigned long __cmpxchg_u64(volatile unsigned long *ptr, unsigned long old, unsi
        _atomic_spin_unlock_irqrestore(ptr, flags);
        return prev;
 }
-#endif
 
 unsigned long __cmpxchg_u32(volatile unsigned int *ptr, unsigned int old, unsigned int new)
 {
index 673b73e8420d0fd5742264ad6c2214c43a78ef9a..18df1237c93cabb2356ca95cbb2050697f79b275 100644 (file)
@@ -184,7 +184,7 @@ static void parisc_linux_get_fpu_type(u_int fpregs[])
 
 /*
  * this routine will decode the excepting floating point instruction and
- * call the approiate emulation routine.
+ * call the appropriate emulation routine.
  * It is called by decode_fpu with the following parameters:
  * fpudispatch(current_ir, unimplemented_code, 0, &Fpu_register)
  * where current_ir is the instruction to be emulated,
index 26d37e6f924e09534bbd41a28f045179858d08db..0fc26714780acd95e3df8043c4a764d1e823aefc 100644 (file)
@@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        unsigned long sp, next_sp;
        unsigned long next_ip;
@@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
                        next_ip = regs->nip;
                        lr = regs->link;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL);
 
                } else {
                        if (level == 0)
@@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp)
                puc == (unsigned long) &sf->uc;
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long sp, next_sp;
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                fp = (unsigned long __user *) sp;
                if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
                        return;
@@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
                            read_user_stack_64(&uregs[PT_R1], &sp))
                                return;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
                        perf_callchain_store(entry, next_ip);
                        continue;
                }
@@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
        return rc;
 }
 
-static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                          struct pt_regs *regs)
 {
 }
@@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
        return mctx->mc_gregs;
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned int sp, next_sp;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
        sp = regs->gpr[1];
        perf_callchain_store(entry, next_ip);
 
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                fp = (unsigned int __user *) (unsigned long) sp;
                if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
                        return;
@@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
                            read_user_stack_32(&uregs[PT_R1], &sp))
                                return;
                        level = 0;
-                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
                        perf_callchain_store(entry, next_ip);
                        continue;
                }
@@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        if (current_is_64bit())
                perf_callchain_user_64(entry, regs);
index 0d112b94d91d3ef00d6ac69a3abf61a238ddcb76..ff75d70f7285dd21bd42ba5e8b5b2b6b9e69ab70 100644 (file)
@@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-                      void __pmem **kaddr, pfn_t *pfn)
+                      void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct axon_ram_bank *bank = device->bd_disk->private_data;
        loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
index fac6ac9790fad18efc2f587757068f87ca7765fd..1dd210347e1243ddf01cc8761aabb157c6d7cec1 100644 (file)
@@ -22,7 +22,6 @@ OBJECTS += $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o
 LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS)
        $(call if_changed,ld)
-       @:
 
 sed-sizes := -e 's/^\([0-9a-fA-F]*\) . \(__bss_start\|_end\)$$/\#define SZ\2 0x\1/p'
 
index c3e4099b60a59da2452eddd333620a6f17acf9dd..87035fa58bbe897a8e1234d44a60d3131c2ab343 100644 (file)
@@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register);
 
 static int __perf_callchain_kernel(void *data, unsigned long address)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        perf_callchain_store(entry, address);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        if (user_mode(regs))
index 9cb4260a5f3e3ea232fd1a328bac9f7e5ed854b3..d4008c339e8936da10255a677aa6bb1986760786 100644 (file)
@@ -1,5 +1,6 @@
 #define __ARCH_HAVE_MMU
 
+#define __ARCH_WANT_RENAMEAT
 #define __ARCH_WANT_SYSCALL_NO_AT
 #define __ARCH_WANT_SYSCALL_NO_FLAGS
 #define __ARCH_WANT_SYSCALL_OFF_T
index 6df826ee731637489410f247f48d21d3d9a87189..c4c47ea9fa941542e0efae62ca3bed2d4877dd68 100644 (file)
@@ -55,7 +55,6 @@ $(addprefix $(obj)/,$(lib1funcs-y)): $(obj)/%: $(lib1funcs-dir)/% FORCE
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(lib1funcs-obj) FORCE
        $(call if_changed,ld)
-       @:
 
 $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
index 2216ee57f2516e14ff7c0017139dd01f1a83beb6..43c41191de5d9e37ba600bf250868c6387ad5f30 100644 (file)
@@ -17,7 +17,6 @@ LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(load-y) -e romstart \
 
 $(obj)/vmlinux: $(obj)/head.o $(obj-y) $(obj)/piggy.o FORCE
        $(call if_changed,ld)
-       @:
 
 OBJCOPYFLAGS += -j .empty_zero_page
 
index cc80b614b5fae9dfd00cf62156f0baea250b9ff2..fa2c0cd23eaa88059e0470ed953fe6daedbbcb3e 100644 (file)
@@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name)
 
 static void callchain_address(void *data, unsigned long addr, int reliable)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        if (reliable)
                perf_callchain_store(entry, addr);
@@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        perf_callchain_store(entry, regs->pc);
 
index a4b8b5aed21c7b0fa83b30c7f3783e5c109ccf61..710f3278d448d910df1c4224f0146ad3bf555b89 100644 (file)
@@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void)
 }
 pure_initcall(init_hw_perf_events);
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
        unsigned long ksp, fp;
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
                        }
                }
 #endif
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size)
        return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long ufp;
@@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
                pc = sf.callers_pc;
                ufp = (unsigned long)sf.fp + STACK_BIAS;
                perf_callchain_store(entry, pc);
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
                                   struct pt_regs *regs)
 {
        unsigned long ufp;
@@ -1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
                        ufp = (unsigned long)sf.fp;
                }
                perf_callchain_store(entry, pc);
-       } while (entry->nr < sysctl_perf_event_max_stack);
+       } while (entry->nr < entry->max_stack);
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        u64 saved_fault_address = current_thread_info()->fault_address;
        u8 saved_fault_code = get_thread_fault_code();
index 3866397aaf5ae7c2a939911ed84a5130c50c9962..24e9187e85a86cd17911e64f76922e1034013919 100644 (file)
@@ -12,6 +12,7 @@
  *   more details.
  */
 
+#define __ARCH_WANT_RENAMEAT
 #if !defined(__LP64__) || defined(__SYSCALL_COMPAT)
 /* Use the flavor of this syscall that matches the 32-bit API better. */
 #define __ARCH_WANT_SYNC_FILE_RANGE2
index 8767060d70fb32eb2dfc0a7da1010233973d78ea..6394c1ccb68eb574f9d424be9c39d324a2ef87e0 100644 (file)
@@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events);
 /*
  * Tile specific backtracing code for perf_events.
  */
-static inline void perf_callchain(struct perf_callchain_entry *entry,
+static inline void perf_callchain(struct perf_callchain_entry_ctx *entry,
                    struct pt_regs *regs)
 {
        struct KBacktraceIterator kbt;
@@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry,
        }
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                    struct pt_regs *regs)
 {
        perf_callchain(entry, regs);
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                      struct pt_regs *regs)
 {
        perf_callchain(entry, regs);
index ec7fb70b412bdcaffcedb8653d5ca3d89ca15173..828855007b2937435ca888c1bfc1e426aaa7c4e9 100644 (file)
@@ -31,7 +31,7 @@ $(obj)/uImage: $(obj)/zImage FORCE
        $(call if_changed,uimage)
        @echo '  Image $@ is ready'
 
-PHONY += initrd FORCE
+PHONY += initrd
 initrd:
        @test "$(INITRD)" != "" || \
        (echo You must specify INITRD; exit -1)
index 96494fb646f74b2e1dddc42028f52db1a7afe36c..9aecdd3ddc4828242b7387061c71447cb44082e4 100644 (file)
@@ -54,7 +54,6 @@ LDFLAGS_vmlinux += -T
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \
                $(obj)/misc.o FORCE
        $(call if_changed,ld)
-       @:
 
 # We now have a PIC decompressor implementation.  Decompressors running
 # from RAM should not define ZTEXTADDR.  Decompressors running directly
index d4cc4559d8485f23838417c6f51208b4925082d3..1f63c476528e6ce58b590b16346a3d9961968f87 100644 (file)
@@ -10,6 +10,8 @@
  * published by the Free Software Foundation.
  */
 
+#define __ARCH_WANT_RENAMEAT
+
 /* Use the standard ABI for syscalls. */
 #include <asm-generic/unistd.h>
 #define __ARCH_WANT_SYS_CLONE
index cfdd8c3f8af2e095b0c9e3388c17af834e0589b9..f1356889204e5c74430bc78bf231c12901866730 100644 (file)
@@ -87,7 +87,6 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
 
 $(obj)/vmlinux: $(vmlinux-objs-y) FORCE
        $(call if_changed,ld)
-       @:
 
 OBJCOPYFLAGS_vmlinux.bin :=  -R .comment -S
 $(obj)/vmlinux.bin: vmlinux FORCE
index 98df1fa8825cdeeea525e9900839347eabbde53c..027aec4a74df6a99c2e5733e2f68ff5fb5fbc19a 100644 (file)
@@ -8,16 +8,15 @@
 #include <linux/linkage.h>
 #include "calling.h"
 #include <asm/asm.h>
-#include <asm/frame.h>
 
        /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
        .macro THUNK name, func, put_ret_addr_in_rdi=0
        .globl \name
        .type \name, @function
 \name:
-       FRAME_BEGIN
+       pushq %rbp
+       movq %rsp, %rbp
 
-       /* this one pushes 9 elems, the next one would be %rIP */
        pushq %rdi
        pushq %rsi
        pushq %rdx
@@ -29,8 +28,8 @@
        pushq %r11
 
        .if \put_ret_addr_in_rdi
-       /* 9*8(%rsp) is return addr on stack */
-       movq 9*8(%rsp), %rdi
+       /* 8(%rbp) is return addr on stack */
+       movq 8(%rbp), %rdi
        .endif
 
        call \func
@@ -65,7 +64,7 @@ restore:
        popq %rdx
        popq %rsi
        popq %rdi
-       FRAME_END
+       popq %rbp
        ret
        _ASM_NOKPROBE(restore)
 #endif
index 6874da5f67fcf38b3b50c0274bb7a104f35ecdbf..253b72eaade6b538905acb4c7a8329b04480655a 100644 (file)
@@ -193,10 +193,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
 $(MODLIB)/vdso: FORCE
        @mkdir -p $(MODLIB)/vdso
 
-$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
        $(call cmd,vdso_install)
 
 PHONY += vdso_install $(vdso_img_insttargets)
-vdso_install: $(vdso_img_insttargets) FORCE
+vdso_install: $(vdso_img_insttargets)
 
 clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
index 73a75aa5a66db39d69a6c4f60e091f32fd570c04..33787ee817f0cdaad78814849b0aad2dc7e2b407 100644 (file)
@@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name)
 
 static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        return perf_callchain_store(entry, addr);
 }
@@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
                /* TODO: We don't support guest os callchain now */
@@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment)
 #include <asm/compat.h>
 
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
        /* 32-bit process in 64-bit kernel. */
        unsigned long ss_base, cs_base;
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
        fp = compat_ptr(ss_base + regs->bp);
        pagefault_disable();
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                unsigned long bytes;
                frame.next_frame     = 0;
                frame.return_address = 0;
@@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #else
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
     return 0;
 }
 #endif
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
        struct stack_frame frame;
        const void __user *fp;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
                return;
 
        pagefault_disable();
-       while (entry->nr < sysctl_perf_event_max_stack) {
+       while (entry->nr < entry->max_stack) {
                unsigned long bytes;
                frame.next_frame             = NULL;
                frame.return_address = 0;
index 0a5ede187d9c1b45bc1c94f41609ce0a9531ab18..eb0533558c2b705957a30704218b983eda65e61e 100644 (file)
@@ -826,7 +826,7 @@ static int p4_hw_config(struct perf_event *event)
                 * Clear bits we reserve to be managed by kernel itself
                 * and never allowed from a user space
                 */
-                event->attr.config &= P4_CONFIG_MASK;
+               event->attr.config &= P4_CONFIG_MASK;
 
                rc = p4_validate_raw_event(event);
                if (rc)
index 16c1789164122b70f6d2cd4ad16ae18b741236aa..fce74062d9812031b491d997f1e995578ccf8af9 100644 (file)
@@ -891,7 +891,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
                return -ENODEV;
 
        pkg = topology_phys_to_logical_pkg(phys_id);
-       if (WARN_ON_ONCE(pkg < 0))
+       if (pkg < 0)
                return -EINVAL;
 
        if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
index 08abf639075f92b795d014cfde240d2769a224d5..5490bbaf71d5f67dfda35b345bf50b81e3008d7f 100644 (file)
@@ -1,8 +1,16 @@
 #ifndef _ASM_X86_BUGS_H
 #define _ASM_X86_BUGS_H
 
+#include <asm/processor.h>
+
 extern void check_bugs(void);
 
+#if defined(CONFIG_CPU_SUP_INTEL)
+void check_mpx_erratum(struct cpuinfo_x86 *c);
+#else
+static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {}
+#endif
+
 #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
 int ppro_with_ram_bug(void);
 #else
index 25ebb54905e0001801fc45ea86d6f8a111f011e2..483fb547e3c048afe850993b94dc2be2bc841d96 100644 (file)
@@ -64,9 +64,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
           (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||   \
           (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||   \
           (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
 
 #define DISABLED_MASK_BIT_SET(bit)                                     \
         ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||   \
@@ -83,9 +83,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
           (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||   \
           (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||   \
           (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
 
 #define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
index 39343be7d4f4315592da1026c700533ec435204b..911e9358ceb184b6b0b0f38b9c7b853fc4506fbe 100644 (file)
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-# define DISABLE_PKU           (1<<(X86_FEATURE_PKU))
-# define DISABLE_OSPKE         (1<<(X86_FEATURE_OSPKE))
-#else
 # define DISABLE_PKU           0
 # define DISABLE_OSPKE         0
+#else
+# define DISABLE_PKU           (1<<(X86_FEATURE_PKU & 31))
+# define DISABLE_OSPKE         (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
 /*
index ed65fe701de5665a92dbf3bfe8de8d947fb7a440..85029b58d0cd34c2b6dff04ce5e0e1a20e1bc81b 100644 (file)
@@ -99,7 +99,7 @@ struct telemetry_core_ops {
        int (*reset_events)(void);
 };
 
-int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
                          struct telemetry_plt_config *pltconfig);
 
 int telemetry_clear_pltdata(void);
diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h
new file mode 100644 (file)
index 0000000..d4855f1
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+ * Intel Core SoC Power Management Controller Header File
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef _ASM_PMC_CORE_H
+#define _ASM_PMC_CORE_H
+
+/* API to read SLP_S0_RESIDENCY counter */
+int intel_pmc_slp_s0_counter_read(u32 *data);
+
+#endif /* _ASM_PMC_CORE_H */
index b9e9bb2c60898cf04019b7810f11e485acce5459..3725e145aa58c2879458f818016c8e9ddd268193 100644 (file)
@@ -2,10 +2,12 @@
 #define _UAPI__SVM_H
 
 #define SVM_EXIT_READ_CR0      0x000
+#define SVM_EXIT_READ_CR2      0x002
 #define SVM_EXIT_READ_CR3      0x003
 #define SVM_EXIT_READ_CR4      0x004
 #define SVM_EXIT_READ_CR8      0x008
 #define SVM_EXIT_WRITE_CR0     0x010
+#define SVM_EXIT_WRITE_CR2     0x012
 #define SVM_EXIT_WRITE_CR3     0x013
 #define SVM_EXIT_WRITE_CR4     0x014
 #define SVM_EXIT_WRITE_CR8     0x018
 
 #define SVM_EXIT_REASONS \
        { SVM_EXIT_READ_CR0,    "read_cr0" }, \
+       { SVM_EXIT_READ_CR2,    "read_cr2" }, \
        { SVM_EXIT_READ_CR3,    "read_cr3" }, \
        { SVM_EXIT_READ_CR4,    "read_cr4" }, \
        { SVM_EXIT_READ_CR8,    "read_cr8" }, \
        { SVM_EXIT_WRITE_CR0,   "write_cr0" }, \
+       { SVM_EXIT_WRITE_CR2,   "write_cr2" }, \
        { SVM_EXIT_WRITE_CR3,   "write_cr3" }, \
        { SVM_EXIT_WRITE_CR4,   "write_cr4" }, \
        { SVM_EXIT_WRITE_CR8,   "write_cr8" }, \
        { SVM_EXIT_READ_DR1,    "read_dr1" }, \
        { SVM_EXIT_READ_DR2,    "read_dr2" }, \
        { SVM_EXIT_READ_DR3,    "read_dr3" }, \
+       { SVM_EXIT_READ_DR4,    "read_dr4" }, \
+       { SVM_EXIT_READ_DR5,    "read_dr5" }, \
+       { SVM_EXIT_READ_DR6,    "read_dr6" }, \
+       { SVM_EXIT_READ_DR7,    "read_dr7" }, \
        { SVM_EXIT_WRITE_DR0,   "write_dr0" }, \
        { SVM_EXIT_WRITE_DR1,   "write_dr1" }, \
        { SVM_EXIT_WRITE_DR2,   "write_dr2" }, \
        { SVM_EXIT_WRITE_DR3,   "write_dr3" }, \
+       { SVM_EXIT_WRITE_DR4,   "write_dr4" }, \
        { SVM_EXIT_WRITE_DR5,   "write_dr5" }, \
+       { SVM_EXIT_WRITE_DR6,   "write_dr6" }, \
        { SVM_EXIT_WRITE_DR7,   "write_dr7" }, \
+       { SVM_EXIT_EXCP_BASE + DE_VECTOR,       "DE excp" }, \
        { SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" }, \
        { SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" }, \
+       { SVM_EXIT_EXCP_BASE + OF_VECTOR,       "OF excp" }, \
+       { SVM_EXIT_EXCP_BASE + BR_VECTOR,       "BR excp" }, \
        { SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" }, \
-       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \
        { SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" }, \
+       { SVM_EXIT_EXCP_BASE + DF_VECTOR,       "DF excp" }, \
+       { SVM_EXIT_EXCP_BASE + TS_VECTOR,       "TS excp" }, \
+       { SVM_EXIT_EXCP_BASE + NP_VECTOR,       "NP excp" }, \
+       { SVM_EXIT_EXCP_BASE + SS_VECTOR,       "SS excp" }, \
+       { SVM_EXIT_EXCP_BASE + GP_VECTOR,       "GP excp" }, \
+       { SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \
+       { SVM_EXIT_EXCP_BASE + MF_VECTOR,       "MF excp" }, \
        { SVM_EXIT_EXCP_BASE + AC_VECTOR,       "AC excp" }, \
        { SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" }, \
+       { SVM_EXIT_EXCP_BASE + XM_VECTOR,       "XF excp" }, \
        { SVM_EXIT_INTR,        "interrupt" }, \
        { SVM_EXIT_NMI,         "nmi" }, \
        { SVM_EXIT_SMI,         "smi" }, \
        { SVM_EXIT_INIT,        "init" }, \
        { SVM_EXIT_VINTR,       "vintr" }, \
        { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
+       { SVM_EXIT_IDTR_READ,   "read_idtr" }, \
+       { SVM_EXIT_GDTR_READ,   "read_gdtr" }, \
+       { SVM_EXIT_LDTR_READ,   "read_ldtr" }, \
+       { SVM_EXIT_TR_READ,     "read_rt" }, \
+       { SVM_EXIT_IDTR_WRITE,  "write_idtr" }, \
+       { SVM_EXIT_GDTR_WRITE,  "write_gdtr" }, \
+       { SVM_EXIT_LDTR_WRITE,  "write_ldtr" }, \
+       { SVM_EXIT_TR_WRITE,    "write_rt" }, \
+       { SVM_EXIT_RDTSC,       "rdtsc" }, \
+       { SVM_EXIT_RDPMC,       "rdpmc" }, \
+       { SVM_EXIT_PUSHF,       "pushf" }, \
+       { SVM_EXIT_POPF,        "popf" }, \
        { SVM_EXIT_CPUID,       "cpuid" }, \
+       { SVM_EXIT_RSM,         "rsm" }, \
+       { SVM_EXIT_IRET,        "iret" }, \
+       { SVM_EXIT_SWINT,       "swint" }, \
        { SVM_EXIT_INVD,        "invd" }, \
        { SVM_EXIT_PAUSE,       "pause" }, \
        { SVM_EXIT_HLT,         "hlt" }, \
        { SVM_EXIT_IOIO,        "io" }, \
        { SVM_EXIT_MSR,         "msr" }, \
        { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+       { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \
        { SVM_EXIT_SHUTDOWN,    "shutdown" }, \
        { SVM_EXIT_VMRUN,       "vmrun" }, \
        { SVM_EXIT_VMMCALL,     "hypercall" }, \
        { SVM_EXIT_STGI,        "stgi" }, \
        { SVM_EXIT_CLGI,        "clgi" }, \
        { SVM_EXIT_SKINIT,      "skinit" }, \
+       { SVM_EXIT_RDTSCP,      "rdtscp" }, \
+       { SVM_EXIT_ICEBP,       "icebp" }, \
        { SVM_EXIT_WBINVD,      "wbinvd" }, \
        { SVM_EXIT_MONITOR,     "monitor" }, \
        { SVM_EXIT_MWAIT,       "mwait" }, \
        { SVM_EXIT_XSETBV,      "xsetbv" }, \
        { SVM_EXIT_NPF,         "npf" }, \
-       { SVM_EXIT_RSM,         "rsm" }, \
        { SVM_EXIT_AVIC_INCOMPLETE_IPI,         "avic_incomplete_ipi" }, \
-       { SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }
+       { SVM_EXIT_AVIC_UNACCELERATED_ACCESS,   "avic_unaccelerated_access" }, \
+       { SVM_EXIT_ERR,         "invalid_guest_state" }
 
 
 #endif /* _UAPI__SVM_H */
index 6ef6ed9ccca6954891e2ea419990a89706d64523..0fe6953f421c9b92b72c331600f435e623c3e264 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
+#include <asm/bugs.h>
 #include <asm/cpu.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
@@ -270,6 +271,8 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 static __init int setup_disable_smep(char *arg)
 {
        setup_clear_cpu_cap(X86_FEATURE_SMEP);
+       /* Check for things that depend on SMEP being enabled: */
+       check_mpx_erratum(&boot_cpu_data);
        return 1;
 }
 __setup("nosmep", setup_disable_smep);
@@ -310,6 +313,10 @@ static bool pku_disabled;
 
 static __always_inline void setup_pku(struct cpuinfo_x86 *c)
 {
+       /* check the boot processor, plus compile options for PKU: */
+       if (!cpu_feature_enabled(X86_FEATURE_PKU))
+               return;
+       /* checks the actual processor's cpuid bits: */
        if (!cpu_has(c, X86_FEATURE_PKU))
                return;
        if (pku_disabled)
index 8dae51fd3db1c802278ac2647beb44a2fe0cccb0..6e2ffbebbcdbd053a66ee5dbfa7e421c4281b322 100644 (file)
 #include <asm/apic.h>
 #endif
 
+/*
+ * Just in case our CPU detection goes bad, or you have a weird system,
+ * allow a way to override the automatic disabling of MPX.
+ */
+static int forcempx;
+
+static int __init forcempx_setup(char *__unused)
+{
+       forcempx = 1;
+
+       return 1;
+}
+__setup("intel-skd-046-workaround=disable", forcempx_setup);
+
+void check_mpx_erratum(struct cpuinfo_x86 *c)
+{
+       if (forcempx)
+               return;
+       /*
+        * Turn off the MPX feature on CPUs where SMEP is not
+        * available or disabled.
+        *
+        * Works around Intel Erratum SKD046: "Branch Instructions
+        * May Initialize MPX Bound Registers Incorrectly".
+        *
+        * This might falsely disable MPX on systems without
+        * SMEP, like Atom processors without SMEP.  But there
+        * is no such hardware known at the moment.
+        */
+       if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
+               setup_clear_cpu_cap(X86_FEATURE_MPX);
+               pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
+       }
+}
+
 static void early_init_intel(struct cpuinfo_x86 *c)
 {
        u64 misc_enable;
@@ -173,6 +208,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                if (edx & (1U << 28))
                        c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
        }
+
+       check_mpx_erratum(c);
 }
 
 #ifdef CONFIG_X86_32
index 6b16c36f0939313dde91d03428cdc855ff3dba9e..6e789ca1f8412546a183460d65d304b10b1c8bef 100644 (file)
@@ -532,7 +532,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 
        switch (code) {
        case ARCH_SET_GS:
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.gsindex = 0;
@@ -546,7 +546,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.fsindex = 0;
index e60ef918f53d52c765cd19821c5810adcda61820..600edd225e81147473342d66fb991b9cad870756 100644 (file)
@@ -392,7 +392,7 @@ static int putreg(struct task_struct *child,
 
 #ifdef CONFIG_X86_64
        case offsetof(struct user_regs_struct,fs_base):
-               if (value >= TASK_SIZE_OF(child))
+               if (value >= TASK_SIZE_MAX)
                        return -EIO;
                /*
                 * When changing the segment base, use do_arch_prctl
@@ -406,7 +406,7 @@ static int putreg(struct task_struct *child,
                /*
                 * Exactly the same here as the %fs handling above.
                 */
-               if (value >= TASK_SIZE_OF(child))
+               if (value >= TASK_SIZE_MAX)
                        return -EIO;
                if (child->thread.gsbase != value)
                        return do_arch_prctl(child, ARCH_SET_GS, value);
index 6aa0f4d9eea6816bbf0c78514e0b76d8f4dc7def..9911a0620f9a94c1b6e1aa02290649d7c2803600 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/param.h>
 
 /* CPU reference clock frequency: in KHz */
+#define FREQ_80                80000
 #define FREQ_83                83200
 #define FREQ_100       99840
 #define FREQ_133       133200
@@ -56,6 +57,8 @@ static struct freq_desc freq_desc_tables[] = {
        { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
        /* ANN */
        { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+       /* AIRMONT */
+       { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } },
 };
 
 static int match_cpu(u8 family, u8 model)
index 2214214c786b2f11295c0d7df835b2381a3b7ea0..1163e8173e5a71b26d77f9ffde3612bddefc76aa 100644 (file)
@@ -84,7 +84,7 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define TSC_RATIO_MIN          0x0000000000000001ULL
 #define TSC_RATIO_MAX          0x000000ffffffffffULL
 
-#define AVIC_HPA_MASK  ~((0xFFFULL << 52) || 0xFFF)
+#define AVIC_HPA_MASK  ~((0xFFFULL << 52) | 0xFFF)
 
 /*
  * 0xff is broadcast, so the max index allowed for physical APIC ID
@@ -3597,7 +3597,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
-       u32 index = svm->vmcb->control.exit_info_2 && 0xFF;
+       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
        struct kvm_lapic *apic = svm->vcpu.arch.apic;
 
        trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
index e605d1ed334ff5550cb07310815e546028c26514..fb93010beaa4df8b61fb4b654d889b1dcaa0990d 100644 (file)
@@ -2418,7 +2418,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
        if (is_guest_mode(vcpu))
                msr_bitmap = vmx_msr_bitmap_nested;
-       else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
+       else if (cpu_has_secondary_exec_ctrls() &&
+                (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
                if (is_long_mode(vcpu))
                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
                else
@@ -4787,6 +4789,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+       if (cpu_has_secondary_exec_ctrls()) {
+               if (kvm_vcpu_apicv_active(vcpu))
+                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                                     SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                     SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+               else
+                       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                                       SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+       }
+
+       if (cpu_has_vmx_msr_bitmap())
+               vmx_set_msr_bitmap(vcpu);
 }
 
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -6333,23 +6348,20 @@ static __init int hardware_setup(void)
 
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-       if (enable_apicv) {
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       vmx_disable_intercept_msr_read_x2apic(msr);
-
-               /* According SDM, in x2apic mode, the whole id reg is used.
-                * But in KVM, it only use the highest eight bits. Need to
-                * intercept it */
-               vmx_enable_intercept_msr_read_x2apic(0x802);
-               /* TMCCT */
-               vmx_enable_intercept_msr_read_x2apic(0x839);
-               /* TPR */
-               vmx_disable_intercept_msr_write_x2apic(0x808);
-               /* EOI */
-               vmx_disable_intercept_msr_write_x2apic(0x80b);
-               /* SELF-IPI */
-               vmx_disable_intercept_msr_write_x2apic(0x83f);
-       }
+       for (msr = 0x800; msr <= 0x8ff; msr++)
+               vmx_disable_intercept_msr_read_x2apic(msr);
+
+       /* According to the SDM, in x2apic mode the whole ID reg is used, but
+        * KVM only uses the highest eight bits, so it must be intercepted. */
+       vmx_enable_intercept_msr_read_x2apic(0x802);
+       /* TMCCT */
+       vmx_enable_intercept_msr_read_x2apic(0x839);
+       /* TPR */
+       vmx_disable_intercept_msr_write_x2apic(0x808);
+       /* EOI */
+       vmx_disable_intercept_msr_write_x2apic(0x80b);
+       /* SELF-IPI */
+       vmx_disable_intercept_msr_write_x2apic(0x83f);
 
        if (enable_ept) {
                kvm_mmu_set_mask_ptes(0ull,
index 5ce1ed02f7e80900ead34c8b8af2bfe223a9d3bd..7d1fa7cd237443e054da06255697412d5e9354f6 100644 (file)
@@ -292,7 +292,7 @@ void vmalloc_sync_all(void)
                return;
 
        for (address = VMALLOC_START & PMD_MASK;
-            address >= TASK_SIZE && address < FIXADDR_TOP;
+            address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
             address += PMD_SIZE) {
                struct page *page;
 
@@ -854,8 +854,13 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                                return;
                }
 #endif
-               /* Kernel addresses are always protection faults: */
-               if (address >= TASK_SIZE)
+
+               /*
+                * To avoid leaking information about the kernel page table
+                * layout, pretend that user-mode accesses to kernel addresses
+                * are always protection faults.
+                */
+               if (address >= TASK_SIZE_MAX)
                        error_code |= PF_PROT;
 
                if (likely(show_unhandled_signals))
index 4bd08b0fc8ea1b1c9badf0128920858ef8d41336..99ddab79215e2d3858ccbd2c4178cdffb08b7382 100644 (file)
@@ -491,8 +491,11 @@ int __init pci_xen_initial_domain(void)
 #endif
        __acpi_register_gsi = acpi_register_gsi_xen;
        __acpi_unregister_gsi = NULL;
-       /* Pre-allocate legacy irqs */
-       for (irq = 0; irq < nr_legacy_irqs(); irq++) {
+       /*
+        * Pre-allocate the legacy IRQs.  Use NR_IRQS_LEGACY here
+        * because we don't have a PIC and thus nr_legacy_irqs() is zero.
+        */
+       for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
                int trigger, polarity;
 
                if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
index 92723aeae0f96d2d5def8aac111765d9683f4d7f..cd95075944abff4a7f447d9727c40c050407768c 100644 (file)
@@ -11,7 +11,6 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/page_types.h>
-#include <asm/frame.h>
 
 #define SAVE_XMM                       \
        mov %rsp, %rax;                 \
        mov (%rsp), %rsp
 
 ENTRY(efi_call)
-       FRAME_BEGIN
+       pushq %rbp
+       movq %rsp, %rbp
        SAVE_XMM
-       mov (%rsp), %rax
-       mov 8(%rax), %rax
+       mov 16(%rbp), %rax
        subq $48, %rsp
        mov %r9, 32(%rsp)
        mov %rax, 40(%rsp)
@@ -53,6 +52,6 @@ ENTRY(efi_call)
        call *%rdi
        addq $48, %rsp
        RESTORE_XMM
-       FRAME_END
+       popq %rbp
        ret
 ENDPROC(efi_call)
index 92e3e1d84c1d73fb86b086b6ea8783ccf2f3663e..12734a96df478b9fc384f7cbbffeeee45559a302 100644 (file)
@@ -26,7 +26,5 @@ quiet_cmd_bin2c = BIN2C   $@
 
 $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
        $(call if_changed,bin2c)
-       @:
-
 
 obj-$(CONFIG_KEXEC_FILE)       += kexec-purgatory.o
index b95964610ea7f45320105f0378267a39aa7b427e..c556c5ae8de5973852dd7da7517c65fe04fb00e1 100644 (file)
@@ -59,7 +59,6 @@ OBJCOPYFLAGS_realmode.bin := -O binary
 targets += realmode.bin
 $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
        $(call if_changed,objcopy)
-       @:
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
index 7ab29518a3b9dc0c7f26bfa0d15727b378de324a..e345891450c3fcc2873a31d352b80afaacf7b74c 100644 (file)
@@ -393,6 +393,9 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        unsigned long i = 0;
        unsigned long n = end_pfn - start_pfn;
 
+       if (remap_pfn == 0)
+               remap_pfn = nr_pages;
+
        while (i < n) {
                unsigned long cur_pfn = start_pfn + i;
                unsigned long left = n - i;
@@ -438,17 +441,29 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(unsigned long nr_pages)
+static unsigned long __init xen_count_remap_pages(
+       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+       unsigned long remap_pages)
+{
+       if (start_pfn >= nr_pages)
+               return remap_pages;
+
+       return remap_pages + min(end_pfn, nr_pages) - start_pfn;
+}
+
+static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
+       unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
+                             unsigned long nr_pages, unsigned long last_val))
 {
        phys_addr_t start = 0;
-       unsigned long last_pfn = nr_pages;
+       unsigned long ret_val = 0;
        const struct e820entry *entry = xen_e820_map;
        int i;
 
        /*
         * Combine non-RAM regions and gaps until a RAM region (or the
-        * end of the map) is reached, then set the 1:1 map and
-        * remap the memory in those non-RAM regions.
+        * end of the map) is reached, then call the provided function
+        * to perform its duty on the non-RAM region.
         *
         * The combined non-RAM regions are rounded to a whole number
         * of pages so any partial pages are accessible via the 1:1
@@ -466,14 +481,13 @@ static void __init xen_set_identity_and_remap(unsigned long nr_pages)
                                end_pfn = PFN_UP(entry->addr);
 
                        if (start_pfn < end_pfn)
-                               last_pfn = xen_set_identity_and_remap_chunk(
-                                               start_pfn, end_pfn, nr_pages,
-                                               last_pfn);
+                               ret_val = func(start_pfn, end_pfn, nr_pages,
+                                              ret_val);
                        start = end;
                }
        }
 
-       pr_info("Released %ld page(s)\n", xen_released_pages);
+       return ret_val;
 }
 
 /*
@@ -596,35 +610,6 @@ static void __init xen_ignore_unusable(void)
        }
 }
 
-static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
-{
-       unsigned long extra = 0;
-       unsigned long start_pfn, end_pfn;
-       const struct e820entry *entry = xen_e820_map;
-       int i;
-
-       end_pfn = 0;
-       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
-               start_pfn = PFN_DOWN(entry->addr);
-               /* Adjacent regions on non-page boundaries handling! */
-               end_pfn = min(end_pfn, start_pfn);
-
-               if (start_pfn >= max_pfn)
-                       return extra + max_pfn - end_pfn;
-
-               /* Add any holes in map to result. */
-               extra += start_pfn - end_pfn;
-
-               end_pfn = PFN_UP(entry->addr + entry->size);
-               end_pfn = min(end_pfn, max_pfn);
-
-               if (entry->type != E820_RAM)
-                       extra += end_pfn - start_pfn;
-       }
-
-       return extra;
-}
-
 bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
 {
        struct e820entry *entry;
@@ -804,7 +789,7 @@ char * __init xen_memory_setup(void)
        max_pages = xen_get_max_pages();
 
        /* How many extra pages do we need due to remapping? */
-       max_pages += xen_count_remap_pages(max_pfn);
+       max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
 
        if (max_pages > max_pfn)
                extra_pages += max_pages - max_pfn;
@@ -922,7 +907,9 @@ char * __init xen_memory_setup(void)
         * Set identity map on non-RAM pages and prepare remapping the
         * underlying RAM.
         */
-       xen_set_identity_and_remap(max_pfn);
+       xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
+
+       pr_info("Released %ld page(s)\n", xen_released_pages);
 
        return "Xen";
 }
index a0a4e554c6f195ffe2abd37d712e12e9ee8d9e85..6deba5bc7e3490546031ce0a11e90e9f9a917884 100644 (file)
@@ -290,11 +290,11 @@ static int xen_vcpuop_set_next_event(unsigned long delta,
        WARN_ON(!clockevent_state_oneshot(evt));
 
        single.timeout_abs_ns = get_abs_timeout(delta);
-       single.flags = VCPU_SSHOTTMR_future;
+       /* Get an event anyway, even if the timeout is already expired */
+       single.flags = 0;
 
        ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
-
-       BUG_ON(ret != 0 && ret != -ETIME);
+       BUG_ON(ret != 0);
 
        return ret;
 }
index a6b00b3af42993e937181a8412c8949f8ae65983..ef90479e03970ec107b69c8eaa012e3b0bc54927 100644 (file)
@@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event)
 
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-       struct perf_callchain_entry *entry = data;
+       struct perf_callchain_entry_ctx *entry = data;
 
        perf_callchain_store(entry, frame->pc);
        return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                           struct pt_regs *regs)
 {
-       xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
+       xtensa_backtrace_kernel(regs, entry->max_stack,
                                callchain_trace, NULL, entry);
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                         struct pt_regs *regs)
 {
-       xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
+       xtensa_backtrace_user(regs, entry->max_stack,
                              callchain_trace, entry);
 }
 
index 698c7933d5826d82377264661d06b0884d865f24..ed2397f8de9dc3c96f00f13c82c0e0e041fab06e 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/gfp.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
-#include <linux/badblocks.h>
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/blktrace_api.h>
index b719ab3090bb2211f31a94af630c019f41c6860d..ab234791a0ba6101eaff16798a0c425dd347a4bd 100644 (file)
@@ -1316,7 +1316,7 @@ static int __init acpi_battery_init(void)
 
 static void __exit acpi_battery_exit(void)
 {
-       async_synchronize_cookie(async_cookie);
+       async_synchronize_cookie(async_cookie + 1);
        acpi_bus_unregister_driver(&acpi_battery_driver);
 #ifdef CONFIG_ACPI_PROCFS_POWER
        acpi_unlock_battery_dir(acpi_battery_dir);
index cd2c3d6d40e03fbc70ef802dd82c4e1ae36c7ef3..993fd31394c854c99e5ce0c2af824f36c50b7a22 100644 (file)
@@ -319,6 +319,7 @@ int acpi_device_fix_up_power(struct acpi_device *device)
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(acpi_device_fix_up_power);
 
 int acpi_device_update_power(struct acpi_device *device, int *state_p)
 {
index c81667d4bb6028cdaa73c12490100d5f5406cfc6..e44944f4be77d0a573e4e46850849b52ba09a349 100644 (file)
@@ -1267,14 +1267,15 @@ int dpm_suspend_late(pm_message_t state)
                error = device_suspend_late(dev);
 
                mutex_lock(&dpm_list_mtx);
+               if (!list_empty(&dev->power.entry))
+                       list_move(&dev->power.entry, &dpm_late_early_list);
+
                if (error) {
                        pm_dev_err(dev, state, " late", error);
                        dpm_save_failed_dev(dev_name(dev));
                        put_device(dev);
                        break;
                }
-               if (!list_empty(&dev->power.entry))
-                       list_move(&dev->power.entry, &dpm_late_early_list);
                put_device(dev);
 
                if (async_error)
index 04d706ca5f439be4576c8f78afb4b10a66b5dd3e..35b13a08ca3e6a4f21cd602ecf3d9ab9c328f9f0 100644 (file)
@@ -146,7 +146,6 @@ int bcma_sflash_init(struct bcma_drv_cc *cc)
                return -ENOTSUPP;
        }
 
-       sflash->window = BCMA_SOC_FLASH2;
        sflash->blocksize = e->blocksize;
        sflash->numblocks = e->numblocks;
        sflash->size = sflash->blocksize * sflash->numblocks;
index 51a071e322213982247aa7ff01a15d6057d3fb34..c04bd9bc39fd0565e1d11c6af9b15632c110148e 100644 (file)
@@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-                       void __pmem **kaddr, pfn_t *pfn)
+                       void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;
index 0ede6d7e25686cf768e3e74d7611e5e61319d6e7..81666a56415e2bc6f960a3f5ac2220108c2b7ed1 100644 (file)
@@ -350,12 +350,12 @@ struct rbd_device {
        struct rbd_spec         *spec;
        struct rbd_options      *opts;
 
-       char                    *header_name;
+       struct ceph_object_id   header_oid;
+       struct ceph_object_locator header_oloc;
 
        struct ceph_file_layout layout;
 
-       struct ceph_osd_event   *watch_event;
-       struct rbd_obj_request  *watch_request;
+       struct ceph_osd_linger_request *watch_handle;
 
        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
        return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-                                       unsigned long timeout)
-{
-       return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
                complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-       dout("%s: obj %p\n", __func__, obj_request);
-       obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
                obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-                               struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
        struct rbd_obj_request *obj_request = osd_req->r_priv;
        u16 opcode;
 
-       dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+       dout("%s: osd_req %p\n", __func__, osd_req);
        rbd_assert(osd_req == obj_request->osd_req);
        if (obj_request_img_data_test(obj_request)) {
                rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
        case CEPH_OSD_OP_CALL:
                rbd_osd_call_callback(obj_request);
                break;
-       case CEPH_OSD_OP_NOTIFY_ACK:
-       case CEPH_OSD_OP_WATCH:
-               rbd_osd_trivial_callback(obj_request);
-               break;
        default:
                rbd_warn(NULL, "%s: unsupported op %hu",
                        obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_osd_request *osd_req = obj_request->osd_req;
-       u64 snap_id;
 
-       rbd_assert(osd_req != NULL);
-
-       snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-       ceph_osdc_build_request(osd_req, obj_request->offset,
-                       NULL, snap_id, NULL);
+       if (img_request)
+               osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-       struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_osd_request *osd_req = obj_request->osd_req;
-       struct ceph_snap_context *snapc;
-       struct timespec mtime = CURRENT_TIME;
 
-       rbd_assert(osd_req != NULL);
-
-       snapc = img_request ? img_request->snapc : NULL;
-       ceph_osdc_build_request(osd_req, obj_request->offset,
-                       snapc, CEPH_NOSNAP, &mtime);
+       osd_req->r_mtime = CURRENT_TIME;
+       osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
        osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
                                          GFP_NOIO);
        if (!osd_req)
-               return NULL;    /* ENOMEM */
+               goto fail;
 
        if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
        osd_req->r_priv = obj_request;
 
        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-       ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+                            obj_request->object_name))
+               goto fail;
+
+       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+               goto fail;
 
        return osd_req;
+
+fail:
+       ceph_osdc_put_request(osd_req);
+       return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
                                                false, GFP_NOIO);
        if (!osd_req)
-               return NULL;    /* ENOMEM */
+               goto fail;
 
        osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
 
        osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-       ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+                            obj_request->object_name))
+               goto fail;
+
+       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+               goto fail;
 
        return osd_req;
+
+fail:
+       ceph_osdc_put_request(osd_req);
+       return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;
+       int ret = 0;
 
        dout("%s: img %p\n", __func__, img_request);
-       for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-               int ret;
 
+       rbd_img_request_get(img_request);
+       for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
                ret = rbd_img_obj_request_submit(obj_request);
                if (ret)
-                       return ret;
+                       goto out_put_ireq;
        }
 
-       return 0;
+out_put_ireq:
+       rbd_img_request_put(img_request);
+       return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
        obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-       struct rbd_obj_request *obj_request;
-       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       int ret;
-
-       obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-                                                       OBJ_REQUEST_NODATA);
-       if (!obj_request)
-               return -ENOMEM;
-
-       ret = -ENOMEM;
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
-
-       osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-                                       notify_id, 0, 0);
-       rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-       ret = rbd_obj_request_submit(osdc, obj_request);
-       if (ret)
-               goto out;
-       ret = rbd_obj_request_wait(obj_request);
-out:
-       rbd_obj_request_put(obj_request);
-
-       return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+                        u64 notifier_id, void *data, size_t data_len)
 {
-       struct rbd_device *rbd_dev = (struct rbd_device *)data;
+       struct rbd_device *rbd_dev = arg;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        int ret;
 
-       dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-               rbd_dev->header_name, (unsigned long long)notify_id,
-               (unsigned int)opcode);
+       dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+            cookie, notify_id);
 
        /*
         * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
        if (ret)
                rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-       ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+       ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+                                  &rbd_dev->header_oloc, notify_id, cookie,
+                                  NULL, 0);
        if (ret)
                rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-                                               struct rbd_device *rbd_dev,
-                                               bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       struct ceph_options *opts = osdc->client->options;
-       struct rbd_obj_request *obj_request;
+       struct rbd_device *rbd_dev = arg;
        int ret;
 
-       obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-                                            OBJ_REQUEST_NODATA);
-       if (!obj_request)
-               return ERR_PTR(-ENOMEM);
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-                             rbd_dev->watch_event->cookie, 0, watch);
-       rbd_osd_req_format_write(obj_request);
+       rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-       if (watch)
-               ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-       ret = rbd_obj_request_submit(osdc, obj_request);
-       if (ret)
-               goto out;
+       __rbd_dev_header_unwatch_sync(rbd_dev);
 
-       ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-       if (ret)
-               goto out;
-
-       ret = obj_request->result;
+       ret = rbd_dev_header_watch_sync(rbd_dev);
        if (ret) {
-               if (watch)
-                       rbd_obj_request_end(obj_request);
-               goto out;
+               rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+               return;
        }
 
-       return obj_request;
-
-out:
-       rbd_obj_request_put(obj_request);
-       return ERR_PTR(ret);
+       ret = rbd_dev_refresh(rbd_dev);
+       if (ret)
+               rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-       struct rbd_obj_request *obj_request;
-       int ret;
+       struct ceph_osd_linger_request *handle;
 
-       rbd_assert(!rbd_dev->watch_event);
-       rbd_assert(!rbd_dev->watch_request);
+       rbd_assert(!rbd_dev->watch_handle);
 
-       ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-                                    &rbd_dev->watch_event);
-       if (ret < 0)
-               return ret;
+       handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+                                &rbd_dev->header_oloc, rbd_watch_cb,
+                                rbd_watch_errcb, rbd_dev);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
 
-       obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-       if (IS_ERR(obj_request)) {
-               ceph_osdc_cancel_event(rbd_dev->watch_event);
-               rbd_dev->watch_event = NULL;
-               return PTR_ERR(obj_request);
-       }
+       rbd_dev->watch_handle = handle;
+       return 0;
+}
 
-       /*
-        * A watch request is set to linger, so the underlying osd
-        * request won't go away until we unregister it.  We retain
-        * a pointer to the object request during that time (in
-        * rbd_dev->watch_request), so we'll keep a reference to it.
-        * We'll drop that reference after we've unregistered it in
-        * rbd_dev_header_unwatch_sync().
-        */
-       rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       int ret;
 
-       return 0;
+       if (!rbd_dev->watch_handle)
+               return;
+
+       ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+       if (ret)
+               rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+       rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-       struct rbd_obj_request *obj_request;
-
-       rbd_assert(rbd_dev->watch_event);
-       rbd_assert(rbd_dev->watch_request);
-
-       rbd_obj_request_end(rbd_dev->watch_request);
-       rbd_obj_request_put(rbd_dev->watch_request);
-       rbd_dev->watch_request = NULL;
-
-       obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-       if (!IS_ERR(obj_request))
-               rbd_obj_request_put(obj_request);
-       else
-               rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-                        PTR_ERR(obj_request));
-
-       ceph_osdc_cancel_event(rbd_dev->watch_event);
-       rbd_dev->watch_event = NULL;
+       __rbd_dev_header_unwatch_sync(rbd_dev);
 
        dout("%s flushing notifies\n", __func__);
        ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
                if (!ondisk)
                        return -ENOMEM;
 
-               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
                                       0, size, ondisk);
                if (ret < 0)
                        goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        bool need_put = !!rbd_dev->opts;
 
+       ceph_oid_destroy(&rbd_dev->header_oid);
+
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
+       ceph_oid_init(&rbd_dev->header_oid);
+       ceph_oloc_init(&rbd_dev->header_oloc);
+
        rbd_dev->dev.bus = &rbd_bus_type;
        rbd_dev->dev.type = &rbd_device_type;
        rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_size",
                                &snapid, sizeof (snapid),
                                &size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_object_prefix", NULL, 0,
                                reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
        u64 unsup;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_features",
                                &snapid, sizeof (snapid),
                                &features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_parent",
                                &snapid, sizeof (snapid),
                                reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        u64 stripe_count;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_stripe_unit_count", NULL, 0,
                                (char *)&striping_info_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_snapcontext", NULL, 0,
                                reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        snapid = cpu_to_le64(snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
                                "rbd", "get_snapshot_name",
                                &snapid, sizeof (snapid),
                                reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
        ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
        if (ret == -ENOENT && tries++ < 1) {
-               ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-                                              &newest_epoch);
+               ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+                                           &newest_epoch);
                if (ret < 0)
                        return ret;
 
                if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-                       ceph_monc_request_next_osdmap(&rbdc->client->monc);
+                       ceph_osdc_maybe_request_map(&rbdc->client->osdc);
                        (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
                                                     newest_epoch,
                                                     opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
        struct rbd_spec *spec = rbd_dev->spec;
-       size_t size;
+       int ret;
 
        /* Record the header object name for this rbd image. */
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+       rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
        if (rbd_dev->image_format == 1)
-               size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+               ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+                                      spec->image_name, RBD_SUFFIX);
        else
-               size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-       rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-       if (!rbd_dev->header_name)
-               return -ENOMEM;
+               ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+                                      RBD_HEADER_PREFIX, spec->image_id);
 
-       if (rbd_dev->image_format == 1)
-               sprintf(rbd_dev->header_name, "%s%s",
-                       spec->image_name, RBD_SUFFIX);
-       else
-               sprintf(rbd_dev->header_name, "%s%s",
-                       RBD_HEADER_PREFIX, spec->image_id);
-       return 0;
+       return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
        rbd_dev_unprobe(rbd_dev);
-       kfree(rbd_dev->header_name);
-       rbd_dev->header_name = NULL;
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
        rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
                                pr_info("image %s/%s does not exist\n",
                                        rbd_dev->spec->pool_name,
                                        rbd_dev->spec->image_name);
-                       goto out_header_name;
+                       goto err_out_format;
                }
        }
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
                goto err_out_probe;
 
        dout("discovered format %u image, header name is %s\n",
-               rbd_dev->image_format, rbd_dev->header_name);
+               rbd_dev->image_format, rbd_dev->header_oid.name);
        return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@ err_out_probe:
 err_out_watch:
        if (!depth)
                rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-       kfree(rbd_dev->header_name);
-       rbd_dev->header_name = NULL;
 err_out_format:
        rbd_dev->image_format = 0;
        kfree(rbd_dev->spec->image_id);
index 883045814dac2536fcb667b36e3db01d8715596e..1630a1f085f70cfeb7983b1329da1166ff6eb8e2 100644 (file)
@@ -59,6 +59,7 @@ static int clk_pwm_probe(struct platform_device *pdev)
        struct clk_init_data init;
        struct clk_pwm *clk_pwm;
        struct pwm_device *pwm;
+       struct pwm_args pargs;
        const char *clk_name;
        struct clk *clk;
        int ret;
@@ -71,22 +72,28 @@ static int clk_pwm_probe(struct platform_device *pdev)
        if (IS_ERR(pwm))
                return PTR_ERR(pwm);
 
-       if (!pwm->period) {
+       pwm_get_args(pwm, &pargs);
+       if (!pargs.period) {
                dev_err(&pdev->dev, "invalid PWM period\n");
                return -EINVAL;
        }
 
        if (of_property_read_u32(node, "clock-frequency", &clk_pwm->fixed_rate))
-               clk_pwm->fixed_rate = NSEC_PER_SEC / pwm->period;
+               clk_pwm->fixed_rate = NSEC_PER_SEC / pargs.period;
 
-       if (pwm->period != NSEC_PER_SEC / clk_pwm->fixed_rate &&
-           pwm->period != DIV_ROUND_UP(NSEC_PER_SEC, clk_pwm->fixed_rate)) {
+       if (pargs.period != NSEC_PER_SEC / clk_pwm->fixed_rate &&
+           pargs.period != DIV_ROUND_UP(NSEC_PER_SEC, clk_pwm->fixed_rate)) {
                dev_err(&pdev->dev,
                        "clock-frequency does not match PWM period\n");
                return -EINVAL;
        }
 
-       ret = pwm_config(pwm, (pwm->period + 1) >> 1, pwm->period);
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(pwm);
+       ret = pwm_config(pwm, (pargs.period + 1) >> 1, pargs.period);
        if (ret < 0)
                return ret;
 
index 035513b012eebf049cffbe713680c556993f944f..36bc11a106aa075ab2bf0d85d00f0acff2b60770 100644 (file)
@@ -78,9 +78,14 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
 static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
 static int cpufreq_start_governor(struct cpufreq_policy *policy);
 
-static inline int cpufreq_exit_governor(struct cpufreq_policy *policy)
+static inline void cpufreq_exit_governor(struct cpufreq_policy *policy)
 {
-       return cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+       (void)cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+}
+
+static inline void cpufreq_stop_governor(struct cpufreq_policy *policy)
+{
+       (void)cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 }
 
 /**
@@ -1026,13 +1031,8 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
                return 0;
 
        down_write(&policy->rwsem);
-       if (has_target()) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret) {
-                       pr_err("%s: Failed to stop governor\n", __func__);
-                       goto unlock;
-               }
-       }
+       if (has_target())
+               cpufreq_stop_governor(policy);
 
        cpumask_set_cpu(cpu, policy->cpus);
 
@@ -1041,8 +1041,6 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp
                if (ret)
                        pr_err("%s: Failed to start governor\n", __func__);
        }
-
-unlock:
        up_write(&policy->rwsem);
        return ret;
 }
@@ -1354,11 +1352,8 @@ static void cpufreq_offline(unsigned int cpu)
        }
 
        down_write(&policy->rwsem);
-       if (has_target()) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret)
-                       pr_err("%s: Failed to stop governor\n", __func__);
-       }
+       if (has_target())
+               cpufreq_stop_governor(policy);
 
        cpumask_clear_cpu(cpu, policy->cpus);
 
@@ -1387,12 +1382,8 @@ static void cpufreq_offline(unsigned int cpu)
        if (cpufreq_driver->stop_cpu)
                cpufreq_driver->stop_cpu(policy);
 
-       /* If cpu is last user of policy, free policy */
-       if (has_target()) {
-               ret = cpufreq_exit_governor(policy);
-               if (ret)
-                       pr_err("%s: Failed to exit governor\n", __func__);
-       }
+       if (has_target())
+               cpufreq_exit_governor(policy);
 
        /*
         * Perform the ->exit() even during light-weight tear-down,
@@ -1626,7 +1617,6 @@ EXPORT_SYMBOL(cpufreq_generic_suspend);
 void cpufreq_suspend(void)
 {
        struct cpufreq_policy *policy;
-       int ret;
 
        if (!cpufreq_driver)
                return;
@@ -1639,14 +1629,8 @@ void cpufreq_suspend(void)
        for_each_active_policy(policy) {
                if (has_target()) {
                        down_write(&policy->rwsem);
-                       ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
+                       cpufreq_stop_governor(policy);
                        up_write(&policy->rwsem);
-
-                       if (ret) {
-                               pr_err("%s: Failed to stop governor for policy: %p\n",
-                                       __func__, policy);
-                               continue;
-                       }
                }
 
                if (cpufreq_driver->suspend && cpufreq_driver->suspend(policy))
@@ -2049,16 +2033,15 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event)
 
        ret = policy->governor->governor(policy, event);
 
-       if (!ret) {
-               if (event == CPUFREQ_GOV_POLICY_INIT)
+       if (event == CPUFREQ_GOV_POLICY_INIT) {
+               if (ret)
+                       module_put(policy->governor->owner);
+               else
                        policy->governor->initialized++;
-               else if (event == CPUFREQ_GOV_POLICY_EXIT)
-                       policy->governor->initialized--;
-       }
-
-       if (((event == CPUFREQ_GOV_POLICY_INIT) && ret) ||
-                       ((event == CPUFREQ_GOV_POLICY_EXIT) && !ret))
+       } else if (event == CPUFREQ_GOV_POLICY_EXIT) {
+               policy->governor->initialized--;
                module_put(policy->governor->owner);
+       }
 
        return ret;
 }
@@ -2221,20 +2204,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
        old_gov = policy->governor;
        /* end old governor */
        if (old_gov) {
-               ret = cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-               if (ret) {
-                       /* This can happen due to race with other operations */
-                       pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
-                                __func__, old_gov->name, ret);
-                       return ret;
-               }
-
-               ret = cpufreq_exit_governor(policy);
-               if (ret) {
-                       pr_err("%s: Failed to Exit Governor: %s (%d)\n",
-                              __func__, old_gov->name, ret);
-                       return ret;
-               }
+               cpufreq_stop_governor(policy);
+               cpufreq_exit_governor(policy);
        }
 
        /* start new governor */
@@ -2495,10 +2466,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 
        register_hotcpu_notifier(&cpufreq_cpu_notifier);
        pr_debug("driver %s up and running\n", driver_data->name);
-
-out:
-       put_online_cpus();
-       return ret;
+       goto out;
 
 err_if_unreg:
        subsys_interface_unregister(&cpufreq_interface);
@@ -2508,7 +2476,9 @@ err_null_driver:
        write_lock_irqsave(&cpufreq_driver_lock, flags);
        cpufreq_driver = NULL;
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
-       goto out;
+out:
+       put_online_cpus();
+       return ret;
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_driver);
 
index b76a98dd9988b4f553ddbdd47b663f8720e5fc89..3a9c4325d6e224a55b0aa334476b5cc99529f3c9 100644 (file)
@@ -1461,12 +1461,11 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
        intel_pstate_clear_update_util_hook(policy->cpu);
 
        cpu = all_cpu_data[0];
-       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate) {
-               if (policy->max < policy->cpuinfo.max_freq &&
-                   policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
-                       pr_debug("policy->max > max non turbo frequency\n");
-                       policy->max = policy->cpuinfo.max_freq;
-               }
+       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
+           policy->max < policy->cpuinfo.max_freq &&
+           policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
+               pr_debug("policy->max > max non turbo frequency\n");
+               policy->max = policy->cpuinfo.max_freq;
        }
 
        if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
index 6f602c7a71bd80fc6c8376574483e1bae788885f..643f43179df16c1a56efc2930d61f31c7d7a79db 100644 (file)
@@ -307,17 +307,24 @@ static int mtk_cpufreq_set_target(struct cpufreq_policy *policy,
        return 0;
 }
 
+#define DYNAMIC_POWER "dynamic-power-coefficient"
+
 static void mtk_cpufreq_ready(struct cpufreq_policy *policy)
 {
        struct mtk_cpu_dvfs_info *info = policy->driver_data;
        struct device_node *np = of_node_get(info->cpu_dev->of_node);
+       u32 capacitance = 0;
 
        if (WARN_ON(!np))
                return;
 
        if (of_find_property(np, "#cooling-cells", NULL)) {
-               info->cdev = of_cpufreq_cooling_register(np,
-                                                        policy->related_cpus);
+               of_property_read_u32(np, DYNAMIC_POWER, &capacitance);
+
+               info->cdev = of_cpufreq_power_cooling_register(np,
+                                               policy->related_cpus,
+                                               capacitance,
+                                               NULL);
 
                if (IS_ERR(info->cdev)) {
                        dev_err(info->cpu_dev,
index 2b8e6ce62e816eb3887b983d47a8a2a4c96929e1..a4d0059e232cbd22478d29d92370eb027e8bde79 100644 (file)
@@ -214,7 +214,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
                tick_broadcast_exit();
        }
 
-       if (!cpuidle_state_is_coupled(drv, entered_state))
+       if (!cpuidle_state_is_coupled(drv, index))
                local_irq_enable();
 
        /*
index a0788763757b0d650ab25d213aea9d1b5c8c032d..8357d571553a56471ff42ab0f048bab7f52f475f 100644 (file)
@@ -1638,6 +1638,12 @@ static int pwm_setup_backlight(struct intel_connector *connector,
                return -ENODEV;
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(panel->backlight.pwm);
+
        retval = pwm_config(panel->backlight.pwm, CRC_PMIC_PWM_PERIOD_NS,
                            CRC_PMIC_PWM_PERIOD_NS);
        if (retval < 0) {
index 952fe692d7649a67d1dfe1d8d8c2c2ed8272a719..24e395c5907d4c549912072c3219bd24c389488e 100644 (file)
@@ -58,7 +58,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 };
  */
 static int apd = -1;
 module_param(apd, bint, 0);
-MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode");
+MODULE_PARM_DESC(apd, "Set to zero to disable anti-parallel diode mode");
 
 struct temperature {
        s8      degrees;
index 0addc84ba948a09e6b740db56593fbd124b8c8d5..69166ab3151d52db66421ad353082fd57cf4cf3d 100644 (file)
@@ -77,7 +77,6 @@ static const u8 LM75_REG_TEMP[3] = {
 struct lm75_data {
        struct i2c_client       *client;
        struct device           *hwmon_dev;
-       struct thermal_zone_device      *tz;
        struct mutex            update_lock;
        u8                      orig_conf;
        u8                      resolution;     /* In bits, between 9 and 12 */
@@ -306,11 +305,9 @@ lm75_probe(struct i2c_client *client, const struct i2c_device_id *id)
        if (IS_ERR(data->hwmon_dev))
                return PTR_ERR(data->hwmon_dev);
 
-       data->tz = thermal_zone_of_sensor_register(data->hwmon_dev, 0,
-                                                  data->hwmon_dev,
-                                                  &lm75_of_thermal_ops);
-       if (IS_ERR(data->tz))
-               data->tz = NULL;
+       devm_thermal_zone_of_sensor_register(data->hwmon_dev, 0,
+                                            data->hwmon_dev,
+                                            &lm75_of_thermal_ops);
 
        dev_info(dev, "%s: sensor '%s'\n",
                 dev_name(data->hwmon_dev), client->name);
@@ -322,7 +319,6 @@ static int lm75_remove(struct i2c_client *client)
 {
        struct lm75_data *data = i2c_get_clientdata(client);
 
-       thermal_zone_of_sensor_unregister(data->hwmon_dev, data->tz);
        hwmon_device_unregister(data->hwmon_dev);
        lm75_write_value(client, LM75_REG_CONF, data->orig_conf);
        return 0;
index faa6e8dfbaaf56a28bcf305b1db3da04aa7f6be3..8ef7b713cb1aafed515aa27766636c76c46778bd 100644 (file)
@@ -259,7 +259,6 @@ struct ntc_data {
        struct device *dev;
        int n_comp;
        char name[PLATFORM_NAME_SIZE];
-       struct thermal_zone_device *tz;
 };
 
 #if defined(CONFIG_OF) && IS_ENABLED(CONFIG_IIO)
@@ -579,6 +578,7 @@ static const struct thermal_zone_of_device_ops ntc_of_thermal_ops = {
 
 static int ntc_thermistor_probe(struct platform_device *pdev)
 {
+       struct thermal_zone_device *tz;
        const struct of_device_id *of_id =
                        of_match_device(of_match_ptr(ntc_match), &pdev->dev);
        const struct platform_device_id *pdev_id;
@@ -677,12 +677,10 @@ static int ntc_thermistor_probe(struct platform_device *pdev)
        dev_info(&pdev->dev, "Thermistor type: %s successfully probed.\n",
                                                                pdev_id->name);
 
-       data->tz = thermal_zone_of_sensor_register(data->dev, 0, data->dev,
-                                                  &ntc_of_thermal_ops);
-       if (IS_ERR(data->tz)) {
+       tz = devm_thermal_zone_of_sensor_register(data->dev, 0, data->dev,
+                                                 &ntc_of_thermal_ops);
+       if (IS_ERR(tz))
                dev_dbg(&pdev->dev, "Failed to register to thermal fw.\n");
-               data->tz = NULL;
-       }
 
        return 0;
 err_after_sysfs:
@@ -700,8 +698,6 @@ static int ntc_thermistor_remove(struct platform_device *pdev)
        sysfs_remove_group(&data->dev->kobj, &ntc_attr_group);
        ntc_iio_channel_release(pdata);
 
-       thermal_zone_of_sensor_unregister(data->dev, data->tz);
-
        return 0;
 }
 
index 3e23003f78b01ca731e8c232f38e1132eccf251f..f9af3935b427c4cef758197fd4e11b4b7759c2eb 100644 (file)
@@ -40,15 +40,18 @@ struct pwm_fan_ctx {
 
 static int  __set_pwm(struct pwm_fan_ctx *ctx, unsigned long pwm)
 {
+       struct pwm_args pargs;
        unsigned long duty;
        int ret = 0;
 
+       pwm_get_args(ctx->pwm, &pargs);
+
        mutex_lock(&ctx->lock);
        if (ctx->pwm_value == pwm)
                goto exit_set_pwm_err;
 
-       duty = DIV_ROUND_UP(pwm * (ctx->pwm->period - 1), MAX_PWM);
-       ret = pwm_config(ctx->pwm, duty, ctx->pwm->period);
+       duty = DIV_ROUND_UP(pwm * (pargs.period - 1), MAX_PWM);
+       ret = pwm_config(ctx->pwm, duty, pargs.period);
        if (ret)
                goto exit_set_pwm_err;
 
@@ -215,6 +218,7 @@ static int pwm_fan_probe(struct platform_device *pdev)
 {
        struct thermal_cooling_device *cdev;
        struct pwm_fan_ctx *ctx;
+       struct pwm_args pargs;
        struct device *hwmon;
        int duty_cycle;
        int ret;
@@ -233,11 +237,19 @@ static int pwm_fan_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, ctx);
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(ctx->pwm);
+
        /* Set duty cycle to maximum allowed */
-       duty_cycle = ctx->pwm->period - 1;
+       pwm_get_args(ctx->pwm, &pargs);
+
+       duty_cycle = pargs.period - 1;
        ctx->pwm_value = MAX_PWM;
 
-       ret = pwm_config(ctx->pwm, duty_cycle, ctx->pwm->period);
+       ret = pwm_config(ctx->pwm, duty_cycle, pargs.period);
        if (ret) {
                dev_err(&pdev->dev, "Failed to configure PWM\n");
                return ret;
@@ -303,14 +315,16 @@ static int pwm_fan_suspend(struct device *dev)
 static int pwm_fan_resume(struct device *dev)
 {
        struct pwm_fan_ctx *ctx = dev_get_drvdata(dev);
+       struct pwm_args pargs;
        unsigned long duty;
        int ret;
 
        if (ctx->pwm_value == 0)
                return 0;
 
-       duty = DIV_ROUND_UP(ctx->pwm_value * (ctx->pwm->period - 1), MAX_PWM);
-       ret = pwm_config(ctx->pwm, duty, ctx->pwm->period);
+       pwm_get_args(ctx->pwm, &pargs);
+       duty = DIV_ROUND_UP(ctx->pwm_value * (pargs.period - 1), MAX_PWM);
+       ret = pwm_config(ctx->pwm, duty, pargs.period);
        if (ret)
                return ret;
        return pwm_enable(ctx->pwm);
index 912b449c83038b0402391fc50503b930066f874c..25b44e68926d1f9464345f5f17efecc365e129ce 100644 (file)
@@ -31,10 +31,8 @@ struct sensor_data {
 };
 
 struct scpi_thermal_zone {
-       struct list_head list;
        int sensor_id;
        struct scpi_sensors *scpi_sensors;
-       struct thermal_zone_device *tzd;
 };
 
 struct scpi_sensors {
@@ -92,20 +90,6 @@ scpi_show_label(struct device *dev, struct device_attribute *attr, char *buf)
        return sprintf(buf, "%s\n", sensor->info.name);
 }
 
-static void
-unregister_thermal_zones(struct platform_device *pdev,
-                        struct scpi_sensors *scpi_sensors)
-{
-       struct list_head *pos;
-
-       list_for_each(pos, &scpi_sensors->thermal_zones) {
-               struct scpi_thermal_zone *zone;
-
-               zone = list_entry(pos, struct scpi_thermal_zone, list);
-               thermal_zone_of_sensor_unregister(&pdev->dev, zone->tzd);
-       }
-}
-
 static struct thermal_zone_of_device_ops scpi_sensor_ops = {
        .get_temp = scpi_read_temp,
 };
@@ -118,7 +102,7 @@ static int scpi_hwmon_probe(struct platform_device *pdev)
        struct scpi_ops *scpi_ops;
        struct device *hwdev, *dev = &pdev->dev;
        struct scpi_sensors *scpi_sensors;
-       int ret, idx;
+       int idx, ret;
 
        scpi_ops = get_scpi_ops();
        if (!scpi_ops)
@@ -232,47 +216,34 @@ static int scpi_hwmon_probe(struct platform_device *pdev)
        INIT_LIST_HEAD(&scpi_sensors->thermal_zones);
        for (i = 0; i < nr_sensors; i++) {
                struct sensor_data *sensor = &scpi_sensors->data[i];
+               struct thermal_zone_device *z;
                struct scpi_thermal_zone *zone;
 
                if (sensor->info.class != TEMPERATURE)
                        continue;
 
                zone = devm_kzalloc(dev, sizeof(*zone), GFP_KERNEL);
-               if (!zone) {
-                       ret = -ENOMEM;
-                       goto unregister_tzd;
-               }
+               if (!zone)
+                       return -ENOMEM;
 
                zone->sensor_id = i;
                zone->scpi_sensors = scpi_sensors;
-               zone->tzd = thermal_zone_of_sensor_register(dev,
-                               sensor->info.sensor_id, zone, &scpi_sensor_ops);
+               z = devm_thermal_zone_of_sensor_register(dev,
+                                                        sensor->info.sensor_id,
+                                                        zone,
+                                                        &scpi_sensor_ops);
                /*
                 * The call to thermal_zone_of_sensor_register returns
                 * an error for sensors that are not associated with
                 * any thermal zones or if the thermal subsystem is
                 * not configured.
                 */
-               if (IS_ERR(zone->tzd)) {
+               if (IS_ERR(z)) {
                        devm_kfree(dev, zone);
                        continue;
                }
-               list_add(&zone->list, &scpi_sensors->thermal_zones);
        }
 
-       return 0;
-
-unregister_tzd:
-       unregister_thermal_zones(pdev, scpi_sensors);
-       return ret;
-}
-
-static int scpi_hwmon_remove(struct platform_device *pdev)
-{
-       struct scpi_sensors *scpi_sensors = platform_get_drvdata(pdev);
-
-       unregister_thermal_zones(pdev, scpi_sensors);
-
        return 0;
 }
 
@@ -288,7 +259,6 @@ static struct platform_driver scpi_hwmon_platdrv = {
                .of_match_table = scpi_of_match,
        },
        .probe          = scpi_hwmon_probe,
-       .remove         = scpi_hwmon_remove,
 };
 module_platform_driver(scpi_hwmon_platdrv);
 
index 5289aa0980a8fac074822ddb412f35e7a9380bd5..f1e96fd7f445bff92965c286b188a2ec253b0813 100644 (file)
@@ -53,7 +53,6 @@
 struct tmp102 {
        struct i2c_client *client;
        struct device *hwmon_dev;
-       struct thermal_zone_device *tz;
        struct mutex lock;
        u16 config_orig;
        unsigned long last_update;
@@ -232,10 +231,8 @@ static int tmp102_probe(struct i2c_client *client,
                goto fail_restore_config;
        }
        tmp102->hwmon_dev = hwmon_dev;
-       tmp102->tz = thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
-                                                    &tmp102_of_thermal_ops);
-       if (IS_ERR(tmp102->tz))
-               tmp102->tz = NULL;
+       devm_thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
+                                            &tmp102_of_thermal_ops);
 
        dev_info(dev, "initialized\n");
 
@@ -251,7 +248,6 @@ static int tmp102_remove(struct i2c_client *client)
 {
        struct tmp102 *tmp102 = i2c_get_clientdata(client);
 
-       thermal_zone_of_sensor_unregister(tmp102->hwmon_dev, tmp102->tz);
        hwmon_device_unregister(tmp102->hwmon_dev);
 
        /* Stop monitoring if device was stopped originally */
index 6d96bff32a0e3776190ee2c34bf2ea461866c057..29ddeb7be84bb485f676ea1fb632db48a53d9cfc 100644 (file)
@@ -70,10 +70,13 @@ struct max77693_haptic {
 
 static int max77693_haptic_set_duty_cycle(struct max77693_haptic *haptic)
 {
-       int delta = (haptic->pwm_dev->period + haptic->pwm_duty) / 2;
+       struct pwm_args pargs;
+       int delta;
        int error;
 
-       error = pwm_config(haptic->pwm_dev, delta, haptic->pwm_dev->period);
+       pwm_get_args(haptic->pwm_dev, &pargs);
+       delta = (pargs.period + haptic->pwm_duty) / 2;
+       error = pwm_config(haptic->pwm_dev, delta, pargs.period);
        if (error) {
                dev_err(haptic->dev, "failed to configure pwm: %d\n", error);
                return error;
@@ -234,6 +237,7 @@ static int max77693_haptic_play_effect(struct input_dev *dev, void *data,
                                       struct ff_effect *effect)
 {
        struct max77693_haptic *haptic = input_get_drvdata(dev);
+       struct pwm_args pargs;
        u64 period_mag_multi;
 
        haptic->magnitude = effect->u.rumble.strong_magnitude;
@@ -245,7 +249,8 @@ static int max77693_haptic_play_effect(struct input_dev *dev, void *data,
         * The formula to convert magnitude to pwm_duty as follows:
         * - pwm_duty = (magnitude * pwm_period) / MAX_MAGNITUDE(0xFFFF)
         */
-       period_mag_multi = (u64)haptic->pwm_dev->period * haptic->magnitude;
+       pwm_get_args(haptic->pwm_dev, &pargs);
+       period_mag_multi = (u64)pargs.period * haptic->magnitude;
        haptic->pwm_duty = (unsigned int)(period_mag_multi >>
                                                MAX_MAGNITUDE_SHIFT);
 
@@ -329,6 +334,12 @@ static int max77693_haptic_probe(struct platform_device *pdev)
                return PTR_ERR(haptic->pwm_dev);
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(haptic->pwm_dev);
+
        haptic->motor_reg = devm_regulator_get(&pdev->dev, "haptic");
        if (IS_ERR(haptic->motor_reg)) {
                dev_err(&pdev->dev, "failed to get regulator\n");
index 8d6326d7e7beaf1875bb95af385dff1285f10b0e..99bc762881d5d40bc0c0788a59698f227da8737d 100644 (file)
@@ -306,6 +306,12 @@ static int max8997_haptic_probe(struct platform_device *pdev)
                                error);
                        goto err_free_mem;
                }
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(chip->pwm);
                break;
 
        default:
index f2261ab5470126f0935c3ca9d3830b55699dc378..8d71332687455d51d0391d2f5425d3cbf48edd8f 100644 (file)
@@ -87,6 +87,12 @@ static int pwm_beeper_probe(struct platform_device *pdev)
                goto err_free;
        }
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(beeper->pwm);
+
        beeper->input = input_allocate_device();
        if (!beeper->input) {
                dev_err(&pdev->dev, "Failed to allocate input device\n");
index 485794376ee5e656d91e89b94998d7a44ffc9992..d07dd29d4848f03d91e40bd762078b969a23bac1 100644 (file)
 struct sun4i_ts_data {
        struct device *dev;
        struct input_dev *input;
-       struct thermal_zone_device *tz;
        void __iomem *base;
        unsigned int irq;
        bool ignore_fifo_data;
@@ -366,10 +365,7 @@ static int sun4i_ts_probe(struct platform_device *pdev)
        if (IS_ERR(hwmon))
                return PTR_ERR(hwmon);
 
-       ts->tz = thermal_zone_of_sensor_register(ts->dev, 0, ts,
-                                                &sun4i_ts_tz_ops);
-       if (IS_ERR(ts->tz))
-               ts->tz = NULL;
+       devm_thermal_zone_of_sensor_register(ts->dev, 0, ts, &sun4i_ts_tz_ops);
 
        writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC);
 
@@ -377,7 +373,6 @@ static int sun4i_ts_probe(struct platform_device *pdev)
                error = input_register_device(ts->input);
                if (error) {
                        writel(0, ts->base + TP_INT_FIFOC);
-                       thermal_zone_of_sensor_unregister(ts->dev, ts->tz);
                        return error;
                }
        }
@@ -394,8 +389,6 @@ static int sun4i_ts_remove(struct platform_device *pdev)
        if (ts->input)
                input_unregister_device(ts->input);
 
-       thermal_zone_of_sensor_unregister(ts->dev, ts->tz);
-
        /* Deactivate all IRQs */
        writel(0, ts->base + TP_INT_FIFOC);
 
index b2bfb9594508feea4169a9c7704d25c95acf1490..a644d0cec2d8275d202fd3d871673b7787dd258c 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
 #include <linux/memory.h>
+#include <linux/cpu.h>
 #include <linux/timer.h>
 #include <linux/io.h>
 #include <linux/iova.h>
@@ -390,6 +391,7 @@ struct dmar_domain {
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */
 
+       bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */
 
@@ -456,27 +458,32 @@ static LIST_HEAD(dmar_rmrr_units);
 
 static void flush_unmaps_timeout(unsigned long data);
 
-static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+struct deferred_flush_entry {
+       unsigned long iova_pfn;
+       unsigned long nrpages;
+       struct dmar_domain *domain;
+       struct page *freelist;
+};
 
 #define HIGH_WATER_MARK 250
-struct deferred_flush_tables {
+struct deferred_flush_table {
        int next;
-       struct iova *iova[HIGH_WATER_MARK];
-       struct dmar_domain *domain[HIGH_WATER_MARK];
-       struct page *freelist[HIGH_WATER_MARK];
+       struct deferred_flush_entry entries[HIGH_WATER_MARK];
+};
+
+struct deferred_flush_data {
+       spinlock_t lock;
+       int timer_on;
+       struct timer_list timer;
+       long size;
+       struct deferred_flush_table *tables;
 };
 
-static struct deferred_flush_tables *deferred_flush;
+static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 
 /* bitmap for indexing intel_iommus */
 static int g_num_of_iommus;
 
-static DEFINE_SPINLOCK(async_umap_flush_lock);
-static LIST_HEAD(unmaps_to_do);
-
-static int timer_on;
-static long list_size;
-
 static void domain_exit(struct dmar_domain *domain);
 static void domain_remove_dev_info(struct dmar_domain *domain);
 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
@@ -1458,10 +1465,35 @@ iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
        return NULL;
 }
 
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+       struct device_domain_info *info;
+       bool has_iotlb_device = false;
+
+       assert_spin_locked(&device_domain_lock);
+
+       list_for_each_entry(info, &domain->devices, link) {
+               struct pci_dev *pdev;
+
+               if (!info->dev || !dev_is_pci(info->dev))
+                       continue;
+
+               pdev = to_pci_dev(info->dev);
+               if (pdev->ats_enabled) {
+                       has_iotlb_device = true;
+                       break;
+               }
+       }
+
+       domain->has_iotlb_device = has_iotlb_device;
+}
+
 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!info || !dev_is_pci(info->dev))
                return;
 
@@ -1481,6 +1513,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 #endif
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
+               domain_update_iotlb(info->domain);
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
 }
@@ -1489,6 +1522,8 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!dev_is_pci(info->dev))
                return;
 
@@ -1497,6 +1532,7 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
        if (info->ats_enabled) {
                pci_disable_ats(pdev);
                info->ats_enabled = 0;
+               domain_update_iotlb(info->domain);
        }
 #ifdef CONFIG_INTEL_IOMMU_SVM
        if (info->pri_enabled) {
@@ -1517,6 +1553,9 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
        unsigned long flags;
        struct device_domain_info *info;
 
+       if (!domain->has_iotlb_device)
+               return;
+
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
@@ -1734,6 +1773,7 @@ static struct dmar_domain *alloc_domain(int flags)
        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
+       domain->has_iotlb_device = false;
        INIT_LIST_HEAD(&domain->devices);
 
        return domain;
@@ -1918,8 +1958,12 @@ static void domain_exit(struct dmar_domain *domain)
                return;
 
        /* Flush any lazy unmaps that may reference this domain */
-       if (!intel_iommu_strict)
-               flush_unmaps_timeout(0);
+       if (!intel_iommu_strict) {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
 
        /* Remove associated devices and clear attached or cached domains */
        rcu_read_lock();
@@ -3077,7 +3121,7 @@ static int __init init_dmars(void)
        bool copied_tables = false;
        struct device *dev;
        struct intel_iommu *iommu;
-       int i, ret;
+       int i, ret, cpu;
 
        /*
         * for each drhd
@@ -3110,11 +3154,20 @@ static int __init init_dmars(void)
                goto error;
        }
 
-       deferred_flush = kzalloc(g_num_of_iommus *
-               sizeof(struct deferred_flush_tables), GFP_KERNEL);
-       if (!deferred_flush) {
-               ret = -ENOMEM;
-               goto free_g_iommus;
+       for_each_possible_cpu(cpu) {
+               struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
+                                                             cpu);
+
+               dfd->tables = kzalloc(g_num_of_iommus *
+                                     sizeof(struct deferred_flush_table),
+                                     GFP_KERNEL);
+               if (!dfd->tables) {
+                       ret = -ENOMEM;
+                       goto free_g_iommus;
+               }
+
+               spin_lock_init(&dfd->lock);
+               setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
        }
 
        for_each_active_iommu(iommu, drhd) {
@@ -3291,19 +3344,20 @@ free_iommu:
                disable_dmar_iommu(iommu);
                free_dmar_iommu(iommu);
        }
-       kfree(deferred_flush);
 free_g_iommus:
+       for_each_possible_cpu(cpu)
+               kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
        kfree(g_iommus);
 error:
        return ret;
 }
 
 /* This takes a number of _MM_ pages, not VTD pages */
-static struct iova *intel_alloc_iova(struct device *dev,
+static unsigned long intel_alloc_iova(struct device *dev,
                                     struct dmar_domain *domain,
                                     unsigned long nrpages, uint64_t dma_mask)
 {
-       struct iova *iova = NULL;
+       unsigned long iova_pfn = 0;
 
        /* Restrict dma_mask to the width that the iommu can handle */
        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
@@ -3316,19 +3370,19 @@ static struct iova *intel_alloc_iova(struct device *dev,
                 * DMA_BIT_MASK(32) and if that fails then try allocating
                 * from higher range
                 */
-               iova = alloc_iova(&domain->iovad, nrpages,
-                                 IOVA_PFN(DMA_BIT_MASK(32)), 1);
-               if (iova)
-                       return iova;
+               iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+                                          IOVA_PFN(DMA_BIT_MASK(32)));
+               if (iova_pfn)
+                       return iova_pfn;
        }
-       iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
-       if (unlikely(!iova)) {
+       iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
+       if (unlikely(!iova_pfn)) {
                pr_err("Allocating %ld-page iova for %s failed",
                       nrpages, dev_name(dev));
-               return NULL;
+               return 0;
        }
 
-       return iova;
+       return iova_pfn;
 }
 
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
@@ -3426,7 +3480,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 {
        struct dmar_domain *domain;
        phys_addr_t start_paddr;
-       struct iova *iova;
+       unsigned long iova_pfn;
        int prot = 0;
        int ret;
        struct intel_iommu *iommu;
@@ -3444,8 +3498,8 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        iommu = domain_get_iommu(domain);
        size = aligned_nrpages(paddr, size);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
-       if (!iova)
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+       if (!iova_pfn)
                goto error;
 
        /*
@@ -3463,7 +3517,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
         * might have two guest_addr mapping to the same host paddr, but this
         * is not a big problem
         */
-       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
                                 mm_to_dma_pfn(paddr_pfn), size, prot);
        if (ret)
                goto error;
@@ -3471,18 +3525,18 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain,
-                                     mm_to_dma_pfn(iova->pfn_lo),
+                                     mm_to_dma_pfn(iova_pfn),
                                      size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
-       start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+       start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
        start_paddr += paddr & ~PAGE_MASK;
        return start_paddr;
 
 error:
-       if (iova)
-               __free_iova(&domain->iovad, iova);
+       if (iova_pfn)
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
        pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
                dev_name(dev), size, (unsigned long long)paddr, dir);
        return 0;
@@ -3497,91 +3551,120 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
                                  dir, *dev->dma_mask);
 }
 
-static void flush_unmaps(void)
+static void flush_unmaps(struct deferred_flush_data *flush_data)
 {
        int i, j;
 
-       timer_on = 0;
+       flush_data->timer_on = 0;
 
        /* just flush them all */
        for (i = 0; i < g_num_of_iommus; i++) {
                struct intel_iommu *iommu = g_iommus[i];
+               struct deferred_flush_table *flush_table =
+                               &flush_data->tables[i];
                if (!iommu)
                        continue;
 
-               if (!deferred_flush[i].next)
+               if (!flush_table->next)
                        continue;
 
                /* In caching mode, global flushes turn emulation expensive */
                if (!cap_caching_mode(iommu->cap))
                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
-               for (j = 0; j < deferred_flush[i].next; j++) {
+               for (j = 0; j < flush_table->next; j++) {
                        unsigned long mask;
-                       struct iova *iova = deferred_flush[i].iova[j];
-                       struct dmar_domain *domain = deferred_flush[i].domain[j];
+                       struct deferred_flush_entry *entry =
+                                               &flush_table->entries[j];
+                       unsigned long iova_pfn = entry->iova_pfn;
+                       unsigned long nrpages = entry->nrpages;
+                       struct dmar_domain *domain = entry->domain;
+                       struct page *freelist = entry->freelist;
 
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain,
-                                       iova->pfn_lo, iova_size(iova),
-                                       !deferred_flush[i].freelist[j], 0);
+                                       mm_to_dma_pfn(iova_pfn),
+                                       nrpages, !freelist, 0);
                        else {
-                               mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
-                               iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-                                               (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                               mask = ilog2(nrpages);
+                               iommu_flush_dev_iotlb(domain,
+                                               (uint64_t)iova_pfn << PAGE_SHIFT, mask);
                        }
-                       __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
-                       if (deferred_flush[i].freelist[j])
-                               dma_free_pagelist(deferred_flush[i].freelist[j]);
+                       free_iova_fast(&domain->iovad, iova_pfn, nrpages);
+                       if (freelist)
+                               dma_free_pagelist(freelist);
                }
-               deferred_flush[i].next = 0;
+               flush_table->next = 0;
        }
 
-       list_size = 0;
+       flush_data->size = 0;
 }
 
-static void flush_unmaps_timeout(unsigned long data)
+static void flush_unmaps_timeout(unsigned long cpuid)
 {
+       struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
        unsigned long flags;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       flush_unmaps();
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       spin_lock_irqsave(&flush_data->lock, flags);
+       flush_unmaps(flush_data);
+       spin_unlock_irqrestore(&flush_data->lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
+static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
+                     unsigned long nrpages, struct page *freelist)
 {
        unsigned long flags;
-       int next, iommu_id;
+       int entry_id, iommu_id;
        struct intel_iommu *iommu;
+       struct deferred_flush_entry *entry;
+       struct deferred_flush_data *flush_data;
+       unsigned int cpuid;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       if (list_size == HIGH_WATER_MARK)
-               flush_unmaps();
+       cpuid = get_cpu();
+       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
+
+       /* Flush all CPUs' entries to avoid deferring too much.  If
+        * this becomes a bottleneck, we can flush only this CPU's
+        * entries and rely on the flush timers for the rest.
+        */
+       if (flush_data->size == HIGH_WATER_MARK) {
+               int cpu;
+
+               for_each_online_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
+
+       spin_lock_irqsave(&flush_data->lock, flags);
 
        iommu = domain_get_iommu(dom);
        iommu_id = iommu->seq_id;
 
-       next = deferred_flush[iommu_id].next;
-       deferred_flush[iommu_id].domain[next] = dom;
-       deferred_flush[iommu_id].iova[next] = iova;
-       deferred_flush[iommu_id].freelist[next] = freelist;
-       deferred_flush[iommu_id].next++;
+       entry_id = flush_data->tables[iommu_id].next;
+       ++(flush_data->tables[iommu_id].next);
 
-       if (!timer_on) {
-               mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
-               timer_on = 1;
+       entry = &flush_data->tables[iommu_id].entries[entry_id];
+       entry->domain = dom;
+       entry->iova_pfn = iova_pfn;
+       entry->nrpages = nrpages;
+       entry->freelist = freelist;
+
+       if (!flush_data->timer_on) {
+               mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
+               flush_data->timer_on = 1;
        }
-       list_size++;
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       flush_data->size++;
+       spin_unlock_irqrestore(&flush_data->lock, flags);
+
+       put_cpu();
 }
 
-static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
 {
        struct dmar_domain *domain;
        unsigned long start_pfn, last_pfn;
-       struct iova *iova;
+       unsigned long nrpages;
+       unsigned long iova_pfn;
        struct intel_iommu *iommu;
        struct page *freelist;
 
@@ -3593,13 +3676,11 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        iommu = domain_get_iommu(domain);
 
-       iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-       if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
-                     (unsigned long long)dev_addr))
-               return;
+       iova_pfn = IOVA_PFN(dev_addr);
 
-       start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-       last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+       nrpages = aligned_nrpages(dev_addr, size);
+       start_pfn = mm_to_dma_pfn(iova_pfn);
+       last_pfn = start_pfn + nrpages - 1;
 
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 dev_name(dev), start_pfn, last_pfn);
@@ -3608,12 +3689,12 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
-                                     last_pfn - start_pfn + 1, !freelist, 0);
+                                     nrpages, !freelist, 0);
                /* free iova */
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
                dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova, freelist);
+               add_unmap(domain, iova_pfn, nrpages, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3625,7 +3706,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
                             size_t size, enum dma_data_direction dir,
                             struct dma_attrs *attrs)
 {
-       intel_unmap(dev, dev_addr);
+       intel_unmap(dev, dev_addr, size);
 }
 
 static void *intel_alloc_coherent(struct device *dev, size_t size,
@@ -3684,7 +3765,7 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
        size = PAGE_ALIGN(size);
        order = get_order(size);
 
-       intel_unmap(dev, dma_handle);
+       intel_unmap(dev, dma_handle, size);
        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
                __free_pages(page, order);
 }
@@ -3693,7 +3774,16 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
                           int nelems, enum dma_data_direction dir,
                           struct dma_attrs *attrs)
 {
-       intel_unmap(dev, sglist[0].dma_address);
+       dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+       unsigned long nrpages = 0;
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sglist, sg, nelems, i) {
+               nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+       }
+
+       intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
 }
 
 static int intel_nontranslate_map_sg(struct device *hddev,
@@ -3717,7 +3807,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        struct dmar_domain *domain;
        size_t size = 0;
        int prot = 0;
-       struct iova *iova = NULL;
+       unsigned long iova_pfn;
        int ret;
        struct scatterlist *sg;
        unsigned long start_vpfn;
@@ -3736,9 +3826,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        for_each_sg(sglist, sg, nelems, i)
                size += aligned_nrpages(sg->offset, sg->length);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
                                *dev->dma_mask);
-       if (!iova) {
+       if (!iova_pfn) {
                sglist->dma_length = 0;
                return 0;
        }
@@ -3753,13 +3843,13 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;
 
-       start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+       start_vpfn = mm_to_dma_pfn(iova_pfn);
 
        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
        if (unlikely(ret)) {
                dma_pte_free_pagetable(domain, start_vpfn,
                                       start_vpfn + size - 1);
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
                return 0;
        }
 
@@ -4498,6 +4588,46 @@ static struct notifier_block intel_iommu_memory_nb = {
        .priority = 0
 };
 
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+       int i;
+
+       for (i = 0; i < g_num_of_iommus; i++) {
+               struct intel_iommu *iommu = g_iommus[i];
+               struct dmar_domain *domain;
+               int did; /* int: cap_ndoms() can exceed the u16 range */
+
+               if (!iommu)
+                       continue;
+
+               for (did = 0; did < cap_ndoms(iommu->cap); did++) {
+                       domain = get_iommu_domain(iommu, (u16)did);
+
+                       if (!domain)
+                               continue;
+                       free_cpu_cached_iovas(cpu, &domain->iovad);
+               }
+       }
+}
+
+static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
+                                   unsigned long action, void *v)
+{
+       unsigned int cpu = (unsigned long)v;
+
+       switch (action) {
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               free_all_cpu_cached_iovas(cpu);
+               flush_unmaps_timeout(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_cpu_nb = {
+       .notifier_call = intel_iommu_cpu_notifier,
+};
 
 static ssize_t intel_iommu_show_version(struct device *dev,
                                        struct device_attribute *attr,
@@ -4631,7 +4761,6 @@ int __init intel_iommu_init(void)
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
-       init_timer(&unmap_timer);
 #ifdef CONFIG_SWIOTLB
        swiotlb = 0;
 #endif
@@ -4648,6 +4777,7 @@ int __init intel_iommu_init(void)
        bus_register_notifier(&pci_bus_type, &device_nb);
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
+       register_hotcpu_notifier(&intel_iommu_cpu_nb);
 
        intel_iommu_enabled = 1;
 
index fa0adef32bd6d3a4af1b97ee3b1fb22dde6c225f..ba764a0835d3cd881981f0e049956e6f71a62788 100644 (file)
 #include <linux/iova.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+                              unsigned long pfn,
+                              unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -38,6 +49,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
        iovad->granule = granule;
        iovad->start_pfn = start_pfn;
        iovad->dma_32bit_pfn = pfn_32bit;
+       init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -291,33 +303,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-       unsigned long flags;
-       struct rb_node *node;
+       struct rb_node *node = iovad->rbroot.rb_node;
+
+       assert_spin_locked(&iovad->iova_rbtree_lock);
 
-       /* Take the lock so that no other thread is manipulating the rbtree */
-       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       node = iovad->rbroot.rb_node;
        while (node) {
                struct iova *iova = container_of(node, struct iova, node);
 
                /* If pfn falls within iova's range, return iova */
                if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-                       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-                       /* We are not holding the lock while this iova
-                        * is referenced by the caller as the same thread
-                        * which called this function also calls __free_iova()
-                        * and it is by design that only one thread can possibly
-                        * reference a particular iova and hence no conflict.
-                        */
                        return iova;
                }
 
@@ -327,9 +324,35 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
                        node = node->rb_right;
        }
 
-       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
        return NULL;
 }
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+       assert_spin_locked(&iovad->iova_rbtree_lock);
+       __cached_rbnode_delete_update(iovad, iova);
+       rb_erase(&iova->node, &iovad->rbroot);
+       free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given domain which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+       unsigned long flags;
+       struct iova *iova;
+
+       /* Take the lock so that no other thread is manipulating the rbtree */
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+       iova = private_find_iova(iovad, pfn);
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+       return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -344,10 +367,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
        unsigned long flags;
 
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       __cached_rbnode_delete_update(iovad, iova);
-       rb_erase(&iova->node, &iovad->rbroot);
+       private_free_iova(iovad, iova);
        spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-       free_iova_mem(iova);
 }
 EXPORT_SYMBOL_GPL(__free_iova);
 
@@ -369,6 +390,63 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - number of page frames to allocate
+ * @limit_pfn: - max limit address
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure.
+ */
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+               unsigned long limit_pfn)
+{
+       bool flushed_rcache = false;
+       unsigned long iova_pfn;
+       struct iova *new_iova;
+
+       iova_pfn = iova_rcache_get(iovad, size, limit_pfn);
+       if (iova_pfn)
+               return iova_pfn;
+
+retry:
+       new_iova = alloc_iova(iovad, size, limit_pfn, true);
+       if (!new_iova) {
+               unsigned int cpu;
+
+               if (flushed_rcache)
+                       return 0;
+
+               /* Try replenishing IOVAs by flushing rcache. */
+               flushed_rcache = true;
+               for_each_online_cpu(cpu)
+                       free_cpu_cached_iovas(cpu, iovad);
+               goto retry;
+       }
+
+       return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that was allocated previously
+ * @size: - # of pages in range
+ * This function frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+       if (iova_rcache_insert(iovad, pfn, size))
+               return;
+
+       free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
 /**
  * put_iova_domain - destroys the iova doamin
  * @iovad: - iova domain in question.
@@ -379,6 +457,7 @@ void put_iova_domain(struct iova_domain *iovad)
        struct rb_node *node;
        unsigned long flags;
 
+       free_iova_rcaches(iovad);
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
        node = rb_first(&iovad->rbroot);
        while (node) {
@@ -550,5 +629,295 @@ error:
        return NULL;
 }
 
+/*
+ * Magazine caches for IOVA ranges.  For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+       unsigned long size;
+       unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+       spinlock_t lock;
+       struct iova_magazine *loaded;
+       struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+       return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+       kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+       unsigned long flags;
+       int i;
+
+       if (!mag)
+               return;
+
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+       for (i = 0 ; i < mag->size; ++i) {
+               struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+               BUG_ON(!iova);
+               private_free_iova(iovad, iova);
+       }
+
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+       mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+       return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+       return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+                                      unsigned long limit_pfn)
+{
+       BUG_ON(iova_magazine_empty(mag));
+
+       if (mag->pfns[mag->size - 1] >= limit_pfn)
+               return 0;
+
+       return mag->pfns[--mag->size];
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+       BUG_ON(iova_magazine_full(mag));
+
+       mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned int cpu;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               spin_lock_init(&rcache->lock);
+               rcache->depot_size = 0;
+               rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+               if (WARN_ON(!rcache->cpu_rcaches))
+                       continue;
+               for_each_possible_cpu(cpu) {
+                       cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+                       spin_lock_init(&cpu_rcache->lock);
+                       cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+                       cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+               }
+       }
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success.  Can fail if both per-CPU magazines are full and
+ * no new magazine can be allocated; free_iova_fast() (the caller, via
+ * iova_rcache_insert()) then returns the IOVA range to the rbtree instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+                                struct iova_rcache *rcache,
+                                unsigned long iova_pfn)
+{
+       struct iova_magazine *mag_to_free = NULL;
+       struct iova_cpu_rcache *cpu_rcache;
+       bool can_insert = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_full(cpu_rcache->loaded)) {
+               can_insert = true;
+       } else if (!iova_magazine_full(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               can_insert = true;
+       } else {
+               struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+               if (new_mag) {
+                       spin_lock(&rcache->lock);
+                       if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+                               rcache->depot[rcache->depot_size++] =
+                                               cpu_rcache->loaded;
+                       } else {
+                               mag_to_free = cpu_rcache->loaded;
+                       }
+                       spin_unlock(&rcache->lock);
+
+                       cpu_rcache->loaded = new_mag;
+                       can_insert = true;
+               }
+       }
+
+       if (can_insert)
+               iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       if (mag_to_free) {
+               iova_magazine_free_pfns(mag_to_free, iovad);
+               iova_magazine_free(mag_to_free);
+       }
+
+       return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+                              unsigned long size)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return false;
+
+       return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
+ * satisfy the request, return the cached pfn of a matching range and
+ * remove it from the 'rcache'; otherwise return 0.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+                                      unsigned long limit_pfn)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       unsigned long iova_pfn = 0;
+       bool has_pfn = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_empty(cpu_rcache->loaded)) {
+               has_pfn = true;
+       } else if (!iova_magazine_empty(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               has_pfn = true;
+       } else {
+               spin_lock(&rcache->lock);
+               if (rcache->depot_size > 0) {
+                       iova_magazine_free(cpu_rcache->loaded);
+                       cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+                       has_pfn = true;
+               }
+               spin_unlock(&rcache->lock);
+       }
+
+       if (has_pfn)
+               iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache.  Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return 0;
+
+       return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn);
+}
+
+/*
+ * Free a cpu's rcache.
+ */
+static void free_cpu_iova_rcache(unsigned int cpu, struct iova_domain *iovad,
+                                struct iova_rcache *rcache)
+{
+       struct iova_cpu_rcache *cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+       iova_magazine_free(cpu_rcache->loaded);
+
+       iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+       iova_magazine_free(cpu_rcache->prev);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       unsigned int cpu;
+       int i, j;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               for_each_possible_cpu(cpu)
+                       free_cpu_iova_rcache(cpu, iovad, rcache);
+               spin_lock_irqsave(&rcache->lock, flags);
+               free_percpu(rcache->cpu_rcaches);
+               for (j = 0; j < rcache->depot_size; ++j) {
+                       iova_magazine_free_pfns(rcache->depot[j], iovad);
+                       iova_magazine_free(rcache->depot[j]);
+               }
+               spin_unlock_irqrestore(&rcache->lock, flags);
+       }
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (on cpu unplug or rcache flush)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+               spin_lock_irqsave(&cpu_rcache->lock, flags);
+               iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+               iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+               spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+       }
+}
+
 MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
 MODULE_LICENSE("GPL");
index 4783bacb2e9d8e48c043223fd89643069d93f4b9..a9145aa7f36a017327f9c7b74140ec5d1a730e40 100644 (file)
@@ -91,6 +91,7 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
                       struct led_pwm *led, struct device_node *child)
 {
        struct led_pwm_data *led_data = &priv->leds[priv->num_leds];
+       struct pwm_args pargs;
        int ret;
 
        led_data->active_low = led->active_low;
@@ -117,7 +118,15 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
        else
                led_data->cdev.brightness_set_blocking = led_pwm_set_blocking;
 
-       led_data->period = pwm_get_period(led_data->pwm);
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to the
+        * atomic PWM API.
+        */
+       pwm_apply_args(led_data->pwm);
+
+       pwm_get_args(led_data->pwm, &pargs);
+
+       led_data->period = pargs.period;
        if (!led_data->period && (led->pwm_period_ns > 0))
                led_data->period = led->pwm_period_ns;
 
index c61a284133e023df5da91d2ecafe3e15199941e1..81ddb17575a99dc9c77fbfecb595d77be054befa 100644 (file)
@@ -51,6 +51,7 @@ config TI_EMIF
 
 config OMAP_GPMC
        bool
+       select GPIOLIB
        help
          This driver is for the General Purpose Memory Controller (GPMC)
          present on Texas Instruments SoCs (e.g. OMAP2+). GPMC allows
index 2a691da8c1c7c0441c08d97a8d1e711099a32299..904b4af5f1424ef978d317ba052f3a268d111818 100644 (file)
@@ -59,11 +59,11 @@ int fsl_ifc_find(phys_addr_t addr_base)
 {
        int i = 0;
 
-       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->gregs)
                return -ENODEV;
 
        for (i = 0; i < fsl_ifc_ctrl_dev->banks; i++) {
-               u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->regs->cspr_cs[i].cspr);
+               u32 cspr = ifc_in32(&fsl_ifc_ctrl_dev->gregs->cspr_cs[i].cspr);
                if (cspr & CSPR_V && (cspr & CSPR_BA) ==
                                convert_ifc_address(addr_base))
                        return i;
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(fsl_ifc_find);
 
 static int fsl_ifc_ctrl_init(struct fsl_ifc_ctrl *ctrl)
 {
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
 
        /*
         * Clear all the common status and event registers
@@ -104,7 +104,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev)
        irq_dispose_mapping(ctrl->nand_irq);
        irq_dispose_mapping(ctrl->irq);
 
-       iounmap(ctrl->regs);
+       iounmap(ctrl->gregs);
 
        dev_set_drvdata(&dev->dev, NULL);
        kfree(ctrl);
@@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(nand_irq_lock);
 
 static u32 check_nand_stat(struct fsl_ifc_ctrl *ctrl)
 {
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        unsigned long flags;
        u32 stat;
 
@@ -157,7 +157,7 @@ static irqreturn_t fsl_ifc_nand_irq(int irqno, void *data)
 static irqreturn_t fsl_ifc_ctrl_irq(int irqno, void *data)
 {
        struct fsl_ifc_ctrl *ctrl = data;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc = ctrl->gregs;
        u32 err_axiid, err_srcid, status, cs_err, err_addr;
        irqreturn_t ret = IRQ_NONE;
 
@@ -215,6 +215,7 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
 {
        int ret = 0;
        int version, banks;
+       void __iomem *addr;
 
        dev_info(&dev->dev, "Freescale Integrated Flash Controller\n");
 
@@ -225,22 +226,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
        dev_set_drvdata(&dev->dev, fsl_ifc_ctrl_dev);
 
        /* IOMAP the entire IFC region */
-       fsl_ifc_ctrl_dev->regs = of_iomap(dev->dev.of_node, 0);
-       if (!fsl_ifc_ctrl_dev->regs) {
+       fsl_ifc_ctrl_dev->gregs = of_iomap(dev->dev.of_node, 0);
+       if (!fsl_ifc_ctrl_dev->gregs) {
                dev_err(&dev->dev, "failed to get memory region\n");
                ret = -ENODEV;
                goto err;
        }
 
-       version = ifc_in32(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
-                       FSL_IFC_VERSION_MASK;
-       banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
-       dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
-               version >> 24, (version >> 16) & 0xf, banks);
-
-       fsl_ifc_ctrl_dev->version = version;
-       fsl_ifc_ctrl_dev->banks = banks;
-
        if (of_property_read_bool(dev->dev.of_node, "little-endian")) {
                fsl_ifc_ctrl_dev->little_endian = true;
                dev_dbg(&dev->dev, "IFC REGISTERS are LITTLE endian\n");
@@ -249,8 +241,9 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
                dev_dbg(&dev->dev, "IFC REGISTERS are BIG endian\n");
        }
 
-       version = ioread32be(&fsl_ifc_ctrl_dev->regs->ifc_rev) &
+       version = ifc_in32(&fsl_ifc_ctrl_dev->gregs->ifc_rev) &
                        FSL_IFC_VERSION_MASK;
+
        banks = (version == FSL_IFC_VERSION_1_0_0) ? 4 : 8;
        dev_info(&dev->dev, "IFC version %d.%d, %d banks\n",
                version >> 24, (version >> 16) & 0xf, banks);
@@ -258,6 +251,13 @@ static int fsl_ifc_ctrl_probe(struct platform_device *dev)
        fsl_ifc_ctrl_dev->version = version;
        fsl_ifc_ctrl_dev->banks = banks;
 
+       addr = fsl_ifc_ctrl_dev->gregs;
+       if (version >= FSL_IFC_VERSION_2_0_0)
+               addr += PGOFFSET_64K;
+       else
+               addr += PGOFFSET_4K;
+       fsl_ifc_ctrl_dev->rregs = addr;
+
        /* get the Controller level irq */
        fsl_ifc_ctrl_dev->irq = irq_of_parse_and_map(dev->dev.of_node, 0);
        if (fsl_ifc_ctrl_dev->irq == 0) {
index 21825ddce4a3c83f70cd0c44beccee91210b37ee..af4884ba6b7cafa6fe5b2eeda1bee00e59fdd2a3 100644 (file)
 #include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
+#include <linux/irqdomain.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
-#include <linux/of_mtd.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/omap-gpmc.h>
-#include <linux/mtd/nand.h>
 #include <linux/pm_runtime.h>
 
 #include <linux/platform_data/mtd-nand-omap2.h>
@@ -81,6 +81,8 @@
 
 #define GPMC_CONFIG_LIMITEDADDRESS             BIT(1)
 
+#define GPMC_STATUS_EMPTYWRITEBUFFERSTATUS     BIT(0)
+
 #define        GPMC_CONFIG2_CSEXTRADELAY               BIT(7)
 #define        GPMC_CONFIG3_ADVEXTRADELAY              BIT(7)
 #define        GPMC_CONFIG4_OEEXTRADELAY               BIT(7)
 #define GPMC_CS_SIZE           0x30
 #define        GPMC_BCH_SIZE           0x10
 
+/*
+ * The first 1MB of GPMC address space is typically mapped to
+ * the internal ROM. Never allocate the first page, to
+ * facilitate bug detection; even if we didn't boot from ROM.
+ * As GPMC minimum partition size is 16MB we can only start from
+ * there.
+ */
+#define GPMC_MEM_START         0x1000000
 #define GPMC_MEM_END           0x3FFFFFFF
 
 #define GPMC_CHUNK_SHIFT       24              /* 16 MB */
 #define GPMC_CONFIG_RDY_BSY    0x00000001
 #define GPMC_CONFIG_DEV_SIZE   0x00000002
 #define GPMC_CONFIG_DEV_TYPE   0x00000003
-#define GPMC_SET_IRQ_STATUS    0x00000004
 
 #define GPMC_CONFIG1_WRAPBURST_SUPP     (1 << 31)
 #define GPMC_CONFIG1_READMULTIPLE_SUPP  (1 << 30)
 #define GPMC_CONFIG_WRITEPROTECT       0x00000010
 #define WR_RD_PIN_MONITORING           0x00600000
 
-#define GPMC_ENABLE_IRQ                0x0000000d
-
 /* ECC commands */
 #define GPMC_ECC_READ          0 /* Reset Hardware ECC for read */
 #define GPMC_ECC_WRITE         1 /* Reset Hardware ECC for write */
 #define GPMC_ECC_READSYN       2 /* Reset before syndrom is read back */
 
-/* XXX: Only NAND irq has been considered,currently these are the only ones used
- */
-#define        GPMC_NR_IRQ             2
+#define        GPMC_NR_NAND_IRQS       2 /* number of NAND specific IRQs */
 
 enum gpmc_clk_domain {
        GPMC_CD_FCLK,
@@ -199,11 +204,6 @@ struct gpmc_cs_data {
        struct resource mem;
 };
 
-struct gpmc_client_irq {
-       unsigned                irq;
-       u32                     bitmask;
-};
-
 /* Structure to save gpmc cs context */
 struct gpmc_cs_config {
        u32 config1;
@@ -231,9 +231,15 @@ struct omap3_gpmc_regs {
        struct gpmc_cs_config cs_context[GPMC_CS_NUM];
 };
 
-static struct gpmc_client_irq gpmc_client_irq[GPMC_NR_IRQ];
-static struct irq_chip gpmc_irq_chip;
-static int gpmc_irq_start;
+struct gpmc_device {
+       struct device *dev;
+       int irq;
+       struct irq_chip irq_chip;
+       struct gpio_chip gpio_chip;
+       int nirqs;
+};
+
+static struct irq_domain *gpmc_irq_domain;
 
 static struct resource gpmc_mem_root;
 static struct gpmc_cs_data gpmc_cs[GPMC_CS_NUM];
@@ -241,8 +247,6 @@ static DEFINE_SPINLOCK(gpmc_mem_lock);
 /* Define chip-selects as reserved by default until probe completes */
 static unsigned int gpmc_cs_num = GPMC_CS_NUM;
 static unsigned int gpmc_nr_waitpins;
-static struct device *gpmc_dev;
-static int gpmc_irq;
 static resource_size_t phys_base, mem_size;
 static unsigned gpmc_capability;
 static void __iomem *gpmc_base;
@@ -1054,14 +1058,6 @@ int gpmc_configure(int cmd, int wval)
        u32 regval;
 
        switch (cmd) {
-       case GPMC_ENABLE_IRQ:
-               gpmc_write_reg(GPMC_IRQENABLE, wval);
-               break;
-
-       case GPMC_SET_IRQ_STATUS:
-               gpmc_write_reg(GPMC_IRQSTATUS, wval);
-               break;
-
        case GPMC_CONFIG_WP:
                regval = gpmc_read_reg(GPMC_CONFIG);
                if (wval)
@@ -1084,7 +1080,7 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
 {
        int i;
 
-       reg->gpmc_status = gpmc_base + GPMC_STATUS;
+       reg->gpmc_status = NULL;        /* deprecated */
        reg->gpmc_nand_command = gpmc_base + GPMC_CS0_OFFSET +
                                GPMC_CS_NAND_COMMAND + GPMC_CS_SIZE * cs;
        reg->gpmc_nand_address = gpmc_base + GPMC_CS0_OFFSET +
@@ -1118,87 +1114,201 @@ void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
        }
 }
 
-int gpmc_get_client_irq(unsigned irq_config)
+static bool gpmc_nand_writebuffer_empty(void)
 {
-       int i;
+       if (gpmc_read_reg(GPMC_STATUS) & GPMC_STATUS_EMPTYWRITEBUFFERSTATUS)
+               return true;
 
-       if (hweight32(irq_config) > 1)
+       return false;
+}
+
+static struct gpmc_nand_ops nand_ops = {
+       .nand_writebuffer_empty = gpmc_nand_writebuffer_empty,
+};
+
+/**
+ * gpmc_omap_get_nand_ops - Get the GPMC NAND interface
+ * @reg: the GPMC NAND register map exclusive for NAND use.
+ * @cs: GPMC chip select number on which the NAND sits. The
+ *      register map returned will be specific to this chip select.
+ *
+ * Returns NULL on error e.g. invalid cs.
+ */
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *reg, int cs)
+{
+       if (cs >= gpmc_cs_num)
+               return NULL;
+
+       gpmc_update_nand_reg(reg, cs);
+
+       return &nand_ops;
+}
+EXPORT_SYMBOL_GPL(gpmc_omap_get_nand_ops);
+
+int gpmc_get_client_irq(unsigned irq_config)
+{
+       if (!gpmc_irq_domain) {
+               pr_warn("%s called before GPMC IRQ domain available\n",
+                       __func__);
                return 0;
+       }
 
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (gpmc_client_irq[i].bitmask & irq_config)
-                       return gpmc_client_irq[i].irq;
+       /* we restrict this to NAND IRQs only */
+       if (irq_config >= GPMC_NR_NAND_IRQS)
+               return 0;
 
-       return 0;
+       return irq_create_mapping(gpmc_irq_domain, irq_config);
 }
 
-static int gpmc_irq_endis(unsigned irq, bool endis)
+static int gpmc_irq_endis(unsigned long hwirq, bool endis)
 {
-       int i;
        u32 regval;
 
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (irq == gpmc_client_irq[i].irq) {
-                       regval = gpmc_read_reg(GPMC_IRQENABLE);
-                       if (endis)
-                               regval |= gpmc_client_irq[i].bitmask;
-                       else
-                               regval &= ~gpmc_client_irq[i].bitmask;
-                       gpmc_write_reg(GPMC_IRQENABLE, regval);
-                       break;
-               }
+       /* bits GPMC_NR_NAND_IRQS to 7 are reserved */
+       if (hwirq >= GPMC_NR_NAND_IRQS)
+               hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       regval = gpmc_read_reg(GPMC_IRQENABLE);
+       if (endis)
+               regval |= BIT(hwirq);
+       else
+               regval &= ~BIT(hwirq);
+       gpmc_write_reg(GPMC_IRQENABLE, regval);
 
        return 0;
 }
 
 static void gpmc_irq_disable(struct irq_data *p)
 {
-       gpmc_irq_endis(p->irq, false);
+       gpmc_irq_endis(p->hwirq, false);
 }
 
 static void gpmc_irq_enable(struct irq_data *p)
 {
-       gpmc_irq_endis(p->irq, true);
+       gpmc_irq_endis(p->hwirq, true);
 }
 
-static void gpmc_irq_noop(struct irq_data *data) { }
+static void gpmc_irq_mask(struct irq_data *d)
+{
+       gpmc_irq_endis(d->hwirq, false);
+}
 
-static unsigned int gpmc_irq_noop_ret(struct irq_data *data) { return 0; }
+static void gpmc_irq_unmask(struct irq_data *d)
+{
+       gpmc_irq_endis(d->hwirq, true);
+}
 
-static int gpmc_setup_irq(void)
+static void gpmc_irq_edge_config(unsigned long hwirq, bool rising_edge)
 {
-       int i;
        u32 regval;
 
-       if (!gpmc_irq)
+       /* NAND IRQs polarity is not configurable */
+       if (hwirq < GPMC_NR_NAND_IRQS)
+               return;
+
+       /* WAITPIN starts at BIT 8 */
+       hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       regval = gpmc_read_reg(GPMC_CONFIG);
+       if (rising_edge)
+               regval &= ~BIT(hwirq);
+       else
+               regval |= BIT(hwirq);
+
+       gpmc_write_reg(GPMC_CONFIG, regval);
+}
+
+static void gpmc_irq_ack(struct irq_data *d)
+{
+       unsigned int hwirq = d->hwirq;
+
+       /* skip reserved bits */
+       if (hwirq >= GPMC_NR_NAND_IRQS)
+               hwirq += 8 - GPMC_NR_NAND_IRQS;
+
+       /* Setting bit to 1 clears (or Acks) the interrupt */
+       gpmc_write_reg(GPMC_IRQSTATUS, BIT(hwirq));
+}
+
+static int gpmc_irq_set_type(struct irq_data *d, unsigned int trigger)
+{
+       /* can't set type for NAND IRQs */
+       if (d->hwirq < GPMC_NR_NAND_IRQS)
                return -EINVAL;
 
-       gpmc_irq_start = irq_alloc_descs(-1, 0, GPMC_NR_IRQ, 0);
-       if (gpmc_irq_start < 0) {
-               pr_err("irq_alloc_descs failed\n");
-               return gpmc_irq_start;
+       /* We can support either rising or falling edge at a time */
+       if (trigger == IRQ_TYPE_EDGE_FALLING)
+               gpmc_irq_edge_config(d->hwirq, false);
+       else if (trigger == IRQ_TYPE_EDGE_RISING)
+               gpmc_irq_edge_config(d->hwirq, true);
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static int gpmc_irq_map(struct irq_domain *d, unsigned int virq,
+                       irq_hw_number_t hw)
+{
+       struct gpmc_device *gpmc = d->host_data;
+
+       irq_set_chip_data(virq, gpmc);
+       if (hw < GPMC_NR_NAND_IRQS) {
+               irq_modify_status(virq, IRQ_NOREQUEST, IRQ_NOAUTOEN);
+               irq_set_chip_and_handler(virq, &gpmc->irq_chip,
+                                        handle_simple_irq);
+       } else {
+               irq_set_chip_and_handler(virq, &gpmc->irq_chip,
+                                        handle_edge_irq);
        }
 
-       gpmc_irq_chip.name = "gpmc";
-       gpmc_irq_chip.irq_startup = gpmc_irq_noop_ret;
-       gpmc_irq_chip.irq_enable = gpmc_irq_enable;
-       gpmc_irq_chip.irq_disable = gpmc_irq_disable;
-       gpmc_irq_chip.irq_shutdown = gpmc_irq_noop;
-       gpmc_irq_chip.irq_ack = gpmc_irq_noop;
-       gpmc_irq_chip.irq_mask = gpmc_irq_noop;
-       gpmc_irq_chip.irq_unmask = gpmc_irq_noop;
-
-       gpmc_client_irq[0].bitmask = GPMC_IRQ_FIFOEVENTENABLE;
-       gpmc_client_irq[1].bitmask = GPMC_IRQ_COUNT_EVENT;
-
-       for (i = 0; i < GPMC_NR_IRQ; i++) {
-               gpmc_client_irq[i].irq = gpmc_irq_start + i;
-               irq_set_chip_and_handler(gpmc_client_irq[i].irq,
-                                       &gpmc_irq_chip, handle_simple_irq);
-               irq_modify_status(gpmc_client_irq[i].irq, IRQ_NOREQUEST,
-                                 IRQ_NOAUTOEN);
+       return 0;
+}
+
+static const struct irq_domain_ops gpmc_irq_domain_ops = {
+       .map    = gpmc_irq_map,
+       .xlate  = irq_domain_xlate_twocell,
+};
+
+static irqreturn_t gpmc_handle_irq(int irq, void *data)
+{
+       int hwirq, virq;
+       u32 regval, regvalx;
+       struct gpmc_device *gpmc = data;
+
+       regval = gpmc_read_reg(GPMC_IRQSTATUS);
+       regvalx = regval;
+
+       if (!regval)
+               return IRQ_NONE;
+
+       for (hwirq = 0; hwirq < gpmc->nirqs; hwirq++) {
+               /* skip reserved status bits */
+               if (hwirq == GPMC_NR_NAND_IRQS)
+                       regvalx >>= 8 - GPMC_NR_NAND_IRQS;
+
+               if (regvalx & BIT(hwirq)) {
+                       virq = irq_find_mapping(gpmc_irq_domain, hwirq);
+                       if (!virq) {
+                               dev_warn(gpmc->dev,
+                                        "spurious irq detected hwirq %d, virq %d\n",
+                                        hwirq, virq);
+                       }
+
+                       generic_handle_irq(virq);
+               }
        }
 
+       gpmc_write_reg(GPMC_IRQSTATUS, regval);
+
+       return IRQ_HANDLED;
+}
+
+static int gpmc_setup_irq(struct gpmc_device *gpmc)
+{
+       u32 regval;
+       int rc;
+
        /* Disable interrupts */
        gpmc_write_reg(GPMC_IRQENABLE, 0);
 
@@ -1206,22 +1316,45 @@ static int gpmc_setup_irq(void)
        regval = gpmc_read_reg(GPMC_IRQSTATUS);
        gpmc_write_reg(GPMC_IRQSTATUS, regval);
 
-       return request_irq(gpmc_irq, gpmc_handle_irq, 0, "gpmc", NULL);
+       gpmc->irq_chip.name = "gpmc";
+       gpmc->irq_chip.irq_enable = gpmc_irq_enable;
+       gpmc->irq_chip.irq_disable = gpmc_irq_disable;
+       gpmc->irq_chip.irq_ack = gpmc_irq_ack;
+       gpmc->irq_chip.irq_mask = gpmc_irq_mask;
+       gpmc->irq_chip.irq_unmask = gpmc_irq_unmask;
+       gpmc->irq_chip.irq_set_type = gpmc_irq_set_type;
+
+       gpmc_irq_domain = irq_domain_add_linear(gpmc->dev->of_node,
+                                               gpmc->nirqs,
+                                               &gpmc_irq_domain_ops,
+                                               gpmc);
+       if (!gpmc_irq_domain) {
+               dev_err(gpmc->dev, "IRQ domain add failed\n");
+               return -ENODEV;
+       }
+
+       rc = request_irq(gpmc->irq, gpmc_handle_irq, 0, "gpmc", gpmc);
+       if (rc) {
+               dev_err(gpmc->dev, "failed to request irq %d: %d\n",
+                       gpmc->irq, rc);
+               irq_domain_remove(gpmc_irq_domain);
+               gpmc_irq_domain = NULL;
+       }
+
+       return rc;
 }
 
-static int gpmc_free_irq(void)
+static int gpmc_free_irq(struct gpmc_device *gpmc)
 {
-       int i;
+       int hwirq;
 
-       if (gpmc_irq)
-               free_irq(gpmc_irq, NULL);
+       free_irq(gpmc->irq, gpmc);
 
-       for (i = 0; i < GPMC_NR_IRQ; i++) {
-               irq_set_handler(gpmc_client_irq[i].irq, NULL);
-               irq_set_chip(gpmc_client_irq[i].irq, &no_irq_chip);
-       }
+       for (hwirq = 0; hwirq < gpmc->nirqs; hwirq++)
+               irq_dispose_mapping(irq_find_mapping(gpmc_irq_domain, hwirq));
 
-       irq_free_descs(gpmc_irq_start, GPMC_NR_IRQ);
+       irq_domain_remove(gpmc_irq_domain);
+       gpmc_irq_domain = NULL;
 
        return 0;
 }
@@ -1242,12 +1375,7 @@ static void gpmc_mem_init(void)
 {
        int cs;
 
-       /*
-        * The first 1MB of GPMC address space is typically mapped to
-        * the internal ROM. Never allocate the first page, to
-        * facilitate bug detection; even if we didn't boot from ROM.
-        */
-       gpmc_mem_root.start = SZ_1M;
+       gpmc_mem_root.start = GPMC_MEM_START;
        gpmc_mem_root.end = GPMC_MEM_END;
 
        /* Reserve all regions that has been set up by bootloader */
@@ -1796,105 +1924,6 @@ static void __maybe_unused gpmc_read_timings_dt(struct device_node *np,
                of_property_read_bool(np, "gpmc,time-para-granularity");
 }
 
-#if IS_ENABLED(CONFIG_MTD_NAND)
-
-static const char * const nand_xfer_types[] = {
-       [NAND_OMAP_PREFETCH_POLLED]             = "prefetch-polled",
-       [NAND_OMAP_POLLED]                      = "polled",
-       [NAND_OMAP_PREFETCH_DMA]                = "prefetch-dma",
-       [NAND_OMAP_PREFETCH_IRQ]                = "prefetch-irq",
-};
-
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-                                struct device_node *child)
-{
-       u32 val;
-       const char *s;
-       struct gpmc_timings gpmc_t;
-       struct omap_nand_platform_data *gpmc_nand_data;
-
-       if (of_property_read_u32(child, "reg", &val) < 0) {
-               dev_err(&pdev->dev, "%s has no 'reg' property\n",
-                       child->full_name);
-               return -ENODEV;
-       }
-
-       gpmc_nand_data = devm_kzalloc(&pdev->dev, sizeof(*gpmc_nand_data),
-                                     GFP_KERNEL);
-       if (!gpmc_nand_data)
-               return -ENOMEM;
-
-       gpmc_nand_data->cs = val;
-       gpmc_nand_data->of_node = child;
-
-       /* Detect availability of ELM module */
-       gpmc_nand_data->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
-       if (gpmc_nand_data->elm_of_node == NULL)
-               gpmc_nand_data->elm_of_node =
-                                       of_parse_phandle(child, "elm_id", 0);
-
-       /* select ecc-scheme for NAND */
-       if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
-               pr_err("%s: ti,nand-ecc-opt not found\n", __func__);
-               return -ENODEV;
-       }
-
-       if (!strcmp(s, "sw"))
-               gpmc_nand_data->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
-       else if (!strcmp(s, "ham1") ||
-                !strcmp(s, "hw") || !strcmp(s, "hw-romcode"))
-               gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_HAM1_CODE_HW;
-       else if (!strcmp(s, "bch4"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH4_CODE_HW;
-               else
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
-       else if (!strcmp(s, "bch8"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH8_CODE_HW;
-               else
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
-       else if (!strcmp(s, "bch16"))
-               if (gpmc_nand_data->elm_of_node)
-                       gpmc_nand_data->ecc_opt =
-                               OMAP_ECC_BCH16_CODE_HW;
-               else
-                       pr_err("%s: BCH16 requires ELM support\n", __func__);
-       else
-               pr_err("%s: ti,nand-ecc-opt invalid value\n", __func__);
-
-       /* select data transfer mode for NAND controller */
-       if (!of_property_read_string(child, "ti,nand-xfer-type", &s))
-               for (val = 0; val < ARRAY_SIZE(nand_xfer_types); val++)
-                       if (!strcasecmp(s, nand_xfer_types[val])) {
-                               gpmc_nand_data->xfer_type = val;
-                               break;
-                       }
-
-       gpmc_nand_data->flash_bbt = of_get_nand_on_flash_bbt(child);
-
-       val = of_get_nand_bus_width(child);
-       if (val == 16)
-               gpmc_nand_data->devsize = NAND_BUSWIDTH_16;
-
-       gpmc_read_timings_dt(child, &gpmc_t);
-       gpmc_nand_init(gpmc_nand_data, &gpmc_t);
-
-       return 0;
-}
-#else
-static int gpmc_probe_nand_child(struct platform_device *pdev,
-                                struct device_node *child)
-{
-       return 0;
-}
-#endif
-
 #if IS_ENABLED(CONFIG_MTD_ONENAND)
 static int gpmc_probe_onenand_child(struct platform_device *pdev,
                                 struct device_node *child)
@@ -1950,6 +1979,8 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
        const char *name;
        int ret, cs;
        u32 val;
+       struct gpio_desc *waitpin_desc = NULL;
+       struct gpmc_device *gpmc = platform_get_drvdata(pdev);
 
        if (of_property_read_u32(child, "reg", &cs) < 0) {
                dev_err(&pdev->dev, "%s has no 'reg' property\n",
@@ -2010,23 +2041,80 @@ static int gpmc_probe_generic_child(struct platform_device *pdev,
        if (ret < 0) {
                dev_err(&pdev->dev, "cannot remap GPMC CS %d to %pa\n",
                        cs, &res.start);
+               if (res.start < GPMC_MEM_START) {
+                       dev_info(&pdev->dev,
+                                "GPMC CS %d start cannot be lesser than 0x%x\n",
+                                cs, GPMC_MEM_START);
+               } else if (res.end > GPMC_MEM_END) {
+                       dev_info(&pdev->dev,
+                                "GPMC CS %d end cannot be greater than 0x%x\n",
+                                cs, GPMC_MEM_END);
+               }
                goto err;
        }
 
-       ret = of_property_read_u32(child, "bank-width", &gpmc_s.device_width);
-       if (ret < 0)
-               goto err;
+       if (of_node_cmp(child->name, "nand") == 0) {
+               /* Warn about older DT blobs with no compatible property */
+               if (!of_property_read_bool(child, "compatible")) {
+                       dev_warn(&pdev->dev,
+                                "Incompatible NAND node: missing compatible");
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       if (of_device_is_compatible(child, "ti,omap2-nand")) {
+               /* NAND specific setup */
+               val = 8;
+               of_property_read_u32(child, "nand-bus-width", &val);
+               switch (val) {
+               case 8:
+                       gpmc_s.device_width = GPMC_DEVWIDTH_8BIT;
+                       break;
+               case 16:
+                       gpmc_s.device_width = GPMC_DEVWIDTH_16BIT;
+                       break;
+               default:
+                       dev_err(&pdev->dev, "%s: invalid 'nand-bus-width'\n",
+                               child->name);
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               /* disable write protect */
+               gpmc_configure(GPMC_CONFIG_WP, 0);
+               gpmc_s.device_nand = true;
+       } else {
+               ret = of_property_read_u32(child, "bank-width",
+                                          &gpmc_s.device_width);
+               if (ret < 0)
+                       goto err;
+       }
+
+       /* Reserve wait pin if it is required and valid */
+       if (gpmc_s.wait_on_read || gpmc_s.wait_on_write) {
+               unsigned int wait_pin = gpmc_s.wait_pin;
+
+               waitpin_desc = gpiochip_request_own_desc(&gpmc->gpio_chip,
+                                                        wait_pin, "WAITPIN");
+               if (IS_ERR(waitpin_desc)) {
+                       dev_err(&pdev->dev, "invalid wait-pin: %d\n", wait_pin);
+                       ret = PTR_ERR(waitpin_desc);
+                       goto err;
+               }
+       }
 
        gpmc_cs_show_timings(cs, "before gpmc_cs_program_settings");
+
        ret = gpmc_cs_program_settings(cs, &gpmc_s);
        if (ret < 0)
-               goto err;
+               goto err_cs;
 
        ret = gpmc_cs_set_timings(cs, &gpmc_t, &gpmc_s);
        if (ret) {
                dev_err(&pdev->dev, "failed to set gpmc timings for: %s\n",
                        child->name);
-               goto err;
+               goto err_cs;
        }
 
        /* Clear limited address i.e. enable A26-A11 */
@@ -2057,16 +2145,81 @@ err_child_fail:
        dev_err(&pdev->dev, "failed to create gpmc child %s\n", child->name);
        ret = -ENODEV;
 
+err_cs:
+       if (waitpin_desc)
+               gpiochip_free_own_desc(waitpin_desc);
+
 err:
        gpmc_cs_free(cs);
 
        return ret;
 }
 
+/*
+ * gpio_chip get_direction callback (installed by gpmc_gpio_init()).
+ * Returns 1 for every offset; @chip and @offset are not consulted.
+ */
+static int gpmc_gpio_get_direction(struct gpio_chip *chip, unsigned int offset)
+{
+       return 1;       /* we're input only */
+}
+
+/*
+ * gpio_chip direction_input callback (installed by gpmc_gpio_init()).
+ * Touches no hardware and returns 0 for every offset.
+ */
+static int gpmc_gpio_direction_input(struct gpio_chip *chip,
+                                    unsigned int offset)
+{
+       return 0;       /* we're input only */
+}
+
+/*
+ * gpio_chip direction_output callback (installed by gpmc_gpio_init()).
+ * Rejects every request with -EINVAL; @value is ignored.
+ */
+static int gpmc_gpio_direction_output(struct gpio_chip *chip,
+                                     unsigned int offset, int value)
+{
+       return -EINVAL; /* we're input only */
+}
+
+/*
+ * gpio_chip set callback (installed by gpmc_gpio_init()).
+ * Intentionally empty: the requested @value is discarded.
+ */
+static void gpmc_gpio_set(struct gpio_chip *chip, unsigned int offset,
+                         int value)
+{
+}
+
+/*
+ * gpio_chip get callback (installed by gpmc_gpio_init()).
+ * Reads the GPMC_STATUS register and returns bit (@offset + 8) as 0 or 1.
+ */
+static int gpmc_gpio_get(struct gpio_chip *chip, unsigned int offset)
+{
+       u32 reg;
+
+       /* The bit for this line sits 8 positions above its GPIO offset. */
+       offset += 8;
+
+       reg = gpmc_read_reg(GPMC_STATUS) & BIT(offset);
+
+       /* Collapse the masked register value to 0 or 1. */
+       return !!reg;
+}
+
+/*
+ * Fill in gpmc->gpio_chip and register it with gpiochip_add().
+ * The chip has gpmc_nr_waitpins lines and uses the gpmc_gpio_* callbacks.
+ *
+ * Returns 0 on success; on failure logs and returns the negative value
+ * from gpiochip_add().
+ */
+static int gpmc_gpio_init(struct gpmc_device *gpmc)
+{
+       int ret;
+
+       gpmc->gpio_chip.parent = gpmc->dev;
+       gpmc->gpio_chip.owner = THIS_MODULE;
+       gpmc->gpio_chip.label = DEVICE_NAME;
+       gpmc->gpio_chip.ngpio = gpmc_nr_waitpins;
+       gpmc->gpio_chip.get_direction = gpmc_gpio_get_direction;
+       gpmc->gpio_chip.direction_input = gpmc_gpio_direction_input;
+       gpmc->gpio_chip.direction_output = gpmc_gpio_direction_output;
+       gpmc->gpio_chip.set = gpmc_gpio_set;
+       gpmc->gpio_chip.get = gpmc_gpio_get;
+       /* NOTE(review): -1 presumably lets gpiolib choose the base - confirm */
+       gpmc->gpio_chip.base = -1;
+
+       ret = gpiochip_add(&gpmc->gpio_chip);
+       if (ret < 0) {
+               dev_err(gpmc->dev, "could not register gpio chip: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+/* Unregister the gpio_chip that gpmc_gpio_init() added for this device. */
+static void gpmc_gpio_exit(struct gpmc_device *gpmc)
+{
+       gpiochip_remove(&gpmc->gpio_chip);
+}
+
 static int gpmc_probe_dt(struct platform_device *pdev)
 {
        int ret;
-       struct device_node *child;
        const struct of_device_id *of_id =
                of_match_device(gpmc_dt_ids, &pdev->dev);
 
@@ -2094,17 +2247,26 @@ static int gpmc_probe_dt(struct platform_device *pdev)
                return ret;
        }
 
+       return 0;
+}
+
+static int gpmc_probe_dt_children(struct platform_device *pdev)
+{
+       int ret;
+       struct device_node *child;
+
        for_each_available_child_of_node(pdev->dev.of_node, child) {
 
                if (!child->name)
                        continue;
 
-               if (of_node_cmp(child->name, "nand") == 0)
-                       ret = gpmc_probe_nand_child(pdev, child);
-               else if (of_node_cmp(child->name, "onenand") == 0)
+               if (of_node_cmp(child->name, "onenand") == 0)
                        ret = gpmc_probe_onenand_child(pdev, child);
                else
                        ret = gpmc_probe_generic_child(pdev, child);
+
+               if (ret)
+                       return ret;
        }
 
        return 0;
@@ -2114,6 +2276,11 @@ static int gpmc_probe_dt(struct platform_device *pdev)
 {
        return 0;
 }
+
+/*
+ * Stub variant: probes nothing and returns 0.
+ * NOTE(review): appears to be the fallback for builds where the conditional
+ * closed by the #endif below is false - confirm against the full file.
+ */
+static int gpmc_probe_dt_children(struct platform_device *pdev)
+{
+       return 0;
+}
 #endif
 
 static int gpmc_probe(struct platform_device *pdev)
@@ -2121,6 +2288,14 @@ static int gpmc_probe(struct platform_device *pdev)
        int rc;
        u32 l;
        struct resource *res;
+       struct gpmc_device *gpmc;
+
+       gpmc = devm_kzalloc(&pdev->dev, sizeof(*gpmc), GFP_KERNEL);
+       if (!gpmc)
+               return -ENOMEM;
+
+       gpmc->dev = &pdev->dev;
+       platform_set_drvdata(pdev, gpmc);
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (res == NULL)
@@ -2134,15 +2309,16 @@ static int gpmc_probe(struct platform_device *pdev)
                return PTR_ERR(gpmc_base);
 
        res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-       if (res == NULL)
-               dev_warn(&pdev->dev, "Failed to get resource: irq\n");
-       else
-               gpmc_irq = res->start;
+       if (!res) {
+               dev_err(&pdev->dev, "Failed to get resource: irq\n");
+               return -ENOENT;
+       }
+
+       gpmc->irq = res->start;
 
        gpmc_l3_clk = devm_clk_get(&pdev->dev, "fck");
        if (IS_ERR(gpmc_l3_clk)) {
                dev_err(&pdev->dev, "Failed to get GPMC fck\n");
-               gpmc_irq = 0;
                return PTR_ERR(gpmc_l3_clk);
        }
 
@@ -2151,11 +2327,18 @@ static int gpmc_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
+       if (pdev->dev.of_node) {
+               rc = gpmc_probe_dt(pdev);
+               if (rc)
+                       return rc;
+       } else {
+               gpmc_cs_num = GPMC_CS_NUM;
+               gpmc_nr_waitpins = GPMC_NR_WAITPINS;
+       }
+
        pm_runtime_enable(&pdev->dev);
        pm_runtime_get_sync(&pdev->dev);
 
-       gpmc_dev = &pdev->dev;
-
        l = gpmc_read_reg(GPMC_REVISION);
 
        /*
@@ -2174,36 +2357,51 @@ static int gpmc_probe(struct platform_device *pdev)
                gpmc_capability = GPMC_HAS_WR_ACCESS | GPMC_HAS_WR_DATA_MUX_BUS;
        if (GPMC_REVISION_MAJOR(l) > 0x5)
                gpmc_capability |= GPMC_HAS_MUX_AAD;
-       dev_info(gpmc_dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
+       dev_info(gpmc->dev, "GPMC revision %d.%d\n", GPMC_REVISION_MAJOR(l),
                 GPMC_REVISION_MINOR(l));
 
        gpmc_mem_init();
-
-       if (gpmc_setup_irq() < 0)
-               dev_warn(gpmc_dev, "gpmc_setup_irq failed\n");
-
-       if (!pdev->dev.of_node) {
-               gpmc_cs_num      = GPMC_CS_NUM;
-               gpmc_nr_waitpins = GPMC_NR_WAITPINS;
+       rc = gpmc_gpio_init(gpmc);
+       if (rc)
+               goto gpio_init_failed;
+
+       gpmc->nirqs = GPMC_NR_NAND_IRQS + gpmc_nr_waitpins;
+       rc = gpmc_setup_irq(gpmc);
+       if (rc) {
+               dev_err(gpmc->dev, "gpmc_setup_irq failed\n");
+               goto setup_irq_failed;
        }
 
-       rc = gpmc_probe_dt(pdev);
+       rc = gpmc_probe_dt_children(pdev);
        if (rc < 0) {
-               pm_runtime_put_sync(&pdev->dev);
-               dev_err(gpmc_dev, "failed to probe DT parameters\n");
-               return rc;
+               dev_err(gpmc->dev, "failed to probe DT children\n");
+               goto dt_children_failed;
        }
 
        return 0;
+
+dt_children_failed:
+       gpmc_free_irq(gpmc);
+setup_irq_failed:
+       gpmc_gpio_exit(gpmc);
+gpio_init_failed:
+       gpmc_mem_exit();
+       pm_runtime_put_sync(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
+
+       return rc;
 }
 
+/*
+ * Tear down a GPMC device: free the IRQ resources, unregister the GPIO chip,
+ * call gpmc_mem_exit(), then drop and disable runtime PM.  The call order
+ * matches the error-unwind labels at the end of gpmc_probe().
+ *
+ * Always returns 0.
+ */
 static int gpmc_remove(struct platform_device *pdev)
 {
-       gpmc_free_irq();
+       /* Stored by platform_set_drvdata() in gpmc_probe(). */
+       struct gpmc_device *gpmc = platform_get_drvdata(pdev);
+
+       gpmc_free_irq(gpmc);
+       gpmc_gpio_exit(gpmc);
        gpmc_mem_exit();
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
-       gpmc_dev = NULL;
+
        return 0;
 }
@@ -2249,25 +2447,6 @@ static __exit void gpmc_exit(void)
 postcore_initcall(gpmc_init);
 module_exit(gpmc_exit);
 
-static irqreturn_t gpmc_handle_irq(int irq, void *dev)
-{
-       int i;
-       u32 regval;
-
-       regval = gpmc_read_reg(GPMC_IRQSTATUS);
-
-       if (!regval)
-               return IRQ_NONE;
-
-       for (i = 0; i < GPMC_NR_IRQ; i++)
-               if (regval & gpmc_client_irq[i].bitmask)
-                       generic_handle_irq(gpmc_client_irq[i].irq);
-
-       gpmc_write_reg(GPMC_IRQSTATUS, regval);
-
-       return IRQ_HANDLED;
-}
-
 static struct omap3_gpmc_regs gpmc_context;
 
 void omap3_gpmc_save_context(void)
index ddc96206288a1f281fa8b0663ef033b3fed774bb..e62fde3ac431c111ec945d4aba9c877f6308d81d 100644 (file)
@@ -618,6 +618,10 @@ static int mmc_blk_ioctl_cmd(struct block_device *bdev,
 
        ioc_err = __mmc_blk_ioctl_cmd(card, md, idata);
 
+       /* Always switch back to main area after RPMB access */
+       if (md->area_type & MMC_BLK_DATA_AREA_RPMB)
+               mmc_blk_part_switch(card, dev_get_drvdata(&card->dev));
+
        mmc_put_card(card);
 
        err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
@@ -685,6 +689,10 @@ static int mmc_blk_ioctl_multi_cmd(struct block_device *bdev,
        for (i = 0; i < num_of_cmds && !ioc_err; i++)
                ioc_err = __mmc_blk_ioctl_cmd(card, md, idata[i]);
 
+       /* Always switch back to main area after RPMB access */
+       if (md->area_type & MMC_BLK_DATA_AREA_RPMB)
+               mmc_blk_part_switch(card, dev_get_drvdata(&card->dev));
+
        mmc_put_card(card);
 
        /* copy to user if data and response */
@@ -748,16 +756,25 @@ static inline int mmc_blk_part_switch(struct mmc_card *card,
        if (mmc_card_mmc(card)) {
                u8 part_config = card->ext_csd.part_config;
 
+               if (md->part_type == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                       mmc_retune_pause(card->host);
+
                part_config &= ~EXT_CSD_PART_CONFIG_ACC_MASK;
                part_config |= md->part_type;
 
                ret = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
                                 EXT_CSD_PART_CONFIG, part_config,
                                 card->ext_csd.part_time);
-               if (ret)
+               if (ret) {
+                       if (md->part_type == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                               mmc_retune_unpause(card->host);
                        return ret;
+               }
 
                card->ext_csd.part_config = part_config;
+
+               if (main_md->part_curr == EXT_CSD_PART_CONFIG_ACC_RPMB)
+                       mmc_retune_unpause(card->host);
        }
 
        main_md->part_curr = md->part_type;
@@ -2519,11 +2536,12 @@ static const struct mmc_fixup blk_fixups[] =
                  MMC_QUIRK_BLK_NO_CMD23),
 
        /*
-        * Some Micron MMC cards needs longer data read timeout than
-        * indicated in CSD.
+        * Some MMC cards need longer data read timeout than indicated in CSD.
         */
        MMC_FIXUP(CID_NAME_ANY, CID_MANFID_MICRON, 0x200, add_quirk_mmc,
                  MMC_QUIRK_LONG_READ_TIME),
+       MMC_FIXUP("008GE0", CID_MANFID_TOSHIBA, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_LONG_READ_TIME),
 
        /*
         * On these Samsung MoviNAND parts, performing secure erase or
index 99275e40bf2fb7f61083d6bcf2b867d76d6b25ca..8b4dfd45433b73c1ccf6fe4cd4f9ca3338546cff 100644 (file)
@@ -875,11 +875,11 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card)
        /*
         * Some cards require longer data read timeout than indicated in CSD.
         * Address this by setting the read timeout to a "reasonably high"
-        * value. For the cards tested, 300ms has proven enough. If necessary,
+        * value. For the cards tested, 600ms has proven enough. If necessary,
         * this value can be increased if other problematic cards require this.
         */
        if (mmc_card_long_read_time(card) && data->flags & MMC_DATA_READ) {
-               data->timeout_ns = 300000000;
+               data->timeout_ns = 600000000;
                data->timeout_clks = 0;
        }
 
index e0a3ee16c0d3f5ada74a53079f9089e630552ac6..1be42fab1a3051a77523abf093bb3cbf057bc32c 100644 (file)
@@ -68,8 +68,32 @@ void mmc_retune_enable(struct mmc_host *host)
                          jiffies + host->retune_period * HZ);
 }
 
+/*
+ * Pause re-tuning for a small set of operations.  The pause begins after the
+ * next command and after first doing re-tuning.
+ *
+ * Does nothing when host->retune_paused is already set, so repeated calls do
+ * not stack.  Undone by mmc_retune_unpause().
+ */
+void mmc_retune_pause(struct mmc_host *host)
+{
+       if (!host->retune_paused) {
+               host->retune_paused = 1;
+               /* Ask for a re-tune before the pause takes effect. */
+               mmc_retune_needed(host);
+               /* Balanced by mmc_retune_release() in mmc_retune_unpause(). */
+               mmc_retune_hold(host);
+       }
+}
+EXPORT_SYMBOL(mmc_retune_pause);
+
+/*
+ * End a pause started by mmc_retune_pause(): clear host->retune_paused and
+ * call mmc_retune_release().  Does nothing when the host is not paused.
+ */
+void mmc_retune_unpause(struct mmc_host *host)
+{
+       if (host->retune_paused) {
+               host->retune_paused = 0;
+               mmc_retune_release(host);
+       }
+}
+EXPORT_SYMBOL(mmc_retune_unpause);
+
 void mmc_retune_disable(struct mmc_host *host)
 {
+       mmc_retune_unpause(host);
        host->can_retune = 0;
        del_timer_sync(&host->retune_timer);
        host->retune_now = 0;
index 8c20b81cafd87ff81e9d3166b144fcc26cc8df28..358b0dc853b064c1f640201c1503faee97bc4e6b 100644 (file)
@@ -66,6 +66,70 @@ static void dw_mci_rk3288_set_ios(struct dw_mci *host, struct mmc_ios *ios)
        /* Make sure we use phases which we can enumerate with */
        if (!IS_ERR(priv->sample_clk))
                clk_set_phase(priv->sample_clk, priv->default_sample_phase);
+
+       /*
+        * Set the drive phase offset based on speed mode to achieve hold times.
+        *
+        * NOTE: this is _not_ a value that is dynamically tuned and is also
+        * _not_ a value that will vary from board to board.  It is a value
+        * that could vary between different SoC models if they had massively
+        * different output clock delays inside their dw_mmc IP block (delay_o),
+        * but since it's OK to overshoot a little we don't need to do complex
+        * calculations and can pick values that will just work for everyone.
+        *
+        * When picking values we'll stick with picking 0/90/180/270 since
+        * those can be made very accurately on all known Rockchip SoCs.
+        *
+        * Note that these values match values from the DesignWare Databook
+        * tables for the most part except for SDR12 and "ID mode".  For those
+        * two modes the databook calculations assume a clock in of 50MHz.  As
+        * seen above, we always use a clock in rate that is exactly the
+        * card's input clock (times RK3288_CLKGEN_DIV, but that gets divided
+        * back out before the controller sees it).
+        *
+        * From measurement of a single device, it appears that delay_o is
+        * about .5 ns.  Since we try to leave a bit of margin, it's expected
+        * that numbers here will be fine even with much larger delay_o
+        * (the 1.4 ns assumed by the DesignWare Databook would result in the
+        * same results, for instance).
+        */
+       if (!IS_ERR(priv->drv_clk)) {
+               int phase;
+
+               /*
+                * In almost all cases a 90 degree phase offset will provide
+                * sufficient hold times across all valid input clock rates
+                * assuming delay_o is not absurd for a given SoC.  We'll use
+                * that as a default.
+                */
+               phase = 90;
+
+               switch (ios->timing) {
+               case MMC_TIMING_MMC_DDR52:
+                       /*
+                        * Since clock in rate with MMC_DDR52 is doubled when
+                        * bus width is 8 we need to double the phase offset
+                        * to get the same timings.
+                        */
+                       if (ios->bus_width == MMC_BUS_WIDTH_8)
+                               phase = 180;
+                       break;
+               case MMC_TIMING_UHS_SDR104:
+               case MMC_TIMING_MMC_HS200:
+                       /*
+                        * In the case of 150 MHz clock (typical max for
+                        * Rockchip SoCs), 90 degree offset will add a delay
+                        * of 1.67 ns.  That will meet min hold time of .8 ns
+                        * as long as clock output delay is < .87 ns.  On
+                        * SoCs measured this seems to be OK, but it doesn't
+                        * hurt to give margin here, so we use 180.
+                        */
+                       phase = 180;
+                       break;
+               }
+
+               clk_set_phase(priv->drv_clk, phase);
+       }
 }
 
 #define NUM_PHASES                     360
@@ -233,10 +297,10 @@ static int dw_mci_rockchip_init(struct dw_mci *host)
 
 /* Common capabilities of RK3288 SoC */
 static unsigned long dw_mci_rk3288_dwmmc_caps[4] = {
+       /* All four entries carry the same capability flags. */
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
-       MMC_CAP_ERASE,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
+       MMC_CAP_ERASE | MMC_CAP_CMD23,
 };
 
 static const struct dw_mci_drv_data rk2928_drv_data = {
index 9dd1bd3584343f026646da7ba48d8612f664e24d..829a6eebcdcee551ee23f9679f2fb4c8bac46b13 100644 (file)
@@ -2595,13 +2595,13 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
        /* Useful defaults if platform data is unset. */
        if (host->use_dma == TRANS_MODE_IDMAC) {
                mmc->max_segs = host->ring_size;
-               mmc->max_blk_size = 65536;
+               mmc->max_blk_size = 65535;
                mmc->max_seg_size = 0x1000;
                mmc->max_req_size = mmc->max_seg_size * host->ring_size;
                mmc->max_blk_count = mmc->max_req_size / 512;
        } else if (host->use_dma == TRANS_MODE_EDMAC) {
                mmc->max_segs = 64;
-               mmc->max_blk_size = 65536;
+               mmc->max_blk_size = 65535;
                mmc->max_blk_count = 65535;
                mmc->max_req_size =
                                mmc->max_blk_size * mmc->max_blk_count;
@@ -2609,7 +2609,7 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
        } else {
                /* TRANS_MODE_PIO */
                mmc->max_segs = 64;
-               mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
+               mmc->max_blk_size = 65535; /* BLKSIZ is 16 bits */
                mmc->max_blk_count = 512;
                mmc->max_req_size = mmc->max_blk_size *
                                    mmc->max_blk_count;
index b2d70ba6caa74d1a24de0f980bcd85e4d84ac242..458ffb7637e5f902984d4727a3ada9a1251b5d56 100644 (file)
@@ -274,7 +274,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = {
        .chip    = &sdhci_acpi_chip_int,
        .caps    = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
                   MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR |
-                  MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+                  MMC_CAP_WAIT_WHILE_BUSY,
        .caps2   = MMC_CAP2_HC_ERASE_SZ,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
        .quirks  = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
@@ -289,7 +289,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = {
                   SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
        .quirks2 = SDHCI_QUIRK2_HOST_OFF_CARD_ON,
        .caps    = MMC_CAP_NONREMOVABLE | MMC_CAP_POWER_OFF_CARD |
-                  MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+                  MMC_CAP_WAIT_WHILE_BUSY,
        .flags   = SDHCI_ACPI_RUNTIME_PM,
        .pm_caps = MMC_PM_KEEP_POWER,
        .probe_slot     = sdhci_acpi_sdio_probe_slot,
@@ -301,7 +301,7 @@ static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sd = {
        .quirks  = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
        .quirks2 = SDHCI_QUIRK2_CARD_ON_NEEDS_BUS_ON |
                   SDHCI_QUIRK2_STOP_WITH_TC,
-       .caps    = MMC_CAP_BUS_WIDTH_TEST | MMC_CAP_WAIT_WHILE_BUSY,
+       .caps    = MMC_CAP_WAIT_WHILE_BUSY,
        .probe_slot     = sdhci_acpi_sd_probe_slot,
 };
 
@@ -378,7 +378,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        acpi_handle handle = ACPI_HANDLE(dev);
-       struct acpi_device *device;
+       struct acpi_device *device, *child;
        struct sdhci_acpi_host *c;
        struct sdhci_host *host;
        struct resource *iomem;
@@ -390,6 +390,11 @@ static int sdhci_acpi_probe(struct platform_device *pdev)
        if (acpi_bus_get_device(handle, &device))
                return -ENODEV;
 
+       /* Power on the SDHCI controller and its children */
+       acpi_device_fix_up_power(device);
+       list_for_each_entry(child, &device->children, node)
+               acpi_device_fix_up_power(child);
+
        if (acpi_bus_get_status(device) || !device->status.present)
                return -ENODEV;
 
index 97d4eebd6bf59edbeb88f7efa5e6fec8bbab6aeb..a4dbf7421edc7f9342a18fa7f19cb033d05a0fa8 100644 (file)
@@ -356,7 +356,6 @@ static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
                                 MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR |
-                                MMC_CAP_BUS_WIDTH_TEST |
                                 MMC_CAP_WAIT_WHILE_BUSY;
        slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ;
        slot->hw_reset = sdhci_pci_int_hw_reset;
@@ -372,15 +371,13 @@ static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
+/*
+ * ORs the power-off-card, non-removable and wait-while-busy capability
+ * flags into slot->host->mmc->caps.  Always returns 0.
+ */
 static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot)
 {
        slot->host->mmc->caps |= MMC_CAP_POWER_OFF_CARD | MMC_CAP_NONREMOVABLE |
-                                MMC_CAP_BUS_WIDTH_TEST |
                                 MMC_CAP_WAIT_WHILE_BUSY;
        return 0;
 }
 
 static int byt_sd_probe_slot(struct sdhci_pci_slot *slot)
 {
-       slot->host->mmc->caps |= MMC_CAP_BUS_WIDTH_TEST |
-                                MMC_CAP_WAIT_WHILE_BUSY;
+       slot->host->mmc->caps |= MMC_CAP_WAIT_WHILE_BUSY;
        slot->cd_con_id = NULL;
        slot->cd_idx = 0;
        slot->cd_override_level = true;
index 3b3dabce58de9d9b2d9ad1838800c06bf04aa6e5..bbfa1f1292668a84f7e0a6e708dfd9a6b01d1d20 100644 (file)
@@ -115,6 +115,7 @@ config MTD_MAP_BANK_WIDTH_16
 
 config MTD_MAP_BANK_WIDTH_32
        bool "Support 256-bit buswidth" if MTD_CFI_GEOMETRY
+       select MTD_COMPLEX_MAPPINGS if HAS_IOMEM
        default n
        help
          If you wish to support CFI devices on a physical bus which is
index 347bb83db8642a15320ed65771ca4e58d89da2b1..1c65c15b31a1aaed713363d5430c917754b13f13 100644 (file)
@@ -2,6 +2,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/ioport.h>
 #include <linux/mtd/mtd.h>
 #include <linux/platform_device.h>
 #include <linux/bcma/bcma.h>
@@ -109,8 +110,7 @@ static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len,
        if ((from + len) > mtd->size)
                return -EINVAL;
 
-       memcpy_fromio(buf, (void __iomem *)KSEG0ADDR(b47s->window + from),
-                     len);
+       memcpy_fromio(buf, b47s->window + from, len);
        *retlen = len;
 
        return len;
@@ -275,15 +275,33 @@ static void bcm47xxsflash_bcma_cc_write(struct bcm47xxsflash *b47s, u16 offset,
 
 static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
 {
-       struct bcma_sflash *sflash = dev_get_platdata(&pdev->dev);
+       struct device *dev = &pdev->dev;
+       struct bcma_sflash *sflash = dev_get_platdata(dev);
        struct bcm47xxsflash *b47s;
+       struct resource *res;
        int err;
 
-       b47s = devm_kzalloc(&pdev->dev, sizeof(*b47s), GFP_KERNEL);
+       b47s = devm_kzalloc(dev, sizeof(*b47s), GFP_KERNEL);
        if (!b47s)
                return -ENOMEM;
        sflash->priv = b47s;
 
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res) {
+               dev_err(dev, "invalid resource\n");
+               return -EINVAL;
+       }
+       if (!devm_request_mem_region(dev, res->start, resource_size(res),
+                                    res->name)) {
+               dev_err(dev, "can't request region for resource %pR\n", res);
+               return -EBUSY;
+       }
+       b47s->window = ioremap_cache(res->start, resource_size(res));
+       if (!b47s->window) {
+               dev_err(dev, "ioremap failed for resource %pR\n", res);
+               return -ENOMEM;
+       }
+
        b47s->bcma_cc = container_of(sflash, struct bcma_drv_cc, sflash);
        b47s->cc_read = bcm47xxsflash_bcma_cc_read;
        b47s->cc_write = bcm47xxsflash_bcma_cc_write;
@@ -297,7 +315,6 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
                break;
        }
 
-       b47s->window = sflash->window;
        b47s->blocksize = sflash->blocksize;
        b47s->numblocks = sflash->numblocks;
        b47s->size = sflash->size;
@@ -306,6 +323,7 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
        err = mtd_device_parse_register(&b47s->mtd, probes, NULL, NULL, 0);
        if (err) {
                pr_err("Failed to register MTD device: %d\n", err);
+               iounmap(b47s->window);
                return err;
        }
 
@@ -321,6 +339,7 @@ static int bcm47xxsflash_bcma_remove(struct platform_device *pdev)
        struct bcm47xxsflash *b47s = sflash->priv;
 
        mtd_device_unregister(&b47s->mtd);
+       iounmap(b47s->window);
 
        return 0;
 }
index fe93daf4f4894a35f2d7b8a65363d1d163d09ffc..1564b62b412e3c071f5f3062e2e992296d531f91 100644 (file)
@@ -65,7 +65,8 @@ struct bcm47xxsflash {
 
        enum bcm47xxsflash_type type;
 
-       u32 window;
+       void __iomem *window;
+
        u32 blocksize;
        u16 numblocks;
        u32 size;
index e7b2e439696c851298c21775a3117fd59811fbf1..b833e6cc684c389985bbe91aec15c2612daf6b5e 100644 (file)
@@ -67,16 +67,40 @@ module_param(reliable_mode, uint, 0);
 MODULE_PARM_DESC(reliable_mode, "Set the docg3 mode (0=normal MLC, 1=fast, "
                 "2=reliable) : MLC normal operations are in normal mode");
 
-/**
- * struct docg3_oobinfo - DiskOnChip G3 OOB layout
- * @eccbytes: 8 bytes are used (1 for Hamming ECC, 7 for BCH ECC)
- * @eccpos: ecc positions (byte 7 is Hamming ECC, byte 8-14 are BCH ECC)
- * @oobfree: free pageinfo bytes (byte 0 until byte 6, byte 15
- */
-static struct nand_ecclayout docg3_oobinfo = {
-       .eccbytes = 8,
-       .eccpos = {7, 8, 9, 10, 11, 12, 13, 14},
-       .oobfree = {{0, 7}, {15, 1} },
+static int docg3_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       /* byte 7 is Hamming ECC, byte 8-14 are BCH ECC */
+       oobregion->offset = 7;
+       oobregion->length = 8;
+
+       return 0;
+}
+
+static int docg3_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       /* free bytes: byte 0 until byte 6, byte 15 */
+       if (!section) {
+               oobregion->offset = 0;
+               oobregion->length = 7;
+       } else {
+               oobregion->offset = 15;
+               oobregion->length = 1;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops nand_ooblayout_docg3_ops = {
+       .ecc = docg3_ooblayout_ecc,
+       .free = docg3_ooblayout_free,
 };
 
 static inline u8 doc_readb(struct docg3 *docg3, u16 reg)
@@ -1857,7 +1881,7 @@ static int __init doc_set_driver_info(int chip_id, struct mtd_info *mtd)
        mtd->_read_oob = doc_read_oob;
        mtd->_write_oob = doc_write_oob;
        mtd->_block_isbad = doc_block_isbad;
-       mtd->ecclayout = &docg3_oobinfo;
+       mtd_set_ooblayout(mtd, &nand_ooblayout_docg3_ops);
        mtd->oobavail = 8;
        mtd->ecc_strength = DOC_ECC_BCH_T;
 
index c9c3b7fa30511c2b4a8f1b9c97bdd1cd89992758..9d6854467651774182ab8895198d41ed7caaf945 100644 (file)
@@ -131,6 +131,28 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
        /* convert the dummy cycles to the number of bytes */
        dummy /= 8;
 
+       if (spi_flash_read_supported(spi)) {
+               struct spi_flash_read_message msg;
+               int ret;
+
+               memset(&msg, 0, sizeof(msg));
+
+               msg.buf = buf;
+               msg.from = from;
+               msg.len = len;
+               msg.read_opcode = nor->read_opcode;
+               msg.addr_width = nor->addr_width;
+               msg.dummy_bytes = dummy;
+               /* TODO: Support other combinations */
+               msg.opcode_nbits = SPI_NBITS_SINGLE;
+               msg.addr_nbits = SPI_NBITS_SINGLE;
+               msg.data_nbits = m25p80_rx_nbits(nor);
+
+               ret = spi_flash_read(spi, &msg);
+               *retlen = msg.retlen;
+               return ret;
+       }
+
        spi_message_init(&m);
        memset(t, 0, (sizeof t));
 
index 708b7e8c8b18ca27e00a6b9c59b11125808dfbdb..220f9200fa52f4bf01178d3b61dbc1926b3bca80 100644 (file)
@@ -353,7 +353,7 @@ static int pmc551_write(struct mtd_info *mtd, loff_t to, size_t len,
  * mechanism
  * returns the size of the memory region found.
  */
-static int fixup_pmc551(struct pci_dev *dev)
+static int __init fixup_pmc551(struct pci_dev *dev)
 {
 #ifdef CONFIG_MTD_PMC551_BUGFIX
        u32 dram_data;
index 0455166f05faeaf364252bf0ddd39ba1fa24931b..4f206a99164c1a2b78d991cb87924323128f5d87 100644 (file)
@@ -112,8 +112,8 @@ static void ck804xrom_cleanup(struct ck804xrom_window *window)
 }
 
 
-static int ck804xrom_init_one(struct pci_dev *pdev,
-                             const struct pci_device_id *ent)
+static int __init ck804xrom_init_one(struct pci_dev *pdev,
+                                    const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        u8 byte;
index 76ed651b515beef6275b7d67820eb1c4fee8e8dc..9646b0766ce02ba9aa6f3f12f1719107cb309e5e 100644 (file)
@@ -144,8 +144,8 @@ static void esb2rom_cleanup(struct esb2rom_window *window)
        pci_dev_put(window->pdev);
 }
 
-static int esb2rom_init_one(struct pci_dev *pdev,
-                           const struct pci_device_id *ent)
+static int __init esb2rom_init_one(struct pci_dev *pdev,
+                                  const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        struct esb2rom_window *window = &esb2rom_window;
index 8636bba422009a1f54ff58b1611acac54cbf9b45..e17d02ae03f0966419a8e42b5192967a98883b39 100644 (file)
@@ -84,8 +84,8 @@ static void ichxrom_cleanup(struct ichxrom_window *window)
 }
 
 
-static int ichxrom_init_one(struct pci_dev *pdev,
-                           const struct pci_device_id *ent)
+static int __init ichxrom_init_one(struct pci_dev *pdev,
+                                  const struct pci_device_id *ent)
 {
        static char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
        struct ichxrom_window *window = &ichxrom_window;
index c1af83db5202fe46f503b68289fbc06d0ee59828..00a8190797ec1d4a445fd4bd46bfe03903699a15 100644 (file)
@@ -4,11 +4,13 @@
  *     uclinux.c -- generic memory mapped MTD driver for uclinux
  *
  *     (C) Copyright 2002, Greg Ungerer (gerg@snapgear.com)
+ *
+ *      License: GPL
  */
 
 /****************************************************************************/
 
-#include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -117,27 +119,6 @@ static int __init uclinux_mtd_init(void)
 
        return(0);
 }
-
-/****************************************************************************/
-
-static void __exit uclinux_mtd_cleanup(void)
-{
-       if (uclinux_ram_mtdinfo) {
-               mtd_device_unregister(uclinux_ram_mtdinfo);
-               map_destroy(uclinux_ram_mtdinfo);
-               uclinux_ram_mtdinfo = NULL;
-       }
-       if (uclinux_ram_map.virt)
-               uclinux_ram_map.virt = 0;
-}
-
-/****************************************************************************/
-
-module_init(uclinux_mtd_init);
-module_exit(uclinux_mtd_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com>");
-MODULE_DESCRIPTION("Generic MTD for uClinux");
+device_initcall(uclinux_mtd_init);
 
 /****************************************************************************/
index 6d19835b80a952e098cfdfa8436971d5c68a593e..2a47a3f0e7308ea13b31391d77e8dab83f536dfe 100644 (file)
@@ -465,35 +465,108 @@ static int mtdchar_readoob(struct file *file, struct mtd_info *mtd,
 }
 
 /*
- * Copies (and truncates, if necessary) data from the larger struct,
- * nand_ecclayout, to the smaller, deprecated layout struct,
- * nand_ecclayout_user. This is necessary only to support the deprecated
- * API ioctl ECCGETLAYOUT while allowing all new functionality to use
- * nand_ecclayout flexibly (i.e. the struct may change size in new
- * releases without requiring major rewrites).
+ * Copies (and truncates, if necessary) OOB layout information to the
+ * deprecated layout struct, nand_ecclayout_user. This is necessary only to
+ * support the deprecated API ioctl ECCGETLAYOUT while allowing all new
+ * functionality to use mtd_ooblayout_ops flexibly (i.e. mtd_ooblayout_ops
+ * can describe any kind of OOB layout with almost zero overhead from a
+ * memory usage point of view).
  */
-static int shrink_ecclayout(const struct nand_ecclayout *from,
-               struct nand_ecclayout_user *to)
+static int shrink_ecclayout(struct mtd_info *mtd,
+                           struct nand_ecclayout_user *to)
 {
-       int i;
+       struct mtd_oob_region oobregion;
+       int i, section = 0, ret;
 
-       if (!from || !to)
+       if (!mtd || !to)
                return -EINVAL;
 
        memset(to, 0, sizeof(*to));
 
-       to->eccbytes = min((int)from->eccbytes, MTD_MAX_ECCPOS_ENTRIES);
-       for (i = 0; i < to->eccbytes; i++)
-               to->eccpos[i] = from->eccpos[i];
+       to->eccbytes = 0;
+       for (i = 0; i < MTD_MAX_ECCPOS_ENTRIES;) {
+               u32 eccpos;
+
+               /* Walk the ECC sections in order; -ERANGE marks the end. */
+               ret = mtd_ooblayout_ecc(mtd, section++, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               eccpos = oobregion.offset;
+               for (; i < MTD_MAX_ECCPOS_ENTRIES &&
+                      eccpos < oobregion.offset + oobregion.length; i++) {
+                       to->eccpos[i] = eccpos++;
+                       to->eccbytes++;
+               }
+       }
 
        for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES; i++) {
-               if (from->oobfree[i].length == 0 &&
-                               from->oobfree[i].offset == 0)
+               ret = mtd_ooblayout_free(mtd, i, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               to->oobfree[i].offset = oobregion.offset;
+               to->oobfree[i].length = oobregion.length;
+               to->oobavail += to->oobfree[i].length;
+       }
+
+       return 0;
+}
+
+static int get_oobinfo(struct mtd_info *mtd, struct nand_oobinfo *to)
+{
+       struct mtd_oob_region oobregion;
+       int i, section = 0, ret;
+
+       if (!mtd || !to)
+               return -EINVAL;
+
+       memset(to, 0, sizeof(*to));
+
+       to->eccbytes = 0;
+       for (i = 0; i < ARRAY_SIZE(to->eccpos);) {
+               u32 eccpos;
+
+               /* Walk the ECC sections in order; -ERANGE marks the end. */
+               ret = mtd_ooblayout_ecc(mtd, section++, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
                        break;
-               to->oobavail += from->oobfree[i].length;
-               to->oobfree[i] = from->oobfree[i];
+               }
+
+               if (oobregion.length + i > ARRAY_SIZE(to->eccpos))
+                       return -EINVAL;
+
+               eccpos = oobregion.offset;
+               for (; eccpos < oobregion.offset + oobregion.length; i++) {
+                       to->eccpos[i] = eccpos++;
+                       to->eccbytes++;
+               }
        }
 
+       for (i = 0; i < ARRAY_SIZE(to->oobfree); i++) {
+               ret = mtd_ooblayout_free(mtd, i, &oobregion);
+               if (ret < 0) {
+                       if (ret != -ERANGE)
+                               return ret;
+
+                       break;
+               }
+
+               to->oobfree[i][0] = oobregion.offset;
+               to->oobfree[i][1] = oobregion.length;
+       }
+
+       to->useecc = MTD_NANDECC_AUTOPLACE;
+
        return 0;
 }
 
@@ -815,16 +888,12 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
        {
                struct nand_oobinfo oi;
 
-               if (!mtd->ecclayout)
+               if (!mtd->ooblayout)
                        return -EOPNOTSUPP;
-               if (mtd->ecclayout->eccbytes > ARRAY_SIZE(oi.eccpos))
-                       return -EINVAL;
 
-               oi.useecc = MTD_NANDECC_AUTOPLACE;
-               memcpy(&oi.eccpos, mtd->ecclayout->eccpos, sizeof(oi.eccpos));
-               memcpy(&oi.oobfree, mtd->ecclayout->oobfree,
-                      sizeof(oi.oobfree));
-               oi.eccbytes = mtd->ecclayout->eccbytes;
+               ret = get_oobinfo(mtd, &oi);
+               if (ret)
+                       return ret;
 
                if (copy_to_user(argp, &oi, sizeof(struct nand_oobinfo)))
                        return -EFAULT;
@@ -913,14 +982,14 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
        {
                struct nand_ecclayout_user *usrlay;
 
-               if (!mtd->ecclayout)
+               if (!mtd->ooblayout)
                        return -EOPNOTSUPP;
 
                usrlay = kmalloc(sizeof(*usrlay), GFP_KERNEL);
                if (!usrlay)
                        return -ENOMEM;
 
-               shrink_ecclayout(mtd->ecclayout, usrlay);
+               shrink_ecclayout(mtd, usrlay);
 
                if (copy_to_user(argp, usrlay, sizeof(*usrlay)))
                        ret = -EFAULT;
index 239a8c806b6772df642bf6cb1385616147e975c8..d573606b91c2a57a4ff07e040b79fd86a7762c5b 100644 (file)
@@ -777,7 +777,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[],       /* subdevices to c
 
        }
 
-       concat->mtd.ecclayout = subdev[0]->ecclayout;
+       mtd_set_ooblayout(&concat->mtd, subdev[0]->ooblayout);
 
        concat->num_subdev = num_devs;
        concat->mtd.name = name;
index bee180bd11e79fe236e0287647986fe81fea864c..e3936b847c6b3a7adaf757dfa3e0cddde0ae28bf 100644 (file)
@@ -1016,6 +1016,366 @@ int mtd_write_oob(struct mtd_info *mtd, loff_t to,
 }
 EXPORT_SYMBOL_GPL(mtd_write_oob);
 
+/**
+ * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section
+ * @mtd: MTD device structure
+ * @section: ECC section. Depending on the layout you may have all the ECC
+ *          bytes stored in a single contiguous section, or one section
+ *          per ECC chunk (and sometimes several sections for a single
+ *          ECC chunk)
+ * @oobecc: OOB region struct filled with the appropriate ECC position
+ *         information
+ *
+ * This function returns ECC section information in the OOB area. If you want
+ * to get all the ECC bytes information, then you should call
+ * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+                     struct mtd_oob_region *oobecc)
+{
+       memset(oobecc, 0, sizeof(*oobecc));
+
+       if (!mtd || section < 0)
+               return -EINVAL;
+
+       if (!mtd->ooblayout || !mtd->ooblayout->ecc)
+               return -ENOTSUPP;
+
+       return mtd->ooblayout->ecc(mtd, section, oobecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc);
+
+/**
+ * mtd_ooblayout_free - Get the OOB region definition of a specific free
+ *                     section
+ * @mtd: MTD device structure
+ * @section: Free section you are interested in. Depending on the layout
+ *          you may have all the free bytes stored in a single contiguous
+ *          section, or one section per ECC chunk plus an extra section
+ *          for the remaining bytes (or other funky layout).
+ * @oobfree: OOB region struct filled with the appropriate free position
+ *          information
+ *
+ * This function returns free bytes position in the OOB area. If you want
+ * to get all the free bytes information, then you should call
+ * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+                      struct mtd_oob_region *oobfree)
+{
+       memset(oobfree, 0, sizeof(*oobfree));
+
+       if (!mtd || section < 0)
+               return -EINVAL;
+
+       if (!mtd->ooblayout || !mtd->ooblayout->free)
+               return -ENOTSUPP;
+
+       return mtd->ooblayout->free(mtd, section, oobfree);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_free);
+
+/**
+ * mtd_ooblayout_find_region - Find the region attached to a specific byte
+ * @mtd: mtd info structure
+ * @byte: the byte we are searching for
+ * @sectionp: pointer where the section id will be stored
+ * @oobregion: used to retrieve the ECC position
+ * @iter: iterator function. Should be either mtd_ooblayout_free or
+ *       mtd_ooblayout_ecc depending on the region type you're searching for
+ *
+ * This function returns the section id and oobregion information of a
+ * specific byte. For example, say you want to know where the 4th ECC byte is
+ * stored, you'll use:
+ *
+ * mtd_ooblayout_find_region(mtd, 3, &section, &oobregion, mtd_ooblayout_ecc);
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte,
+                               int *sectionp, struct mtd_oob_region *oobregion,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       int pos = 0, ret, section = 0;
+
+       memset(oobregion, 0, sizeof(*oobregion));
+
+       while (1) {
+               ret = iter(mtd, section, oobregion);
+               if (ret)
+                       return ret;
+
+               if (pos + oobregion->length > byte)
+                       break;
+
+               pos += oobregion->length;
+               section++;
+       }
+
+       /*
+        * Adjust region info so that it starts at the requested byte
+        * rather than at the beginning of the section.
+        */
+       oobregion->offset += byte - pos;
+       oobregion->length -= byte - pos;
+       *sectionp = section;
+
+       return 0;
+}
+
+/**
+ * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific
+ *                               ECC byte
+ * @mtd: mtd info structure
+ * @eccbyte: the byte we are searching for
+ * @section: pointer where the section id will be stored
+ * @oobregion: OOB region information
+ *
+ * Works like mtd_ooblayout_find_region() except it searches for a specific ECC
+ * byte.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+                                int *section,
+                                struct mtd_oob_region *oobregion)
+{
+       return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion,
+                                        mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion);
+
+/**
+ * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @buf: destination buffer to store OOB bytes
+ * @oobbuf: OOB buffer
+ * @start: first byte to retrieve
+ * @nbytes: number of bytes to retrieve
+ * @iter: section iterator
+ *
+ * Extract bytes attached to a specific category (ECC or free)
+ * from the OOB buffer and copy them into buf.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf,
+                               const u8 *oobbuf, int start, int nbytes,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret;
+
+       ret = mtd_ooblayout_find_region(mtd, start, &section,
+                                       &oobregion, iter);
+
+       while (!ret) {
+               int cnt;
+
+               cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+               memcpy(buf, oobbuf + oobregion.offset, cnt);
+               buf += cnt;
+               nbytes -= cnt;
+
+               if (!nbytes)
+                       break;
+
+               ret = iter(mtd, ++section, &oobregion);
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @buf: source buffer to get OOB bytes from
+ * @oobbuf: OOB buffer
+ * @start: first OOB byte to set
+ * @nbytes: number of OOB bytes to set
+ * @iter: section iterator
+ *
+ * Fill the OOB buffer with data provided in buf. The category (ECC or free)
+ * is selected by passing the appropriate iterator.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf,
+                               u8 *oobbuf, int start, int nbytes,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret;
+
+       ret = mtd_ooblayout_find_region(mtd, start, &section,
+                                       &oobregion, iter);
+
+       while (!ret) {
+               int cnt;
+
+               cnt = oobregion.length > nbytes ? nbytes : oobregion.length;
+               memcpy(oobbuf + oobregion.offset, buf, cnt);
+               buf += cnt;
+               nbytes -= cnt;
+
+               if (!nbytes)
+                       break;
+
+               ret = iter(mtd, ++section, &oobregion);
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_count_bytes - count the number of bytes in a OOB category
+ * @mtd: mtd info structure
+ * @iter: category iterator
+ *
+ * Count the number of bytes in a given category.
+ *
+ * Returns the byte count (zero or more) on success, a negative error code otherwise.
+ */
+static int mtd_ooblayout_count_bytes(struct mtd_info *mtd,
+                               int (*iter)(struct mtd_info *,
+                                           int section,
+                                           struct mtd_oob_region *oobregion))
+{
+       struct mtd_oob_region oobregion = { };
+       int section = 0, ret, nbytes = 0;
+
+       while (1) {
+               ret = iter(mtd, section++, &oobregion);
+               if (ret) {
+                       if (ret == -ERANGE)
+                               ret = nbytes;
+                       break;
+               }
+
+               nbytes += oobregion.length;
+       }
+
+       return ret;
+}
+
+/**
+ * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: destination buffer to store ECC bytes
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to retrieve
+ * @nbytes: number of ECC bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+                              const u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes);
+
+/**
+ * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @eccbuf: source buffer to get ECC bytes from
+ * @oobbuf: OOB buffer
+ * @start: first ECC byte to set
+ * @nbytes: number of ECC bytes to set
+ *
+ * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+                              u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes);
+
+/**
+ * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer
+ * @mtd: mtd info structure
+ * @databuf: destination buffer to store data bytes
+ * @oobbuf: OOB buffer
+ * @start: first free (data) byte to retrieve
+ * @nbytes: number of free (data) bytes to retrieve
+ *
+ * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+                               const u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_get_bytes(mtd, databuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_get_databytes);
+
+/**
+ * mtd_ooblayout_set_databytes - set data bytes into the oob buffer
+ * @mtd: mtd info structure
+ * @databuf: source buffer to get data bytes from
+ * @oobbuf: OOB buffer
+ * @start: first free (data) byte to set
+ * @nbytes: number of free (data) bytes to set
+ *
+ * Works like mtd_ooblayout_set_bytes(), except it acts on free bytes.
+ *
+ * Returns zero on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+                               u8 *oobbuf, int start, int nbytes)
+{
+       return mtd_ooblayout_set_bytes(mtd, databuf, oobbuf, start, nbytes,
+                                      mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_set_databytes);
+
+/**
+ * mtd_ooblayout_count_freebytes - count the number of free bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it counts free bytes.
+ *
+ * Returns the number of free bytes on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd)
+{
+       return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_free);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_freebytes);
+
+/**
+ * mtd_ooblayout_count_eccbytes - count the number of ECC bytes in OOB
+ * @mtd: mtd info structure
+ *
+ * Works like mtd_ooblayout_count_bytes(), except it counts ECC bytes.
+ *
+ * Returns the number of ECC bytes on success, a negative error code otherwise.
+ */
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd)
+{
+       return mtd_ooblayout_count_bytes(mtd, mtd_ooblayout_ecc);
+}
+EXPORT_SYMBOL_GPL(mtd_ooblayout_count_eccbytes);
+
 /*
  * Method to access the protection register area, present in some flash
  * devices. The user data is one time programmable but the factory data is read
index 08de4b2cf0f5ec291ba4bfe9e43b8f8d083c0bbd..1f13e32556f869634146c102ef1f2b0aea8fe4f1 100644 (file)
@@ -317,6 +317,27 @@ static int part_block_markbad(struct mtd_info *mtd, loff_t ofs)
        return res;
 }
 
+static int part_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct mtd_part *part = mtd_to_part(mtd);
+
+       return mtd_ooblayout_ecc(part->master, section, oobregion);
+}
+
+static int part_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct mtd_part *part = mtd_to_part(mtd);
+
+       return mtd_ooblayout_free(part->master, section, oobregion);
+}
+
+static const struct mtd_ooblayout_ops part_ooblayout_ops = {
+       .ecc = part_ooblayout_ecc,
+       .free = part_ooblayout_free,
+};
+
 static inline void free_partition(struct mtd_part *p)
 {
        kfree(p->mtd.name);
@@ -533,7 +554,7 @@ static struct mtd_part *allocate_partition(struct mtd_info *master,
                        part->name);
        }
 
-       slave->mtd.ecclayout = master->ecclayout;
+       mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops);
        slave->mtd.ecc_step_size = master->ecc_step_size;
        slave->mtd.ecc_strength = master->ecc_strength;
        slave->mtd.bitflip_threshold = master->bitflip_threshold;
index 68b58c85789c354f16886e2c8d8045dec137bd43..78e12cc8bac2f5bc43cc54a7a809c8734571bad1 100644 (file)
@@ -224,6 +224,7 @@ static int ams_delta_init(struct platform_device *pdev)
        /* 25 us command delay time */
        this->chip_delay = 30;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        platform_set_drvdata(pdev, io_base);
 
index 20cbaabb29590f61f2009411e114e1fc83acec81..efc8ea250c1d5966409d9aabb6588af55e4d478a 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
@@ -72,30 +71,44 @@ struct atmel_nand_nfc_caps {
        uint32_t rb_mask;
 };
 
-/* oob layout for large page size
+/*
+ * oob layout for large page size
  * bad block info is on bytes 0 and 1
  * the bytes have to be consecutives to avoid
  * several NAND_CMD_RNDOUT during read
- */
-static struct nand_ecclayout atmel_oobinfo_large = {
-       .eccbytes = 4,
-       .eccpos = {60, 61, 62, 63},
-       .oobfree = {
-               {2, 58}
-       },
-};
-
-/* oob layout for small page size
+ *
+ * oob layout for small page size
  * bad block info is on bytes 4 and 5
  * the bytes have to be consecutives to avoid
  * several NAND_CMD_RNDOUT during read
  */
-static struct nand_ecclayout atmel_oobinfo_small = {
-       .eccbytes = 4,
-       .eccpos = {0, 1, 2, 3},
-       .oobfree = {
-               {6, 10}
-       },
+static int atmel_ooblayout_ecc_sp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 4;
+       oobregion->offset = 0;
+
+       return 0;
+}
+
+static int atmel_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 6;
+       oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops atmel_ooblayout_sp_ops = {
+       .ecc = atmel_ooblayout_ecc_sp,
+       .free = atmel_ooblayout_free_sp,
 };
 
 struct atmel_nfc {
@@ -163,8 +176,6 @@ struct atmel_nand_host {
        int                     *pmecc_delta;
 };
 
-static struct nand_ecclayout atmel_pmecc_oobinfo;
-
 /*
  * Enable NAND.
  */
@@ -434,14 +445,13 @@ err_buf:
 static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
-       struct atmel_nand_host *host = nand_get_controller_data(chip);
 
        if (use_dma && len > mtd->oobsize)
                /* only use DMA for bigger than oob size: better performances */
                if (atmel_nand_dma_op(mtd, buf, len, 1) == 0)
                        return;
 
-       if (host->board.bus_width_16)
+       if (chip->options & NAND_BUSWIDTH_16)
                atmel_read_buf16(mtd, buf, len);
        else
                atmel_read_buf8(mtd, buf, len);
@@ -450,14 +460,13 @@ static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
 static void atmel_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
-       struct atmel_nand_host *host = nand_get_controller_data(chip);
 
        if (use_dma && len > mtd->oobsize)
                /* only use DMA for bigger than oob size: better performances */
                if (atmel_nand_dma_op(mtd, (void *)buf, len, 0) == 0)
                        return;
 
-       if (host->board.bus_width_16)
+       if (chip->options & NAND_BUSWIDTH_16)
                atmel_write_buf16(mtd, buf, len);
        else
                atmel_write_buf8(mtd, buf, len);
@@ -483,22 +492,6 @@ static int pmecc_get_ecc_bytes(int cap, int sector_size)
        return (m * cap + 7) / 8;
 }
 
-static void pmecc_config_ecc_layout(struct nand_ecclayout *layout,
-                                   int oobsize, int ecc_len)
-{
-       int i;
-
-       layout->eccbytes = ecc_len;
-
-       /* ECC will occupy the last ecc_len bytes continuously */
-       for (i = 0; i < ecc_len; i++)
-               layout->eccpos[i] = oobsize - ecc_len + i;
-
-       layout->oobfree[0].offset = PMECC_OOB_RESERVED_BYTES;
-       layout->oobfree[0].length =
-               oobsize - ecc_len - layout->oobfree[0].offset;
-}
-
 static void __iomem *pmecc_get_alpha_to(struct atmel_nand_host *host)
 {
        int table_size;
@@ -836,13 +829,16 @@ static void pmecc_correct_data(struct mtd_info *mtd, uint8_t *buf, uint8_t *ecc,
                        dev_dbg(host->dev, "Bit flip in data area, byte_pos: %d, bit_pos: %d, 0x%02x -> 0x%02x\n",
                                pos, bit_pos, err_byte, *(buf + byte_pos));
                } else {
+                       struct mtd_oob_region oobregion;
+
                        /* Bit flip in OOB area */
                        tmp = sector_num * nand_chip->ecc.bytes
                                        + (byte_pos - sector_size);
                        err_byte = ecc[tmp];
                        ecc[tmp] ^= (1 << bit_pos);
 
-                       pos = tmp + nand_chip->ecc.layout->eccpos[0];
+                       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+                       pos = tmp + oobregion.offset;
                        dev_dbg(host->dev, "Bit flip in OOB, oob_byte_pos: %d, bit_pos: %d, 0x%02x -> 0x%02x\n",
                                pos, bit_pos, err_byte, ecc[tmp]);
                }
@@ -863,17 +859,6 @@ static int pmecc_correction(struct mtd_info *mtd, u32 pmecc_stat, uint8_t *buf,
        uint8_t *buf_pos;
        int max_bitflips = 0;
 
-       /* If can correct bitfilps from erased page, do the normal check */
-       if (host->caps->pmecc_correct_erase_page)
-               goto normal_check;
-
-       for (i = 0; i < nand_chip->ecc.total; i++)
-               if (ecc[i] != 0xff)
-                       goto normal_check;
-       /* Erased page, return OK */
-       return 0;
-
-normal_check:
        for (i = 0; i < nand_chip->ecc.steps; i++) {
                err_nbr = 0;
                if (pmecc_stat & 0x1) {
@@ -884,16 +869,30 @@ normal_check:
                        pmecc_get_sigma(mtd);
 
                        err_nbr = pmecc_err_location(mtd);
-                       if (err_nbr == -1) {
+                       if (err_nbr >= 0) {
+                               pmecc_correct_data(mtd, buf_pos, ecc, i,
+                                                  nand_chip->ecc.bytes,
+                                                  err_nbr);
+                       } else if (!host->caps->pmecc_correct_erase_page) {
+                               u8 *ecc_pos = ecc + (i * nand_chip->ecc.bytes);
+
+                               /* Try to detect erased pages */
+                               err_nbr = nand_check_erased_ecc_chunk(buf_pos,
+                                                       host->pmecc_sector_size,
+                                                       ecc_pos,
+                                                       nand_chip->ecc.bytes,
+                                                       NULL, 0,
+                                                       nand_chip->ecc.strength);
+                       }
+
+                       if (err_nbr < 0) {
                                dev_err(host->dev, "PMECC: Too many errors\n");
                                mtd->ecc_stats.failed++;
                                return -EIO;
-                       } else {
-                               pmecc_correct_data(mtd, buf_pos, ecc, i,
-                                       nand_chip->ecc.bytes, err_nbr);
-                               mtd->ecc_stats.corrected += err_nbr;
-                               max_bitflips = max_t(int, max_bitflips, err_nbr);
                        }
+
+                       mtd->ecc_stats.corrected += err_nbr;
+                       max_bitflips = max_t(int, max_bitflips, err_nbr);
                }
                pmecc_stat >>= 1;
        }
@@ -931,7 +930,6 @@ static int atmel_nand_pmecc_read_page(struct mtd_info *mtd,
        struct atmel_nand_host *host = nand_get_controller_data(chip);
        int eccsize = chip->ecc.size * chip->ecc.steps;
        uint8_t *oob = chip->oob_poi;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint32_t stat;
        unsigned long end_time;
        int bitflips = 0;
@@ -953,7 +951,11 @@ static int atmel_nand_pmecc_read_page(struct mtd_info *mtd,
 
        stat = pmecc_readl_relaxed(host->ecc, ISR);
        if (stat != 0) {
-               bitflips = pmecc_correction(mtd, stat, buf, &oob[eccpos[0]]);
+               struct mtd_oob_region oobregion;
+
+               mtd_ooblayout_ecc(mtd, 0, &oobregion);
+               bitflips = pmecc_correction(mtd, stat, buf,
+                                           &oob[oobregion.offset]);
                if (bitflips < 0)
                        /* uncorrectable errors */
                        return 0;
@@ -967,8 +969,8 @@ static int atmel_nand_pmecc_write_page(struct mtd_info *mtd,
                int page)
 {
        struct atmel_nand_host *host = nand_get_controller_data(chip);
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
-       int i, j;
+       struct mtd_oob_region oobregion = { };
+       int i, j, section = 0;
        unsigned long end_time;
 
        if (!host->nfc || !host->nfc->write_by_sram) {
@@ -987,11 +989,14 @@ static int atmel_nand_pmecc_write_page(struct mtd_info *mtd,
 
        for (i = 0; i < chip->ecc.steps; i++) {
                for (j = 0; j < chip->ecc.bytes; j++) {
-                       int pos;
+                       if (!oobregion.length)
+                               mtd_ooblayout_ecc(mtd, section, &oobregion);
 
-                       pos = i * chip->ecc.bytes + j;
-                       chip->oob_poi[eccpos[pos]] =
+                       chip->oob_poi[oobregion.offset] =
                                pmecc_readb_ecc_relaxed(host->ecc, i, j);
+                       oobregion.length--;
+                       oobregion.offset++;
+                       section++;
                }
        }
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1003,8 +1008,9 @@ static void atmel_pmecc_core_init(struct mtd_info *mtd)
 {
        struct nand_chip *nand_chip = mtd_to_nand(mtd);
        struct atmel_nand_host *host = nand_get_controller_data(nand_chip);
+       int eccbytes = mtd_ooblayout_count_eccbytes(mtd);
        uint32_t val = 0;
-       struct nand_ecclayout *ecc_layout;
+       struct mtd_oob_region oobregion;
 
        pmecc_writel(host->ecc, CTRL, PMECC_CTRL_RST);
        pmecc_writel(host->ecc, CTRL, PMECC_CTRL_DISABLE);
@@ -1054,11 +1060,11 @@ static void atmel_pmecc_core_init(struct mtd_info *mtd)
                | PMECC_CFG_AUTO_DISABLE);
        pmecc_writel(host->ecc, CFG, val);
 
-       ecc_layout = nand_chip->ecc.layout;
        pmecc_writel(host->ecc, SAREA, mtd->oobsize - 1);
-       pmecc_writel(host->ecc, SADDR, ecc_layout->eccpos[0]);
+       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       pmecc_writel(host->ecc, SADDR, oobregion.offset);
        pmecc_writel(host->ecc, EADDR,
-                       ecc_layout->eccpos[ecc_layout->eccbytes - 1]);
+                    oobregion.offset + eccbytes - 1);
        /* See datasheet about PMECC Clock Control Register */
        pmecc_writel(host->ecc, CLK, 2);
        pmecc_writel(host->ecc, IDR, 0xff);
@@ -1206,6 +1212,7 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                dev_warn(host->dev,
                        "Can't get I/O resource regs for PMECC controller, rolling back on software ECC\n");
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1280,11 +1287,8 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                        err_no = -EINVAL;
                        goto err;
                }
-               pmecc_config_ecc_layout(&atmel_pmecc_oobinfo,
-                                       mtd->oobsize,
-                                       nand_chip->ecc.total);
 
-               nand_chip->ecc.layout = &atmel_pmecc_oobinfo;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                break;
        default:
                dev_warn(host->dev,
@@ -1292,6 +1296,7 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev,
                /* page size not handled by HW ECC */
                /* switching back to soft ECC */
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1359,12 +1364,12 @@ static int atmel_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
 {
        int eccsize = chip->ecc.size;
        int eccbytes = chip->ecc.bytes;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint8_t *p = buf;
        uint8_t *oob = chip->oob_poi;
        uint8_t *ecc_pos;
        int stat;
        unsigned int max_bitflips = 0;
+       struct mtd_oob_region oobregion = {};
 
        /*
         * Errata: ALE is incorrectly wired up to the ECC controller
@@ -1382,19 +1387,20 @@ static int atmel_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        chip->read_buf(mtd, p, eccsize);
 
        /* move to ECC position if needed */
-       if (eccpos[0] != 0) {
-               /* This only works on large pages
-                * because the ECC controller waits for
-                * NAND_CMD_RNDOUTSTART after the
-                * NAND_CMD_RNDOUT.
-                * anyway, for small pages, the eccpos[0] == 0
+       mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (oobregion.offset != 0) {
+               /*
+                * This only works on large pages because the ECC controller
+                * waits for NAND_CMD_RNDOUTSTART after the NAND_CMD_RNDOUT.
+                * Anyway, for small pages, the first ECC byte is at offset
+                * 0 in the OOB area.
                 */
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
-                               mtd->writesize + eccpos[0], -1);
+                             mtd->writesize + oobregion.offset, -1);
        }
 
        /* the ECC controller needs to read the ECC just after the data */
-       ecc_pos = oob + eccpos[0];
+       ecc_pos = oob + oobregion.offset;
        chip->read_buf(mtd, ecc_pos, eccbytes);
 
        /* check if there's an error */
@@ -1504,58 +1510,17 @@ static void atmel_nand_hwctl(struct mtd_info *mtd, int mode)
                ecc_writel(host->ecc, CR, ATMEL_ECC_RST);
 }
 
-static int atmel_of_init_port(struct atmel_nand_host *host,
-                             struct device_node *np)
+static int atmel_of_init_ecc(struct atmel_nand_host *host,
+                            struct device_node *np)
 {
-       u32 val;
        u32 offset[2];
-       int ecc_mode;
-       struct atmel_nand_data *board = &host->board;
-       enum of_gpio_flags flags = 0;
-
-       host->caps = (struct atmel_nand_caps *)
-               of_device_get_match_data(host->dev);
-
-       if (of_property_read_u32(np, "atmel,nand-addr-offset", &val) == 0) {
-               if (val >= 32) {
-                       dev_err(host->dev, "invalid addr-offset %u\n", val);
-                       return -EINVAL;
-               }
-               board->ale = val;
-       }
-
-       if (of_property_read_u32(np, "atmel,nand-cmd-offset", &val) == 0) {
-               if (val >= 32) {
-                       dev_err(host->dev, "invalid cmd-offset %u\n", val);
-                       return -EINVAL;
-               }
-               board->cle = val;
-       }
-
-       ecc_mode = of_get_nand_ecc_mode(np);
-
-       board->ecc_mode = ecc_mode < 0 ? NAND_ECC_SOFT : ecc_mode;
-
-       board->on_flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       board->has_dma = of_property_read_bool(np, "atmel,nand-has-dma");
-
-       if (of_get_nand_bus_width(np) == 16)
-               board->bus_width_16 = 1;
-
-       board->rdy_pin = of_get_gpio_flags(np, 0, &flags);
-       board->rdy_pin_active_low = (flags == OF_GPIO_ACTIVE_LOW);
-
-       board->enable_pin = of_get_gpio(np, 1);
-       board->det_pin = of_get_gpio(np, 2);
+       u32 val;
 
        host->has_pmecc = of_property_read_bool(np, "atmel,has-pmecc");
 
-       /* load the nfc driver if there is */
-       of_platform_populate(np, NULL, NULL, host->dev);
-
-       if (!(board->ecc_mode == NAND_ECC_HW) || !host->has_pmecc)
-               return 0;       /* Not using PMECC */
+       /* Not using PMECC */
+       if (!(host->nand_chip.ecc.mode == NAND_ECC_HW) || !host->has_pmecc)
+               return 0;
 
        /* use PMECC, get correction capability, sector size and lookup
         * table offset.
@@ -1596,16 +1561,65 @@ static int atmel_of_init_port(struct atmel_nand_host *host,
                /* Will build a lookup table and initialize the offset later */
                return 0;
        }
+
        if (!offset[0] && !offset[1]) {
                dev_err(host->dev, "Invalid PMECC lookup table offset\n");
                return -EINVAL;
        }
+
        host->pmecc_lookup_table_offset_512 = offset[0];
        host->pmecc_lookup_table_offset_1024 = offset[1];
 
        return 0;
 }
 
+/*
+ * Fill host->board (and a few host fields) from the NAND device-tree node.
+ *
+ * Reads the optional ALE/CLE offsets, the DMA flag and the three GPIOs,
+ * populates child platform devices, and finally presets the ECC mode to
+ * software Hamming.
+ *
+ * Returns 0 on success, or -EINVAL when an ALE/CLE offset is 32 or more.
+ */
+static int atmel_of_init_port(struct atmel_nand_host *host,
+                             struct device_node *np)
+{
+       struct atmel_nand_data *board = &host->board;
+       enum of_gpio_flags rdy_flags = 0;
+       u32 off;
+
+       host->caps = (struct atmel_nand_caps *)
+               of_device_get_match_data(host->dev);
+
+       /* Optional property; values of 32 and above are rejected. */
+       if (!of_property_read_u32(np, "atmel,nand-addr-offset", &off)) {
+               if (off >= 32) {
+                       dev_err(host->dev, "invalid addr-offset %u\n", off);
+                       return -EINVAL;
+               }
+               board->ale = off;
+       }
+
+       /* Same rule for the command offset. */
+       if (!of_property_read_u32(np, "atmel,nand-cmd-offset", &off)) {
+               if (off >= 32) {
+                       dev_err(host->dev, "invalid cmd-offset %u\n", off);
+                       return -EINVAL;
+               }
+               board->cle = off;
+       }
+
+       board->has_dma = of_property_read_bool(np, "atmel,nand-has-dma");
+
+       /* GPIO 0: ready pin (polarity taken from its flags). */
+       board->rdy_pin = of_get_gpio_flags(np, 0, &rdy_flags);
+       board->rdy_pin_active_low = (rdy_flags == OF_GPIO_ACTIVE_LOW);
+
+       /* GPIO 1: enable pin, GPIO 2: detect pin. */
+       board->enable_pin = of_get_gpio(np, 1);
+       board->det_pin = of_get_gpio(np, 2);
+
+       /*
+        * Instantiate child nodes so that the NFC driver, if there is one,
+        * gets loaded. The return value is deliberately not checked here.
+        */
+       of_platform_populate(np, NULL, NULL, host->dev);
+
+       /*
+        * Start from software Hamming ECC so that a sane value is in place
+        * even when the nand-ecc-mode property is not defined.
+        */
+       host->nand_chip.ecc.mode = NAND_ECC_SOFT;
+       host->nand_chip.ecc.algo = NAND_ECC_HAMMING;
+
+       return 0;
+}
+
 static int atmel_hw_nand_init_params(struct platform_device *pdev,
                                         struct atmel_nand_host *host)
 {
@@ -1618,6 +1632,7 @@ static int atmel_hw_nand_init_params(struct platform_device *pdev,
                dev_err(host->dev,
                        "Can't get I/O resource regs, use software ECC\n");
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -1631,25 +1646,26 @@ static int atmel_hw_nand_init_params(struct platform_device *pdev,
        /* set ECC page size and oob layout */
        switch (mtd->writesize) {
        case 512:
-               nand_chip->ecc.layout = &atmel_oobinfo_small;
+               mtd_set_ooblayout(mtd, &atmel_ooblayout_sp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_528);
                break;
        case 1024:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_1056);
                break;
        case 2048:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_2112);
                break;
        case 4096:
-               nand_chip->ecc.layout = &atmel_oobinfo_large;
+               mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_4224);
                break;
        default:
                /* page size not handled by HW ECC */
                /* switching back to soft ECC */
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                return 0;
        }
 
@@ -2147,6 +2163,19 @@ static int atmel_nand_probe(struct platform_device *pdev)
        } else {
                memcpy(&host->board, dev_get_platdata(&pdev->dev),
                       sizeof(struct atmel_nand_data));
+               nand_chip->ecc.mode = host->board.ecc_mode;
+
+               /*
+                * When using software ECC every supported avr32 board means
+                * Hamming algorithm. If that ever changes we'll need to add
+                * ecc_algo field to the struct atmel_nand_data.
+                */
+               if (nand_chip->ecc.mode == NAND_ECC_SOFT)
+                       nand_chip->ecc.algo = NAND_ECC_HAMMING;
+
+               /* 16-bit bus width */
+               if (host->board.bus_width_16)
+                       nand_chip->options |= NAND_BUSWIDTH_16;
        }
 
         /* link the private data structures */
@@ -2188,11 +2217,8 @@ static int atmel_nand_probe(struct platform_device *pdev)
                nand_chip->cmd_ctrl = atmel_nand_cmd_ctrl;
        }
 
-       nand_chip->ecc.mode = host->board.ecc_mode;
        nand_chip->chip_delay = 40;             /* 40us command delay time */
 
-       if (host->board.bus_width_16)   /* 16-bit bus width */
-               nand_chip->options |= NAND_BUSWIDTH_16;
 
        nand_chip->read_buf = atmel_read_buf;
        nand_chip->write_buf = atmel_write_buf;
@@ -2225,11 +2251,6 @@ static int atmel_nand_probe(struct platform_device *pdev)
                }
        }
 
-       if (host->board.on_flash_bbt || on_flash_bbt) {
-               dev_info(&pdev->dev, "Use On Flash BBT\n");
-               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
-       }
-
        if (!host->board.has_dma)
                use_dma = 0;
 
@@ -2256,6 +2277,18 @@ static int atmel_nand_probe(struct platform_device *pdev)
                goto err_scan_ident;
        }
 
+       if (host->board.on_flash_bbt || on_flash_bbt)
+               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
+
+       if (nand_chip->bbt_options & NAND_BBT_USE_FLASH)
+               dev_info(&pdev->dev, "Use On Flash BBT\n");
+
+       if (IS_ENABLED(CONFIG_OF) && pdev->dev.of_node) {
+               res = atmel_of_init_ecc(host, pdev->dev.of_node);
+               if (res)
+                       goto err_hw_ecc;
+       }
+
        if (nand_chip->ecc.mode == NAND_ECC_HW) {
                if (host->has_pmecc)
                        res = atmel_pmecc_nand_init_params(pdev, host);
index 341ea4904164dec5dcfebac0ebf487eceaac2d88..9bf6d9915694e0cd69708fd0a3d4cc7f706968ae 100644 (file)
@@ -459,6 +459,7 @@ static int au1550nd_probe(struct platform_device *pdev)
        /* 30 us command delay time */
        this->chip_delay = 30;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        if (pd->devwidth)
                this->options |= NAND_BUSWIDTH_16;
index 7f6b30e615b7f47c2863fc5c11d33de24c319c43..37da4236ab908099217184bd310a567365396192 100644 (file)
@@ -109,28 +109,33 @@ static const unsigned short bfin_nfc_pin_req[] =
         0};
 
 #ifdef CONFIG_MTD_NAND_BF5XX_BOOTROM_ECC
-static struct nand_ecclayout bootrom_ecclayout = {
-       .eccbytes = 24,
-       .eccpos = {
-               0x8 * 0, 0x8 * 0 + 1, 0x8 * 0 + 2,
-               0x8 * 1, 0x8 * 1 + 1, 0x8 * 1 + 2,
-               0x8 * 2, 0x8 * 2 + 1, 0x8 * 2 + 2,
-               0x8 * 3, 0x8 * 3 + 1, 0x8 * 3 + 2,
-               0x8 * 4, 0x8 * 4 + 1, 0x8 * 4 + 2,
-               0x8 * 5, 0x8 * 5 + 1, 0x8 * 5 + 2,
-               0x8 * 6, 0x8 * 6 + 1, 0x8 * 6 + 2,
-               0x8 * 7, 0x8 * 7 + 1, 0x8 * 7 + 2
-       },
-       .oobfree = {
-               { 0x8 * 0 + 3, 5 },
-               { 0x8 * 1 + 3, 5 },
-               { 0x8 * 2 + 3, 5 },
-               { 0x8 * 3 + 3, 5 },
-               { 0x8 * 4 + 3, 5 },
-               { 0x8 * 5 + 3, 5 },
-               { 0x8 * 6 + 3, 5 },
-               { 0x8 * 7 + 3, 5 },
-       }
+/*
+ * ECC region lookup for the boot-ROM layout.
+ *
+ * The OOB area is handled as eight 8-byte slices (sections 0..7); the
+ * first three bytes of each slice hold ECC. Sections above 7 yield
+ * -ERANGE.
+ */
+static int bootrom_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section >= 8)
+               return -ERANGE;
+
+       oobregion->length = 3;
+       oobregion->offset = 8 * section;
+
+       return 0;
+}
+
+/*
+ * Free-bytes region lookup for the boot-ROM layout.
+ *
+ * In each of the eight 8-byte OOB slices (sections 0..7) the five bytes
+ * that follow the 3 ECC bytes are free. Sections above 7 yield -ERANGE.
+ */
+static int bootrom_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section >= 8)
+               return -ERANGE;
+
+       oobregion->length = 5;
+       oobregion->offset = 8 * section + 3;
+
+       return 0;
+}
+
+/*
+ * Boot-ROM compatible OOB layout callbacks; bf5xx_nand_probe() installs
+ * them when hardware ECC is used and CONFIG_MTD_NAND_BF5XX_BOOTROM_ECC
+ * is set.
+ */
+static const struct mtd_ooblayout_ops bootrom_ooblayout_ops = {
+       .ecc = bootrom_ooblayout_ecc,
+       .free = bootrom_ooblayout_free,
 };
 #endif
 
@@ -800,7 +805,7 @@ static int bf5xx_nand_probe(struct platform_device *pdev)
        /* setup hardware ECC data struct */
        if (hardware_ecc) {
 #ifdef CONFIG_MTD_NAND_BF5XX_BOOTROM_ECC
-               chip->ecc.layout = &bootrom_ecclayout;
+               mtd_set_ooblayout(mtd, &bootrom_ooblayout_ops);
 #endif
                chip->read_buf      = bf5xx_nand_dma_read_buf;
                chip->write_buf     = bf5xx_nand_dma_write_buf;
@@ -812,6 +817,7 @@ static int bf5xx_nand_probe(struct platform_device *pdev)
                chip->ecc.write_page_raw = bf5xx_nand_write_page_raw;
        } else {
                chip->ecc.mode      = NAND_ECC_SOFT;
+               chip->ecc.algo  = NAND_ECC_HAMMING;
        }
 
        /* scan hardware nand chip and setup mtd info data struct */
index e0528397306a4d92d0a2624835c42b96f91e5897..b76ad7c0144f7501e7de328c8c0175690df3a039 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 #include <linux/list.h>
@@ -601,7 +600,7 @@ static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val)
 
+/*
+ * Return the command shift for this controller: 24 when nand_version is
+ * below 0x0602, 0 otherwise.
+ * NOTE(review): presumably the bit position of the opcode field in the
+ * command register - confirm against the callers.
+ */
 static inline int brcmnand_cmd_shift(struct brcmnand_controller *ctrl)
 {
-       if (ctrl->nand_version < 0x0700)
+       if (ctrl->nand_version < 0x0602)
                return 24;
        return 0;
 }
@@ -781,127 +780,183 @@ static inline bool is_hamming_ecc(struct brcmnand_cfg *cfg)
 }
 
 /*
- * Returns a nand_ecclayout strucutre for the given layout/configuration.
- * Returns NULL on failure.
+ * Set mtd->ooblayout to the appropriate mtd_ooblayout_ops given
+ * the layout/configuration.
+ * Returns -ERRCODE on failure.
  */
-static struct nand_ecclayout *brcmnand_create_layout(int ecc_level,
-                                                    struct brcmnand_host *host)
+static int brcmnand_hamming_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
        struct brcmnand_cfg *cfg = &host->hwcfg;
-       int i, j;
-       struct nand_ecclayout *layout;
-       int req;
-       int sectors;
-       int sas;
-       int idx1, idx2;
-
-       layout = devm_kzalloc(&host->pdev->dev, sizeof(*layout), GFP_KERNEL);
-       if (!layout)
-               return NULL;
-
-       sectors = cfg->page_size / (512 << cfg->sector_size_1k);
-       sas = cfg->spare_area_size << cfg->sector_size_1k;
-
-       /* Hamming */
-       if (is_hamming_ecc(cfg)) {
-               for (i = 0, idx1 = 0, idx2 = 0; i < sectors; i++) {
-                       /* First sector of each page may have BBI */
-                       if (i == 0) {
-                               layout->oobfree[idx2].offset = i * sas + 1;
-                               /* Small-page NAND use byte 6 for BBI */
-                               if (cfg->page_size == 512)
-                                       layout->oobfree[idx2].offset--;
-                               layout->oobfree[idx2].length = 5;
-                       } else {
-                               layout->oobfree[idx2].offset = i * sas;
-                               layout->oobfree[idx2].length = 6;
-                       }
-                       idx2++;
-                       layout->eccpos[idx1++] = i * sas + 6;
-                       layout->eccpos[idx1++] = i * sas + 7;
-                       layout->eccpos[idx1++] = i * sas + 8;
-                       layout->oobfree[idx2].offset = i * sas + 9;
-                       layout->oobfree[idx2].length = 7;
-                       idx2++;
-                       /* Leave zero-terminated entry for OOBFREE */
-                       if (idx1 >= MTD_MAX_ECCPOS_ENTRIES_LARGE ||
-                                   idx2 >= MTD_MAX_OOBFREE_ENTRIES_LARGE - 1)
-                               break;
-               }
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
 
-               return layout;
-       }
+       if (section >= sectors)
+               return -ERANGE;
 
-       /*
-        * CONTROLLER_VERSION:
-        *   < v5.0: ECC_REQ = ceil(BCH_T * 13/8)
-        *  >= v5.0: ECC_REQ = ceil(BCH_T * 14/8)
-        * But we will just be conservative.
-        */
-       req = DIV_ROUND_UP(ecc_level * 14, 8);
-       if (req >= sas) {
-               dev_err(&host->pdev->dev,
-                       "error: ECC too large for OOB (ECC bytes %d, spare sector %d)\n",
-                       req, sas);
-               return NULL;
-       }
+       oobregion->offset = (section * sas) + 6;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static int brcmnand_hamming_ooblayout_free(struct mtd_info *mtd, int section,
+                                          struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
 
-       layout->eccbytes = req * sectors;
-       for (i = 0, idx1 = 0, idx2 = 0; i < sectors; i++) {
-               for (j = sas - req; j < sas && idx1 <
-                               MTD_MAX_ECCPOS_ENTRIES_LARGE; j++, idx1++)
-                       layout->eccpos[idx1] = i * sas + j;
+       if (section >= sectors * 2)
+               return -ERANGE;
+
+       oobregion->offset = (section / 2) * sas;
+
+       if (section & 1) {
+               oobregion->offset += 9;
+               oobregion->length = 7;
+       } else {
+               oobregion->length = 6;
 
                /* First sector of each page may have BBI */
-               if (i == 0) {
-                       if (cfg->page_size == 512 && (sas - req >= 6)) {
-                               /* Small-page NAND use byte 6 for BBI */
-                               layout->oobfree[idx2].offset = 0;
-                               layout->oobfree[idx2].length = 5;
-                               idx2++;
-                               if (sas - req > 6) {
-                                       layout->oobfree[idx2].offset = 6;
-                                       layout->oobfree[idx2].length =
-                                               sas - req - 6;
-                                       idx2++;
-                               }
-                       } else if (sas > req + 1) {
-                               layout->oobfree[idx2].offset = i * sas + 1;
-                               layout->oobfree[idx2].length = sas - req - 1;
-                               idx2++;
-                       }
-               } else if (sas > req) {
-                       layout->oobfree[idx2].offset = i * sas;
-                       layout->oobfree[idx2].length = sas - req;
-                       idx2++;
+               if (!section) {
+                       /*
+                        * Small-page NAND use byte 6 for BBI while large-page
+                        * NAND use byte 0.
+                        */
+                       if (cfg->page_size > 512)
+                               oobregion->offset++;
+                       oobregion->length--;
                }
-               /* Leave zero-terminated entry for OOBFREE */
-               if (idx1 >= MTD_MAX_ECCPOS_ENTRIES_LARGE ||
-                               idx2 >= MTD_MAX_OOBFREE_ENTRIES_LARGE - 1)
-                       break;
        }
 
-       return layout;
+       return 0;
+}
+
+/*
+ * OOB layout callbacks for Hamming ECC; brcmstb_choose_ecc_layout()
+ * installs them when is_hamming_ecc() is true.
+ */
+static const struct mtd_ooblayout_ops brcmnand_hamming_ooblayout_ops = {
+       .ecc = brcmnand_hamming_ooblayout_ecc,
+       .free = brcmnand_hamming_ooblayout_free,
+};
+
+/*
+ * ECC region lookup for BCH.
+ *
+ * Every sector owns 'sas' bytes of spare area and its chip->ecc.bytes ECC
+ * bytes occupy the tail of that area, i.e. the region of sector N ends at
+ * (N + 1) * sas. This mirrors brcmnand_bch_ooblayout_free_lp(), whose free
+ * bytes stop at N * sas + sas - chip->ecc.bytes.
+ *
+ * Returns 0 and fills @oobregion on success, or -ERANGE when @section is
+ * not below the number of sectors in a page.
+ */
+static int brcmnand_bch_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
+
+       if (section >= sectors)
+               return -ERANGE;
+
+       /*
+        * End of this sector's spare area minus the ECC length. The former
+        * "(section * (sas + 1)) - bytes" was negative for section 0 and
+        * drifted by one byte per sector.
+        */
+       oobregion->offset = ((section + 1) * sas) - chip->ecc.bytes;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+/*
+ * Free-bytes region lookup for BCH on large-page devices.
+ *
+ * Each sector contributes one free region: the part of its 'sas'-byte
+ * spare area that precedes the ECC bytes. In sector 0 the first byte is
+ * skipped as well.
+ *
+ * Returns 0 and fills @oobregion on success. Returns -ERANGE when @section
+ * is not below the number of sectors, or when the ECC bytes consume the
+ * whole spare area so that no free region exists at all.
+ */
+static int brcmnand_bch_ooblayout_free_lp(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+       int sectors = cfg->page_size / (512 << cfg->sector_size_1k);
+
+       if (section >= sectors)
+               return -ERANGE;
+
+       /*
+        * Nothing is left once ECC fills the spare area. Report "no such
+        * section" rather than success with an untouched @oobregion, as
+        * brcmnand_bch_ooblayout_free_sp() does for the same situation.
+        */
+       if (sas <= chip->ecc.bytes)
+               return -ERANGE;
+
+       oobregion->offset = section * sas;
+       oobregion->length = sas - chip->ecc.bytes;
+
+       /* Keep byte 0 of the first sector out of the free area. */
+       if (!section) {
+               oobregion->offset++;
+               oobregion->length--;
+       }
+
+       return 0;
 }
 
-static struct nand_ecclayout *brcmstb_choose_ecc_layout(
-               struct brcmnand_host *host)
+/*
+ * Free-bytes region lookup for BCH on small-page devices.
+ *
+ * At most two regions exist, and only if at least 6 spare bytes are not
+ * taken by ECC:
+ *   section 0: bytes 0..4
+ *   section 1: byte 6 up to the start of the ECC bytes (absent when
+ *              exactly 6 bytes are left)
+ * Everything else yields -ERANGE.
+ */
+static int brcmnand_bch_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                         struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct brcmnand_host *host = nand_get_controller_data(chip);
+       struct brcmnand_cfg *cfg = &host->hwcfg;
+       int sas = cfg->spare_area_size << cfg->sector_size_1k;
+
+       if (section > 1 || sas - chip->ecc.bytes < 6)
+               return -ERANGE;
+
+       if (!section) {
+               /* Leading five bytes, stopping short of byte 5. */
+               oobregion->offset = 0;
+               oobregion->length = 5;
+               return 0;
+       }
+
+       /* The second region would be empty. */
+       if (sas - chip->ecc.bytes == 6)
+               return -ERANGE;
+
+       oobregion->offset = 6;
+       oobregion->length = sas - chip->ecc.bytes - 6;
+
+       return 0;
+}
+
+/*
+ * BCH OOB layout callbacks that brcmstb_choose_ecc_layout() installs when
+ * the page size is not 512 bytes.
+ */
+static const struct mtd_ooblayout_ops brcmnand_bch_lp_ooblayout_ops = {
+       .ecc = brcmnand_bch_ooblayout_ecc,
+       .free = brcmnand_bch_ooblayout_free_lp,
+};
+
+/*
+ * BCH OOB layout callbacks that brcmstb_choose_ecc_layout() installs for
+ * 512-byte pages; same ECC lookup as the large-page variant.
+ */
+static const struct mtd_ooblayout_ops brcmnand_bch_sp_ooblayout_ops = {
+       .ecc = brcmnand_bch_ooblayout_ecc,
+       .free = brcmnand_bch_ooblayout_free_sp,
+};
+
+static int brcmstb_choose_ecc_layout(struct brcmnand_host *host)
 {
-       struct nand_ecclayout *layout;
        struct brcmnand_cfg *p = &host->hwcfg;
+       struct mtd_info *mtd = nand_to_mtd(&host->chip);
+       struct nand_ecc_ctrl *ecc = &host->chip.ecc;
        unsigned int ecc_level = p->ecc_level;
+       int sas = p->spare_area_size << p->sector_size_1k;
+       int sectors = p->page_size / (512 << p->sector_size_1k);
 
        if (p->sector_size_1k)
                ecc_level <<= 1;
 
-       layout = brcmnand_create_layout(ecc_level, host);
-       if (!layout) {
+       if (is_hamming_ecc(p)) {
+               ecc->bytes = 3 * sectors;
+               mtd_set_ooblayout(mtd, &brcmnand_hamming_ooblayout_ops);
+               return 0;
+       }
+
+       /*
+        * CONTROLLER_VERSION:
+        *   < v5.0: ECC_REQ = ceil(BCH_T * 13/8)
+        *  >= v5.0: ECC_REQ = ceil(BCH_T * 14/8)
+        * But we will just be conservative.
+        */
+       ecc->bytes = DIV_ROUND_UP(ecc_level * 14, 8);
+       if (p->page_size == 512)
+               mtd_set_ooblayout(mtd, &brcmnand_bch_sp_ooblayout_ops);
+       else
+               mtd_set_ooblayout(mtd, &brcmnand_bch_lp_ooblayout_ops);
+
+       if (ecc->bytes >= sas) {
                dev_err(&host->pdev->dev,
-                               "no proper ecc_layout for this NAND cfg\n");
-               return NULL;
+                       "error: ECC too large for OOB (ECC bytes %d, spare sector %d)\n",
+                       ecc->bytes, sas);
+               return -EINVAL;
        }
 
-       return layout;
+       return 0;
 }
 
 static void brcmnand_wp(struct mtd_info *mtd, int wp)
@@ -1870,9 +1925,31 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
        cfg->col_adr_bytes = 2;
        cfg->blk_adr_bytes = get_blk_adr_bytes(mtd->size, mtd->writesize);
 
+       if (chip->ecc.mode != NAND_ECC_HW) {
+               dev_err(ctrl->dev, "only HW ECC supported; selected: %d\n",
+                       chip->ecc.mode);
+               return -EINVAL;
+       }
+
+       if (chip->ecc.algo == NAND_ECC_UNKNOWN) {
+               if (chip->ecc.strength == 1 && chip->ecc.size == 512)
+                       /* Default to Hamming for 1-bit ECC, if unspecified */
+                       chip->ecc.algo = NAND_ECC_HAMMING;
+               else
+                       /* Otherwise, BCH */
+                       chip->ecc.algo = NAND_ECC_BCH;
+       }
+
+       if (chip->ecc.algo == NAND_ECC_HAMMING && (chip->ecc.strength != 1 ||
+                                                  chip->ecc.size != 512)) {
+               dev_err(ctrl->dev, "invalid Hamming params: %d bits per %d bytes\n",
+                       chip->ecc.strength, chip->ecc.size);
+               return -EINVAL;
+       }
+
        switch (chip->ecc.size) {
        case 512:
-               if (chip->ecc.strength == 1) /* Hamming */
+               if (chip->ecc.algo == NAND_ECC_HAMMING)
                        cfg->ecc_level = 15;
                else
                        cfg->ecc_level = chip->ecc.strength;
@@ -2001,8 +2078,8 @@ static int brcmnand_init_cs(struct brcmnand_host *host, struct device_node *dn)
         */
        chip->options |= NAND_USE_BOUNCE_BUFFER;
 
-       if (of_get_nand_on_flash_bbt(dn))
-               chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
+       if (chip->bbt_options & NAND_BBT_USE_FLASH)
+               chip->bbt_options |= NAND_BBT_NO_OOB;
 
        if (brcmnand_setup_dev(host))
                return -ENXIO;
@@ -2011,9 +2088,9 @@ static int brcmnand_init_cs(struct brcmnand_host *host, struct device_node *dn)
        /* only use our internal HW threshold */
        mtd->bitflip_threshold = 1;
 
-       chip->ecc.layout = brcmstb_choose_ecc_layout(host);
-       if (!chip->ecc.layout)
-               return -ENXIO;
+       ret = brcmstb_choose_ecc_layout(host);
+       if (ret)
+               return ret;
 
        if (nand_scan_tail(mtd))
                return -ENXIO;
@@ -2115,6 +2192,7 @@ static const struct of_device_id brcmnand_of_match[] = {
        { .compatible = "brcm,brcmnand-v5.0" },
        { .compatible = "brcm,brcmnand-v6.0" },
        { .compatible = "brcm,brcmnand-v6.1" },
+       { .compatible = "brcm,brcmnand-v6.2" },
        { .compatible = "brcm,brcmnand-v7.0" },
        { .compatible = "brcm,brcmnand-v7.1" },
        {},
index e553aff689878ff6ef5185155eee71506b24d550..0b0c93702abbd43c96b4e48eda148db9d36ddbee 100644 (file)
@@ -459,10 +459,37 @@ static int cafe_nand_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        return max_bitflips;
 }
 
-static struct nand_ecclayout cafe_oobinfo_2048 = {
-       .eccbytes = 14,
-       .eccpos = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
-       .oobfree = {{14, 50}}
+static int cafe_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{ /* ECC region callback: a single section covering the first chip->ecc.total OOB bytes */
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int cafe_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{ /* free region callback: a single section holding everything after the ECC bytes */
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = chip->ecc.total; /* starts right after the ECC region */
+       oobregion->length = mtd->oobsize - chip->ecc.total; /* runs to the end of the OOB area */
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops cafe_ooblayout_ops = { /* installed before the writesize check in the probe hunk, so shared by the 2048- and 512-byte cases */
+       .ecc = cafe_ooblayout_ecc,
+       .free = cafe_ooblayout_free,
 };
 
 /* Ick. The BBT code really ought to be able to work this bit out
@@ -494,12 +521,6 @@ static struct nand_bbt_descr cafe_bbt_mirror_descr_2048 = {
        .pattern = cafe_mirror_pattern_2048
 };
 
-static struct nand_ecclayout cafe_oobinfo_512 = {
-       .eccbytes = 14,
-       .eccpos = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
-       .oobfree = {{14, 2}}
-};
-
 static struct nand_bbt_descr cafe_bbt_main_descr_512 = {
        .options = NAND_BBT_LASTBLOCK | NAND_BBT_CREATE | NAND_BBT_WRITE
                | NAND_BBT_2BIT | NAND_BBT_VERSION,
@@ -743,12 +764,11 @@ static int cafe_nand_probe(struct pci_dev *pdev,
                cafe->ctl2 |= 1<<29; /* 2KiB page size */
 
        /* Set up ECC according to the type of chip we found */
+       mtd_set_ooblayout(mtd, &cafe_ooblayout_ops);
        if (mtd->writesize == 2048) {
-               cafe->nand.ecc.layout = &cafe_oobinfo_2048;
                cafe->nand.bbt_td = &cafe_bbt_main_descr_2048;
                cafe->nand.bbt_md = &cafe_bbt_mirror_descr_2048;
        } else if (mtd->writesize == 512) {
-               cafe->nand.ecc.layout = &cafe_oobinfo_512;
                cafe->nand.bbt_td = &cafe_bbt_main_descr_512;
                cafe->nand.bbt_md = &cafe_bbt_mirror_descr_512;
        } else {
index 6f97ebba52c4c1136bebf752a20578aaa8ecca0a..49133783ca5363f723cb2b9983f02e6ef9ba6b40 100644 (file)
@@ -187,6 +187,7 @@ static int __init cmx270_init(void)
        /* 15 us command delay time */
        this->chip_delay = 20;
        this->ecc.mode = NAND_ECC_SOFT;
+       this->ecc.algo = NAND_ECC_HAMMING;
 
        /* read/write functions */
        this->read_byte = cmx270_read_byte;
index 8cb821b6686efea3c22854946a5031718c598f94..cc07ba0f044deeb167772c529a635236377d983b 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 #include <linux/of_device.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 
 #include <linux/platform_data/mtd-davinci.h>
 #include <linux/platform_data/mtd-davinci-aemif.h>
@@ -54,7 +53,6 @@
  */
 struct davinci_nand_info {
        struct nand_chip        chip;
-       struct nand_ecclayout   ecclayout;
 
        struct device           *dev;
        struct clk              *clk;
@@ -480,63 +478,46 @@ static int nand_davinci_dev_ready(struct mtd_info *mtd)
  * ten ECC bytes plus the manufacturer's bad block marker byte, and
  * and not overlapping the default BBT markers.
  */
-static struct nand_ecclayout hwecc4_small = {
-       .eccbytes = 10,
-       .eccpos = { 0, 1, 2, 3, 4,
-               /* offset 5 holds the badblock marker */
-               6, 7,
-               13, 14, 15, },
-       .oobfree = {
-               {.offset = 8, .length = 5, },
-               {.offset = 16, },
-       },
-};
+static int hwecc4_ooblayout_small_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{ /* ECC region callback for the small 4-bit layout: ten ECC bytes split over sections 0..2 */
+       if (section > 2) /* NOTE(review): negative section values are not rejected and fall into the last branch - presumably never passed; confirm */
+               return -ERANGE;
+
+       if (!section) { /* bytes 0-4 */
+               oobregion->offset = 0;
+               oobregion->length = 5;
+       } else if (section == 1) { /* bytes 6-7; byte 5 is skipped */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+       } else { /* bytes 13-15 */
+               oobregion->offset = 13;
+               oobregion->length = 3;
+       }
 
-/* An ECC layout for using 4-bit ECC with large-page (2048bytes) flash,
- * storing ten ECC bytes plus the manufacturer's bad block marker byte,
- * and not overlapping the default BBT markers.
- */
-static struct nand_ecclayout hwecc4_2048 = {
-       .eccbytes = 40,
-       .eccpos = {
-               /* at the end of spare sector */
-               24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
-               34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-               44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-               },
-       .oobfree = {
-               /* 2 bytes at offset 0 hold manufacturer badblock markers */
-               {.offset = 2, .length = 22, },
-               /* 5 bytes at offset 8 hold BBT markers */
-               /* 8 bytes at offset 16 hold JFFS2 clean markers */
-       },
-};
+       return 0;
+}
 
-/*
- * An ECC layout for using 4-bit ECC with large-page (4096bytes) flash,
- * storing ten ECC bytes plus the manufacturer's bad block marker byte,
- * and not overlapping the default BBT markers.
- */
-static struct nand_ecclayout hwecc4_4096 = {
-       .eccbytes = 80,
-       .eccpos = {
-               /* at the end of spare sector */
-               48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
-               58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
-               68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
-               78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
-               98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
-               108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
-               118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-       },
-       .oobfree = {
-               /* 2 bytes at offset 0 hold manufacturer badblock markers */
-               {.offset = 2, .length = 46, },
-               /* 5 bytes at offset 8 hold BBT markers */
-               /* 8 bytes at offset 16 hold JFFS2 clean markers */
-       },
+static int hwecc4_ooblayout_small_free(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{ /* free region callback for the small 4-bit layout: two sections */
+       if (section > 1)
+               return -ERANGE;
+
+       if (!section) { /* bytes 8-12 */
+               oobregion->offset = 8;
+               oobregion->length = 5;
+       } else { /* byte 16 up to the end of the OOB area */
+               oobregion->offset = 16;
+               oobregion->length = mtd->oobsize - 16;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops hwecc4_small_ooblayout_ops = { /* installed by the probe hunk when chunks == 1 */
+       .ecc = hwecc4_ooblayout_small_ecc,
+       .free = hwecc4_ooblayout_small_free,
 };
 
 #if defined(CONFIG_OF)
@@ -577,8 +558,6 @@ static struct davinci_nand_pdata
                        "ti,davinci-mask-chipsel", &prop))
                        pdata->mask_chipsel = prop;
                if (!of_property_read_string(pdev->dev.of_node,
-                       "nand-ecc-mode", &mode) ||
-                   !of_property_read_string(pdev->dev.of_node,
                        "ti,davinci-ecc-mode", &mode)) {
                        if (!strncmp("none", mode, 4))
                                pdata->ecc_mode = NAND_ECC_NONE;
@@ -591,14 +570,11 @@ static struct davinci_nand_pdata
                        "ti,davinci-ecc-bits", &prop))
                        pdata->ecc_bits = prop;
 
-               prop = of_get_nand_bus_width(pdev->dev.of_node);
-               if (0 < prop || !of_property_read_u32(pdev->dev.of_node,
-                       "ti,davinci-nand-buswidth", &prop))
-                       if (prop == 16)
-                               pdata->options |= NAND_BUSWIDTH_16;
+               if (!of_property_read_u32(pdev->dev.of_node,
+                       "ti,davinci-nand-buswidth", &prop) && prop == 16)
+                       pdata->options |= NAND_BUSWIDTH_16;
+
                if (of_property_read_bool(pdev->dev.of_node,
-                       "nand-on-flash-bbt") ||
-                   of_property_read_bool(pdev->dev.of_node,
                        "ti,davinci-nand-use-bbt"))
                        pdata->bbt_options = NAND_BBT_USE_FLASH;
 
@@ -628,7 +604,6 @@ static int nand_davinci_probe(struct platform_device *pdev)
        void __iomem                    *base;
        int                             ret;
        uint32_t                        val;
-       nand_ecc_modes_t                ecc_mode;
        struct mtd_info                 *mtd;
 
        pdata = nand_davinci_get_pdata(pdev);
@@ -712,13 +687,53 @@ static int nand_davinci_probe(struct platform_device *pdev)
        info->chip.write_buf    = nand_davinci_write_buf;
 
        /* Use board-specific ECC config */
-       ecc_mode                = pdata->ecc_mode;
+       info->chip.ecc.mode     = pdata->ecc_mode;
 
        ret = -EINVAL;
-       switch (ecc_mode) {
+
+       info->clk = devm_clk_get(&pdev->dev, "aemif");
+       if (IS_ERR(info->clk)) {
+               ret = PTR_ERR(info->clk);
+               dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
+               return ret;
+       }
+
+       ret = clk_prepare_enable(info->clk);
+       if (ret < 0) {
+               dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n",
+                       ret);
+               goto err_clk_enable;
+       }
+
+       spin_lock_irq(&davinci_nand_lock);
+
+       /* put CSxNAND into NAND mode */
+       val = davinci_nand_readl(info, NANDFCR_OFFSET);
+       val |= BIT(info->core_chipsel);
+       davinci_nand_writel(info, NANDFCR_OFFSET, val);
+
+       spin_unlock_irq(&davinci_nand_lock);
+
+       /* Scan to find existence of the device(s) */
+       ret = nand_scan_ident(mtd, pdata->mask_chipsel ? 2 : 1, NULL);
+       if (ret < 0) {
+               dev_dbg(&pdev->dev, "no NAND chip(s) found\n");
+               goto err;
+       }
+
+       switch (info->chip.ecc.mode) {
        case NAND_ECC_NONE:
+               pdata->ecc_bits = 0;
+               break;
        case NAND_ECC_SOFT:
                pdata->ecc_bits = 0;
+               /*
+                * This driver expects Hamming based ECC when ecc_mode is set
+                * to NAND_ECC_SOFT. Force ecc.algo to NAND_ECC_HAMMING to
+                * avoid adding an extra ->ecc_algo field to
+                * davinci_nand_pdata.
+                */
+               info->chip.ecc.algo = NAND_ECC_HAMMING;
                break;
        case NAND_ECC_HW:
                if (pdata->ecc_bits == 4) {
@@ -754,37 +769,6 @@ static int nand_davinci_probe(struct platform_device *pdev)
        default:
                return -EINVAL;
        }
-       info->chip.ecc.mode = ecc_mode;
-
-       info->clk = devm_clk_get(&pdev->dev, "aemif");
-       if (IS_ERR(info->clk)) {
-               ret = PTR_ERR(info->clk);
-               dev_dbg(&pdev->dev, "unable to get AEMIF clock, err %d\n", ret);
-               return ret;
-       }
-
-       ret = clk_prepare_enable(info->clk);
-       if (ret < 0) {
-               dev_dbg(&pdev->dev, "unable to enable AEMIF clock, err %d\n",
-                       ret);
-               goto err_clk_enable;
-       }
-
-       spin_lock_irq(&davinci_nand_lock);
-
-       /* put CSxNAND into NAND mode */
-       val = davinci_nand_readl(info, NANDFCR_OFFSET);
-       val |= BIT(info->core_chipsel);
-       davinci_nand_writel(info, NANDFCR_OFFSET, val);
-
-       spin_unlock_irq(&davinci_nand_lock);
-
-       /* Scan to find existence of the device(s) */
-       ret = nand_scan_ident(mtd, pdata->mask_chipsel ? 2 : 1, NULL);
-       if (ret < 0) {
-               dev_dbg(&pdev->dev, "no NAND chip(s) found\n");
-               goto err;
-       }
 
        /* Update ECC layout if needed ... for 1-bit HW ECC, the default
         * is OK, but it allocates 6 bytes when only 3 are needed (for
@@ -805,26 +789,14 @@ static int nand_davinci_probe(struct platform_device *pdev)
                 * table marker fits in the free bytes.
                 */
                if (chunks == 1) {
-                       info->ecclayout = hwecc4_small;
-                       info->ecclayout.oobfree[1].length = mtd->oobsize - 16;
-                       goto syndrome_done;
-               }
-               if (chunks == 4) {
-                       info->ecclayout = hwecc4_2048;
-                       info->chip.ecc.mode = NAND_ECC_HW_OOB_FIRST;
-                       goto syndrome_done;
-               }
-               if (chunks == 8) {
-                       info->ecclayout = hwecc4_4096;
+                       mtd_set_ooblayout(mtd, &hwecc4_small_ooblayout_ops);
+               } else if (chunks == 4 || chunks == 8) {
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                        info->chip.ecc.mode = NAND_ECC_HW_OOB_FIRST;
-                       goto syndrome_done;
+               } else {
+                       ret = -EIO;
+                       goto err;
                }
-
-               ret = -EIO;
-               goto err;
-
-syndrome_done:
-               info->chip.ecc.layout = &info->ecclayout;
        }
 
        ret = nand_scan_tail(mtd);
@@ -850,7 +822,7 @@ err:
 
 err_clk_enable:
        spin_lock_irq(&davinci_nand_lock);
-       if (ecc_mode == NAND_ECC_HW_SYNDROME)
+       if (info->chip.ecc.mode == NAND_ECC_HW_SYNDROME)
                ecc4_busy = false;
        spin_unlock_irq(&davinci_nand_lock);
        return ret;
index 30bf5f690f787aefa13a2260a9cd3c303f362781..0476ae8776d938b09e09371d73cca855b85d4665 100644 (file)
@@ -1374,13 +1374,41 @@ static void denali_hw_init(struct denali_nand_info *denali)
  * correction
  */
 #define ECC_8BITS      14
-static struct nand_ecclayout nand_8bit_oob = {
-       .eccbytes = 14,
-};
-
 #define ECC_15BITS     26
-static struct nand_ecclayout nand_15bit_oob = {
-       .eccbytes = 26,
+
+static int denali_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{ /* ECC region callback: a single section of chip->ecc.total bytes placed after the bbtskipbytes prefix */
+       struct denali_nand_info *denali = mtd_to_denali(mtd);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = denali->bbtskipbytes;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int denali_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{ /* free region callback: a single section holding everything after the skipped bytes and the ECC bytes */
+       struct denali_nand_info *denali = mtd_to_denali(mtd);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = chip->ecc.total + denali->bbtskipbytes; /* first byte past the ECC region */
+       oobregion->length = mtd->oobsize - oobregion->offset; /* runs to the end of the OOB area */
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops denali_ooblayout_ops = { /* installed in the denali_init hunk for both the 15-bit and 8-bit ECC cases */
+       .ecc = denali_ooblayout_ecc,
+       .free = denali_ooblayout_free,
 };
 
 static uint8_t bbt_pattern[] = {'B', 'b', 't', '0' };
@@ -1561,7 +1589,6 @@ int denali_init(struct denali_nand_info *denali)
                        ECC_SECTOR_SIZE)))) {
                /* if MLC OOB size is large enough, use 15bit ECC*/
                denali->nand.ecc.strength = 15;
-               denali->nand.ecc.layout = &nand_15bit_oob;
                denali->nand.ecc.bytes = ECC_15BITS;
                iowrite32(15, denali->flash_reg + ECC_CORRECTION);
        } else if (mtd->oobsize < (denali->bbtskipbytes +
@@ -1571,20 +1598,13 @@ int denali_init(struct denali_nand_info *denali)
                goto failed_req_irq;
        } else {
                denali->nand.ecc.strength = 8;
-               denali->nand.ecc.layout = &nand_8bit_oob;
                denali->nand.ecc.bytes = ECC_8BITS;
                iowrite32(8, denali->flash_reg + ECC_CORRECTION);
        }
 
+       mtd_set_ooblayout(mtd, &denali_ooblayout_ops);
        denali->nand.ecc.bytes *= denali->devnum;
        denali->nand.ecc.strength *= denali->devnum;
-       denali->nand.ecc.layout->eccbytes *=
-               mtd->writesize / ECC_SECTOR_SIZE;
-       denali->nand.ecc.layout->oobfree[0].offset =
-               denali->bbtskipbytes + denali->nand.ecc.layout->eccbytes;
-       denali->nand.ecc.layout->oobfree[0].length =
-               mtd->oobsize - denali->nand.ecc.layout->eccbytes -
-               denali->bbtskipbytes;
 
        /*
         * Let driver know the total blocks number and how many blocks
index 547c1002941da4e81cee37576f497ca67d836b2c..a023ab9e9cbf616501cf16309f1198b67fbcf867 100644 (file)
@@ -950,20 +950,50 @@ static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat,
 
 //u_char mydatabuf[528];
 
-/* The strange out-of-order .oobfree list below is a (possibly unneeded)
- * attempt to retain compatibility.  It used to read:
- *     .oobfree = { {8, 8} }
- * Since that leaves two bytes unusable, it was changed.  But the following
- * scheme might affect existing jffs2 installs by moving the cleanmarker:
- *     .oobfree = { {6, 10} }
- * jffs2 seems to handle the above gracefully, but the current scheme seems
- * safer.  The only problem with it is that any code that parses oobfree must
- * be able to handle out-of-order segments.
- */
-static struct nand_ecclayout doc200x_oobinfo = {
-       .eccbytes = 6,
-       .eccpos = {0, 1, 2, 3, 4, 5},
-       .oobfree = {{8, 8}, {6, 2}}
+static int doc200x_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{ /* ECC region callback: a single section, OOB bytes 0-5 */
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = 6;
+
+       return 0;
+}
+
+static int doc200x_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{ /* free region callback: two sections, deliberately reported out of offset order */
+       if (section > 1)
+               return -ERANGE;
+
+       /*
+        * The strange out-of-order free bytes definition is a (possibly
+        * unneeded) attempt to retain compatibility.  It used to read:
+        *      .oobfree = { {8, 8} }
+        * Since that leaves two bytes unusable, it was changed.  But the
+        * following scheme might affect existing jffs2 installs by moving the
+        * cleanmarker:
+        *      .oobfree = { {6, 10} }
+        * jffs2 seems to handle the above gracefully, but the current scheme
+        * seems safer. The only problem with it is that any code retrieving
+        * free bytes position must be able to handle out-of-order segments.
+        */
+       if (!section) { /* bytes 8-15 come first */
+               oobregion->offset = 8;
+               oobregion->length = 8;
+       } else { /* then bytes 6-7 */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops doc200x_ooblayout_ops = { /* installed in the doc_probe hunk, replacing the doc200x_oobinfo table */
+       .ecc = doc200x_ooblayout_ecc,
+       .free = doc200x_ooblayout_free,
 };
 
 /* Find the (I)NFTL Media Header, and optionally also the mirror media header.
@@ -1537,6 +1567,7 @@ static int __init doc_probe(unsigned long physadr)
        nand->bbt_md            = nand->bbt_td + 1;
 
        mtd->owner              = THIS_MODULE;
+       mtd_set_ooblayout(mtd, &doc200x_ooblayout_ops);
 
        nand_set_controller_data(nand, doc);
        nand->select_chip       = doc200x_select_chip;
@@ -1548,7 +1579,6 @@ static int __init doc_probe(unsigned long physadr)
        nand->ecc.calculate     = doc200x_calculate_ecc;
        nand->ecc.correct       = doc200x_correct_data;
 
-       nand->ecc.layout        = &doc200x_oobinfo;
        nand->ecc.mode          = NAND_ECC_HW_SYNDROME;
        nand->ecc.size          = 512;
        nand->ecc.bytes         = 6;
index d86a60e1bbcb433a380a0718d2251cd2216a8ff9..47316998017f3e160a0eaf64783f26d2b1808ad9 100644 (file)
@@ -222,10 +222,33 @@ struct docg4_priv {
  * Bytes 8 - 14 are hw-generated ecc covering entire page + oob bytes 0 - 14.
  * Byte 15 (the last) is used by the driver as a "page written" flag.
  */
-static struct nand_ecclayout docg4_oobinfo = {
-       .eccbytes = 9,
-       .eccpos = {7, 8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {.offset = 2, .length = 5} }
+static int docg4_ooblayout_ecc(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{ /* ECC region callback: a single section, OOB bytes 7-15 (same span as the removed eccpos list) */
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = 7;
+       oobregion->length = 9;
+
+       return 0;
+}
+
+static int docg4_ooblayout_free(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{ /* free region callback: a single section, OOB bytes 2-6 */
+       if (section) /* only section 0 exists */
+               return -ERANGE;
+
+       oobregion->offset = 2;
+       oobregion->length = 5;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops docg4_ooblayout_ops = { /* installed in the init_mtd_structs hunk, replacing the docg4_oobinfo table */
+       .ecc = docg4_ooblayout_ecc,
+       .free = docg4_ooblayout_free,
 };
 
 /*
@@ -1209,6 +1232,7 @@ static void __init init_mtd_structs(struct mtd_info *mtd)
        mtd->writesize = DOCG4_PAGE_SIZE;
        mtd->erasesize = DOCG4_BLOCK_SIZE;
        mtd->oobsize = DOCG4_OOB_SIZE;
+       mtd_set_ooblayout(mtd, &docg4_ooblayout_ops);
        nand->chipsize = DOCG4_CHIP_SIZE;
        nand->chip_shift = DOCG4_CHIP_SHIFT;
        nand->bbt_erase_shift = nand->phys_erase_shift = DOCG4_ERASE_SHIFT;
@@ -1217,7 +1241,6 @@ static void __init init_mtd_structs(struct mtd_info *mtd)
        nand->pagemask = 0x3ffff;
        nand->badblockpos = NAND_LARGE_BADBLOCK_POS;
        nand->badblockbits = 8;
-       nand->ecc.layout = &docg4_oobinfo;
        nand->ecc.mode = NAND_ECC_HW_SYNDROME;
        nand->ecc.size = DOCG4_PAGE_SIZE;
        nand->ecc.prepad = 8;
index 059d5f7ec1248aeb412e186f58d8040dcc06f0f7..60a88f24c6b3279be38a5870dbb5957cfa5f4114 100644 (file)
@@ -79,32 +79,53 @@ struct fsl_elbc_fcm_ctrl {
 
 /* These map to the positions used by the FCM hardware ECC generator */
 
-/* Small Page FLASH with FMR[ECCM] = 0 */
-static struct nand_ecclayout fsl_elbc_oob_sp_eccm0 = {
-       .eccbytes = 3,
-       .eccpos = {6, 7, 8},
-       .oobfree = { {0, 5}, {9, 7} },
-};
+static int fsl_elbc_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{ /* ECC region callback: one section per ECC step, spaced 16 OOB bytes apart */
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct fsl_elbc_mtd *priv = nand_get_controller_data(chip);
 
-/* Small Page FLASH with FMR[ECCM] = 1 */
-static struct nand_ecclayout fsl_elbc_oob_sp_eccm1 = {
-       .eccbytes = 3,
-       .eccpos = {8, 9, 10},
-       .oobfree = { {0, 5}, {6, 2}, {11, 5} },
-};
+       if (section >= chip->ecc.steps) /* valid sections are 0..ecc.steps-1 */
+               return -ERANGE;
 
-/* Large Page FLASH with FMR[ECCM] = 0 */
-static struct nand_ecclayout fsl_elbc_oob_lp_eccm0 = {
-       .eccbytes = 12,
-       .eccpos = {6, 7, 8, 22, 23, 24, 38, 39, 40, 54, 55, 56},
-       .oobfree = { {1, 5}, {9, 13}, {25, 13}, {41, 13}, {57, 7} },
-};
+       oobregion->offset = (16 * section) + 6; /* byte 6 of this step's 16-byte group */
+       if (priv->fmr & FMR_ECCM) /* with FMR_ECCM set the ECC bytes start at byte 8 instead */
+               oobregion->offset += 2;
 
-/* Large Page FLASH with FMR[ECCM] = 1 */
-static struct nand_ecclayout fsl_elbc_oob_lp_eccm1 = {
-       .eccbytes = 12,
-       .eccpos = {8, 9, 10, 24, 25, 26, 40, 41, 42, 56, 57, 58},
-       .oobfree = { {1, 7}, {11, 13}, {27, 13}, {43, 13}, {59, 5} },
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int fsl_elbc_ooblayout_free(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{ /* free region callback: ecc.steps + 1 sections - one before the first ECC region, then one after each ECC region */
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct fsl_elbc_mtd *priv = nand_get_controller_data(chip);
+
+       if (section > chip->ecc.steps) /* '>' rather than '>=': sections 0..ecc.steps are all valid */
+               return -ERANGE;
+
+       if (!section) {
+               oobregion->offset = 0;
+               if (mtd->writesize > 512) /* pages larger than 512 bytes start the first free region at byte 1 */
+                       oobregion->offset++;
+               oobregion->length = (priv->fmr & FMR_ECCM) ? 7 : 5; /* NOTE(review): for a 512-byte page with FMR_ECCM set this gives {0, 7}; the removed fsl_elbc_oob_sp_eccm1 table had {0, 5}, {6, 2}, so byte 5 is now reported free - confirm intended */
+       } else {
+               oobregion->offset = (16 * section) -
+                                   ((priv->fmr & FMR_ECCM) ? 5 : 7); /* i.e. 16 * (section - 1) + 11 with FMR_ECCM set, + 9 otherwise */
+               if (section < chip->ecc.steps)
+                       oobregion->length = 13; /* ends where the next step's ECC region begins */
+               else
+                       oobregion->length = mtd->oobsize - oobregion->offset; /* last section runs to the end of the OOB area */
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsl_elbc_ooblayout_ops = { /* installed in the fsl_elbc_chip_init hunk on the NAND_ECC_HW path */
+       .ecc = fsl_elbc_ooblayout_ecc,
+       .free = fsl_elbc_ooblayout_free,
 };
 
 /*
@@ -657,8 +678,8 @@ static int fsl_elbc_chip_init_tail(struct mtd_info *mtd)
                chip->ecc.bytes);
        dev_dbg(priv->dev, "fsl_elbc_init: nand->ecc.total = %d\n",
                chip->ecc.total);
-       dev_dbg(priv->dev, "fsl_elbc_init: nand->ecc.layout = %p\n",
-               chip->ecc.layout);
+       dev_dbg(priv->dev, "fsl_elbc_init: mtd->ooblayout = %p\n",
+               mtd->ooblayout);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->flags = %08x\n", mtd->flags);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->size = %lld\n", mtd->size);
        dev_dbg(priv->dev, "fsl_elbc_init: mtd->erasesize = %d\n",
@@ -675,14 +696,6 @@ static int fsl_elbc_chip_init_tail(struct mtd_info *mtd)
        } else if (mtd->writesize == 2048) {
                priv->page_size = 1;
                setbits32(&lbc->bank[priv->bank].or, OR_FCM_PGS);
-               /* adjust ecc setup if needed */
-               if ((in_be32(&lbc->bank[priv->bank].br) & BR_DECC) ==
-                   BR_DECC_CHK_GEN) {
-                       chip->ecc.size = 512;
-                       chip->ecc.layout = (priv->fmr & FMR_ECCM) ?
-                                          &fsl_elbc_oob_lp_eccm1 :
-                                          &fsl_elbc_oob_lp_eccm0;
-               }
        } else {
                dev_err(priv->dev,
                        "fsl_elbc_init: page size %d is not supported\n",
@@ -780,15 +793,14 @@ static int fsl_elbc_chip_init(struct fsl_elbc_mtd *priv)
        if ((in_be32(&lbc->bank[priv->bank].br) & BR_DECC) ==
            BR_DECC_CHK_GEN) {
                chip->ecc.mode = NAND_ECC_HW;
-               /* put in small page settings and adjust later if needed */
-               chip->ecc.layout = (priv->fmr & FMR_ECCM) ?
-                               &fsl_elbc_oob_sp_eccm1 : &fsl_elbc_oob_sp_eccm0;
+               mtd_set_ooblayout(mtd, &fsl_elbc_ooblayout_ops);
                chip->ecc.size = 512;
                chip->ecc.bytes = 3;
                chip->ecc.strength = 1;
        } else {
                /* otherwise fall back to default software ECC */
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        return 0;
index 43f5a3a4873f247d2e01e5c24dde49b3c607b3ed..4e9e5fd8faf30c22d5fa9159fe6a979228131954 100644 (file)
@@ -67,136 +67,6 @@ struct fsl_ifc_nand_ctrl {
 
 static struct fsl_ifc_nand_ctrl *ifc_nand_ctrl;
 
-/* 512-byte page with 4-bit ECC, 8-bit */
-static struct nand_ecclayout oob_512_8bit_ecc4 = {
-       .eccbytes = 8,
-       .eccpos = {8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {0, 5}, {6, 2} },
-};
-
-/* 512-byte page with 4-bit ECC, 16-bit */
-static struct nand_ecclayout oob_512_16bit_ecc4 = {
-       .eccbytes = 8,
-       .eccpos = {8, 9, 10, 11, 12, 13, 14, 15},
-       .oobfree = { {2, 6}, },
-};
-
-/* 2048-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_2048_ecc4 = {
-       .eccbytes = 32,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-       },
-       .oobfree = { {2, 6}, {40, 24} },
-};
-
-/* 4096-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_4096_ecc4 = {
-       .eccbytes = 64,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-       },
-       .oobfree = { {2, 6}, {72, 56} },
-};
-
-/* 4096-byte page size with 8-bit ECC -- requires 218-byte OOB */
-static struct nand_ecclayout oob_4096_ecc8 = {
-       .eccbytes = 128,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-       },
-       .oobfree = { {2, 6}, {136, 82} },
-};
-
-/* 8192-byte page size with 4-bit ECC */
-static struct nand_ecclayout oob_8192_ecc4 = {
-       .eccbytes = 128,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-       },
-       .oobfree = { {2, 6}, {136, 208} },
-};
-
-/* 8192-byte page size with 8-bit ECC -- requires 218-byte OOB */
-static struct nand_ecclayout oob_8192_ecc8 = {
-       .eccbytes = 256,
-       .eccpos = {
-               8, 9, 10, 11, 12, 13, 14, 15,
-               16, 17, 18, 19, 20, 21, 22, 23,
-               24, 25, 26, 27, 28, 29, 30, 31,
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63,
-               64, 65, 66, 67, 68, 69, 70, 71,
-               72, 73, 74, 75, 76, 77, 78, 79,
-               80, 81, 82, 83, 84, 85, 86, 87,
-               88, 89, 90, 91, 92, 93, 94, 95,
-               96, 97, 98, 99, 100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127,
-               128, 129, 130, 131, 132, 133, 134, 135,
-               136, 137, 138, 139, 140, 141, 142, 143,
-               144, 145, 146, 147, 148, 149, 150, 151,
-               152, 153, 154, 155, 156, 157, 158, 159,
-               160, 161, 162, 163, 164, 165, 166, 167,
-               168, 169, 170, 171, 172, 173, 174, 175,
-               176, 177, 178, 179, 180, 181, 182, 183,
-               184, 185, 186, 187, 188, 189, 190, 191,
-               192, 193, 194, 195, 196, 197, 198, 199,
-               200, 201, 202, 203, 204, 205, 206, 207,
-               208, 209, 210, 211, 212, 213, 214, 215,
-               216, 217, 218, 219, 220, 221, 222, 223,
-               224, 225, 226, 227, 228, 229, 230, 231,
-               232, 233, 234, 235, 236, 237, 238, 239,
-               240, 241, 242, 243, 244, 245, 246, 247,
-               248, 249, 250, 251, 252, 253, 254, 255,
-               256, 257, 258, 259, 260, 261, 262, 263,
-       },
-       .oobfree = { {2, 6}, {264, 80} },
-};
-
 /*
  * Generic flash bbt descriptors
  */
@@ -223,6 +93,57 @@ static struct nand_bbt_descr bbt_mirror_descr = {
        .pattern = mirror_pattern,
 };
 
+static int fsl_ifc_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int fsl_ifc_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->writesize == 512 &&
+           !(chip->options & NAND_BUSWIDTH_16)) {
+               if (!section) {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               } else {
+                       oobregion->offset = 6;
+                       oobregion->length = 2;
+               }
+
+               return 0;
+       }
+
+       if (!section) {
+               oobregion->offset = 2;
+               oobregion->length = 6;
+       } else {
+               oobregion->offset = chip->ecc.total + 8;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsl_ifc_ooblayout_ops = {
+       .ecc = fsl_ifc_ooblayout_ecc,
+       .free = fsl_ifc_ooblayout_free,
+};
+
 /*
  * Set up the IFC hardware block and page address fields, and the ifc nand
  * structure addr field to point to the correct IFC buffer in memory
@@ -232,7 +153,7 @@ static void set_addr(struct mtd_info *mtd, int column, int page_addr, int oob)
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        int buf_num;
 
        ifc_nand_ctrl->page = page_addr;
@@ -257,18 +178,22 @@ static int is_blank(struct mtd_info *mtd, unsigned int bufnum)
        u8 __iomem *addr = priv->vbase + bufnum * (mtd->writesize * 2);
        u32 __iomem *mainarea = (u32 __iomem *)addr;
        u8 __iomem *oob = addr + mtd->writesize;
-       int i;
+       struct mtd_oob_region oobregion = { };
+       int i, section = 0;
 
        for (i = 0; i < mtd->writesize / 4; i++) {
                if (__raw_readl(&mainarea[i]) != 0xffffffff)
                        return 0;
        }
 
-       for (i = 0; i < chip->ecc.layout->eccbytes; i++) {
-               int pos = chip->ecc.layout->eccpos[i];
+       mtd_ooblayout_ecc(mtd, section++, &oobregion);
+       while (oobregion.length) {
+               for (i = 0; i < oobregion.length; i++) {
+                       if (__raw_readb(&oob[oobregion.offset + i]) != 0xff)
+                               return 0;
+               }
 
-               if (__raw_readb(&oob[pos]) != 0xff)
-                       return 0;
+               mtd_ooblayout_ecc(mtd, section++, &oobregion);
        }
 
        return 1;
@@ -295,7 +220,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd)
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
        struct fsl_ifc_nand_ctrl *nctrl = ifc_nand_ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        u32 eccstat[4];
        int i;
 
@@ -371,7 +296,7 @@ static void fsl_ifc_do_read(struct nand_chip *chip,
 {
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
        /* Program FIR/IFC_NAND_FCR0 for Small/Large page */
        if (mtd->writesize > 512) {
@@ -411,7 +336,7 @@ static void fsl_ifc_cmdfunc(struct mtd_info *mtd, unsigned int command,
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
 
        /* clear the read buffer */
        ifc_nand_ctrl->read_bytes = 0;
@@ -723,7 +648,7 @@ static int fsl_ifc_wait(struct mtd_info *mtd, struct nand_chip *chip)
 {
        struct fsl_ifc_mtd *priv = nand_get_controller_data(chip);
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc = ctrl->rregs;
        u32 nand_fsr;
 
        /* Use READ_STATUS command, but wait for the device to be ready */
@@ -808,8 +733,8 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd)
                                                        chip->ecc.bytes);
        dev_dbg(priv->dev, "%s: nand->ecc.total = %d\n", __func__,
                                                        chip->ecc.total);
-       dev_dbg(priv->dev, "%s: nand->ecc.layout = %p\n", __func__,
-                                                       chip->ecc.layout);
+       dev_dbg(priv->dev, "%s: mtd->ooblayout = %p\n", __func__,
+                                                       mtd->ooblayout);
        dev_dbg(priv->dev, "%s: mtd->flags = %08x\n", __func__, mtd->flags);
        dev_dbg(priv->dev, "%s: mtd->size = %lld\n", __func__, mtd->size);
        dev_dbg(priv->dev, "%s: mtd->erasesize = %d\n", __func__,
@@ -825,39 +750,42 @@ static int fsl_ifc_chip_init_tail(struct mtd_info *mtd)
 static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
 {
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
+       struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
        uint32_t csor = 0, csor_8k = 0, csor_ext = 0;
        uint32_t cs = priv->bank;
 
        /* Save CSOR and CSOR_ext */
-       csor = ifc_in32(&ifc->csor_cs[cs].csor);
-       csor_ext = ifc_in32(&ifc->csor_cs[cs].csor_ext);
+       csor = ifc_in32(&ifc_global->csor_cs[cs].csor);
+       csor_ext = ifc_in32(&ifc_global->csor_cs[cs].csor_ext);
 
        /* chage PageSize 8K and SpareSize 1K*/
        csor_8k = (csor & ~(CSOR_NAND_PGS_MASK)) | 0x0018C000;
-       ifc_out32(csor_8k, &ifc->csor_cs[cs].csor);
-       ifc_out32(0x0000400, &ifc->csor_cs[cs].csor_ext);
+       ifc_out32(csor_8k, &ifc_global->csor_cs[cs].csor);
+       ifc_out32(0x0000400, &ifc_global->csor_cs[cs].csor_ext);
 
        /* READID */
        ifc_out32((IFC_FIR_OP_CW0 << IFC_NAND_FIR0_OP0_SHIFT) |
-                 (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
-                 (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
-                 &ifc->ifc_nand.nand_fir0);
+                   (IFC_FIR_OP_UA  << IFC_NAND_FIR0_OP1_SHIFT) |
+                   (IFC_FIR_OP_RB << IFC_NAND_FIR0_OP2_SHIFT),
+                   &ifc_runtime->ifc_nand.nand_fir0);
        ifc_out32(NAND_CMD_READID << IFC_NAND_FCR0_CMD0_SHIFT,
-                 &ifc->ifc_nand.nand_fcr0);
-       ifc_out32(0x0, &ifc->ifc_nand.row3);
+                   &ifc_runtime->ifc_nand.nand_fcr0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.row3);
 
-       ifc_out32(0x0, &ifc->ifc_nand.nand_fbcr);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.nand_fbcr);
 
        /* Program ROW0/COL0 */
-       ifc_out32(0x0, &ifc->ifc_nand.row0);
-       ifc_out32(0x0, &ifc->ifc_nand.col0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.row0);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.col0);
 
        /* set the chip select for NAND Transaction */
-       ifc_out32(cs << IFC_NAND_CSEL_SHIFT, &ifc->ifc_nand.nand_csel);
+       ifc_out32(cs << IFC_NAND_CSEL_SHIFT,
+               &ifc_runtime->ifc_nand.nand_csel);
 
        /* start read seq */
-       ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT, &ifc->ifc_nand.nandseq_strt);
+       ifc_out32(IFC_NAND_SEQ_STRT_FIR_STRT,
+               &ifc_runtime->ifc_nand.nandseq_strt);
 
        /* wait for command complete flag or timeout */
        wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat,
@@ -867,17 +795,17 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv)
                printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n");
 
        /* Restore CSOR and CSOR_ext */
-       ifc_out32(csor, &ifc->csor_cs[cs].csor);
-       ifc_out32(csor_ext, &ifc->csor_cs[cs].csor_ext);
+       ifc_out32(csor, &ifc_global->csor_cs[cs].csor);
+       ifc_out32(csor_ext, &ifc_global->csor_cs[cs].csor_ext);
 }
 
 static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 {
        struct fsl_ifc_ctrl *ctrl = priv->ctrl;
-       struct fsl_ifc_regs __iomem *ifc = ctrl->regs;
+       struct fsl_ifc_global __iomem *ifc_global = ctrl->gregs;
+       struct fsl_ifc_runtime __iomem *ifc_runtime = ctrl->rregs;
        struct nand_chip *chip = &priv->chip;
        struct mtd_info *mtd = nand_to_mtd(&priv->chip);
-       struct nand_ecclayout *layout;
        u32 csor;
 
        /* Fill in fsl_ifc_mtd structure */
@@ -886,7 +814,8 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
 
        /* fill in nand_chip structure */
        /* set up function call table */
-       if ((ifc_in32(&ifc->cspr_cs[priv->bank].cspr)) & CSPR_PORT_SIZE_16)
+       if ((ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr))
+               & CSPR_PORT_SIZE_16)
                chip->read_byte = fsl_ifc_read_byte16;
        else
                chip->read_byte = fsl_ifc_read_byte;
@@ -900,13 +829,14 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        chip->bbt_td = &bbt_main_descr;
        chip->bbt_md = &bbt_mirror_descr;
 
-       ifc_out32(0x0, &ifc->ifc_nand.ncfgr);
+       ifc_out32(0x0, &ifc_runtime->ifc_nand.ncfgr);
 
        /* set up nand options */
        chip->bbt_options = NAND_BBT_USE_FLASH;
        chip->options = NAND_NO_SUBPAGE_WRITE;
 
-       if (ifc_in32(&ifc->cspr_cs[priv->bank].cspr) & CSPR_PORT_SIZE_16) {
+       if (ifc_in32(&ifc_global->cspr_cs[priv->bank].cspr)
+               & CSPR_PORT_SIZE_16) {
                chip->read_byte = fsl_ifc_read_byte16;
                chip->options |= NAND_BUSWIDTH_16;
        } else {
@@ -919,20 +849,11 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        chip->ecc.read_page = fsl_ifc_read_page;
        chip->ecc.write_page = fsl_ifc_write_page;
 
-       csor = ifc_in32(&ifc->csor_cs[priv->bank].csor);
-
-       /* Hardware generates ECC per 512 Bytes */
-       chip->ecc.size = 512;
-       chip->ecc.bytes = 8;
-       chip->ecc.strength = 4;
+       csor = ifc_in32(&ifc_global->csor_cs[priv->bank].csor);
 
        switch (csor & CSOR_NAND_PGS_MASK) {
        case CSOR_NAND_PGS_512:
-               if (chip->options & NAND_BUSWIDTH_16) {
-                       layout = &oob_512_16bit_ecc4;
-               } else {
-                       layout = &oob_512_8bit_ecc4;
-
+               if (!(chip->options & NAND_BUSWIDTH_16)) {
                        /* Avoid conflict with bad block marker */
                        bbt_main_descr.offs = 0;
                        bbt_mirror_descr.offs = 0;
@@ -942,35 +863,16 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
                break;
 
        case CSOR_NAND_PGS_2K:
-               layout = &oob_2048_ecc4;
                priv->bufnum_mask = 3;
                break;
 
        case CSOR_NAND_PGS_4K:
-               if ((csor & CSOR_NAND_ECC_MODE_MASK) ==
-                   CSOR_NAND_ECC_MODE_4) {
-                       layout = &oob_4096_ecc4;
-               } else {
-                       layout = &oob_4096_ecc8;
-                       chip->ecc.bytes = 16;
-                       chip->ecc.strength = 8;
-               }
-
                priv->bufnum_mask = 1;
                break;
 
        case CSOR_NAND_PGS_8K:
-               if ((csor & CSOR_NAND_ECC_MODE_MASK) ==
-                   CSOR_NAND_ECC_MODE_4) {
-                       layout = &oob_8192_ecc4;
-               } else {
-                       layout = &oob_8192_ecc8;
-                       chip->ecc.bytes = 16;
-                       chip->ecc.strength = 8;
-               }
-
                priv->bufnum_mask = 0;
-       break;
+               break;
 
        default:
                dev_err(priv->dev, "bad csor %#x: bad page size\n", csor);
@@ -980,9 +882,20 @@ static int fsl_ifc_chip_init(struct fsl_ifc_mtd *priv)
        /* Must also set CSOR_NAND_ECC_ENC_EN if DEC_EN set */
        if (csor & CSOR_NAND_ECC_DEC_EN) {
                chip->ecc.mode = NAND_ECC_HW;
-               chip->ecc.layout = layout;
+               mtd_set_ooblayout(mtd, &fsl_ifc_ooblayout_ops);
+
+               /* Hardware generates ECC per 512 Bytes */
+               chip->ecc.size = 512;
+               if ((csor & CSOR_NAND_ECC_MODE_MASK) == CSOR_NAND_ECC_MODE_4) {
+                       chip->ecc.bytes = 8;
+                       chip->ecc.strength = 4;
+               } else {
+                       chip->ecc.bytes = 16;
+                       chip->ecc.strength = 8;
+               }
        } else {
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        if (ctrl->version == FSL_IFC_VERSION_1_1_0)
@@ -1007,10 +920,10 @@ static int fsl_ifc_chip_remove(struct fsl_ifc_mtd *priv)
        return 0;
 }
 
-static int match_bank(struct fsl_ifc_regs __iomem *ifc, int bank,
+static int match_bank(struct fsl_ifc_global __iomem *ifc_global, int bank,
                      phys_addr_t addr)
 {
-       u32 cspr = ifc_in32(&ifc->cspr_cs[bank].cspr);
+       u32 cspr = ifc_in32(&ifc_global->cspr_cs[bank].cspr);
 
        if (!(cspr & CSPR_V))
                return 0;
@@ -1024,7 +937,7 @@ static DEFINE_MUTEX(fsl_ifc_nand_mutex);
 
 static int fsl_ifc_nand_probe(struct platform_device *dev)
 {
-       struct fsl_ifc_regs __iomem *ifc;
+       struct fsl_ifc_runtime __iomem *ifc;
        struct fsl_ifc_mtd *priv;
        struct resource res;
        static const char *part_probe_types[]
@@ -1034,9 +947,9 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
        struct device_node *node = dev->dev.of_node;
        struct mtd_info *mtd;
 
-       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->regs)
+       if (!fsl_ifc_ctrl_dev || !fsl_ifc_ctrl_dev->rregs)
                return -ENODEV;
-       ifc = fsl_ifc_ctrl_dev->regs;
+       ifc = fsl_ifc_ctrl_dev->rregs;
 
        /* get, allocate and map the memory resource */
        ret = of_address_to_resource(node, 0, &res);
@@ -1047,7 +960,7 @@ static int fsl_ifc_nand_probe(struct platform_device *dev)
 
        /* find which chip select it is connected to */
        for (bank = 0; bank < fsl_ifc_ctrl_dev->banks; bank++) {
-               if (match_bank(ifc, bank, res.start))
+               if (match_bank(fsl_ifc_ctrl_dev->gregs, bank, res.start))
                        break;
        }
 
index cafd12de72766f193ce409d46ac5ae9b3e9489a9..d85fa2555b6838b967ab3aee36f78360ec6ac321 100644 (file)
@@ -170,6 +170,7 @@ static int fun_chip_init(struct fsl_upm_nand *fun,
        fun->chip.read_buf = fun_read_buf;
        fun->chip.write_buf = fun_write_buf;
        fun->chip.ecc.mode = NAND_ECC_SOFT;
+       fun->chip.ecc.algo = NAND_ECC_HAMMING;
        if (fun->mchip_count > 1)
                fun->chip.select_chip = fun_select_chip;
 
index 1bdcd4fa26d4bcd9c7c98fc5767960ac6abe1da7..d4f454a4b35e7e5ed8aff7a6d5a326c86d62032f 100644 (file)
 #include <linux/amba/bus.h>
 #include <mtd/mtd-abi.h>
 
-static struct nand_ecclayout fsmc_ecc1_128_layout = {
-       .eccbytes = 24,
-       .eccpos = {2, 3, 4, 18, 19, 20, 34, 35, 36, 50, 51, 52,
-               66, 67, 68, 82, 83, 84, 98, 99, 100, 114, 115, 116},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-               {.offset = 24, .length = 8},
-               {.offset = 40, .length = 8},
-               {.offset = 56, .length = 8},
-               {.offset = 72, .length = 8},
-               {.offset = 88, .length = 8},
-               {.offset = 104, .length = 8},
-               {.offset = 120, .length = 8}
-       }
-};
+static int fsmc_ecc1_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-static struct nand_ecclayout fsmc_ecc1_64_layout = {
-       .eccbytes = 12,
-       .eccpos = {2, 3, 4, 18, 19, 20, 34, 35, 36, 50, 51, 52},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-               {.offset = 24, .length = 8},
-               {.offset = 40, .length = 8},
-               {.offset = 56, .length = 8},
-       }
-};
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
 
-static struct nand_ecclayout fsmc_ecc1_16_layout = {
-       .eccbytes = 3,
-       .eccpos = {2, 3, 4},
-       .oobfree = {
-               {.offset = 8, .length = 8},
-       }
-};
+       oobregion->offset = (section * 16) + 2;
+       oobregion->length = 3;
 
-/*
- * ECC4 layout for NAND of pagesize 8192 bytes & OOBsize 256 bytes. 13*16 bytes
- * of OB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block and 46
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_256_layout = {
-       .eccbytes = 208,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126,
-               130, 131, 132, 133, 134, 135, 136,
-               137, 138, 139, 140, 141, 142,
-               146, 147, 148, 149, 150, 151, 152,
-               153, 154, 155, 156, 157, 158,
-               162, 163, 164, 165, 166, 167, 168,
-               169, 170, 171, 172, 173, 174,
-               178, 179, 180, 181, 182, 183, 184,
-               185, 186, 187, 188, 189, 190,
-               194, 195, 196, 197, 198, 199, 200,
-               201, 202, 203, 204, 205, 206,
-               210, 211, 212, 213, 214, 215, 216,
-               217, 218, 219, 220, 221, 222,
-               226, 227, 228, 229, 230, 231, 232,
-               233, 234, 235, 236, 237, 238,
-               242, 243, 244, 245, 246, 247, 248,
-               249, 250, 251, 252, 253, 254
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 3},
-               {.offset = 143, .length = 3},
-               {.offset = 159, .length = 3},
-               {.offset = 175, .length = 3},
-               {.offset = 191, .length = 3},
-               {.offset = 207, .length = 3},
-               {.offset = 223, .length = 3},
-               {.offset = 239, .length = 3},
-               {.offset = 255, .length = 1}
-       }
-};
+       return 0;
+}
 
-/*
- * ECC4 layout for NAND of pagesize 4096 bytes & OOBsize 224 bytes. 13*8 bytes
- * of OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block & 118
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_224_layout = {
-       .eccbytes = 104,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 97}
-       }
-};
+static int fsmc_ecc1_ooblayout_free(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-/*
- * ECC4 layout for NAND of pagesize 4096 bytes & OOBsize 128 bytes. 13*8 bytes
- * of OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block & 22
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_128_layout = {
-       .eccbytes = 104,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-               66,  67,  68,  69,  70,  71,  72,
-               73,  74,  75,  76,  77,  78,
-               82,  83,  84,  85,  86,  87,  88,
-               89,  90,  91,  92,  93,  94,
-               98,  99, 100, 101, 102, 103, 104,
-               105, 106, 107, 108, 109, 110,
-               114, 115, 116, 117, 118, 119, 120,
-               121, 122, 123, 124, 125, 126
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 3},
-               {.offset = 79, .length = 3},
-               {.offset = 95, .length = 3},
-               {.offset = 111, .length = 3},
-               {.offset = 127, .length = 1}
-       }
-};
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
 
-/*
- * ECC4 layout for NAND of pagesize 2048 bytes & OOBsize 64 bytes. 13*4 bytes of
- * OOB size is reserved for ECC, Byte no. 0 & 1 reserved for bad block and 10
- * bytes are free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_64_layout = {
-       .eccbytes = 52,
-       .eccpos = {  2,   3,   4,   5,   6,   7,   8,
-               9,  10,  11,  12,  13,  14,
-               18,  19,  20,  21,  22,  23,  24,
-               25,  26,  27,  28,  29,  30,
-               34,  35,  36,  37,  38,  39,  40,
-               41,  42,  43,  44,  45,  46,
-               50,  51,  52,  53,  54,  55,  56,
-               57,  58,  59,  60,  61,  62,
-       },
-       .oobfree = {
-               {.offset = 15, .length = 3},
-               {.offset = 31, .length = 3},
-               {.offset = 47, .length = 3},
-               {.offset = 63, .length = 1},
-       }
-};
+       oobregion->offset = (section * 16) + 8;
 
-/*
- * ECC4 layout for NAND of pagesize 512 bytes & OOBsize 16 bytes. 13 bytes of
- * OOB size is reserved for ECC, Byte no. 4 & 5 reserved for bad block and One
- * byte is free for use.
- */
-static struct nand_ecclayout fsmc_ecc4_16_layout = {
-       .eccbytes = 13,
-       .eccpos = { 0,  1,  2,  3,  6,  7, 8,
-               9, 10, 11, 12, 13, 14
-       },
-       .oobfree = {
-               {.offset = 15, .length = 1},
-       }
+       if (section < chip->ecc.steps - 1)
+               oobregion->length = 8;
+       else
+               oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsmc_ecc1_ooblayout_ops = {
+       .ecc = fsmc_ecc1_ooblayout_ecc,
+       .free = fsmc_ecc1_ooblayout_free,
 };
 
 /*
@@ -250,28 +81,46 @@ static struct nand_ecclayout fsmc_ecc4_16_layout = {
  * There are 13 bytes of ecc for every 512 byte block and it has to be read
  * consecutively and immediately after the 512 byte data block for hardware to
  * generate the error bit offsets in 512 byte data.
- * Managing the ecc bytes in the following way makes it easier for software to
- * read ecc bytes consecutive to data bytes. This way is similar to
- * oobfree structure maintained already in generic nand driver
  */
-static struct fsmc_eccplace fsmc_ecc4_lp_place = {
-       .eccplace = {
-               {.offset = 2, .length = 13},
-               {.offset = 18, .length = 13},
-               {.offset = 34, .length = 13},
-               {.offset = 50, .length = 13},
-               {.offset = 66, .length = 13},
-               {.offset = 82, .length = 13},
-               {.offset = 98, .length = 13},
-               {.offset = 114, .length = 13}
-       }
-};
+static int fsmc_ecc4_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
 
-static struct fsmc_eccplace fsmc_ecc4_sp_place = {
-       .eccplace = {
-               {.offset = 0, .length = 4},
-               {.offset = 6, .length = 9}
-       }
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->length = chip->ecc.bytes;
+
+       if (!section && mtd->writesize <= 512)
+               oobregion->offset = 0;
+       else
+               oobregion->offset = (section * 16) + 2;
+
+       return 0;
+}
+
+static int fsmc_ecc4_ooblayout_free(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 15;
+
+       if (section < chip->ecc.steps - 1)
+               oobregion->length = 3;
+       else
+               oobregion->length = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops fsmc_ecc4_ooblayout_ops = {
+       .ecc = fsmc_ecc4_ooblayout_ecc,
+       .free = fsmc_ecc4_ooblayout_free,
 };
 
 /**
@@ -283,7 +132,6 @@ static struct fsmc_eccplace fsmc_ecc4_sp_place = {
  * @partitions:                Partition info for a NAND Flash.
  * @nr_partitions:     Total number of partition of a NAND flash.
  *
- * @ecc_place:         ECC placing locations in oobfree type format.
  * @bank:              Bank number for probed device.
  * @clk:               Clock structure for FSMC.
  *
@@ -303,7 +151,6 @@ struct fsmc_nand_data {
        struct mtd_partition    *partitions;
        unsigned int            nr_partitions;
 
-       struct fsmc_eccplace    *ecc_place;
        unsigned int            bank;
        struct device           *dev;
        enum access_mode        mode;
@@ -710,8 +557,6 @@ static void fsmc_write_buf_dma(struct mtd_info *mtd, const uint8_t *buf,
 static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                 uint8_t *buf, int oob_required, int page)
 {
-       struct fsmc_nand_data *host = mtd_to_fsmc(mtd);
-       struct fsmc_eccplace *ecc_place = host->ecc_place;
        int i, j, s, stat, eccsize = chip->ecc.size;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
@@ -734,9 +579,15 @@ static int fsmc_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                chip->read_buf(mtd, p, eccsize);
 
                for (j = 0; j < eccbytes;) {
-                       off = ecc_place->eccplace[group].offset;
-                       len = ecc_place->eccplace[group].length;
-                       group++;
+                       struct mtd_oob_region oobregion;
+                       int ret;
+
+                       ret = mtd_ooblayout_ecc(mtd, group++, &oobregion);
+                       if (ret)
+                               return ret;
+
+                       off = oobregion.offset;
+                       len = oobregion.length;
 
                        /*
                         * length is intentionally kept a higher multiple of 2
@@ -1084,24 +935,10 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
        if (AMBA_REV_BITS(host->pid) >= 8) {
                switch (mtd->oobsize) {
                case 16:
-                       nand->ecc.layout = &fsmc_ecc4_16_layout;
-                       host->ecc_place = &fsmc_ecc4_sp_place;
-                       break;
                case 64:
-                       nand->ecc.layout = &fsmc_ecc4_64_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 128:
-                       nand->ecc.layout = &fsmc_ecc4_128_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 224:
-                       nand->ecc.layout = &fsmc_ecc4_224_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
-                       break;
                case 256:
-                       nand->ecc.layout = &fsmc_ecc4_256_layout;
-                       host->ecc_place = &fsmc_ecc4_lp_place;
                        break;
                default:
                        dev_warn(&pdev->dev, "No oob scheme defined for oobsize %d\n",
@@ -1109,6 +946,8 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                        ret = -EINVAL;
                        goto err_probe;
                }
+
+               mtd_set_ooblayout(mtd, &fsmc_ecc4_ooblayout_ops);
        } else {
                switch (nand->ecc.mode) {
                case NAND_ECC_HW:
@@ -1119,9 +958,11 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                        nand->ecc.strength = 1;
                        break;
 
-               case NAND_ECC_SOFT_BCH:
-                       dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n");
-                       break;
+               case NAND_ECC_SOFT:
+                       if (nand->ecc.algo == NAND_ECC_BCH) {
+                               dev_info(&pdev->dev, "Using 4-bit SW BCH ECC scheme\n");
+                               break;
+                       }
 
                default:
                        dev_err(&pdev->dev, "Unsupported ECC mode!\n");
@@ -1132,16 +973,13 @@ static int __init fsmc_nand_probe(struct platform_device *pdev)
                 * Don't set layout for BCH4 SW ECC. This will be
                 * generated later in nand_bch_init() later.
                 */
-               if (nand->ecc.mode != NAND_ECC_SOFT_BCH) {
+               if (nand->ecc.mode == NAND_ECC_HW) {
                        switch (mtd->oobsize) {
                        case 16:
-                               nand->ecc.layout = &fsmc_ecc1_16_layout;
-                               break;
                        case 64:
-                               nand->ecc.layout = &fsmc_ecc1_64_layout;
-                               break;
                        case 128:
-                               nand->ecc.layout = &fsmc_ecc1_128_layout;
+                               mtd_set_ooblayout(mtd,
+                                                 &fsmc_ecc1_ooblayout_ops);
                                break;
                        default:
                                dev_warn(&pdev->dev,
index ded658fc7d73d07265e0c21abf7599e22bef89f1..6317f6836022e8cbcd144b8df10fc698524d793b 100644 (file)
@@ -273,6 +273,7 @@ static int gpio_nand_probe(struct platform_device *pdev)
        nand_set_flash_node(chip, pdev->dev.of_node);
        chip->IO_ADDR_W         = chip->IO_ADDR_R;
        chip->ecc.mode          = NAND_ECC_SOFT;
+       chip->ecc.algo          = NAND_ECC_HAMMING;
        chip->options           = gpiomtd->plat.options;
        chip->chip_delay        = gpiomtd->plat.chip_delay;
        chip->cmd_ctrl          = gpio_nand_cmd_ctrl;
index 8122c699ccf20895e400b8ae9246b39db00b1f00..6e461560c6a8eff68f7b0698c73e332c8f6bf28a 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include "gpmi-nand.h"
 #include "bch-regs.h"
 
@@ -47,10 +46,44 @@ static struct nand_bbt_descr gpmi_bbt_descr = {
  * We may change the layout if we can get the ECC info from the datasheet,
  * else we will use all the (page + OOB).
  */
-static struct nand_ecclayout gpmi_hw_ecclayout = {
-       .eccbytes = 0,
-       .eccpos = { 0, },
-       .oobfree = { {.offset = 0, .length = 0} }
+static int gpmi_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct gpmi_nand_data *gpmi = nand_get_controller_data(nand);
+
+       /*
+        * Only section 0 exists; any other index is out of range. The
+        * region starts at OOB offset 0 and its length is the BCH geometry
+        * page_size minus mtd->writesize.
+        */
+       if (section != 0)
+               return -ERANGE;
+
+       oobregion->length = gpmi->bch_geometry.page_size - mtd->writesize;
+       oobregion->offset = 0;
+
+       return 0;
+}
+
+/*
+ * Report the OOB bytes left free by the BCH page layout.
+ *
+ * Only section 0 exists; any other index yields -ERANGE. When the BCH
+ * geometry page_size is smaller than writesize + oobsize, the free region
+ * runs from the end of the BCH page to the end of the OOB area. Otherwise
+ * nothing is free and an explicit empty region (offset 0, length 0) is
+ * reported, so that callers which did not zero @oobregion beforehand never
+ * read stale values. Returns 0 in both cases.
+ */
+static int gpmi_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct gpmi_nand_data *this = nand_get_controller_data(chip);
+       struct bch_geometry *geo = &this->bch_geometry;
+
+       if (section)
+               return -ERANGE;
+
+       /* The available oob size we have. */
+       if (geo->page_size < mtd->writesize + mtd->oobsize) {
+               oobregion->offset = geo->page_size - mtd->writesize;
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       } else {
+               /* No spare bytes: do not leave the output untouched. */
+               oobregion->offset = 0;
+               oobregion->length = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * OOB layout callbacks for the GPMI BCH layout: a single ECC section and
+ * a single free section, both derived from the BCH geometry.
+ */
+static const struct mtd_ooblayout_ops gpmi_ooblayout_ops = {
+       .ecc = gpmi_ooblayout_ecc,
+       .free = gpmi_ooblayout_free,
 };
 
 static const struct gpmi_devdata gpmi_devdata_imx23 = {
@@ -141,7 +174,6 @@ static int set_geometry_by_ecc_info(struct gpmi_nand_data *this)
        struct bch_geometry *geo = &this->bch_geometry;
        struct nand_chip *chip = &this->nand;
        struct mtd_info *mtd = nand_to_mtd(chip);
-       struct nand_oobfree *of = gpmi_hw_ecclayout.oobfree;
        unsigned int block_mark_bit_offset;
 
        if (!(chip->ecc_strength_ds > 0 && chip->ecc_step_ds > 0))
@@ -229,12 +261,6 @@ static int set_geometry_by_ecc_info(struct gpmi_nand_data *this)
        geo->page_size = mtd->writesize + geo->metadata_size +
                (geo->gf_len * geo->ecc_strength * geo->ecc_chunk_count) / 8;
 
-       /* The available oob size we have. */
-       if (geo->page_size < mtd->writesize + mtd->oobsize) {
-               of->offset = geo->page_size - mtd->writesize;
-               of->length = mtd->oobsize - of->offset;
-       }
-
        geo->payload_size = mtd->writesize;
 
        geo->auxiliary_status_offset = ALIGN(geo->metadata_size, 4);
@@ -797,6 +823,7 @@ static void gpmi_free_dma_buffer(struct gpmi_nand_data *this)
 
        this->cmd_buffer        = NULL;
        this->data_buffer_dma   = NULL;
+       this->raw_buffer        = NULL;
        this->page_buffer_virt  = NULL;
        this->page_buffer_size  =  0;
 }
@@ -1037,14 +1064,87 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
        /* Loop over status bytes, accumulating ECC status. */
        status = auxiliary_virt + nfc_geo->auxiliary_status_offset;
 
+       read_page_swap_end(this, buf, nfc_geo->payload_size,
+                          this->payload_virt, this->payload_phys,
+                          nfc_geo->payload_size,
+                          payload_virt, payload_phys);
+
        for (i = 0; i < nfc_geo->ecc_chunk_count; i++, status++) {
                if ((*status == STATUS_GOOD) || (*status == STATUS_ERASED))
                        continue;
 
                if (*status == STATUS_UNCORRECTABLE) {
+                       int eccbits = nfc_geo->ecc_strength * nfc_geo->gf_len;
+                       u8 *eccbuf = this->raw_buffer;
+                       int offset, bitoffset;
+                       int eccbytes;
+                       int flips;
+
+                       /* Read ECC bytes into our internal raw_buffer */
+                       offset = nfc_geo->metadata_size * 8;
+                       offset += ((8 * nfc_geo->ecc_chunk_size) + eccbits) * (i + 1);
+                       offset -= eccbits;
+                       bitoffset = offset % 8;
+                       eccbytes = DIV_ROUND_UP(offset + eccbits, 8);
+                       offset /= 8;
+                       eccbytes -= offset;
+                       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, offset, -1);
+                       chip->read_buf(mtd, eccbuf, eccbytes);
+
+                       /*
+                        * ECC data are not byte aligned and we may have
+                        * in-band data in the first and last byte of
+                        * eccbuf. Set non-eccbits to one so that
+                        * nand_check_erased_ecc_chunk() does not count them
+                        * as bitflips.
+                        */
+                       if (bitoffset)
+                               eccbuf[0] |= GENMASK(bitoffset - 1, 0);
+
+                       bitoffset = (bitoffset + eccbits) % 8;
+                       if (bitoffset)
+                               eccbuf[eccbytes - 1] |= GENMASK(7, bitoffset);
+
+                       /*
+                        * The ECC hardware has an uncorrectable ECC status
+                        * code in case we have bitflips in an erased page. As
+                        * nothing was written into this subpage the ECC is
+                        * obviously wrong and we can not trust it. We assume
+                        * at this point that we are reading an erased page and
+                        * try to correct the bitflips in buffer up to
+                        * ecc_strength bitflips. If this is a page with random
+                        * data, we exceed this number of bitflips and have a
+                        * ECC failure. Otherwise we use the corrected buffer.
+                        */
+                       if (i == 0) {
+                               /* The first block includes metadata */
+                               flips = nand_check_erased_ecc_chunk(
+                                               buf + i * nfc_geo->ecc_chunk_size,
+                                               nfc_geo->ecc_chunk_size,
+                                               eccbuf, eccbytes,
+                                               auxiliary_virt,
+                                               nfc_geo->metadata_size,
+                                               nfc_geo->ecc_strength);
+                       } else {
+                               flips = nand_check_erased_ecc_chunk(
+                                               buf + i * nfc_geo->ecc_chunk_size,
+                                               nfc_geo->ecc_chunk_size,
+                                               eccbuf, eccbytes,
+                                               NULL, 0,
+                                               nfc_geo->ecc_strength);
+                       }
+
+                       if (flips > 0) {
+                               max_bitflips = max_t(unsigned int, max_bitflips,
+                                                    flips);
+                               mtd->ecc_stats.corrected += flips;
+                               continue;
+                       }
+
                        mtd->ecc_stats.failed++;
                        continue;
                }
+
                mtd->ecc_stats.corrected += *status;
                max_bitflips = max_t(unsigned int, max_bitflips, *status);
        }
@@ -1064,11 +1164,6 @@ static int gpmi_ecc_read_page(struct mtd_info *mtd, struct nand_chip *chip,
                chip->oob_poi[0] = ((uint8_t *) auxiliary_virt)[0];
        }
 
-       read_page_swap_end(this, buf, nfc_geo->payload_size,
-                       this->payload_virt, this->payload_phys,
-                       nfc_geo->payload_size,
-                       payload_virt, payload_phys);
-
        return max_bitflips;
 }
 
@@ -1327,18 +1422,19 @@ static int gpmi_ecc_read_oob(struct mtd_info *mtd, struct nand_chip *chip,
 static int
 gpmi_ecc_write_oob(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
-       struct nand_oobfree *of = mtd->ecclayout->oobfree;
+       struct mtd_oob_region of = { };
        int status = 0;
 
        /* Do we have available oob area? */
-       if (!of->length)
+       mtd_ooblayout_free(mtd, 0, &of);
+       if (!of.length)
                return -EPERM;
 
        if (!nand_is_slc(chip))
                return -EPERM;
 
-       chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + of->offset, page);
-       chip->write_buf(mtd, chip->oob_poi + of->offset, of->length);
+       chip->cmdfunc(mtd, NAND_CMD_SEQIN, mtd->writesize + of.offset, page);
+       chip->write_buf(mtd, chip->oob_poi + of.offset, of.length);
        chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
 
        status = chip->waitfunc(mtd, chip);
@@ -1840,6 +1936,7 @@ static void gpmi_nand_exit(struct gpmi_nand_data *this)
 static int gpmi_init_last(struct gpmi_nand_data *this)
 {
        struct nand_chip *chip = &this->nand;
+       struct mtd_info *mtd = nand_to_mtd(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        struct bch_geometry *bch_geo = &this->bch_geometry;
        int ret;
@@ -1861,7 +1958,7 @@ static int gpmi_init_last(struct gpmi_nand_data *this)
        ecc->mode       = NAND_ECC_HW;
        ecc->size       = bch_geo->ecc_chunk_size;
        ecc->strength   = bch_geo->ecc_strength;
-       ecc->layout     = &gpmi_hw_ecclayout;
+       mtd_set_ooblayout(mtd, &gpmi_ooblayout_ops);
 
        /*
         * We only enable the subpage read when:
@@ -1914,16 +2011,6 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
        /* Set up swap_block_mark, must be set before the gpmi_set_geometry() */
        this->swap_block_mark = !GPMI_IS_MX23(this);
 
-       if (of_get_nand_on_flash_bbt(this->dev->of_node)) {
-               chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
-
-               if (of_property_read_bool(this->dev->of_node,
-                                               "fsl,no-blockmark-swap"))
-                       this->swap_block_mark = false;
-       }
-       dev_dbg(this->dev, "Blockmark swapping %sabled\n",
-               this->swap_block_mark ? "en" : "dis");
-
        /*
         * Allocate a temporary DMA buffer for reading ID in the
         * nand_scan_ident().
@@ -1938,6 +2025,16 @@ static int gpmi_nand_init(struct gpmi_nand_data *this)
        if (ret)
                goto err_out;
 
+       if (chip->bbt_options & NAND_BBT_USE_FLASH) {
+               chip->bbt_options |= NAND_BBT_NO_OOB;
+
+               if (of_property_read_bool(this->dev->of_node,
+                                               "fsl,no-blockmark-swap"))
+                       this->swap_block_mark = false;
+       }
+       dev_dbg(this->dev, "Blockmark swapping %sabled\n",
+               this->swap_block_mark ? "en" : "dis");
+
        ret = gpmi_init_last(this);
        if (ret)
                goto err_out;
index 96502b624cfbd73ae4db0b679f2cda3893f61743..9432546f4cd47051e742f03699c4f6058b4915c4 100644 (file)
@@ -19,7 +19,6 @@
  * GNU General Public License for more details.
  */
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/sizes.h>
 #include <linux/clk.h>
@@ -631,8 +630,28 @@ static void hisi_nfc_host_init(struct hinfc_host *host)
        hinfc_write(host, HINFC504_INTEN_DMA, HINFC504_INTEN);
 }
 
-static struct nand_ecclayout nand_ecc_2K_16bits = {
-       .oobfree = { {2, 6} },
+/*
+ * ECC region callback. The ECC byte positions are not described yet (see
+ * the FIXME below), so every query fails with -ENOTSUPP regardless of
+ * @section and @oobregion is never written.
+ */
+static int hisi_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       /* FIXME: add ECC bytes position */
+       return -ENOTSUPP;
+}
+
+/*
+ * Free region callback: a single free region of 6 bytes at OOB offset 2.
+ * Any section other than 0 yields -ERANGE.
+ */
+static int hisi_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       switch (section) {
+       case 0:
+               oobregion->offset = 2;
+               oobregion->length = 6;
+               return 0;
+       default:
+               return -ERANGE;
+       }
+}
+
+/*
+ * OOB layout callbacks: .free exposes one fixed free region, while .ecc
+ * currently always fails with -ENOTSUPP.
+ */
+static const struct mtd_ooblayout_ops hisi_ooblayout_ops = {
+       .ecc = hisi_ooblayout_ecc,
+       .free = hisi_ooblayout_free,
 };
 
 static int hisi_nfc_ecc_probe(struct hinfc_host *host)
@@ -642,10 +661,9 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
        struct device *dev = host->dev;
        struct nand_chip *chip = &host->chip;
        struct mtd_info *mtd = nand_to_mtd(chip);
-       struct device_node *np = host->dev->of_node;
 
-       size = of_get_nand_ecc_step_size(np);
-       strength = of_get_nand_ecc_strength(np);
+       size = chip->ecc.size;
+       strength = chip->ecc.strength;
        if (size != 1024) {
                dev_err(dev, "error ecc size: %d\n", size);
                return -EINVAL;
@@ -668,7 +686,7 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
        case 16:
                ecc_bits = 6;
                if (mtd->writesize == 2048)
-                       chip->ecc.layout = &nand_ecc_2K_16bits;
+                       mtd_set_ooblayout(mtd, &hisi_ooblayout_ops);
 
                /* TODO: add more page size support */
                break;
@@ -695,7 +713,7 @@ static int hisi_nfc_ecc_probe(struct hinfc_host *host)
 
 static int hisi_nfc_probe(struct platform_device *pdev)
 {
-       int ret = 0, irq, buswidth, flag, max_chips = HINFC504_MAX_CHIP;
+       int ret = 0, irq, flag, max_chips = HINFC504_MAX_CHIP;
        struct device *dev = &pdev->dev;
        struct hinfc_host *host;
        struct nand_chip  *chip;
@@ -747,12 +765,6 @@ static int hisi_nfc_probe(struct platform_device *pdev)
        chip->read_buf          = hisi_nfc_read_buf;
        chip->chip_delay        = HINFC504_CHIP_DELAY;
 
-       chip->ecc.mode = of_get_nand_ecc_mode(np);
-
-       buswidth = of_get_nand_bus_width(np);
-       if (buswidth == 16)
-               chip->options |= NAND_BUSWIDTH_16;
-
        hisi_nfc_host_init(host);
 
        ret = devm_request_irq(dev, irq, hinfc_irq_handle, 0x0, "nandc", host);
index 673ceb2a0b44b677231e030aa8a0f23a5d412e3a..5551c36adbdf173b7df20629c36b065551f9d4ba 100644 (file)
@@ -221,7 +221,6 @@ static int jz_nand_correct_ecc_rs(struct mtd_info *mtd, uint8_t *dat,
        struct jz_nand *nand = mtd_to_jz_nand(mtd);
        int i, error_count, index;
        uint32_t reg, status, error;
-       uint32_t t;
        unsigned int timeout = 1000;
 
        for (i = 0; i < 9; ++i)
@@ -476,7 +475,7 @@ static int jz_nand_probe(struct platform_device *pdev)
        }
 
        if (pdata && pdata->ident_callback) {
-               pdata->ident_callback(pdev, chip, &pdata->partitions,
+               pdata->ident_callback(pdev, mtd, &pdata->partitions,
                                        &pdata->num_partitions);
        }
 
index 755499c6650e4e69614b4d342f9299fce9295b39..d74f4ba4a6f49b45859de1b04e0a3654ca5f2d7d 100644 (file)
@@ -287,7 +287,6 @@ static struct jz4780_bch *jz4780_bch_get(struct device_node *np)
        bch = platform_get_drvdata(pdev);
        clk_prepare_enable(bch->clk);
 
-       bch->dev = &pdev->dev;
        return bch;
 }
 
index e1c016c9d32d2caaf90d3800cf0b4e83d2d55c8c..daf3c4217f4deb034a4e952cb75a237f07c7bf43 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/gpio/consumer.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/mtd/mtd.h>
@@ -56,8 +55,6 @@ struct jz4780_nand_chip {
        struct nand_chip chip;
        struct list_head chip_list;
 
-       struct nand_ecclayout ecclayout;
-
        struct gpio_desc *busy_gpio;
        struct gpio_desc *wp_gpio;
        unsigned int reading: 1;
@@ -165,8 +162,7 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
        struct nand_chip *chip = &nand->chip;
        struct mtd_info *mtd = nand_to_mtd(chip);
        struct jz4780_nand_controller *nfc = to_jz4780_nand_controller(chip->controller);
-       struct nand_ecclayout *layout = &nand->ecclayout;
-       u32 start, i;
+       int eccbytes;
 
        chip->ecc.bytes = fls((1 + 8) * chip->ecc.size) *
                                (chip->ecc.strength / 8);
@@ -183,7 +179,6 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
                chip->ecc.correct = jz4780_nand_ecc_correct;
                /* fall through */
        case NAND_ECC_SOFT:
-       case NAND_ECC_SOFT_BCH:
                dev_info(dev, "using %s (strength %d, size %d, bytes %d)\n",
                        (nfc->bch) ? "hardware BCH" : "software ECC",
                        chip->ecc.strength, chip->ecc.size, chip->ecc.bytes);
@@ -201,23 +196,17 @@ static int jz4780_nand_init_ecc(struct jz4780_nand_chip *nand, struct device *de
                return 0;
 
        /* Generate ECC layout. ECC codes are right aligned in the OOB area. */
-       layout->eccbytes = mtd->writesize / chip->ecc.size * chip->ecc.bytes;
+       eccbytes = mtd->writesize / chip->ecc.size * chip->ecc.bytes;
 
-       if (layout->eccbytes > mtd->oobsize - 2) {
+       if (eccbytes > mtd->oobsize - 2) {
                dev_err(dev,
                        "invalid ECC config: required %d ECC bytes, but only %d are available",
-                       layout->eccbytes, mtd->oobsize - 2);
+                       eccbytes, mtd->oobsize - 2);
                return -EINVAL;
        }
 
-       start = mtd->oobsize - layout->eccbytes;
-       for (i = 0; i < layout->eccbytes; i++)
-               layout->eccpos[i] = start + i;
-
-       layout->oobfree[0].offset = 2;
-       layout->oobfree[0].length = mtd->oobsize - layout->eccbytes - 2;
+       mtd->ooblayout = &nand_ooblayout_lp_ops;
 
-       chip->ecc.layout = layout;
        return 0;
 }
 
index d8c3e7afcc0bfa74c5d0a87e580ca53a6b28e916..852388171f2033320e7cba102c3be24d312c0f03 100644 (file)
@@ -35,7 +35,6 @@
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_gpio.h>
 #include <linux/mtd/lpc32xx_mlc.h>
 #include <linux/io.h>
@@ -139,22 +138,37 @@ struct lpc32xx_nand_cfg_mlc {
        unsigned num_parts;
 };
 
-static struct nand_ecclayout lpc32xx_nand_oob = {
-       .eccbytes = 40,
-       .eccpos = { 6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-                  22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                  38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-                  54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-       .oobfree = {
-               { .offset = 0,
-                 .length = 6, },
-               { .offset = 16,
-                 .length = 6, },
-               { .offset = 32,
-                 .length = 6, },
-               { .offset = 48,
-                 .length = 6, },
-               },
+/*
+ * Report the ECC bytes of ECC step @section: the last ecc.bytes bytes of
+ * the 16-byte OOB chunk belonging to that step. Returns -ERANGE when
+ * @section is not below ecc.steps, 0 otherwise.
+ */
+static int lpc32xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->length = ecc->bytes;
+       oobregion->offset = 16 * (section + 1) - ecc->bytes;
+
+       return 0;
+}
+
+/*
+ * Report the free bytes of ECC step @section: the first (16 - ecc.bytes)
+ * bytes of the 16-byte OOB chunk belonging to that step. Returns -ERANGE
+ * when @section is not below ecc.steps, 0 otherwise.
+ */
+static int lpc32xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->length = 16 - ecc->bytes;
+       oobregion->offset = section * 16;
+
+       return 0;
+}
+
+/*
+ * OOB layout callbacks: each ECC step owns a 16-byte OOB chunk, with the
+ * free bytes first and the ECC bytes at the end of the chunk.
+ */
+static const struct mtd_ooblayout_ops lpc32xx_ooblayout_ops = {
+       .ecc = lpc32xx_ooblayout_ecc,
+       .free = lpc32xx_ooblayout_free,
 };
 
 static struct nand_bbt_descr lpc32xx_nand_bbt = {
@@ -713,6 +727,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
        nand_chip->ecc.write_oob = lpc32xx_write_oob;
        nand_chip->ecc.read_oob = lpc32xx_read_oob;
        nand_chip->ecc.strength = 4;
+       nand_chip->ecc.bytes = 10;
        nand_chip->waitfunc = lpc32xx_waitfunc;
 
        nand_chip->options = NAND_NO_SUBPAGE_WRITE;
@@ -751,7 +766,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
 
        nand_chip->ecc.mode = NAND_ECC_HW;
        nand_chip->ecc.size = 512;
-       nand_chip->ecc.layout = &lpc32xx_nand_oob;
+       mtd_set_ooblayout(mtd, &lpc32xx_ooblayout_ops);
        host->mlcsubpages = mtd->writesize / 512;
 
        /* initially clear interrupt status */
index 3b8f3735f3e86324f36b0bd4484aab2cda571fb9..8d3edc34958e7b356431c92b0b9ad43c89ac062c 100644 (file)
@@ -35,7 +35,6 @@
 #include <linux/mtd/nand_ecc.h>
 #include <linux/gpio.h>
 #include <linux/of.h>
-#include <linux/of_mtd.h>
 #include <linux/of_gpio.h>
 #include <linux/mtd/lpc32xx_slc.h>
 
  * NAND ECC Layout for small page NAND devices
  * Note: For large and huge page devices, the default layouts are used
  */
-static struct nand_ecclayout lpc32xx_nand_oob_16 = {
-       .eccbytes = 6,
-       .eccpos = {10, 11, 12, 13, 14, 15},
-       .oobfree = {
-               { .offset = 0, .length = 4 },
-               { .offset = 6, .length = 4 },
-       },
+static int lpc32xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       /* Single ECC region: 6 bytes starting at OOB offset 10. */
+       if (!section) {
+               oobregion->offset = 10;
+               oobregion->length = 6;
+               return 0;
+       }
+
+       /* There is no second ECC section. */
+       return -ERANGE;
+}
+
+/*
+ * Report one of the two free OOB regions: section 0 is 4 bytes at offset
+ * 0, the other section is 4 bytes at offset 6. Sections above 1 yield
+ * -ERANGE.
+ */
+static int lpc32xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       /* Both regions have the same length; only the start differs. */
+       oobregion->offset = section ? 6 : 0;
+       oobregion->length = 4;
+
+       return 0;
+}
+
+/*
+ * OOB layout callbacks for the small page layout: one ECC region and two
+ * free regions.
+ */
+static const struct mtd_ooblayout_ops lpc32xx_ooblayout_ops = {
+       .ecc = lpc32xx_ooblayout_ecc,
+       .free = lpc32xx_ooblayout_free,
 };
 
 static u8 bbt_pattern[] = {'B', 'b', 't', '0' };
@@ -194,7 +218,6 @@ struct lpc32xx_nand_cfg_slc {
        uint32_t rwidth;
        uint32_t rhold;
        uint32_t rsetup;
-       bool use_bbt;
        int wp_gpio;
        struct mtd_partition *parts;
        unsigned num_parts;
@@ -604,7 +627,8 @@ static int lpc32xx_nand_read_page_syndrome(struct mtd_info *mtd,
                                           int oob_required, int page)
 {
        struct lpc32xx_nand_host *host = nand_get_controller_data(chip);
-       int stat, i, status;
+       struct mtd_oob_region oobregion = { };
+       int stat, i, status, error;
        uint8_t *oobecc, tmpecc[LPC32XX_ECC_SAVE_SIZE];
 
        /* Issue read command */
@@ -620,7 +644,11 @@ static int lpc32xx_nand_read_page_syndrome(struct mtd_info *mtd,
        lpc32xx_slc_ecc_copy(tmpecc, (uint32_t *) host->ecc_buf, chip->ecc.steps);
 
        /* Pointer to ECC data retrieved from NAND spare area */
-       oobecc = chip->oob_poi + chip->ecc.layout->eccpos[0];
+       error = mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (error)
+               return error;
+
+       oobecc = chip->oob_poi + oobregion.offset;
 
        for (i = 0; i < chip->ecc.steps; i++) {
                stat = chip->ecc.correct(mtd, buf, oobecc,
@@ -666,7 +694,8 @@ static int lpc32xx_nand_write_page_syndrome(struct mtd_info *mtd,
                                            int oob_required, int page)
 {
        struct lpc32xx_nand_host *host = nand_get_controller_data(chip);
-       uint8_t *pb = chip->oob_poi + chip->ecc.layout->eccpos[0];
+       struct mtd_oob_region oobregion = { };
+       uint8_t *pb;
        int error;
 
        /* Write data, calculate ECC on outbound data */
@@ -678,6 +707,11 @@ static int lpc32xx_nand_write_page_syndrome(struct mtd_info *mtd,
         * The calculated ECC needs some manual work done to it before
         * committing it to NAND. Process the calculated ECC and place
         * the resultant values directly into the OOB buffer. */
+       error = mtd_ooblayout_ecc(mtd, 0, &oobregion);
+       if (error)
+               return error;
+
+       pb = chip->oob_poi + oobregion.offset;
        lpc32xx_slc_ecc_copy(pb, (uint32_t *)host->ecc_buf, chip->ecc.steps);
 
        /* Write ECC data to device */
@@ -747,7 +781,6 @@ static struct lpc32xx_nand_cfg_slc *lpc32xx_parse_dt(struct device *dev)
                return NULL;
        }
 
-       ncfg->use_bbt = of_get_nand_on_flash_bbt(np);
        ncfg->wp_gpio = of_get_named_gpio(np, "gpios", 0);
 
        return ncfg;
@@ -875,26 +908,22 @@ static int lpc32xx_nand_probe(struct platform_device *pdev)
         * custom BBT marker layout.
         */
        if (mtd->writesize <= 512)
-               chip->ecc.layout = &lpc32xx_nand_oob_16;
+               mtd_set_ooblayout(mtd, &lpc32xx_ooblayout_ops);
 
        /* These sizes remain the same regardless of page size */
        chip->ecc.size = 256;
        chip->ecc.bytes = LPC32XX_SLC_DEV_ECC_BYTES;
        chip->ecc.prepad = chip->ecc.postpad = 0;
 
-       /* Avoid extra scan if using BBT, setup BBT support */
-       if (host->ncfg->use_bbt) {
-               chip->bbt_options |= NAND_BBT_USE_FLASH;
-
-               /*
-                * Use a custom BBT marker setup for small page FLASH that
-                * won't interfere with the ECC layout. Large and huge page
-                * FLASH use the standard layout.
-                */
-               if (mtd->writesize <= 512) {
-                       chip->bbt_td = &bbt_smallpage_main_descr;
-                       chip->bbt_md = &bbt_smallpage_mirror_descr;
-               }
+       /*
+        * Use a custom BBT marker setup for small page FLASH that
+        * won't interfere with the ECC layout. Large and huge page
+        * FLASH use the standard layout.
+        */
+       if ((chip->bbt_options & NAND_BBT_USE_FLASH) &&
+           mtd->writesize <= 512) {
+               chip->bbt_td = &bbt_smallpage_main_descr;
+               chip->bbt_md = &bbt_smallpage_mirror_descr;
        }
 
        /*
index 5d7843ffff6ac0c85c8a5cc9a11901eb603fdd1b..7eacb2f545f50366cc8996d48d3f46aaf9ed195c 100644 (file)
@@ -710,6 +710,7 @@ static int mpc5121_nfc_probe(struct platform_device *op)
        chip->select_chip = mpc5121_nfc_select_chip;
        chip->bbt_options = NAND_BBT_USE_FLASH;
        chip->ecc.mode = NAND_ECC_SOFT;
+       chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* Support external chip-select logic on ADS5121 board */
        if (of_machine_is_compatible("fsl,mpc5121ads")) {
index 854c832597aa69c52121f7f39e777e36db9ce764..5173fadc9a4e637f01817ed040a42b72af1d7a68 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/completion.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 
 #include <asm/mach/flash.h>
 #include <linux/platform_data/mtd-mxc_nand.h>
@@ -149,7 +148,7 @@ struct mxc_nand_devtype_data {
        int (*check_int)(struct mxc_nand_host *);
        void (*irq_control)(struct mxc_nand_host *, int);
        u32 (*get_ecc_status)(struct mxc_nand_host *);
-       struct nand_ecclayout *ecclayout_512, *ecclayout_2k, *ecclayout_4k;
+       const struct mtd_ooblayout_ops *ooblayout;
        void (*select_chip)(struct mtd_info *mtd, int chip);
        int (*correct_data)(struct mtd_info *mtd, u_char *dat,
                        u_char *read_ecc, u_char *calc_ecc);
@@ -200,73 +199,6 @@ struct mxc_nand_host {
        struct mxc_nand_platform_data pdata;
 };
 
-/* OOB placement block for use with hardware ecc generation */
-static struct nand_ecclayout nandv1_hw_eccoob_smallpage = {
-       .eccbytes = 5,
-       .eccpos = {6, 7, 8, 9, 10},
-       .oobfree = {{0, 5}, {12, 4}, }
-};
-
-static struct nand_ecclayout nandv1_hw_eccoob_largepage = {
-       .eccbytes = 20,
-       .eccpos = {6, 7, 8, 9, 10, 22, 23, 24, 25, 26,
-                  38, 39, 40, 41, 42, 54, 55, 56, 57, 58},
-       .oobfree = {{2, 4}, {11, 10}, {27, 10}, {43, 10}, {59, 5}, }
-};
-
-/* OOB description for 512 byte pages with 16 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_smallpage = {
-       .eccbytes = 1 * 9,
-       .eccpos = {
-                7,  8,  9, 10, 11, 12, 13, 14, 15
-       },
-       .oobfree = {
-               {.offset = 0, .length = 5}
-       }
-};
-
-/* OOB description for 2048 byte pages with 64 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_largepage = {
-       .eccbytes = 4 * 9,
-       .eccpos = {
-                7,  8,  9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63
-       },
-       .oobfree = {
-               {.offset = 2, .length = 4},
-               {.offset = 16, .length = 7},
-               {.offset = 32, .length = 7},
-               {.offset = 48, .length = 7}
-       }
-};
-
-/* OOB description for 4096 byte pages with 128 byte OOB */
-static struct nand_ecclayout nandv2_hw_eccoob_4k = {
-       .eccbytes = 8 * 9,
-       .eccpos = {
-               7,  8,  9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63,
-               71, 72, 73, 74, 75, 76, 77, 78, 79,
-               87, 88, 89, 90, 91, 92, 93, 94, 95,
-               103, 104, 105, 106, 107, 108, 109, 110, 111,
-               119, 120, 121, 122, 123, 124, 125, 126, 127,
-       },
-       .oobfree = {
-               {.offset = 2, .length = 4},
-               {.offset = 16, .length = 7},
-               {.offset = 32, .length = 7},
-               {.offset = 48, .length = 7},
-               {.offset = 64, .length = 7},
-               {.offset = 80, .length = 7},
-               {.offset = 96, .length = 7},
-               {.offset = 112, .length = 7},
-       }
-};
-
 static const char * const part_probes[] = {
        "cmdlinepart", "RedBoot", "ofpart", NULL };
 
@@ -942,6 +874,99 @@ static void mxc_do_addr_cycle(struct mtd_info *mtd, int column, int page_addr)
        }
 }
 
+/*
+ * ECC region of ECC step @section for v1 controllers: ecc.bytes bytes
+ * located at OOB offset (@section * 16) + 6.
+ * Returns -ERANGE once @section reaches ecc.steps, 0 otherwise.
+ */
+static int mxc_v1_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->length = ecc->bytes;
+       oobregion->offset = 6 + (section * 16);
+
+       return 0;
+}
+
+/*
+ * Free OOB region @section for v1 controllers.
+ *
+ * Section 0 sits in front of the first ECC region (bytes 0-4 for pages of
+ * up to 512 bytes, bytes 2-5 otherwise). Section N > 0 starts right after
+ * the ECC bytes at ((N - 1) * 16) + 6 and runs up to the next ECC region,
+ * or to the end of the OOB for the last section (N == ecc.steps).
+ * Returns -ERANGE when @section is greater than ecc.steps.
+ */
+static int mxc_v1_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+
+       if (section > ecc->steps)
+               return -ERANGE;
+
+       if (section == 0) {
+               if (mtd->writesize > 512) {
+                       oobregion->offset = 2;
+                       oobregion->length = 4;
+               } else {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               }
+
+               return 0;
+       }
+
+       oobregion->offset = ((section - 1) * 16) + ecc->bytes + 6;
+
+       /* The last section extends to the end of the OOB area */
+       if (section >= ecc->steps)
+               oobregion->length = mtd->oobsize - oobregion->offset;
+       else
+               oobregion->length = (section * 16) + 6 - oobregion->offset;
+
+       return 0;
+}
+
+/* OOB layout callbacks referenced by the devtype data that use the v1 layout */
+static const struct mtd_ooblayout_ops mxc_v1_ooblayout_ops = {
+       .free = mxc_v1_ooblayout_free,
+       .ecc = mxc_v1_ooblayout_ecc,
+};
+
+/*
+ * ECC region of ECC step @section for the v2 layout: ecc.bytes bytes at
+ * offset 7 of the step's OOB chunk. Chunks are 16 bytes apart when
+ * ecc.bytes is 9 and 26 bytes apart otherwise.
+ * Returns -ERANGE once @section reaches ecc.steps, 0 otherwise.
+ */
+static int mxc_v2_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+       int stepsize;
+
+       if (ecc->bytes == 9)
+               stepsize = 16;
+       else
+               stepsize = 26;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->length = ecc->bytes;
+       oobregion->offset = 7 + (section * stepsize);
+
+       return 0;
+}
+
+/*
+ * mxc_v2_ooblayout_free - report free OOB region @section for the v2 layout
+ * @mtd: mtd info structure
+ * @section: index of the free region to describe
+ * @oobregion: filled with the offset/length of the region
+ *
+ * Section 0 is the free area at the start of the OOB: bytes 0-4 for pages
+ * of up to 512 bytes, bytes 2-5 otherwise. Every other section N is the
+ * first 7 bytes of the N-th OOB chunk, a chunk being 16 bytes when
+ * ecc.bytes is 9 and 26 bytes otherwise.
+ *
+ * There is exactly one free region per ECC step, so the valid sections are
+ * 0 .. ecc.steps - 1. Accepting section == ecc.steps would describe a
+ * region behind the last chunk (e.g. offset 16 of a 16-byte OOB).
+ *
+ * Returns 0 on success, -ERANGE if @section does not exist.
+ */
+static int mxc_v2_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand_chip = mtd_to_nand(mtd);
+       int stepsize = nand_chip->ecc.bytes == 9 ? 16 : 26;
+
+       if (section >= nand_chip->ecc.steps)
+               return -ERANGE;
+
+       if (!section) {
+               if (mtd->writesize <= 512) {
+                       oobregion->offset = 0;
+                       oobregion->length = 5;
+               } else {
+                       oobregion->offset = 2;
+                       oobregion->length = 4;
+               }
+       } else {
+               oobregion->offset = section * stepsize;
+               oobregion->length = 7;
+       }
+
+       return 0;
+}
+
+/* OOB layout callbacks referenced by the devtype data that use the v2 layout */
+static const struct mtd_ooblayout_ops mxc_v2_ooblayout_ops = {
+       .free = mxc_v2_ooblayout_free,
+       .ecc = mxc_v2_ooblayout_ecc,
+};
+
 /*
  * v2 and v3 type controllers can do 4bit or 8bit ecc depending
  * on how much oob the nand chip has. For 8bit ecc we need at least
@@ -959,23 +984,6 @@ static int get_eccsize(struct mtd_info *mtd)
                return 8;
 }
 
-static void ecc_8bit_layout_4k(struct nand_ecclayout *layout)
-{
-       int i, j;
-
-       layout->eccbytes = 8*18;
-       for (i = 0; i < 8; i++)
-               for (j = 0; j < 18; j++)
-                       layout->eccpos[i*18 + j] = i*26 + j + 7;
-
-       layout->oobfree[0].offset = 2;
-       layout->oobfree[0].length = 4;
-       for (i = 1; i < 8; i++) {
-               layout->oobfree[i].offset = i*26;
-               layout->oobfree[i].length = 7;
-       }
-}
-
 static void preset_v1(struct mtd_info *mtd)
 {
        struct nand_chip *nand_chip = mtd_to_nand(mtd);
@@ -1269,9 +1277,7 @@ static const struct mxc_nand_devtype_data imx21_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v1,
-       .ecclayout_512 = &nandv1_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv1_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv1_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v1_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v1,
        .irqpending_quirk = 1,
@@ -1294,9 +1300,7 @@ static const struct mxc_nand_devtype_data imx27_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v1,
-       .ecclayout_512 = &nandv1_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv1_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv1_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v1_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v1,
        .irqpending_quirk = 0,
@@ -1320,9 +1324,7 @@ static const struct mxc_nand_devtype_data imx25_nand_devtype_data = {
        .check_int = check_int_v1_v2,
        .irq_control = irq_control_v1_v2,
        .get_ecc_status = get_ecc_status_v2,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_4k,
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v2,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1346,9 +1348,7 @@ static const struct mxc_nand_devtype_data imx51_nand_devtype_data = {
        .check_int = check_int_v3,
        .irq_control = irq_control_v3,
        .get_ecc_status = get_ecc_status_v3,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1373,9 +1373,7 @@ static const struct mxc_nand_devtype_data imx53_nand_devtype_data = {
        .check_int = check_int_v3,
        .irq_control = irq_control_v3,
        .get_ecc_status = get_ecc_status_v3,
-       .ecclayout_512 = &nandv2_hw_eccoob_smallpage,
-       .ecclayout_2k = &nandv2_hw_eccoob_largepage,
-       .ecclayout_4k = &nandv2_hw_eccoob_smallpage, /* XXX: needs fix */
+       .ooblayout = &mxc_v2_ooblayout_ops,
        .select_chip = mxc_nand_select_chip_v1_v3,
        .correct_data = mxc_nand_correct_data_v2_v3,
        .irqpending_quirk = 0,
@@ -1461,25 +1459,12 @@ MODULE_DEVICE_TABLE(of, mxcnd_dt_ids);
 static int __init mxcnd_probe_dt(struct mxc_nand_host *host)
 {
        struct device_node *np = host->dev->of_node;
-       struct mxc_nand_platform_data *pdata = &host->pdata;
        const struct of_device_id *of_id =
                of_match_device(mxcnd_dt_ids, host->dev);
-       int buswidth;
 
        if (!np)
                return 1;
 
-       if (of_get_nand_ecc_mode(np) >= 0)
-               pdata->hw_ecc = 1;
-
-       pdata->flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       buswidth = of_get_nand_bus_width(np);
-       if (buswidth < 0)
-               return buswidth;
-
-       pdata->width = buswidth / 8;
-
        host->devtype_data = of_id->data;
 
        return 0;
@@ -1576,27 +1561,22 @@ static int mxcnd_probe(struct platform_device *pdev)
 
        this->select_chip = host->devtype_data->select_chip;
        this->ecc.size = 512;
-       this->ecc.layout = host->devtype_data->ecclayout_512;
+       mtd_set_ooblayout(mtd, host->devtype_data->ooblayout);
 
        if (host->pdata.hw_ecc) {
-               this->ecc.calculate = mxc_nand_calculate_ecc;
-               this->ecc.hwctl = mxc_nand_enable_hwecc;
-               this->ecc.correct = host->devtype_data->correct_data;
                this->ecc.mode = NAND_ECC_HW;
        } else {
                this->ecc.mode = NAND_ECC_SOFT;
+               this->ecc.algo = NAND_ECC_HAMMING;
        }
 
        /* NAND bus width determines access functions used by upper layer */
        if (host->pdata.width == 2)
                this->options |= NAND_BUSWIDTH_16;
 
-       if (host->pdata.flash_bbt) {
-               this->bbt_td = &bbt_main_descr;
-               this->bbt_md = &bbt_mirror_descr;
-               /* update flash based bbt */
+       /* update flash based bbt */
+       if (host->pdata.flash_bbt)
                this->bbt_options |= NAND_BBT_USE_FLASH;
-       }
 
        init_completion(&host->op_completion);
 
@@ -1637,6 +1617,26 @@ static int mxcnd_probe(struct platform_device *pdev)
                goto escan;
        }
 
+       switch (this->ecc.mode) {
+       case NAND_ECC_HW:
+               this->ecc.calculate = mxc_nand_calculate_ecc;
+               this->ecc.hwctl = mxc_nand_enable_hwecc;
+               this->ecc.correct = host->devtype_data->correct_data;
+               break;
+
+       case NAND_ECC_SOFT:
+               break;
+
+       default:
+               err = -EINVAL;
+               goto escan;
+       }
+
+       if (this->bbt_options & NAND_BBT_USE_FLASH) {
+               this->bbt_td = &bbt_main_descr;
+               this->bbt_md = &bbt_mirror_descr;
+       }
+
        /* allocate the right size buffer now */
        devm_kfree(&pdev->dev, (void *)host->data_buf);
        host->data_buf = devm_kzalloc(&pdev->dev, mtd->writesize + mtd->oobsize,
@@ -1649,12 +1649,11 @@ static int mxcnd_probe(struct platform_device *pdev)
        /* Call preset again, with correct writesize this time */
        host->devtype_data->preset(mtd);
 
-       if (mtd->writesize == 2048)
-               this->ecc.layout = host->devtype_data->ecclayout_2k;
-       else if (mtd->writesize == 4096) {
-               this->ecc.layout = host->devtype_data->ecclayout_4k;
-               if (get_eccsize(mtd) == 8)
-                       ecc_8bit_layout_4k(this->ecc.layout);
+       if (!this->ecc.bytes) {
+               if (host->eccsize == 8)
+                       this->ecc.bytes = 18;
+               else if (host->eccsize == 4)
+                       this->ecc.bytes = 9;
        }
 
        /*
index ba4f603e05375b32de7bf01f3bde05967bc0268e..0b0dc29d2af78b59ca36bb7f706b1139e065f55a 100644 (file)
 #include <linux/bitops.h>
 #include <linux/io.h>
 #include <linux/mtd/partitions.h>
-#include <linux/of_mtd.h>
+#include <linux/of.h>
+
+static int nand_get_device(struct mtd_info *mtd, int new_state);
+
+static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
+                            struct mtd_oob_ops *ops);
 
 /* Define default oob placement schemes for large and small page devices */
-static struct nand_ecclayout nand_oob_8 = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {
-               {.offset = 3,
-                .length = 2},
-               {.offset = 6,
-                .length = 2} }
-};
+/*
+ * nand_ooblayout_ecc_sp - ECC regions of the default small-page OOB layout
+ * @mtd: mtd info structure
+ * @section: index of the ECC region to describe
+ * @oobregion: filled with the offset/length of the region
+ *
+ * A 16-byte OOB stores its ECC bytes at offsets 0-3 (section 0) and from
+ * offset 6 onwards (section 1, ecc->total - 4 bytes). An 8-byte OOB only
+ * has a single region of 3 ECC bytes at offsets 0-2: reporting 4 bytes
+ * would overlap the free region starting at offset 3, and a second
+ * section would end up with a negative length.
+ *
+ * Returns 0 on success, -ERANGE if @section does not exist.
+ */
+static int nand_ooblayout_ecc_sp(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static struct nand_ecclayout nand_oob_16 = {
-       .eccbytes = 6,
-       .eccpos = {0, 1, 2, 3, 6, 7},
-       .oobfree = {
-               {.offset = 8,
-                . length = 8} }
-};
+       if (section > 1)
+               return -ERANGE;
 
-static struct nand_ecclayout nand_oob_64 = {
-       .eccbytes = 24,
-       .eccpos = {
-                  40, 41, 42, 43, 44, 45, 46, 47,
-                  48, 49, 50, 51, 52, 53, 54, 55,
-                  56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = {
-               {.offset = 2,
-                .length = 38} }
-};
+       if (!section) {
+               oobregion->offset = 0;
+               if (mtd->oobsize == 16)
+                       oobregion->length = 4;
+               else
+                       oobregion->length = 3;
+       } else {
+               /* An 8-byte OOB has no second ECC region */
+               if (mtd->oobsize == 8)
+                       return -ERANGE;
+
+               oobregion->offset = 6;
+               oobregion->length = ecc->total - 4;
+       }
+
+       return 0;
+}
+
+/*
+ * Free OOB regions of the default small-page layout.
+ *
+ * A 16-byte OOB exposes a single region: 8 bytes at offset 8. Any other
+ * OOB size exposes two 2-byte regions, at offsets 3 and 6.
+ * Returns 0 on success, -ERANGE if @section does not exist.
+ */
+static int nand_ooblayout_free_sp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       if (mtd->oobsize != 16) {
+               oobregion->offset = section ? 6 : 3;
+               oobregion->length = 2;
+
+               return 0;
+       }
+
+       if (section != 0)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = 8;
+
+       return 0;
+}
 
-static struct nand_ecclayout nand_oob_128 = {
-       .eccbytes = 48,
-       .eccpos = {
-                  80, 81, 82, 83, 84, 85, 86, 87,
-                  88, 89, 90, 91, 92, 93, 94, 95,
-                  96, 97, 98, 99, 100, 101, 102, 103,
-                  104, 105, 106, 107, 108, 109, 110, 111,
-                  112, 113, 114, 115, 116, 117, 118, 119,
-                  120, 121, 122, 123, 124, 125, 126, 127},
-       .oobfree = {
-               {.offset = 2,
-                .length = 78} }
+/* Exported OOB layout callbacks implementing the default small-page scheme */
+const struct mtd_ooblayout_ops nand_ooblayout_sp_ops = {
+       .ecc = nand_ooblayout_ecc_sp,
+       .free = nand_ooblayout_free_sp,
 };
+EXPORT_SYMBOL_GPL(nand_ooblayout_sp_ops);
 
-static int nand_get_device(struct mtd_info *mtd, int new_state);
+/*
+ * ECC region of the default large-page OOB layout: a single region
+ * (section 0) made of the last ecc->total bytes of the OOB area.
+ * Returns 0 on success, -ERANGE for any non-zero @section.
+ */
+static int nand_ooblayout_ecc_lp(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-static int nand_do_write_oob(struct mtd_info *mtd, loff_t to,
-                            struct mtd_oob_ops *ops);
+       if (section)
+               return -ERANGE;
+
+       /* The region ends exactly at the end of the OOB */
+       oobregion->length = ecc->total;
+       oobregion->offset = mtd->oobsize - oobregion->length;
+
+       return 0;
+}
+
+/*
+ * Free region of the default large-page OOB layout: a single region
+ * (section 0) starting at offset 2 and covering everything up to the
+ * ecc->total bytes at the end of the OOB.
+ * Returns 0 on success, -ERANGE for any non-zero @section.
+ */
+static int nand_ooblayout_free_lp(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       const struct nand_ecc_ctrl *ecc = &mtd_to_nand(mtd)->ecc;
+
+       if (section != 0)
+               return -ERANGE;
+
+       oobregion->offset = 2;
+       oobregion->length = mtd->oobsize - ecc->total - 2;
+
+       return 0;
+}
+
+/* Exported OOB layout callbacks implementing the default large-page scheme */
+const struct mtd_ooblayout_ops nand_ooblayout_lp_ops = {
+       .free = nand_ooblayout_free_lp,
+       .ecc = nand_ooblayout_ecc_lp,
+};
+EXPORT_SYMBOL_GPL(nand_ooblayout_lp_ops);
 
 static int check_offs_len(struct mtd_info *mtd,
                                        loff_t ofs, uint64_t len)
@@ -1279,13 +1321,12 @@ static int nand_read_page_raw_syndrome(struct mtd_info *mtd,
 static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
                                uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        unsigned int max_bitflips = 0;
 
        chip->ecc.read_page_raw(mtd, chip, buf, 1, page);
@@ -1293,8 +1334,10 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        eccsteps = chip->ecc.steps;
        p = buf;
@@ -1326,14 +1369,14 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
                        uint32_t data_offs, uint32_t readlen, uint8_t *bufpoi,
                        int page)
 {
-       int start_step, end_step, num_steps;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
+       int start_step, end_step, num_steps, ret;
        uint8_t *p;
        int data_col_addr, i, gaps = 0;
        int datafrag_len, eccfrag_len, aligned_len, aligned_pos;
        int busw = (chip->options & NAND_BUSWIDTH_16) ? 2 : 1;
-       int index;
+       int index, section = 0;
        unsigned int max_bitflips = 0;
+       struct mtd_oob_region oobregion = { };
 
        /* Column address within the page aligned to ECC size (256bytes) */
        start_step = data_offs / chip->ecc.size;
@@ -1361,12 +1404,13 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
         * The performance is faster if we position offsets according to
         * ecc.pos. Let's make sure that there are no gaps in ECC positions.
         */
-       for (i = 0; i < eccfrag_len - 1; i++) {
-               if (eccpos[i + index] + 1 != eccpos[i + index + 1]) {
-                       gaps = 1;
-                       break;
-               }
-       }
+       ret = mtd_ooblayout_find_eccregion(mtd, index, &section, &oobregion);
+       if (ret)
+               return ret;
+
+       if (oobregion.length < eccfrag_len)
+               gaps = 1;
+
        if (gaps) {
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT, mtd->writesize, -1);
                chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1375,20 +1419,23 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
                 * Send the command to read the particular ECC bytes take care
                 * about buswidth alignment in read_buf.
                 */
-               aligned_pos = eccpos[index] & ~(busw - 1);
+               aligned_pos = oobregion.offset & ~(busw - 1);
                aligned_len = eccfrag_len;
-               if (eccpos[index] & (busw - 1))
+               if (oobregion.offset & (busw - 1))
                        aligned_len++;
-               if (eccpos[index + (num_steps * chip->ecc.bytes)] & (busw - 1))
+               if ((oobregion.offset + (num_steps * chip->ecc.bytes)) &
+                   (busw - 1))
                        aligned_len++;
 
                chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
-                                       mtd->writesize + aligned_pos, -1);
+                             mtd->writesize + aligned_pos, -1);
                chip->read_buf(mtd, &chip->oob_poi[aligned_pos], aligned_len);
        }
 
-       for (i = 0; i < eccfrag_len; i++)
-               chip->buffers->ecccode[i] = chip->oob_poi[eccpos[i + index]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, chip->buffers->ecccode,
+                                        chip->oob_poi, index, eccfrag_len);
+       if (ret)
+               return ret;
 
        p = bufpoi + data_col_addr;
        for (i = 0; i < eccfrag_len ; i += chip->ecc.bytes, p += chip->ecc.size) {
@@ -1429,13 +1476,12 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
 static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        unsigned int max_bitflips = 0;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
@@ -1445,8 +1491,10 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
        }
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        eccsteps = chip->ecc.steps;
        p = buf;
@@ -1491,12 +1539,11 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
 static int nand_read_page_hwecc_oob_first(struct mtd_info *mtd,
        struct nand_chip *chip, uint8_t *buf, int oob_required, int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *p = buf;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        unsigned int max_bitflips = 0;
 
@@ -1505,8 +1552,10 @@ static int nand_read_page_hwecc_oob_first(struct mtd_info *mtd,
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
        chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               ecc_code[i] = chip->oob_poi[eccpos[i]];
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
                int stat;
@@ -1607,14 +1656,17 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 
 /**
  * nand_transfer_oob - [INTERN] Transfer oob to client buffer
- * @chip: nand chip structure
+ * @mtd: mtd info structure
  * @oob: oob destination address
  * @ops: oob ops structure
  * @len: size of oob to transfer
  */
-static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
+static uint8_t *nand_transfer_oob(struct mtd_info *mtd, uint8_t *oob,
                                  struct mtd_oob_ops *ops, size_t len)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int ret;
+
        switch (ops->mode) {
 
        case MTD_OPS_PLACE_OOB:
@@ -1622,31 +1674,12 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob,
                memcpy(oob, chip->oob_poi + ops->ooboffs, len);
                return oob + len;
 
-       case MTD_OPS_AUTO_OOB: {
-               struct nand_oobfree *free = chip->ecc.layout->oobfree;
-               uint32_t boffs = 0, roffs = ops->ooboffs;
-               size_t bytes = 0;
-
-               for (; free->length && len; free++, len -= bytes) {
-                       /* Read request not from offset 0? */
-                       if (unlikely(roffs)) {
-                               if (roffs >= free->length) {
-                                       roffs -= free->length;
-                                       continue;
-                               }
-                               boffs = free->offset + roffs;
-                               bytes = min_t(size_t, len,
-                                             (free->length - roffs));
-                               roffs = 0;
-                       } else {
-                               bytes = min_t(size_t, len, free->length);
-                               boffs = free->offset;
-                       }
-                       memcpy(oob, chip->oob_poi + boffs, bytes);
-                       oob += bytes;
-               }
-               return oob;
-       }
+       case MTD_OPS_AUTO_OOB:
+               ret = mtd_ooblayout_get_databytes(mtd, oob, chip->oob_poi,
+                                                 ops->ooboffs, len);
+               BUG_ON(ret);
+               return oob + len;
+
        default:
                BUG();
        }
@@ -1780,7 +1813,7 @@ read_retry:
                                int toread = min(oobreadlen, max_oobsize);
 
                                if (toread) {
-                                       oob = nand_transfer_oob(chip,
+                                       oob = nand_transfer_oob(mtd,
                                                oob, ops, toread);
                                        oobreadlen -= toread;
                                }
@@ -1893,13 +1926,13 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-                            int page)
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
+       /* Send NAND_CMD_READOOB for @page, then read mtd->oobsize bytes into chip->oob_poi */
        chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
        chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
        return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_std);
 
 /**
  * nand_read_oob_syndrome - [REPLACEABLE] OOB data read function for HW ECC
@@ -1908,8 +1941,8 @@ static int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to read
  */
-static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-                                 int page)
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                          int page)
 {
        int length = mtd->oobsize;
        int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
@@ -1937,6 +1970,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 
        return 0;
 }
+EXPORT_SYMBOL(nand_read_oob_syndrome);
 
 /**
  * nand_write_oob_std - [REPLACEABLE] the most common OOB data write function
@@ -1944,8 +1978,7 @@ static int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
-                             int page)
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page)
 {
        int status = 0;
        const uint8_t *buf = chip->oob_poi;
@@ -1960,6 +1993,7 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
 
        return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_std);
 
 /**
  * nand_write_oob_syndrome - [REPLACEABLE] OOB data write function for HW ECC
@@ -1968,8 +2002,8 @@ static int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
  * @chip: nand chip info structure
  * @page: page number to write
  */
-static int nand_write_oob_syndrome(struct mtd_info *mtd,
-                                  struct nand_chip *chip, int page)
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                           int page)
 {
        int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
        int eccsize = chip->ecc.size, length = mtd->oobsize;
@@ -2019,6 +2053,7 @@ static int nand_write_oob_syndrome(struct mtd_info *mtd,
 
        return status & NAND_STATUS_FAIL ? -EIO : 0;
 }
+EXPORT_SYMBOL(nand_write_oob_syndrome);
 
 /**
  * nand_do_read_oob - [INTERN] NAND read out-of-band
@@ -2078,7 +2113,7 @@ static int nand_do_read_oob(struct mtd_info *mtd, loff_t from,
                        break;
 
                len = min(len, readlen);
-               buf = nand_transfer_oob(chip, buf, ops, len);
+               buf = nand_transfer_oob(mtd, buf, ops, len);
 
                if (chip->options & NAND_NEED_READRDY) {
                        /* Apply delay or wait for ready/busy pin */
@@ -2237,19 +2272,20 @@ static int nand_write_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
                                 const uint8_t *buf, int oob_required,
                                 int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        const uint8_t *p = buf;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        /* Software ECC calculation */
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        return chip->ecc.write_page_raw(mtd, chip, buf, 1, page);
 }
@@ -2266,12 +2302,11 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                                  const uint8_t *buf, int oob_required,
                                  int page)
 {
-       int i, eccsize = chip->ecc.size;
+       int i, eccsize = chip->ecc.size, ret;
        int eccbytes = chip->ecc.bytes;
        int eccsteps = chip->ecc.steps;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        const uint8_t *p = buf;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
                chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
@@ -2279,8 +2314,10 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                chip->ecc.calculate(mtd, p, &ecc_calc[i]);
        }
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
 
@@ -2308,11 +2345,10 @@ static int nand_write_subpage_hwecc(struct mtd_info *mtd,
        int ecc_size      = chip->ecc.size;
        int ecc_bytes     = chip->ecc.bytes;
        int ecc_steps     = chip->ecc.steps;
-       uint32_t *eccpos  = chip->ecc.layout->eccpos;
        uint32_t start_step = offset / ecc_size;
        uint32_t end_step   = (offset + data_len - 1) / ecc_size;
        int oob_bytes       = mtd->oobsize / ecc_steps;
-       int step, i;
+       int step, ret;
 
        for (step = 0; step < ecc_steps; step++) {
                /* configure controller for WRITE access */
@@ -2340,8 +2376,10 @@ static int nand_write_subpage_hwecc(struct mtd_info *mtd,
        /* copy calculated ECC for whole page to chip->buffer->oob */
        /* this include masked-value(0xFF) for unwritten subpages */
        ecc_calc = chip->buffers->ecccalc;
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        /* write OOB buffer to NAND device */
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -2478,6 +2516,7 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len,
                              struct mtd_oob_ops *ops)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
+       int ret;
 
        /*
         * Initialise to all 0xFF, to avoid the possibility of left over OOB
@@ -2492,31 +2531,12 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len,
                memcpy(chip->oob_poi + ops->ooboffs, oob, len);
                return oob + len;
 
-       case MTD_OPS_AUTO_OOB: {
-               struct nand_oobfree *free = chip->ecc.layout->oobfree;
-               uint32_t boffs = 0, woffs = ops->ooboffs;
-               size_t bytes = 0;
-
-               for (; free->length && len; free++, len -= bytes) {
-                       /* Write request not from offset 0? */
-                       if (unlikely(woffs)) {
-                               if (woffs >= free->length) {
-                                       woffs -= free->length;
-                                       continue;
-                               }
-                               boffs = free->offset + woffs;
-                               bytes = min_t(size_t, len,
-                                             (free->length - woffs));
-                               woffs = 0;
-                       } else {
-                               bytes = min_t(size_t, len, free->length);
-                               boffs = free->offset;
-                       }
-                       memcpy(chip->oob_poi + boffs, oob, bytes);
-                       oob += bytes;
-               }
-               return oob;
-       }
+       case MTD_OPS_AUTO_OOB:
+               ret = mtd_ooblayout_set_databytes(mtd, oob, chip->oob_poi,
+                                                 ops->ooboffs, len);
+               BUG_ON(ret);
+               return oob + len;
+
        default:
                BUG();
        }
@@ -3951,10 +3971,115 @@ ident_done:
        return type;
 }
 
+static const char * const nand_ecc_modes[] = {
+       [NAND_ECC_NONE]         = "none",
+       [NAND_ECC_SOFT]         = "soft",
+       [NAND_ECC_HW]           = "hw",
+       [NAND_ECC_HW_SYNDROME]  = "hw_syndrome",
+       [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first",
+};
+
+static int of_get_nand_ecc_mode(struct device_node *np)
+{
+       const char *pm;
+       int err, i;
+
+       err = of_property_read_string(np, "nand-ecc-mode", &pm);
+       if (err < 0)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
+               if (!strcasecmp(pm, nand_ecc_modes[i]))
+                       return i;
+
+       /*
+        * For backward compatibility we support a few obsolete values that no
+        * longer have their own entry in nand_ecc_modes_t (they were merged
+        * with other enums).
+        */
+       if (!strcasecmp(pm, "soft_bch"))
+               return NAND_ECC_SOFT;
+
+       return -ENODEV;
+}
+
+static const char * const nand_ecc_algos[] = {
+       [NAND_ECC_HAMMING]      = "hamming",
+       [NAND_ECC_BCH]          = "bch",
+};
+
+static int of_get_nand_ecc_algo(struct device_node *np)
+{
+       const char *pm;
+       int err, i;
+
+       err = of_property_read_string(np, "nand-ecc-algo", &pm);
+       if (!err) {
+               for (i = NAND_ECC_HAMMING; i < ARRAY_SIZE(nand_ecc_algos); i++)
+                       if (!strcasecmp(pm, nand_ecc_algos[i]))
+                               return i;
+               return -ENODEV;
+       }
+
+       /*
+        * For backward compatibility we also read "nand-ecc-mode", checking
+        * for the obsolete values that used to specify the ECC algorithm.
+        */
+       err = of_property_read_string(np, "nand-ecc-mode", &pm);
+       if (err < 0)
+               return err;
+
+       if (!strcasecmp(pm, "soft"))
+               return NAND_ECC_HAMMING;
+       else if (!strcasecmp(pm, "soft_bch"))
+               return NAND_ECC_BCH;
+
+       return -ENODEV;
+}
+
+static int of_get_nand_ecc_step_size(struct device_node *np)
+{
+       int ret;
+       u32 val;
+
+       ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
+       return ret ? ret : val;
+}
+
+static int of_get_nand_ecc_strength(struct device_node *np)
+{
+       int ret;
+       u32 val;
+
+       ret = of_property_read_u32(np, "nand-ecc-strength", &val);
+       return ret ? ret : val;
+}
+
+static int of_get_nand_bus_width(struct device_node *np)
+{
+       u32 val;
+
+       if (of_property_read_u32(np, "nand-bus-width", &val))
+               return 8;
+
+       switch (val) {
+       case 8:
+       case 16:
+               return val;
+       default:
+               return -EIO;
+       }
+}
+
+static bool of_get_nand_on_flash_bbt(struct device_node *np)
+{
+       return of_property_read_bool(np, "nand-on-flash-bbt");
+}
+
 static int nand_dt_init(struct nand_chip *chip)
 {
        struct device_node *dn = nand_get_flash_node(chip);
-       int ecc_mode, ecc_strength, ecc_step;
+       int ecc_mode, ecc_algo, ecc_strength, ecc_step;
 
        if (!dn)
                return 0;
@@ -3966,6 +4091,7 @@ static int nand_dt_init(struct nand_chip *chip)
                chip->bbt_options |= NAND_BBT_USE_FLASH;
 
        ecc_mode = of_get_nand_ecc_mode(dn);
+       ecc_algo = of_get_nand_ecc_algo(dn);
        ecc_strength = of_get_nand_ecc_strength(dn);
        ecc_step = of_get_nand_ecc_step_size(dn);
 
@@ -3978,6 +4104,9 @@ static int nand_dt_init(struct nand_chip *chip)
        if (ecc_mode >= 0)
                chip->ecc.mode = ecc_mode;
 
+       if (ecc_algo >= 0)
+               chip->ecc.algo = ecc_algo;
+
        if (ecc_strength >= 0)
                chip->ecc.strength = ecc_strength;
 
@@ -4054,6 +4183,82 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 }
 EXPORT_SYMBOL(nand_scan_ident);
 
+static int nand_set_ecc_soft_ops(struct mtd_info *mtd)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+
+       if (WARN_ON(ecc->mode != NAND_ECC_SOFT))
+               return -EINVAL;
+
+       switch (ecc->algo) {
+       case NAND_ECC_HAMMING:
+               ecc->calculate = nand_calculate_ecc;
+               ecc->correct = nand_correct_data;
+               ecc->read_page = nand_read_page_swecc;
+               ecc->read_subpage = nand_read_subpage;
+               ecc->write_page = nand_write_page_swecc;
+               ecc->read_page_raw = nand_read_page_raw;
+               ecc->write_page_raw = nand_write_page_raw;
+               ecc->read_oob = nand_read_oob_std;
+               ecc->write_oob = nand_write_oob_std;
+               if (!ecc->size)
+                       ecc->size = 256;
+               ecc->bytes = 3;
+               ecc->strength = 1;
+               return 0;
+       case NAND_ECC_BCH:
+               if (!mtd_nand_has_bch()) {
+                       WARN(1, "CONFIG_MTD_NAND_ECC_BCH not enabled\n");
+                       return -EINVAL;
+               }
+               ecc->calculate = nand_bch_calculate_ecc;
+               ecc->correct = nand_bch_correct_data;
+               ecc->read_page = nand_read_page_swecc;
+               ecc->read_subpage = nand_read_subpage;
+               ecc->write_page = nand_write_page_swecc;
+               ecc->read_page_raw = nand_read_page_raw;
+               ecc->write_page_raw = nand_write_page_raw;
+               ecc->read_oob = nand_read_oob_std;
+               ecc->write_oob = nand_write_oob_std;
+               /*
+                * Board driver should supply ecc.size and ecc.strength
+                * values to select how many bits are correctable.
+                * Otherwise, default to 4 bits for large page devices.
+                */
+               if (!ecc->size && (mtd->oobsize >= 64)) {
+                       ecc->size = 512;
+                       ecc->strength = 4;
+               }
+
+               /*
+                * If no ECC placement scheme was provided, pick up the
+                * default large page one.
+                */
+               if (!mtd->ooblayout) {
+                       /* handle large page devices only */
+                       if (mtd->oobsize < 64) {
+                               WARN(1, "OOB layout is required when using software BCH on small pages\n");
+                               return -EINVAL;
+                       }
+
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
+               }
+
+               /* See nand_bch_init() for details. */
+               ecc->bytes = 0;
+               ecc->priv = nand_bch_init(mtd);
+               if (!ecc->priv) {
+                       WARN(1, "BCH ECC initialization failed!\n");
+                       return -EINVAL;
+               }
+               return 0;
+       default:
+               WARN(1, "Unsupported ECC algorithm!\n");
+               return -EINVAL;
+       }
+}
+
 /*
  * Check if the chip configuration meet the datasheet requirements.
 
@@ -4098,14 +4303,15 @@ static bool nand_ecc_strength_good(struct mtd_info *mtd)
  */
 int nand_scan_tail(struct mtd_info *mtd)
 {
-       int i;
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        struct nand_buffers *nbuf;
+       int ret;
 
        /* New bad blocks should be marked in OOB, flash-based BBT, or both */
-       BUG_ON((chip->bbt_options & NAND_BBT_NO_OOB_BBM) &&
-                       !(chip->bbt_options & NAND_BBT_USE_FLASH));
+       if (WARN_ON((chip->bbt_options & NAND_BBT_NO_OOB_BBM) &&
+                  !(chip->bbt_options & NAND_BBT_USE_FLASH)))
+               return -EINVAL;
 
        if (!(chip->options & NAND_OWN_BUFFERS)) {
                nbuf = kzalloc(sizeof(*nbuf) + mtd->writesize
@@ -4128,24 +4334,22 @@ int nand_scan_tail(struct mtd_info *mtd)
        /*
         * If no default placement scheme is given, select an appropriate one.
         */
-       if (!ecc->layout && (ecc->mode != NAND_ECC_SOFT_BCH)) {
+       if (!mtd->ooblayout &&
+           !(ecc->mode == NAND_ECC_SOFT && ecc->algo == NAND_ECC_BCH)) {
                switch (mtd->oobsize) {
                case 8:
-                       ecc->layout = &nand_oob_8;
-                       break;
                case 16:
-                       ecc->layout = &nand_oob_16;
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_sp_ops);
                        break;
                case 64:
-                       ecc->layout = &nand_oob_64;
-                       break;
                case 128:
-                       ecc->layout = &nand_oob_128;
+                       mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
                        break;
                default:
-                       pr_warn("No oob scheme defined for oobsize %d\n",
-                                  mtd->oobsize);
-                       BUG();
+                       WARN(1, "No oob scheme defined for oobsize %d\n",
+                               mtd->oobsize);
+                       ret = -EINVAL;
+                       goto err_free;
                }
        }
 
@@ -4161,8 +4365,9 @@ int nand_scan_tail(struct mtd_info *mtd)
        case NAND_ECC_HW_OOB_FIRST:
                /* Similar to NAND_ECC_HW, but a separate read_page handle */
                if (!ecc->calculate || !ecc->correct || !ecc->hwctl) {
-                       pr_warn("No ECC functions supplied; hardware ECC not possible\n");
-                       BUG();
+                       WARN(1, "No ECC functions supplied; hardware ECC not possible\n");
+                       ret = -EINVAL;
+                       goto err_free;
                }
                if (!ecc->read_page)
                        ecc->read_page = nand_read_page_hwecc_oob_first;
@@ -4192,8 +4397,9 @@ int nand_scan_tail(struct mtd_info *mtd)
                     ecc->read_page == nand_read_page_hwecc ||
                     !ecc->write_page ||
                     ecc->write_page == nand_write_page_hwecc)) {
-                       pr_warn("No ECC functions supplied; hardware ECC not possible\n");
-                       BUG();
+                       WARN(1, "No ECC functions supplied; hardware ECC not possible\n");
+                       ret = -EINVAL;
+                       goto err_free;
                }
                /* Use standard syndrome read/write page function? */
                if (!ecc->read_page)
@@ -4211,61 +4417,22 @@ int nand_scan_tail(struct mtd_info *mtd)
 
                if (mtd->writesize >= ecc->size) {
                        if (!ecc->strength) {
-                               pr_warn("Driver must set ecc.strength when using hardware ECC\n");
-                               BUG();
+                               WARN(1, "Driver must set ecc.strength when using hardware ECC\n");
+                               ret = -EINVAL;
+                               goto err_free;
                        }
                        break;
                }
                pr_warn("%d byte HW ECC not possible on %d byte page size, fallback to SW ECC\n",
                        ecc->size, mtd->writesize);
                ecc->mode = NAND_ECC_SOFT;
+               ecc->algo = NAND_ECC_HAMMING;
 
        case NAND_ECC_SOFT:
-               ecc->calculate = nand_calculate_ecc;
-               ecc->correct = nand_correct_data;
-               ecc->read_page = nand_read_page_swecc;
-               ecc->read_subpage = nand_read_subpage;
-               ecc->write_page = nand_write_page_swecc;
-               ecc->read_page_raw = nand_read_page_raw;
-               ecc->write_page_raw = nand_write_page_raw;
-               ecc->read_oob = nand_read_oob_std;
-               ecc->write_oob = nand_write_oob_std;
-               if (!ecc->size)
-                       ecc->size = 256;
-               ecc->bytes = 3;
-               ecc->strength = 1;
-               break;
-
-       case NAND_ECC_SOFT_BCH:
-               if (!mtd_nand_has_bch()) {
-                       pr_warn("CONFIG_MTD_NAND_ECC_BCH not enabled\n");
-                       BUG();
-               }
-               ecc->calculate = nand_bch_calculate_ecc;
-               ecc->correct = nand_bch_correct_data;
-               ecc->read_page = nand_read_page_swecc;
-               ecc->read_subpage = nand_read_subpage;
-               ecc->write_page = nand_write_page_swecc;
-               ecc->read_page_raw = nand_read_page_raw;
-               ecc->write_page_raw = nand_write_page_raw;
-               ecc->read_oob = nand_read_oob_std;
-               ecc->write_oob = nand_write_oob_std;
-               /*
-                * Board driver should supply ecc.size and ecc.strength values
-                * to select how many bits are correctable. Otherwise, default
-                * to 4 bits for large page devices.
-                */
-               if (!ecc->size && (mtd->oobsize >= 64)) {
-                       ecc->size = 512;
-                       ecc->strength = 4;
-               }
-
-               /* See nand_bch_init() for details. */
-               ecc->bytes = 0;
-               ecc->priv = nand_bch_init(mtd);
-               if (!ecc->priv) {
-                       pr_warn("BCH ECC initialization failed!\n");
-                       BUG();
+               ret = nand_set_ecc_soft_ops(mtd);
+               if (ret) {
+                       ret = -EINVAL;
+                       goto err_free;
                }
                break;
 
@@ -4283,8 +4450,9 @@ int nand_scan_tail(struct mtd_info *mtd)
                break;
 
        default:
-               pr_warn("Invalid NAND_ECC_MODE %d\n", ecc->mode);
-               BUG();
+               WARN(1, "Invalid NAND_ECC_MODE %d\n", ecc->mode);
+               ret = -EINVAL;
+               goto err_free;
        }
 
        /* For many systems, the standard OOB write also works for raw */
@@ -4293,20 +4461,9 @@ int nand_scan_tail(struct mtd_info *mtd)
        if (!ecc->write_oob_raw)
                ecc->write_oob_raw = ecc->write_oob;
 
-       /*
-        * The number of bytes available for a client to place data into
-        * the out of band area.
-        */
-       mtd->oobavail = 0;
-       if (ecc->layout) {
-               for (i = 0; ecc->layout->oobfree[i].length; i++)
-                       mtd->oobavail += ecc->layout->oobfree[i].length;
-       }
-
-       /* ECC sanity check: warn if it's too weak */
-       if (!nand_ecc_strength_good(mtd))
-               pr_warn("WARNING: %s: the ECC used on your system is too weak compared to the one required by the NAND chip\n",
-                       mtd->name);
+       /* propagate ecc info to mtd_info */
+       mtd->ecc_strength = ecc->strength;
+       mtd->ecc_step_size = ecc->size;
 
        /*
         * Set the number of read / write steps for one page depending on ECC
@@ -4314,11 +4471,27 @@ int nand_scan_tail(struct mtd_info *mtd)
         */
        ecc->steps = mtd->writesize / ecc->size;
        if (ecc->steps * ecc->size != mtd->writesize) {
-               pr_warn("Invalid ECC parameters\n");
-               BUG();
+               WARN(1, "Invalid ECC parameters\n");
+               ret = -EINVAL;
+               goto err_free;
        }
        ecc->total = ecc->steps * ecc->bytes;
 
+       /*
+        * The number of bytes available for a client to place data into
+        * the out of band area.
+        */
+       ret = mtd_ooblayout_count_freebytes(mtd);
+       if (ret < 0)
+               ret = 0;
+
+       mtd->oobavail = ret;
+
+       /* ECC sanity check: warn if it's too weak */
+       if (!nand_ecc_strength_good(mtd))
+               pr_warn("WARNING: %s: the ECC used on your system is too weak compared to the one required by the NAND chip\n",
+                       mtd->name);
+
        /* Allow subpage writes up to ecc.steps. Not possible for MLC flash */
        if (!(chip->options & NAND_NO_SUBPAGE_WRITE) && nand_is_slc(chip)) {
                switch (ecc->steps) {
@@ -4343,7 +4516,6 @@ int nand_scan_tail(struct mtd_info *mtd)
        /* Large page NAND with SOFT_ECC should support subpage reads */
        switch (ecc->mode) {
        case NAND_ECC_SOFT:
-       case NAND_ECC_SOFT_BCH:
                if (chip->page_shift > 9)
                        chip->options |= NAND_SUBPAGE_READ;
                break;
@@ -4375,10 +4547,6 @@ int nand_scan_tail(struct mtd_info *mtd)
        mtd->_block_markbad = nand_block_markbad;
        mtd->writebufsize = mtd->writesize;
 
-       /* propagate ecc info to mtd_info */
-       mtd->ecclayout = ecc->layout;
-       mtd->ecc_strength = ecc->strength;
-       mtd->ecc_step_size = ecc->size;
        /*
         * Initialize bitflip_threshold to its default prior scan_bbt() call.
         * scan_bbt() might invoke mtd_read(), thus bitflip_threshold must be
@@ -4393,6 +4561,10 @@ int nand_scan_tail(struct mtd_info *mtd)
 
        /* Build bad block table */
        return chip->scan_bbt(mtd);
+err_free:
+       if (!(chip->options & NAND_OWN_BUFFERS))
+               kfree(chip->buffers);
+       return ret;
 }
 EXPORT_SYMBOL(nand_scan_tail);
 
@@ -4436,7 +4608,8 @@ void nand_release(struct mtd_info *mtd)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
 
-       if (chip->ecc.mode == NAND_ECC_SOFT_BCH)
+       if (chip->ecc.mode == NAND_ECC_SOFT &&
+           chip->ecc.algo == NAND_ECC_BCH)
                nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
 
        mtd_device_unregister(mtd);
index b585bae379290f9993b127087d52fc9dc753eda8..44763f87eae494eccd65241d5b6774aba4527cf4 100644 (file)
 /**
  * struct nand_bch_control - private NAND BCH control structure
  * @bch:       BCH control structure
- * @ecclayout: private ecc layout for this BCH configuration
  * @errloc:    error location array
  * @eccmask:   XOR ecc mask, allows erased pages to be decoded as valid
  */
 struct nand_bch_control {
        struct bch_control   *bch;
-       struct nand_ecclayout ecclayout;
        unsigned int         *errloc;
        unsigned char        *eccmask;
 };
@@ -124,7 +122,6 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
        unsigned int m, t, eccsteps, i;
-       struct nand_ecclayout *layout = nand->ecc.layout;
        struct nand_bch_control *nbc = NULL;
        unsigned char *erased_page;
        unsigned int eccsize = nand->ecc.size;
@@ -161,34 +158,10 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
 
        eccsteps = mtd->writesize/eccsize;
 
-       /* if no ecc placement scheme was provided, build one */
-       if (!layout) {
-
-               /* handle large page devices only */
-               if (mtd->oobsize < 64) {
-                       printk(KERN_WARNING "must provide an oob scheme for "
-                              "oobsize %d\n", mtd->oobsize);
-                       goto fail;
-               }
-
-               layout = &nbc->ecclayout;
-               layout->eccbytes = eccsteps*eccbytes;
-
-               /* reserve 2 bytes for bad block marker */
-               if (layout->eccbytes+2 > mtd->oobsize) {
-                       printk(KERN_WARNING "no suitable oob scheme available "
-                              "for oobsize %d eccbytes %u\n", mtd->oobsize,
-                              eccbytes);
-                       goto fail;
-               }
-               /* put ecc bytes at oob tail */
-               for (i = 0; i < layout->eccbytes; i++)
-                       layout->eccpos[i] = mtd->oobsize-layout->eccbytes+i;
-
-               layout->oobfree[0].offset = 2;
-               layout->oobfree[0].length = mtd->oobsize-2-layout->eccbytes;
-
-               nand->ecc.layout = layout;
+       /* Check that we have an oob layout description. */
+       if (!mtd->ooblayout) {
+               pr_warn("missing oob scheme");
+               goto fail;
        }
 
        /* sanity checks */
@@ -196,7 +169,18 @@ struct nand_bch_control *nand_bch_init(struct mtd_info *mtd)
                printk(KERN_WARNING "eccsize %u is too large\n", eccsize);
                goto fail;
        }
-       if (layout->eccbytes != (eccsteps*eccbytes)) {
+
+       /*
+        * ecc->steps and ecc->total might be used by mtd->ooblayout->ecc(),
+        * which is called by mtd_ooblayout_count_eccbytes().
+        * Make sure they are properly initialized before calling
+        * mtd_ooblayout_count_eccbytes().
+        * FIXME: we should probably rework the sequencing in nand_scan_tail()
+        * to avoid setting those fields twice.
+        */
+       nand->ecc.steps = eccsteps;
+       nand->ecc.total = eccsteps * eccbytes;
+       if (mtd_ooblayout_count_eccbytes(mtd) != (eccsteps*eccbytes)) {
                printk(KERN_WARNING "invalid ecc layout\n");
                goto fail;
        }
index a58169a28741e7e0f3db85b77170594de299fd87..1eb934414eb5804978994a382dbb1d782ca59e53 100644 (file)
@@ -569,7 +569,7 @@ static void nandsim_debugfs_remove(struct nandsim *ns)
  *
  * RETURNS: 0 if success, -ENOMEM if memory alloc fails.
  */
-static int alloc_device(struct nandsim *ns)
+static int __init alloc_device(struct nandsim *ns)
 {
        struct file *cfile;
        int i, err;
@@ -654,7 +654,7 @@ static void free_device(struct nandsim *ns)
        }
 }
 
-static char *get_partition_name(int i)
+static char __init *get_partition_name(int i)
 {
        return kasprintf(GFP_KERNEL, "NAND simulator partition %d", i);
 }
@@ -664,7 +664,7 @@ static char *get_partition_name(int i)
  *
  * RETURNS: 0 if success, -ERRNO if failure.
  */
-static int init_nandsim(struct mtd_info *mtd)
+static int __init init_nandsim(struct mtd_info *mtd)
 {
        struct nand_chip *chip = mtd_to_nand(mtd);
        struct nandsim   *ns   = nand_get_controller_data(chip);
@@ -2261,6 +2261,7 @@ static int __init ns_init_module(void)
        chip->read_buf   = ns_nand_read_buf;
        chip->read_word  = ns_nand_read_word;
        chip->ecc.mode   = NAND_ECC_SOFT;
+       chip->ecc.algo   = NAND_ECC_HAMMING;
        /* The NAND_SKIP_BBTSCAN option is necessary for 'overridesize' */
        /* and 'badblocks' parameters to work */
        chip->options   |= NAND_SKIP_BBTSCAN;
@@ -2338,7 +2339,8 @@ static int __init ns_init_module(void)
                        retval = -EINVAL;
                        goto error;
                }
-               chip->ecc.mode = NAND_ECC_SOFT_BCH;
+               chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_BCH;
                chip->ecc.size = 512;
                chip->ecc.strength = bch;
                chip->ecc.bytes = eccbytes;
index dbc5b571c2bbcca6a570e5855d2907009abd9b0f..8f64011d32ef2a2791a0380b003c10ab3dc63f0b 100644 (file)
@@ -261,6 +261,7 @@ static int nuc900_nand_probe(struct platform_device *pdev)
        chip->chip_delay        = 50;
        chip->options           = 0;
        chip->ecc.mode          = NAND_ECC_SOFT;
+       chip->ecc.algo          = NAND_ECC_HAMMING;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        nuc900_nand->reg = devm_ioremap_resource(&pdev->dev, res);
index 0749ca1a145681552bf2e42b2be96c7d4e09eb4f..08e158895635cddda0bbea4e6973cfb8dd727e74 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/gpio/consumer.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -28,6 +29,7 @@
 #include <linux/mtd/nand_bch.h>
 #include <linux/platform_data/elm.h>
 
+#include <linux/omap-gpmc.h>
 #include <linux/platform_data/mtd-nand-omap2.h>
 
 #define        DRIVER_NAME     "omap2-nand"
@@ -151,13 +153,17 @@ static struct nand_hw_control omap_gpmc_controller = {
 };
 
 struct omap_nand_info {
-       struct omap_nand_platform_data  *pdata;
        struct nand_chip                nand;
        struct platform_device          *pdev;
 
        int                             gpmc_cs;
-       unsigned long                   phys_base;
+       bool                            dev_ready;
+       enum nand_io                    xfer_type;
+       int                             devsize;
        enum omap_ecc                   ecc_opt;
+       struct device_node              *elm_of_node;
+
+       unsigned long                   phys_base;
        struct completion               comp;
        struct dma_chan                 *dma;
        int                             gpmc_irq_fifo;
@@ -168,12 +174,14 @@ struct omap_nand_info {
        } iomode;
        u_char                          *buf;
        int                                     buf_len;
+       /* Interface to GPMC */
        struct gpmc_nand_regs           reg;
-       /* generated at runtime depending on ECC algorithm and layout selected */
-       struct nand_ecclayout           oobinfo;
+       struct gpmc_nand_ops            *ops;
+       bool                            flash_bbt;
        /* fields specific for BCHx_HW ECC scheme */
        struct device                   *elm_dev;
-       struct device_node              *of_node;
+       /* NAND ready gpio */
+       struct gpio_desc                *ready_gpiod;
 };
 
 static inline struct omap_nand_info *mtd_to_omap(struct mtd_info *mtd)
@@ -208,7 +216,7 @@ static int omap_prefetch_enable(int cs, int fifo_th, int dma_mode,
         */
        val = ((cs << PREFETCH_CONFIG1_CS_SHIFT) |
                PREFETCH_FIFOTHRESHOLD(fifo_th) | ENABLE_PREFETCH |
-               (dma_mode << DMA_MPU_MODE_SHIFT) | (0x1 & is_write));
+               (dma_mode << DMA_MPU_MODE_SHIFT) | (is_write & 0x1));
        writel(val, info->reg.gpmc_prefetch_config1);
 
        /*  Start the prefetch engine */
@@ -288,14 +296,13 @@ static void omap_write_buf8(struct mtd_info *mtd, const u_char *buf, int len)
 {
        struct omap_nand_info *info = mtd_to_omap(mtd);
        u_char *p = (u_char *)buf;
-       u32     status = 0;
+       bool status;
 
        while (len--) {
                iowrite8(*p++, info->nand.IO_ADDR_W);
                /* wait until buffer is available for write */
                do {
-                       status = readl(info->reg.gpmc_status) &
-                                       STATUS_BUFF_EMPTY;
+                       status = info->ops->nand_writebuffer_empty();
                } while (!status);
        }
 }
@@ -323,7 +330,7 @@ static void omap_write_buf16(struct mtd_info *mtd, const u_char * buf, int len)
 {
        struct omap_nand_info *info = mtd_to_omap(mtd);
        u16 *p = (u16 *) buf;
-       u32     status = 0;
+       bool status;
        /* FIXME try bursts of writesw() or DMA ... */
        len >>= 1;
 
@@ -331,8 +338,7 @@ static void omap_write_buf16(struct mtd_info *mtd, const u_char * buf, int len)
                iowrite16(*p++, info->nand.IO_ADDR_W);
                /* wait until buffer is available for write */
                do {
-                       status = readl(info->reg.gpmc_status) &
-                                       STATUS_BUFF_EMPTY;
+                       status = info->ops->nand_writebuffer_empty();
                } while (!status);
        }
 }
@@ -467,17 +473,8 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
        int ret;
        u32 val;
 
-       if (addr >= high_memory) {
-               struct page *p1;
-
-               if (((size_t)addr & PAGE_MASK) !=
-                       ((size_t)(addr + len - 1) & PAGE_MASK))
-                       goto out_copy;
-               p1 = vmalloc_to_page(addr);
-               if (!p1)
-                       goto out_copy;
-               addr = page_address(p1) + ((size_t)addr & ~PAGE_MASK);
-       }
+       if (!virt_addr_valid(addr))
+               goto out_copy;
 
        sg_init_one(&sg, addr, len);
        n = dma_map_sg(info->dma->device->dev, &sg, 1, dir);
@@ -497,6 +494,11 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
        tx->callback_param = &info->comp;
        dmaengine_submit(tx);
 
+       init_completion(&info->comp);
+
+       /* setup and start DMA using dma_addr */
+       dma_async_issue_pending(info->dma);
+
        /*  configure and start prefetch transfer */
        ret = omap_prefetch_enable(info->gpmc_cs,
                PREFETCH_FIFOTHRESHOLD_MAX, 0x1, len, is_write, info);
@@ -504,10 +506,6 @@ static inline int omap_nand_dma_transfer(struct mtd_info *mtd, void *addr,
                /* PFPW engine is busy, use cpu copy method */
                goto out_copy_unmap;
 
-       init_completion(&info->comp);
-       dma_async_issue_pending(info->dma);
-
-       /* setup and start DMA using dma_addr */
        wait_for_completion(&info->comp);
        tim = 0;
        limit = (loops_per_jiffy * msecs_to_jiffies(OMAP_NAND_TIMEOUT_MS));
@@ -1017,21 +1015,16 @@ static int omap_wait(struct mtd_info *mtd, struct nand_chip *chip)
 }
 
 /**
- * omap_dev_ready - calls the platform specific dev_ready function
+ * omap_dev_ready - checks the NAND Ready GPIO line
  * @mtd: MTD device structure
+ *
+ * Returns true if ready and false if busy.
  */
 static int omap_dev_ready(struct mtd_info *mtd)
 {
-       unsigned int val = 0;
        struct omap_nand_info *info = mtd_to_omap(mtd);
 
-       val = readl(info->reg.gpmc_status);
-
-       if ((val & 0x100) == 0x100) {
-               return 1;
-       } else {
-               return 0;
-       }
+       return gpiod_get_value(info->ready_gpiod);
 }
 
 /**
@@ -1495,9 +1488,8 @@ static int omap_elm_correct_data(struct mtd_info *mtd, u_char *data,
 static int omap_write_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
                               const uint8_t *buf, int oob_required, int page)
 {
-       int i;
+       int ret;
        uint8_t *ecc_calc = chip->buffers->ecccalc;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
 
        /* Enable GPMC ecc engine */
        chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
@@ -1508,8 +1500,10 @@ static int omap_write_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
        /* Update ecc vector from GPMC result registers */
        chip->ecc.calculate(mtd, buf, &ecc_calc[0]);
 
-       for (i = 0; i < chip->ecc.total; i++)
-               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+       ret = mtd_ooblayout_set_eccbytes(mtd, ecc_calc, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        /* Write ecc vector to OOB area */
        chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
@@ -1536,10 +1530,7 @@ static int omap_read_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
 {
        uint8_t *ecc_calc = chip->buffers->ecccalc;
        uint8_t *ecc_code = chip->buffers->ecccode;
-       uint32_t *eccpos = chip->ecc.layout->eccpos;
-       uint8_t *oob = &chip->oob_poi[eccpos[0]];
-       uint32_t oob_pos = mtd->writesize + chip->ecc.layout->eccpos[0];
-       int stat;
+       int stat, ret;
        unsigned int max_bitflips = 0;
 
        /* Enable GPMC ecc engine */
@@ -1549,13 +1540,18 @@ static int omap_read_page_bch(struct mtd_info *mtd, struct nand_chip *chip,
        chip->read_buf(mtd, buf, mtd->writesize);
 
        /* Read oob bytes */
-       chip->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_pos, -1);
-       chip->read_buf(mtd, oob, chip->ecc.total);
+       chip->cmdfunc(mtd, NAND_CMD_RNDOUT,
+                     mtd->writesize + BADBLOCK_MARKER_LENGTH, -1);
+       chip->read_buf(mtd, chip->oob_poi + BADBLOCK_MARKER_LENGTH,
+                      chip->ecc.total);
 
        /* Calculate ecc bytes */
        chip->ecc.calculate(mtd, buf, ecc_calc);
 
-       memcpy(ecc_code, &chip->oob_poi[eccpos[0]], chip->ecc.total);
+       ret = mtd_ooblayout_get_eccbytes(mtd, ecc_code, chip->oob_poi, 0,
+                                        chip->ecc.total);
+       if (ret)
+               return ret;
 
        stat = chip->ecc.correct(mtd, buf, ecc_code, ecc_calc);
 
@@ -1630,7 +1626,7 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info,
                        "CONFIG_MTD_NAND_OMAP_BCH not enabled\n");
                return false;
        }
-       if (ecc_needs_elm && !is_elm_present(info, pdata->elm_of_node)) {
+       if (ecc_needs_elm && !is_elm_present(info, info->elm_of_node)) {
                dev_err(&info->pdev->dev, "ELM not available\n");
                return false;
        }
@@ -1638,43 +1634,227 @@ static bool omap2_nand_ecc_check(struct omap_nand_info *info,
        return true;
 }
 
+static const char * const nand_xfer_types[] = {
+       [NAND_OMAP_PREFETCH_POLLED] = "prefetch-polled",
+       [NAND_OMAP_POLLED] = "polled",
+       [NAND_OMAP_PREFETCH_DMA] = "prefetch-dma",
+       [NAND_OMAP_PREFETCH_IRQ] = "prefetch-irq",
+};
+
+static int omap_get_dt_info(struct device *dev, struct omap_nand_info *info)
+{
+       struct device_node *child = dev->of_node;
+       int i;
+       const char *s;
+       u32 cs;
+
+       if (of_property_read_u32(child, "reg", &cs) < 0) {
+               dev_err(dev, "reg not found in DT\n");
+               return -EINVAL;
+       }
+
+       info->gpmc_cs = cs;
+
+       /* detect availability of ELM module. Won't be present pre-OMAP4 */
+       info->elm_of_node = of_parse_phandle(child, "ti,elm-id", 0);
+       if (!info->elm_of_node)
+               dev_dbg(dev, "ti,elm-id not in DT\n");
+
+       /* select ecc-scheme for NAND */
+       if (of_property_read_string(child, "ti,nand-ecc-opt", &s)) {
+               dev_err(dev, "ti,nand-ecc-opt not found\n");
+               return -EINVAL;
+       }
+
+       if (!strcmp(s, "sw")) {
+               info->ecc_opt = OMAP_ECC_HAM1_CODE_SW;
+       } else if (!strcmp(s, "ham1") ||
+                  !strcmp(s, "hw") || !strcmp(s, "hw-romcode")) {
+               info->ecc_opt = OMAP_ECC_HAM1_CODE_HW;
+       } else if (!strcmp(s, "bch4")) {
+               if (info->elm_of_node)
+                       info->ecc_opt = OMAP_ECC_BCH4_CODE_HW;
+               else
+                       info->ecc_opt = OMAP_ECC_BCH4_CODE_HW_DETECTION_SW;
+       } else if (!strcmp(s, "bch8")) {
+               if (info->elm_of_node)
+                       info->ecc_opt = OMAP_ECC_BCH8_CODE_HW;
+               else
+                       info->ecc_opt = OMAP_ECC_BCH8_CODE_HW_DETECTION_SW;
+       } else if (!strcmp(s, "bch16")) {
+               info->ecc_opt = OMAP_ECC_BCH16_CODE_HW;
+       } else {
+               dev_err(dev, "unrecognized value for ti,nand-ecc-opt\n");
+               return -EINVAL;
+       }
+
+       /* select data transfer mode */
+       if (!of_property_read_string(child, "ti,nand-xfer-type", &s)) {
+               for (i = 0; i < ARRAY_SIZE(nand_xfer_types); i++) {
+                       if (!strcasecmp(s, nand_xfer_types[i])) {
+                               info->xfer_type = i;
+                               return 0;
+                       }
+               }
+
+               dev_err(dev, "unrecognized value for ti,nand-xfer-type\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int omap_ooblayout_ecc(struct mtd_info *mtd, int section,
+                             struct mtd_oob_region *oobregion)
+{
+       struct omap_nand_info *info = mtd_to_omap(mtd);
+       struct nand_chip *chip = &info->nand;
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (info->ecc_opt == OMAP_ECC_HAM1_CODE_HW &&
+           !(chip->options & NAND_BUSWIDTH_16))
+               off = 1;
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = chip->ecc.total;
+
+       return 0;
+}
+
+static int omap_ooblayout_free(struct mtd_info *mtd, int section,
+                              struct mtd_oob_region *oobregion)
+{
+       struct omap_nand_info *info = mtd_to_omap(mtd);
+       struct nand_chip *chip = &info->nand;
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (info->ecc_opt == OMAP_ECC_HAM1_CODE_HW &&
+           !(chip->options & NAND_BUSWIDTH_16))
+               off = 1;
+
+       if (section)
+               return -ERANGE;
+
+       off += chip->ecc.total;
+       if (off >= mtd->oobsize)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = mtd->oobsize - off;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops omap_ooblayout_ops = {
+       .ecc = omap_ooblayout_ecc,
+       .free = omap_ooblayout_free,
+};
+
+static int omap_sw_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       /*
+        * When SW correction is employed, one OMAP specific marker byte is
+        * reserved after each ECC step.
+        */
+       oobregion->offset = off + (section * (chip->ecc.bytes + 1));
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int omap_sw_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       int off = BADBLOCK_MARKER_LENGTH;
+
+       if (section)
+               return -ERANGE;
+
+       /*
+        * When SW correction is employed, one OMAP specific marker byte is
+        * reserved after each ECC step.
+        */
+       off += ((chip->ecc.bytes + 1) * chip->ecc.steps);
+       if (off >= mtd->oobsize)
+               return -ERANGE;
+
+       oobregion->offset = off;
+       oobregion->length = mtd->oobsize - off;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops omap_sw_ooblayout_ops = {
+       .ecc = omap_sw_ooblayout_ecc,
+       .free = omap_sw_ooblayout_free,
+};
+
 static int omap_nand_probe(struct platform_device *pdev)
 {
        struct omap_nand_info           *info;
-       struct omap_nand_platform_data  *pdata;
+       struct omap_nand_platform_data  *pdata = NULL;
        struct mtd_info                 *mtd;
        struct nand_chip                *nand_chip;
-       struct nand_ecclayout           *ecclayout;
        int                             err;
-       int                             i;
        dma_cap_mask_t                  mask;
        unsigned                        sig;
-       unsigned                        oob_index;
        struct resource                 *res;
-
-       pdata = dev_get_platdata(&pdev->dev);
-       if (pdata == NULL) {
-               dev_err(&pdev->dev, "platform data missing\n");
-               return -ENODEV;
-       }
+       struct device                   *dev = &pdev->dev;
+       int                             min_oobbytes = BADBLOCK_MARKER_LENGTH;
+       int                             oobbytes_per_step;
 
        info = devm_kzalloc(&pdev->dev, sizeof(struct omap_nand_info),
                                GFP_KERNEL);
        if (!info)
                return -ENOMEM;
 
+       info->pdev = pdev;
+
+       if (dev->of_node) {
+               if (omap_get_dt_info(dev, info))
+                       return -EINVAL;
+       } else {
+               pdata = dev_get_platdata(&pdev->dev);
+               if (!pdata) {
+                       dev_err(&pdev->dev, "platform data missing\n");
+                       return -EINVAL;
+               }
+
+               info->gpmc_cs = pdata->cs;
+               info->reg = pdata->reg;
+               info->ecc_opt = pdata->ecc_opt;
+               if (pdata->dev_ready)
+                       dev_info(&pdev->dev, "pdata->dev_ready is deprecated\n");
+
+               info->xfer_type = pdata->xfer_type;
+               info->devsize = pdata->devsize;
+               info->elm_of_node = pdata->elm_of_node;
+               info->flash_bbt = pdata->flash_bbt;
+       }
+
        platform_set_drvdata(pdev, info);
+       info->ops = gpmc_omap_get_nand_ops(&info->reg, info->gpmc_cs);
+       if (!info->ops) {
+               dev_err(&pdev->dev, "Failed to get GPMC->NAND interface\n");
+               return -ENODEV;
+       }
 
-       info->pdev              = pdev;
-       info->gpmc_cs           = pdata->cs;
-       info->reg               = pdata->reg;
-       info->of_node           = pdata->of_node;
-       info->ecc_opt           = pdata->ecc_opt;
        nand_chip               = &info->nand;
        mtd                     = nand_to_mtd(nand_chip);
        mtd->dev.parent         = &pdev->dev;
        nand_chip->ecc.priv     = NULL;
-       nand_set_flash_node(nand_chip, pdata->of_node);
+       nand_set_flash_node(nand_chip, dev->of_node);
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        nand_chip->IO_ADDR_R = devm_ioremap_resource(&pdev->dev, res);
@@ -1688,6 +1868,13 @@ static int omap_nand_probe(struct platform_device *pdev)
        nand_chip->IO_ADDR_W = nand_chip->IO_ADDR_R;
        nand_chip->cmd_ctrl  = omap_hwcontrol;
 
+       info->ready_gpiod = devm_gpiod_get_optional(&pdev->dev, "rb",
+                                                   GPIOD_IN);
+       if (IS_ERR(info->ready_gpiod)) {
+               dev_err(dev, "failed to get ready gpio\n");
+               return PTR_ERR(info->ready_gpiod);
+       }
+
        /*
         * If RDY/BSY line is connected to OMAP then use the omap ready
         * function and the generic nand_wait function which reads the status
@@ -1695,7 +1882,7 @@ static int omap_nand_probe(struct platform_device *pdev)
         * chip delay which is slightly more than tR (AC Timing) of the NAND
         * device and read status register until you get a failure or success
         */
-       if (pdata->dev_ready) {
+       if (info->ready_gpiod) {
                nand_chip->dev_ready = omap_dev_ready;
                nand_chip->chip_delay = 0;
        } else {
@@ -1703,21 +1890,25 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->chip_delay = 50;
        }
 
-       if (pdata->flash_bbt)
-               nand_chip->bbt_options |= NAND_BBT_USE_FLASH | NAND_BBT_NO_OOB;
-       else
-               nand_chip->options |= NAND_SKIP_BBTSCAN;
+       if (info->flash_bbt)
+               nand_chip->bbt_options |= NAND_BBT_USE_FLASH;
 
        /* scan NAND device connected to chip controller */
-       nand_chip->options |= pdata->devsize & NAND_BUSWIDTH_16;
+       nand_chip->options |= info->devsize & NAND_BUSWIDTH_16;
        if (nand_scan_ident(mtd, 1, NULL)) {
-               dev_err(&info->pdev->dev, "scan failed, may be bus-width mismatch\n");
+               dev_err(&info->pdev->dev,
+                       "scan failed, may be bus-width mismatch\n");
                err = -ENXIO;
                goto return_error;
        }
 
+       if (nand_chip->bbt_options & NAND_BBT_USE_FLASH)
+               nand_chip->bbt_options |= NAND_BBT_NO_OOB;
+       else
+               nand_chip->options |= NAND_SKIP_BBTSCAN;
+
        /* re-populate low-level callbacks based on xfer modes */
-       switch (pdata->xfer_type) {
+       switch (info->xfer_type) {
        case NAND_OMAP_PREFETCH_POLLED:
                nand_chip->read_buf   = omap_read_buf_pref;
                nand_chip->write_buf  = omap_write_buf_pref;
@@ -1797,7 +1988,7 @@ static int omap_nand_probe(struct platform_device *pdev)
 
        default:
                dev_err(&pdev->dev,
-                       "xfer_type(%d) not supported!\n", pdata->xfer_type);
+                       "xfer_type(%d) not supported!\n", info->xfer_type);
                err = -EINVAL;
                goto return_error;
        }
@@ -1809,16 +2000,15 @@ static int omap_nand_probe(struct platform_device *pdev)
 
        /*
         * Bail out earlier to let NAND_ECC_SOFT code create its own
-        * ecclayout instead of using ours.
+        * ooblayout instead of using ours.
         */
        if (info->ecc_opt == OMAP_ECC_HAM1_CODE_SW) {
                nand_chip->ecc.mode = NAND_ECC_SOFT;
+               nand_chip->ecc.algo = NAND_ECC_HAMMING;
                goto scan_tail;
        }
 
        /* populate MTD interface based on ECC scheme */
-       ecclayout               = &info->oobinfo;
-       nand_chip->ecc.layout   = ecclayout;
        switch (info->ecc_opt) {
        case OMAP_ECC_HAM1_CODE_HW:
                pr_info("nand: using OMAP_ECC_HAM1_CODE_HW\n");
@@ -1829,19 +2019,12 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc;
                nand_chip->ecc.hwctl            = omap_enable_hwecc;
                nand_chip->ecc.correct          = omap_correct_data;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               if (nand_chip->options & NAND_BUSWIDTH_16)
-                       oob_index               = BADBLOCK_MARKER_LENGTH;
-               else
-                       oob_index               = 1;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* no reserved-marker in ecclayout for this ecc-scheme */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
+
+               if (!(nand_chip->options & NAND_BUSWIDTH_16))
+                       min_oobbytes            = 1;
+
                break;
 
        case OMAP_ECC_BCH4_CODE_HW_DETECTION_SW:
@@ -1853,19 +2036,9 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.hwctl            = omap_enable_hwecc_bch;
                nand_chip->ecc.correct          = nand_bch_correct_data;
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++) {
-                       ecclayout->eccpos[i] = oob_index;
-                       if (((i + 1) % nand_chip->ecc.bytes) == 0)
-                               oob_index++;
-               }
-               /* include reserved-marker in ecclayout->oobfree calculation */
-               ecclayout->oobfree->offset      = 1 +
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops);
+               /* Reserve one byte for the OMAP marker */
+               oobbytes_per_step               = nand_chip->ecc.bytes + 1;
                /* software bch library is used for locating errors */
                nand_chip->ecc.priv             = nand_bch_init(mtd);
                if (!nand_chip->ecc.priv) {
@@ -1887,16 +2060,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH4_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1914,19 +2079,9 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.hwctl            = omap_enable_hwecc_bch;
                nand_chip->ecc.correct          = nand_bch_correct_data;
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++) {
-                       ecclayout->eccpos[i] = oob_index;
-                       if (((i + 1) % nand_chip->ecc.bytes) == 0)
-                               oob_index++;
-               }
-               /* include reserved-marker in ecclayout->oobfree calculation */
-               ecclayout->oobfree->offset      = 1 +
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
+               mtd_set_ooblayout(mtd, &omap_sw_ooblayout_ops);
+               /* Reserve one byte for the OMAP marker */
+               oobbytes_per_step               = nand_chip->ecc.bytes + 1;
                /* software bch library is used for locating errors */
                nand_chip->ecc.priv             = nand_bch_init(mtd);
                if (!nand_chip->ecc.priv) {
@@ -1948,6 +2103,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH8_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1955,16 +2112,6 @@ static int omap_nand_probe(struct platform_device *pdev)
                if (err < 0)
                        goto return_error;
 
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
                break;
 
        case OMAP_ECC_BCH16_CODE_HW:
@@ -1978,6 +2125,8 @@ static int omap_nand_probe(struct platform_device *pdev)
                nand_chip->ecc.calculate        = omap_calculate_ecc_bch;
                nand_chip->ecc.read_page        = omap_read_page_bch;
                nand_chip->ecc.write_page       = omap_write_page_bch;
+               mtd_set_ooblayout(mtd, &omap_ooblayout_ops);
+               oobbytes_per_step               = nand_chip->ecc.bytes;
 
                err = elm_config(info->elm_dev, BCH16_ECC,
                                 mtd->writesize / nand_chip->ecc.size,
@@ -1985,16 +2134,6 @@ static int omap_nand_probe(struct platform_device *pdev)
                if (err < 0)
                        goto return_error;
 
-               /* define ECC layout */
-               ecclayout->eccbytes             = nand_chip->ecc.bytes *
-                                                       (mtd->writesize /
-                                                       nand_chip->ecc.size);
-               oob_index                       = BADBLOCK_MARKER_LENGTH;
-               for (i = 0; i < ecclayout->eccbytes; i++, oob_index++)
-                       ecclayout->eccpos[i]    = oob_index;
-               /* reserved marker already included in ecclayout->eccbytes */
-               ecclayout->oobfree->offset      =
-                               ecclayout->eccpos[ecclayout->eccbytes - 1] + 1;
                break;
        default:
                dev_err(&info->pdev->dev, "invalid or unsupported ECC scheme\n");
@@ -2002,13 +2141,13 @@ static int omap_nand_probe(struct platform_device *pdev)
                goto return_error;
        }
 
-       /* all OOB bytes from oobfree->offset till end off OOB are free */
-       ecclayout->oobfree->length = mtd->oobsize - ecclayout->oobfree->offset;
        /* check if NAND device's OOB is enough to store ECC signatures */
-       if (mtd->oobsize < (ecclayout->eccbytes + BADBLOCK_MARKER_LENGTH)) {
+       min_oobbytes += (oobbytes_per_step *
+                        (mtd->writesize / nand_chip->ecc.size));
+       if (mtd->oobsize < min_oobbytes) {
                dev_err(&info->pdev->dev,
                        "not enough OOB bytes required = %d, available=%d\n",
-                       ecclayout->eccbytes, mtd->oobsize);
+                       min_oobbytes, mtd->oobsize);
                err = -EINVAL;
                goto return_error;
        }
@@ -2020,7 +2159,10 @@ scan_tail:
                goto return_error;
        }
 
-       mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
+       if (dev->of_node)
+               mtd_device_register(mtd, NULL, 0);
+       else
+               mtd_device_register(mtd, pdata->parts, pdata->nr_parts);
 
        platform_set_drvdata(pdev, mtd);
 
@@ -2051,11 +2193,17 @@ static int omap_nand_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct of_device_id omap_nand_ids[] = {
+       { .compatible = "ti,omap2-nand", },
+       {},
+};
+
 static struct platform_driver omap_nand_driver = {
        .probe          = omap_nand_probe,
        .remove         = omap_nand_remove,
        .driver         = {
                .name   = DRIVER_NAME,
+               .of_match_table = of_match_ptr(omap_nand_ids),
        },
 };
 
index d4614bfbfed6c1c63b95fec88c2af2c96dedd182..40a7c4a2cf0d44c8f407dc6c170d44cc8d768510 100644 (file)
@@ -130,6 +130,7 @@ static int __init orion_nand_probe(struct platform_device *pdev)
        nc->cmd_ctrl = orion_nand_cmd_ctrl;
        nc->read_buf = orion_nand_read_buf;
        nc->ecc.mode = NAND_ECC_SOFT;
+       nc->ecc.algo = NAND_ECC_HAMMING;
 
        if (board->chip_delay)
                nc->chip_delay = board->chip_delay;
index 3ab53ca53cca231c2e3431cc20f3d1932000601f..5de7591b05106bcbf172ad3bc3c6c8013c8b4c32 100644 (file)
@@ -92,8 +92,9 @@ int pasemi_device_ready(struct mtd_info *mtd)
 
 static int pasemi_nand_probe(struct platform_device *ofdev)
 {
+       struct device *dev = &ofdev->dev;
        struct pci_dev *pdev;
-       struct device_node *np = ofdev->dev.of_node;
+       struct device_node *np = dev->of_node;
        struct resource res;
        struct nand_chip *chip;
        int err = 0;
@@ -107,13 +108,11 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        if (pasemi_nand_mtd)
                return -ENODEV;
 
-       pr_debug("pasemi_nand at %pR\n", &res);
+       dev_dbg(dev, "pasemi_nand at %pR\n", &res);
 
        /* Allocate memory for MTD device structure and private data */
        chip = kzalloc(sizeof(struct nand_chip), GFP_KERNEL);
        if (!chip) {
-               printk(KERN_WARNING
-                      "Unable to allocate PASEMI NAND MTD device structure\n");
                err = -ENOMEM;
                goto out;
        }
@@ -121,7 +120,7 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        pasemi_nand_mtd = nand_to_mtd(chip);
 
        /* Link the private data with the MTD structure */
-       pasemi_nand_mtd->dev.parent = &ofdev->dev;
+       pasemi_nand_mtd->dev.parent = dev;
 
        chip->IO_ADDR_R = of_iomap(np, 0);
        chip->IO_ADDR_W = chip->IO_ADDR_R;
@@ -151,6 +150,7 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        chip->write_buf = pasemi_write_buf;
        chip->chip_delay = 0;
        chip->ecc.mode = NAND_ECC_SOFT;
+       chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* Enable the following for a flash based bad block table */
        chip->bbt_options = NAND_BBT_USE_FLASH;
@@ -162,13 +162,13 @@ static int pasemi_nand_probe(struct platform_device *ofdev)
        }
 
        if (mtd_device_register(pasemi_nand_mtd, NULL, 0)) {
-               printk(KERN_ERR "pasemi_nand: Unable to register MTD device\n");
+               dev_err(dev, "Unable to register MTD device\n");
                err = -ENODEV;
                goto out_lpc;
        }
 
-       printk(KERN_INFO "PA Semi NAND flash at %08llx, control at I/O %x\n",
-              res.start, lpcctl);
+       dev_info(dev, "PA Semi NAND flash at %pR, control at I/O %x\n", &res,
+                lpcctl);
 
        return 0;
 
index e4e50da30444fc2dc2a4c6d936680f74525dac53..415a53a0deeb306bb5baeaf3f5ad6638fd41d103 100644 (file)
@@ -74,6 +74,7 @@ static int plat_nand_probe(struct platform_device *pdev)
 
        data->chip.ecc.hwctl = pdata->ctrl.hwcontrol;
        data->chip.ecc.mode = NAND_ECC_SOFT;
+       data->chip.ecc.algo = NAND_ECC_HAMMING;
 
        platform_set_drvdata(pdev, data);
 
index d6508856da9937a433947a57dedce0d8fbd9d0e6..436dd6dc11f4a810e838059c3ed0c5254a7cb07e 100644 (file)
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_data/mtd-nand-pxa3xx.h>
 
 #define        CHIP_DELAY_TIMEOUT      msecs_to_jiffies(200)
@@ -324,6 +323,62 @@ static struct pxa3xx_nand_flash builtin_flash_types[] = {
        { 0xba20, 16, 16, &timing[3] },
 };
 
+static int pxa3xx_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct pxa3xx_nand_host *host = nand_get_controller_data(chip);
+       struct pxa3xx_nand_info *info = host->info_data;
+       int nchunks = mtd->writesize / info->chunk_size;
+
+       if (section >= nchunks)
+               return -ERANGE;
+
+       oobregion->offset = ((info->ecc_size + info->spare_size) * section) +
+                           info->spare_size;
+       oobregion->length = info->ecc_size;
+
+       return 0;
+}
+
+static int pxa3xx_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct pxa3xx_nand_host *host = nand_get_controller_data(chip);
+       struct pxa3xx_nand_info *info = host->info_data;
+       int nchunks = mtd->writesize / info->chunk_size;
+
+       if (section >= nchunks)
+               return -ERANGE;
+
+       if (!info->spare_size)
+               return 0;
+
+       oobregion->offset = section * (info->ecc_size + info->spare_size);
+       oobregion->length = info->spare_size;
+       if (!section) {
+               /*
+                * Bootrom looks in bytes 0 & 5 for bad blocks for the
+                * 4KB page / 4bit BCH combination.
+                */
+               if (mtd->writesize == 4096 && info->chunk_size == 2048) {
+                       oobregion->offset += 6;
+                       oobregion->length -= 6;
+               } else {
+                       oobregion->offset += 2;
+                       oobregion->length -= 2;
+               }
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops pxa3xx_ooblayout_ops = {
+       .ecc = pxa3xx_ooblayout_ecc,
+       .free = pxa3xx_ooblayout_free,
+};
+
 static u8 bbt_pattern[] = {'M', 'V', 'B', 'b', 't', '0' };
 static u8 bbt_mirror_pattern[] = {'1', 't', 'b', 'B', 'V', 'M' };
 
@@ -347,41 +402,6 @@ static struct nand_bbt_descr bbt_mirror_descr = {
        .pattern = bbt_mirror_pattern
 };
 
-static struct nand_ecclayout ecc_layout_2KB_bch4bit = {
-       .eccbytes = 32,
-       .eccpos = {
-               32, 33, 34, 35, 36, 37, 38, 39,
-               40, 41, 42, 43, 44, 45, 46, 47,
-               48, 49, 50, 51, 52, 53, 54, 55,
-               56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = { {2, 30} }
-};
-
-static struct nand_ecclayout ecc_layout_4KB_bch4bit = {
-       .eccbytes = 64,
-       .eccpos = {
-               32,  33,  34,  35,  36,  37,  38,  39,
-               40,  41,  42,  43,  44,  45,  46,  47,
-               48,  49,  50,  51,  52,  53,  54,  55,
-               56,  57,  58,  59,  60,  61,  62,  63,
-               96,  97,  98,  99,  100, 101, 102, 103,
-               104, 105, 106, 107, 108, 109, 110, 111,
-               112, 113, 114, 115, 116, 117, 118, 119,
-               120, 121, 122, 123, 124, 125, 126, 127},
-       /* Bootrom looks in bytes 0 & 5 for bad blocks */
-       .oobfree = { {6, 26}, { 64, 32} }
-};
-
-static struct nand_ecclayout ecc_layout_4KB_bch8bit = {
-       .eccbytes = 128,
-       .eccpos = {
-               32,  33,  34,  35,  36,  37,  38,  39,
-               40,  41,  42,  43,  44,  45,  46,  47,
-               48,  49,  50,  51,  52,  53,  54,  55,
-               56,  57,  58,  59,  60,  61,  62,  63},
-       .oobfree = { }
-};
-
 #define NDTR0_tCH(c)   (min((c), 7) << 19)
 #define NDTR0_tCS(c)   (min((c), 7) << 16)
 #define NDTR0_tWH(c)   (min((c), 7) << 11)
@@ -1546,9 +1566,12 @@ static void pxa3xx_nand_free_buff(struct pxa3xx_nand_info *info)
 }
 
 static int pxa_ecc_init(struct pxa3xx_nand_info *info,
-                       struct nand_ecc_ctrl *ecc,
+                       struct mtd_info *mtd,
                        int strength, int ecc_stepsize, int page_size)
 {
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+
        if (strength == 1 && ecc_stepsize == 512 && page_size == 2048) {
                info->nfullchunks = 1;
                info->ntotalchunks = 1;
@@ -1582,7 +1605,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_2KB_bch4bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
 
        } else if (strength == 4 && ecc_stepsize == 512 && page_size == 4096) {
@@ -1594,7 +1617,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_4KB_bch4bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
 
        /*
@@ -1612,7 +1635,7 @@ static int pxa_ecc_init(struct pxa3xx_nand_info *info,
                info->ecc_size = 32;
                ecc->mode = NAND_ECC_HW;
                ecc->size = info->chunk_size;
-               ecc->layout = &ecc_layout_4KB_bch8bit;
+               mtd_set_ooblayout(mtd, &pxa3xx_ooblayout_ops);
                ecc->strength = 16;
        } else {
                dev_err(&info->pdev->dev,
@@ -1651,6 +1674,12 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
        if (info->variant == PXA3XX_NAND_VARIANT_ARMADA370)
                nand_writel(info, NDECCCTRL, 0x0);
 
+       if (pdata->flash_bbt)
+               chip->bbt_options |= NAND_BBT_USE_FLASH;
+
+       chip->ecc.strength = pdata->ecc_strength;
+       chip->ecc.size = pdata->ecc_step_size;
+
        if (nand_scan_ident(mtd, 1, NULL))
                return -ENODEV;
 
@@ -1663,13 +1692,12 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                }
        }
 
-       if (pdata->flash_bbt) {
+       if (chip->bbt_options & NAND_BBT_USE_FLASH) {
                /*
                 * We'll use a bad block table stored in-flash and don't
                 * allow writing the bad block marker to the flash.
                 */
-               chip->bbt_options |= NAND_BBT_USE_FLASH |
-                                    NAND_BBT_NO_OOB_BBM;
+               chip->bbt_options |= NAND_BBT_NO_OOB_BBM;
                chip->bbt_td = &bbt_main_descr;
                chip->bbt_md = &bbt_mirror_descr;
        }
@@ -1689,10 +1717,9 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                }
        }
 
-       if (pdata->ecc_strength && pdata->ecc_step_size) {
-               ecc_strength = pdata->ecc_strength;
-               ecc_step = pdata->ecc_step_size;
-       } else {
+       ecc_strength = chip->ecc.strength;
+       ecc_step = chip->ecc.size;
+       if (!ecc_strength || !ecc_step) {
                ecc_strength = chip->ecc_strength_ds;
                ecc_step = chip->ecc_step_ds;
        }
@@ -1703,7 +1730,7 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
                ecc_step = 512;
        }
 
-       ret = pxa_ecc_init(info, &chip->ecc, ecc_strength,
+       ret = pxa_ecc_init(info, mtd, ecc_strength,
                           ecc_step, mtd->writesize);
        if (ret)
                return ret;
@@ -1903,15 +1930,6 @@ static int pxa3xx_nand_probe_dt(struct platform_device *pdev)
        if (of_get_property(np, "marvell,nand-keep-config", NULL))
                pdata->keep_config = 1;
        of_property_read_u32(np, "num-cs", &pdata->num_cs);
-       pdata->flash_bbt = of_get_nand_on_flash_bbt(np);
-
-       pdata->ecc_strength = of_get_nand_ecc_strength(np);
-       if (pdata->ecc_strength < 0)
-               pdata->ecc_strength = 0;
-
-       pdata->ecc_step_size = of_get_nand_ecc_step_size(np);
-       if (pdata->ecc_step_size < 0)
-               pdata->ecc_step_size = 0;
 
        pdev->dev.platform_data = pdata;
 
index f550a57e6eea237dacc7b3ca3ceda3914d8a481b..de7d28e62d4e5aeee107894b8f9a11c66f63ed66 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/mtd/partitions.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/delay.h>
 
 /* NANDc reg offsets */
@@ -1437,7 +1436,6 @@ static int qcom_nandc_write_oob(struct mtd_info *mtd, struct nand_chip *chip,
        struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
        u8 *oob = chip->oob_poi;
-       int free_boff;
        int data_size, oob_size;
        int ret, status = 0;
 
@@ -1451,12 +1449,11 @@ static int qcom_nandc_write_oob(struct mtd_info *mtd, struct nand_chip *chip,
 
        /* calculate the data and oob size for the last codeword/step */
        data_size = ecc->size - ((ecc->steps - 1) << 2);
-       oob_size = ecc->steps << 2;
-
-       free_boff = ecc->layout->oobfree[0].offset;
+       oob_size = mtd->oobavail;
 
        /* override new oob content to last codeword */
-       memcpy(nandc->data_buffer + data_size, oob + free_boff, oob_size);
+       mtd_ooblayout_get_databytes(mtd, nandc->data_buffer + data_size, oob,
+                                   0, mtd->oobavail);
 
        set_address(host, host->cw_size * (ecc->steps - 1), page);
        update_rw_regs(host, 1, false);
@@ -1710,61 +1707,52 @@ static void qcom_nandc_select_chip(struct mtd_info *mtd, int chipnr)
  * This layout is read as is when ECC is disabled. When ECC is enabled, the
  * inaccessible Bad Block byte(s) are ignored when we write to a page/oob,
  * and assumed as 0xffs when we read a page/oob. The ECC, unused and
- * dummy/real bad block bytes are grouped as ecc bytes in nand_ecclayout (i.e,
- * ecc->bytes is the sum of the three).
+ * dummy/real bad block bytes are grouped as ecc bytes (i.e, ecc->bytes is
+ * the sum of the three).
  */
-
-static struct nand_ecclayout *
-qcom_nand_create_layout(struct qcom_nand_host *host)
+static int qcom_nand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                  struct mtd_oob_region *oobregion)
 {
-       struct nand_chip *chip = &host->chip;
-       struct mtd_info *mtd = nand_to_mtd(chip);
-       struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct qcom_nand_host *host = to_qcom_nand_host(chip);
        struct nand_ecc_ctrl *ecc = &chip->ecc;
-       struct nand_ecclayout *layout;
-       int i, j, steps, pos = 0, shift = 0;
 
-       layout = devm_kzalloc(nandc->dev, sizeof(*layout), GFP_KERNEL);
-       if (!layout)
-               return NULL;
-
-       steps = mtd->writesize / ecc->size;
-       layout->eccbytes = steps * ecc->bytes;
+       if (section > 1)
+               return -ERANGE;
 
-       layout->oobfree[0].offset = (steps - 1) * ecc->bytes + host->bbm_size;
-       layout->oobfree[0].length = steps << 2;
-
-       /*
-        * the oob bytes in the first n - 1 codewords are all grouped together
-        * in the format:
-        * DUMMY_BBM + UNUSED + ECC
-        */
-       for (i = 0; i < steps - 1; i++) {
-               for (j = 0; j < ecc->bytes; j++)
-                       layout->eccpos[pos++] = i * ecc->bytes + j;
+       if (!section) {
+               oobregion->length = (ecc->bytes * (ecc->steps - 1)) +
+                                   host->bbm_size;
+               oobregion->offset = 0;
+       } else {
+               oobregion->length = host->ecc_bytes_hw + host->spare_bytes;
+               oobregion->offset = mtd->oobsize - oobregion->length;
        }
 
-       /*
-        * the oob bytes in the last codeword are grouped in the format:
-        * BBM + FREE OOB + UNUSED + ECC
-        */
+       return 0;
+}
 
-       /* fill up the bbm positions */
-       for (j = 0; j < host->bbm_size; j++)
-               layout->eccpos[pos++] = i * ecc->bytes + j;
+static int qcom_nand_ooblayout_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+       struct qcom_nand_host *host = to_qcom_nand_host(chip);
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
 
-       /*
-        * fill up the ecc and reserved positions, their indices are offseted
-        * by the free oob region
-        */
-       shift = layout->oobfree[0].length + host->bbm_size;
+       if (section)
+               return -ERANGE;
 
-       for (j = 0; j < (host->ecc_bytes_hw + host->spare_bytes); j++)
-               layout->eccpos[pos++] = i * ecc->bytes + shift + j;
+       oobregion->length = ecc->steps * 4;
+       oobregion->offset = ((ecc->steps - 1) * ecc->bytes) + host->bbm_size;
 
-       return layout;
+       return 0;
 }
 
+static const struct mtd_ooblayout_ops qcom_nand_ooblayout_ops = {
+       .ecc = qcom_nand_ooblayout_ecc,
+       .free = qcom_nand_ooblayout_free,
+};
+
 static int qcom_nand_host_setup(struct qcom_nand_host *host)
 {
        struct nand_chip *chip = &host->chip;
@@ -1851,9 +1839,7 @@ static int qcom_nand_host_setup(struct qcom_nand_host *host)
 
        ecc->mode = NAND_ECC_HW;
 
-       ecc->layout = qcom_nand_create_layout(host);
-       if (!ecc->layout)
-               return -ENOMEM;
+       mtd_set_ooblayout(mtd, &qcom_nand_ooblayout_ops);
 
        cwperpage = mtd->writesize / ecc->size;
 
index 9c9397b54b2ca9a826e27fc7625108100ae2fb87..d9309cf0ce2ec3cd6b76f3d61c4fb0433d32b213 100644 (file)
 
 /* new oob placement block for use with hardware ecc generation
  */
+static int s3c2410_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = 3;
+
+       return 0;
+}
+
+static int s3c2410_ooblayout_free(struct mtd_info *mtd, int section,
+                                 struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 8;
+       oobregion->length = 8;
+
+       return 0;
+}
 
-static struct nand_ecclayout nand_hw_eccoob = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {{8, 8}}
+static const struct mtd_ooblayout_ops s3c2410_ooblayout_ops = {
+       .ecc = s3c2410_ooblayout_ecc,
+       .free = s3c2410_ooblayout_free,
 };
 
 /* controller and mtd information */
@@ -542,7 +564,8 @@ static int s3c2410_nand_correct_data(struct mtd_info *mtd, u_char *dat,
        diff0 |= (diff1 << 8);
        diff0 |= (diff2 << 16);
 
-       if ((diff0 & ~(1<<fls(diff0))) == 0)
+       /* equal to "(diff0 & ~(1 << __ffs(diff0)))" */
+       if ((diff0 & (diff0 - 1)) == 0)
                return 1;
 
        return -1;
@@ -859,6 +882,7 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info,
        }
 #else
        chip->ecc.mode      = NAND_ECC_SOFT;
+       chip->ecc.algo  = NAND_ECC_HAMMING;
 #endif
 
        if (set->disable_ecc)
@@ -919,7 +943,7 @@ static void s3c2410_nand_update_chip(struct s3c2410_nand_info *info,
        } else {
                chip->ecc.size      = 512;
                chip->ecc.bytes     = 3;
-               chip->ecc.layout    = &nand_hw_eccoob;
+               mtd_set_ooblayout(nand_to_mtd(chip), &s3c2410_ooblayout_ops);
        }
 }
 
index 4814402902f96df8ff10e370c34aefdde8dcf4f1..6fa3bcd59769946f7b342ed97eebe51e625d5b0a 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/of_mtd.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/sh_dma.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/sh_flctl.h>
 
-static struct nand_ecclayout flctl_4secc_oob_16 = {
-       .eccbytes = 10,
-       .eccpos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
-       .oobfree = {
-               {.offset = 12,
-               . length = 4} },
+static int flctl_4secc_ooblayout_sp_ecc(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 0;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int flctl_4secc_ooblayout_sp_free(struct mtd_info *mtd, int section,
+                                        struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->offset = 12;
+       oobregion->length = 4;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops flctl_4secc_oob_smallpage_ops = {
+       .ecc = flctl_4secc_ooblayout_sp_ecc,
+       .free = flctl_4secc_ooblayout_sp_free,
 };
 
-static struct nand_ecclayout flctl_4secc_oob_64 = {
-       .eccbytes = 4 * 10,
-       .eccpos = {
-                6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-               22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-               38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-       .oobfree = {
-               {.offset =  2, .length = 4},
-               {.offset = 16, .length = 6},
-               {.offset = 32, .length = 6},
-               {.offset = 48, .length = 6} },
+static int flctl_4secc_ooblayout_lp_ecc(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 6;
+       oobregion->length = chip->ecc.bytes;
+
+       return 0;
+}
+
+static int flctl_4secc_ooblayout_lp_free(struct mtd_info *mtd, int section,
+                                        struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *chip = mtd_to_nand(mtd);
+
+       if (section >= chip->ecc.steps)
+               return -ERANGE;
+
+       oobregion->offset = section * 16;
+       oobregion->length = 6;
+
+       if (!section) {
+               oobregion->offset += 2;
+               oobregion->length -= 2;
+       }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops flctl_4secc_oob_largepage_ops = {
+       .ecc = flctl_4secc_ooblayout_lp_ecc,
+       .free = flctl_4secc_ooblayout_lp_free,
 };
 
 static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
@@ -987,10 +1033,10 @@ static int flctl_chip_init_tail(struct mtd_info *mtd)
 
        if (flctl->hwecc) {
                if (mtd->writesize == 512) {
-                       chip->ecc.layout = &flctl_4secc_oob_16;
+                       mtd_set_ooblayout(mtd, &flctl_4secc_oob_smallpage_ops);
                        chip->badblock_pattern = &flctl_4secc_smallpage;
                } else {
-                       chip->ecc.layout = &flctl_4secc_oob_64;
+                       mtd_set_ooblayout(mtd, &flctl_4secc_oob_largepage_ops);
                        chip->badblock_pattern = &flctl_4secc_largepage;
                }
 
@@ -1005,6 +1051,7 @@ static int flctl_chip_init_tail(struct mtd_info *mtd)
                flctl->flcmncr_base |= _4ECCEN;
        } else {
                chip->ecc.mode = NAND_ECC_SOFT;
+               chip->ecc.algo = NAND_ECC_HAMMING;
        }
 
        return 0;
@@ -1044,8 +1091,6 @@ static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
        const struct of_device_id *match;
        struct flctl_soc_config *config;
        struct sh_flctl_platform_data *pdata;
-       struct device_node *dn = dev->of_node;
-       int ret;
 
        match = of_match_device(of_flctl_match, dev);
        if (match)
@@ -1065,15 +1110,6 @@ static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
        pdata->has_hwecc = config->has_hwecc;
        pdata->use_holden = config->use_holden;
 
-       /* parse user defined options */
-       ret = of_get_nand_bus_width(dn);
-       if (ret == 16)
-               pdata->flcmncr_val |= SEL_16BIT;
-       else if (ret != 8) {
-               dev_err(dev, "%s: invalid bus width\n", __func__);
-               return NULL;
-       }
-
        return pdata;
 }
 
@@ -1136,15 +1172,14 @@ static int flctl_probe(struct platform_device *pdev)
        nand->chip_delay = 20;
 
        nand->read_byte = flctl_read_byte;
+       nand->read_word = flctl_read_word;
        nand->write_buf = flctl_write_buf;
        nand->read_buf = flctl_read_buf;
        nand->select_chip = flctl_select_chip;
        nand->cmdfunc = flctl_cmdfunc;
 
-       if (pdata->flcmncr_val & SEL_16BIT) {
+       if (pdata->flcmncr_val & SEL_16BIT)
                nand->options |= NAND_BUSWIDTH_16;
-               nand->read_word = flctl_read_word;
-       }
 
        pm_runtime_enable(&pdev->dev);
        pm_runtime_resume(&pdev->dev);
@@ -1155,6 +1190,16 @@ static int flctl_probe(struct platform_device *pdev)
        if (ret)
                goto err_chip;
 
+       if (nand->options & NAND_BUSWIDTH_16) {
+               /*
+                * NAND_BUSWIDTH_16 may have been set by nand_scan_ident().
+                * Add the SEL_16BIT flag in pdata->flcmncr_val and re-assign
+                * flctl->flcmncr_base to pdata->flcmncr_val.
+                */
+               pdata->flcmncr_val |= SEL_16BIT;
+               flctl->flcmncr_base = pdata->flcmncr_val;
+       }
+
        ret = flctl_chip_init_tail(flctl_mtd);
        if (ret)
                goto err_chip;
index b7d1b55a160b4ebbbbc0fb52d80d8ee99da6612d..064ca1757589ac8d8cba8b536f72ff30632efcfe 100644 (file)
@@ -148,6 +148,7 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
        /* Link the private data with the MTD structure */
        mtd = nand_to_mtd(this);
        mtd->dev.parent = &pdev->dev;
+       mtd_set_ooblayout(mtd, data->ecc_layout);
 
        platform_set_drvdata(pdev, sharpsl);
 
@@ -170,7 +171,6 @@ static int sharpsl_nand_probe(struct platform_device *pdev)
        this->ecc.bytes = 3;
        this->ecc.strength = 1;
        this->badblock_pattern = data->badblock_pattern;
-       this->ecc.layout = data->ecc_layout;
        this->ecc.hwctl = sharpsl_nand_enable_hwecc;
        this->ecc.calculate = sharpsl_nand_calculate_ecc;
        this->ecc.correct = nand_correct_data;
index c514740f9a8330449af7a453adc5565d086933e2..5939dff253c28a65ae7bc40705c9a9874e58356f 100644 (file)
 #include <linux/sizes.h>
 #include "sm_common.h"
 
-static struct nand_ecclayout nand_oob_sm = {
-       .eccbytes = 6,
-       .eccpos = {8, 9, 10, 13, 14, 15},
-       .oobfree = {
-               {.offset = 0 , .length = 4}, /* reserved */
-               {.offset = 6 , .length = 2}, /* LBA1 */
-               {.offset = 11, .length = 2}  /* LBA2 */
+static int oob_sm_ooblayout_ecc(struct mtd_info *mtd, int section,
+                               struct mtd_oob_region *oobregion)
+{
+       if (section > 1)
+               return -ERANGE;
+
+       oobregion->length = 3;
+       oobregion->offset = ((section + 1) * 8) - 3;
+
+       return 0;
+}
+
+static int oob_sm_ooblayout_free(struct mtd_info *mtd, int section,
+                                struct mtd_oob_region *oobregion)
+{
+       switch (section) {
+       case 0:
+               /* reserved */
+               oobregion->offset = 0;
+               oobregion->length = 4;
+               break;
+       case 1:
+               /* LBA1 */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       case 2:
+               /* LBA2 */
+               oobregion->offset = 11;
+               oobregion->length = 2;
+               break;
+       default:
+               return -ERANGE;
        }
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops oob_sm_ops = {
+       .ecc = oob_sm_ooblayout_ecc,
+       .free = oob_sm_ooblayout_free,
 };
 
 /* NOTE: This layout is is not compatabable with SmartMedia, */
@@ -28,15 +61,43 @@ static struct nand_ecclayout nand_oob_sm = {
 /* If you use smftl, it will bypass this and work correctly */
 /* If you not, then you break SmartMedia compliance anyway */
 
-static struct nand_ecclayout nand_oob_sm_small = {
-       .eccbytes = 3,
-       .eccpos = {0, 1, 2},
-       .oobfree = {
-               {.offset = 3 , .length = 2}, /* reserved */
-               {.offset = 6 , .length = 2}, /* LBA1 */
+static int oob_sm_small_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section)
+               return -ERANGE;
+
+       oobregion->length = 3;
+       oobregion->offset = 0;
+
+       return 0;
+}
+
+static int oob_sm_small_ooblayout_free(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{
+       switch (section) {
+       case 0:
+               /* reserved */
+               oobregion->offset = 3;
+               oobregion->length = 2;
+               break;
+       case 1:
+               /* LBA1 */
+               oobregion->offset = 6;
+               oobregion->length = 2;
+               break;
+       default:
+               return -ERANGE;
        }
-};
 
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops oob_sm_small_ops = {
+       .ecc = oob_sm_small_ooblayout_ecc,
+       .free = oob_sm_small_ooblayout_free,
+};
 
 static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs)
 {
@@ -121,9 +182,9 @@ int sm_register_device(struct mtd_info *mtd, int smartmedia)
 
        /* ECC layout */
        if (mtd->writesize == SM_SECTOR_SIZE)
-               chip->ecc.layout = &nand_oob_sm;
+               mtd_set_ooblayout(mtd, &oob_sm_ops);
        else if (mtd->writesize == SM_SMALL_PAGE)
-               chip->ecc.layout = &nand_oob_sm_small;
+               mtd_set_ooblayout(mtd, &oob_sm_small_ops);
        else
                return -ENODEV;
 
index e3305f9dd6fb97808abcb4b4dc6cb17db4ba994f..888fd314c62a234b7a43e3c922a3d4dbb14e36c9 100644 (file)
@@ -180,6 +180,7 @@ static int socrates_nand_probe(struct platform_device *ofdev)
        nand_chip->dev_ready = socrates_nand_device_ready;
 
        nand_chip->ecc.mode = NAND_ECC_SOFT;    /* enable ECC */
+       nand_chip->ecc.algo = NAND_ECC_HAMMING;
 
        /* TODO: I have no idea what real delay is. */
        nand_chip->chip_delay = 20;             /* 20us command delay time */
index 1c03eee44f3d6fe8ab02d8e6255f7c68fe42ece0..a83a690688b45067a318ef14ed97970c588bb018 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
-#include <linux/of_mtd.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
@@ -39,7 +38,7 @@
 #include <linux/dmaengine.h>
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
-#include <linux/io.h>
+#include <linux/iopoll.h>
 
 #define NFC_REG_CTL            0x0000
 #define NFC_REG_ST             0x0004
 /* define bit use in NFC_ECC_ST */
 #define NFC_ECC_ERR(x)         BIT(x)
 #define NFC_ECC_PAT_FOUND(x)   BIT(x + 16)
-#define NFC_ECC_ERR_CNT(b, x)  (((x) >> ((b) * 8)) & 0xff)
+#define NFC_ECC_ERR_CNT(b, x)  (((x) >> (((b) % 4) * 8)) & 0xff)
 
 #define NFC_DEFAULT_TIMEOUT_MS 1000
 
@@ -212,12 +211,9 @@ struct sunxi_nand_chip_sel {
  * sunxi HW ECC infos: stores information related to HW ECC support
  *
  * @mode:      the sunxi ECC mode field deduced from ECC requirements
- * @layout:    the OOB layout depending on the ECC requirements and the
- *             selected ECC mode
  */
 struct sunxi_nand_hw_ecc {
        int mode;
-       struct nand_ecclayout layout;
 };
 
 /*
@@ -239,6 +235,10 @@ struct sunxi_nand_chip {
        u32 timing_cfg;
        u32 timing_ctl;
        int selected;
+       int addr_cycles;
+       u32 addr[2];
+       int cmd_cycles;
+       u8 cmd[2];
        int nsels;
        struct sunxi_nand_chip_sel sels[0];
 };
@@ -298,54 +298,71 @@ static irqreturn_t sunxi_nfc_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static int sunxi_nfc_wait_int(struct sunxi_nfc *nfc, u32 flags,
-                             unsigned int timeout_ms)
+static int sunxi_nfc_wait_events(struct sunxi_nfc *nfc, u32 events,
+                                bool use_polling, unsigned int timeout_ms)
 {
-       init_completion(&nfc->complete);
+       int ret;
 
-       writel(flags, nfc->regs + NFC_REG_INT);
+       if (events & ~NFC_INT_MASK)
+               return -EINVAL;
 
        if (!timeout_ms)
                timeout_ms = NFC_DEFAULT_TIMEOUT_MS;
 
-       if (!wait_for_completion_timeout(&nfc->complete,
-                                        msecs_to_jiffies(timeout_ms))) {
-               dev_err(nfc->dev, "wait interrupt timedout\n");
-               return -ETIMEDOUT;
+       if (!use_polling) {
+               init_completion(&nfc->complete);
+
+               writel(events, nfc->regs + NFC_REG_INT);
+
+               ret = wait_for_completion_timeout(&nfc->complete,
+                                               msecs_to_jiffies(timeout_ms));
+
+               writel(0, nfc->regs + NFC_REG_INT);
+       } else {
+               u32 status;
+
+               ret = readl_poll_timeout(nfc->regs + NFC_REG_ST, status,
+                                        (status & events) == events, 1,
+                                        timeout_ms * 1000);
        }
 
-       return 0;
+       writel(events & NFC_INT_MASK, nfc->regs + NFC_REG_ST);
+
+       if (ret)
+               dev_err(nfc->dev, "wait interrupt timedout\n");
+
+       return ret;
 }
 
 static int sunxi_nfc_wait_cmd_fifo_empty(struct sunxi_nfc *nfc)
 {
-       unsigned long timeout = jiffies +
-                               msecs_to_jiffies(NFC_DEFAULT_TIMEOUT_MS);
+       u32 status;
+       int ret;
 
-       do {
-               if (!(readl(nfc->regs + NFC_REG_ST) & NFC_CMD_FIFO_STATUS))
-                       return 0;
-       } while (time_before(jiffies, timeout));
+       ret = readl_poll_timeout(nfc->regs + NFC_REG_ST, status,
+                                !(status & NFC_CMD_FIFO_STATUS), 1,
+                                NFC_DEFAULT_TIMEOUT_MS * 1000);
+       if (ret)
+               dev_err(nfc->dev, "wait for empty cmd FIFO timedout\n");
 
-       dev_err(nfc->dev, "wait for empty cmd FIFO timedout\n");
-       return -ETIMEDOUT;
+       return ret;
 }
 
 static int sunxi_nfc_rst(struct sunxi_nfc *nfc)
 {
-       unsigned long timeout = jiffies +
-                               msecs_to_jiffies(NFC_DEFAULT_TIMEOUT_MS);
+       u32 ctl;
+       int ret;
 
        writel(0, nfc->regs + NFC_REG_ECC_CTL);
        writel(NFC_RESET, nfc->regs + NFC_REG_CTL);
 
-       do {
-               if (!(readl(nfc->regs + NFC_REG_CTL) & NFC_RESET))
-                       return 0;
-       } while (time_before(jiffies, timeout));
+       ret = readl_poll_timeout(nfc->regs + NFC_REG_CTL, ctl,
+                                !(ctl & NFC_RESET), 1,
+                                NFC_DEFAULT_TIMEOUT_MS * 1000);
+       if (ret)
+               dev_err(nfc->dev, "wait for NAND controller reset timedout\n");
 
-       dev_err(nfc->dev, "wait for NAND controller reset timedout\n");
-       return -ETIMEDOUT;
+       return ret;
 }
 
 static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
@@ -354,7 +371,6 @@ static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        struct sunxi_nand_rb *rb;
-       unsigned long timeo = (sunxi_nand->nand.state == FL_ERASING ? 400 : 20);
        int ret;
 
        if (sunxi_nand->selected < 0)
@@ -364,12 +380,6 @@ static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
 
        switch (rb->type) {
        case RB_NATIVE:
-               ret = !!(readl(nfc->regs + NFC_REG_ST) &
-                        NFC_RB_STATE(rb->info.nativeid));
-               if (ret)
-                       break;
-
-               sunxi_nfc_wait_int(nfc, NFC_RB_B2R, timeo);
                ret = !!(readl(nfc->regs + NFC_REG_ST) &
                         NFC_RB_STATE(rb->info.nativeid));
                break;
@@ -407,7 +417,7 @@ static void sunxi_nfc_select_chip(struct mtd_info *mtd, int chip)
                sel = &sunxi_nand->sels[chip];
 
                ctl |= NFC_CE_SEL(sel->cs) | NFC_EN |
-                      NFC_PAGE_SHIFT(nand->page_shift - 10);
+                      NFC_PAGE_SHIFT(nand->page_shift);
                if (sel->rb.type == RB_NONE) {
                        nand->dev_ready = NULL;
                } else {
@@ -452,7 +462,7 @@ static void sunxi_nfc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
                tmp = NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD;
                writel(tmp, nfc->regs + NFC_REG_CMD);
 
-               ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+               ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
                if (ret)
                        break;
 
@@ -487,7 +497,7 @@ static void sunxi_nfc_write_buf(struct mtd_info *mtd, const uint8_t *buf,
                      NFC_ACCESS_DIR;
                writel(tmp, nfc->regs + NFC_REG_CMD);
 
-               ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+               ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
                if (ret)
                        break;
 
@@ -511,32 +521,54 @@ static void sunxi_nfc_cmd_ctrl(struct mtd_info *mtd, int dat,
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        int ret;
-       u32 tmp;
 
        ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
        if (ret)
                return;
 
-       if (ctrl & NAND_CTRL_CHANGE) {
-               tmp = readl(nfc->regs + NFC_REG_CTL);
-               if (ctrl & NAND_NCE)
-                       tmp |= NFC_CE_CTL;
-               else
-                       tmp &= ~NFC_CE_CTL;
-               writel(tmp, nfc->regs + NFC_REG_CTL);
-       }
+       if (dat == NAND_CMD_NONE && (ctrl & NAND_NCE) &&
+           !(ctrl & (NAND_CLE | NAND_ALE))) {
+               u32 cmd = 0;
 
-       if (dat == NAND_CMD_NONE)
-               return;
+               if (!sunxi_nand->addr_cycles && !sunxi_nand->cmd_cycles)
+                       return;
 
-       if (ctrl & NAND_CLE) {
-               writel(NFC_SEND_CMD1 | dat, nfc->regs + NFC_REG_CMD);
-       } else {
-               writel(dat, nfc->regs + NFC_REG_ADDR_LOW);
-               writel(NFC_SEND_ADR, nfc->regs + NFC_REG_CMD);
+               if (sunxi_nand->cmd_cycles--)
+                       cmd |= NFC_SEND_CMD1 | sunxi_nand->cmd[0];
+
+               if (sunxi_nand->cmd_cycles--) {
+                       cmd |= NFC_SEND_CMD2;
+                       writel(sunxi_nand->cmd[1],
+                              nfc->regs + NFC_REG_RCMD_SET);
+               }
+
+               sunxi_nand->cmd_cycles = 0;
+
+               if (sunxi_nand->addr_cycles) {
+                       cmd |= NFC_SEND_ADR |
+                              NFC_ADR_NUM(sunxi_nand->addr_cycles);
+                       writel(sunxi_nand->addr[0],
+                              nfc->regs + NFC_REG_ADDR_LOW);
+               }
+
+               if (sunxi_nand->addr_cycles > 4)
+                       writel(sunxi_nand->addr[1],
+                              nfc->regs + NFC_REG_ADDR_HIGH);
+
+               writel(cmd, nfc->regs + NFC_REG_CMD);
+               sunxi_nand->addr[0] = 0;
+               sunxi_nand->addr[1] = 0;
+               sunxi_nand->addr_cycles = 0;
+               sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        }
 
-       sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       if (ctrl & NAND_CLE) {
+               sunxi_nand->cmd[sunxi_nand->cmd_cycles++] = dat;
+       } else if (ctrl & NAND_ALE) {
+               sunxi_nand->addr[sunxi_nand->addr_cycles / 4] |=
+                               dat << ((sunxi_nand->addr_cycles % 4) * 8);
+               sunxi_nand->addr_cycles++;
+       }
 }
 
 /* These seed values have been extracted from Allwinner's BSP */
@@ -717,7 +749,8 @@ static void sunxi_nfc_hw_ecc_enable(struct mtd_info *mtd)
        ecc_ctl = readl(nfc->regs + NFC_REG_ECC_CTL);
        ecc_ctl &= ~(NFC_ECC_MODE_MSK | NFC_ECC_PIPELINE |
                     NFC_ECC_BLOCK_SIZE_MSK);
-       ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(data->mode) | NFC_ECC_EXCEPTION;
+       ecc_ctl |= NFC_ECC_EN | NFC_ECC_MODE(data->mode) | NFC_ECC_EXCEPTION |
+                  NFC_ECC_PIPELINE;
 
        writel(ecc_ctl, nfc->regs + NFC_REG_ECC_CTL);
 }
@@ -739,18 +772,106 @@ static inline void sunxi_nfc_user_data_to_buf(u32 user_data, u8 *buf)
        buf[3] = user_data >> 24;
 }
 
+static inline u32 sunxi_nfc_buf_to_user_data(const u8 *buf)
+{
+       return buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+}
+
+static void sunxi_nfc_hw_ecc_get_prot_oob_bytes(struct mtd_info *mtd, u8 *oob,
+                                               int step, bool bbm, int page)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+
+       sunxi_nfc_user_data_to_buf(readl(nfc->regs + NFC_REG_USER_DATA(step)),
+                                  oob);
+
+       /* De-randomize the Bad Block Marker. */
+       if (bbm && (nand->options & NAND_NEED_SCRAMBLING))
+               sunxi_nfc_randomize_bbm(mtd, page, oob);
+}
+
+static void sunxi_nfc_hw_ecc_set_prot_oob_bytes(struct mtd_info *mtd,
+                                               const u8 *oob, int step,
+                                               bool bbm, int page)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       u8 user_data[4];
+
+       /* Randomize the Bad Block Marker. */
+       if (bbm && (nand->options & NAND_NEED_SCRAMBLING)) {
+               memcpy(user_data, oob, sizeof(user_data));
+               sunxi_nfc_randomize_bbm(mtd, page, user_data);
+               oob = user_data;
+       }
+
+       writel(sunxi_nfc_buf_to_user_data(oob),
+              nfc->regs + NFC_REG_USER_DATA(step));
+}
+
+static void sunxi_nfc_hw_ecc_update_stats(struct mtd_info *mtd,
+                                         unsigned int *max_bitflips, int ret)
+{
+       if (ret < 0) {
+               mtd->ecc_stats.failed++;
+       } else {
+               mtd->ecc_stats.corrected += ret;
+               *max_bitflips = max_t(unsigned int, *max_bitflips, ret);
+       }
+}
+
+static int sunxi_nfc_hw_ecc_correct(struct mtd_info *mtd, u8 *data, u8 *oob,
+                                   int step, bool *erased)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+       u32 status, tmp;
+
+       *erased = false;
+
+       status = readl(nfc->regs + NFC_REG_ECC_ST);
+
+       if (status & NFC_ECC_ERR(step))
+               return -EBADMSG;
+
+       if (status & NFC_ECC_PAT_FOUND(step)) {
+               u8 pattern;
+
+               if (unlikely(!(readl(nfc->regs + NFC_REG_PAT_ID) & 0x1))) {
+                       pattern = 0x0;
+               } else {
+                       pattern = 0xff;
+                       *erased = true;
+               }
+
+               if (data)
+                       memset(data, pattern, ecc->size);
+
+               if (oob)
+                       memset(oob, pattern, ecc->bytes + 4);
+
+               return 0;
+       }
+
+       tmp = readl(nfc->regs + NFC_REG_ECC_ERR_CNT(step));
+
+       return NFC_ECC_ERR_CNT(step, tmp);
+}
+
 static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
                                       u8 *data, int data_off,
                                       u8 *oob, int oob_off,
                                       int *cur_off,
                                       unsigned int *max_bitflips,
-                                      bool bbm, int page)
+                                      bool bbm, bool oob_required, int page)
 {
        struct nand_chip *nand = mtd_to_nand(mtd);
        struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
        struct nand_ecc_ctrl *ecc = &nand->ecc;
        int raw_mode = 0;
-       u32 status;
+       bool erased;
        int ret;
 
        if (*cur_off != data_off)
@@ -769,34 +890,19 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
        writel(NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD | NFC_ECC_OP,
               nfc->regs + NFC_REG_CMD);
 
-       ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        sunxi_nfc_randomizer_disable(mtd);
        if (ret)
                return ret;
 
        *cur_off = oob_off + ecc->bytes + 4;
 
-       status = readl(nfc->regs + NFC_REG_ECC_ST);
-       if (status & NFC_ECC_PAT_FOUND(0)) {
-               u8 pattern = 0xff;
-
-               if (unlikely(!(readl(nfc->regs + NFC_REG_PAT_ID) & 0x1)))
-                       pattern = 0x0;
-
-               memset(data, pattern, ecc->size);
-               memset(oob, pattern, ecc->bytes + 4);
-
+       ret = sunxi_nfc_hw_ecc_correct(mtd, data, oob_required ? oob : NULL, 0,
+                                      &erased);
+       if (erased)
                return 1;
-       }
-
-       ret = NFC_ECC_ERR_CNT(0, readl(nfc->regs + NFC_REG_ECC_ERR_CNT(0)));
-
-       memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE, ecc->size);
-
-       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
-       sunxi_nfc_randomizer_read_buf(mtd, oob, ecc->bytes + 4, true, page);
 
-       if (status & NFC_ECC_ERR(0)) {
+       if (ret < 0) {
                /*
                 * Re-read the data with the randomizer disabled to identify
                 * bitflips in erased pages.
@@ -804,35 +910,34 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
                if (nand->options & NAND_NEED_SCRAMBLING) {
                        nand->cmdfunc(mtd, NAND_CMD_RNDOUT, data_off, -1);
                        nand->read_buf(mtd, data, ecc->size);
-                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
-                       nand->read_buf(mtd, oob, ecc->bytes + 4);
+               } else {
+                       memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE,
+                                     ecc->size);
                }
 
+               nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
+               nand->read_buf(mtd, oob, ecc->bytes + 4);
+
                ret = nand_check_erased_ecc_chunk(data, ecc->size,
                                                  oob, ecc->bytes + 4,
                                                  NULL, 0, ecc->strength);
                if (ret >= 0)
                        raw_mode = 1;
        } else {
-               /*
-                * The engine protects 4 bytes of OOB data per chunk.
-                * Retrieve the corrected OOB bytes.
-                */
-               sunxi_nfc_user_data_to_buf(readl(nfc->regs + NFC_REG_USER_DATA(0)),
-                                          oob);
+               memcpy_fromio(data, nfc->regs + NFC_RAM0_BASE, ecc->size);
 
-               /* De-randomize the Bad Block Marker. */
-               if (bbm && nand->options & NAND_NEED_SCRAMBLING)
-                       sunxi_nfc_randomize_bbm(mtd, page, oob);
-       }
+               if (oob_required) {
+                       nand->cmdfunc(mtd, NAND_CMD_RNDOUT, oob_off, -1);
+                       sunxi_nfc_randomizer_read_buf(mtd, oob, ecc->bytes + 4,
+                                                     true, page);
 
-       if (ret < 0) {
-               mtd->ecc_stats.failed++;
-       } else {
-               mtd->ecc_stats.corrected += ret;
-               *max_bitflips = max_t(unsigned int, *max_bitflips, ret);
+                       sunxi_nfc_hw_ecc_get_prot_oob_bytes(mtd, oob, 0,
+                                                           bbm, page);
+               }
        }
 
+       sunxi_nfc_hw_ecc_update_stats(mtd, max_bitflips, ret);
+
        return raw_mode;
 }
 
@@ -848,7 +953,7 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
        if (len <= 0)
                return;
 
-       if (*cur_off != offset)
+       if (!cur_off || *cur_off != offset)
                nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
                              offset + mtd->writesize, -1);
 
@@ -858,12 +963,8 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
                sunxi_nfc_randomizer_read_buf(mtd, oob + offset, len,
                                              false, page);
 
-       *cur_off = mtd->oobsize + mtd->writesize;
-}
-
-static inline u32 sunxi_nfc_buf_to_user_data(const u8 *buf)
-{
-       return buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
+       if (cur_off)
+               *cur_off = mtd->oobsize + mtd->writesize;
 }
 
 static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
@@ -882,19 +983,6 @@ static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
 
        sunxi_nfc_randomizer_write_buf(mtd, data, ecc->size, false, page);
 
-       /* Fill OOB data in */
-       if ((nand->options & NAND_NEED_SCRAMBLING) && bbm) {
-               u8 user_data[4];
-
-               memcpy(user_data, oob, 4);
-               sunxi_nfc_randomize_bbm(mtd, page, user_data);
-               writel(sunxi_nfc_buf_to_user_data(user_data),
-                      nfc->regs + NFC_REG_USER_DATA(0));
-       } else {
-               writel(sunxi_nfc_buf_to_user_data(oob),
-                      nfc->regs + NFC_REG_USER_DATA(0));
-       }
-
        if (data_off + ecc->size != oob_off)
                nand->cmdfunc(mtd, NAND_CMD_RNDIN, oob_off, -1);
 
@@ -903,11 +991,13 @@ static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
                return ret;
 
        sunxi_nfc_randomizer_enable(mtd);
+       sunxi_nfc_hw_ecc_set_prot_oob_bytes(mtd, oob, 0, bbm, page);
+
        writel(NFC_DATA_TRANS | NFC_DATA_SWAP_METHOD |
               NFC_ACCESS_DIR | NFC_ECC_OP,
               nfc->regs + NFC_REG_CMD);
 
-       ret = sunxi_nfc_wait_int(nfc, NFC_CMD_INT_FLAG, 0);
+       ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
        sunxi_nfc_randomizer_disable(mtd);
        if (ret)
                return ret;
@@ -929,13 +1019,14 @@ static void sunxi_nfc_hw_ecc_write_extra_oob(struct mtd_info *mtd,
        if (len <= 0)
                return;
 
-       if (*cur_off != offset)
+       if (!cur_off || *cur_off != offset)
                nand->cmdfunc(mtd, NAND_CMD_RNDIN,
                              offset + mtd->writesize, -1);
 
        sunxi_nfc_randomizer_write_buf(mtd, oob + offset, len, false, page);
 
-       *cur_off = mtd->oobsize + mtd->writesize;
+       if (cur_off)
+               *cur_off = mtd->oobsize + mtd->writesize;
 }
 
 static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
@@ -958,7 +1049,7 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
                ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off, oob,
                                                  oob_off + mtd->writesize,
                                                  &cur_off, &max_bitflips,
-                                                 !i, page);
+                                                 !i, oob_required, page);
                if (ret < 0)
                        return ret;
                else if (ret)
@@ -974,6 +1065,39 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
        return max_bitflips;
 }
 
+static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
+                                        struct nand_chip *chip,
+                                        u32 data_offs, u32 readlen,
+                                        u8 *bufpoi, int page)
+{
+       struct nand_ecc_ctrl *ecc = &chip->ecc;
+       int ret, i, cur_off = 0;
+       unsigned int max_bitflips = 0;
+
+       sunxi_nfc_hw_ecc_enable(mtd);
+
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+       for (i = data_offs / ecc->size;
+            i < DIV_ROUND_UP(data_offs + readlen, ecc->size); i++) {
+               int data_off = i * ecc->size;
+               int oob_off = i * (ecc->bytes + 4);
+               u8 *data = bufpoi + data_off;
+               u8 *oob = chip->oob_poi + oob_off;
+
+               ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off,
+                                                 oob,
+                                                 oob_off + mtd->writesize,
+                                                 &cur_off, &max_bitflips, !i,
+                                                 false, page);
+               if (ret < 0)
+                       return ret;
+       }
+
+       sunxi_nfc_hw_ecc_disable(mtd);
+
+       return max_bitflips;
+}
+
 static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
                                       struct nand_chip *chip,
                                       const uint8_t *buf, int oob_required,
@@ -1026,7 +1150,9 @@ static int sunxi_nfc_hw_syndrome_ecc_read_page(struct mtd_info *mtd,
 
                ret = sunxi_nfc_hw_ecc_read_chunk(mtd, data, data_off, oob,
                                                  oob_off, &cur_off,
-                                                 &max_bitflips, !i, page);
+                                                 &max_bitflips, !i,
+                                                 oob_required,
+                                                 page);
                if (ret < 0)
                        return ret;
                else if (ret)
@@ -1074,6 +1200,40 @@ static int sunxi_nfc_hw_syndrome_ecc_write_page(struct mtd_info *mtd,
        return 0;
 }
 
+static int sunxi_nfc_hw_common_ecc_read_oob(struct mtd_info *mtd,
+                                           struct nand_chip *chip,
+                                           int page)
+{
+       chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+       chip->pagebuf = -1;
+
+       return chip->ecc.read_page(mtd, chip, chip->buffers->databuf, 1, page);
+}
+
+static int sunxi_nfc_hw_common_ecc_write_oob(struct mtd_info *mtd,
+                                            struct nand_chip *chip,
+                                            int page)
+{
+       int ret, status;
+
+       chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0, page);
+
+       chip->pagebuf = -1;
+
+       memset(chip->buffers->databuf, 0xff, mtd->writesize);
+       ret = chip->ecc.write_page(mtd, chip, chip->buffers->databuf, 1, page);
+       if (ret)
+               return ret;
+
+       /* Send command to program the OOB data */
+       chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+
+       status = chip->waitfunc(mtd, chip);
+
+       return status & NAND_STATUS_FAIL ? -EIO : 0;
+}
+
 static const s32 tWB_lut[] = {6, 12, 16, 20};
 static const s32 tRHW_lut[] = {4, 8, 12, 20};
 
@@ -1101,6 +1261,7 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
        struct sunxi_nfc *nfc = to_sunxi_nfc(chip->nand.controller);
        u32 min_clk_period = 0;
        s32 tWB, tADL, tWHR, tRHW, tCAD;
+       long real_clk_rate;
 
        /* T1 <=> tCLS */
        if (timings->tCLS_min > min_clk_period)
@@ -1163,6 +1324,18 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
                min_clk_period = DIV_ROUND_UP(timings->tWC_min, 2);
 
        /* T16 - T19 + tCAD */
+       if (timings->tWB_max > (min_clk_period * 20))
+               min_clk_period = DIV_ROUND_UP(timings->tWB_max, 20);
+
+       if (timings->tADL_min > (min_clk_period * 32))
+               min_clk_period = DIV_ROUND_UP(timings->tADL_min, 32);
+
+       if (timings->tWHR_min > (min_clk_period * 32))
+               min_clk_period = DIV_ROUND_UP(timings->tWHR_min, 32);
+
+       if (timings->tRHW_min > (min_clk_period * 20))
+               min_clk_period = DIV_ROUND_UP(timings->tRHW_min, 20);
+
        tWB  = sunxi_nand_lookup_timing(tWB_lut, timings->tWB_max,
                                        min_clk_period);
        if (tWB < 0) {
@@ -1198,23 +1371,26 @@ static int sunxi_nand_chip_set_timings(struct sunxi_nand_chip *chip,
        /* TODO: A83 has some more bits for CDQSS, CS, CLHZ, CCS, WC */
        chip->timing_cfg = NFC_TIMING_CFG(tWB, tADL, tWHR, tRHW, tCAD);
 
-       /*
-        * ONFI specification 3.1, paragraph 4.15.2 dictates that EDO data
-        * output cycle timings shall be used if the host drives tRC less than
-        * 30 ns.
-        */
-       chip->timing_ctl = (timings->tRC_min < 30000) ? NFC_TIMING_CTL_EDO : 0;
-
        /* Convert min_clk_period from picoseconds to nanoseconds */
        min_clk_period = DIV_ROUND_UP(min_clk_period, 1000);
 
        /*
-        * Convert min_clk_period into a clk frequency, then get the
-        * appropriate rate for the NAND controller IP given this formula
-        * (specified in the datasheet):
-        * nand clk_rate = 2 * min_clk_rate
+        * Unlike what is stated in Allwinner datasheet, the clk_rate should
+        * be set to (1 / min_clk_period), and not (2 / min_clk_period).
+        * This new formula was verified with a scope and validated by
+        * Allwinner engineers.
         */
-       chip->clk_rate = (2 * NSEC_PER_SEC) / min_clk_period;
+       chip->clk_rate = NSEC_PER_SEC / min_clk_period;
+       real_clk_rate = clk_round_rate(nfc->mod_clk, chip->clk_rate);
+
+       /*
+        * ONFI specification 3.1, paragraph 4.15.2 dictates that EDO data
+        * output cycle timings shall be used if the host drives tRC less than
+        * 30 ns.
+        */
+       min_clk_period = NSEC_PER_SEC / real_clk_rate;
+       chip->timing_ctl = ((min_clk_period * 2) < 30) ?
+                          NFC_TIMING_CTL_EDO : 0;
 
        return 0;
 }
@@ -1257,6 +1433,57 @@ static int sunxi_nand_chip_init_timings(struct sunxi_nand_chip *chip,
        return sunxi_nand_chip_set_timings(chip, timings);
 }
 
+static int sunxi_nand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+
+       if (section >= ecc->steps)
+               return -ERANGE;
+
+       oobregion->offset = section * (ecc->bytes + 4) + 4;
+       oobregion->length = ecc->bytes;
+
+       return 0;
+}
+
+static int sunxi_nand_ooblayout_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       struct nand_chip *nand = mtd_to_nand(mtd);
+       struct nand_ecc_ctrl *ecc = &nand->ecc;
+
+       if (section > ecc->steps)
+               return -ERANGE;
+
+       /*
+        * The first 2 bytes are used for BB markers, hence we
+        * only have 2 bytes available in the first user data
+        * section.
+        */
+       if (!section && ecc->mode == NAND_ECC_HW) {
+               oobregion->offset = 2;
+               oobregion->length = 2;
+
+               return 0;
+       }
+
+       oobregion->offset = section * (ecc->bytes + 4);
+
+       if (section < ecc->steps)
+               oobregion->length = 4;
+       else
+               oobregion->offset = mtd->oobsize - oobregion->offset;
+
+       return 0;
+}
+
+static const struct mtd_ooblayout_ops sunxi_nand_ooblayout_ops = {
+       .ecc = sunxi_nand_ooblayout_ecc,
+       .free = sunxi_nand_ooblayout_free,
+};
+
 static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
                                              struct nand_ecc_ctrl *ecc,
                                              struct device_node *np)
@@ -1266,7 +1493,6 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
        struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
        struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
        struct sunxi_nand_hw_ecc *data;
-       struct nand_ecclayout *layout;
        int nsectors;
        int ret;
        int i;
@@ -1295,7 +1521,6 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
        /* HW ECC always work with even numbers of ECC bytes */
        ecc->bytes = ALIGN(ecc->bytes, 2);
 
-       layout = &data->layout;
        nsectors = mtd->writesize / ecc->size;
 
        if (mtd->oobsize < ((ecc->bytes + 4) * nsectors)) {
@@ -1303,9 +1528,9 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
                goto err;
        }
 
-       layout->eccbytes = (ecc->bytes * nsectors);
-
-       ecc->layout = layout;
+       ecc->read_oob = sunxi_nfc_hw_common_ecc_read_oob;
+       ecc->write_oob = sunxi_nfc_hw_common_ecc_write_oob;
+       mtd_set_ooblayout(mtd, &sunxi_nand_ooblayout_ops);
        ecc->priv = data;
 
        return 0;
@@ -1325,9 +1550,6 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
                                       struct nand_ecc_ctrl *ecc,
                                       struct device_node *np)
 {
-       struct nand_ecclayout *layout;
-       int nsectors;
-       int i, j;
        int ret;
 
        ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
@@ -1336,40 +1558,9 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
 
        ecc->read_page = sunxi_nfc_hw_ecc_read_page;
        ecc->write_page = sunxi_nfc_hw_ecc_write_page;
-       layout = ecc->layout;
-       nsectors = mtd->writesize / ecc->size;
-
-       for (i = 0; i < nsectors; i++) {
-               if (i) {
-                       layout->oobfree[i].offset =
-                               layout->oobfree[i - 1].offset +
-                               layout->oobfree[i - 1].length +
-                               ecc->bytes;
-                       layout->oobfree[i].length = 4;
-               } else {
-                       /*
-                        * The first 2 bytes are used for BB markers, hence we
-                        * only have 2 bytes available in the first user data
-                        * section.
-                        */
-                       layout->oobfree[i].length = 2;
-                       layout->oobfree[i].offset = 2;
-               }
-
-               for (j = 0; j < ecc->bytes; j++)
-                       layout->eccpos[(ecc->bytes * i) + j] =
-                                       layout->oobfree[i].offset +
-                                       layout->oobfree[i].length + j;
-       }
-
-       if (mtd->oobsize > (ecc->bytes + 4) * nsectors) {
-               layout->oobfree[nsectors].offset =
-                               layout->oobfree[nsectors - 1].offset +
-                               layout->oobfree[nsectors - 1].length +
-                               ecc->bytes;
-               layout->oobfree[nsectors].length = mtd->oobsize -
-                               ((ecc->bytes + 4) * nsectors);
-       }
+       ecc->read_oob_raw = nand_read_oob_std;
+       ecc->write_oob_raw = nand_write_oob_std;
+       ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
 
        return 0;
 }
@@ -1378,9 +1569,6 @@ static int sunxi_nand_hw_syndrome_ecc_ctrl_init(struct mtd_info *mtd,
                                                struct nand_ecc_ctrl *ecc,
                                                struct device_node *np)
 {
-       struct nand_ecclayout *layout;
-       int nsectors;
-       int i;
        int ret;
 
        ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
@@ -1390,15 +1578,8 @@ static int sunxi_nand_hw_syndrome_ecc_ctrl_init(struct mtd_info *mtd,
        ecc->prepad = 4;
        ecc->read_page = sunxi_nfc_hw_syndrome_ecc_read_page;
        ecc->write_page = sunxi_nfc_hw_syndrome_ecc_write_page;
-
-       layout = ecc->layout;
-       nsectors = mtd->writesize / ecc->size;
-
-       for (i = 0; i < (ecc->bytes * nsectors); i++)
-               layout->eccpos[i] = i;
-
-       layout->oobfree[0].length = mtd->oobsize - i;
-       layout->oobfree[0].offset = i;
+       ecc->read_oob_raw = nand_read_oob_syndrome;
+       ecc->write_oob_raw = nand_write_oob_syndrome;
 
        return 0;
 }
@@ -1411,7 +1592,6 @@ static void sunxi_nand_ecc_cleanup(struct nand_ecc_ctrl *ecc)
                sunxi_nand_hw_common_ecc_ctrl_cleanup(ecc);
                break;
        case NAND_ECC_NONE:
-               kfree(ecc->layout);
        default:
                break;
        }
@@ -1432,8 +1612,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc,
                return -EINVAL;
 
        switch (ecc->mode) {
-       case NAND_ECC_SOFT_BCH:
-               break;
        case NAND_ECC_HW:
                ret = sunxi_nand_hw_ecc_ctrl_init(mtd, ecc, np);
                if (ret)
@@ -1445,10 +1623,6 @@ static int sunxi_nand_ecc_init(struct mtd_info *mtd, struct nand_ecc_ctrl *ecc,
                        return ret;
                break;
        case NAND_ECC_NONE:
-               ecc->layout = kzalloc(sizeof(*ecc->layout), GFP_KERNEL);
-               if (!ecc->layout)
-                       return -ENOMEM;
-               ecc->layout->oobfree[0].length = mtd->oobsize;
        case NAND_ECC_SOFT:
                break;
        default:
@@ -1536,21 +1710,6 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
                }
        }
 
-       timings = onfi_async_timing_mode_to_sdr_timings(0);
-       if (IS_ERR(timings)) {
-               ret = PTR_ERR(timings);
-               dev_err(dev,
-                       "could not retrieve timings for ONFI mode 0: %d\n",
-                       ret);
-               return ret;
-       }
-
-       ret = sunxi_nand_chip_set_timings(chip, timings);
-       if (ret) {
-               dev_err(dev, "could not configure chip timings: %d\n", ret);
-               return ret;
-       }
-
        nand = &chip->nand;
        /* Default tR value specified in the ONFI spec (chapter 4.15.1) */
        nand->chip_delay = 200;
@@ -1570,6 +1729,21 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
        mtd = nand_to_mtd(nand);
        mtd->dev.parent = dev;
 
+       timings = onfi_async_timing_mode_to_sdr_timings(0);
+       if (IS_ERR(timings)) {
+               ret = PTR_ERR(timings);
+               dev_err(dev,
+                       "could not retrieve timings for ONFI mode 0: %d\n",
+                       ret);
+               return ret;
+       }
+
+       ret = sunxi_nand_chip_set_timings(chip, timings);
+       if (ret) {
+               dev_err(dev, "could not configure chip timings: %d\n", ret);
+               return ret;
+       }
+
        ret = nand_scan_ident(mtd, nsels, NULL);
        if (ret)
                return ret;
@@ -1580,6 +1754,8 @@ static int sunxi_nand_chip_init(struct device *dev, struct sunxi_nfc *nfc,
        if (nand->options & NAND_NEED_SCRAMBLING)
                nand->options |= NAND_NO_SUBPAGE_WRITE;
 
+       nand->options |= NAND_SUBPAGE_READ;
+
        ret = sunxi_nand_chip_init_timings(chip, np);
        if (ret) {
                dev_err(dev, "could not configure chip timings: %d\n", ret);
@@ -1728,6 +1904,8 @@ static int sunxi_nfc_remove(struct platform_device *pdev)
        struct sunxi_nfc *nfc = platform_get_drvdata(pdev);
 
        sunxi_nand_chips_cleanup(nfc);
+       clk_disable_unprepare(nfc->mod_clk);
+       clk_disable_unprepare(nfc->ahb_clk);
 
        return 0;
 }
index 293feb19b0b149c7add9d4e2bf7dc0f2a8313275..3ad514c44dcb71a008e816a37e53af8183d6295a 100644 (file)
@@ -33,7 +33,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
-#include <linux/of_mtd.h>
 #include <linux/of_device.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
@@ -175,34 +174,6 @@ static inline struct vf610_nfc *mtd_to_nfc(struct mtd_info *mtd)
        return container_of(mtd_to_nand(mtd), struct vf610_nfc, chip);
 }
 
-static struct nand_ecclayout vf610_nfc_ecc45 = {
-       .eccbytes = 45,
-       .eccpos = {19, 20, 21, 22, 23,
-                  24, 25, 26, 27, 28, 29, 30, 31,
-                  32, 33, 34, 35, 36, 37, 38, 39,
-                  40, 41, 42, 43, 44, 45, 46, 47,
-                  48, 49, 50, 51, 52, 53, 54, 55,
-                  56, 57, 58, 59, 60, 61, 62, 63},
-       .oobfree = {
-               {.offset = 2,
-                .length = 17} }
-};
-
-static struct nand_ecclayout vf610_nfc_ecc60 = {
-       .eccbytes = 60,
-       .eccpos = { 4,  5,  6,  7,  8,  9, 10, 11,
-                  12, 13, 14, 15, 16, 17, 18, 19,
-                  20, 21, 22, 23, 24, 25, 26, 27,
-                  28, 29, 30, 31, 32, 33, 34, 35,
-                  36, 37, 38, 39, 40, 41, 42, 43,
-                  44, 45, 46, 47, 48, 49, 50, 51,
-                  52, 53, 54, 55, 56, 57, 58, 59,
-                  60, 61, 62, 63 },
-       .oobfree = {
-               {.offset = 2,
-                .length = 2} }
-};
-
 static inline u32 vf610_nfc_read(struct vf610_nfc *nfc, uint reg)
 {
        return readl(nfc->regs + reg);
@@ -781,14 +752,16 @@ static int vf610_nfc_probe(struct platform_device *pdev)
                if (mtd->oobsize > 64)
                        mtd->oobsize = 64;
 
+               /*
+                * mtd->ecclayout is not specified here because we're using the
+                * default large page ECC layout defined in NAND core.
+                */
                if (chip->ecc.strength == 32) {
                        nfc->ecc_mode = ECC_60_BYTE;
                        chip->ecc.bytes = 60;
-                       chip->ecc.layout = &vf610_nfc_ecc60;
                } else if (chip->ecc.strength == 24) {
                        nfc->ecc_mode = ECC_45_BYTE;
                        chip->ecc.bytes = 45;
-                       chip->ecc.layout = &vf610_nfc_ecc45;
                } else {
                        dev_err(nfc->dev, "Unsupported ECC strength\n");
                        err = -ENXIO;
index af28bb3ae7cfc56ccd68e0b0a9d6d05fc5e6296e..a4b029a417f04edf9740e29d286659b9e19712aa 100644 (file)
@@ -68,21 +68,33 @@ MODULE_PARM_DESC(otp,       "Corresponding behaviour of OneNAND in OTP"
  * flexonenand_oob_128 - oob info for Flex-Onenand with 4KB page
  * For now, we expose only 64 out of 80 ecc bytes
  */
-static struct nand_ecclayout flexonenand_oob_128 = {
-       .eccbytes       = 64,
-       .eccpos         = {
-               6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-               22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-               38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-               54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-               70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-               86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-               102, 103, 104, 105
-               },
-       .oobfree        = {
-               {2, 4}, {18, 4}, {34, 4}, {50, 4},
-               {66, 4}, {82, 4}, {98, 4}, {114, 4}
-       }
+static int flexonenand_ooblayout_ecc(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       /* Eight sections, one per 16 OOB bytes; ECC is bytes 6..15 of each. */
+       if (section >= 8)
+               return -ERANGE;
+
+       oobregion->length = 10;
+       oobregion->offset = 6 + (section * 16);
+
+       return 0;
+}
+
+/* Free OOB bytes of the Flex-OneNAND layout: bytes 2..5 of each 16-byte chunk. */
+static int flexonenand_ooblayout_free(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       /* Sections 0..7 are valid; anything above is out of range. */
+       if (section >= 8)
+               return -ERANGE;
+
+       oobregion->length = 4;
+       oobregion->offset = 2 + (section * 16);
+
+       return 0;
+}
+
+/*
+ * OOB layout callbacks for Flex-OneNAND; installed by onenand_scan() via
+ * mtd_set_ooblayout() when the OOB size is 128 and FLEXONENAND() is true.
+ */
+static const struct mtd_ooblayout_ops flexonenand_ooblayout_ops = {
+       .ecc = flexonenand_ooblayout_ecc,
+       .free = flexonenand_ooblayout_free,
 };
 
 /*
@@ -91,56 +103,77 @@ static struct nand_ecclayout flexonenand_oob_128 = {
  * Based on specification:
  * 4Gb M-die OneNAND Flash (KFM4G16Q4M, KFN8G16Q4M). Rev. 1.3, Apr. 2010
  *
- * For eccpos we expose only 64 bytes out of 72 (see struct nand_ecclayout)
- *
- * oobfree uses the spare area fields marked as
- * "Managed by internal ECC logic for Logical Sector Number area"
  */
-static struct nand_ecclayout onenand_oob_128 = {
-       .eccbytes       = 64,
-       .eccpos         = {
-               7, 8, 9, 10, 11, 12, 13, 14, 15,
-               23, 24, 25, 26, 27, 28, 29, 30, 31,
-               39, 40, 41, 42, 43, 44, 45, 46, 47,
-               55, 56, 57, 58, 59, 60, 61, 62, 63,
-               71, 72, 73, 74, 75, 76, 77, 78, 79,
-               87, 88, 89, 90, 91, 92, 93, 94, 95,
-               103, 104, 105, 106, 107, 108, 109, 110, 111,
-               119
-       },
-       .oobfree        = {
-               {2, 3}, {18, 3}, {34, 3}, {50, 3},
-               {66, 3}, {82, 3}, {98, 3}, {114, 3}
-       }
+static int onenand_ooblayout_128_ecc(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       /* Eight sections, one per 16 OOB bytes; ECC is bytes 7..15 of each. */
+       if (section >= 8)
+               return -ERANGE;
+
+       oobregion->length = 9;
+       oobregion->offset = 7 + (section * 16);
+
+       return 0;
+}
+
+/* Free OOB bytes of the 128-byte OneNAND layout: 3 bytes per 16-byte chunk. */
+static int onenand_ooblayout_128_free(struct mtd_info *mtd, int section,
+                                     struct mtd_oob_region *oobregion)
+{
+       if (section > 7)
+               return -ERANGE;
+
+       /*
+        * The bytes reported as free are the spare area fields marked as
+        * "Managed by internal ECC logic for Logical Sector Number area"
+        */
+       oobregion->length = 3;
+       oobregion->offset = 2 + (section * 16);
+
+       return 0;
+}
+
+/*
+ * OOB layout callbacks for non-Flex OneNAND with a 128-byte OOB; installed
+ * by onenand_scan() via mtd_set_ooblayout().
+ */
+static const struct mtd_ooblayout_ops onenand_oob_128_ooblayout_ops = {
+       .ecc = onenand_ooblayout_128_ecc,
+       .free = onenand_ooblayout_128_free,
 };
 
 /**
- * onenand_oob_64 - oob info for large (2KB) page
+ * onenand_oob_32_64 - oob info for large (2KB) and middle (1KB) pages
  */
-static struct nand_ecclayout onenand_oob_64 = {
-       .eccbytes       = 20,
-       .eccpos         = {
-               8, 9, 10, 11, 12,
-               24, 25, 26, 27, 28,
-               40, 41, 42, 43, 44,
-               56, 57, 58, 59, 60,
-               },
-       .oobfree        = {
-               {2, 3}, {14, 2}, {18, 3}, {30, 2},
-               {34, 3}, {46, 2}, {50, 3}, {62, 2}
+static int onenand_ooblayout_32_64_ecc(struct mtd_info *mtd, int section,
+                                      struct mtd_oob_region *oobregion)
+{
+       /*
+        * One ECC region (bytes 8..12) per 16 OOB bytes: two sections for
+        * a 32-byte OOB, four for a 64-byte OOB. Deriving the bound from
+        * mtd->oobsize keeps every reported region inside the OOB area;
+        * a fixed bound of four would report offsets 40 and 56 on a
+        * 32-byte OOB.
+        */
+       int sections = mtd->oobsize / 16;
+
+       if (section >= sections)
+               return -ERANGE;
+
+       oobregion->offset = (section * 16) + 8;
+       oobregion->length = 5;
+
+       return 0;
+}
+
+/*
+ * Free OOB bytes for the 32- and 64-byte OneNAND layouts.
+ *
+ * Every 16 OOB bytes hold two free regions: 3 bytes at +2 and 2 bytes at
+ * +14. They are reported as consecutive sections (even = first region,
+ * odd = second), which gives {2,3} {14,2} {18,3} {30,2} ... exactly as in
+ * the oobfree tables of the former onenand_oob_64/onenand_oob_32 layouts.
+ *
+ * Returns 0 and fills @oobregion, or -ERANGE once @section is past the
+ * last region that fits in mtd->oobsize.
+ */
+static int onenand_ooblayout_32_64_free(struct mtd_info *mtd, int section,
+                                       struct mtd_oob_region *oobregion)
+{
+       /* Two free regions per 16 OOB bytes. */
+       int sections = (mtd->oobsize / 16) * 2;
+       /* Start of the 16-byte chunk this section belongs to. */
+       int base = (section / 2) * 16;
+
+       if (section >= sections)
+               return -ERANGE;
+
+       if (section & 1) {
+               oobregion->offset = base + 14;
+               oobregion->length = 2;
+       } else  {
+               oobregion->offset = base + 2;
+               oobregion->length = 3;
        }
-};
 
-/**
- * onenand_oob_32 - oob info for middle (1KB) page
- */
-static struct nand_ecclayout onenand_oob_32 = {
-       .eccbytes       = 10,
-       .eccpos         = {
-               8, 9, 10, 11, 12,
-               24, 25, 26, 27, 28,
-               },
-       .oobfree        = { {2, 3}, {14, 2}, {18, 3}, {30, 2} }
+       return 0;
+}
+
+/*
+ * OOB layout callbacks shared by the 64-byte and 32-byte OOB cases;
+ * onenand_scan() also installs them as the fallback for an unexpected
+ * OOB size.
+ */
+static const struct mtd_ooblayout_ops onenand_oob_32_64_ooblayout_ops = {
+       .ecc = onenand_ooblayout_32_64_ecc,
+       .free = onenand_ooblayout_32_64_free,
 };
 
 static const unsigned char ffchars[] = {
@@ -1024,34 +1057,15 @@ static int onenand_transfer_auto_oob(struct mtd_info *mtd, uint8_t *buf, int col
                                int thislen)
 {
        struct onenand_chip *this = mtd->priv;
-       struct nand_oobfree *free;
-       int readcol = column;
-       int readend = column + thislen;
-       int lastgap = 0;
-       unsigned int i;
-       uint8_t *oob_buf = this->oob_buf;
-
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               if (readcol >= lastgap)
-                       readcol += free->offset - lastgap;
-               if (readend >= lastgap)
-                       readend += free->offset - lastgap;
-               lastgap = free->offset + free->length;
-       }
-       this->read_bufferram(mtd, ONENAND_SPARERAM, oob_buf, 0, mtd->oobsize);
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               int free_end = free->offset + free->length;
-               if (free->offset < readend && free_end > readcol) {
-                       int st = max_t(int,free->offset,readcol);
-                       int ed = min_t(int,free_end,readend);
-                       int n = ed - st;
-                       memcpy(buf, oob_buf + st, n);
-                       buf += n;
-               } else if (column == 0)
-                       break;
-       }
+       int ret;
+
+       this->read_bufferram(mtd, ONENAND_SPARERAM, this->oob_buf, 0,
+                            mtd->oobsize);
+       ret = mtd_ooblayout_get_databytes(mtd, buf, this->oob_buf,
+                                         column, thislen);
+       if (ret)
+               return ret;
+
        return 0;
 }
 
@@ -1808,34 +1822,7 @@ static int onenand_panic_write(struct mtd_info *mtd, loff_t to, size_t len,
 static int onenand_fill_auto_oob(struct mtd_info *mtd, u_char *oob_buf,
                                  const u_char *buf, int column, int thislen)
 {
-       struct onenand_chip *this = mtd->priv;
-       struct nand_oobfree *free;
-       int writecol = column;
-       int writeend = column + thislen;
-       int lastgap = 0;
-       unsigned int i;
-
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               if (writecol >= lastgap)
-                       writecol += free->offset - lastgap;
-               if (writeend >= lastgap)
-                       writeend += free->offset - lastgap;
-               lastgap = free->offset + free->length;
-       }
-       free = this->ecclayout->oobfree;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES && free->length; i++, free++) {
-               int free_end = free->offset + free->length;
-               if (free->offset < writeend && free_end > writecol) {
-                       int st = max_t(int,free->offset,writecol);
-                       int ed = min_t(int,free_end,writeend);
-                       int n = ed - st;
-                       memcpy(oob_buf + st, buf, n);
-                       buf += n;
-               } else if (column == 0)
-                       break;
-       }
-       return 0;
+       /*
+        * Delegate to the MTD core helper in place of the open-coded walk
+        * over ecclayout->oobfree; its return value is passed straight
+        * back to the caller (previously this function always returned 0).
+        */
+       return mtd_ooblayout_set_databytes(mtd, buf, oob_buf, column, thislen);
 }
 
 /**
@@ -4003,22 +3990,22 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
        switch (mtd->oobsize) {
        case 128:
                if (FLEXONENAND(this)) {
-                       this->ecclayout = &flexonenand_oob_128;
+                       mtd_set_ooblayout(mtd, &flexonenand_ooblayout_ops);
                        mtd->subpage_sft = 0;
                } else {
-                       this->ecclayout = &onenand_oob_128;
+                       mtd_set_ooblayout(mtd, &onenand_oob_128_ooblayout_ops);
                        mtd->subpage_sft = 2;
                }
                if (ONENAND_IS_NOP_1(this))
                        mtd->subpage_sft = 0;
                break;
        case 64:
-               this->ecclayout = &onenand_oob_64;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                mtd->subpage_sft = 2;
                break;
 
        case 32:
-               this->ecclayout = &onenand_oob_32;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                mtd->subpage_sft = 1;
                break;
 
@@ -4027,7 +4014,7 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
                        __func__, mtd->oobsize);
                mtd->subpage_sft = 0;
                /* To prevent kernel oops */
-               this->ecclayout = &onenand_oob_32;
+               mtd_set_ooblayout(mtd, &onenand_oob_32_64_ooblayout_ops);
                break;
        }
 
@@ -4037,12 +4024,12 @@ int onenand_scan(struct mtd_info *mtd, int maxchips)
         * The number of bytes available for a client to place data into
         * the out of band area
         */
-       mtd->oobavail = 0;
-       for (i = 0; i < MTD_MAX_OOBFREE_ENTRIES &&
-           this->ecclayout->oobfree[i].length; i++)
-               mtd->oobavail += this->ecclayout->oobfree[i].length;
+       ret = mtd_ooblayout_count_freebytes(mtd);
+       if (ret < 0)
+               ret = 0;
+
+       mtd->oobavail = ret;
 
-       mtd->ecclayout = this->ecclayout;
        mtd->ecc_strength = 1;
 
        /* Fill in remaining MTD driver data */
index 157841dc3e99e87826639f6418d3701f84632968..c52e45594bfd6e78775f6250314c9cf26a049e2a 100644 (file)
@@ -832,6 +832,7 @@ static const struct flash_info spi_nor_ids[] = {
        /* GigaDevice */
        { "gd25q32", INFO(0xc84016, 0, 64 * 1024,  64, SECT_4K) },
        { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
+       { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
        { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) },
 
        /* Intel/Numonyx -- xxxs33b */
index 042baec569319bcd6539212bf7b2e123b51e7e3a..608fc4464574e1d9edb4037c53b6998afdb11ebe 100644 (file)
@@ -164,14 +164,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
+/*
+ * Resolve @sector to a kernel address (*kaddr) and a pfn (*pfn) inside the
+ * pmem range.
+ *
+ * Returns -EIO when is_bad_pmem() flags the requested range (@sector, @size).
+ * Otherwise returns the length the caller may access: @size when any
+ * badblocks are recorded (pmem->bb.count is non-zero), else everything from
+ * the computed offset up to pmem->size - pmem->pfn_pad.
+ */
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-                     void __pmem **kaddr, pfn_t *pfn)
+                     void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        resource_size_t offset = sector * 512 + pmem->data_offset;
 
+       if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
+               return -EIO;
        *kaddr = pmem->virt_addr + offset;
        *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
+       /*
+        * If badblocks are present, limit known good range to the
+        * requested range.
+        */
+       if (unlikely(pmem->bb.count))
+               return size;
        return pmem->size - pmem->pfn_pad - offset;
 }
 
index bee3fa96b981461778e3d5ee8ee3380f8dea2d71..d7efd9d458aab4554139e93c72a1688aa0eeef6a 100644 (file)
@@ -10,7 +10,6 @@ obj-$(CONFIG_OF_UNITTEST) += unittest.o
 obj-$(CONFIG_OF_MDIO)  += of_mdio.o
 obj-$(CONFIG_OF_PCI)   += of_pci.o
 obj-$(CONFIG_OF_PCI_IRQ)  += of_pci_irq.o
-obj-$(CONFIG_OF_MTD)   += of_mtd.o
 obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
deleted file mode 100644 (file)
index b7361ed..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- *
- */
-#include <linux/kernel.h>
-#include <linux/of_mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/export.h>
-
-/**
- * It maps 'enum nand_ecc_modes_t' found in include/linux/mtd/nand.h
- * into the device tree binding of 'nand-ecc', so that MTD
- * device driver can get nand ecc from device tree.
- */
-static const char *nand_ecc_modes[] = {
-       [NAND_ECC_NONE]         = "none",
-       [NAND_ECC_SOFT]         = "soft",
-       [NAND_ECC_HW]           = "hw",
-       [NAND_ECC_HW_SYNDROME]  = "hw_syndrome",
-       [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first",
-       [NAND_ECC_SOFT_BCH]     = "soft_bch",
-};
-
-/**
- * of_get_nand_ecc_mode - Get nand ecc mode for given device_node
- * @np:        Pointer to the given device_node
- *
- * The function gets ecc mode string from property 'nand-ecc-mode',
- * and return its index in nand_ecc_modes table, or errno in error case.
- */
-int of_get_nand_ecc_mode(struct device_node *np)
-{
-       const char *pm;
-       int err, i;
-
-       err = of_property_read_string(np, "nand-ecc-mode", &pm);
-       if (err < 0)
-               return err;
-
-       for (i = 0; i < ARRAY_SIZE(nand_ecc_modes); i++)
-               if (!strcasecmp(pm, nand_ecc_modes[i]))
-                       return i;
-
-       return -ENODEV;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_mode);
-
-/**
- * of_get_nand_ecc_step_size - Get ECC step size associated to
- * the required ECC strength (see below).
- * @np:        Pointer to the given device_node
- *
- * return the ECC step size, or errno in error case.
- */
-int of_get_nand_ecc_step_size(struct device_node *np)
-{
-       int ret;
-       u32 val;
-
-       ret = of_property_read_u32(np, "nand-ecc-step-size", &val);
-       return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_step_size);
-
-/**
- * of_get_nand_ecc_strength - Get required ECC strength over the
- * correspnding step size as defined by 'nand-ecc-size'
- * @np:        Pointer to the given device_node
- *
- * return the ECC strength, or errno in error case.
- */
-int of_get_nand_ecc_strength(struct device_node *np)
-{
-       int ret;
-       u32 val;
-
-       ret = of_property_read_u32(np, "nand-ecc-strength", &val);
-       return ret ? ret : val;
-}
-EXPORT_SYMBOL_GPL(of_get_nand_ecc_strength);
-
-/**
- * of_get_nand_bus_width - Get nand bus witdh for given device_node
- * @np:        Pointer to the given device_node
- *
- * return bus width option, or errno in error case.
- */
-int of_get_nand_bus_width(struct device_node *np)
-{
-       u32 val;
-
-       if (of_property_read_u32(np, "nand-bus-width", &val))
-               return 8;
-
-       switch(val) {
-       case 8:
-       case 16:
-               return val;
-       default:
-               return -EIO;
-       }
-}
-EXPORT_SYMBOL_GPL(of_get_nand_bus_width);
-
-/**
- * of_get_nand_on_flash_bbt - Get nand on flash bbt for given device_node
- * @np:        Pointer to the given device_node
- *
- * return true if present false other wise
- */
-bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-       return of_property_read_bool(np, "nand-on-flash-bbt");
-}
-EXPORT_SYMBOL_GPL(of_get_nand_on_flash_bbt);
index 55182fc58c6a4518b4d55993c4f3894acff0133c..677a811b3a6ffcb604d8a6c4cb836ec8dd81575e 100644 (file)
@@ -153,8 +153,10 @@ struct byt_community {
                .name                   = (n),                  \
                .pins                   = (p),                  \
                .npins                  = ARRAY_SIZE((p)),      \
-               .has_simple_funcs       = 1,            \
-               .simple_funcs           = (f),                  \
+               .has_simple_funcs       = 1,                    \
+               {                                               \
+                       .simple_funcs           = (f),          \
+               },                                              \
                .nfuncs                 = ARRAY_SIZE((f)),      \
        }
 #define PIN_GROUP_MIXED(n, p, f)                               \
@@ -163,7 +165,9 @@ struct byt_community {
                .pins                   = (p),                  \
                .npins                  = ARRAY_SIZE((p)),      \
                .has_simple_funcs       = 0,                    \
-               .mixed_funcs            = (f),                  \
+               {                                               \
+                       .mixed_funcs            = (f),          \
+               },                                              \
                .nfuncs                 = ARRAY_SIZE((f)),      \
        }
 
index ed2004be13cfcc58a6c9598a234433d9c76ba078..c06bb85c283912590d306e3336e28419f32a0c7c 100644 (file)
@@ -846,6 +846,18 @@ config INTEL_IMR
 
          If you are running on a Galileo/Quark say Y here.
 
+config INTEL_PMC_CORE
+       bool "Intel PMC Core driver"
+       depends on X86 && PCI
+       ---help---
+         The Intel Platform Controller Hub for Intel Core SoCs provides access
+         to Power Management Controller registers via a PCI interface. This
+         driver can utilize debugging capabilities and supported features as
+         exposed by the Power Management Controller.
+
+         Supported features:
+               - SLP_S0_RESIDENCY counter.
+
 config IBM_RTL
        tristate "Device driver to enable PRTL support"
        depends on X86 && PCI
index 448443c3baba9652e4c0edaa2d8295b8a1c3e0f1..9b11b4073e033f66b59d05d9dce01442aeee3775 100644 (file)
@@ -69,3 +69,4 @@ obj-$(CONFIG_INTEL_PUNIT_IPC)  += intel_punit_ipc.o
 obj-$(CONFIG_INTEL_TELEMETRY)  += intel_telemetry_core.o \
                                   intel_telemetry_pltdrv.o \
                                   intel_telemetry_debugfs.o
+obj-$(CONFIG_INTEL_PMC_CORE)    += intel_pmc_core.o
index f2b5d0a8adf03a2bb5ac521aa62fa2046c06f438..15f1311465015c72fcc82210a15df0bb24532928 100644 (file)
@@ -771,12 +771,14 @@ static int asus_read_brightness(struct backlight_device *bd)
 {
        struct asus_laptop *asus = bl_get_data(bd);
        unsigned long long value;
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        rv = acpi_evaluate_integer(asus->handle, METHOD_BRIGHTNESS_GET,
                                   NULL, &value);
-       if (ACPI_FAILURE(rv))
+       if (ACPI_FAILURE(rv)) {
                pr_warn("Error reading brightness\n");
+               return 0;
+       }
 
        return value;
 }
@@ -865,7 +867,7 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr,
        int len = 0;
        unsigned long long temp;
        char buf[16];           /* enough for all info */
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        /*
         * We use the easy way, we don't care of off and count,
@@ -946,11 +948,10 @@ static ssize_t sysfs_acpi_set(struct asus_laptop *asus,
                              const char *method)
 {
        int rv, value;
-       int out = 0;
 
        rv = parse_arg(buf, count, &value);
-       if (rv > 0)
-               out = value ? 1 : 0;
+       if (rv <= 0)
+               return rv;
 
        if (write_acpi_int(asus->handle, method, value))
                return -ENODEV;
@@ -1265,7 +1266,7 @@ static DEVICE_ATTR_RO(ls_value);
 static int asus_gps_status(struct asus_laptop *asus)
 {
        unsigned long long status;
-       acpi_status rv = AE_OK;
+       acpi_status rv;
 
        rv = acpi_evaluate_integer(asus->handle, METHOD_GPS_STATUS,
                                   NULL, &status);
index a96630d5234674353790275695b4668198d1d402..a26dca3640ea750becfceffc9d78aa92d643efb1 100644 (file)
@@ -114,6 +114,7 @@ MODULE_LICENSE("GPL");
 #define ASUS_WMI_DEVID_LED6            0x00020016
 
 /* Backlight and Brightness */
+#define ASUS_WMI_DEVID_ALS_ENABLE      0x00050001 /* Ambient Light Sensor */
 #define ASUS_WMI_DEVID_BACKLIGHT       0x00050011
 #define ASUS_WMI_DEVID_BRIGHTNESS      0x00050012
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT   0x00050021
@@ -1730,6 +1731,7 @@ ASUS_WMI_CREATE_DEVICE_ATTR(touchpad, 0644, ASUS_WMI_DEVID_TOUCHPAD);
 ASUS_WMI_CREATE_DEVICE_ATTR(camera, 0644, ASUS_WMI_DEVID_CAMERA);
 ASUS_WMI_CREATE_DEVICE_ATTR(cardr, 0644, ASUS_WMI_DEVID_CARDREADER);
 ASUS_WMI_CREATE_DEVICE_ATTR(lid_resume, 0644, ASUS_WMI_DEVID_LID_RESUME);
+ASUS_WMI_CREATE_DEVICE_ATTR(als_enable, 0644, ASUS_WMI_DEVID_ALS_ENABLE);
 
 static ssize_t store_cpufv(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
@@ -1756,6 +1758,7 @@ static struct attribute *platform_attributes[] = {
        &dev_attr_cardr.attr,
        &dev_attr_touchpad.attr,
        &dev_attr_lid_resume.attr,
+       &dev_attr_als_enable.attr,
        NULL
 };
 
@@ -1776,6 +1779,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
                devid = ASUS_WMI_DEVID_TOUCHPAD;
        else if (attr == &dev_attr_lid_resume.attr)
                devid = ASUS_WMI_DEVID_LID_RESUME;
+       else if (attr == &dev_attr_als_enable.attr)
+               devid = ASUS_WMI_DEVID_ALS_ENABLE;
 
        if (devid != -1)
                ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
index b51a2008d7821908f3da26ec21868d1dd3a71b59..dcd9f40a4b1898279a17b6121a1c39804ddef43d 100644 (file)
@@ -28,6 +28,7 @@ struct rbtn_data {
        enum rbtn_type type;
        struct rfkill *rfkill;
        struct input_dev *input_dev;
+       bool suspended;
 };
 
 
@@ -235,9 +236,55 @@ static const struct acpi_device_id rbtn_ids[] = {
        { "", 0 },
 };
 
+#ifdef CONFIG_PM_SLEEP
+/* Deferred callback: re-enable notification handling in rbtn_notify(). */
+static void ACPI_SYSTEM_XFACE rbtn_clear_suspended_flag(void *context)
+{
+       struct rbtn_data *data = context;
+
+       data->suspended = false;
+}
+
+/* Suspend hook: mark the device so rbtn_notify() ignores notifications. */
+static int rbtn_suspend(struct device *dev)
+{
+       struct rbtn_data *data = acpi_driver_data(to_acpi_device(dev));
+
+       data->suspended = true;
+
+       return 0;
+}
+
+/*
+ * Resume hook: queue rbtn_clear_suspended_flag() on the notify-handler
+ * queue (OSL_NOTIFY_HANDLER); if queueing fails, clear the flag directly.
+ * Always returns 0.
+ */
+static int rbtn_resume(struct device *dev)
+{
+       struct acpi_device *device = to_acpi_device(dev);
+       struct rbtn_data *rbtn_data = acpi_driver_data(device);
+       acpi_status status;
+
+       /*
+        * Upon resume, some BIOSes send an ACPI notification that triggers
+        * an unwanted input event. In order to ignore it, we use a flag
+        * that we set at suspend and clear once we have received the extra
+        * ACPI notification. Since ACPI notifications are delivered
+        * asynchronously to drivers, we clear the flag from the workqueue
+        * used to deliver the notifications. This should be enough
+        * to have the flag cleared only after we received the extra
+        * notification, if any.
+        */
+       status = acpi_os_execute(OSL_NOTIFY_HANDLER,
+                        rbtn_clear_suspended_flag, rbtn_data);
+       if (ACPI_FAILURE(status))
+               rbtn_clear_suspended_flag(rbtn_data);
+
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(rbtn_pm_ops, rbtn_suspend, rbtn_resume);
+
 static struct acpi_driver rbtn_driver = {
        .name = "dell-rbtn",
        .ids = rbtn_ids,
+       .drv.pm = &rbtn_pm_ops,
        .ops = {
                .add = rbtn_add,
                .remove = rbtn_remove,
@@ -399,6 +446,15 @@ static void rbtn_notify(struct acpi_device *device, u32 event)
 {
        struct rbtn_data *rbtn_data = device->driver_data;
 
+       /*
+        * Some BIOSes send a notification at resume.
+        * Ignore it to prevent unwanted input events.
+        */
+       if (rbtn_data->suspended) {
+               dev_dbg(&device->dev, "ACPI notification ignored\n");
+               return;
+       }
+
        if (event != 0x80) {
                dev_info(&device->dev, "Received unknown event (0x%x)\n",
                         event);
index ffc84cc7b1c79fc8b8bdecd5cd2ae1201b338194..ce41bc34288df5fa159c35b58237df5b2b500819 100644 (file)
@@ -69,7 +69,7 @@
 #include <linux/kfifo.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 #include <linux/leds.h>
 #endif
 #include <acpi/video.h>
 /* FUNC interface - responses */
 #define UNSUPPORTED_CMD 0x80000000
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 /* FUNC interface - LED control */
 #define FUNC_LED_OFF   0x1
 #define FUNC_LED_ON    0x30001
 #define KEYBOARD_LAMPS 0x100
 #define LOGOLAMP_POWERON 0x2000
 #define LOGOLAMP_ALWAYS  0x4000
+#define RADIO_LED_ON   0x20
 #endif
 
 /* Hotkey details */
@@ -174,13 +175,14 @@ struct fujitsu_hotkey_t {
        int rfkill_state;
        int logolamp_registered;
        int kblamps_registered;
+       int radio_led_registered;
 };
 
 static struct fujitsu_hotkey_t *fujitsu_hotkey;
 
 static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event);
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 static enum led_brightness logolamp_get(struct led_classdev *cdev);
 static void logolamp_set(struct led_classdev *cdev,
                               enum led_brightness brightness);
@@ -200,6 +202,16 @@ static struct led_classdev kblamps_led = {
  .brightness_get = kblamps_get,
  .brightness_set = kblamps_set
 };
+
+static enum led_brightness radio_led_get(struct led_classdev *cdev);
+static void radio_led_set(struct led_classdev *cdev,
+                              enum led_brightness brightness);
+
+static struct led_classdev radio_led = {
+ .name = "fujitsu::radio_led",
+ .brightness_get = radio_led_get,
+ .brightness_set = radio_led_set
+};
 #endif
 
 #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG
@@ -249,7 +261,7 @@ static int call_fext_func(int cmd, int arg0, int arg1, int arg2)
        return value;
 }
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 /* LED class callbacks */
 
 static void logolamp_set(struct led_classdev *cdev,
@@ -275,6 +287,15 @@ static void kblamps_set(struct led_classdev *cdev,
                call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_OFF);
 }
 
+/*
+ * LED class callback: switch the radio LED. Only LED_FULL (or above) turns
+ * it on; every lower brightness turns it off.
+ */
+static void radio_led_set(struct led_classdev *cdev,
+                               enum led_brightness brightness)
+{
+       /* arg1 is RADIO_LED_ON either way; only arg2 carries the new state. */
+       int state = (brightness >= LED_FULL) ? RADIO_LED_ON : 0x0;
+
+       call_fext_func(FUNC_RFKILL, 0x5, RADIO_LED_ON, state);
+}
+
 static enum led_brightness logolamp_get(struct led_classdev *cdev)
 {
        enum led_brightness brightness = LED_OFF;
@@ -299,6 +320,16 @@ static enum led_brightness kblamps_get(struct led_classdev *cdev)
 
        return brightness;
 }
+
+/*
+ * LED class callback: report LED_FULL when the RADIO_LED_ON bit is set in
+ * the FUNC_RFKILL (op 0x4) result, LED_OFF otherwise.
+ */
+static enum led_brightness radio_led_get(struct led_classdev *cdev)
+{
+       int state = call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0);
+
+       return (state & RADIO_LED_ON) ? LED_FULL : LED_OFF;
+}
 #endif
 
 /* Hardware access for LCD brightness control */
@@ -872,7 +903,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
        /* Suspect this is a keymap of the application panel, print it */
        pr_info("BTNI: [0x%x]\n", call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0));
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
        if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) {
                result = led_classdev_register(&fujitsu->pf_device->dev,
                                                &logolamp_led);
@@ -895,6 +926,23 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device)
                               result);
                }
        }
+
+       /*
+        * BTNI bit 24 seems to indicate the presence of a radio toggle
+        * button in place of a slide switch, and all such machines appear
+        * to also have an RF LED.  Therefore use bit 24 as an indicator
+        * that an RF LED is present.
+        */
+       if (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) & BIT(24)) {
+               result = led_classdev_register(&fujitsu->pf_device->dev,
+                                               &radio_led);
+               if (result == 0) {
+                       fujitsu_hotkey->radio_led_registered = 1;
+               } else {
+                       pr_err("Could not register LED handler for radio LED, error %i\n",
+                              result);
+               }
+       }
 #endif
 
        return result;
@@ -915,12 +963,15 @@ static int acpi_fujitsu_hotkey_remove(struct acpi_device *device)
        struct fujitsu_hotkey_t *fujitsu_hotkey = acpi_driver_data(device);
        struct input_dev *input = fujitsu_hotkey->input;
 
-#if defined(CONFIG_LEDS_CLASS) || defined(CONFIG_LEDS_CLASS_MODULE)
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
        if (fujitsu_hotkey->logolamp_registered)
                led_classdev_unregister(&logolamp_led);
 
        if (fujitsu_hotkey->kblamps_registered)
                led_classdev_unregister(&kblamps_led);
+
+       if (fujitsu_hotkey->radio_led_registered)
+               led_classdev_unregister(&radio_led);
 #endif
 
        input_unregister_device(input);
index be3bc2f4edd4279d04749aba6db4f93aedf06c4f..4a23fbc66b715bcc4fce2a6ddccb2c8b743ae7b2 100644 (file)
 #define CFG_CAMERA_BIT (19)
 
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-static const char ideapad_wmi_fnesc_event[] = "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6";
+static const char *const ideapad_wmi_fnesc_events[] = {
+       "26CAB2E5-5CF1-46AE-AAC3-4A12B6BA50E6", /* Yoga 3 */
+       "56322276-8493-4CE8-A783-98C991274F5E", /* Yoga 700 */
+};
 #endif
 
 enum {
@@ -93,6 +96,7 @@ struct ideapad_private {
        struct dentry *debug;
        unsigned long cfg;
        bool has_hw_rfkill_switch;
+       const char *fnesc_guid;
 };
 
 static bool no_bt_rfkill;
@@ -989,8 +993,16 @@ static int ideapad_acpi_add(struct platform_device *pdev)
                ACPI_DEVICE_NOTIFY, ideapad_acpi_notify, priv);
        if (ret)
                goto notification_failed;
+
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-       ret = wmi_install_notify_handler(ideapad_wmi_fnesc_event, ideapad_wmi_notify, priv);
+       for (i = 0; i < ARRAY_SIZE(ideapad_wmi_fnesc_events); i++) {
+               ret = wmi_install_notify_handler(ideapad_wmi_fnesc_events[i],
+                                                ideapad_wmi_notify, priv);
+               if (ret == AE_OK) {
+                       priv->fnesc_guid = ideapad_wmi_fnesc_events[i];
+                       break;
+               }
+       }
        if (ret != AE_OK && ret != AE_NOT_EXIST)
                goto notification_failed_wmi;
 #endif
@@ -1020,7 +1032,8 @@ static int ideapad_acpi_remove(struct platform_device *pdev)
        int i;
 
 #if IS_ENABLED(CONFIG_ACPI_WMI)
-       wmi_remove_notify_handler(ideapad_wmi_fnesc_event);
+       if (priv->fnesc_guid)
+               wmi_remove_notify_handler(priv->fnesc_guid);
 #endif
        acpi_remove_notify_handler(priv->adev->handle,
                ACPI_DEVICE_NOTIFY, ideapad_acpi_notify);
index 0a919d81662cdd9fe19506323d78d1c05867e604..cbe01021c939b1c8c9dab2053ac749e0dd6cc792 100644 (file)
@@ -306,33 +306,32 @@ static int sensor_set_auxtrip(acpi_handle handle, int index, int value)
 #define to_intel_menlow_attr(_attr)    \
        container_of(_attr, struct intel_menlow_attribute, attr)
 
-static ssize_t aux0_show(struct device *dev,
-                        struct device_attribute *dev_attr, char *buf)
+static ssize_t aux_show(struct device *dev, struct device_attribute *dev_attr,
+                       char *buf, int idx)
 {
        struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
        unsigned long long value;
        int result;
 
-       result = sensor_get_auxtrip(attr->handle, 0, &value);
+       result = sensor_get_auxtrip(attr->handle, idx, &value);
 
        return result ? result : sprintf(buf, "%lu", DECI_KELVIN_TO_CELSIUS(value));
 }
 
-static ssize_t aux1_show(struct device *dev,
+static ssize_t aux0_show(struct device *dev,
                         struct device_attribute *dev_attr, char *buf)
 {
-       struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
-       unsigned long long value;
-       int result;
-
-       result = sensor_get_auxtrip(attr->handle, 1, &value);
+       return aux_show(dev, dev_attr, buf, 0);
+}
 
-       return result ? result : sprintf(buf, "%lu", DECI_KELVIN_TO_CELSIUS(value));
+static ssize_t aux1_show(struct device *dev,
+                        struct device_attribute *dev_attr, char *buf)
+{
+       return aux_show(dev, dev_attr, buf, 1);
 }
 
-static ssize_t aux0_store(struct device *dev,
-                         struct device_attribute *dev_attr,
-                         const char *buf, size_t count)
+static ssize_t aux_store(struct device *dev, struct device_attribute *dev_attr,
+                        const char *buf, size_t count, int idx)
 {
        struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
        int value;
@@ -345,27 +344,23 @@ static ssize_t aux0_store(struct device *dev,
        if (value < 0)
                return -EINVAL;
 
-       result = sensor_set_auxtrip(attr->handle, 0, CELSIUS_TO_DECI_KELVIN(value));
+       result = sensor_set_auxtrip(attr->handle, idx, 
+                                   CELSIUS_TO_DECI_KELVIN(value));
        return result ? result : count;
 }
 
-static ssize_t aux1_store(struct device *dev,
+static ssize_t aux0_store(struct device *dev,
                          struct device_attribute *dev_attr,
                          const char *buf, size_t count)
 {
-       struct intel_menlow_attribute *attr = to_intel_menlow_attr(dev_attr);
-       int value;
-       int result;
-
-       /*Sanity check; should be a positive integer */
-       if (!sscanf(buf, "%d", &value))
-               return -EINVAL;
-
-       if (value < 0)
-               return -EINVAL;
+       return aux_store(dev, dev_attr, buf, count, 0);
+}
 
-       result = sensor_set_auxtrip(attr->handle, 1, CELSIUS_TO_DECI_KELVIN(value));
-       return result ? result : count;
+static ssize_t aux1_store(struct device *dev,
+                         struct device_attribute *dev_attr,
+                         const char *buf, size_t count)
+{
+       return aux_store(dev, dev_attr, buf, count, 1);
 }
 
 /* BIOS can enable/disable the thermal user application in dabney platform */
diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c
new file mode 100644 (file)
index 0000000..2776bec
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Intel Core SoC Power Management Controller Driver
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/pmc_core.h>
+
+#include "intel_pmc_core.h"
+
+static struct pmc_dev pmc;
+
+static const struct pci_device_id pmc_pci_ids[] = {
+       { PCI_VDEVICE(INTEL, SPT_PMC_PCI_DEVICE_ID), (kernel_ulong_t)NULL },
+       { 0, },
+};
+
+static inline u32 pmc_core_reg_read(struct pmc_dev *pmcdev, int reg_offset)
+{
+       return readl(pmcdev->regbase + reg_offset);
+}
+
+static inline u32 pmc_core_adjust_slp_s0_step(u32 value)
+{
+       return value * SPT_PMC_SLP_S0_RES_COUNTER_STEP;
+}
+
+/**
+ * intel_pmc_slp_s0_counter_read() - Read SLP_S0 residency.
+ * @data: Out param that contains current SLP_S0 count.
+ *
+ * This API currently supports Intel Skylake SoC and Sunrise
+ * Point Platform Controller Hub. Future platform support
+ * should be added for platforms that support low power modes
+ * beyond Package C10 state.
+ *
+ * The SLP_S0_RESIDENCY counter counts in steps of 100 us, so
+ * this function stores the raw count multiplied by the step
+ * size (i.e. the residency in microseconds) in @data.
+ *
+ * Return: an error code or 0 on success.
+ */
+int intel_pmc_slp_s0_counter_read(u32 *data)
+{
+       struct pmc_dev *pmcdev = &pmc;
+       u32 value;
+
+       if (!pmcdev->has_slp_s0_res)
+               return -EACCES;
+
+       value = pmc_core_reg_read(pmcdev, SPT_PMC_SLP_S0_RES_COUNTER_OFFSET);
+       *data = pmc_core_adjust_slp_s0_step(value);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(intel_pmc_slp_s0_counter_read);
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static int pmc_core_dev_state_show(struct seq_file *s, void *unused)
+{
+       struct pmc_dev *pmcdev = s->private;
+       u32 counter_val;
+
+       counter_val = pmc_core_reg_read(pmcdev,
+                                       SPT_PMC_SLP_S0_RES_COUNTER_OFFSET);
+       seq_printf(s, "%u\n", pmc_core_adjust_slp_s0_step(counter_val));
+
+       return 0;
+}
+
+static int pmc_core_dev_state_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, pmc_core_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_core_dev_state_ops = {
+       .open           = pmc_core_dev_state_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev)
+{
+       debugfs_remove_recursive(pmcdev->dbgfs_dir);
+}
+
+static int pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
+{
+       struct dentry *dir, *file;
+
+       dir = debugfs_create_dir("pmc_core", NULL);
+       if (!dir)
+               return -ENOMEM;
+
+       pmcdev->dbgfs_dir = dir;
+       file = debugfs_create_file("slp_s0_residency_usec", S_IFREG | S_IRUGO,
+                                  dir, pmcdev, &pmc_core_dev_state_ops);
+
+       if (!file) {
+               pmc_core_dbgfs_unregister(pmcdev);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+#else
+static inline int pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
+{
+       return 0;
+}
+
+static inline void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static const struct x86_cpu_id intel_pmc_core_ids[] = {
+       { X86_VENDOR_INTEL, 6, 0x4e, X86_FEATURE_MWAIT,
+               (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
+       { X86_VENDOR_INTEL, 6, 0x5e, X86_FEATURE_MWAIT,
+               (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
+       {}
+};
+
+static int pmc_core_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+       struct device *ptr_dev = &dev->dev;
+       struct pmc_dev *pmcdev = &pmc;
+       const struct x86_cpu_id *cpu_id;
+       int err;
+
+       cpu_id = x86_match_cpu(intel_pmc_core_ids);
+       if (!cpu_id) {
+               dev_dbg(&dev->dev, "PMC Core: cpuid mismatch.\n");
+               return -EINVAL;
+       }
+
+       err = pcim_enable_device(dev);
+       if (err < 0) {
+               dev_dbg(&dev->dev, "PMC Core: failed to enable Power Management Controller.\n");
+               return err;
+       }
+
+       err = pci_read_config_dword(dev,
+                                   SPT_PMC_BASE_ADDR_OFFSET,
+                                   &pmcdev->base_addr);
+       if (err < 0) {
+               dev_dbg(&dev->dev, "PMC Core: failed to read PCI config space.\n");
+               return err;
+       }
+       dev_dbg(&dev->dev, "PMC Core: PWRMBASE is %#x\n", pmcdev->base_addr);
+
+       pmcdev->regbase = devm_ioremap_nocache(ptr_dev,
+                                             pmcdev->base_addr,
+                                             SPT_PMC_MMIO_REG_LEN);
+       if (!pmcdev->regbase) {
+               dev_dbg(&dev->dev, "PMC Core: ioremap failed.\n");
+               return -ENOMEM;
+       }
+
+       err = pmc_core_dbgfs_register(pmcdev);
+       if (err < 0) {
+               dev_err(&dev->dev, "PMC Core: debugfs register failed.\n");
+               return err;
+       }
+
+       pmc.has_slp_s0_res = true;
+       return 0;
+}
+
+static struct pci_driver intel_pmc_core_driver = {
+       .name = "intel_pmc_core",
+       .id_table = pmc_pci_ids,
+       .probe = pmc_core_probe,
+};
+
+builtin_pci_driver(intel_pmc_core_driver);
diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h
new file mode 100644 (file)
index 0000000..a9dadaf
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Intel Core SoC Power Management Controller Header File
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Authors: Rajneesh Bhardwaj <rajneesh.bhardwaj@intel.com>
+ *          Vishwanath Somayaji <vishwanath.somayaji@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_CORE_H
+#define PMC_CORE_H
+
+/* Sunrise Point Power Management Controller PCI Device ID */
+#define SPT_PMC_PCI_DEVICE_ID                  0x9d21
+#define SPT_PMC_BASE_ADDR_OFFSET               0x48
+#define SPT_PMC_SLP_S0_RES_COUNTER_OFFSET      0x13c
+#define SPT_PMC_MMIO_REG_LEN                   0x100
+#define SPT_PMC_SLP_S0_RES_COUNTER_STEP                0x64
+
+/**
+ * struct pmc_dev - pmc device structure
+ * @base_addr:         contains pmc base address
+ * @regbase:           pointer to io-remapped memory location
+ * @dbgfs_dir:         path to debug fs interface
+ * @has_slp_s0_res:    flag to indicate whether
+ *                     the SLP_S0 residency counter is available
+ *                     on a particular platform or not.
+ *
+ * pmc_dev contains info about power management controller device.
+ */
+struct pmc_dev {
+       u32 base_addr;
+       void __iomem *regbase;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+       struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+       bool has_slp_s0_res;
+};
+
+#endif /* PMC_CORE_H */
index a695a436a1c32630f192fded73b3279de5a2b144..0d4c3808a6d892f38b0bf6f3ae87dc9f8ae8dba0 100644 (file)
@@ -25,7 +25,7 @@
 
 struct telemetry_core_config {
        struct telemetry_plt_config *plt_config;
-       struct telemetry_core_ops *telem_ops;
+       const struct telemetry_core_ops *telem_ops;
 };
 
 static struct telemetry_core_config telm_core_conf;
@@ -95,7 +95,7 @@ static int telemetry_def_reset_events(void)
        return 0;
 }
 
-static struct telemetry_core_ops telm_defpltops = {
+static const struct telemetry_core_ops telm_defpltops = {
        .set_sampling_period = telemetry_def_set_sampling_period,
        .get_sampling_period = telemetry_def_get_sampling_period,
        .get_trace_verbosity = telemetry_def_get_trace_verbosity,
@@ -332,7 +332,7 @@ EXPORT_SYMBOL_GPL(telemetry_set_trace_verbosity);
  *
  * Return: 0 success, < 0 for failure
  */
-int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
                          struct telemetry_plt_config *pltconfig)
 {
        if (ops)
index 781bd10ca7ac5ba147e4e718d1d54577a7626e33..09c84a2b1c2cef7ecbeaf19795dfdd2048c5c66e 100644 (file)
@@ -1081,7 +1081,7 @@ out:
        return ret;
 }
 
-static struct telemetry_core_ops telm_pltops = {
+static const struct telemetry_core_ops telm_pltops = {
        .get_trace_verbosity = telemetry_plt_get_trace_verbosity,
        .set_trace_verbosity = telemetry_plt_set_trace_verbosity,
        .set_sampling_period = telemetry_plt_set_sampling_period,
index e9caa347a9bf3b174cb9b3aaaf0ecbea1dad16df..1dba3598cfcbdc531f1cdde7249974f0af0d3ef0 100644 (file)
@@ -1446,6 +1446,9 @@ static void sony_nc_function_cleanup(struct platform_device *pd)
 {
        unsigned int i, result, bitmask, handle;
 
+       if (!handles)
+               return;
+
        /* get enabled events and disable them */
        sony_nc_int_call(sony_nc_acpi_handle, "SN01", NULL, &bitmask);
        sony_nc_int_call(sony_nc_acpi_handle, "SN03", &bitmask, &result);
index 700e0fa0eec2ec75f7ebe094299a5312e01a8c8b..6505c97705e1486391088baa8728aafd3540403a 100644 (file)
@@ -24,6 +24,8 @@
 #define SURFACE_BUTTON_OBJ_NAME                "VGBI"
 #define SURFACE_BUTTON_DEVICE_NAME     "Surface Pro 3/4 Buttons"
 
+#define SURFACE_BUTTON_NOTIFY_TABLET_MODE      0xc8
+
 #define SURFACE_BUTTON_NOTIFY_PRESS_POWER      0xc6
 #define SURFACE_BUTTON_NOTIFY_RELEASE_POWER    0xc7
 
@@ -33,7 +35,7 @@
 #define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_UP  0xc0
 #define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_UP        0xc1
 
-#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN        0xc2
+#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN                0xc2
 #define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN      0xc3
 
 ACPI_MODULE_NAME("surface pro 3 button");
@@ -105,9 +107,12 @@ static void surface_button_notify(struct acpi_device *device, u32 event)
        case SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN:
                key_code = KEY_VOLUMEDOWN;
                break;
+       case SURFACE_BUTTON_NOTIFY_TABLET_MODE:
+               dev_warn_once(&device->dev, "Tablet mode is not supported\n");
+               break;
        default:
                dev_info_ratelimited(&device->dev,
-                                 "Unsupported event [0x%x]\n", event);
+                                    "Unsupported event [0x%x]\n", event);
                break;
        }
        input = button->input;
index 9255ff3ee81ac74e5fc269f256aed955132e730d..c3bfa1fe95bf3fb9c61d36c1f42012547ee07580 100644 (file)
@@ -5001,6 +5001,8 @@ static int kbdlight_set_level(int level)
        return 0;
 }
 
+static int kbdlight_set_level_and_update(int level);
+
 static int kbdlight_get_level(void)
 {
        int status = 0;
@@ -5068,7 +5070,7 @@ static void kbdlight_set_worker(struct work_struct *work)
                        container_of(work, struct tpacpi_led_classdev, work);
 
        if (likely(tpacpi_lifecycle == TPACPI_LIFE_RUNNING))
-               kbdlight_set_level(data->new_state);
+               kbdlight_set_level_and_update(data->new_state);
 }
 
 static void kbdlight_sysfs_set(struct led_classdev *led_cdev,
@@ -5099,7 +5101,6 @@ static struct tpacpi_led_classdev tpacpi_led_kbdlight = {
                .max_brightness = 2,
                .brightness_set = &kbdlight_sysfs_set,
                .brightness_get = &kbdlight_sysfs_get,
-               .flags          = LED_CORE_SUSPENDRESUME,
        }
 };
 
@@ -5137,6 +5138,20 @@ static void kbdlight_exit(void)
        flush_workqueue(tpacpi_wq);
 }
 
+static int kbdlight_set_level_and_update(int level)
+{
+       int ret;
+       struct led_classdev *led_cdev;
+
+       ret = kbdlight_set_level(level);
+       led_cdev = &tpacpi_led_kbdlight.led_classdev;
+
+       if (ret == 0 && !(led_cdev->flags & LED_SUSPENDED))
+               led_cdev->brightness = level;
+
+       return ret;
+}
+
 static int kbdlight_read(struct seq_file *m)
 {
        int level;
@@ -5177,13 +5192,35 @@ static int kbdlight_write(char *buf)
        if (level == -1)
                return -EINVAL;
 
-       return kbdlight_set_level(level);
+       return kbdlight_set_level_and_update(level);
+}
+
+static void kbdlight_suspend(void)
+{
+       struct led_classdev *led_cdev;
+
+       if (!tp_features.kbdlight)
+               return;
+
+       led_cdev = &tpacpi_led_kbdlight.led_classdev;
+       led_update_brightness(led_cdev);
+       led_classdev_suspend(led_cdev);
+}
+
+static void kbdlight_resume(void)
+{
+       if (!tp_features.kbdlight)
+               return;
+
+       led_classdev_resume(&tpacpi_led_kbdlight.led_classdev);
 }
 
 static struct ibm_struct kbdlight_driver_data = {
        .name = "kbdlight",
        .read = kbdlight_read,
        .write = kbdlight_write,
+       .suspend = kbdlight_suspend,
+       .resume = kbdlight_resume,
        .exit = kbdlight_exit,
 };
 
index 680fbc795a0a350a9c55f5e10e10a7090a8bfaea..dba3843c53b8d2162d10100d2473be79014d4b4b 100644 (file)
@@ -75,6 +75,7 @@ static void free_pwms(struct pwm_chip *chip)
 
        for (i = 0; i < chip->npwm; i++) {
                struct pwm_device *pwm = &chip->pwms[i];
+
                radix_tree_delete(&pwm_tree, pwm->pwm);
        }
 
@@ -128,13 +129,6 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
        set_bit(PWMF_REQUESTED, &pwm->flags);
        pwm->label = label;
 
-       /*
-        * FIXME: This should be removed once all PWM users properly make use
-        * of struct pwm_args to initialize the PWM device. As long as this is
-        * here, the PWM state and hardware state can get out of sync.
-        */
-       pwm_apply_args(pwm);
-
        return 0;
 }
 
@@ -233,6 +227,19 @@ void *pwm_get_chip_data(struct pwm_device *pwm)
 }
 EXPORT_SYMBOL_GPL(pwm_get_chip_data);
 
+static bool pwm_ops_check(const struct pwm_ops *ops)
+{
+       /* driver supports legacy, non-atomic operation */
+       if (ops->config && ops->enable && ops->disable)
+               return true;
+
+       /* driver supports atomic operation */
+       if (ops->apply)
+               return true;
+
+       return false;
+}
+
 /**
  * pwmchip_add_with_polarity() - register a new PWM chip
  * @chip: the PWM chip to add
@@ -251,8 +258,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
        unsigned int i;
        int ret;
 
-       if (!chip || !chip->dev || !chip->ops || !chip->ops->config ||
-           !chip->ops->enable || !chip->ops->disable || !chip->npwm)
+       if (!chip || !chip->dev || !chip->ops || !chip->npwm)
+               return -EINVAL;
+
+       if (!pwm_ops_check(chip->ops))
                return -EINVAL;
 
        mutex_lock(&pwm_lock);
@@ -261,7 +270,7 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
        if (ret < 0)
                goto out;
 
-       chip->pwms = kzalloc(chip->npwm * sizeof(*pwm), GFP_KERNEL);
+       chip->pwms = kcalloc(chip->npwm, sizeof(*pwm), GFP_KERNEL);
        if (!chip->pwms) {
                ret = -ENOMEM;
                goto out;
@@ -275,8 +284,10 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
                pwm->chip = chip;
                pwm->pwm = chip->base + i;
                pwm->hwpwm = i;
-               pwm->polarity = polarity;
-               mutex_init(&pwm->lock);
+               pwm->state.polarity = polarity;
+
+               if (chip->ops->get_state)
+                       chip->ops->get_state(chip, pwm, &pwm->state);
 
                radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
        }
@@ -436,107 +447,138 @@ void pwm_free(struct pwm_device *pwm)
 EXPORT_SYMBOL_GPL(pwm_free);
 
 /**
- * pwm_config() - change a PWM device configuration
+ * pwm_apply_state() - atomically apply a new state to a PWM device
  * @pwm: PWM device
- * @duty_ns: "on" time (in nanoseconds)
- * @period_ns: duration (in nanoseconds) of one cycle
- *
- * Returns: 0 on success or a negative error code on failure.
+ * @state: new state to apply. This can be adjusted by the PWM driver
+ *        if the requested config is not achievable, for example,
+ *        ->duty_cycle and ->period might be approximated.
  */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state)
 {
        int err;
 
-       if (!pwm || duty_ns < 0 || period_ns <= 0 || duty_ns > period_ns)
+       if (!pwm)
                return -EINVAL;
 
-       err = pwm->chip->ops->config(pwm->chip, pwm, duty_ns, period_ns);
-       if (err)
-               return err;
-
-       pwm->duty_cycle = duty_ns;
-       pwm->period = period_ns;
+       if (!memcmp(state, &pwm->state, sizeof(*state)))
+               return 0;
 
-       return 0;
-}
-EXPORT_SYMBOL_GPL(pwm_config);
+       if (pwm->chip->ops->apply) {
+               err = pwm->chip->ops->apply(pwm->chip, pwm, state);
+               if (err)
+                       return err;
 
-/**
- * pwm_set_polarity() - configure the polarity of a PWM signal
- * @pwm: PWM device
- * @polarity: new polarity of the PWM signal
- *
- * Note that the polarity cannot be configured while the PWM device is
- * enabled.
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
-{
-       int err;
+               pwm->state = *state;
+       } else {
+               /*
+                * FIXME: restore the initial state in case of error.
+                */
+               if (state->polarity != pwm->state.polarity) {
+                       if (!pwm->chip->ops->set_polarity)
+                               return -ENOTSUPP;
+
+                       /*
+                        * Changing the polarity of a running PWM is
+                        * only allowed when the PWM driver implements
+                        * ->apply().
+                        */
+                       if (pwm->state.enabled) {
+                               pwm->chip->ops->disable(pwm->chip, pwm);
+                               pwm->state.enabled = false;
+                       }
+
+                       err = pwm->chip->ops->set_polarity(pwm->chip, pwm,
+                                                          state->polarity);
+                       if (err)
+                               return err;
+
+                       pwm->state.polarity = state->polarity;
+               }
 
-       if (!pwm || !pwm->chip->ops)
-               return -EINVAL;
+               if (state->period != pwm->state.period ||
+                   state->duty_cycle != pwm->state.duty_cycle) {
+                       err = pwm->chip->ops->config(pwm->chip, pwm,
+                                                    state->duty_cycle,
+                                                    state->period);
+                       if (err)
+                               return err;
 
-       if (!pwm->chip->ops->set_polarity)
-               return -ENOSYS;
+                       pwm->state.duty_cycle = state->duty_cycle;
+                       pwm->state.period = state->period;
+               }
 
-       mutex_lock(&pwm->lock);
+               if (state->enabled != pwm->state.enabled) {
+                       if (state->enabled) {
+                               err = pwm->chip->ops->enable(pwm->chip, pwm);
+                               if (err)
+                                       return err;
+                       } else {
+                               pwm->chip->ops->disable(pwm->chip, pwm);
+                       }
 
-       if (pwm_is_enabled(pwm)) {
-               err = -EBUSY;
-               goto unlock;
+                       pwm->state.enabled = state->enabled;
+               }
        }
 
-       err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
-       if (err)
-               goto unlock;
-
-       pwm->polarity = polarity;
-
-unlock:
-       mutex_unlock(&pwm->lock);
-       return err;
+       return 0;
 }
-EXPORT_SYMBOL_GPL(pwm_set_polarity);
+EXPORT_SYMBOL_GPL(pwm_apply_state);
 
 /**
- * pwm_enable() - start a PWM output toggling
+ * pwm_adjust_config() - adjust the current PWM config to the PWM arguments
  * @pwm: PWM device
  *
- * Returns: 0 on success or a negative error code on failure.
+ * This function will adjust the PWM config to the PWM arguments provided
+ * by the DT or PWM lookup table. This is particularly useful to adapt
+ * the bootloader config to the Linux one.
  */
-int pwm_enable(struct pwm_device *pwm)
+int pwm_adjust_config(struct pwm_device *pwm)
 {
-       int err = 0;
+       struct pwm_state state;
+       struct pwm_args pargs;
 
-       if (!pwm)
-               return -EINVAL;
+       pwm_get_args(pwm, &pargs);
+       pwm_get_state(pwm, &state);
 
-       mutex_lock(&pwm->lock);
+       /*
+        * If the current period is zero it means that either the PWM driver
+        * does not support initial state retrieval or the PWM has not yet
+        * been configured.
+        *
+        * In either case, we set up the new period and polarity, and assign a
+        * duty cycle of 0.
+        */
+       if (!state.period) {
+               state.duty_cycle = 0;
+               state.period = pargs.period;
+               state.polarity = pargs.polarity;
 
-       if (!test_and_set_bit(PWMF_ENABLED, &pwm->flags)) {
-               err = pwm->chip->ops->enable(pwm->chip, pwm);
-               if (err)
-                       clear_bit(PWMF_ENABLED, &pwm->flags);
+               return pwm_apply_state(pwm, &state);
        }
 
-       mutex_unlock(&pwm->lock);
+       /*
+        * Adjust the PWM duty cycle/period based on the period value provided
+        * in PWM args.
+        */
+       if (pargs.period != state.period) {
+               u64 dutycycle = (u64)state.duty_cycle * pargs.period;
 
-       return err;
-}
-EXPORT_SYMBOL_GPL(pwm_enable);
+               do_div(dutycycle, state.period);
+               state.duty_cycle = dutycycle;
+               state.period = pargs.period;
+       }
 
-/**
- * pwm_disable() - stop a PWM output toggling
- * @pwm: PWM device
- */
-void pwm_disable(struct pwm_device *pwm)
-{
-       if (pwm && test_and_clear_bit(PWMF_ENABLED, &pwm->flags))
-               pwm->chip->ops->disable(pwm->chip, pwm);
+       /*
+        * If the polarity changed, we should also change the duty cycle.
+        */
+       if (pargs.polarity != state.polarity) {
+               state.polarity = pargs.polarity;
+               state.duty_cycle = state.period - state.duty_cycle;
+       }
+
+       return pwm_apply_state(pwm, &state);
 }
-EXPORT_SYMBOL_GPL(pwm_disable);
+EXPORT_SYMBOL_GPL(pwm_adjust_config);
 
 static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
 {
@@ -754,13 +796,13 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
        if (!chip)
                goto out;
 
-       pwm->args.period = chosen->period;
-       pwm->args.polarity = chosen->polarity;
-
        pwm = pwm_request_from_chip(chip, chosen->index, con_id ?: dev_id);
        if (IS_ERR(pwm))
                goto out;
 
+       pwm->args.period = chosen->period;
+       pwm->args.polarity = chosen->polarity;
+
 out:
        mutex_unlock(&pwm_lookup_lock);
        return pwm;
@@ -907,15 +949,23 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 
        for (i = 0; i < chip->npwm; i++) {
                struct pwm_device *pwm = &chip->pwms[i];
+               struct pwm_state state;
+
+               pwm_get_state(pwm, &state);
 
                seq_printf(s, " pwm-%-3d (%-20.20s):", i, pwm->label);
 
                if (test_bit(PWMF_REQUESTED, &pwm->flags))
                        seq_puts(s, " requested");
 
-               if (pwm_is_enabled(pwm))
+               if (state.enabled)
                        seq_puts(s, " enabled");
 
+               seq_printf(s, " period: %u ns", state.period);
+               seq_printf(s, " duty: %u ns", state.duty_cycle);
+               seq_printf(s, " polarity: %s",
+                          state.polarity ? "inverse" : "normal");
+
                seq_puts(s, "\n");
        }
 }
index 7101c7020bf454700f06ed525adf5a6d8757298e..bd0ebd04856a8a2988162d10cbd5e0204eb4639d 100644 (file)
@@ -75,7 +75,7 @@ static int crc_pwm_config(struct pwm_chip *c, struct pwm_device *pwm,
                return -EINVAL;
        }
 
-       if (pwm->period != period_ns) {
+       if (pwm_get_period(pwm) != period_ns) {
                int clk_div;
 
                /* changing the clk divisor, need to disable fisrt */
index 9861fed4e67d04e8d9f8f8488c7c276b3560fe7d..19dc64cab2f0f9c88b6aa15275a63b936bdf34c8 100644 (file)
@@ -249,7 +249,7 @@ static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
                           LPC18XX_PWM_EVSTATEMSK(lpc18xx_data->duty_event),
                           LPC18XX_PWM_EVSTATEMSK_ALL);
 
-       if (pwm->polarity == PWM_POLARITY_NORMAL) {
+       if (pwm_get_polarity(pwm) == PWM_POLARITY_NORMAL) {
                set_event = lpc18xx_pwm->period_event;
                clear_event = lpc18xx_data->duty_event;
                res_action = LPC18XX_PWM_RES_SET;
index b7e6ecba7d5c23422323acb4dfa6c6fe432a3a29..3e95090cd7cf5c21c758225b547cb5c5c57ed7e0 100644 (file)
@@ -192,7 +192,7 @@ static int pwm_omap_dmtimer_config(struct pwm_chip *chip,
                load_value, load_value, match_value, match_value);
 
        omap->pdata->set_pwm(omap->dm_timer,
-                             pwm->polarity == PWM_POLARITY_INVERSED,
+                             pwm_get_polarity(pwm) == PWM_POLARITY_INVERSED,
                              true,
                              PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE);
 
index 7b8ac06781378680a6f22d906c3252dd2023f0d6..1c85ecc9e7ac076e7ad6e13333a1a6f4b15b07b6 100644 (file)
@@ -157,7 +157,7 @@ static int rcar_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                return div;
 
        /* Let the core driver set pwm->period if disabled and duty_ns == 0 */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags) && !duty_ns)
+       if (!pwm_is_enabled(pwm) && !duty_ns)
                return 0;
 
        rcar_pwm_update(rp, RCAR_PWMCR_SYNC, RCAR_PWMCR_SYNC, RCAR_PWMCR);
index 67af9f62361fdc4b712c8adafbe4e331e32af9ae..03a99a53c39e2709247ce64f21629c4ac5181b79 100644 (file)
@@ -354,7 +354,8 @@ static int sun4i_pwm_probe(struct platform_device *pdev)
        val = sun4i_pwm_readl(pwm, PWM_CTRL_REG);
        for (i = 0; i < pwm->chip.npwm; i++)
                if (!(val & BIT_CH(PWM_ACT_STATE, i)))
-                       pwm->chip.pwms[i].polarity = PWM_POLARITY_INVERSED;
+                       pwm_set_polarity(&pwm->chip.pwms[i],
+                                        PWM_POLARITY_INVERSED);
        clk_disable_unprepare(pwm->clk);
 
        return 0;
index 9c90886f41234153d044d24f94a9ae1e82efbd83..d98599249a05688137325fcf906415e14aa7aed4 100644 (file)
@@ -26,6 +26,7 @@
 struct pwm_export {
        struct device child;
        struct pwm_device *pwm;
+       struct mutex lock;
 };
 
 static struct pwm_export *child_to_pwm_export(struct device *child)
@@ -45,15 +46,20 @@ static ssize_t period_show(struct device *child,
                           char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
 
-       return sprintf(buf, "%u\n", pwm_get_period(pwm));
+       pwm_get_state(pwm, &state);
+
+       return sprintf(buf, "%u\n", state.period);
 }
 
 static ssize_t period_store(struct device *child,
                            struct device_attribute *attr,
                            const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        unsigned int val;
        int ret;
 
@@ -61,7 +67,11 @@ static ssize_t period_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val);
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.period = val;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -71,15 +81,20 @@ static ssize_t duty_cycle_show(struct device *child,
                               char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       return sprintf(buf, "%u\n", pwm_get_duty_cycle(pwm));
+       return sprintf(buf, "%u\n", state.duty_cycle);
 }
 
 static ssize_t duty_cycle_store(struct device *child,
                                struct device_attribute *attr,
                                const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        unsigned int val;
        int ret;
 
@@ -87,7 +102,11 @@ static ssize_t duty_cycle_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, val, pwm_get_period(pwm));
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.duty_cycle = val;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -97,33 +116,46 @@ static ssize_t enable_show(struct device *child,
                           char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       return sprintf(buf, "%d\n", pwm_is_enabled(pwm));
+       return sprintf(buf, "%d\n", state.enabled);
 }
 
 static ssize_t enable_store(struct device *child,
                            struct device_attribute *attr,
                            const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
+       struct pwm_state state;
        int val, ret;
 
        ret = kstrtoint(buf, 0, &val);
        if (ret)
                return ret;
 
+       mutex_lock(&export->lock);
+
+       pwm_get_state(pwm, &state);
+
        switch (val) {
        case 0:
-               pwm_disable(pwm);
+               state.enabled = false;
                break;
        case 1:
-               ret = pwm_enable(pwm);
+               state.enabled = true;
                break;
        default:
                ret = -EINVAL;
-               break;
+               goto unlock;
        }
 
+       pwm_apply_state(pwm, &state);
+
+unlock:
+       mutex_unlock(&export->lock);
        return ret ? : size;
 }
 
@@ -133,8 +165,11 @@ static ssize_t polarity_show(struct device *child,
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
        const char *polarity = "unknown";
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
 
-       switch (pwm_get_polarity(pwm)) {
+       switch (state.polarity) {
        case PWM_POLARITY_NORMAL:
                polarity = "normal";
                break;
@@ -151,8 +186,10 @@ static ssize_t polarity_store(struct device *child,
                              struct device_attribute *attr,
                              const char *buf, size_t size)
 {
-       struct pwm_device *pwm = child_to_pwm_device(child);
+       struct pwm_export *export = child_to_pwm_export(child);
+       struct pwm_device *pwm = export->pwm;
        enum pwm_polarity polarity;
+       struct pwm_state state;
        int ret;
 
        if (sysfs_streq(buf, "normal"))
@@ -162,7 +199,11 @@ static ssize_t polarity_store(struct device *child,
        else
                return -EINVAL;
 
-       ret = pwm_set_polarity(pwm, polarity);
+       mutex_lock(&export->lock);
+       pwm_get_state(pwm, &state);
+       state.polarity = polarity;
+       ret = pwm_apply_state(pwm, &state);
+       mutex_unlock(&export->lock);
 
        return ret ? : size;
 }
@@ -203,6 +244,7 @@ static int pwm_export_child(struct device *parent, struct pwm_device *pwm)
        }
 
        export->pwm = pwm;
+       mutex_init(&export->lock);
 
        export->child.release = pwm_export_release;
        export->child.parent = parent;
index b83908670a9ab0a229029353b669cbd82956eca2..bed53c46dd90657f5432dd053f4cee8e774763b8 100644 (file)
@@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static blk_qc_t dcssblk_make_request(struct request_queue *q,
                                                struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-                        void __pmem **kaddr, pfn_t *pfn);
+                        void __pmem **kaddr, pfn_t *pfn, long size);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -884,7 +884,7 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-                       void __pmem **kaddr, pfn_t *pfn)
+                       void __pmem **kaddr, pfn_t *pfn, long size)
 {
        struct dcssblk_dev_info *dev_info;
        unsigned long offset, dev_sz;
index 3c3e56df526efb251cff6d859867d6484884c904..a003ba26ca6e7ec6b0bfadaeb2932d504381419b 100644 (file)
@@ -1059,7 +1059,7 @@ static const struct pmic_wrapper_type pwrap_mt2701 = {
        .regs = mt2701_regs,
        .type = PWRAP_MT2701,
        .arb_en_all = 0x3f,
-       .int_en_all = ~(BIT(31) | BIT(2)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(2)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE_NEW,
        .wdt_src = PWRAP_WDT_SRC_MASK_ALL,
        .has_bridge = 0,
@@ -1071,7 +1071,7 @@ static struct pmic_wrapper_type pwrap_mt8135 = {
        .regs = mt8135_regs,
        .type = PWRAP_MT8135,
        .arb_en_all = 0x1ff,
-       .int_en_all = ~(BIT(31) | BIT(1)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(1)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE,
        .wdt_src = PWRAP_WDT_SRC_MASK_ALL,
        .has_bridge = 1,
@@ -1083,7 +1083,7 @@ static struct pmic_wrapper_type pwrap_mt8173 = {
        .regs = mt8173_regs,
        .type = PWRAP_MT8173,
        .arb_en_all = 0x3f,
-       .int_en_all = ~(BIT(31) | BIT(1)),
+       .int_en_all = ~(u32)(BIT(31) | BIT(1)),
        .spi_w = PWRAP_MAN_CMD_SPI_WRITE,
        .wdt_src = PWRAP_WDT_SRC_MASK_NO_STAUPD,
        .has_bridge = 0,
index 9d8c84bb15446a75153d64f94a5900ab165193ac..4b931ec8d90b610f498a3e317e0523e9d1aa6900 100644 (file)
@@ -410,7 +410,6 @@ config SPI_OMAP_UWIRE
 config SPI_OMAP24XX
        tristate "McSPI driver for OMAP"
        depends on HAS_DMA
-       depends on ARM || ARM64 || AVR32 || HEXAGON || MIPS || SUPERH
        depends on ARCH_OMAP2PLUS || COMPILE_TEST
        help
          SPI master controller for OMAP24XX and later Multichannel SPI
@@ -432,10 +431,23 @@ config SPI_OMAP_100K
 
 config SPI_ORION
        tristate "Orion SPI master"
-       depends on PLAT_ORION || COMPILE_TEST
+       depends on PLAT_ORION || ARCH_MVEBU || COMPILE_TEST
        help
          This enables using the SPI master controller on the Orion chips.
 
+config SPI_PIC32
+       tristate "Microchip PIC32 series SPI"
+       depends on MACH_PIC32 || COMPILE_TEST
+       help
+         SPI driver for Microchip PIC32 SPI master controller.
+
+config SPI_PIC32_SQI
+       tristate "Microchip PIC32 Quad SPI driver"
+       depends on MACH_PIC32 || COMPILE_TEST
+       depends on HAS_DMA
+       help
+         SPI driver for PIC32 Quad SPI controller.
+
 config SPI_PL022
        tristate "ARM AMBA PL022 SSP controller"
        depends on ARM_AMBA
@@ -469,7 +481,6 @@ config SPI_PXA2XX_PCI
 
 config SPI_ROCKCHIP
        tristate "Rockchip SPI controller driver"
-       depends on ARM || ARM64 || AVR32 || HEXAGON || MIPS || SUPERH
        help
          This selects a driver for Rockchip SPI controller.
 
@@ -569,7 +580,7 @@ config SPI_SIRF
 
 config SPI_ST_SSC4
        tristate "STMicroelectronics SPI SSC-based driver"
-       depends on ARCH_STI
+       depends on ARCH_STI || COMPILE_TEST
        help
          STMicroelectronics SoCs support for SPI. If you say yes to
          this option, support will be included for the SSC driven SPI.
@@ -656,7 +667,7 @@ config SPI_XILINX
 
 config SPI_XLP
        tristate "Netlogic XLP SPI controller driver"
-       depends on CPU_XLP || COMPILE_TEST
+       depends on CPU_XLP || ARCH_VULCAN || COMPILE_TEST
        help
          Enable support for the SPI controller on the Netlogic XLP SoCs.
          Currently supported XLP variants are XLP8XX, XLP3XX, XLP2XX, XLP9XX
index fbb255c5a608d270ccdb10f1d7ea8f2bd3e24c45..3c74d003535bb9fb922392901c0a9905dd9cdcbd 100644 (file)
@@ -62,6 +62,8 @@ obj-$(CONFIG_SPI_OMAP_100K)           += spi-omap-100k.o
 obj-$(CONFIG_SPI_OMAP24XX)             += spi-omap2-mcspi.o
 obj-$(CONFIG_SPI_TI_QSPI)              += spi-ti-qspi.o
 obj-$(CONFIG_SPI_ORION)                        += spi-orion.o
+obj-$(CONFIG_SPI_PIC32)                        += spi-pic32.o
+obj-$(CONFIG_SPI_PIC32_SQI)            += spi-pic32-sqi.o
 obj-$(CONFIG_SPI_PL022)                        += spi-pl022.o
 obj-$(CONFIG_SPI_PPC4xx)               += spi-ppc4xx.o
 spi-pxa2xx-platform-objs               := spi-pxa2xx.o spi-pxa2xx-dma.o
index c968ab210a5157482f199f23328d7fb22c27e0c7..2b1456e5e2219c19cf5ab83c2b2cd61d987e118f 100644 (file)
@@ -525,7 +525,6 @@ static int spi_engine_probe(struct platform_device *pdev)
        if (ret)
                goto err_ref_clk_disable;
 
-       master->dev.parent = &pdev->dev;
        master->dev.of_node = pdev->dev.of_node;
        master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_3WIRE;
        master->bits_per_word_mask = SPI_BPW_MASK(8);
index cc3f938f0a6b67632515ee0606f67417779043e9..afb51699dbb5a6953821189ed03a9bfbfa8d9b7a 100644 (file)
@@ -10,6 +10,7 @@
 #include "spi-bcm53xx.h"
 
 #define BCM53XXSPI_MAX_SPI_BAUD        13500000        /* 216 MHz? */
+#define BCM53XXSPI_FLASH_WINDOW        SZ_32M
 
 /* The longest observed required wait was 19 ms */
 #define BCM53XXSPI_SPE_TIMEOUT_MS      80
 struct bcm53xxspi {
        struct bcma_device *core;
        struct spi_master *master;
+       void __iomem *mmio_base;
 
        size_t read_offset;
+       bool bspi;                              /* Boot SPI mode with memory mapping */
 };
 
 static inline u32 bcm53xxspi_read(struct bcm53xxspi *b53spi, u16 offset)
@@ -32,6 +35,50 @@ static inline void bcm53xxspi_write(struct bcm53xxspi *b53spi, u16 offset,
        bcma_write32(b53spi->core, offset, value);
 }
 
+static void bcm53xxspi_disable_bspi(struct bcm53xxspi *b53spi)
+{
+       struct device *dev = &b53spi->core->dev;
+       unsigned long deadline;
+       u32 tmp;
+
+       if (!b53spi->bspi)
+               return;
+
+       tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
+       if (tmp & 0x1)
+               return;
+
+       deadline = jiffies + usecs_to_jiffies(200);
+       do {
+               tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_BUSY_STATUS);
+               if (!(tmp & 0x1)) {
+                       bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL,
+                                        0x1);
+                       ndelay(200);
+                       b53spi->bspi = false;
+                       return;
+               }
+               udelay(1);
+       } while (!time_after_eq(jiffies, deadline));
+
+       dev_warn(dev, "Timeout disabling BSPI\n");
+}
+
+static void bcm53xxspi_enable_bspi(struct bcm53xxspi *b53spi)
+{
+       u32 tmp;
+
+       if (b53spi->bspi)
+               return;
+
+       tmp = bcm53xxspi_read(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL);
+       if (!(tmp & 0x1))
+               return;
+
+       bcm53xxspi_write(b53spi, B53SPI_BSPI_MAST_N_BOOT_CTRL, 0x0);
+       b53spi->bspi = true;
+}
+
 static inline unsigned int bcm53xxspi_calc_timeout(size_t len)
 {
        /* Do some magic calculation based on length and buad. Add 10% and 1. */
@@ -176,6 +223,8 @@ static int bcm53xxspi_transfer_one(struct spi_master *master,
        u8 *buf;
        size_t left;
 
+       bcm53xxspi_disable_bspi(b53spi);
+
        if (t->tx_buf) {
                buf = (u8 *)t->tx_buf;
                left = t->len;
@@ -206,6 +255,22 @@ static int bcm53xxspi_transfer_one(struct spi_master *master,
        return 0;
 }
 
+static int bcm53xxspi_flash_read(struct spi_device *spi,
+                                struct spi_flash_read_message *msg)
+{
+       struct bcm53xxspi *b53spi = spi_master_get_devdata(spi->master);
+       int ret = 0;
+
+       if (msg->from + msg->len > BCM53XXSPI_FLASH_WINDOW)
+               return -EINVAL;
+
+       bcm53xxspi_enable_bspi(b53spi);
+       memcpy_fromio(msg->buf, b53spi->mmio_base + msg->from, msg->len);
+       msg->retlen = msg->len;
+
+       return ret;
+}
+
 /**************************************************
  * BCMA
  **************************************************/
@@ -222,6 +287,7 @@ MODULE_DEVICE_TABLE(bcma, bcm53xxspi_bcma_tbl);
 
 static int bcm53xxspi_bcma_probe(struct bcma_device *core)
 {
+       struct device *dev = &core->dev;
        struct bcm53xxspi *b53spi;
        struct spi_master *master;
        int err;
@@ -231,7 +297,7 @@ static int bcm53xxspi_bcma_probe(struct bcma_device *core)
                return -ENOTSUPP;
        }
 
-       master = spi_alloc_master(&core->dev, sizeof(*b53spi));
+       master = spi_alloc_master(dev, sizeof(*b53spi));
        if (!master)
                return -ENOMEM;
 
@@ -239,11 +305,19 @@ static int bcm53xxspi_bcma_probe(struct bcma_device *core)
        b53spi->master = master;
        b53spi->core = core;
 
+       if (core->addr_s[0])
+               b53spi->mmio_base = devm_ioremap(dev, core->addr_s[0],
+                                                BCM53XXSPI_FLASH_WINDOW);
+       b53spi->bspi = true;
+       bcm53xxspi_disable_bspi(b53spi);
+
        master->transfer_one = bcm53xxspi_transfer_one;
+       if (b53spi->mmio_base)
+               master->spi_flash_read = bcm53xxspi_flash_read;
 
        bcma_set_drvdata(core, b53spi);
 
-       err = devm_spi_register_master(&core->dev, master);
+       err = devm_spi_register_master(dev, master);
        if (err) {
                spi_master_put(master);
                bcma_set_drvdata(core, NULL);
index 121a4135b5401b4dbe025f822c927d0f4d1b2947..1c57ce64abba029a209fca1caf5495f294c5fa85 100644 (file)
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/spi/spi.h>
 
 /* Name of this driver */
 #define CDNS_SPI_NAME          "cdns-spi"
 
 /* Register offset definitions */
-#define CDNS_SPI_CR_OFFSET     0x00 /* Configuration  Register, RW */
-#define CDNS_SPI_ISR_OFFSET    0x04 /* Interrupt Status Register, RO */
-#define CDNS_SPI_IER_OFFSET    0x08 /* Interrupt Enable Register, WO */
-#define CDNS_SPI_IDR_OFFSET    0x0c /* Interrupt Disable Register, WO */
-#define CDNS_SPI_IMR_OFFSET    0x10 /* Interrupt Enabled Mask Register, RO */
-#define CDNS_SPI_ER_OFFSET     0x14 /* Enable/Disable Register, RW */
-#define CDNS_SPI_DR_OFFSET     0x18 /* Delay Register, RW */
-#define CDNS_SPI_TXD_OFFSET    0x1C /* Data Transmit Register, WO */
-#define CDNS_SPI_RXD_OFFSET    0x20 /* Data Receive Register, RO */
-#define CDNS_SPI_SICR_OFFSET   0x24 /* Slave Idle Count Register, RW */
-#define CDNS_SPI_THLD_OFFSET   0x28 /* Transmit FIFO Watermark Register,RW */
-
+#define CDNS_SPI_CR    0x00 /* Configuration  Register, RW */
+#define CDNS_SPI_ISR   0x04 /* Interrupt Status Register, RO */
+#define CDNS_SPI_IER   0x08 /* Interrupt Enable Register, WO */
+#define CDNS_SPI_IDR   0x0c /* Interrupt Disable Register, WO */
+#define CDNS_SPI_IMR   0x10 /* Interrupt Enabled Mask Register, RO */
+#define CDNS_SPI_ER    0x14 /* Enable/Disable Register, RW */
+#define CDNS_SPI_DR    0x18 /* Delay Register, RW */
+#define CDNS_SPI_TXD   0x1C /* Data Transmit Register, WO */
+#define CDNS_SPI_RXD   0x20 /* Data Receive Register, RO */
+#define CDNS_SPI_SICR  0x24 /* Slave Idle Count Register, RW */
+#define CDNS_SPI_THLD  0x28 /* Transmit FIFO Watermark Register,RW */
+
+#define SPI_AUTOSUSPEND_TIMEOUT                3000
 /*
  * SPI Configuration Register bit Masks
  *
  * This register contains various control bits that affect the operation
  * of the SPI controller
  */
-#define CDNS_SPI_CR_MANSTRT_MASK       0x00010000 /* Manual TX Start */
-#define CDNS_SPI_CR_CPHA_MASK          0x00000004 /* Clock Phase Control */
-#define CDNS_SPI_CR_CPOL_MASK          0x00000002 /* Clock Polarity Control */
-#define CDNS_SPI_CR_SSCTRL_MASK                0x00003C00 /* Slave Select Mask */
-#define CDNS_SPI_CR_PERI_SEL_MASK      0x00000200 /* Peripheral Select Decode */
-#define CDNS_SPI_CR_BAUD_DIV_MASK      0x00000038 /* Baud Rate Divisor Mask */
-#define CDNS_SPI_CR_MSTREN_MASK                0x00000001 /* Master Enable Mask */
-#define CDNS_SPI_CR_MANSTRTEN_MASK     0x00008000 /* Manual TX Enable Mask */
-#define CDNS_SPI_CR_SSFORCE_MASK       0x00004000 /* Manual SS Enable Mask */
-#define CDNS_SPI_CR_BAUD_DIV_4_MASK    0x00000008 /* Default Baud Div Mask */
-#define CDNS_SPI_CR_DEFAULT_MASK       (CDNS_SPI_CR_MSTREN_MASK | \
-                                       CDNS_SPI_CR_SSCTRL_MASK | \
-                                       CDNS_SPI_CR_SSFORCE_MASK | \
-                                       CDNS_SPI_CR_BAUD_DIV_4_MASK)
+#define CDNS_SPI_CR_MANSTRT    0x00010000 /* Manual TX Start */
+#define CDNS_SPI_CR_CPHA               0x00000004 /* Clock Phase Control */
+#define CDNS_SPI_CR_CPOL               0x00000002 /* Clock Polarity Control */
+#define CDNS_SPI_CR_SSCTRL             0x00003C00 /* Slave Select Mask */
+#define CDNS_SPI_CR_PERI_SEL   0x00000200 /* Peripheral Select Decode */
+#define CDNS_SPI_CR_BAUD_DIV   0x00000038 /* Baud Rate Divisor Mask */
+#define CDNS_SPI_CR_MSTREN             0x00000001 /* Master Enable Mask */
+#define CDNS_SPI_CR_MANSTRTEN  0x00008000 /* Manual TX Enable Mask */
+#define CDNS_SPI_CR_SSFORCE    0x00004000 /* Manual SS Enable Mask */
+#define CDNS_SPI_CR_BAUD_DIV_4 0x00000008 /* Default Baud Div Mask */
+#define CDNS_SPI_CR_DEFAULT    (CDNS_SPI_CR_MSTREN | \
+                                       CDNS_SPI_CR_SSCTRL | \
+                                       CDNS_SPI_CR_SSFORCE | \
+                                       CDNS_SPI_CR_BAUD_DIV_4)
 
 /*
  * SPI Configuration Register - Baud rate and slave select
  * All the four interrupt registers (Status/Mask/Enable/Disable) have the same
  * bit definitions.
  */
-#define CDNS_SPI_IXR_TXOW_MASK 0x00000004 /* SPI TX FIFO Overwater */
-#define CDNS_SPI_IXR_MODF_MASK 0x00000002 /* SPI Mode Fault */
-#define CDNS_SPI_IXR_RXNEMTY_MASK 0x00000010 /* SPI RX FIFO Not Empty */
-#define CDNS_SPI_IXR_DEFAULT_MASK      (CDNS_SPI_IXR_TXOW_MASK | \
-                                       CDNS_SPI_IXR_MODF_MASK)
-#define CDNS_SPI_IXR_TXFULL_MASK       0x00000008 /* SPI TX Full */
-#define CDNS_SPI_IXR_ALL_MASK  0x0000007F /* SPI all interrupts */
+#define CDNS_SPI_IXR_TXOW      0x00000004 /* SPI TX FIFO Overwater */
+#define CDNS_SPI_IXR_MODF      0x00000002 /* SPI Mode Fault */
+#define CDNS_SPI_IXR_RXNEMTY 0x00000010 /* SPI RX FIFO Not Empty */
+#define CDNS_SPI_IXR_DEFAULT   (CDNS_SPI_IXR_TXOW | \
+                                       CDNS_SPI_IXR_MODF)
+#define CDNS_SPI_IXR_TXFULL    0x00000008 /* SPI TX Full */
+#define CDNS_SPI_IXR_ALL       0x0000007F /* SPI all interrupts */
 
 /*
  * SPI Enable Register bit Masks
  *
  * This register is used to enable or disable the SPI controller
  */
-#define CDNS_SPI_ER_ENABLE_MASK        0x00000001 /* SPI Enable Bit Mask */
-#define CDNS_SPI_ER_DISABLE_MASK       0x0 /* SPI Disable Bit Mask */
+#define CDNS_SPI_ER_ENABLE     0x00000001 /* SPI Enable Bit Mask */
+#define CDNS_SPI_ER_DISABLE    0x0 /* SPI Disable Bit Mask */
 
 /* SPI FIFO depth in bytes */
 #define CDNS_SPI_FIFO_DEPTH    128
@@ -149,56 +151,51 @@ static inline void cdns_spi_write(struct cdns_spi *xspi, u32 offset, u32 val)
  */
 static void cdns_spi_init_hw(struct cdns_spi *xspi)
 {
-       u32 ctrl_reg = CDNS_SPI_CR_DEFAULT_MASK;
+       u32 ctrl_reg = CDNS_SPI_CR_DEFAULT;
 
        if (xspi->is_decoded_cs)
-               ctrl_reg |= CDNS_SPI_CR_PERI_SEL_MASK;
+               ctrl_reg |= CDNS_SPI_CR_PERI_SEL;
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
-       cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                      CDNS_SPI_IXR_ALL_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
+       cdns_spi_write(xspi, CDNS_SPI_IDR, CDNS_SPI_IXR_ALL);
 
        /* Clear the RX FIFO */
-       while (cdns_spi_read(xspi, CDNS_SPI_ISR_OFFSET) &
-              CDNS_SPI_IXR_RXNEMTY_MASK)
-               cdns_spi_read(xspi, CDNS_SPI_RXD_OFFSET);
-
-       cdns_spi_write(xspi, CDNS_SPI_ISR_OFFSET,
-                      CDNS_SPI_IXR_ALL_MASK);
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_ENABLE_MASK);
+       while (cdns_spi_read(xspi, CDNS_SPI_ISR) & CDNS_SPI_IXR_RXNEMTY)
+               cdns_spi_read(xspi, CDNS_SPI_RXD);
+
+       cdns_spi_write(xspi, CDNS_SPI_ISR, CDNS_SPI_IXR_ALL);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
 }
 
 /**
  * cdns_spi_chipselect - Select or deselect the chip select line
  * @spi:       Pointer to the spi_device structure
- * @is_on:     Select(0) or deselect (1) the chip select line
+ * @is_high:   Select(0) or deselect (1) the chip select line
  */
 static void cdns_spi_chipselect(struct spi_device *spi, bool is_high)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg;
 
-       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
 
        if (is_high) {
                /* Deselect the slave */
-               ctrl_reg |= CDNS_SPI_CR_SSCTRL_MASK;
+               ctrl_reg |= CDNS_SPI_CR_SSCTRL;
        } else {
                /* Select the slave */
-               ctrl_reg &= ~CDNS_SPI_CR_SSCTRL_MASK;
+               ctrl_reg &= ~CDNS_SPI_CR_SSCTRL;
                if (!(xspi->is_decoded_cs))
                        ctrl_reg |= ((~(CDNS_SPI_SS0 << spi->chip_select)) <<
                                     CDNS_SPI_SS_SHIFT) &
-                                    CDNS_SPI_CR_SSCTRL_MASK;
+                                    CDNS_SPI_CR_SSCTRL;
                else
                        ctrl_reg |= (spi->chip_select << CDNS_SPI_SS_SHIFT) &
-                                    CDNS_SPI_CR_SSCTRL_MASK;
+                                    CDNS_SPI_CR_SSCTRL;
        }
 
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
 }
 
 /**
@@ -212,14 +209,15 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg, new_ctrl_reg;
 
-       new_ctrl_reg = ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       new_ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
+       ctrl_reg = new_ctrl_reg;
 
        /* Set the SPI clock phase and clock polarity */
-       new_ctrl_reg &= ~(CDNS_SPI_CR_CPHA_MASK | CDNS_SPI_CR_CPOL_MASK);
+       new_ctrl_reg &= ~(CDNS_SPI_CR_CPHA | CDNS_SPI_CR_CPOL);
        if (spi->mode & SPI_CPHA)
-               new_ctrl_reg |= CDNS_SPI_CR_CPHA_MASK;
+               new_ctrl_reg |= CDNS_SPI_CR_CPHA;
        if (spi->mode & SPI_CPOL)
-               new_ctrl_reg |= CDNS_SPI_CR_CPOL_MASK;
+               new_ctrl_reg |= CDNS_SPI_CR_CPOL;
 
        if (new_ctrl_reg != ctrl_reg) {
                /*
@@ -228,11 +226,9 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
                 * polarity as it will cause the SPI slave to see spurious clock
                 * transitions. To workaround the issue toggle the ER register.
                 */
-               cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                                  CDNS_SPI_ER_DISABLE_MASK);
-               cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, new_ctrl_reg);
-               cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                                  CDNS_SPI_ER_ENABLE_MASK);
+               cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
+               cdns_spi_write(xspi, CDNS_SPI_CR, new_ctrl_reg);
+               cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
        }
 }
 
@@ -251,7 +247,7 @@ static void cdns_spi_config_clock_mode(struct spi_device *spi)
  * controller.
  */
 static void cdns_spi_config_clock_freq(struct spi_device *spi,
-                                 struct spi_transfer *transfer)
+                                      struct spi_transfer *transfer)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(spi->master);
        u32 ctrl_reg, baud_rate_val;
@@ -259,7 +255,7 @@ static void cdns_spi_config_clock_freq(struct spi_device *spi,
 
        frequency = clk_get_rate(xspi->ref_clk);
 
-       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR_OFFSET);
+       ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR);
 
        /* Set the clock frequency */
        if (xspi->speed_hz != transfer->speed_hz) {
@@ -269,12 +265,12 @@ static void cdns_spi_config_clock_freq(struct spi_device *spi,
                       (frequency / (2 << baud_rate_val)) > transfer->speed_hz)
                        baud_rate_val++;
 
-               ctrl_reg &= ~CDNS_SPI_CR_BAUD_DIV_MASK;
+               ctrl_reg &= ~CDNS_SPI_CR_BAUD_DIV;
                ctrl_reg |= baud_rate_val << CDNS_SPI_BAUD_DIV_SHIFT;
 
                xspi->speed_hz = frequency / (2 << baud_rate_val);
        }
-       cdns_spi_write(xspi, CDNS_SPI_CR_OFFSET, ctrl_reg);
+       cdns_spi_write(xspi, CDNS_SPI_CR, ctrl_reg);
 }
 
 /**
@@ -313,10 +309,9 @@ static void cdns_spi_fill_tx_fifo(struct cdns_spi *xspi)
        while ((trans_cnt < CDNS_SPI_FIFO_DEPTH) &&
               (xspi->tx_bytes > 0)) {
                if (xspi->txbuf)
-                       cdns_spi_write(xspi, CDNS_SPI_TXD_OFFSET,
-                                      *xspi->txbuf++);
+                       cdns_spi_write(xspi, CDNS_SPI_TXD, *xspi->txbuf++);
                else
-                       cdns_spi_write(xspi, CDNS_SPI_TXD_OFFSET, 0);
+                       cdns_spi_write(xspi, CDNS_SPI_TXD, 0);
 
                xspi->tx_bytes--;
                trans_cnt++;
@@ -344,19 +339,18 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
        u32 intr_status, status;
 
        status = IRQ_NONE;
-       intr_status = cdns_spi_read(xspi, CDNS_SPI_ISR_OFFSET);
-       cdns_spi_write(xspi, CDNS_SPI_ISR_OFFSET, intr_status);
+       intr_status = cdns_spi_read(xspi, CDNS_SPI_ISR);
+       cdns_spi_write(xspi, CDNS_SPI_ISR, intr_status);
 
-       if (intr_status & CDNS_SPI_IXR_MODF_MASK) {
+       if (intr_status & CDNS_SPI_IXR_MODF) {
                /* Indicate that transfer is completed, the SPI subsystem will
                 * identify the error as the remaining bytes to be
                 * transferred is non-zero
                 */
-               cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                              CDNS_SPI_IXR_DEFAULT_MASK);
+               cdns_spi_write(xspi, CDNS_SPI_IDR, CDNS_SPI_IXR_DEFAULT);
                spi_finalize_current_transfer(master);
                status = IRQ_HANDLED;
-       } else if (intr_status & CDNS_SPI_IXR_TXOW_MASK) {
+       } else if (intr_status & CDNS_SPI_IXR_TXOW) {
                unsigned long trans_cnt;
 
                trans_cnt = xspi->rx_bytes - xspi->tx_bytes;
@@ -365,7 +359,7 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
                while (trans_cnt) {
                        u8 data;
 
-                       data = cdns_spi_read(xspi, CDNS_SPI_RXD_OFFSET);
+                       data = cdns_spi_read(xspi, CDNS_SPI_RXD);
                        if (xspi->rxbuf)
                                *xspi->rxbuf++ = data;
 
@@ -378,8 +372,8 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
                        cdns_spi_fill_tx_fifo(xspi);
                } else {
                        /* Transfer is completed */
-                       cdns_spi_write(xspi, CDNS_SPI_IDR_OFFSET,
-                                      CDNS_SPI_IXR_DEFAULT_MASK);
+                       cdns_spi_write(xspi, CDNS_SPI_IDR,
+                                      CDNS_SPI_IXR_DEFAULT);
                        spi_finalize_current_transfer(master);
                }
                status = IRQ_HANDLED;
@@ -387,6 +381,7 @@ static irqreturn_t cdns_spi_irq(int irq, void *dev_id)
 
        return status;
 }
+
 static int cdns_prepare_message(struct spi_master *master,
                                struct spi_message *msg)
 {
@@ -421,8 +416,7 @@ static int cdns_transfer_one(struct spi_master *master,
 
        cdns_spi_fill_tx_fifo(xspi);
 
-       cdns_spi_write(xspi, CDNS_SPI_IER_OFFSET,
-                      CDNS_SPI_IXR_DEFAULT_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_IER, CDNS_SPI_IXR_DEFAULT);
        return transfer->len;
 }
 
@@ -439,8 +433,7 @@ static int cdns_prepare_transfer_hardware(struct spi_master *master)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_ENABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_ENABLE);
 
        return 0;
 }
@@ -458,8 +451,7 @@ static int cdns_unprepare_transfer_hardware(struct spi_master *master)
 {
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
 
        return 0;
 }
@@ -481,7 +473,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        u32 num_cs;
 
        master = spi_alloc_master(&pdev->dev, sizeof(*xspi));
-       if (master == NULL)
+       if (!master)
                return -ENOMEM;
 
        xspi = spi_master_get_devdata(master);
@@ -521,6 +513,11 @@ static int cdns_spi_probe(struct platform_device *pdev)
                goto clk_dis_apb;
        }
 
+       pm_runtime_enable(&pdev->dev);
+       pm_runtime_use_autosuspend(&pdev->dev);
+       pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT);
+       pm_runtime_set_active(&pdev->dev);
+
        ret = of_property_read_u32(pdev->dev.of_node, "num-cs", &num_cs);
        if (ret < 0)
                master->num_chipselect = CDNS_SPI_DEFAULT_NUM_CS;
@@ -535,11 +532,14 @@ static int cdns_spi_probe(struct platform_device *pdev)
        /* SPI controller initializations */
        cdns_spi_init_hw(xspi);
 
+       pm_runtime_mark_last_busy(&pdev->dev);
+       pm_runtime_put_autosuspend(&pdev->dev);
+
        irq = platform_get_irq(pdev, 0);
        if (irq <= 0) {
                ret = -ENXIO;
                dev_err(&pdev->dev, "irq number is invalid\n");
-               goto remove_master;
+               goto clk_dis_all;
        }
 
        ret = devm_request_irq(&pdev->dev, irq, cdns_spi_irq,
@@ -547,7 +547,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        if (ret != 0) {
                ret = -ENXIO;
                dev_err(&pdev->dev, "request_irq failed\n");
-               goto remove_master;
+               goto clk_dis_all;
        }
 
        master->prepare_transfer_hardware = cdns_prepare_transfer_hardware;
@@ -555,6 +555,7 @@ static int cdns_spi_probe(struct platform_device *pdev)
        master->transfer_one = cdns_transfer_one;
        master->unprepare_transfer_hardware = cdns_unprepare_transfer_hardware;
        master->set_cs = cdns_spi_chipselect;
+       master->auto_runtime_pm = true;
        master->mode_bits = SPI_CPOL | SPI_CPHA;
 
        /* Set to default valid value */
@@ -572,6 +573,8 @@ static int cdns_spi_probe(struct platform_device *pdev)
        return ret;
 
 clk_dis_all:
+       pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
        clk_disable_unprepare(xspi->ref_clk);
 clk_dis_apb:
        clk_disable_unprepare(xspi->pclk);
@@ -595,11 +598,12 @@ static int cdns_spi_remove(struct platform_device *pdev)
        struct spi_master *master = platform_get_drvdata(pdev);
        struct cdns_spi *xspi = spi_master_get_devdata(master);
 
-       cdns_spi_write(xspi, CDNS_SPI_ER_OFFSET,
-                      CDNS_SPI_ER_DISABLE_MASK);
+       cdns_spi_write(xspi, CDNS_SPI_ER, CDNS_SPI_ER_DISABLE);
 
        clk_disable_unprepare(xspi->ref_clk);
        clk_disable_unprepare(xspi->pclk);
+       pm_runtime_set_suspended(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
 
        spi_unregister_master(master);
 
@@ -613,21 +617,14 @@ static int cdns_spi_remove(struct platform_device *pdev)
  * This function disables the SPI controller and
  * changes the driver state to "suspend"
  *
- * Return:     Always 0
+ * Return:     0 on success and error value on error
  */
 static int __maybe_unused cdns_spi_suspend(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct spi_master *master = platform_get_drvdata(pdev);
-       struct cdns_spi *xspi = spi_master_get_devdata(master);
-
-       spi_master_suspend(master);
-
-       clk_disable_unprepare(xspi->ref_clk);
-
-       clk_disable_unprepare(xspi->pclk);
 
-       return 0;
+       return spi_master_suspend(master);
 }
 
 /**
@@ -642,8 +639,23 @@ static int __maybe_unused cdns_spi_resume(struct device *dev)
 {
        struct platform_device *pdev = to_platform_device(dev);
        struct spi_master *master = platform_get_drvdata(pdev);
+
+       return spi_master_resume(master);
+}
+
+/**
+ * cdns_spi_runtime_resume - Runtime resume method for the SPI driver
+ * @dev:       Address of the platform_device structure
+ *
+ * This function enables the clocks
+ *
+ * Return:     0 on success and error value on error
+ */
+static int __maybe_unused cnds_runtime_resume(struct device *dev)
+{
+       struct spi_master *master = dev_get_drvdata(dev);
        struct cdns_spi *xspi = spi_master_get_devdata(master);
-       int ret = 0;
+       int ret;
 
        ret = clk_prepare_enable(xspi->pclk);
        if (ret) {
@@ -657,13 +669,33 @@ static int __maybe_unused cdns_spi_resume(struct device *dev)
                clk_disable(xspi->pclk);
                return ret;
        }
-       spi_master_resume(master);
+       return 0;
+}
+
+/**
+ * cdns_spi_runtime_suspend - Runtime suspend method for the SPI driver
+ * @dev:       Address of the platform_device structure
+ *
+ * This function disables the clocks
+ *
+ * Return:     Always 0
+ */
+static int __maybe_unused cnds_runtime_suspend(struct device *dev)
+{
+       struct spi_master *master = dev_get_drvdata(dev);
+       struct cdns_spi *xspi = spi_master_get_devdata(master);
+
+       clk_disable_unprepare(xspi->ref_clk);
+       clk_disable_unprepare(xspi->pclk);
 
        return 0;
 }
 
-static SIMPLE_DEV_PM_OPS(cdns_spi_dev_pm_ops, cdns_spi_suspend,
-                        cdns_spi_resume);
+static const struct dev_pm_ops cdns_spi_dev_pm_ops = {
+       SET_RUNTIME_PM_OPS(cnds_runtime_suspend,
+                          cnds_runtime_resume, NULL)
+       SET_SYSTEM_SLEEP_PM_OPS(cdns_spi_suspend, cdns_spi_resume)
+};
 
 static const struct of_device_id cdns_spi_of_match[] = {
        { .compatible = "xlnx,zynq-spi-r1p6" },
index fddb7a3be322be41041013282624fd866a573400..d36c11b73a35ca656ab04e9c5ef0492f96950b32 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
-#include <linux/edma.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
@@ -33,8 +32,6 @@
 
 #include <linux/platform_data/spi-davinci.h>
 
-#define SPI_NO_RESOURCE                ((resource_size_t)-1)
-
 #define CS_DEFAULT     0xFF
 
 #define SPIFMT_PHASE_MASK      BIT(16)
@@ -130,8 +127,6 @@ struct davinci_spi {
 
        struct dma_chan         *dma_rx;
        struct dma_chan         *dma_tx;
-       int                     dma_rx_chnum;
-       int                     dma_tx_chnum;
 
        struct davinci_spi_platform_data pdata;
 
@@ -797,35 +792,19 @@ static irqreturn_t davinci_spi_irq(s32 irq, void *data)
 
 static int davinci_spi_request_dma(struct davinci_spi *dspi)
 {
-       dma_cap_mask_t mask;
        struct device *sdev = dspi->bitbang.master->dev.parent;
-       int r;
-
-       dma_cap_zero(mask);
-       dma_cap_set(DMA_SLAVE, mask);
 
-       dspi->dma_rx = dma_request_channel(mask, edma_filter_fn,
-                                          &dspi->dma_rx_chnum);
-       if (!dspi->dma_rx) {
-               dev_err(sdev, "request RX DMA channel failed\n");
-               r = -ENODEV;
-               goto rx_dma_failed;
-       }
+       dspi->dma_rx = dma_request_chan(sdev, "rx");
+       if (IS_ERR(dspi->dma_rx))
+               return PTR_ERR(dspi->dma_rx);
 
-       dspi->dma_tx = dma_request_channel(mask, edma_filter_fn,
-                                          &dspi->dma_tx_chnum);
-       if (!dspi->dma_tx) {
-               dev_err(sdev, "request TX DMA channel failed\n");
-               r = -ENODEV;
-               goto tx_dma_failed;
+       dspi->dma_tx = dma_request_chan(sdev, "tx");
+       if (IS_ERR(dspi->dma_tx)) {
+               dma_release_channel(dspi->dma_rx);
+               return PTR_ERR(dspi->dma_tx);
        }
 
        return 0;
-
-tx_dma_failed:
-       dma_release_channel(dspi->dma_rx);
-rx_dma_failed:
-       return r;
 }
 
 #if defined(CONFIG_OF)
@@ -936,8 +915,6 @@ static int davinci_spi_probe(struct platform_device *pdev)
        struct davinci_spi *dspi;
        struct davinci_spi_platform_data *pdata;
        struct resource *r;
-       resource_size_t dma_rx_chan = SPI_NO_RESOURCE;
-       resource_size_t dma_tx_chan = SPI_NO_RESOURCE;
        int ret = 0;
        u32 spipc0;
 
@@ -1044,27 +1021,15 @@ static int davinci_spi_probe(struct platform_device *pdev)
                }
        }
 
-       r = platform_get_resource(pdev, IORESOURCE_DMA, 0);
-       if (r)
-               dma_rx_chan = r->start;
-       r = platform_get_resource(pdev, IORESOURCE_DMA, 1);
-       if (r)
-               dma_tx_chan = r->start;
-
        dspi->bitbang.txrx_bufs = davinci_spi_bufs;
-       if (dma_rx_chan != SPI_NO_RESOURCE &&
-           dma_tx_chan != SPI_NO_RESOURCE) {
-               dspi->dma_rx_chnum = dma_rx_chan;
-               dspi->dma_tx_chnum = dma_tx_chan;
-
-               ret = davinci_spi_request_dma(dspi);
-               if (ret)
-                       goto free_clk;
-
-               dev_info(&pdev->dev, "DMA: supported\n");
-               dev_info(&pdev->dev, "DMA: RX channel: %pa, TX channel: %pa, event queue: %d\n",
-                               &dma_rx_chan, &dma_tx_chan,
-                               pdata->dma_event_q);
+
+       ret = davinci_spi_request_dma(dspi);
+       if (ret == -EPROBE_DEFER) {
+               goto free_clk;
+       } else if (ret) {
+               dev_info(&pdev->dev, "DMA is not supported (%d)\n", ret);
+               dspi->dma_rx = NULL;
+               dspi->dma_tx = NULL;
        }
 
        dspi->get_rx = davinci_spi_rx_buf_u8;
@@ -1102,8 +1067,10 @@ static int davinci_spi_probe(struct platform_device *pdev)
        return ret;
 
 free_dma:
-       dma_release_channel(dspi->dma_rx);
-       dma_release_channel(dspi->dma_tx);
+       if (dspi->dma_rx) {
+               dma_release_channel(dspi->dma_rx);
+               dma_release_channel(dspi->dma_tx);
+       }
 free_clk:
        clk_disable_unprepare(dspi->clk);
 free_master:
@@ -1134,6 +1101,11 @@ static int davinci_spi_remove(struct platform_device *pdev)
        clk_disable_unprepare(dspi->clk);
        spi_master_put(master);
 
+       if (dspi->dma_rx) {
+               dma_release_channel(dspi->dma_rx);
+               dma_release_channel(dspi->dma_tx);
+       }
+
        return 0;
 }
 
index 3b7d91d94feace23ea2b0c9070a5d22a77aa9685..b62a99caacc0648ff24a116417f220ccb1a63b45 100644 (file)
@@ -683,6 +683,7 @@ static int dln2_spi_probe(struct platform_device *pdev)
        struct spi_master *master;
        struct dln2_spi *dln2;
        struct dln2_platform_data *pdata = dev_get_platdata(&pdev->dev);
+       struct device *dev = &pdev->dev;
        int ret;
 
        master = spi_alloc_master(&pdev->dev, sizeof(*dln2));
@@ -700,6 +701,7 @@ static int dln2_spi_probe(struct platform_device *pdev)
        }
 
        dln2->master = master;
+       dln2->master->dev.of_node = dev->of_node;
        dln2->pdev = pdev;
        dln2->port = pdata->port;
        /* cs/mode can never be 0xff, so the first transfer will set them */
index 332ccb0539a77710e3c4783cf2468acd25a8c04f..ef7db75c92c13b34af62dd6c0377d3aa16bfcabd 100644 (file)
@@ -67,7 +67,7 @@ static int spi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        dws->irq = pdev->irq;
 
        /*
-        * Specific handling for paltforms, like dma setup,
+        * Specific handling for platforms, like dma setup,
         * clock rate, FIFO depth.
         */
        if (desc) {
index c1a2d747b24686cb835541c68bb2f20f2304d009..9e9dadb52b3db0bea2f1d23eee12c3479e0dc925 100644 (file)
@@ -121,18 +121,22 @@ enum dspi_trans_mode {
 
 struct fsl_dspi_devtype_data {
        enum dspi_trans_mode trans_mode;
+       u8 max_clock_factor;
 };
 
 static const struct fsl_dspi_devtype_data vf610_data = {
        .trans_mode = DSPI_EOQ_MODE,
+       .max_clock_factor = 2,
 };
 
 static const struct fsl_dspi_devtype_data ls1021a_v1_data = {
        .trans_mode = DSPI_TCFQ_MODE,
+       .max_clock_factor = 8,
 };
 
 static const struct fsl_dspi_devtype_data ls2085a_data = {
        .trans_mode = DSPI_TCFQ_MODE,
+       .max_clock_factor = 8,
 };
 
 struct fsl_dspi {
@@ -726,6 +730,9 @@ static int dspi_probe(struct platform_device *pdev)
        }
        clk_prepare_enable(dspi->clk);
 
+       master->max_speed_hz =
+               clk_get_rate(dspi->clk) / dspi->devtype_data->max_clock_factor;
+
        init_waitqueue_head(&dspi->waitq);
        platform_set_drvdata(pdev, master);
 
index 7cb0c1921495959dcb6c919c3cb9f1811a5e90bb..8d85a3c343dab635811cf4b6a871a841977fa09f 100644 (file)
@@ -245,7 +245,12 @@ static int fsl_espi_bufs(struct spi_device *spi, struct spi_transfer *t)
        if (ret)
                return ret;
 
-       wait_for_completion(&mpc8xxx_spi->done);
+       /* Don't wait forever: the SPI bus sometimes loses interrupts... */
+       ret = wait_for_completion_timeout(&mpc8xxx_spi->done, 2 * HZ);
+       if (ret == 0)
+               dev_err(mpc8xxx_spi->dev,
+                       "Transaction hanging up (left %d bytes)\n",
+                       mpc8xxx_spi->count);
 
        /* disable rx ints */
        mpc8xxx_spi_write_reg(&reg_base->mask, 0);
@@ -539,16 +544,31 @@ void fsl_espi_cpu_irq(struct mpc8xxx_spi *mspi, u32 events)
        if (events & SPIE_NE) {
                u32 rx_data, tmp;
                u8 rx_data_8;
+               int rx_nr_bytes = 4;
+               int ret;
 
                /* Spin until RX is done */
-               while (SPIE_RXCNT(events) < min(4, mspi->len)) {
-                       cpu_relax();
-                       events = mpc8xxx_spi_read_reg(&reg_base->event);
+               if (SPIE_RXCNT(events) < min(4, mspi->len)) {
+                       ret = spin_event_timeout(
+                               !(SPIE_RXCNT(events =
+                               mpc8xxx_spi_read_reg(&reg_base->event)) <
+                                               min(4, mspi->len)),
+                                               10000, 0); /* 10 msec */
+                       if (!ret)
+                               dev_err(mspi->dev,
+                                        "tired waiting for SPIE_RXCNT\n");
                }
 
                if (mspi->len >= 4) {
                        rx_data = mpc8xxx_spi_read_reg(&reg_base->receive);
+               } else if (mspi->len <= 0) {
+                       dev_err(mspi->dev,
+                               "unexpected RX(SPIE_NE) interrupt occurred,\n"
+                               "(local rxlen %d bytes, reg rxlen %d bytes)\n",
+                               min(4, mspi->len), SPIE_RXCNT(events));
+                       rx_nr_bytes = 0;
                } else {
+                       rx_nr_bytes = mspi->len;
                        tmp = mspi->len;
                        rx_data = 0;
                        while (tmp--) {
@@ -559,7 +579,7 @@ void fsl_espi_cpu_irq(struct mpc8xxx_spi *mspi, u32 events)
                        rx_data <<= (4 - mspi->len) * 8;
                }
 
-               mspi->len -= 4;
+               mspi->len -= rx_nr_bytes;
 
                if (mspi->rx)
                        mspi->get_rx(rx_data, mspi);
index 07e4ce8273df56533aee2e8adf563c6026a956c0..3b170093989fcc35cf312a842fa095038d0c649d 100644 (file)
@@ -175,6 +175,7 @@ err:
 static int octeon_spi_probe(struct platform_device *pdev)
 {
        struct resource *res_mem;
+       void __iomem *reg_base;
        struct spi_master *master;
        struct octeon_spi *p;
        int err = -ENOENT;
@@ -186,19 +187,13 @@ static int octeon_spi_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, master);
 
        res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-       if (res_mem == NULL) {
-               dev_err(&pdev->dev, "found no memory resource\n");
-               err = -ENXIO;
-               goto fail;
-       }
-       if (!devm_request_mem_region(&pdev->dev, res_mem->start,
-                                    resource_size(res_mem), res_mem->name)) {
-               dev_err(&pdev->dev, "request_mem_region failed\n");
+       reg_base = devm_ioremap_resource(&pdev->dev, res_mem);
+       if (IS_ERR(reg_base)) {
+               err = PTR_ERR(reg_base);
                goto fail;
        }
-       p->register_base = (u64)devm_ioremap(&pdev->dev, res_mem->start,
-                                            resource_size(res_mem));
+
+       p->register_base = (u64)reg_base;
 
        master->num_chipselect = 4;
        master->mode_bits = SPI_CPHA |
index 0caa3c8bef46c46e0ed66bf89f518cc5c5236449..1d237e93a2895c3f05ecca846df0d1c1db99ba5a 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
-#include <linux/omap-dma.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
@@ -103,9 +102,6 @@ struct omap2_mcspi_dma {
        struct dma_chan *dma_tx;
        struct dma_chan *dma_rx;
 
-       int dma_tx_sync_dev;
-       int dma_rx_sync_dev;
-
        struct completion dma_tx_completion;
        struct completion dma_rx_completion;
 
@@ -964,8 +960,7 @@ static int omap2_mcspi_request_dma(struct spi_device *spi)
        struct spi_master       *master = spi->master;
        struct omap2_mcspi      *mcspi;
        struct omap2_mcspi_dma  *mcspi_dma;
-       dma_cap_mask_t mask;
-       unsigned sig;
+       int ret = 0;
 
        mcspi = spi_master_get_devdata(master);
        mcspi_dma = mcspi->dma_channels + spi->chip_select;
@@ -973,34 +968,25 @@ static int omap2_mcspi_request_dma(struct spi_device *spi)
        init_completion(&mcspi_dma->dma_rx_completion);
        init_completion(&mcspi_dma->dma_tx_completion);
 
-       dma_cap_zero(mask);
-       dma_cap_set(DMA_SLAVE, mask);
-       sig = mcspi_dma->dma_rx_sync_dev;
-
-       mcspi_dma->dma_rx =
-               dma_request_slave_channel_compat(mask, omap_dma_filter_fn,
-                                                &sig, &master->dev,
-                                                mcspi_dma->dma_rx_ch_name);
-       if (!mcspi_dma->dma_rx)
+       mcspi_dma->dma_rx = dma_request_chan(&master->dev,
+                                            mcspi_dma->dma_rx_ch_name);
+       if (IS_ERR(mcspi_dma->dma_rx)) {
+               ret = PTR_ERR(mcspi_dma->dma_rx);
+               mcspi_dma->dma_rx = NULL;
                goto no_dma;
+       }
 
-       sig = mcspi_dma->dma_tx_sync_dev;
-       mcspi_dma->dma_tx =
-               dma_request_slave_channel_compat(mask, omap_dma_filter_fn,
-                                                &sig, &master->dev,
-                                                mcspi_dma->dma_tx_ch_name);
-
-       if (!mcspi_dma->dma_tx) {
+       mcspi_dma->dma_tx = dma_request_chan(&master->dev,
+                                            mcspi_dma->dma_tx_ch_name);
+       if (IS_ERR(mcspi_dma->dma_tx)) {
+               ret = PTR_ERR(mcspi_dma->dma_tx);
+               mcspi_dma->dma_tx = NULL;
                dma_release_channel(mcspi_dma->dma_rx);
                mcspi_dma->dma_rx = NULL;
-               goto no_dma;
        }
 
-       return 0;
-
 no_dma:
-       dev_warn(&spi->dev, "not using DMA for McSPI\n");
-       return -EAGAIN;
+       return ret;
 }
 
 static int omap2_mcspi_setup(struct spi_device *spi)
@@ -1039,8 +1025,9 @@ static int omap2_mcspi_setup(struct spi_device *spi)
 
        if (!mcspi_dma->dma_rx || !mcspi_dma->dma_tx) {
                ret = omap2_mcspi_request_dma(spi);
-               if (ret < 0 && ret != -EAGAIN)
-                       return ret;
+               if (ret)
+                       dev_warn(&spi->dev, "not using DMA for McSPI (%d)\n",
+                                ret);
        }
 
        ret = pm_runtime_get_sync(mcspi->dev);
@@ -1434,42 +1421,8 @@ static int omap2_mcspi_probe(struct platform_device *pdev)
        }
 
        for (i = 0; i < master->num_chipselect; i++) {
-               char *dma_rx_ch_name = mcspi->dma_channels[i].dma_rx_ch_name;
-               char *dma_tx_ch_name = mcspi->dma_channels[i].dma_tx_ch_name;
-               struct resource *dma_res;
-
-               sprintf(dma_rx_ch_name, "rx%d", i);
-               if (!pdev->dev.of_node) {
-                       dma_res =
-                               platform_get_resource_byname(pdev,
-                                                            IORESOURCE_DMA,
-                                                            dma_rx_ch_name);
-                       if (!dma_res) {
-                               dev_dbg(&pdev->dev,
-                                       "cannot get DMA RX channel\n");
-                               status = -ENODEV;
-                               break;
-                       }
-
-                       mcspi->dma_channels[i].dma_rx_sync_dev =
-                               dma_res->start;
-               }
-               sprintf(dma_tx_ch_name, "tx%d", i);
-               if (!pdev->dev.of_node) {
-                       dma_res =
-                               platform_get_resource_byname(pdev,
-                                                            IORESOURCE_DMA,
-                                                            dma_tx_ch_name);
-                       if (!dma_res) {
-                               dev_dbg(&pdev->dev,
-                                       "cannot get DMA TX channel\n");
-                               status = -ENODEV;
-                               break;
-                       }
-
-                       mcspi->dma_channels[i].dma_tx_sync_dev =
-                               dma_res->start;
-               }
+               sprintf(mcspi->dma_channels[i].dma_rx_ch_name, "rx%d", i);
+               sprintf(mcspi->dma_channels[i].dma_tx_ch_name, "tx%d", i);
        }
 
        if (status < 0)
diff --git a/drivers/spi/spi-pic32-sqi.c b/drivers/spi/spi-pic32-sqi.c
new file mode 100644 (file)
index 0000000..ca3c8d9
--- /dev/null
@@ -0,0 +1,727 @@
+/*
+ * PIC32 Quad SPI controller driver.
+ *
+ * Purna Chandra Mandal <purna.mandal@microchip.com>
+ * Copyright (c) 2016, Microchip Technology Inc.
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+
+/* SQI registers */
+#define PESQI_XIP_CONF1_REG    0x00
+#define PESQI_XIP_CONF2_REG    0x04
+#define PESQI_CONF_REG         0x08
+#define PESQI_CTRL_REG         0x0C
+#define PESQI_CLK_CTRL_REG     0x10
+#define PESQI_CMD_THRES_REG    0x14
+#define PESQI_INT_THRES_REG    0x18
+#define PESQI_INT_ENABLE_REG   0x1C
+#define PESQI_INT_STAT_REG     0x20
+#define PESQI_TX_DATA_REG      0x24
+#define PESQI_RX_DATA_REG      0x28
+#define PESQI_STAT1_REG                0x2C
+#define PESQI_STAT2_REG                0x30
+#define PESQI_BD_CTRL_REG      0x34
+#define PESQI_BD_CUR_ADDR_REG  0x38
+#define PESQI_BD_BASE_ADDR_REG 0x40
+#define PESQI_BD_STAT_REG      0x44
+#define PESQI_BD_POLL_CTRL_REG 0x48
+#define PESQI_BD_TX_DMA_STAT_REG       0x4C
+#define PESQI_BD_RX_DMA_STAT_REG       0x50
+#define PESQI_THRES_REG                0x54
+#define PESQI_INT_SIGEN_REG    0x58
+
+/* PESQI_CONF_REG fields */
+#define PESQI_MODE             0x7
+#define  PESQI_MODE_BOOT       0
+#define  PESQI_MODE_PIO                1
+#define  PESQI_MODE_DMA                2
+#define  PESQI_MODE_XIP                3
+#define PESQI_MODE_SHIFT       0
+#define PESQI_CPHA             BIT(3)
+#define PESQI_CPOL             BIT(4)
+#define PESQI_LSBF             BIT(5)
+#define PESQI_RXLATCH          BIT(7)
+#define PESQI_SERMODE          BIT(8)
+#define PESQI_WP_EN            BIT(9)
+#define PESQI_HOLD_EN          BIT(10)
+#define PESQI_BURST_EN         BIT(12)
+#define PESQI_CS_CTRL_HW       BIT(15)
+#define PESQI_SOFT_RESET       BIT(16)
+#define PESQI_LANES_SHIFT      20
+#define  PESQI_SINGLE_LANE     0
+#define  PESQI_DUAL_LANE       1
+#define  PESQI_QUAD_LANE       2
+#define PESQI_CSEN_SHIFT       24
+#define PESQI_EN               BIT(23)
+
+/* PESQI_CLK_CTRL_REG fields */
+#define PESQI_CLK_EN           BIT(0)
+#define PESQI_CLK_STABLE       BIT(1)
+#define PESQI_CLKDIV_SHIFT     8
+#define PESQI_CLKDIV           0xff
+
+/* PESQI_INT_THR/CMD_THR_REG */
+#define PESQI_TXTHR_MASK       0x1f
+#define PESQI_TXTHR_SHIFT      8
+#define PESQI_RXTHR_MASK       0x1f
+#define PESQI_RXTHR_SHIFT      0
+
+/* PESQI_INT_EN/INT_STAT/INT_SIG_EN_REG */
+#define PESQI_TXEMPTY          BIT(0)
+#define PESQI_TXFULL           BIT(1)
+#define PESQI_TXTHR            BIT(2)
+#define PESQI_RXEMPTY          BIT(3)
+#define PESQI_RXFULL           BIT(4)
+#define PESQI_RXTHR            BIT(5)
+#define PESQI_BDDONE           BIT(9)  /* BD processing complete */
+#define PESQI_PKTCOMP          BIT(10) /* packet processing complete */
+#define PESQI_DMAERR           BIT(11) /* error */
+
+/* PESQI_BD_CTRL_REG */
+#define PESQI_DMA_EN           BIT(0) /* enable DMA engine */
+#define PESQI_POLL_EN          BIT(1) /* enable polling */
+#define PESQI_BDP_START                BIT(2) /* start BD processor */
+
+/* PESQI controller buffer descriptor */
+struct buf_desc {
+       u32 bd_ctrl;    /* control */
+       u32 bd_status;  /* reserved */
+       u32 bd_addr;    /* DMA buffer addr */
+       u32 bd_nextp;   /* next item in chain */
+};
+
+/* bd_ctrl */
+#define BD_BUFLEN              0x1ff
+#define BD_CBD_INT_EN          BIT(16) /* Current BD is processed */
+#define BD_PKT_INT_EN          BIT(17) /* All BDs of PKT processed */
+#define BD_LIFM                        BIT(18) /* last data of pkt */
+#define BD_LAST                        BIT(19) /* end of list */
+#define BD_DATA_RECV           BIT(20) /* receive data */
+#define BD_DDR                 BIT(21) /* DDR mode */
+#define BD_DUAL                        BIT(22) /* Dual SPI */
+#define BD_QUAD                        BIT(23) /* Quad SPI */
+#define BD_LSBF                        BIT(25) /* LSB First */
+#define BD_STAT_CHECK          BIT(27) /* Status poll */
+#define BD_DEVSEL_SHIFT                28      /* CS */
+#define BD_CS_DEASSERT         BIT(30) /* de-assert CS after current BD */
+#define BD_EN                  BIT(31) /* BD owned by H/W */
+
+/**
+ * struct ring_desc - Representation of SQI ring descriptor
+ * @list:      list element to add to free or used list.
+ * @bd:                PESQI controller buffer descriptor
+ * @bd_dma:    DMA address of PESQI controller buffer descriptor
+ * @xfer_len:  transfer length
+ */
+struct ring_desc {
+       struct list_head list;
+       struct buf_desc *bd;
+       dma_addr_t bd_dma;
+       u32 xfer_len;
+};
+
+/* Global constants */
+#define PESQI_BD_BUF_LEN_MAX   256
+#define PESQI_BD_COUNT         256 /* max 64KB data per spi message */
+
+struct pic32_sqi {
+       void __iomem            *regs;
+       struct clk              *sys_clk;
+       struct clk              *base_clk; /* drives spi clock */
+       struct spi_master       *master;
+       int                     irq;
+       struct completion       xfer_done;
+       struct ring_desc        *ring;
+       void                    *bd;
+       dma_addr_t              bd_dma;
+       struct list_head        bd_list_free; /* free */
+       struct list_head        bd_list_used; /* allocated */
+       struct spi_device       *cur_spi;
+       u32                     cur_speed;
+       u8                      cur_mode;
+};
+
+static inline void pic32_setbits(void __iomem *reg, u32 set)
+{
+       writel(readl(reg) | set, reg);
+}
+
+static inline void pic32_clrbits(void __iomem *reg, u32 clr)
+{
+       writel(readl(reg) & ~clr, reg);
+}
+
+static int pic32_sqi_set_clk_rate(struct pic32_sqi *sqi, u32 sck)
+{
+       u32 val, div;
+
+       /* div = base_clk / (2 * spi_clk) */
+       div = clk_get_rate(sqi->base_clk) / (2 * sck);
+       div &= PESQI_CLKDIV;
+
+       val = readl(sqi->regs + PESQI_CLK_CTRL_REG);
+       /* apply new divider */
+       val &= ~(PESQI_CLK_STABLE | (PESQI_CLKDIV << PESQI_CLKDIV_SHIFT));
+       val |= div << PESQI_CLKDIV_SHIFT;
+       writel(val, sqi->regs + PESQI_CLK_CTRL_REG);
+
+       /* wait for stability */
+       return readl_poll_timeout(sqi->regs + PESQI_CLK_CTRL_REG, val,
+                                 val & PESQI_CLK_STABLE, 1, 5000);
+}
+
+static inline void pic32_sqi_enable_int(struct pic32_sqi *sqi)
+{
+       u32 mask = PESQI_DMAERR | PESQI_BDDONE | PESQI_PKTCOMP;
+
+       writel(mask, sqi->regs + PESQI_INT_ENABLE_REG);
+       /* INT_SIGEN works as interrupt-gate to INTR line */
+       writel(mask, sqi->regs + PESQI_INT_SIGEN_REG);
+}
+
+static inline void pic32_sqi_disable_int(struct pic32_sqi *sqi)
+{
+       writel(0, sqi->regs + PESQI_INT_ENABLE_REG);
+       writel(0, sqi->regs + PESQI_INT_SIGEN_REG);
+}
+
+/*
+ * SQI interrupt handler.
+ *
+ * Status bits are sticky, so every source that has been seen is masked in
+ * INT_ENABLE. A DMA error or a completed packet masks everything; only
+ * packet completion wakes up the waiter on xfer_done.
+ */
+static irqreturn_t pic32_sqi_isr(int irq, void *dev_id)
+{
+       struct pic32_sqi *sqi = dev_id;
+       u32 unmasked, pending;
+
+       unmasked = readl(sqi->regs + PESQI_INT_ENABLE_REG);
+       pending = readl(sqi->regs + PESQI_INT_STAT_REG);
+
+       /* nothing pending: spurious interrupt */
+       if (!pending)
+               return IRQ_NONE;
+
+       if (pending & PESQI_DMAERR) {
+               /* DMA error: mask all sources, no completion is signalled */
+               unmasked = 0;
+       } else {
+               if (pending & PESQI_TXTHR)
+                       unmasked &= ~(PESQI_TXTHR | PESQI_TXFULL |
+                                     PESQI_TXEMPTY);
+
+               if (pending & PESQI_RXTHR)
+                       unmasked &= ~(PESQI_RXTHR | PESQI_RXFULL |
+                                     PESQI_RXEMPTY);
+
+               if (pending & PESQI_BDDONE)
+                       unmasked &= ~PESQI_BDDONE;
+
+               if (pending & PESQI_PKTCOMP) {
+                       /* packet done: mask all and complete transaction */
+                       unmasked = 0;
+                       complete(&sqi->xfer_done);
+               }
+       }
+
+       /* write back the reduced enable mask */
+       writel(unmasked, sqi->regs + PESQI_INT_ENABLE_REG);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Take the first descriptor off the free list and append it to the used
+ * list. Returns NULL when no free descriptor is left.
+ */
+static struct ring_desc *ring_desc_get(struct pic32_sqi *sqi)
+{
+       struct ring_desc *desc;
+
+       desc = list_first_entry_or_null(&sqi->bd_list_free,
+                                       struct ring_desc, list);
+       if (desc)
+               list_move_tail(&desc->list, &sqi->bd_list_used);
+
+       return desc;
+}
+
+/* Return @rdesc to the head of the free list. */
+static void ring_desc_put(struct pic32_sqi *sqi, struct ring_desc *rdesc)
+{
+       list_move(&rdesc->list, &sqi->bd_list_free);
+}
+
+/*
+ * Queue one spi_transfer as a run of hardware buffer descriptors.
+ *
+ * One descriptor is taken from the free ring per scatterlist entry of the
+ * (half-duplex) transfer and filled with control word, length and DMA
+ * address. The descriptors end up on sqi->bd_list_used.
+ *
+ * Returns 0 on success, or -ENOMEM if the free ring is exhausted before
+ * all scatterlist entries are queued. Descriptors taken so far stay on
+ * bd_list_used; the caller is expected to release them.
+ */
+static int pic32_sqi_one_transfer(struct pic32_sqi *sqi,
+                                 struct spi_message *mesg,
+                                 struct spi_transfer *xfer)
+{
+       struct spi_device *spi = mesg->spi;
+       struct scatterlist *sg, *sgl;
+       struct ring_desc *rdesc;
+       struct buf_desc *bd;
+       int nents, i;
+       u32 bd_ctrl;
+       u32 nbits;
+
+       /* Device selection */
+       bd_ctrl = spi->chip_select << BD_DEVSEL_SHIFT;
+
+       /* half-duplex: select transfer buffer, direction and lane */
+       if (xfer->rx_buf) {
+               bd_ctrl |= BD_DATA_RECV;
+               nbits = xfer->rx_nbits;
+               sgl = xfer->rx_sg.sgl;
+               nents = xfer->rx_sg.nents;
+       } else {
+               nbits = xfer->tx_nbits;
+               sgl = xfer->tx_sg.sgl;
+               nents = xfer->tx_sg.nents;
+       }
+
+       if (nbits & SPI_NBITS_QUAD)
+               bd_ctrl |= BD_QUAD;
+       else if (nbits & SPI_NBITS_DUAL)
+               bd_ctrl |= BD_DUAL;
+
+       /* LSB first */
+       if (spi->mode & SPI_LSB_FIRST)
+               bd_ctrl |= BD_LSBF;
+
+       /* ownership to hardware */
+       bd_ctrl |= BD_EN;
+
+       for_each_sg(sgl, sg, nents, i) {
+               /* get ring descriptor */
+               rdesc = ring_desc_get(sqi);
+               if (!rdesc)
+                       /*
+                        * Out of descriptors: fail instead of silently
+                        * queueing only part of the transfer.
+                        */
+                       return -ENOMEM;
+
+               bd = rdesc->bd;
+
+               /* BD CTRL: length */
+               rdesc->xfer_len = sg_dma_len(sg);
+               bd->bd_ctrl = bd_ctrl;
+               bd->bd_ctrl |= rdesc->xfer_len;
+
+               /* BD STAT */
+               bd->bd_status = 0;
+
+               /* BD BUFFER ADDRESS */
+               bd->bd_addr = sg_dma_address(sg);
+       }
+
+       return 0;
+}
+
+/* Switch on the SQI interface and then its clock. Always returns 0. */
+static int pic32_sqi_prepare_hardware(struct spi_master *master)
+{
+       struct pic32_sqi *sqi = spi_master_get_devdata(master);
+       void __iomem *conf = sqi->regs + PESQI_CONF_REG;
+       void __iomem *clk_ctrl = sqi->regs + PESQI_CLK_CTRL_REG;
+
+       pic32_setbits(conf, PESQI_EN);          /* interface on */
+       pic32_setbits(clk_ctrl, PESQI_CLK_EN);  /* clock on */
+
+       return 0;
+}
+
+/* can_dma callback: every transfer uses DMA, whatever its size. */
+static bool pic32_sqi_can_dma(struct spi_master *master,
+                             struct spi_device *spi,
+                             struct spi_transfer *x)
+{
+       return true;
+}
+
+/*
+ * transfer_one_message callback: run a whole spi_message through the
+ * buffer-descriptor DMA engine.
+ *
+ * Speed and mode are (re)programmed only when the target spi_device
+ * changes. All transfers are turned into chained buffer descriptors, the
+ * DMA engine is started and the function sleeps until the ISR signals
+ * packet completion or 5 seconds elapse.
+ *
+ * msg->status always mirrors the return value: 0 on success, -EIO on
+ * timeout, or the error returned while preparing descriptors.
+ * msg->actual_length is only credited when the message succeeded.
+ */
+static int pic32_sqi_one_message(struct spi_master *master,
+                                struct spi_message *msg)
+{
+       struct spi_device *spi = msg->spi;
+       struct ring_desc *rdesc, *next;
+       struct spi_transfer *xfer;
+       struct pic32_sqi *sqi;
+       unsigned long timeout;
+       int ret = 0, mode;
+       u32 val;
+
+       sqi = spi_master_get_devdata(master);
+
+       reinit_completion(&sqi->xfer_done);
+       msg->actual_length = 0;
+
+       /* We can't handle spi_transfer specific "speed_hz", "bits_per_word"
+        * and "delay_usecs". But spi_device specific speed and mode change
+        * can be handled at best during spi chip-select switch.
+        */
+       if (sqi->cur_spi != spi) {
+               /* set spi speed */
+               if (sqi->cur_speed != spi->max_speed_hz) {
+                       sqi->cur_speed = spi->max_speed_hz;
+                       ret = pic32_sqi_set_clk_rate(sqi, spi->max_speed_hz);
+                       if (ret)
+                               dev_warn(&spi->dev, "set_clk, %d\n", ret);
+               }
+
+               /* set spi mode */
+               mode = spi->mode & (SPI_MODE_3 | SPI_LSB_FIRST);
+               if (sqi->cur_mode != mode) {
+                       val = readl(sqi->regs + PESQI_CONF_REG);
+                       val &= ~(PESQI_CPOL | PESQI_CPHA | PESQI_LSBF);
+                       if (mode & SPI_CPOL)
+                               val |= PESQI_CPOL;
+                       if (mode & SPI_LSB_FIRST)
+                               val |= PESQI_LSBF;
+                       /* NOTE(review): CPHA is set regardless of mode */
+                       val |= PESQI_CPHA;
+                       writel(val, sqi->regs + PESQI_CONF_REG);
+
+                       sqi->cur_mode = mode;
+               }
+               sqi->cur_spi = spi;
+       }
+
+       /* prepare hardware desc-list(BD) for transfer(s) */
+       list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+               ret = pic32_sqi_one_transfer(sqi, msg, xfer);
+               if (ret) {
+                       dev_err(&spi->dev, "xfer %p err\n", xfer);
+                       goto xfer_out;
+               }
+       }
+
+       /* a clock set-up warning above is not a message failure */
+       ret = 0;
+
+       /* nothing was queued: do not touch list heads as if they were BDs */
+       if (list_empty(&sqi->bd_list_used))
+               goto xfer_out;
+
+       /* BDs are prepared and chained. Now mark LAST_BD, CS_DEASSERT at last
+        * element of the list.
+        */
+       rdesc = list_last_entry(&sqi->bd_list_used, struct ring_desc, list);
+       rdesc->bd->bd_ctrl |= BD_LAST | BD_CS_DEASSERT |
+                             BD_LIFM | BD_PKT_INT_EN;
+
+       /* set base address BD list for DMA engine */
+       rdesc = list_first_entry(&sqi->bd_list_used, struct ring_desc, list);
+       writel(rdesc->bd_dma, sqi->regs + PESQI_BD_BASE_ADDR_REG);
+
+       /* enable interrupt */
+       pic32_sqi_enable_int(sqi);
+
+       /* enable DMA engine */
+       val = PESQI_DMA_EN | PESQI_POLL_EN | PESQI_BDP_START;
+       writel(val, sqi->regs + PESQI_BD_CTRL_REG);
+
+       /* wait for xfer completion; 0 means the timeout expired */
+       timeout = wait_for_completion_timeout(&sqi->xfer_done, 5 * HZ);
+       if (!timeout) {
+               dev_err(&sqi->master->dev, "wait timedout/interrupted\n");
+               ret = -EIO;
+       }
+
+       /* disable DMA */
+       writel(0, sqi->regs + PESQI_BD_CTRL_REG);
+
+       pic32_sqi_disable_int(sqi);
+
+xfer_out:
+       list_for_each_entry_safe_reverse(rdesc, next,
+                                        &sqi->bd_list_used, list) {
+               /* Update total byte transferred, on success only */
+               if (!ret)
+                       msg->actual_length += rdesc->xfer_len;
+               /* release ring descr */
+               ring_desc_put(sqi, rdesc);
+       }
+       msg->status = ret;
+       spi_finalize_current_message(spi->master);
+
+       return ret;
+}
+
+/* Switch off the SQI clock and then the interface. Always returns 0. */
+static int pic32_sqi_unprepare_hardware(struct spi_master *master)
+{
+       struct pic32_sqi *sqi = spi_master_get_devdata(master);
+       void __iomem *clk_ctrl = sqi->regs + PESQI_CLK_CTRL_REG;
+       void __iomem *conf = sqi->regs + PESQI_CONF_REG;
+
+       pic32_clrbits(clk_ctrl, PESQI_CLK_EN);  /* clock off */
+       pic32_clrbits(conf, PESQI_EN);          /* interface off */
+
+       return 0;
+}
+
+/*
+ * Allocate the hardware buffer descriptors (coherent DMA memory) and the
+ * software ring descriptors that shadow them.
+ *
+ * All PESQI_BD_COUNT ring descriptors start on bd_list_free and the
+ * hardware descriptors are chained through bd_nextp, the last one
+ * terminating the chain with 0.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int ring_desc_ring_alloc(struct pic32_sqi *sqi)
+{
+       struct ring_desc *rdesc;
+       struct buf_desc *bd;
+       int i;
+
+       /* allocate coherent DMAable memory for hardware buffer descriptors. */
+       sqi->bd = dma_zalloc_coherent(&sqi->master->dev,
+                                     sizeof(*bd) * PESQI_BD_COUNT,
+                                     &sqi->bd_dma, GFP_DMA32);
+       if (!sqi->bd) {
+               dev_err(&sqi->master->dev, "failed allocating dma buffer\n");
+               return -ENOMEM;
+       }
+
+       /* allocate software ring descriptors */
+       sqi->ring = kcalloc(PESQI_BD_COUNT, sizeof(*rdesc), GFP_KERNEL);
+       if (!sqi->ring) {
+               dma_free_coherent(&sqi->master->dev,
+                                 sizeof(*bd) * PESQI_BD_COUNT,
+                                 sqi->bd, sqi->bd_dma);
+               return -ENOMEM;
+       }
+
+       bd = (struct buf_desc *)sqi->bd;
+
+       INIT_LIST_HEAD(&sqi->bd_list_free);
+       INIT_LIST_HEAD(&sqi->bd_list_used);
+
+       /* initialize ring-desc */
+       for (i = 0, rdesc = sqi->ring; i < PESQI_BD_COUNT; i++, rdesc++) {
+               INIT_LIST_HEAD(&rdesc->list);
+               rdesc->bd = &bd[i];
+               /*
+                * Bus address of bd[i]: plain integer offset from the base,
+                * no mixing of dma_addr_t with CPU pointers.
+                */
+               rdesc->bd_dma = sqi->bd_dma + i * sizeof(*bd);
+               list_add_tail(&rdesc->list, &sqi->bd_list_free);
+       }
+
+       /* Prepare BD: chain to next BD(s) */
+       for (i = 0, rdesc = sqi->ring; i < PESQI_BD_COUNT - 1; i++)
+               bd[i].bd_nextp = rdesc[i + 1].bd_dma;
+       bd[PESQI_BD_COUNT - 1].bd_nextp = 0;
+
+       return 0;
+}
+
+/* Release the software ring and the coherent hardware descriptor area. */
+static void ring_desc_ring_free(struct pic32_sqi *sqi)
+{
+       const size_t bd_bytes = sizeof(struct buf_desc) * PESQI_BD_COUNT;
+
+       dma_free_coherent(&sqi->master->dev, bd_bytes, sqi->bd, sqi->bd_dma);
+       kfree(sqi->ring);
+}
+
+/*
+ * Bring the SQI controller into its default state: soft-reset it, mask
+ * its interrupts, program the FIFO thresholds, select DMA mode with quad
+ * lanes, burst and both chip-selects enabled, and reset the cached
+ * speed/mode so the first message reprograms them.
+ */
+static void pic32_sqi_hw_init(struct pic32_sqi *sqi)
+{
+       unsigned long flags;
+       u32 val;
+
+       /* Soft-reset of PESQI controller triggers interrupt.
+        * We are not yet ready to handle them so disable CPU
+        * interrupt for the time being.
+        */
+       local_irq_save(flags);
+
+       /* assert soft-reset */
+       writel(PESQI_SOFT_RESET, sqi->regs + PESQI_CONF_REG);
+
+       /* wait until clear; the poll result (timeout) is not checked */
+       readl_poll_timeout_atomic(sqi->regs + PESQI_CONF_REG, val,
+                                 !(val & PESQI_SOFT_RESET), 1, 5000);
+
+       /* disable all interrupts */
+       pic32_sqi_disable_int(sqi);
+
+       /* Now it is safe to enable back CPU interrupt */
+       local_irq_restore(flags);
+
+       /* tx and rx fifo command threshold: both set to 1 */
+       val = readl(sqi->regs + PESQI_CMD_THRES_REG);
+       val &= ~(PESQI_TXTHR_MASK << PESQI_TXTHR_SHIFT);
+       val &= ~(PESQI_RXTHR_MASK << PESQI_RXTHR_SHIFT);
+       val |= (1U << PESQI_TXTHR_SHIFT) | (1U << PESQI_RXTHR_SHIFT);
+       writel(val, sqi->regs + PESQI_CMD_THRES_REG);
+
+       /* tx and rx fifo interrupt threshold: both set to 1 */
+       val = readl(sqi->regs + PESQI_INT_THRES_REG);
+       val &= ~(PESQI_TXTHR_MASK << PESQI_TXTHR_SHIFT);
+       val &= ~(PESQI_RXTHR_MASK << PESQI_RXTHR_SHIFT);
+       val |= (1U << PESQI_TXTHR_SHIFT) | (1U << PESQI_RXTHR_SHIFT);
+       writel(val, sqi->regs + PESQI_INT_THRES_REG);
+
+       /* default configuration */
+       val = readl(sqi->regs + PESQI_CONF_REG);
+
+       /* set mode: DMA (written on its own first, see second write below) */
+       val &= ~PESQI_MODE;
+       val |= PESQI_MODE_DMA << PESQI_MODE_SHIFT;
+       writel(val, sqi->regs + PESQI_CONF_REG);
+
+       /* DATAEN - SQIID0-ID3 */
+       val |= PESQI_QUAD_LANE << PESQI_LANES_SHIFT;
+
+       /* burst/INCR4 enable */
+       val |= PESQI_BURST_EN;
+
+       /* CSEN - all CS */
+       val |= 3U << PESQI_CSEN_SHIFT;
+       /* second CONF write: mode plus lanes, burst and chip-select enables */
+       writel(val, sqi->regs + PESQI_CONF_REG);
+
+       /* write poll count */
+       writel(0, sqi->regs + PESQI_BD_POLL_CTRL_REG);
+
+       /* invalidate cached settings so the first message programs them */
+       sqi->cur_speed = 0;
+       sqi->cur_mode = -1;
+}
+
+/*
+ * Platform probe: allocate the SPI master, map registers, look up the
+ * IRQ and the "reg_ck"/"spi_ck" clocks, enable the clocks, initialise
+ * the controller, allocate the descriptor ring, install the interrupt
+ * handler and register the master.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released through the err_* labels.
+ *
+ * NOTE(review): the master is registered with devm_spi_register_master()
+ * while the IRQ, ring and clocks are released by hand in
+ * pic32_sqi_remove(); confirm that this ordering is safe on unbind.
+ */
+static int pic32_sqi_probe(struct platform_device *pdev)
+{
+       struct spi_master *master;
+       struct pic32_sqi *sqi;
+       struct resource *reg;
+       int ret;
+
+       /* driver state lives in the master's private data area */
+       master = spi_alloc_master(&pdev->dev, sizeof(*sqi));
+       if (!master)
+               return -ENOMEM;
+
+       sqi = spi_master_get_devdata(master);
+       sqi->master = master;
+
+       reg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       sqi->regs = devm_ioremap_resource(&pdev->dev, reg);
+       if (IS_ERR(sqi->regs)) {
+               ret = PTR_ERR(sqi->regs);
+               goto err_free_master;
+       }
+
+       /* irq */
+       sqi->irq = platform_get_irq(pdev, 0);
+       if (sqi->irq < 0) {
+               dev_err(&pdev->dev, "no irq found\n");
+               ret = sqi->irq;
+               goto err_free_master;
+       }
+
+       /* clocks */
+       sqi->sys_clk = devm_clk_get(&pdev->dev, "reg_ck");
+       if (IS_ERR(sqi->sys_clk)) {
+               ret = PTR_ERR(sqi->sys_clk);
+               dev_err(&pdev->dev, "no sys_clk ?\n");
+               goto err_free_master;
+       }
+
+       sqi->base_clk = devm_clk_get(&pdev->dev, "spi_ck");
+       if (IS_ERR(sqi->base_clk)) {
+               ret = PTR_ERR(sqi->base_clk);
+               dev_err(&pdev->dev, "no base clk ?\n");
+               goto err_free_master;
+       }
+
+       ret = clk_prepare_enable(sqi->sys_clk);
+       if (ret) {
+               dev_err(&pdev->dev, "sys clk enable failed\n");
+               goto err_free_master;
+       }
+
+       ret = clk_prepare_enable(sqi->base_clk);
+       if (ret) {
+               dev_err(&pdev->dev, "base clk enable failed\n");
+               /* only sys_clk is enabled here, so undo it by hand */
+               clk_disable_unprepare(sqi->sys_clk);
+               goto err_free_master;
+       }
+
+       init_completion(&sqi->xfer_done);
+
+       /* initialize hardware (done before the irq handler is installed) */
+       pic32_sqi_hw_init(sqi);
+
+       /* allocate buffers & descriptors */
+       ret = ring_desc_ring_alloc(sqi);
+       if (ret) {
+               dev_err(&pdev->dev, "ring alloc failed\n");
+               goto err_disable_clk;
+       }
+
+       /* install irq handlers */
+       ret = request_irq(sqi->irq, pic32_sqi_isr, 0,
+                         dev_name(&pdev->dev), sqi);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request_irq(%d), failed\n", sqi->irq);
+               goto err_free_ring;
+       }
+
+       /* register master */
+       master->num_chipselect  = 2;
+       master->max_speed_hz    = clk_get_rate(sqi->base_clk);
+       master->dma_alignment   = 32;
+       master->max_dma_len     = PESQI_BD_BUF_LEN_MAX;
+       master->dev.of_node     = of_node_get(pdev->dev.of_node);
+       master->mode_bits       = SPI_MODE_3 | SPI_MODE_0 | SPI_TX_DUAL |
+                                 SPI_RX_DUAL | SPI_TX_QUAD | SPI_RX_QUAD;
+       master->flags           = SPI_MASTER_HALF_DUPLEX;
+       master->can_dma         = pic32_sqi_can_dma;
+       master->bits_per_word_mask      = SPI_BPW_RANGE_MASK(8, 32);
+       master->transfer_one_message    = pic32_sqi_one_message;
+       master->prepare_transfer_hardware       = pic32_sqi_prepare_hardware;
+       master->unprepare_transfer_hardware     = pic32_sqi_unprepare_hardware;
+
+       ret = devm_spi_register_master(&pdev->dev, master);
+       if (ret) {
+               dev_err(&master->dev, "failed registering spi master\n");
+               /* the irq has no err_* label of its own */
+               free_irq(sqi->irq, sqi);
+               goto err_free_ring;
+       }
+
+       platform_set_drvdata(pdev, sqi);
+
+       return 0;
+
+       /* unwind in reverse order of acquisition */
+err_free_ring:
+       ring_desc_ring_free(sqi);
+
+err_disable_clk:
+       clk_disable_unprepare(sqi->base_clk);
+       clk_disable_unprepare(sqi->sys_clk);
+
+err_free_master:
+       spi_master_put(master);
+       return ret;
+}
+
+/*
+ * Platform remove: free the interrupt and the descriptor ring, then
+ * disable both clocks. Always returns 0.
+ *
+ * NOTE(review): the SPI master was registered through a devm_ helper in
+ * probe, so it is presumably unregistered only after this function
+ * returns; confirm no message can still be in flight when the IRQ and
+ * ring are released here.
+ */
+static int pic32_sqi_remove(struct platform_device *pdev)
+{
+       struct pic32_sqi *sqi = platform_get_drvdata(pdev);
+
+       /* release resources */
+       free_irq(sqi->irq, sqi);
+       ring_desc_ring_free(sqi);
+
+       /* disable clk */
+       clk_disable_unprepare(sqi->base_clk);
+       clk_disable_unprepare(sqi->sys_clk);
+
+       return 0;
+}
+
+/* Device-tree compatible strings handled by this driver. */
+static const struct of_device_id pic32_sqi_of_ids[] = {
+       {.compatible = "microchip,pic32mzda-sqi",},
+       {}, /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, pic32_sqi_of_ids);
+
+/* Platform driver glue: binds probe/remove to the OF match table above. */
+static struct platform_driver pic32_sqi_driver = {
+       .driver = {
+               .name = "sqi-pic32",
+               .of_match_table = of_match_ptr(pic32_sqi_of_ids),
+       },
+       .probe = pic32_sqi_probe,
+       .remove = pic32_sqi_remove,
+};
+
+module_platform_driver(pic32_sqi_driver);
+
+MODULE_AUTHOR("Purna Chandra Mandal <purna.mandal@microchip.com>");
+MODULE_DESCRIPTION("Microchip SPI driver for PIC32 SQI controller.");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c
new file mode 100644 (file)
index 0000000..73db87f
--- /dev/null
@@ -0,0 +1,878 @@
+/*
+ * Microchip PIC32 SPI controller driver.
+ *
+ * Purna Chandra Mandal <purna.mandal@microchip.com>
+ * Copyright (c) 2016, Microchip Technology Inc.
+ *
+ * This program is free software; you can distribute it and/or modify it
+ * under the terms of the GNU General Public License (Version 2) as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/clkdev.h>
+#include <linux/delay.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+
+/*
+ * SPI controller registers.
+ *
+ * Memory-mapped register layout, accessed through the __iomem pointer in
+ * struct pic32_spi. The driver writes bit masks to the *_set and *_clr
+ * companions to set/clear bits of the base register (see
+ * pic32_spi_enable()/pic32_spi_disable()); the *_inv companions are
+ * presumably the matching "invert" registers - confirm against the
+ * datasheet.
+ */
+struct pic32_spi_regs {
+       u32 ctrl;               /* control register */
+       u32 ctrl_clr;
+       u32 ctrl_set;
+       u32 ctrl_inv;
+       u32 status;             /* status register */
+       u32 status_clr;
+       u32 status_set;
+       u32 status_inv;
+       u32 buf;                /* data register (FIFO read/write port) */
+       u32 dontuse[3];         /* padding, not accessed */
+       u32 baud;               /* baud-rate divisor */
+       u32 dontuse2[3];        /* padding, not accessed */
+       u32 ctrl2;              /* second control register */
+       u32 ctrl2_clr;
+       u32 ctrl2_set;
+       u32 ctrl2_inv;
+};
+
+/* Bit fields of SPI Control Register */
+#define CTRL_RX_INT_SHIFT      0  /* Rx interrupt generation condition */
+#define  RX_FIFO_EMTPY         0 /* empty (sic: identifier spelling) */
+#define  RX_FIFO_NOT_EMPTY     1 /* not empty */
+#define  RX_FIFO_HALF_FULL     2 /* full by half or more */
+#define  RX_FIFO_FULL          3 /* completely full */
+
+#define CTRL_TX_INT_SHIFT      2  /* Tx interrupt generation condition */
+#define  TX_FIFO_ALL_EMPTY     0 /* completely empty */
+#define  TX_FIFO_EMTPY         1 /* empty (sic: identifier spelling) */
+#define  TX_FIFO_HALF_EMPTY    2 /* empty by half or more */
+#define  TX_FIFO_NOT_FULL      3 /* at least one free entry */
+
+#define CTRL_MSTEN     BIT(5) /* enable master mode */
+#define CTRL_CKP       BIT(6) /* active low */
+#define CTRL_CKE       BIT(8) /* Tx on falling edge */
+#define CTRL_SMP       BIT(9) /* Rx at middle or end of tx */
+#define CTRL_BPW_MASK  0x03   /* bits per word/sample */
+#define CTRL_BPW_SHIFT 10
+#define  PIC32_BPW_8   0 /* 8 bits per word */
+#define  PIC32_BPW_16  1 /* 16 bits per word */
+#define  PIC32_BPW_32  2 /* 32 bits per word */
+#define CTRL_SIDL      BIT(13) /* sleep when idle */
+#define CTRL_ON                BIT(15) /* enable macro */
+#define CTRL_ENHBUF    BIT(16) /* enable enhanced buffering */
+#define CTRL_MCLKSEL   BIT(23) /* select clock source */
+#define CTRL_MSSEN     BIT(28) /* macro driven /SS */
+#define CTRL_FRMEN     BIT(31) /* enable framing mode */
+
+/* Bit fields of SPI Status Register */
+#define STAT_RF_EMPTY  BIT(5) /* RX Fifo empty */
+#define STAT_RX_OV     BIT(6) /* Rx overflow error, s/w needs to clear */
+#define STAT_TX_UR     BIT(8) /* Tx under-run in Framed SPI modes */
+#define STAT_FRM_ERR   BIT(12) /* Multiple Frame Sync pulse */
+#define STAT_TF_LVL_MASK       0x1F /* Tx FIFO fill level field */
+#define STAT_TF_LVL_SHIFT      16
+#define STAT_RF_LVL_MASK       0x1F /* Rx FIFO fill level field */
+#define STAT_RF_LVL_SHIFT      24
+
+/* Bit fields of SPI Baud Register */
+#define BAUD_MASK              0x1ff /* valid bits of the baud divisor */
+
+/* Bit fields of SPI Control2 Register */
+#define CTRL2_TX_UR_EN         BIT(10) /* Enable int on Tx under-run */
+#define CTRL2_RX_OV_EN         BIT(11) /* Enable int on Rx over-run */
+#define CTRL2_FRM_ERR_EN       BIT(12) /* Enable frame err int */
+
+/* Minimum DMA transfer size in bytes; shorter transfers are not DMA'ed */
+#define PIC32_DMA_LEN_MIN      64
+
+/* Per-controller driver state. */
+struct pic32_spi {
+       dma_addr_t              dma_base; /* base used for DMA slave addresses */
+       struct pic32_spi_regs __iomem *regs; /* mapped controller registers */
+       int                     fault_irq;
+       int                     rx_irq;
+       int                     tx_irq;
+       u32                     fifo_n_byte; /* FIFO depth in bytes */
+       struct clk              *clk; /* clock the baud divisor is derived from */
+       struct spi_master       *master;
+       /* Current controller setting */
+       u32                     speed_hz; /* spi-clk rate */
+       u32                     mode;
+       u32                     bits_per_word;
+       u32                     fifo_n_elm; /* FIFO depth in words */
+#define PIC32F_DMA_PREP                0 /* DMA chnls configured */
+       unsigned long           flags; /* bit numbers: PIC32F_* */
+       /* Current transfer state */
+       struct completion       xfer_done;
+       /* PIO transfer specific */
+       const void              *tx; /* next tx position */
+       const void              *tx_end;
+       const void              *rx; /* next rx position */
+       const void              *rx_end;
+       int                     len;
+       /* word-size specific FIFO drain/fill helpers */
+       void (*rx_fifo)(struct pic32_spi *);
+       void (*tx_fifo)(struct pic32_spi *);
+};
+
+/* Set the ON and SIDL bits through the ctrl "set" register. */
+static inline void pic32_spi_enable(struct pic32_spi *pic32s)
+{
+       const u32 bits = CTRL_ON | CTRL_SIDL;
+
+       writel(bits, &pic32s->regs->ctrl_set);
+}
+
+/* Clear the ON and SIDL bits through the ctrl "clear" register. */
+static inline void pic32_spi_disable(struct pic32_spi *pic32s)
+{
+       const u32 bits = CTRL_ON | CTRL_SIDL;
+
+       writel(bits, &pic32s->regs->ctrl_clr);
+
+       /* keep off the SPI registers for the CPU clock right after this */
+       ndelay(20);
+}
+
+/*
+ * Program the baud register for the requested SPI clock @spi_ck.
+ *
+ * The register value is (clk_in / (2 * spi_ck)) - 1, rounded to the
+ * closest integer and clamped to what the BAUD field can hold, so that
+ * out-of-range requests saturate instead of wrapping around to an
+ * unrelated rate.
+ */
+static void pic32_spi_set_clk_rate(struct pic32_spi *pic32s, u32 spi_ck)
+{
+       u32 div;
+
+       /* div = (clk_in / (2 * spi_ck)) - 1 */
+       div = DIV_ROUND_CLOSEST(clk_get_rate(pic32s->clk), 2 * spi_ck);
+
+       /* requested rate above clk_in: use the fastest setting, no underflow */
+       if (div)
+               div--;
+
+       /* requested rate too slow for the field: use the slowest setting */
+       div = min_t(u32, div, BAUD_MASK);
+
+       writel(div, &pic32s->regs->baud);
+}
+
+/* Extract the Rx FIFO level field from the status register. */
+static inline u32 pic32_rx_fifo_level(struct pic32_spi *pic32s)
+{
+       return (readl(&pic32s->regs->status) >> STAT_RF_LVL_SHIFT) &
+              STAT_RF_LVL_MASK;
+}
+
+/* Extract the Tx FIFO level field from the status register. */
+static inline u32 pic32_tx_fifo_level(struct pic32_spi *pic32s)
+{
+       return (readl(&pic32s->regs->status) >> STAT_TF_LVL_SHIFT) &
+              STAT_TF_LVL_MASK;
+}
+
+/*
+ * Number of words (of @n_bytes each) that may be pushed into the Tx FIFO
+ * right now: the smallest of what is left to send, the free room in the
+ * FIFO, and the FIFO depth minus the current rx/tx gap.
+ */
+static u32 pic32_tx_max(struct pic32_spi *pic32s, int n_bytes)
+{
+       ptrdiff_t tx_bytes = pic32s->tx_end - pic32s->tx;
+       ptrdiff_t rx_bytes = pic32s->rx_end - pic32s->rx;
+       u32 remaining, room, gap, limit;
+
+       remaining = tx_bytes / n_bytes;
+       room = pic32s->fifo_n_elm - pic32_tx_fifo_level(pic32s);
+
+       /*
+        * The FIFO levels alone do not account for data sitting in the
+        * shift registers, so the tx/rx mismatch is bounded from the
+        * software side instead: how far tx is ahead of rx.
+        */
+       gap = (rx_bytes - tx_bytes) / n_bytes;
+       limit = pic32s->fifo_n_elm - gap;
+
+       return min3(remaining, room, limit);
+}
+
+/*
+ * Number of words (of @n_bytes each) to drain from the Rx FIFO: whatever
+ * it holds, but never more than is still expected.
+ */
+static u32 pic32_rx_max(struct pic32_spi *pic32s, int n_bytes)
+{
+       u32 expected = (pic32s->rx_end - pic32s->rx) / n_bytes;
+       u32 available = pic32_rx_fifo_level(pic32s);
+
+       return min_t(u32, expected, available);
+}
+
+/*
+ * BUILD_SPI_FIFO_RW(__name, __type, __bwl) - generate the PIO FIFO helpers
+ * pic32_spi_rx_<__name>() and pic32_spi_tx_<__name>() for one word size.
+ *
+ * @__name: suffix of the generated function names
+ * @__type: C type of one FIFO word (u8, u16 or u32 below)
+ * @__bwl:  suffix of the read/write accessor to use (b, w or l)
+ *
+ * The rx helper reads pic32_rx_max() words from the buf register and
+ * advances pic32s->rx; the tx helper writes pic32_tx_max() words and
+ * advances pic32s->tx. A word is only stored to / loaded from memory when
+ * "<x>_end - len" is non-NULL; otherwise received data is dropped and
+ * all-ones is transmitted. NOTE(review): this presumably tests whether a
+ * buffer was supplied (buffer start == end - len) - confirm against where
+ * rx_end/tx_end/len are set up.
+ */
+#define BUILD_SPI_FIFO_RW(__name, __type, __bwl)               \
+static void pic32_spi_rx_##__name(struct pic32_spi *pic32s)    \
+{                                                              \
+       __type v;                                               \
+       u32 mx = pic32_rx_max(pic32s, sizeof(__type));          \
+       for (; mx; mx--) {                                      \
+               v = read##__bwl(&pic32s->regs->buf);            \
+               if (pic32s->rx_end - pic32s->len)               \
+                       *(__type *)(pic32s->rx) = v;            \
+               pic32s->rx += sizeof(__type);                   \
+       }                                                       \
+}                                                              \
+                                                               \
+static void pic32_spi_tx_##__name(struct pic32_spi *pic32s)    \
+{                                                              \
+       __type v;                                               \
+       u32 mx = pic32_tx_max(pic32s, sizeof(__type));          \
+       for (; mx ; mx--) {                                     \
+               v = (__type)~0U;                                \
+               if (pic32s->tx_end - pic32s->len)               \
+                       v = *(__type *)(pic32s->tx);            \
+               write##__bwl(v, &pic32s->regs->buf);            \
+               pic32s->tx += sizeof(__type);                   \
+       }                                                       \
+}
+
+/* 8-, 16- and 32-bit variants, selected by pic32_spi_set_word_size() */
+BUILD_SPI_FIFO_RW(byte, u8, b);
+BUILD_SPI_FIFO_RW(word, u16, w);
+BUILD_SPI_FIFO_RW(dword, u32, l);
+
+/*
+ * Abort the current transfer: disable the fault, rx and tx interrupts,
+ * log @msg, mark the current message (if any) as failed with -EIO and
+ * wake up the waiter on xfer_done.
+ */
+static void pic32_err_stop(struct pic32_spi *pic32s, const char *msg)
+{
+       struct spi_master *master = pic32s->master;
+       struct spi_message *cur;
+
+       /* stop every interrupt source of this controller */
+       disable_irq_nosync(pic32s->fault_irq);
+       disable_irq_nosync(pic32s->rx_irq);
+       disable_irq_nosync(pic32s->tx_irq);
+
+       /* report the reason, then fail and finish the transfer */
+       dev_err(&master->dev, "%s\n", msg);
+
+       cur = master->cur_msg;
+       if (cur)
+               cur->status = -EIO;
+
+       complete(&pic32s->xfer_done);
+}
+
+/*
+ * Fault interrupt handler.
+ *
+ * Rx overflow / Tx under-run flags are cleared and the transfer is
+ * aborted; a frame error aborts the transfer as well. A fault interrupt
+ * without any message in flight is also reported through
+ * pic32_err_stop(), but the interrupt is then returned as IRQ_NONE.
+ */
+static irqreturn_t pic32_spi_fault_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+       u32 status;
+
+       status = readl(&pic32s->regs->status);
+
+       /* Error handling */
+       if (status & (STAT_RX_OV | STAT_TX_UR)) {
+               /* these flags stay set until software clears them */
+               writel(STAT_RX_OV, &pic32s->regs->status_clr);
+               writel(STAT_TX_UR, &pic32s->regs->status_clr);
+               /* pic32_err_stop() appends the newline itself */
+               pic32_err_stop(pic32s, "err_irq: fifo ov/ur-run");
+               return IRQ_HANDLED;
+       }
+
+       if (status & STAT_FRM_ERR) {
+               pic32_err_stop(pic32s, "err_irq: frame error");
+               return IRQ_HANDLED;
+       }
+
+       if (!pic32s->master->cur_msg) {
+               pic32_err_stop(pic32s, "err_irq: no mesg");
+               return IRQ_NONE;
+       }
+
+       return IRQ_NONE;
+}
+
+/*
+ * Rx interrupt handler: drain the Rx FIFO and, once everything expected
+ * has been received, disable the fault and rx interrupts and complete
+ * the transfer.
+ */
+static irqreturn_t pic32_spi_rx_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+
+       pic32s->rx_fifo(pic32s);
+
+       /* more data still to come */
+       if (pic32s->rx != pic32s->rx_end)
+               return IRQ_HANDLED;
+
+       /* rx complete: quiesce interrupts and wake up the waiter */
+       disable_irq_nosync(pic32s->fault_irq);
+       disable_irq_nosync(pic32s->rx_irq);
+       complete(&pic32s->xfer_done);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Tx interrupt handler: refill the Tx FIFO and disable the tx interrupt
+ * once nothing is left to send.
+ */
+static irqreturn_t pic32_spi_tx_irq(int irq, void *dev_id)
+{
+       struct pic32_spi *pic32s = dev_id;
+       bool tx_done;
+
+       pic32s->tx_fifo(pic32s);
+
+       tx_done = (pic32s->tx == pic32s->tx_end);
+       if (tx_done)
+               disable_irq_nosync(pic32s->tx_irq);
+
+       return IRQ_HANDLED;
+}
+
+/* DMA completion callback of the rx descriptor: finish the transfer. */
+static void pic32_spi_dma_rx_notify(void *data)
+{
+       struct pic32_spi *ctx = data;
+
+       complete(&ctx->xfer_done);
+}
+
+/*
+ * Start a DMA transfer for @xfer on the master's rx and tx channels.
+ *
+ * Both scatterlists are prepared, the completion callback is attached to
+ * the rx descriptor, then rx and tx are submitted and issued in that
+ * order.
+ *
+ * Returns 0 once both channels are issued, -ENODEV if a channel is
+ * missing, -EINVAL if a descriptor cannot be prepared, or the submit
+ * error. If the tx submit fails, the already submitted rx is terminated.
+ */
+static int pic32_spi_dma_transfer(struct pic32_spi *pic32s,
+                                 struct spi_transfer *xfer)
+{
+       const unsigned long prep_flags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;
+       struct spi_master *master = pic32s->master;
+       struct dma_async_tx_descriptor *rxd, *txd;
+       int err;
+
+       if (!master->dma_rx || !master->dma_tx)
+               return -ENODEV;
+
+       rxd = dmaengine_prep_slave_sg(master->dma_rx, xfer->rx_sg.sgl,
+                                     xfer->rx_sg.nents, DMA_FROM_DEVICE,
+                                     prep_flags);
+       if (!rxd)
+               return -EINVAL;
+
+       txd = dmaengine_prep_slave_sg(master->dma_tx, xfer->tx_sg.sgl,
+                                     xfer->tx_sg.nents, DMA_TO_DEVICE,
+                                     prep_flags);
+       if (!txd)
+               return -EINVAL;
+
+       /* rx is expected to finish last, so it carries the notification */
+       rxd->callback = pic32_spi_dma_rx_notify;
+       rxd->callback_param = pic32s;
+
+       err = dma_submit_error(dmaengine_submit(rxd));
+       if (err)
+               return err;
+
+       err = dma_submit_error(dmaengine_submit(txd));
+       if (err) {
+               /* rx was already queued: take it back */
+               dmaengine_terminate_all(master->dma_rx);
+               return err;
+       }
+
+       dma_async_issue_pending(master->dma_rx);
+       dma_async_issue_pending(master->dma_tx);
+
+       return 0;
+}
+
+/*
+ * Configure the tx and rx DMA slave channels for FIFO words of
+ * @dma_width bytes.
+ *
+ * Both channels target the controller's buf register and use half the
+ * FIFO depth as burst size. The configuration structure is
+ * zero-initialised so that no member the driver does not set explicitly
+ * reaches the DMA engine with stack garbage in it.
+ *
+ * Returns 0 on success or the error from dmaengine_slave_config(); when
+ * the tx channel fails, the rx channel is not touched.
+ */
+static int pic32_spi_dma_config(struct pic32_spi *pic32s, u32 dma_width)
+{
+       int buf_offset = offsetof(struct pic32_spi_regs, buf);
+       struct spi_master *master = pic32s->master;
+       struct dma_slave_config cfg = {
+               .device_fc = true,
+               .src_addr = pic32s->dma_base + buf_offset,
+               .dst_addr = pic32s->dma_base + buf_offset,
+               .src_maxburst = pic32s->fifo_n_elm / 2, /* fill one-half */
+               .dst_maxburst = pic32s->fifo_n_elm / 2, /* drain one-half */
+               .src_addr_width = dma_width,
+               .dst_addr_width = dma_width,
+       };
+       int ret;
+
+       /* tx channel */
+       cfg.slave_id = pic32s->tx_irq;
+       cfg.direction = DMA_MEM_TO_DEV;
+       ret = dmaengine_slave_config(master->dma_tx, &cfg);
+       if (ret) {
+               dev_err(&master->dev, "tx channel setup failed\n");
+               return ret;
+       }
+       /* rx channel */
+       cfg.slave_id = pic32s->rx_irq;
+       cfg.direction = DMA_DEV_TO_MEM;
+       ret = dmaengine_slave_config(master->dma_rx, &cfg);
+       if (ret)
+               dev_err(&master->dev, "rx channel setup failed\n");
+
+       return ret;
+}
+
+/*
+ * Switch the controller to @bits_per_word (8, 16 or 32).
+ *
+ * Selects the matching PIO FIFO helpers, recomputes the FIFO depth in
+ * words, programs the word-size field of the control register and, when
+ * the DMA channels are already configured, reconfigures them for the new
+ * width (the result of that reconfiguration is not checked).
+ *
+ * Returns 0 on success or -EINVAL for an unsupported word size.
+ */
+static int pic32_spi_set_word_size(struct pic32_spi *pic32s, u8 bits_per_word)
+{
+       enum dma_slave_buswidth dma_width;
+       u32 bpw_sel, ctrl;
+
+       switch (bits_per_word) {
+       case 8:
+               bpw_sel = PIC32_BPW_8;
+               dma_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
+               pic32s->tx_fifo = pic32_spi_tx_byte;
+               pic32s->rx_fifo = pic32_spi_rx_byte;
+               break;
+       case 16:
+               bpw_sel = PIC32_BPW_16;
+               dma_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
+               pic32s->tx_fifo = pic32_spi_tx_word;
+               pic32s->rx_fifo = pic32_spi_rx_word;
+               break;
+       case 32:
+               bpw_sel = PIC32_BPW_32;
+               dma_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+               pic32s->tx_fifo = pic32_spi_tx_dword;
+               pic32s->rx_fifo = pic32_spi_rx_dword;
+               break;
+       default:
+               /* any other width is not supported */
+               return -EINVAL;
+       }
+
+       /* how many words of this size the FIFO can hold */
+       pic32s->fifo_n_elm = DIV_ROUND_UP(pic32s->fifo_n_byte,
+                                         bits_per_word / 8);
+
+       /* replace the word-size field of the control register */
+       ctrl = readl(&pic32s->regs->ctrl);
+       ctrl = (ctrl & ~(CTRL_BPW_MASK << CTRL_BPW_SHIFT)) |
+              (bpw_sel << CTRL_BPW_SHIFT);
+       writel(ctrl, &pic32s->regs->ctrl);
+
+       /* DMA channels already set up: adapt their bus width too */
+       if (test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               pic32_spi_dma_config(pic32s, dma_width);
+
+       return 0;
+}
+
+/* prepare_transfer_hardware callback: enable the controller. Returns 0. */
+static int pic32_spi_prepare_hardware(struct spi_master *master)
+{
+       pic32_spi_enable(spi_master_get_devdata(master));
+
+       return 0;
+}
+
+/*
+ * prepare_message callback: bring the controller in line with the target
+ * device's word size, speed and mode. Each setting is only reprogrammed
+ * when it differs from the cached one.
+ *
+ * Returns 0 on success, or the error from pic32_spi_set_word_size() if
+ * the device's bits_per_word is not supported; in that case the cached
+ * word size is left untouched.
+ */
+static int pic32_spi_prepare_message(struct spi_master *master,
+                                    struct spi_message *msg)
+{
+       struct pic32_spi *pic32s = spi_master_get_devdata(master);
+       struct spi_device *spi = msg->spi;
+       u32 val;
+       int ret;
+
+       /* set device specific bits_per_word */
+       if (pic32s->bits_per_word != spi->bits_per_word) {
+               ret = pic32_spi_set_word_size(pic32s, spi->bits_per_word);
+               if (ret)
+                       return ret;
+               pic32s->bits_per_word = spi->bits_per_word;
+       }
+
+       /* device specific speed change */
+       if (pic32s->speed_hz != spi->max_speed_hz) {
+               pic32_spi_set_clk_rate(pic32s, spi->max_speed_hz);
+               pic32s->speed_hz = spi->max_speed_hz;
+       }
+
+       /* device specific mode change */
+       if (pic32s->mode != spi->mode) {
+               val = readl(&pic32s->regs->ctrl);
+               /* active low */
+               if (spi->mode & SPI_CPOL)
+                       val |= CTRL_CKP;
+               else
+                       val &= ~CTRL_CKP;
+               /* tx on rising edge */
+               if (spi->mode & SPI_CPHA)
+                       val &= ~CTRL_CKE;
+               else
+                       val |= CTRL_CKE;
+
+               /* rx at end of tx */
+               val |= CTRL_SMP;
+               writel(val, &pic32s->regs->ctrl);
+               pic32s->mode = spi->mode;
+       }
+
+       return 0;
+}
+
+/* can_dma hook: DMA is used only when the channels have been prepared and
+ * the transfer is at least PIC32_DMA_LEN_MIN bytes long; shorter transfers
+ * are left to PIO to avoid the DMA overhead.
+ */
+static bool pic32_spi_can_dma(struct spi_master *master,
+                             struct spi_device *spi,
+                             struct spi_transfer *xfer)
+{
+       struct pic32_spi *ctlr = spi_master_get_devdata(master);
+
+       if (xfer->len < PIC32_DMA_LEN_MIN)
+               return false;
+
+       return test_bit(PIC32F_DMA_PREP, &ctlr->flags);
+}
+
+/* transfer_one hook: run a single transfer and wait up to two seconds for
+ * its completion to be signalled.
+ *
+ * The transfer goes through DMA when both rx_sg and tx_sg of @transfer are
+ * populated, otherwise through interrupt driven PIO.
+ *
+ * Returns 0 on success, -ETIMEDOUT when completion is not signalled in
+ * time, or the error from word-size setup or DMA submission.
+ */
+static int pic32_spi_one_transfer(struct spi_master *master,
+                                 struct spi_device *spi,
+                                 struct spi_transfer *transfer)
+{
+       struct pic32_spi *pic32s;
+       bool dma_issued = false;
+       unsigned long left;
+       int ret;
+
+       pic32s = spi_master_get_devdata(master);
+
+       /* handle transfer specific word size change */
+       if (transfer->bits_per_word &&
+           (transfer->bits_per_word != pic32s->bits_per_word)) {
+               ret = pic32_spi_set_word_size(pic32s, transfer->bits_per_word);
+               if (ret)
+                       return ret;
+               pic32s->bits_per_word = transfer->bits_per_word;
+       }
+
+       /* handle transfer specific speed change */
+       if (transfer->speed_hz && (transfer->speed_hz != pic32s->speed_hz)) {
+               pic32_spi_set_clk_rate(pic32s, transfer->speed_hz);
+               pic32s->speed_hz = transfer->speed_hz;
+       }
+
+       reinit_completion(&pic32s->xfer_done);
+
+       /* transact by DMA mode */
+       if (transfer->rx_sg.nents && transfer->tx_sg.nents) {
+               ret = pic32_spi_dma_transfer(pic32s, transfer);
+               if (ret) {
+                       dev_err(&spi->dev, "dma submit error\n");
+                       return ret;
+               }
+
+               /* DMA issued */
+               dma_issued = true;
+       } else {
+               /* set current transfer information */
+               pic32s->tx = (const void *)transfer->tx_buf;
+               pic32s->rx = (const void *)transfer->rx_buf;
+               pic32s->tx_end = pic32s->tx + transfer->len;
+               pic32s->rx_end = pic32s->rx + transfer->len;
+               pic32s->len = transfer->len;
+
+               /* transact by interrupt driven PIO */
+               enable_irq(pic32s->fault_irq);
+               enable_irq(pic32s->rx_irq);
+               enable_irq(pic32s->tx_irq);
+       }
+
+       /* wait for completion; a zero result means the timeout expired */
+       left = wait_for_completion_timeout(&pic32s->xfer_done, 2 * HZ);
+       if (!left) {
+               dev_err(&spi->dev, "wait error/timedout\n");
+               if (dma_issued) {
+                       /* abort both directions, not only rx */
+                       dmaengine_terminate_all(master->dma_rx);
+                       dmaengine_terminate_all(master->dma_tx);
+               }
+               return -ETIMEDOUT;
+       }
+
+       return 0;
+}
+
+/* unprepare_message hook: no per-message state to undo; always returns 0 */
+static int pic32_spi_unprepare_message(struct spi_master *master,
+                                      struct spi_message *msg)
+{
+       /* nothing to do */
+       return 0;
+}
+
+/* unprepare_transfer_hardware hook: disable the controller; always returns 0 */
+static int pic32_spi_unprepare_hardware(struct spi_master *master)
+{
+       struct pic32_spi *ctlr;
+
+       ctlr = spi_master_get_devdata(master);
+       pic32_spi_disable(ctlr);
+
+       return 0;
+}
+
+/* Per-device setup; may run more than once for the same spi device.
+ *
+ * Returns -EINVAL when the device has no max_speed_hz or no valid
+ * chip-select GPIO, 0 otherwise.
+ */
+static int pic32_spi_setup(struct spi_device *spi)
+{
+       int cs_inactive;
+
+       if (!spi->max_speed_hz) {
+               dev_err(&spi->dev, "No max speed HZ parameter\n");
+               return -EINVAL;
+       }
+
+       /* The PIC32 spi controller can drive /CS itself, based on the tx
+        * fifo fill-level: /CS stays asserted while the TX fifo is
+        * non-empty and is deasserted once it drains, signalling the end
+        * of the ongoing transfer. That can make SPI transactions
+        * unreliable or erroneous, so /CS is always handled by toggling
+        * a GPIO instead.
+        */
+       if (!gpio_is_valid(spi->cs_gpio))
+               return -EINVAL;
+
+       /* park the chip-select line at its inactive level */
+       cs_inactive = !(spi->mode & SPI_CS_HIGH);
+       gpio_direction_output(spi->cs_gpio, cs_inactive);
+
+       return 0;
+}
+
+/* Per-device cleanup: drive the chip-select GPIO to its inactive level */
+static void pic32_spi_cleanup(struct spi_device *spi)
+{
+       int cs_inactive = !(spi->mode & SPI_CS_HIGH);
+
+       gpio_direction_output(spi->cs_gpio, cs_inactive);
+}
+
+/* Request the optional "spi-rx" and "spi-tx" DMA channels and configure
+ * them for byte-wide accesses. On success PIC32F_DMA_PREP is set in
+ * pic32s->flags. On any failure the flag stays clear, every channel that
+ * was acquired is released and the channel pointers are reset to NULL.
+ */
+static void pic32_spi_dma_prep(struct pic32_spi *pic32s, struct device *dev)
+{
+       struct spi_master *master = pic32s->master;
+       dma_cap_mask_t mask;
+
+       dma_cap_zero(mask);
+       dma_cap_set(DMA_SLAVE, mask);
+
+       master->dma_rx = dma_request_slave_channel_compat(mask, NULL, NULL,
+                                                         dev, "spi-rx");
+       if (!master->dma_rx) {
+               dev_warn(dev, "RX channel not found.\n");
+               goto out_err;
+       }
+
+       master->dma_tx = dma_request_slave_channel_compat(mask, NULL, NULL,
+                                                         dev, "spi-tx");
+       if (!master->dma_tx) {
+               dev_warn(dev, "TX channel not found.\n");
+               goto out_err;
+       }
+
+       if (pic32_spi_dma_config(pic32s, DMA_SLAVE_BUSWIDTH_1_BYTE))
+               goto out_err;
+
+       /* DMA chnls allocated and prepared */
+       set_bit(PIC32F_DMA_PREP, &pic32s->flags);
+
+       return;
+
+out_err:
+       /* clear the pointers after release so that nothing can later use
+        * or release a channel this driver no longer owns
+        */
+       if (master->dma_rx) {
+               dma_release_channel(master->dma_rx);
+               master->dma_rx = NULL;
+       }
+
+       if (master->dma_tx) {
+               dma_release_channel(master->dma_tx);
+               master->dma_tx = NULL;
+       }
+}
+
+/* Release the DMA channels, but only if PIC32F_DMA_PREP says they were
+ * prepared; the flag is cleared before the channels are released.
+ */
+static void pic32_spi_dma_unprep(struct pic32_spi *pic32s)
+{
+       struct spi_master *master;
+
+       if (!test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               return;
+
+       clear_bit(PIC32F_DMA_PREP, &pic32s->flags);
+
+       master = pic32s->master;
+       if (master->dma_rx)
+               dma_release_channel(master->dma_rx);
+
+       if (master->dma_tx)
+               dma_release_channel(master->dma_tx);
+}
+
+/* One-time controller setup: with the hardware disabled, program the
+ * control register for enhanced-fifo master mode with manual /CS, then
+ * enable error reporting through the ctrl2 set register.
+ */
+static void pic32_spi_hw_init(struct pic32_spi *pic32s)
+{
+       u32 v, err_bits;
+
+       /* the controller is reconfigured while switched off */
+       pic32_spi_disable(pic32s);
+
+       /* enhanced fifo, 128 bits deep, i.e. 16 bytes */
+       pic32s->fifo_n_byte = 16;
+
+       v = readl(&pic32s->regs->ctrl);
+       v |= CTRL_ENHBUF;
+
+       /* no framing mode */
+       v &= ~CTRL_FRMEN;
+
+       /* master mode */
+       v |= CTRL_MSTEN;
+
+       /* tx fifo interrupt threshold */
+       v &= ~(0x3 << CTRL_TX_INT_SHIFT);
+       v |= (TX_FIFO_HALF_EMPTY << CTRL_TX_INT_SHIFT);
+
+       /* rx fifo interrupt threshold */
+       v &= ~(0x3 << CTRL_RX_INT_SHIFT);
+       v |= (RX_FIFO_NOT_EMPTY << CTRL_RX_INT_SHIFT);
+
+       /* clock source selection */
+       v &= ~CTRL_MCLKSEL;
+
+       /* manual /CS mode */
+       v &= ~CTRL_MSSEN;
+
+       writel(v, &pic32s->regs->ctrl);
+
+       /* turn on error reporting */
+       err_bits = CTRL2_TX_UR_EN | CTRL2_RX_OV_EN | CTRL2_FRM_ERR_EN;
+       writel(err_bits, &pic32s->regs->ctrl2_set);
+}
+
+/* Collect the platform resources of the controller (register window, the
+ * "fault", "rx" and "tx" interrupts and the "mck0" clock), enable the clock
+ * and initialise the hardware.
+ *
+ * Returns 0 on success or a negative error code; the clock is left enabled
+ * only on success.
+ */
+static int pic32_spi_hw_probe(struct platform_device *pdev,
+                             struct pic32_spi *pic32s)
+{
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       int err;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       pic32s->regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(pic32s->regs))
+               return PTR_ERR(pic32s->regs);
+
+       /* start address of the register window, kept for DMA setup */
+       pic32s->dma_base = res->start;
+
+       /* irq resources: fault, rx and tx */
+       pic32s->fault_irq = platform_get_irq_byname(pdev, "fault");
+       if (pic32s->fault_irq < 0) {
+               dev_err(dev, "fault-irq not found\n");
+               return pic32s->fault_irq;
+       }
+
+       pic32s->rx_irq = platform_get_irq_byname(pdev, "rx");
+       if (pic32s->rx_irq < 0) {
+               dev_err(dev, "rx-irq not found\n");
+               return pic32s->rx_irq;
+       }
+
+       pic32s->tx_irq = platform_get_irq_byname(pdev, "tx");
+       if (pic32s->tx_irq < 0) {
+               dev_err(dev, "tx-irq not found\n");
+               return pic32s->tx_irq;
+       }
+
+       /* controller clock */
+       pic32s->clk = devm_clk_get(dev, "mck0");
+       if (IS_ERR(pic32s->clk)) {
+               dev_err(dev, "clk not found\n");
+               err = PTR_ERR(pic32s->clk);
+               goto err_report;
+       }
+
+       err = clk_prepare_enable(pic32s->clk);
+       if (err)
+               goto err_report;
+
+       pic32_spi_hw_init(pic32s);
+
+       return 0;
+
+err_report:
+       dev_err(dev, "%s failed, err %d\n", __func__, err);
+       return err;
+}
+
+/* Platform probe: allocate the spi master, bring up the hardware, set up
+ * the optional DMA channels, install the three interrupt handlers (left
+ * disabled) and register the master.
+ *
+ * Returns 0 on success or a negative error code. On failure the clock is
+ * disabled, any DMA channels that were prepared are released and the
+ * master reference is dropped.
+ */
+static int pic32_spi_probe(struct platform_device *pdev)
+{
+       struct spi_master *master;
+       struct pic32_spi *pic32s;
+       int ret;
+
+       master = spi_alloc_master(&pdev->dev, sizeof(*pic32s));
+       if (!master)
+               return -ENOMEM;
+
+       pic32s = spi_master_get_devdata(master);
+       pic32s->master = master;
+
+       ret = pic32_spi_hw_probe(pdev, pic32s);
+       if (ret)
+               goto err_master;
+
+       master->dev.of_node     = of_node_get(pdev->dev.of_node);
+       master->mode_bits       = SPI_MODE_3 | SPI_MODE_0 | SPI_CS_HIGH;
+       master->num_chipselect  = 1; /* single chip-select */
+       master->max_speed_hz    = clk_get_rate(pic32s->clk);
+       master->setup           = pic32_spi_setup;
+       master->cleanup         = pic32_spi_cleanup;
+       master->flags           = SPI_MASTER_MUST_TX | SPI_MASTER_MUST_RX;
+       master->bits_per_word_mask      = SPI_BPW_MASK(8) | SPI_BPW_MASK(16) |
+                                         SPI_BPW_MASK(32);
+       master->transfer_one            = pic32_spi_one_transfer;
+       master->prepare_message         = pic32_spi_prepare_message;
+       master->unprepare_message       = pic32_spi_unprepare_message;
+       master->prepare_transfer_hardware       = pic32_spi_prepare_hardware;
+       master->unprepare_transfer_hardware     = pic32_spi_unprepare_hardware;
+
+       /* optional DMA support */
+       pic32_spi_dma_prep(pic32s, &pdev->dev);
+       if (test_bit(PIC32F_DMA_PREP, &pic32s->flags))
+               master->can_dma = pic32_spi_can_dma;
+
+       init_completion(&pic32s->xfer_done);
+       pic32s->mode = -1;
+
+       /* install irq handlers (with irq-disabled) */
+       irq_set_status_flags(pic32s->fault_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->fault_irq,
+                              pic32_spi_fault_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               /* report the irq that actually failed */
+               dev_err(&pdev->dev, "request fault-irq %d\n",
+                       pic32s->fault_irq);
+               goto err_bailout;
+       }
+
+       /* receive interrupt handler */
+       irq_set_status_flags(pic32s->rx_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->rx_irq,
+                              pic32_spi_rx_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request rx-irq %d\n", pic32s->rx_irq);
+               goto err_bailout;
+       }
+
+       /* transmit interrupt handler */
+       irq_set_status_flags(pic32s->tx_irq, IRQ_NOAUTOEN);
+       ret = devm_request_irq(&pdev->dev, pic32s->tx_irq,
+                              pic32_spi_tx_irq, IRQF_NO_THREAD,
+                              dev_name(&pdev->dev), pic32s);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "request tx-irq %d\n", pic32s->tx_irq);
+               goto err_bailout;
+       }
+
+       /* register master */
+       ret = devm_spi_register_master(&pdev->dev, master);
+       if (ret) {
+               dev_err(&master->dev, "failed registering spi master\n");
+               goto err_bailout;
+       }
+
+       platform_set_drvdata(pdev, pic32s);
+
+       return 0;
+
+err_bailout:
+       /* no-op unless the DMA channels were prepared above */
+       pic32_spi_dma_unprep(pic32s);
+       clk_disable_unprepare(pic32s->clk);
+err_master:
+       spi_master_put(master);
+       return ret;
+}
+
+/* Platform remove: disable the controller, stop its clock and release the
+ * DMA channels; always returns 0.
+ */
+static int pic32_spi_remove(struct platform_device *pdev)
+{
+       struct pic32_spi *ctlr = platform_get_drvdata(pdev);
+
+       pic32_spi_disable(ctlr);
+       clk_disable_unprepare(ctlr->clk);
+       pic32_spi_dma_unprep(ctlr);
+
+       return 0;
+}
+
+/* device-tree compatibles handled by this driver */
+static const struct of_device_id pic32_spi_of_match[] = {
+       { .compatible = "microchip,pic32mzda-spi" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, pic32_spi_of_match);
+
+/* platform driver glue; devices are matched through pic32_spi_of_match */
+static struct platform_driver pic32_spi_driver = {
+       .probe = pic32_spi_probe,
+       .remove = pic32_spi_remove,
+       .driver = {
+               .of_match_table = of_match_ptr(pic32_spi_of_match),
+               .name = "spi-pic32",
+       },
+};
+
+module_platform_driver(pic32_spi_driver);
+
+MODULE_AUTHOR("Purna Chandra Mandal <purna.mandal@microchip.com>");
+MODULE_DESCRIPTION("Microchip SPI driver for PIC32 SPI controller.");
+MODULE_LICENSE("GPL v2");
index 365fc22c35729372e22dee90271dc3a37820d27c..a18a03d0afb709a51eb9515cfee7d8acf181d2e6 100644 (file)
@@ -33,12 +33,10 @@ static int pxa2xx_spi_map_dma_buffer(struct driver_data *drv_data,
                dmadev = drv_data->tx_chan->device->dev;
                sgt = &drv_data->tx_sgt;
                buf = drv_data->tx;
-               drv_data->tx_map_len = len;
        } else {
                dmadev = drv_data->rx_chan->device->dev;
                sgt = &drv_data->rx_sgt;
                buf = drv_data->rx;
-               drv_data->rx_map_len = len;
        }
 
        nents = DIV_ROUND_UP(len, SZ_2K);
@@ -55,11 +53,7 @@ static int pxa2xx_spi_map_dma_buffer(struct driver_data *drv_data,
        for_each_sg(sgt->sgl, sg, sgt->nents, i) {
                size_t bytes = min_t(size_t, len, SZ_2K);
 
-               if (buf)
-                       sg_set_buf(sg, pbuf, bytes);
-               else
-                       sg_set_buf(sg, drv_data->dummy, bytes);
-
+               sg_set_buf(sg, pbuf, bytes);
                pbuf += bytes;
                len -= bytes;
        }
@@ -133,9 +127,6 @@ static void pxa2xx_spi_dma_transfer_complete(struct driver_data *drv_data,
                if (!error) {
                        pxa2xx_spi_unmap_dma_buffers(drv_data);
 
-                       drv_data->tx += drv_data->tx_map_len;
-                       drv_data->rx += drv_data->rx_map_len;
-
                        msg->actual_length += drv_data->len;
                        msg->state = pxa2xx_spi_next_transfer(drv_data);
                } else {
@@ -267,19 +258,22 @@ irqreturn_t pxa2xx_spi_dma_transfer(struct driver_data *drv_data)
 int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
 {
        struct dma_async_tx_descriptor *tx_desc, *rx_desc;
+       int err = 0;
 
        tx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_MEM_TO_DEV);
        if (!tx_desc) {
                dev_err(&drv_data->pdev->dev,
                        "failed to get DMA TX descriptor\n");
-               return -EBUSY;
+               err = -EBUSY;
+               goto err_tx;
        }
 
        rx_desc = pxa2xx_spi_dma_prepare_one(drv_data, DMA_DEV_TO_MEM);
        if (!rx_desc) {
                dev_err(&drv_data->pdev->dev,
                        "failed to get DMA RX descriptor\n");
-               return -EBUSY;
+               err = -EBUSY;
+               goto err_rx;
        }
 
        /* We are ready when RX completes */
@@ -289,6 +283,12 @@ int pxa2xx_spi_dma_prepare(struct driver_data *drv_data, u32 dma_burst)
        dmaengine_submit(rx_desc);
        dmaengine_submit(tx_desc);
        return 0;
+
+err_rx:
+       dmaengine_terminate_async(drv_data->tx_chan);
+err_tx:
+       pxa2xx_spi_unmap_dma_buffers(drv_data);
+       return err;
 }
 
 void pxa2xx_spi_dma_start(struct driver_data *drv_data)
@@ -308,10 +308,6 @@ int pxa2xx_spi_dma_setup(struct driver_data *drv_data)
        dma_cap_zero(mask);
        dma_cap_set(DMA_SLAVE, mask);
 
-       drv_data->dummy = devm_kzalloc(dev, SZ_2K, GFP_KERNEL);
-       if (!drv_data->dummy)
-               return -ENOMEM;
-
        drv_data->tx_chan = dma_request_slave_channel_compat(mask,
                                pdata->dma_filter, pdata->tx_param, dev, "tx");
        if (!drv_data->tx_chan)
index 4fd7f9802f1b12fa1d21788f490aa185eb3cbd3e..5202de94f792c4bf8e38bf0d6c1da94586ae26f9 100644 (file)
@@ -173,8 +173,8 @@ static int pxa2xx_spi_pci_probe(struct pci_dev *dev,
        ssp->type = c->type;
 
        snprintf(buf, sizeof(buf), "pxa2xx-spi.%d", ssp->port_id);
-       ssp->clk = clk_register_fixed_rate(&dev->dev, buf , NULL,
-                                       CLK_IS_ROOT, c->max_clk_rate);
+       ssp->clk = clk_register_fixed_rate(&dev->dev, buf , NULL, 0,
+                                          c->max_clk_rate);
         if (IS_ERR(ssp->clk))
                return PTR_ERR(ssp->clk);
 
index 86138e4101b07f990634d06af60b4c5a63ce4607..fe07c0592b44c9eac5faf22409203392cdbc9e3a 100644 (file)
@@ -570,9 +570,8 @@ static void giveback(struct driver_data *drv_data)
                /* see if the next and current messages point
                 * to the same chip
                 */
-               if (next_msg && next_msg->spi != msg->spi)
-                       next_msg = NULL;
-               if (!next_msg || msg->state == ERROR_STATE)
+               if ((next_msg && next_msg->spi != msg->spi) ||
+                   msg->state == ERROR_STATE)
                        cs_deassert(drv_data);
        }
 
@@ -928,6 +927,7 @@ static void pump_transfers(unsigned long data)
        u32 dma_thresh = drv_data->cur_chip->dma_threshold;
        u32 dma_burst = drv_data->cur_chip->dma_burst_size;
        u32 change_mask = pxa2xx_spi_get_ssrc1_change_mask(drv_data);
+       int err;
 
        /* Get current state information */
        message = drv_data->cur_msg;
@@ -1047,7 +1047,12 @@ static void pump_transfers(unsigned long data)
                /* Ensure we have the correct interrupt handler */
                drv_data->transfer_handler = pxa2xx_spi_dma_transfer;
 
-               pxa2xx_spi_dma_prepare(drv_data, dma_burst);
+               err = pxa2xx_spi_dma_prepare(drv_data, dma_burst);
+               if (err) {
+                       message->status = err;
+                       giveback(drv_data);
+                       return;
+               }
 
                /* Clear status and start DMA engine */
                cr1 = chip->cr1 | dma_thresh | drv_data->dma_cr1;
@@ -1543,7 +1548,6 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
        drv_data->pdev = pdev;
        drv_data->ssp = ssp;
 
-       master->dev.parent = &pdev->dev;
        master->dev.of_node = pdev->dev.of_node;
        /* the spi->mode bits understood by this driver: */
        master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LOOP;
@@ -1556,6 +1560,7 @@ static int pxa2xx_spi_probe(struct platform_device *pdev)
        master->unprepare_transfer_hardware = pxa2xx_spi_unprepare_transfer;
        master->fw_translate_cs = pxa2xx_spi_fw_translate_cs;
        master->auto_runtime_pm = true;
+       master->flags = SPI_MASTER_MUST_RX | SPI_MASTER_MUST_TX;
 
        drv_data->ssp_type = ssp->type;
 
index a1ef889481447fd4eeeca749bba063014bddc3f6..e6b09000ff145f072f0b7e12aaf11a8eeba7624f 100644 (file)
@@ -56,7 +56,6 @@ struct driver_data {
        struct sg_table tx_sgt;
        int rx_nents;
        int tx_nents;
-       void *dummy;
        atomic_t dma_running;
 
        /* Current message transfer state info */
@@ -69,8 +68,6 @@ struct driver_data {
        void *rx;
        void *rx_end;
        int dma_mapped;
-       size_t rx_map_len;
-       size_t tx_map_len;
        u8 n_bytes;
        int (*write)(struct driver_data *drv_data);
        int (*read)(struct driver_data *drv_data);
index 810a7fae347988a7d9dc1101586234b8ef8efc77..c338ef1136f6c6052b72b9394f74ef89b58273a5 100644 (file)
@@ -937,6 +937,10 @@ static int spi_qup_pm_suspend_runtime(struct device *device)
        config = readl(controller->base + QUP_CONFIG);
        config |= QUP_CONFIG_CLOCK_AUTO_GATE;
        writel_relaxed(config, controller->base + QUP_CONFIG);
+
+       clk_disable_unprepare(controller->cclk);
+       clk_disable_unprepare(controller->iclk);
+
        return 0;
 }
 
@@ -945,6 +949,15 @@ static int spi_qup_pm_resume_runtime(struct device *device)
        struct spi_master *master = dev_get_drvdata(device);
        struct spi_qup *controller = spi_master_get_devdata(master);
        u32 config;
+       int ret;
+
+       ret = clk_prepare_enable(controller->iclk);
+       if (ret)
+               return ret;
+
+       ret = clk_prepare_enable(controller->cclk);
+       if (ret)
+               return ret;
 
        /* Disable clocks auto gaiting */
        config = readl_relaxed(controller->base + QUP_CONFIG);
@@ -1017,6 +1030,8 @@ static int spi_qup_remove(struct platform_device *pdev)
 
        pm_runtime_put_noidle(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
+       spi_master_put(master);
+
        return 0;
 }
 
index 6c6c0013ec7a92732d2540ade876c561359f8d70..cd89682065b98d7a06a7d7064ec30aff2a5e7fef 100644 (file)
@@ -744,10 +744,8 @@ static int rockchip_spi_probe(struct platform_device *pdev)
        rs->dma_rx.ch = dma_request_chan(rs->dev, "rx");
        if (IS_ERR(rs->dma_rx.ch)) {
                if (PTR_ERR(rs->dma_rx.ch) == -EPROBE_DEFER) {
-                       dma_release_channel(rs->dma_tx.ch);
-                       rs->dma_tx.ch = NULL;
                        ret = -EPROBE_DEFER;
-                       goto err_get_fifo_len;
+                       goto err_free_dma_tx;
                }
                dev_warn(rs->dev, "Failed to request RX DMA channel\n");
                rs->dma_rx.ch = NULL;
@@ -775,10 +773,11 @@ static int rockchip_spi_probe(struct platform_device *pdev)
 
 err_register_master:
        pm_runtime_disable(&pdev->dev);
-       if (rs->dma_tx.ch)
-               dma_release_channel(rs->dma_tx.ch);
        if (rs->dma_rx.ch)
                dma_release_channel(rs->dma_rx.ch);
+err_free_dma_tx:
+       if (rs->dma_tx.ch)
+               dma_release_channel(rs->dma_tx.ch);
 err_get_fifo_len:
        clk_disable_unprepare(rs->spiclk);
 err_spiclk_enable:
index f17c0abe299f418697774fa9351e724600442d51..d5adf9f31602385e764f50980dd93786812967b6 100644 (file)
@@ -345,12 +345,13 @@ static int spi_st_probe(struct platform_device *pdev)
        spi_st->clk = devm_clk_get(&pdev->dev, "ssc");
        if (IS_ERR(spi_st->clk)) {
                dev_err(&pdev->dev, "Unable to request clock\n");
-               return PTR_ERR(spi_st->clk);
+               ret = PTR_ERR(spi_st->clk);
+               goto put_master;
        }
 
        ret = spi_st_clk_enable(spi_st);
        if (ret)
-               return ret;
+               goto put_master;
 
        init_completion(&spi_st->done);
 
@@ -408,7 +409,8 @@ static int spi_st_probe(struct platform_device *pdev)
 
 clk_disable:
        spi_st_clk_disable(spi_st);
-
+put_master:
+       spi_master_put(master);
        return ret;
 }
 
index aab9b492c627ada109f4cb5ac9840b5b79cdf2f6..18aeaceee2862017f4cd4bb13a5e3c6bd36f6ea7 100644 (file)
@@ -360,7 +360,7 @@ static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
 
        ret = clk_enable(xqspi->refclk);
        if (ret)
-               goto clk_err;
+               return ret;
 
        ret = clk_enable(xqspi->pclk);
        if (ret)
@@ -369,6 +369,7 @@ static int zynqmp_prepare_transfer_hardware(struct spi_master *master)
        zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, GQSPI_EN_MASK);
        return 0;
 clk_err:
+       clk_disable(xqspi->refclk);
        return ret;
 }
 
index 0239b45eed928697d9ccb10c2a83c1ef51958c16..77e6e45951f4c5e1c69a81b6d4ddf370d7f1a075 100644 (file)
@@ -717,9 +717,11 @@ static int spi_map_buf(struct spi_master *master, struct device *dev,
        if (vmalloced_buf) {
                desc_len = min_t(int, max_seg_size, PAGE_SIZE);
                sgs = DIV_ROUND_UP(len + offset_in_page(buf), desc_len);
-       } else {
+       } else if (virt_addr_valid(buf)) {
                desc_len = min_t(int, max_seg_size, master->max_dma_len);
                sgs = DIV_ROUND_UP(len, desc_len);
+       } else {
+               return -EINVAL;
        }
 
        ret = sg_alloc_table(sgt, sgs, GFP_KERNEL);
@@ -933,7 +935,7 @@ static int spi_map_msg(struct spi_master *master, struct spi_message *msg)
  * spi_transfer_one_message - Default implementation of transfer_one_message()
  *
  * This is a standard implementation of transfer_one_message() for
- * drivers which impelment a transfer_one() operation.  It provides
+ * drivers which implement a transfer_one() operation.  It provides
  * standard handling of delays and chip select management.
  */
 static int spi_transfer_one_message(struct spi_master *master,
@@ -1764,6 +1766,7 @@ struct spi_master *spi_alloc_master(struct device *dev, unsigned size)
        master->num_chipselect = 1;
        master->dev.class = &spi_master_class;
        master->dev.parent = dev;
+       pm_suspend_ignore_children(&master->dev, true);
        spi_master_set_devdata(master, &master[1]);
 
        return master;
index 163f21a1298d89bcf21ccac195805b45e1f2adac..e389009fca42c0caa447dc8f9b37360e3565a456 100644 (file)
@@ -42,23 +42,33 @@ static inline struct spinand_state *mtd_to_state(struct mtd_info *mtd)
 static int enable_hw_ecc;
 static int enable_read_hw_ecc;
 
-static struct nand_ecclayout spinand_oob_64 = {
-       .eccbytes = 24,
-       .eccpos = {
-               1, 2, 3, 4, 5, 6,
-               17, 18, 19, 20, 21, 22,
-               33, 34, 35, 36, 37, 38,
-               49, 50, 51, 52, 53, 54, },
-       .oobfree = {
-               {.offset = 8,
-                       .length = 8},
-               {.offset = 24,
-                       .length = 8},
-               {.offset = 40,
-                       .length = 8},
-               {.offset = 56,
-                       .length = 8},
-       }
+/* ECC region of OOB section @section: 6 bytes starting one byte into each
+ * 16-byte stride. Sections above 3 yield -ERANGE.
+ */
+static int spinand_ooblayout_64_ecc(struct mtd_info *mtd, int section,
+                                   struct mtd_oob_region *oobregion)
+{
+       if (section >= 4)
+               return -ERANGE;
+
+       oobregion->length = 6;
+       oobregion->offset = 16 * section + 1;
+
+       return 0;
+}
+
+/* Free region of OOB section @section: 8 bytes starting eight bytes into
+ * each 16-byte stride. Sections above 3 yield -ERANGE.
+ */
+static int spinand_ooblayout_64_free(struct mtd_info *mtd, int section,
+                                    struct mtd_oob_region *oobregion)
+{
+       if (section >= 4)
+               return -ERANGE;
+
+       oobregion->length = 8;
+       oobregion->offset = 16 * section + 8;
+
+       return 0;
+}
+
+/* OOB layout callbacks: ECC and free regions of the 64-byte OOB area */
+static const struct mtd_ooblayout_ops spinand_oob_64_ops = {
+       .ecc = spinand_ooblayout_64_ecc,
+       .free = spinand_ooblayout_64_free,
 };
 #endif
 
@@ -886,11 +896,11 @@ static int spinand_probe(struct spi_device *spi_nand)
 
        chip->ecc.strength = 1;
        chip->ecc.total = chip->ecc.steps * chip->ecc.bytes;
-       chip->ecc.layout = &spinand_oob_64;
        chip->ecc.read_page = spinand_read_page_hwecc;
        chip->ecc.write_page = spinand_write_page_hwecc;
 #else
        chip->ecc.mode  = NAND_ECC_SOFT;
+       chip->ecc.algo  = NAND_ECC_HAMMING;
        if (spinand_disable_ecc(spi_nand) < 0)
                dev_info(&spi_nand->dev, "%s: disable ecc failed!\n",
                         __func__);
@@ -912,6 +922,9 @@ static int spinand_probe(struct spi_device *spi_nand)
 
        mtd->dev.parent = &spi_nand->dev;
        mtd->oobsize = 64;
+#ifdef CONFIG_MTD_SPINAND_ONDIEECC
+       mtd_set_ooblayout(mtd, &spinand_oob_64_ops);
+#endif
 
        if (nand_scan(mtd, 1))
                return -ENXIO;
index d89d60c8b6cf09cf934f220cdf3fb936ff1040bc..2d702ca6556f5b6185a95ec007ada7b124226272 100644 (file)
@@ -260,16 +260,6 @@ config ARMADA_THERMAL
          Enable this option if you want to have support for thermal management
          controller present in Armada 370 and Armada XP SoC.
 
-config TEGRA_SOCTHERM
-       tristate "Tegra SOCTHERM thermal management"
-       depends on ARCH_TEGRA
-       help
-         Enable this option for integrated thermal management support on NVIDIA
-         Tegra124 systems-on-chip. The driver supports four thermal zones
-         (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
-         zones to manage temperatures. This option is also required for the
-         emergency thermal reset (thermtrip) feature to function.
-
 config DB8500_CPUFREQ_COOLING
        tristate "DB8500 cpufreq cooling"
        depends on ARCH_U8500 || COMPILE_TEST
@@ -377,6 +367,17 @@ depends on ARCH_STI && OF
 source "drivers/thermal/st/Kconfig"
 endmenu
 
+config TANGO_THERMAL
+       tristate "Tango thermal management"
+       depends on ARCH_TANGO || COMPILE_TEST
+       help
+         Enable the Tango thermal driver, which supports the primitive
+         temperature sensor embedded in Tango chips since the SMP8758.
+         This sensor only generates a 1-bit signal to indicate whether
+         the die temperature exceeds a programmable threshold.
+
+source "drivers/thermal/tegra/Kconfig"
+
 config QCOM_SPMI_TEMP_ALARM
        tristate "Qualcomm SPMI PMIC Temperature Alarm"
        depends on OF && SPMI && IIO
@@ -388,4 +389,14 @@ config QCOM_SPMI_TEMP_ALARM
          real time die temperature if an ADC is present or an estimate of the
          temperature based upon the over temperature stage value.
 
+config GENERIC_ADC_THERMAL
+       tristate "Generic ADC based thermal sensor"
+       depends on IIO
+       help
+         This enables a thermal sysfs driver for the temperature sensor
+         which is connected to the General Purpose ADC. The ADC channel
+         is read via IIO framework and the channel information is provided
+         to this driver. This driver reports the temperature by reading ADC
+         channel and converts it to temperature based on lookup table.
+
 endif
index 8e9cbc3b5679ae221bcdd5d6bb82e91ebcfb3f1d..10b07c14f8a9f75363323d0c69542606d8dfed96 100644 (file)
@@ -35,6 +35,7 @@ obj-y                         += samsung/
 obj-$(CONFIG_DOVE_THERMAL)     += dove_thermal.o
 obj-$(CONFIG_DB8500_THERMAL)   += db8500_thermal.o
 obj-$(CONFIG_ARMADA_THERMAL)   += armada_thermal.o
+obj-$(CONFIG_TANGO_THERMAL)    += tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)      += imx_thermal.o
 obj-$(CONFIG_DB8500_CPUFREQ_COOLING)   += db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
@@ -46,6 +47,7 @@ obj-$(CONFIG_TI_SOC_THERMAL)  += ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_INTEL_PCH_THERMAL)        += intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)       += st/
-obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra_soctherm.o
+obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra/
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
 obj-$(CONFIG_MTK_THERMAL)      += mtk_thermal.o
+obj-$(CONFIG_GENERIC_ADC_THERMAL)      += thermal-generic-adc.o
index 70836c5b89bc411d3a1b91ebea91c3b8f92b4dba..fc52016d4e85be590e15f4e1976b8378d2c456d2 100644 (file)
@@ -29,7 +29,13 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
        struct thermal_instance *instance;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
-       tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
+
+       if (!tz->ops->get_trip_hyst) {
+               pr_warn_once("Undefined get_trip_hyst for thermal zone %s - "
+                               "running with default hysteresis zero\n", tz->type);
+               trip_hyst = 0;
+       } else
+               tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
 
        dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
                                trip, trip_temp, tz->temperature,
index 5e820b5415063b17fdd2982c0adfd5885db769b9..97fad8f51e1c891ec26a296e25a6059419e529a0 100644 (file)
@@ -160,7 +160,7 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp)
        struct hisi_thermal_sensor *sensor = _sensor;
        struct hisi_thermal_data *data = sensor->thermal;
 
-       int sensor_id = 0, i;
+       int sensor_id = -1, i;
        long max_temp = 0;
 
        *temp = hisi_thermal_get_sensor_temp(data, sensor);
@@ -168,12 +168,19 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp)
        sensor->sensor_temp = *temp;
 
        for (i = 0; i < HISI_MAX_SENSORS; i++) {
+               if (!data->sensors[i].tzd)
+                       continue;
+
                if (data->sensors[i].sensor_temp >= max_temp) {
                        max_temp = data->sensors[i].sensor_temp;
                        sensor_id = i;
                }
        }
 
+       /* If no sensor has been enabled, skip enabling the irq */
+       if (sensor_id == -1)
+               return 0;
+
        mutex_lock(&data->thermal_lock);
        data->irq_bind_sensor = sensor_id;
        mutex_unlock(&data->thermal_lock);
@@ -226,8 +233,12 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
                 sensor->thres_temp / 1000);
        mutex_unlock(&data->thermal_lock);
 
-       for (i = 0; i < HISI_MAX_SENSORS; i++)
+       for (i = 0; i < HISI_MAX_SENSORS; i++) {
+               if (!data->sensors[i].tzd)
+                       continue;
+
                thermal_zone_device_update(data->sensors[i].tzd);
+       }
 
        return IRQ_HANDLED;
 }
@@ -243,10 +254,11 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
        sensor->id = index;
        sensor->thermal = data;
 
-       sensor->tzd = thermal_zone_of_sensor_register(&pdev->dev, sensor->id,
-                               sensor, &hisi_of_thermal_ops);
+       sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
+                               sensor->id, sensor, &hisi_of_thermal_ops);
        if (IS_ERR(sensor->tzd)) {
                ret = PTR_ERR(sensor->tzd);
+               sensor->tzd = NULL;
                dev_err(&pdev->dev, "failed to register sensor id %d: %d\n",
                        sensor->id, ret);
                return ret;
@@ -331,28 +343,21 @@ static int hisi_thermal_probe(struct platform_device *pdev)
                return ret;
        }
 
+       hisi_thermal_enable_bind_irq_sensor(data);
+       irq_get_irqchip_state(data->irq, IRQCHIP_STATE_MASKED,
+                             &data->irq_enabled);
+
        for (i = 0; i < HISI_MAX_SENSORS; ++i) {
                ret = hisi_thermal_register_sensor(pdev, data,
                                                   &data->sensors[i], i);
-               if (ret) {
+               if (ret)
                        dev_err(&pdev->dev,
                                "failed to register thermal sensor: %d\n", ret);
-                       goto err_get_sensor_data;
-               }
+               else
+                       hisi_thermal_toggle_sensor(&data->sensors[i], true);
        }
 
-       hisi_thermal_enable_bind_irq_sensor(data);
-       data->irq_enabled = true;
-
-       for (i = 0; i < HISI_MAX_SENSORS; i++)
-               hisi_thermal_toggle_sensor(&data->sensors[i], true);
-
        return 0;
-
-err_get_sensor_data:
-       clk_disable_unprepare(data->clk);
-
-       return ret;
 }
 
 static int hisi_thermal_remove(struct platform_device *pdev)
@@ -363,8 +368,10 @@ static int hisi_thermal_remove(struct platform_device *pdev)
        for (i = 0; i < HISI_MAX_SENSORS; i++) {
                struct hisi_thermal_sensor *sensor = &data->sensors[i];
 
+               if (!sensor->tzd)
+                       continue;
+
                hisi_thermal_toggle_sensor(sensor, false);
-               thermal_zone_of_sensor_unregister(&pdev->dev, sensor->tzd);
        }
 
        hisi_thermal_disable_sensor(data);
index 36fa724a36c851d463afa53a67d1a8526efb75dc..42c1ac057bad85c3f3237ea5cc09bccc1aad7010 100644 (file)
@@ -198,49 +198,33 @@ static struct thermal_zone_device_ops proc_thermal_local_ops = {
        .get_temp       = proc_thermal_get_zone_temp,
 };
 
-static int proc_thermal_add(struct device *dev,
-                           struct proc_thermal_device **priv)
+static int proc_thermal_read_ppcc(struct proc_thermal_device *proc_priv)
 {
-       struct proc_thermal_device *proc_priv;
-       struct acpi_device *adev;
+       int i;
        acpi_status status;
        struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
        union acpi_object *elements, *ppcc;
        union acpi_object *p;
-       unsigned long long tmp;
-       struct thermal_zone_device_ops *ops = NULL;
-       int i;
-       int ret;
-
-       adev = ACPI_COMPANION(dev);
-       if (!adev)
-               return -ENODEV;
+       int ret = 0;
 
-       status = acpi_evaluate_object(adev->handle, "PPCC", NULL, &buf);
+       status = acpi_evaluate_object(proc_priv->adev->handle, "PPCC",
+                                     NULL, &buf);
        if (ACPI_FAILURE(status))
                return -ENODEV;
 
        p = buf.pointer;
        if (!p || (p->type != ACPI_TYPE_PACKAGE)) {
-               dev_err(dev, "Invalid PPCC data\n");
+               dev_err(proc_priv->dev, "Invalid PPCC data\n");
                ret = -EFAULT;
                goto free_buffer;
        }
+
        if (!p->package.count) {
-               dev_err(dev, "Invalid PPCC package size\n");
+               dev_err(proc_priv->dev, "Invalid PPCC package size\n");
                ret = -EFAULT;
                goto free_buffer;
        }
 
-       proc_priv = devm_kzalloc(dev, sizeof(*proc_priv), GFP_KERNEL);
-       if (!proc_priv) {
-               ret = -ENOMEM;
-               goto free_buffer;
-       }
-
-       proc_priv->dev = dev;
-       proc_priv->adev = adev;
-
        for (i = 0; i < min((int)p->package.count - 1, 2); ++i) {
                elements = &(p->package.elements[i+1]);
                if (elements->type != ACPI_TYPE_PACKAGE ||
@@ -257,12 +241,62 @@ static int proc_thermal_add(struct device *dev,
                proc_priv->power_limits[i].step_uw = ppcc[5].integer.value;
        }
 
+free_buffer:
+       kfree(buf.pointer);
+
+       return ret;
+}
+
+#define PROC_POWER_CAPABILITY_CHANGED  0x83
+static void proc_thermal_notify(acpi_handle handle, u32 event, void *data)
+{
+       struct proc_thermal_device *proc_priv = data;
+
+       if (!proc_priv)
+               return;
+
+       switch (event) {
+       case PROC_POWER_CAPABILITY_CHANGED:
+               proc_thermal_read_ppcc(proc_priv);
+               int340x_thermal_zone_device_update(proc_priv->int340x_zone);
+               break;
+       default:
+               dev_err(proc_priv->dev, "Unsupported event [0x%x]\n", event);
+               break;
+       }
+}
+
+
+static int proc_thermal_add(struct device *dev,
+                           struct proc_thermal_device **priv)
+{
+       struct proc_thermal_device *proc_priv;
+       struct acpi_device *adev;
+       acpi_status status;
+       unsigned long long tmp;
+       struct thermal_zone_device_ops *ops = NULL;
+       int ret;
+
+       adev = ACPI_COMPANION(dev);
+       if (!adev)
+               return -ENODEV;
+
+       proc_priv = devm_kzalloc(dev, sizeof(*proc_priv), GFP_KERNEL);
+       if (!proc_priv)
+               return -ENOMEM;
+
+       proc_priv->dev = dev;
+       proc_priv->adev = adev;
        *priv = proc_priv;
 
-       ret = sysfs_create_group(&dev->kobj,
-                                &power_limit_attribute_group);
+       ret = proc_thermal_read_ppcc(proc_priv);
+       if (!ret) {
+               ret = sysfs_create_group(&dev->kobj,
+                                        &power_limit_attribute_group);
+
+       }
        if (ret)
-               goto free_buffer;
+               return ret;
 
        status = acpi_evaluate_integer(adev->handle, "_TMP", NULL, &tmp);
        if (ACPI_FAILURE(status)) {
@@ -274,20 +308,32 @@ static int proc_thermal_add(struct device *dev,
 
        proc_priv->int340x_zone = int340x_thermal_zone_add(adev, ops);
        if (IS_ERR(proc_priv->int340x_zone)) {
-               sysfs_remove_group(&proc_priv->dev->kobj,
-                          &power_limit_attribute_group);
                ret = PTR_ERR(proc_priv->int340x_zone);
+               goto remove_group;
        } else
                ret = 0;
 
-free_buffer:
-       kfree(buf.pointer);
+       ret = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+                                         proc_thermal_notify,
+                                         (void *)proc_priv);
+       if (ret)
+               goto remove_zone;
+
+       return 0;
+
+remove_zone:
+       int340x_thermal_zone_remove(proc_priv->int340x_zone);
+remove_group:
+       sysfs_remove_group(&proc_priv->dev->kobj,
+                          &power_limit_attribute_group);
 
        return ret;
 }
 
 static void proc_thermal_remove(struct proc_thermal_device *proc_priv)
 {
+       acpi_remove_notify_handler(proc_priv->adev->handle,
+                                  ACPI_DEVICE_NOTIFY, proc_thermal_notify);
        int340x_thermal_zone_remove(proc_priv->int340x_zone);
        sysfs_remove_group(&proc_priv->dev->kobj,
                           &power_limit_attribute_group);
index 6c79588251d59b53e290908a7fd61705c6db3211..015ce2eb6eb7ba0a254e2918c8d9710011283fda 100644 (file)
@@ -510,12 +510,6 @@ static int start_power_clamp(void)
        unsigned long cpu;
        struct task_struct *thread;
 
-       /* check if pkg cstate counter is completely 0, abort in this case */
-       if (!has_pkg_state_counter()) {
-               pr_err("pkg cstate counter not functional, abort\n");
-               return -EINVAL;
-       }
-
        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();
@@ -672,35 +666,11 @@ static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .set_cur_state = powerclamp_set_cur_state,
 };
 
-/* runs on Nehalem and later */
 static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
-       { X86_VENDOR_INTEL, 6, 0x1a},
-       { X86_VENDOR_INTEL, 6, 0x1c},
-       { X86_VENDOR_INTEL, 6, 0x1e},
-       { X86_VENDOR_INTEL, 6, 0x1f},
-       { X86_VENDOR_INTEL, 6, 0x25},
-       { X86_VENDOR_INTEL, 6, 0x26},
-       { X86_VENDOR_INTEL, 6, 0x2a},
-       { X86_VENDOR_INTEL, 6, 0x2c},
-       { X86_VENDOR_INTEL, 6, 0x2d},
-       { X86_VENDOR_INTEL, 6, 0x2e},
-       { X86_VENDOR_INTEL, 6, 0x2f},
-       { X86_VENDOR_INTEL, 6, 0x37},
-       { X86_VENDOR_INTEL, 6, 0x3a},
-       { X86_VENDOR_INTEL, 6, 0x3c},
-       { X86_VENDOR_INTEL, 6, 0x3d},
-       { X86_VENDOR_INTEL, 6, 0x3e},
-       { X86_VENDOR_INTEL, 6, 0x3f},
-       { X86_VENDOR_INTEL, 6, 0x45},
-       { X86_VENDOR_INTEL, 6, 0x46},
-       { X86_VENDOR_INTEL, 6, 0x47},
-       { X86_VENDOR_INTEL, 6, 0x4c},
-       { X86_VENDOR_INTEL, 6, 0x4d},
-       { X86_VENDOR_INTEL, 6, 0x4e},
-       { X86_VENDOR_INTEL, 6, 0x4f},
-       { X86_VENDOR_INTEL, 6, 0x56},
-       { X86_VENDOR_INTEL, 6, 0x57},
-       { X86_VENDOR_INTEL, 6, 0x5e},
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_ARAT },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_NONSTOP_TSC },
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_CONSTANT_TSC},
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
@@ -712,11 +682,12 @@ static int __init powerclamp_probe(void)
                                boot_cpu_data.x86, boot_cpu_data.x86_model);
                return -ENODEV;
        }
-       if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
-               !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
-               !boot_cpu_has(X86_FEATURE_MWAIT) ||
-               !boot_cpu_has(X86_FEATURE_ARAT))
+
+       /* The goal for idle time alignment is to achieve package cstate. */
+       if (!has_pkg_state_counter()) {
+               pr_info("No package C-state available\n");
+               return -ENODEV;
+       }
 
        /* find the deepest mwait value */
        find_target_mwait();
index 507632b9648e3e193942ca3cc190c2cfab705d66..262ab0a2266f71242708cca87e524ba2bbf179f1 100644 (file)
@@ -144,7 +144,6 @@ struct mtk_thermal {
        s32 o_slope;
        s32 vts[MT8173_NUM_SENSORS];
 
-       struct thermal_zone_device *tzd;
 };
 
 struct mtk_thermal_bank_cfg {
@@ -572,16 +571,11 @@ static int mtk_thermal_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, mt);
 
-       mt->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
-                               &mtk_thermal_ops);
-       if (IS_ERR(mt->tzd))
-               goto err_register;
+       devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
+                                            &mtk_thermal_ops);
 
        return 0;
 
-err_register:
-       clk_disable_unprepare(mt->clk_peri_therm);
-
 err_disable_clk_auxadc:
        clk_disable_unprepare(mt->clk_auxadc);
 
@@ -592,8 +586,6 @@ static int mtk_thermal_remove(struct platform_device *pdev)
 {
        struct mtk_thermal *mt = platform_get_drvdata(pdev);
 
-       thermal_zone_of_sensor_unregister(&pdev->dev, mt->tzd);
-
        clk_disable_unprepare(mt->clk_peri_therm);
        clk_disable_unprepare(mt->clk_auxadc);
 
index d8ec44b194d64a012c1dff1472c233f83f1119b0..b8e509c60848e4f94dc621185d334b59f8151c1a 100644 (file)
@@ -331,6 +331,14 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
        if (trip >= data->ntrips || trip < 0)
                return -EDOM;
 
+       if (data->ops->set_trip_temp) {
+               int ret;
+
+               ret = data->ops->set_trip_temp(data->sensor_data, trip, temp);
+               if (ret)
+                       return ret;
+       }
+
        /* thermal framework should take care of data->mask & (1 << trip) */
        data->trips[trip].temperature = temp;
 
@@ -906,7 +914,7 @@ finish:
        return tz;
 
 free_tbps:
-       for (i = 0; i < tz->num_tbps; i++)
+       for (i = i - 1; i >= 0; i--)
                of_node_put(tz->tbps[i].cooling_device);
        kfree(tz->tbps);
 free_trips:
index b677aada5b52860948d6731b0c164573aad4b9f9..f8a3c60bef94bcf32ca388ab0c2de8de034dab7c 100644 (file)
@@ -260,7 +260,7 @@ static int qpnp_tm_probe(struct platform_device *pdev)
        if (ret < 0)
                goto fail;
 
-       chip->tz_dev = thermal_zone_of_sensor_register(&pdev->dev, 0, chip,
+       chip->tz_dev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, chip,
                                                        &qpnp_tm_sensor_ops);
        if (IS_ERR(chip->tz_dev)) {
                dev_err(&pdev->dev, "failed to register sensor\n");
@@ -281,7 +281,6 @@ static int qpnp_tm_remove(struct platform_device *pdev)
 {
        struct qpnp_tm_chip *chip = dev_get_drvdata(&pdev->dev);
 
-       thermal_zone_of_sensor_unregister(&pdev->dev, chip->tz_dev);
        if (!IS_ERR(chip->adc))
                iio_channel_release(chip->adc);
 
index 82daba09e1503a53d67e6f26705f80e563208fe7..71a339271fa5fe9d796d86be9077ebbc13d4f0dc 100644 (file)
@@ -492,7 +492,7 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                        goto error_unregister;
 
                if (of_data == USE_OF_THERMAL)
-                       priv->zone = thermal_zone_of_sensor_register(
+                       priv->zone = devm_thermal_zone_of_sensor_register(
                                                dev, i, priv,
                                                &rcar_thermal_zone_of_ops);
                else
index 233a564442a0ef0028278d1799393bb8d1f41be2..5d491f16a866c24607bf87a0eaf12a7f399ac72f 100644 (file)
@@ -1,7 +1,5 @@
 /*
- * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
- *
- * Copyright (c) 2015, Fuzhou Rockchip Electronics Co., Ltd
+ * Copyright (c) 2014-2016, Fuzhou Rockchip Electronics Co., Ltd
  * Caesar Wang <wxt@rock-chips.com>
  *
  * This program is free software; you can redistribute it and/or modify it
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/platform_device.h>
+#include <linux/regmap.h>
 #include <linux/reset.h>
 #include <linux/thermal.h>
+#include <linux/mfd/syscon.h>
 #include <linux/pinctrl/consumer.h>
 
 /**
@@ -73,7 +73,7 @@ enum adc_sort_mode {
 #define SOC_MAX_SENSORS        2
 
 /**
- * struct chip_tsadc_table: hold information about chip-specific differences
+ * struct chip_tsadc_table - hold information about chip-specific differences
  * @id: conversion table
  * @length: size of conversion table
  * @data_mask: mask to apply on data inputs
@@ -86,6 +86,20 @@ struct chip_tsadc_table {
        enum adc_sort_mode mode;
 };
 
+/**
+ * struct rockchip_tsadc_chip - hold the private data of tsadc chip
+ * @chn_id[SOC_MAX_SENSORS]: the sensor id of chip correspond to the channel
+ * @chn_num: the channel number of tsadc chip
+ * @tshut_temp: the hardware-controlled shutdown temperature value
+ * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO)
+ * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH)
+ * @initialize: SoC special initialize tsadc controller method
+ * @irq_ack: clear the interrupt
+ * @get_temp: get the temperature
+ * @set_tshut_temp: set the hardware-controlled shutdown temperature
+ * @set_tshut_mode: set the hardware-controlled shutdown mode
+ * @table: the chip-specific conversion table
+ */
 struct rockchip_tsadc_chip {
        /* The sensor id of chip correspond to the ADC channel */
        int chn_id[SOC_MAX_SENSORS];
@@ -97,7 +111,8 @@ struct rockchip_tsadc_chip {
        enum tshut_polarity tshut_polarity;
 
        /* Chip-wide methods */
-       void (*initialize)(void __iomem *reg, enum tshut_polarity p);
+       void (*initialize)(struct regmap *grf,
+                          void __iomem *reg, enum tshut_polarity p);
        void (*irq_ack)(void __iomem *reg);
        void (*control)(void __iomem *reg, bool on);
 
@@ -112,12 +127,32 @@ struct rockchip_tsadc_chip {
        struct chip_tsadc_table table;
 };
 
+/**
+ * struct rockchip_thermal_sensor - hold the information of thermal sensor
+ * @thermal:  pointer to the platform/configuration data
+ * @tzd: pointer to a thermal zone
+ * @id: identifier of the thermal sensor
+ */
 struct rockchip_thermal_sensor {
        struct rockchip_thermal_data *thermal;
        struct thermal_zone_device *tzd;
        int id;
 };
 
+/**
+ * struct rockchip_thermal_data - hold the private data of thermal driver
+ * @chip: pointer to the platform/configuration data
+ * @pdev: platform device of thermal
+ * @reset: the reset controller of tsadc
+ * @sensors[SOC_MAX_SENSORS]: the thermal sensor
+ * @clk: the controller clock is divided by the external 24MHz
+ * @pclk: the advanced peripherals bus clock
+ * @grf: the general register file will be used to do static set by software
+ * @regs: the base address of tsadc controller
+ * @tshut_temp: the hardware-controlled shutdown temperature value
+ * @tshut_mode: the hardware-controlled shutdown mode (0:CRU 1:GPIO)
+ * @tshut_polarity: the hardware-controlled active polarity (0:LOW 1:HIGH)
+ */
 struct rockchip_thermal_data {
        const struct rockchip_tsadc_chip *chip;
        struct platform_device *pdev;
@@ -128,6 +163,7 @@ struct rockchip_thermal_data {
        struct clk *clk;
        struct clk *pclk;
 
+       struct regmap *grf;
        void __iomem *regs;
 
        int tshut_temp;
@@ -142,6 +178,7 @@ struct rockchip_thermal_data {
  * TSADCV3_* are used for newer SoCs than RK3288. (e.g: RK3228, RK3399)
  *
  */
+#define TSADCV2_USER_CON                       0x00
 #define TSADCV2_AUTO_CON                       0x04
 #define TSADCV2_INT_EN                         0x08
 #define TSADCV2_INT_PD                         0x0c
@@ -155,12 +192,7 @@ struct rockchip_thermal_data {
 #define TSADCV2_AUTO_EN                                BIT(0)
 #define TSADCV2_AUTO_SRC_EN(chn)               BIT(4 + (chn))
 #define TSADCV2_AUTO_TSHUT_POLARITY_HIGH       BIT(8)
-/**
- * TSADCV1_AUTO_Q_SEL_EN:
- * whether select (1024 - tsadc_q) as output
- * 1'b0:use tsadc_q as output(temperature-code is rising sequence)
- * 1'b1:use(1024 - tsadc_q) as output (temperature-code is falling sequence)
- */
+
 #define TSADCV3_AUTO_Q_SEL_EN                  BIT(1)
 
 #define TSADCV2_INT_SRC_EN(chn)                        BIT(chn)
@@ -177,19 +209,32 @@ struct rockchip_thermal_data {
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT     4
 #define TSADCV2_AUTO_PERIOD_TIME               250 /* msec */
 #define TSADCV2_AUTO_PERIOD_HT_TIME            50  /* msec */
+#define TSADCV2_USER_INTER_PD_SOC              0x340 /* 13 clocks */
 
-struct tsadc_table {
-       u32 code;
-       int temp;
-};
+#define GRF_SARADC_TESTBIT                     0x0e644
+#define GRF_TSADC_TESTBIT_L                    0x0e648
+#define GRF_TSADC_TESTBIT_H                    0x0e64c
+
+#define GRF_TSADC_TSEN_PD_ON                   (0x30003 << 0)
+#define GRF_TSADC_TSEN_PD_OFF                  (0x30000 << 0)
+#define GRF_SARADC_TESTBIT_ON                  (0x10001 << 2)
+#define GRF_TSADC_TESTBIT_H_ON                 (0x10001 << 2)
 
 /**
+ * struct tsadc_table - code to temperature conversion table
+ * @code: the value of adc channel
+ * @temp: the temperature
  * Note:
- * Code to Temperature mapping of the Temperature sensor is a piece wise linear
+ * code to temperature mapping of the temperature sensor is a piece wise linear
  * curve.Any temperature, code faling between to 2 give temperatures can be
  * linearly interpolated.
- * Code to Temperature mapping should be updated based on sillcon results.
+ * Code to Temperature mapping should be updated based on manufacturer results.
  */
+struct tsadc_table {
+       u32 code;
+       int temp;
+};
+
 static const struct tsadc_table rk3228_code_table[] = {
        {0, -40000},
        {588, -40000},
@@ -308,40 +353,40 @@ static const struct tsadc_table rk3368_code_table[] = {
 
 static const struct tsadc_table rk3399_code_table[] = {
        {0, -40000},
-       {593, -40000},
-       {598, -35000},
-       {603, -30000},
-       {609, -25000},
-       {614, -20000},
-       {619, -15000},
-       {625, -10000},
-       {630, -5000},
-       {635, 0},
-       {641, 5000},
-       {646, 10000},
-       {651, 15000},
-       {657, 20000},
-       {662, 25000},
-       {667, 30000},
-       {673, 35000},
-       {678, 40000},
-       {684, 45000},
-       {689, 50000},
-       {694, 55000},
-       {700, 60000},
-       {705, 65000},
-       {711, 70000},
-       {716, 75000},
-       {722, 80000},
-       {727, 85000},
-       {733, 90000},
-       {738, 95000},
-       {743, 100000},
-       {749, 105000},
-       {754, 110000},
-       {760, 115000},
-       {765, 120000},
-       {771, 125000},
+       {402, -40000},
+       {410, -35000},
+       {419, -30000},
+       {427, -25000},
+       {436, -20000},
+       {444, -15000},
+       {453, -10000},
+       {461, -5000},
+       {470, 0},
+       {478, 5000},
+       {487, 10000},
+       {496, 15000},
+       {504, 20000},
+       {513, 25000},
+       {521, 30000},
+       {530, 35000},
+       {538, 40000},
+       {547, 45000},
+       {555, 50000},
+       {564, 55000},
+       {573, 60000},
+       {581, 65000},
+       {590, 70000},
+       {599, 75000},
+       {607, 80000},
+       {616, 85000},
+       {624, 90000},
+       {633, 95000},
+       {642, 100000},
+       {650, 105000},
+       {659, 110000},
+       {668, 115000},
+       {677, 120000},
+       {685, 125000},
        {TSADCV3_DATA_MASK, 125000},
 };
 
@@ -405,8 +450,8 @@ static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
                        return -EAGAIN;         /* Incorrect reading */
 
                while (low <= high) {
-                       if (code >= table.id[mid - 1].code &&
-                           code < table.id[mid].code)
+                       if (code <= table.id[mid].code &&
+                           code > table.id[mid - 1].code)
                                break;
                        else if (code > table.id[mid].code)
                                low = mid + 1;
@@ -449,7 +494,7 @@ static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
  *     If the temperature is higher than COMP_INT or COMP_SHUT for
  *     "debounce" times, TSADC controller will generate interrupt or TSHUT.
  */
-static void rk_tsadcv2_initialize(void __iomem *regs,
+static void rk_tsadcv2_initialize(struct regmap *grf, void __iomem *regs,
                                  enum tshut_polarity tshut_polarity)
 {
        if (tshut_polarity == TSHUT_HIGH_ACTIVE)
@@ -466,6 +511,62 @@ static void rk_tsadcv2_initialize(void __iomem *regs,
                       regs + TSADCV2_AUTO_PERIOD_HT);
        writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
                       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
+
+       if (IS_ERR(grf)) {
+               pr_warn("%s: Missing rockchip,grf property\n", __func__);
+               return;
+       }
+}
+
+/**
+ * rk_tsadcv3_initialize - initialize TSADC Controller.
+ * @grf: GRF regmap, or an ERR_PTR() value when no GRF regmap is available
+ * @regs: the base address of tsadc controller
+ * @tshut_polarity: the hardware-controlled active polarity of TSHUT
+ *
+ * (1) Run the tsadc control power sequence.
+ * (2) Set the TSHUT polarity in TSADCV2_AUTO_CON.
+ * (3) Set TSADCV2_AUTO_PERIOD: the interleave between every two accesses
+ *     of TSADC in normal operation.
+ * (4) Set TSADCV2_AUTO_PERIOD_HT: the interleave between every two accesses
+ *     of TSADC once the temperature is higher than COMP_SHUT or COMP_INT.
+ * (5) Set TSADCV2_HIGHT_INT_DEBOUNCE and TSADCV2_HIGHT_TSHUT_DEBOUNCE:
+ *     if the temperature is higher than COMP_INT or COMP_SHUT for
+ *     "debounce" times, the TSADC controller generates an interrupt or
+ *     TSHUT.
+ */
+static void rk_tsadcv3_initialize(struct regmap *grf, void __iomem *regs,
+                                 enum tshut_polarity tshut_polarity)
+{
+       /* The tsadc control power sequence */
+       if (IS_ERR(grf)) {
+               /* Set interleave value to workaround ic time sync issue */
+               writel_relaxed(TSADCV2_USER_INTER_PD_SOC, regs +
+                              TSADCV2_USER_CON);
+       } else {
+               regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_ON);
+               usleep_range(10000, 11000); /* 10 ms; sleep, don't busy-wait */
+               regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_OFF);
+               usleep_range(15, 100); /* The spec note says at least 15 us */
+               regmap_write(grf, GRF_SARADC_TESTBIT, GRF_SARADC_TESTBIT_ON);
+               regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_TESTBIT_H_ON);
+               usleep_range(90, 200); /* The spec note says at least 90 us */
+       }
+
+       if (tshut_polarity == TSHUT_HIGH_ACTIVE)
+               writel_relaxed(0U | TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
+                              regs + TSADCV2_AUTO_CON);
+       else
+               writel_relaxed(0U & ~TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
+                              regs + TSADCV2_AUTO_CON);
+
+       writel_relaxed(TSADCV2_AUTO_PERIOD_TIME, regs + TSADCV2_AUTO_PERIOD);
+       writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+                      regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+       writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
+                      regs + TSADCV2_AUTO_PERIOD_HT);
+       writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+                      regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
+}
 
 static void rk_tsadcv2_irq_ack(void __iomem *regs)
@@ -498,10 +599,11 @@ static void rk_tsadcv2_control(void __iomem *regs, bool enable)
 }
 
 /**
- * @rk_tsadcv3_control:
- * TSADC controller works at auto mode, and some SoCs need set the tsadc_q_sel
- * bit on TSADCV2_AUTO_CON[1]. The (1024 - tsadc_q) as output adc value if
- * setting this bit to enable.
+ * rk_tsadcv3_control - the tsadc controller is enabled or disabled.
+ *
+ * NOTE: TSADC controller works at auto mode, and some SoCs need to set the
+ * tsadc_q_sel bit on TSADCV2_AUTO_CON[1]. When this bit is set, the output
+ * adc value is (1024 - tsadc_q).
  */
 static void rk_tsadcv3_control(void __iomem *regs, bool enable)
 {
@@ -603,6 +705,30 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = {
        },
 };
 
+static const struct rockchip_tsadc_chip rk3366_tsadc_data = {
+       .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
+       .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
+       .chn_num = 2, /* two channels for tsadc */
+
+       .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO give PMIC */
+       .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
+       .tshut_temp = 95000,
+
+       .initialize = rk_tsadcv3_initialize,
+       .irq_ack = rk_tsadcv3_irq_ack,
+       .control = rk_tsadcv3_control,
+       .get_temp = rk_tsadcv2_get_temp,
+       .set_tshut_temp = rk_tsadcv2_tshut_temp,
+       .set_tshut_mode = rk_tsadcv2_tshut_mode,
+
+       .table = {
+               .id = rk3228_code_table,
+               .length = ARRAY_SIZE(rk3228_code_table),
+               .data_mask = TSADCV3_DATA_MASK,
+               .mode = ADC_INCREMENT,
+       },
+};
+
 static const struct rockchip_tsadc_chip rk3368_tsadc_data = {
        .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
        .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
@@ -636,7 +762,7 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = {
        .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
        .tshut_temp = 95000,
 
-       .initialize = rk_tsadcv2_initialize,
+       .initialize = rk_tsadcv3_initialize,
        .irq_ack = rk_tsadcv3_irq_ack,
        .control = rk_tsadcv3_control,
        .get_temp = rk_tsadcv2_get_temp,
@@ -660,6 +786,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = {
                .compatible = "rockchip,rk3288-tsadc",
                .data = (void *)&rk3288_tsadc_data,
        },
+       {
+               .compatible = "rockchip,rk3366-tsadc",
+               .data = (void *)&rk3366_tsadc_data,
+       },
        {
                .compatible = "rockchip,rk3368-tsadc",
                .data = (void *)&rk3368_tsadc_data,
@@ -768,6 +898,11 @@ static int rockchip_configure_from_dt(struct device *dev,
                return -EINVAL;
        }
 
+       /* The tsadc won't handle the error here since some SoCs don't
+        * need this property.
+        */
+       thermal->grf = syscon_regmap_lookup_by_phandle(np, "rockchip,grf");
+
        return 0;
 }
 
@@ -786,8 +921,8 @@ rockchip_thermal_register_sensor(struct platform_device *pdev,
 
        sensor->thermal = thermal;
        sensor->id = id;
-       sensor->tzd = thermal_zone_of_sensor_register(&pdev->dev, id, sensor,
-                                                     &rockchip_of_thermal_ops);
+       sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, id,
+                                       sensor, &rockchip_of_thermal_ops);
        if (IS_ERR(sensor->tzd)) {
                error = PTR_ERR(sensor->tzd);
                dev_err(&pdev->dev, "failed to register sensor %d: %d\n",
@@ -815,7 +950,7 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
        const struct of_device_id *match;
        struct resource *res;
        int irq;
-       int i, j;
+       int i;
        int error;
 
        match = of_match_node(of_rockchip_thermal_match, np);
@@ -888,7 +1023,8 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
                goto err_disable_pclk;
        }
 
-       thermal->chip->initialize(thermal->regs, thermal->tshut_polarity);
+       thermal->chip->initialize(thermal->grf, thermal->regs,
+                                 thermal->tshut_polarity);
 
        for (i = 0; i < thermal->chip->chn_num; i++) {
                error = rockchip_thermal_register_sensor(pdev, thermal,
@@ -898,9 +1034,6 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
                        dev_err(&pdev->dev,
                                "failed to register sensor[%d] : error = %d\n",
                                i, error);
-                       for (j = 0; j < i; j++)
-                               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                               thermal->sensors[j].tzd);
                        goto err_disable_pclk;
                }
        }
@@ -912,7 +1045,7 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
        if (error) {
                dev_err(&pdev->dev,
                        "failed to request tsadc irq: %d\n", error);
-               goto err_unregister_sensor;
+               goto err_disable_pclk;
        }
 
        thermal->chip->control(thermal->regs, true);
@@ -924,11 +1057,6 @@ static int rockchip_thermal_probe(struct platform_device *pdev)
 
        return 0;
 
-err_unregister_sensor:
-       while (i--)
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 thermal->sensors[i].tzd);
-
 err_disable_pclk:
        clk_disable_unprepare(thermal->pclk);
 err_disable_clk:
@@ -946,7 +1074,6 @@ static int rockchip_thermal_remove(struct platform_device *pdev)
                struct rockchip_thermal_sensor *sensor = &thermal->sensors[i];
 
                rockchip_thermal_toggle_sensor(sensor, false);
-               thermal_zone_of_sensor_unregister(&pdev->dev, sensor->tzd);
        }
 
        thermal->chip->control(thermal->regs, false);
@@ -988,12 +1115,15 @@ static int __maybe_unused rockchip_thermal_resume(struct device *dev)
                return error;
 
        error = clk_enable(thermal->pclk);
-       if (error)
+       if (error) {
+               clk_disable(thermal->clk);
                return error;
+       }
 
        rockchip_thermal_reset_controller(thermal->reset);
 
-       thermal->chip->initialize(thermal->regs, thermal->tshut_polarity);
+       thermal->chip->initialize(thermal->grf, thermal->regs,
+                                 thermal->tshut_polarity);
 
        for (i = 0; i < thermal->chip->chn_num; i++) {
                int id = thermal->sensors[i].id;
diff --git a/drivers/thermal/tango_thermal.c b/drivers/thermal/tango_thermal.c
new file mode 100644 (file)
index 0000000..70e0d9f
--- /dev/null
@@ -0,0 +1,109 @@
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/thermal.h>
+#include <linux/platform_device.h>
+
+/*
+ * According to a data sheet draft, "this temperature sensor uses a bandgap
+ * type of circuit to compare a voltage which has a negative temperature
+ * coefficient with a voltage that is proportional to absolute temperature.
+ * A resistor bank allows 41 different temperature thresholds to be selected
+ * and the logic output will then indicate whether the actual die temperature
+ * lies above or below the selected threshold."
+ */
+
+#define TEMPSI_CMD     0
+#define TEMPSI_RES     4
+#define TEMPSI_CFG     8
+
+#define CMD_OFF                0
+#define CMD_ON         1
+#define CMD_READ       2
+
+#define IDX_MIN                15
+#define IDX_MAX                40
+
+struct tango_thermal_priv {
+       void __iomem *base;
+       int thresh_idx;
+};
+
+static bool temp_above_thresh(void __iomem *base, int thresh_idx)
+{
+       writel(CMD_READ | thresh_idx << 8, base + TEMPSI_CMD);
+       usleep_range(10, 20); /* wait 10-20 us, then issue the command again */
+       writel(CMD_READ | thresh_idx << 8, base + TEMPSI_CMD);
+       /* Any non-zero value read from TEMPSI_RES converts to true. */
+       return readl(base + TEMPSI_RES);
+}
+
+static int tango_get_temp(void *arg, int *res)
+{
+       struct tango_thermal_priv *priv = arg;
+       int idx = priv->thresh_idx;
+
+       if (temp_above_thresh(priv->base, idx)) {
+               /* Search upward by incrementing thresh_idx */
+               while (idx < IDX_MAX && temp_above_thresh(priv->base, ++idx))
+                       cpu_relax();
+               idx = idx - 1; /* always return lower bound */
+       } else {
+               /* Search downward by decrementing thresh_idx */
+               while (idx > IDX_MIN && !temp_above_thresh(priv->base, --idx))
+                       cpu_relax();
+       }
+
+       *res = (idx * 9 / 2 - 38) * 1000; /* millidegrees Celsius */
+       priv->thresh_idx = idx;
+
+       return 0;
+}
+
+static const struct thermal_zone_of_device_ops ops = {
+       .get_temp       = tango_get_temp,
+};
+
+static int tango_thermal_probe(struct platform_device *pdev)
+{
+       struct resource *res;
+       struct tango_thermal_priv *priv;
+       struct thermal_zone_device *tzdev;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       priv->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       priv->thresh_idx = IDX_MIN;
+       writel(0, priv->base + TEMPSI_CFG);
+       writel(CMD_ON, priv->base + TEMPSI_CMD);
+
+       tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, priv, &ops);
+       return PTR_ERR_OR_ZERO(tzdev);
+}
+
+static const struct of_device_id tango_sensor_ids[] = {
+       {
+               .compatible = "sigma,smp8758-thermal",
+       },
+       { /* sentinel */ }
+};
+
+static struct platform_driver tango_thermal_driver = {
+       .probe  = tango_thermal_probe,
+       .driver = {
+               .name           = "tango-thermal",
+               .of_match_table = tango_sensor_ids,
+       },
+};
+
+module_platform_driver(tango_thermal_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sigma Designs");
+MODULE_DESCRIPTION("Tango temperature sensor");
diff --git a/drivers/thermal/tegra/Kconfig b/drivers/thermal/tegra/Kconfig
new file mode 100644 (file)
index 0000000..cec586e
--- /dev/null
@@ -0,0 +1,13 @@
+menu "NVIDIA Tegra thermal drivers"
+depends on ARCH_TEGRA
+
+config TEGRA_SOCTHERM
+       tristate "Tegra SOCTHERM thermal management"
+       help
+         Enable this option for integrated thermal management support on NVIDIA
+         Tegra systems-on-chip. The driver supports four thermal zones
+         (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
+         zones to manage temperatures. This option is also required for the
+         emergency thermal reset (thermtrip) feature to function.
+
+endmenu
diff --git a/drivers/thermal/tegra/Makefile b/drivers/thermal/tegra/Makefile
new file mode 100644 (file)
index 0000000..1ce1af2
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra-soctherm.o
+
+tegra-soctherm-y                               := soctherm.o soctherm-fuse.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_124_SOC)    += tegra124-soctherm.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_132_SOC)    += tegra132-soctherm.o
+tegra-soctherm-$(CONFIG_ARCH_TEGRA_210_SOC)    += tegra210-soctherm.o
diff --git a/drivers/thermal/tegra/soctherm-fuse.c b/drivers/thermal/tegra/soctherm-fuse.c
new file mode 100644 (file)
index 0000000..2996318
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <soc/tegra/fuse.h>
+
+#include "soctherm.h"
+
+#define NOMINAL_CALIB_FT                       105
+#define NOMINAL_CALIB_CP                       25
+
+#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK     0x1fff
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK     (0x1fff << 13)
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT    13
+
+#define FUSE_TSENSOR_COMMON                    0x180
+
+/*
+ * Tegra210: Layout of bits in FUSE_TSENSOR_COMMON:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |       BASE_FT       |      BASE_CP      | SHFT_FT | SHIFT_CP  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Tegra12x, etc:
+ * In chips prior to Tegra210, this fuse was incorrectly sized as 26 bits,
+ * and didn't hold SHIFT_CP in [31:26]. Therefore these missing six bits
+ * were obtained via the FUSE_SPARE_REALIGNMENT_REG register [5:0].
+ *
+ * FUSE_TSENSOR_COMMON:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |-----------| SHFT_FT |       BASE_FT       |      BASE_CP      |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * FUSE_SPARE_REALIGNMENT_REG:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |---------------------------------------------------| SHIFT_CP  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
+#define CALIB_COEFFICIENT 1000000LL
+
+/**
+ * div64_s64_precise() - wrapper for div64_s64()
+ * @a:  the dividend
+ * @b:  the divisor
+ *
+ * Implements division with fairly accurate rounding instead of truncation by
+ * shifting the dividend to the left by 16 so that the quotient has a
+ * much higher precision.
+ *
+ * Return: the quotient of a / b.
+ */
+static s64 div64_s64_precise(s64 a, s32 b)
+{
+       s64 r, al;
+
+       /* Scale up for increased precision division */
+       al = a << 16;
+       /* Widen b before doubling: 2 * b in 32 bits overflows for large |b| */
+       r = div64_s64(al * 2 + 1, 2 * (s64)b);
+       return r >> 16;
+}
+
+int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
+                           struct tsensor_shared_calib *shared)
+{
+       u32 val;
+       s32 shifted_cp, shifted_ft;
+       int err;
+
+       err = tegra_fuse_readl(FUSE_TSENSOR_COMMON, &val);
+       if (err)
+               return err;
+
+       shared->base_cp = (val & tfuse->fuse_base_cp_mask) >>
+                         tfuse->fuse_base_cp_shift;
+       shared->base_ft = (val & tfuse->fuse_base_ft_mask) >>
+                         tfuse->fuse_base_ft_shift;
+
+       shifted_ft = (val & tfuse->fuse_shift_ft_mask) >>
+                    tfuse->fuse_shift_ft_shift;
+       shifted_ft = sign_extend32(shifted_ft, 4);
+
+       if (tfuse->fuse_spare_realignment) {
+               err = tegra_fuse_readl(tfuse->fuse_spare_realignment, &val);
+               if (err)
+                       return err;
+       }
+
+       shifted_cp = sign_extend32(val, 5);
+
+       shared->actual_temp_cp = 2 * NOMINAL_CALIB_CP + shifted_cp;
+       shared->actual_temp_ft = 2 * NOMINAL_CALIB_FT + shifted_ft;
+
+       return 0;
+}
+
+int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor,
+                            const struct tsensor_shared_calib *shared,
+                            u32 *calibration)
+{
+       const struct tegra_tsensor_group *sensor_group;
+       u32 val, calib;
+       s32 actual_tsensor_ft, actual_tsensor_cp;
+       s32 delta_sens, delta_temp;
+       s32 mult, div;
+       s16 therma, thermb;
+       s64 temp;
+       int err;
+
+       sensor_group = sensor->group;
+
+       err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
+       if (err)
+               return err;
+
+       actual_tsensor_cp = (shared->base_cp * 64) + sign_extend32(val, 12);
+       val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK) >>
+             FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
+       actual_tsensor_ft = (shared->base_ft * 32) + sign_extend32(val, 12);
+
+       delta_sens = actual_tsensor_ft - actual_tsensor_cp;
+       delta_temp = shared->actual_temp_ft - shared->actual_temp_cp;
+
+       mult = sensor_group->pdiv * sensor->config->tsample_ate;
+       div = sensor->config->tsample * sensor_group->pdiv_ate;
+
+       temp = (s64)delta_temp * (1LL << 13) * mult;
+       therma = div64_s64_precise(temp, (s64)delta_sens * div);
+
+       temp = ((s64)actual_tsensor_ft * shared->actual_temp_cp) -
+               ((s64)actual_tsensor_cp * shared->actual_temp_ft);
+       thermb = div64_s64_precise(temp, delta_sens);
+
+       temp = (s64)therma * sensor->fuse_corr_alpha;
+       therma = div64_s64_precise(temp, CALIB_COEFFICIENT);
+
+       temp = (s64)thermb * sensor->fuse_corr_alpha + sensor->fuse_corr_beta;
+       thermb = div64_s64_precise(temp, CALIB_COEFFICIENT);
+
+       calib = ((u16)therma << SENSOR_CONFIG2_THERMA_SHIFT) |
+               ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
+
+       *calibration = calib;
+
+       return 0;
+}
+
+MODULE_AUTHOR("Wei Ni <wni@nvidia.com>");
+MODULE_DESCRIPTION("Tegra SOCTHERM fuse management");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
new file mode 100644 (file)
index 0000000..b865172
--- /dev/null
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author:
+ *     Mikko Perttunen <mperttunen@nvidia.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/thermal.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define SENSOR_CONFIG0                         0
+#define SENSOR_CONFIG0_STOP                    BIT(0)
+#define SENSOR_CONFIG0_CPTR_OVER               BIT(2)
+#define SENSOR_CONFIG0_OVER                    BIT(3)
+#define SENSOR_CONFIG0_TCALC_OVER              BIT(4)
+#define SENSOR_CONFIG0_TALL_MASK               (0xfffff << 8)
+#define SENSOR_CONFIG0_TALL_SHIFT              8
+
+#define SENSOR_CONFIG1                         4
+#define SENSOR_CONFIG1_TSAMPLE_MASK            0x3ff
+#define SENSOR_CONFIG1_TSAMPLE_SHIFT           0
+#define SENSOR_CONFIG1_TIDDQ_EN_MASK           (0x3f << 15)
+#define SENSOR_CONFIG1_TIDDQ_EN_SHIFT          15
+#define SENSOR_CONFIG1_TEN_COUNT_MASK          (0x3f << 24)
+#define SENSOR_CONFIG1_TEN_COUNT_SHIFT         24
+#define SENSOR_CONFIG1_TEMP_ENABLE             BIT(31)
+
+/*
+ * SENSOR_CONFIG2 is defined in soctherm.h
+ * because it is also used by soctherm-fuse.c
+ */
+
+#define SENSOR_STATUS0                         0xc
+#define SENSOR_STATUS0_VALID_MASK              BIT(31)
+#define SENSOR_STATUS0_CAPTURE_MASK            0xffff
+
+#define SENSOR_STATUS1                         0x10
+#define SENSOR_STATUS1_TEMP_VALID_MASK         BIT(31)
+#define SENSOR_STATUS1_TEMP_MASK               0xffff
+
+#define READBACK_VALUE_MASK                    0xff00
+#define READBACK_VALUE_SHIFT                   8
+#define READBACK_ADD_HALF                      BIT(7)
+#define READBACK_NEGATE                                BIT(0)
+
+/* get val from register(r) mask bits(m); m must be a non-zero mask */
+#define REG_GET_MASK(r, m)     (((r) & (m)) >> (ffs(m) - 1))
+/* set val(v) to mask bits(m) of register(r); every use of m is parenthesized */
+#define REG_SET_MASK(r, m, v)  (((r) & ~(m)) | \
+                                (((v) & ((m) >> (ffs(m) - 1))) << (ffs(m) - 1)))
+
+static const int min_low_temp = -127000;
+static const int max_high_temp = 127000;
+
+struct tegra_thermctl_zone {
+       void __iomem *reg;
+       struct device *dev;
+       struct thermal_zone_device *tz;
+       const struct tegra_tsensor_group *sg;
+};
+
+struct tegra_soctherm {
+       struct reset_control *reset;
+       struct clk *clock_tsensor;
+       struct clk *clock_soctherm;
+       void __iomem *regs;
+       struct thermal_zone_device **thermctl_tzs;
+
+       u32 *calib;
+       struct tegra_soctherm_soc *soc;
+
+       struct dentry *debugfs_dir;
+};
+
+static void enable_tsensor(struct tegra_soctherm *tegra, unsigned int i)
+{
+       const struct tegra_tsensor *sensor = &tegra->soc->tsensors[i];
+       void __iomem *base = tegra->regs + sensor->base;
+       unsigned int val;
+
+       val = sensor->config->tall << SENSOR_CONFIG0_TALL_SHIFT;
+       writel(val, base + SENSOR_CONFIG0);
+
+       val  = (sensor->config->tsample - 1) << SENSOR_CONFIG1_TSAMPLE_SHIFT;
+       val |= sensor->config->tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
+       val |= sensor->config->ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
+       val |= SENSOR_CONFIG1_TEMP_ENABLE;
+       writel(val, base + SENSOR_CONFIG1);
+
+       writel(tegra->calib[i], base + SENSOR_CONFIG2);
+}
+
+/*
+ * Translate from soctherm readback format to millicelsius.
+ * The soctherm readback format in bits is as follows:
+ *   TTTTTTTT H______N
+ * where T's contain the temperature in Celsius,
+ * H denotes an addition of 0.5 Celsius and N denotes negation
+ * of the final value.
+ */
+static int translate_temp(u16 val)
+{
+       int t;
+
+       t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
+       if (val & READBACK_ADD_HALF)
+               t += 500;
+       if (val & READBACK_NEGATE)
+               t *= -1;
+
+       return t;
+}
+
+static int tegra_thermctl_get_temp(void *data, int *out_temp)
+{
+       struct tegra_thermctl_zone *zone = data;
+       u32 val;
+
+       val = readl(zone->reg);
+       val = REG_GET_MASK(val, zone->sg->sensor_temp_mask);
+       *out_temp = translate_temp(val);
+
+       return 0;
+}
+
+static int
+thermtrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
+                 int trip_temp);
+
+static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
+{
+       struct tegra_thermctl_zone *zone = data;
+       struct thermal_zone_device *tz = zone->tz;
+       const struct tegra_tsensor_group *sg = zone->sg;
+       struct device *dev = zone->dev;
+       enum thermal_trip_type type;
+       int ret;
+
+       if (!tz)
+               return -EINVAL;
+
+       ret = tz->ops->get_trip_type(tz, trip, &type);
+       if (ret)
+               return ret;
+
+       if (type != THERMAL_TRIP_CRITICAL)
+               return 0;
+
+       return thermtrip_program(dev, sg, temp);
+}
+
+static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
+       .get_temp = tegra_thermctl_get_temp,
+       .set_trip_temp = tegra_thermctl_set_trip_temp,
+};
+
+/**
+ * enforce_temp_range() - check and enforce temperature range [min, max]
+ * @dev: the device used to log a message when the value is clamped
+ * @trip_temp: the trip temperature to check, in millicelsius
+ *
+ * Clamps @trip_temp to [min_low_temp, max_high_temp], the range that the
+ * SOC_THERM HW can support, and logs the change when clamping occurs.
+ *
+ * Return: The capped temperature in millicelsius.
+ */
+static int enforce_temp_range(struct device *dev, int trip_temp)
+{
+       int temp;
+
+       temp = clamp_val(trip_temp, min_low_temp, max_high_temp);
+       if (temp != trip_temp)
+               dev_info(dev, "soctherm: trip temperature %d forced to %d\n",
+                        trip_temp, temp);
+       return temp;
+}
+
+/**
+ * thermtrip_program() - Configures the hardware to shut down the
+ * system if a given sensor group reaches a given temperature
+ * @dev: ptr to the struct device for the SOC_THERM IP block
+ * @sg: pointer to the sensor group to set the thermtrip temperature for
+ * @trip_temp: the temperature in millicelsius to trigger the thermal trip at
+ *
+ * Sets the thermal trip threshold of the given sensor group to be the
+ * @trip_temp.  If this threshold is crossed, the hardware will shut
+ * down.
+ *
+ * Note that, although @trip_temp is specified in millicelsius, the
+ * hardware is programmed in degrees Celsius.
+ *
+ * Return: 0 upon success, or %-EINVAL upon failure.
+ */
+static int thermtrip_program(struct device *dev,
+                            const struct tegra_tsensor_group *sg,
+                            int trip_temp)
+{
+       struct tegra_soctherm *ts = dev_get_drvdata(dev);
+       int temp;
+       u32 r;
+
+       if (!sg || !sg->thermtrip_threshold_mask)
+               return -EINVAL;
+
+       temp = enforce_temp_range(dev, trip_temp) / ts->soc->thresh_grain;
+
+       r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
+       r = REG_SET_MASK(r, sg->thermtrip_threshold_mask, temp);
+       r = REG_SET_MASK(r, sg->thermtrip_enable_mask, 1);
+       r = REG_SET_MASK(r, sg->thermtrip_any_en_mask, 0);
+       writel(r, ts->regs + THERMCTL_THERMTRIP_CTL);
+
+       return 0;
+}
+
+/**
+ * tegra_soctherm_set_hwtrips() - set HW trip point from DT data
+ * @dev: struct device * of the SOC_THERM instance
+ * @sg: the sensor group whose THERMTRIP threshold is programmed
+ * @tz: the thermal zone that supplies the critical temperature
+ *
+ * Configure the SOC_THERM "THERMTRIP" trip point of @sg, using the
+ * "critical" type trip temperature of @tz.
+ * Once it has been configured, THERMTRIP will take action when the
+ * sensor group reaches that temperature.
+ *
+ * Return: 0 upon success, or a negative error code on failure.
+ * If the get_crit_temp() call on @tz fails, a warning is logged and
+ * its error code is returned.
+ * THERMTRIP has been enabled successfully when a message similar to
+ * this one appears on the serial console:
+ * "thermtrip: will shut down when XXX reaches YYYYYY mC"
+ */
+static int tegra_soctherm_set_hwtrips(struct device *dev,
+                                     const struct tegra_tsensor_group *sg,
+                                     struct thermal_zone_device *tz)
+{
+       int temperature;
+       int ret;
+
+       ret = tz->ops->get_crit_temp(tz, &temperature);
+       if (ret) {
+               dev_warn(dev, "thermtrip: %s: missing critical temperature\n",
+                        sg->name);
+               return ret;
+       }
+
+       ret = thermtrip_program(dev, sg, temperature);
+       if (ret) {
+               dev_err(dev, "thermtrip: %s: error during enable\n",
+                       sg->name);
+               return ret;
+       }
+
+       dev_info(dev,
+                "thermtrip: will shut down when %s reaches %d mC\n",
+                sg->name, temperature);
+
+       return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int regs_show(struct seq_file *s, void *data)
+{
+       struct platform_device *pdev = s->private;
+       struct tegra_soctherm *ts = platform_get_drvdata(pdev);
+       const struct tegra_tsensor *tsensors = ts->soc->tsensors;
+       const struct tegra_tsensor_group **ttgs = ts->soc->ttgs;
+       u32 r, state;
+       int i;
+
+       seq_puts(s, "-----TSENSE (convert HW)-----\n");
+
+       for (i = 0; i < ts->soc->num_tsensors; i++) {
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG1);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TEMP_ENABLE);
+
+               seq_printf(s, "%s: ", tsensors[i].name);
+               seq_printf(s, "En(%d) ", state);
+
+               if (!state) {
+                       seq_puts(s, "\n");
+                       continue;
+               }
+
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TIDDQ_EN_MASK);
+               seq_printf(s, "tiddq(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TEN_COUNT_MASK);
+               seq_printf(s, "ten_count(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG1_TSAMPLE_MASK);
+               seq_printf(s, "tsample(%d) ", state + 1);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_STATUS1);
+               state = REG_GET_MASK(r, SENSOR_STATUS1_TEMP_VALID_MASK);
+               seq_printf(s, "Temp(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_STATUS1_TEMP_MASK);
+               seq_printf(s, "%d) ", translate_temp(state));
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_STATUS0);
+               state = REG_GET_MASK(r, SENSOR_STATUS0_VALID_MASK);
+               seq_printf(s, "Capture(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_STATUS0_CAPTURE_MASK);
+               seq_printf(s, "%d) ", state);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG0);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_STOP);
+               seq_printf(s, "Stop(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_TALL_MASK);
+               seq_printf(s, "Tall(%d) ", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_TCALC_OVER);
+               seq_printf(s, "Over(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_OVER);
+               seq_printf(s, "%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG0_CPTR_OVER);
+               seq_printf(s, "%d) ", state);
+
+               r = readl(ts->regs + tsensors[i].base + SENSOR_CONFIG2);
+               state = REG_GET_MASK(r, SENSOR_CONFIG2_THERMA_MASK);
+               seq_printf(s, "Therm_A/B(%d/", state);
+               state = REG_GET_MASK(r, SENSOR_CONFIG2_THERMB_MASK);
+               seq_printf(s, "%d)\n", (s16)state);
+       }
+
+       r = readl(ts->regs + SENSOR_PDIV);
+       seq_printf(s, "PDIV: 0x%x\n", r);
+
+       r = readl(ts->regs + SENSOR_HOTSPOT_OFF);
+       seq_printf(s, "HOTSPOT: 0x%x\n", r);
+
+       seq_puts(s, "\n");
+       seq_puts(s, "-----SOC_THERM-----\n");
+
+       r = readl(ts->regs + SENSOR_TEMP1);
+       state = REG_GET_MASK(r, SENSOR_TEMP1_CPU_TEMP_MASK);
+       seq_printf(s, "Temperatures: CPU(%d) ", translate_temp(state));
+       state = REG_GET_MASK(r, SENSOR_TEMP1_GPU_TEMP_MASK);
+       seq_printf(s, " GPU(%d) ", translate_temp(state));
+       r = readl(ts->regs + SENSOR_TEMP2);
+       state = REG_GET_MASK(r, SENSOR_TEMP2_PLLX_TEMP_MASK);
+       seq_printf(s, " PLLX(%d) ", translate_temp(state));
+       state = REG_GET_MASK(r, SENSOR_TEMP2_MEM_TEMP_MASK);
+       seq_printf(s, " MEM(%d)\n", translate_temp(state));
+
+       r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
+       state = REG_GET_MASK(r, ttgs[0]->thermtrip_any_en_mask);
+       seq_printf(s, "Thermtrip Any En(%d)\n", state);
+       for (i = 0; i < ts->soc->num_ttgs; i++) {
+               state = REG_GET_MASK(r, ttgs[i]->thermtrip_enable_mask);
+               seq_printf(s, "     %s En(%d) ", ttgs[i]->name, state);
+               state = REG_GET_MASK(r, ttgs[i]->thermtrip_threshold_mask);
+               state *= ts->soc->thresh_grain;
+               seq_printf(s, "Thresh(%d)\n", state);
+       }
+
+       return 0;
+}
+
+static int regs_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, regs_show, inode->i_private);
+}
+
+static const struct file_operations regs_fops = {
+       .open           = regs_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static void soctherm_debug_init(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       struct dentry *root, *file;
+
+       root = debugfs_create_dir("soctherm", NULL);
+       if (!root) {
+               dev_err(&pdev->dev, "failed to create debugfs directory\n");
+               return;
+       }
+
+       tegra->debugfs_dir = root;
+
+       file = debugfs_create_file("reg_contents", 0644, root,
+                                  pdev, &regs_fops);
+       if (!file) {
+               dev_err(&pdev->dev, "failed to create debugfs file\n");
+               debugfs_remove_recursive(tegra->debugfs_dir);
+               tegra->debugfs_dir = NULL;
+       }
+}
+#else
+static inline void soctherm_debug_init(struct platform_device *pdev) {}
+#endif
+
+static int soctherm_clk_enable(struct platform_device *pdev, bool enable)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       int err;
+
+       if (!tegra->clock_soctherm || !tegra->clock_tsensor)
+               return -EINVAL;
+
+       reset_control_assert(tegra->reset);
+
+       if (enable) {
+               err = clk_prepare_enable(tegra->clock_soctherm);
+               if (err) {
+                       reset_control_deassert(tegra->reset);
+                       return err;
+               }
+
+               err = clk_prepare_enable(tegra->clock_tsensor);
+               if (err) {
+                       clk_disable_unprepare(tegra->clock_soctherm);
+                       reset_control_deassert(tegra->reset);
+                       return err;
+               }
+       } else {
+               clk_disable_unprepare(tegra->clock_tsensor);
+               clk_disable_unprepare(tegra->clock_soctherm);
+       }
+
+       reset_control_deassert(tegra->reset);
+
+       return 0;
+}
+
+static void soctherm_init(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       const struct tegra_tsensor_group **ttgs = tegra->soc->ttgs;
+       int i;
+       u32 pdiv, hotspot;
+
+       /* Set up every raw tsensor listed for this SoC */
+       for (i = 0; i < tegra->soc->num_tsensors; ++i)
+               enable_tsensor(tegra, i);
+
+       /* Update each group's PDIV and hotspot fields, then write both back */
+       pdiv = readl(tegra->regs + SENSOR_PDIV);
+       hotspot = readl(tegra->regs + SENSOR_HOTSPOT_OFF);
+       for (i = 0; i < tegra->soc->num_ttgs; ++i) {
+               pdiv = REG_SET_MASK(pdiv, ttgs[i]->pdiv_mask,
+                                   ttgs[i]->pdiv);
+               /* hotspot offsets are relative to PLLX; skip the PLLX group */
+               if (ttgs[i]->id == TEGRA124_SOCTHERM_SENSOR_PLLX)
+                       continue;
+               hotspot =  REG_SET_MASK(hotspot,
+                                       ttgs[i]->pllx_hotspot_mask,
+                                       ttgs[i]->pllx_hotspot_diff);
+       }
+       writel(pdiv, tegra->regs + SENSOR_PDIV);
+       writel(hotspot, tegra->regs + SENSOR_HOTSPOT_OFF);
+}
+
+static const struct of_device_id tegra_soctherm_of_match[] = {
+#ifdef CONFIG_ARCH_TEGRA_124_SOC
+       {
+               .compatible = "nvidia,tegra124-soctherm",
+               .data = &tegra124_soctherm,
+       },
+#endif
+#ifdef CONFIG_ARCH_TEGRA_132_SOC
+       {
+               .compatible = "nvidia,tegra132-soctherm",
+               .data = &tegra132_soctherm,
+       },
+#endif
+#ifdef CONFIG_ARCH_TEGRA_210_SOC
+       {
+               .compatible = "nvidia,tegra210-soctherm",
+               .data = &tegra210_soctherm,
+       },
+#endif
+       { },
+};
+MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
+
+static int tegra_soctherm_probe(struct platform_device *pdev)
+{
+       const struct of_device_id *match;
+       struct tegra_soctherm *tegra;
+       struct thermal_zone_device *z;
+       struct tsensor_shared_calib shared_calib;
+       struct resource *res;
+       struct tegra_soctherm_soc *soc;
+       unsigned int i;
+       int err;
+
+       match = of_match_node(tegra_soctherm_of_match, pdev->dev.of_node);
+       if (!match)
+               return -ENODEV;
+
+       soc = (struct tegra_soctherm_soc *)match->data;
+       if (soc->num_ttgs > TEGRA124_SOCTHERM_SENSOR_NUM)
+               return -EINVAL;
+
+       tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
+       if (!tegra)
+               return -ENOMEM;
+
+       dev_set_drvdata(&pdev->dev, tegra);
+
+       tegra->soc = soc;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       tegra->regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(tegra->regs))
+               return PTR_ERR(tegra->regs);
+
+       tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
+       if (IS_ERR(tegra->reset)) {
+               dev_err(&pdev->dev, "can't get soctherm reset\n");
+               return PTR_ERR(tegra->reset);
+       }
+
+       tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
+       if (IS_ERR(tegra->clock_tsensor)) {
+               dev_err(&pdev->dev, "can't get tsensor clock\n");
+               return PTR_ERR(tegra->clock_tsensor);
+       }
+
+       tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
+       if (IS_ERR(tegra->clock_soctherm)) {
+               dev_err(&pdev->dev, "can't get soctherm clock\n");
+               return PTR_ERR(tegra->clock_soctherm);
+       }
+
+       tegra->calib = devm_kzalloc(&pdev->dev,
+                                   sizeof(u32) * soc->num_tsensors,
+                                   GFP_KERNEL);
+       if (!tegra->calib)
+               return -ENOMEM;
+
+       /* calculate shared calibration data */
+       err = tegra_calc_shared_calib(soc->tfuse, &shared_calib);
+       if (err)
+               return err;
+
+       /* calculate tsensor calibration data */
+       for (i = 0; i < soc->num_tsensors; ++i) {
+               err = tegra_calc_tsensor_calib(&soc->tsensors[i],
+                                              &shared_calib,
+                                              &tegra->calib[i]);
+               if (err)
+                       return err;
+       }
+
+       tegra->thermctl_tzs = devm_kzalloc(&pdev->dev,
+                                          sizeof(*tegra->thermctl_tzs) *
+                                          soc->num_ttgs, GFP_KERNEL);
+       if (!tegra->thermctl_tzs)
+               return -ENOMEM;
+
+       err = soctherm_clk_enable(pdev, true);
+       if (err)
+               return err;
+
+       soctherm_init(pdev);
+
+       for (i = 0; i < soc->num_ttgs; ++i) {
+               struct tegra_thermctl_zone *zone =
+                       devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
+               if (!zone) {
+                       err = -ENOMEM;
+                       goto disable_clocks;
+               }
+
+               zone->reg = tegra->regs + soc->ttgs[i]->sensor_temp_offset;
+               zone->dev = &pdev->dev;
+               zone->sg = soc->ttgs[i];
+
+               z = devm_thermal_zone_of_sensor_register(&pdev->dev,
+                                                        soc->ttgs[i]->id, zone,
+                                                        &tegra_of_thermal_ops);
+               if (IS_ERR(z)) {
+                       err = PTR_ERR(z);
+                       dev_err(&pdev->dev, "failed to register sensor: %d\n",
+                               err);
+                       goto disable_clocks;
+               }
+
+               zone->tz = z;
+               tegra->thermctl_tzs[soc->ttgs[i]->id] = z;
+
+               /* Configure hw trip points */
+               tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+       }
+
+       soctherm_debug_init(pdev);
+
+       return 0;
+
+disable_clocks:
+       soctherm_clk_enable(pdev, false);
+
+       return err;
+}
+
+static int tegra_soctherm_remove(struct platform_device *pdev)
+{
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+
+       debugfs_remove_recursive(tegra->debugfs_dir);
+
+       soctherm_clk_enable(pdev, false);
+
+       return 0;
+}
+
+static int __maybe_unused soctherm_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+
+       soctherm_clk_enable(pdev, false);
+
+       return 0;
+}
+
+static int __maybe_unused soctherm_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+       struct tegra_soctherm_soc *soc = tegra->soc;
+       int err, i;
+
+       err = soctherm_clk_enable(pdev, true);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "Resume failed: enable clocks failed\n");
+               return err;
+       }
+
+       soctherm_init(pdev);
+
+       for (i = 0; i < soc->num_ttgs; ++i) {
+               struct thermal_zone_device *tz;
+
+               tz = tegra->thermctl_tzs[soc->ttgs[i]->id];
+               tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+       }
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(tegra_soctherm_pm, soctherm_suspend, soctherm_resume);
+
+static struct platform_driver tegra_soctherm_driver = {
+       .probe = tegra_soctherm_probe,
+       .remove = tegra_soctherm_remove,
+       .driver = {
+               .name = "tegra_soctherm",
+               .pm = &tegra_soctherm_pm,
+               .of_match_table = tegra_soctherm_of_match,
+       },
+};
+module_platform_driver(tegra_soctherm_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra SOCTHERM thermal management driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h
new file mode 100644 (file)
index 0000000..28e18ec
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
+#define __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
+
+#define SENSOR_CONFIG2                          8
+#define SENSOR_CONFIG2_THERMA_MASK             (0xffff << 16)
+#define SENSOR_CONFIG2_THERMA_SHIFT            16
+#define SENSOR_CONFIG2_THERMB_MASK             0xffff
+#define SENSOR_CONFIG2_THERMB_SHIFT            0
+
+#define THERMCTL_THERMTRIP_CTL                 0x80
+/* BITs are defined in device file */
+
+#define SENSOR_PDIV                            0x1c0
+#define SENSOR_PDIV_CPU_MASK                   (0xf << 12)
+#define SENSOR_PDIV_GPU_MASK                   (0xf << 8)
+#define SENSOR_PDIV_MEM_MASK                   (0xf << 4)
+#define SENSOR_PDIV_PLLX_MASK                  (0xf << 0)
+
+#define SENSOR_HOTSPOT_OFF                     0x1c4
+#define SENSOR_HOTSPOT_CPU_MASK                        (0xff << 16)
+#define SENSOR_HOTSPOT_GPU_MASK                        (0xff << 8)
+#define SENSOR_HOTSPOT_MEM_MASK                        (0xff << 0)
+
+#define SENSOR_TEMP1                           0x1c8
+#define SENSOR_TEMP1_CPU_TEMP_MASK             (0xffff << 16)
+#define SENSOR_TEMP1_GPU_TEMP_MASK             0xffff
+#define SENSOR_TEMP2                           0x1cc
+#define SENSOR_TEMP2_MEM_TEMP_MASK             (0xffff << 16)
+#define SENSOR_TEMP2_PLLX_TEMP_MASK            0xffff
+
+/**
+ * struct tegra_tsensor_group - SOC_THERM sensor group data
+ * @name: short name of the temperature sensor group
+ * @id: numeric ID of the temperature sensor group
+ * @sensor_temp_offset: offset of the SENSOR_TEMP* register
+ * @sensor_temp_mask: bit mask for this sensor group in SENSOR_TEMP* register
+ * @pdiv: the sensor count post-divider to use during runtime
+ * @pdiv_ate: the sensor count post-divider used during automated test
+ * @pdiv_mask: register bitfield mask for the PDIV field for this sensor
+ * @pllx_hotspot_diff: hotspot offset from the PLLX sensor, must be 0 for
+ *     PLLX sensor group
+ * @pllx_hotspot_mask: register bitfield mask for the HOTSPOT field
+ */
+struct tegra_tsensor_group {
+       const char *name;
+       u8 id;
+       u16 sensor_temp_offset;
+       u32 sensor_temp_mask;
+       u32 pdiv, pdiv_ate, pdiv_mask;
+       u32 pllx_hotspot_diff, pllx_hotspot_mask;
+       u32 thermtrip_enable_mask;      /* THERMTRIP enable bit for the group */
+       u32 thermtrip_any_en_mask;      /* THERMTRIP "any" enable bit */
+       u32 thermtrip_threshold_mask;   /* THERMTRIP threshold field mask */
+};
+
+struct tegra_tsensor_configuration {
+       u32 tall, tiddq_en, ten_count, pdiv, pdiv_ate, tsample, tsample_ate;
+};
+
+struct tegra_tsensor {
+       const char *name;
+       const u32 base;
+       const struct tegra_tsensor_configuration *config;
+       const u32 calib_fuse_offset;
+       /*
+        * Correction values used to modify values read from
+        * calibration fuses
+        */
+       const s32 fuse_corr_alpha, fuse_corr_beta;
+       const struct tegra_tsensor_group *group;
+};
+
+struct tegra_soctherm_fuse {
+       u32 fuse_base_cp_mask, fuse_base_cp_shift;
+       u32 fuse_base_ft_mask, fuse_base_ft_shift;
+       u32 fuse_shift_ft_mask, fuse_shift_ft_shift;
+       u32 fuse_spare_realignment;
+};
+
+struct tsensor_shared_calib {
+       u32 base_cp, base_ft;
+       u32 actual_temp_cp, actual_temp_ft;
+};
+
+struct tegra_soctherm_soc {
+       const struct tegra_tsensor *tsensors;
+       const unsigned int num_tsensors;
+       const struct tegra_tsensor_group **ttgs;
+       const unsigned int num_ttgs;
+       const struct tegra_soctherm_fuse *tfuse;
+       const int thresh_grain;
+};
+
+int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
+                           struct tsensor_shared_calib *shared);
+int tegra_calc_tsensor_calib(const struct tegra_tsensor *sensor,
+                            const struct tsensor_shared_calib *shared,
+                            u32 *calib);
+
+#ifdef CONFIG_ARCH_TEGRA_124_SOC
+extern const struct tegra_soctherm_soc tegra124_soctherm;
+#endif
+
+#ifdef CONFIG_ARCH_TEGRA_132_SOC
+extern const struct tegra_soctherm_soc tegra132_soctherm;
+#endif
+
+#ifdef CONFIG_ARCH_TEGRA_210_SOC
+extern const struct tegra_soctherm_soc tegra210_soctherm;
+#endif
+
+#endif
+
diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c
new file mode 100644 (file)
index 0000000..beb9d36
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA124_THERMTRIP_ANY_EN_MASK         (0x1 << 28)
+#define TEGRA124_THERMTRIP_MEM_EN_MASK         (0x1 << 27)
+#define TEGRA124_THERMTRIP_GPU_EN_MASK         (0x1 << 26)
+#define TEGRA124_THERMTRIP_CPU_EN_MASK         (0x1 << 25)
+#define TEGRA124_THERMTRIP_TSENSE_EN_MASK      (0x1 << 24)
+#define TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK  (0xff << 16)
+#define TEGRA124_THERMTRIP_CPU_THRESH_MASK     (0xff << 8)
+#define TEGRA124_THERMTRIP_TSENSE_THRESH_MASK  0xff
+
+#define TEGRA124_THRESH_GRAIN                  1000
+
+static const struct tegra_tsensor_configuration tegra124_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name   = "cpu",
+       .sensor_temp_offset     = SENSOR_TEMP1,
+       .sensor_temp_mask       = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra124_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA124_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra124_tsensor_groups[] = {
+       &tegra124_tsensor_group_cpu,
+       &tegra124_tsensor_group_gpu,
+       &tegra124_tsensor_group_pll,
+       &tegra124_tsensor_group_mem,
+};
+
+static const struct tegra_tsensor tegra124_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1135400,
+               .fuse_corr_beta = -6266900,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1122220,
+               .fuse_corr_beta = -5700700,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1127000,
+               .fuse_corr_beta = -6768200,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1110900,
+               .fuse_corr_beta = -6232000,
+               .group = &tegra124_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1122300,
+               .fuse_corr_beta = -5936400,
+               .group = &tegra124_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1145700,
+               .fuse_corr_beta = -7124600,
+               .group = &tegra124_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1120100,
+               .fuse_corr_beta = -6000500,
+               .group = &tegra124_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra124_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1106500,
+               .fuse_corr_beta = -6729300,
+               .group = &tegra124_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON and the offset of the
+ * spare realignment fuse register, which are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra124_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ff,
+       .fuse_base_cp_shift = 0,
+       .fuse_base_ft_mask = 0x7ff << 10,
+       .fuse_base_ft_shift = 10,
+       .fuse_shift_ft_mask = 0x1f << 21,
+       .fuse_shift_ft_shift = 21,
+       .fuse_spare_realignment = 0x1fc,
+};
+
+const struct tegra_soctherm_soc tegra124_soctherm = {
+       .tsensors = tegra124_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra124_tsensors),
+       .ttgs = tegra124_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra124_tsensor_groups),
+       .tfuse = &tegra124_soctherm_fuse,
+       .thresh_grain = TEGRA124_THRESH_GRAIN,
+};
diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c
new file mode 100644 (file)
index 0000000..e2aa84e
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA132_THERMTRIP_ANY_EN_MASK         (0x1 << 28)
+#define TEGRA132_THERMTRIP_MEM_EN_MASK         (0x1 << 27)
+#define TEGRA132_THERMTRIP_GPU_EN_MASK         (0x1 << 26)
+#define TEGRA132_THERMTRIP_CPU_EN_MASK         (0x1 << 25)
+#define TEGRA132_THERMTRIP_TSENSE_EN_MASK      (0x1 << 24)
+#define TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK  (0xff << 16)
+#define TEGRA132_THERMTRIP_CPU_THRESH_MASK     (0xff << 8)
+#define TEGRA132_THERMTRIP_TSENSE_THRESH_MASK  0xff
+
+#define TEGRA132_THRESH_GRAIN                  1000
+
+static const struct tegra_tsensor_configuration tegra132_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name = "cpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra132_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA132_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra132_tsensor_groups[] = {
+       &tegra132_tsensor_group_cpu,
+       &tegra132_tsensor_group_gpu,
+       &tegra132_tsensor_group_pll,
+       &tegra132_tsensor_group_mem,
+};
+
+static const struct tegra_tsensor tegra132_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1126600,
+               .fuse_corr_beta = -9433500,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1110800,
+               .fuse_corr_beta = -7383000,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1113800,
+               .fuse_corr_beta = -6215200,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1129600,
+               .fuse_corr_beta = -8196100,
+               .group = &tegra132_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1132900,
+               .fuse_corr_beta = -6755300,
+               .group = &tegra132_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1142300,
+               .fuse_corr_beta = -7374200,
+               .group = &tegra132_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1125100,
+               .fuse_corr_beta = -6350400,
+               .group = &tegra132_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra132_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1118100,
+               .fuse_corr_beta = -8208800,
+               .group = &tegra132_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON and the offset of the
+ * spare realignment fuse register, which are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra132_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ff,
+       .fuse_base_cp_shift = 0,
+       .fuse_base_ft_mask = 0x7ff << 10,
+       .fuse_base_ft_shift = 10,
+       .fuse_shift_ft_mask = 0x1f << 21,
+       .fuse_shift_ft_shift = 21,
+       .fuse_spare_realignment = 0x1fc,
+};
+
+const struct tegra_soctherm_soc tegra132_soctherm = {
+       .tsensors = tegra132_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra132_tsensors),
+       .ttgs = tegra132_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra132_tsensor_groups),
+       .tfuse = &tegra132_soctherm_fuse,
+       .thresh_grain = TEGRA132_THRESH_GRAIN,
+};
diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c
new file mode 100644 (file)
index 0000000..19cc0ab
--- /dev/null
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2014-2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <soc/tegra/fuse.h>
+
+#include <dt-bindings/thermal/tegra124-soctherm.h>
+
+#include "soctherm.h"
+
+#define TEGRA210_THERMTRIP_ANY_EN_MASK         (0x1U << 31)
+#define TEGRA210_THERMTRIP_MEM_EN_MASK         (0x1U << 30)
+#define TEGRA210_THERMTRIP_GPU_EN_MASK         (0x1U << 29)
+#define TEGRA210_THERMTRIP_CPU_EN_MASK         (0x1U << 28)
+#define TEGRA210_THERMTRIP_TSENSE_EN_MASK      (0x1U << 27)
+#define TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK  (0x1ffU << 18)
+#define TEGRA210_THERMTRIP_CPU_THRESH_MASK     (0x1ffU << 9)
+#define TEGRA210_THERMTRIP_TSENSE_THRESH_MASK  0x1ffU
+
+#define TEGRA210_THRESH_GRAIN                  500
+
+static const struct tegra_tsensor_configuration tegra210_tsensor_config = {
+       .tall = 16300,
+       .tiddq_en = 1,
+       .ten_count = 1,
+       .tsample = 120,
+       .tsample_ate = 480,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_cpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_CPU,
+       .name = "cpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_CPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_CPU_MASK,
+       .pllx_hotspot_diff = 10,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_CPU_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_CPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_CPU_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_gpu = {
+       .id = TEGRA124_SOCTHERM_SENSOR_GPU,
+       .name = "gpu",
+       .sensor_temp_offset = SENSOR_TEMP1,
+       .sensor_temp_mask = SENSOR_TEMP1_GPU_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_GPU_MASK,
+       .pllx_hotspot_diff = 5,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_GPU_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_GPU_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_pll = {
+       .id = TEGRA124_SOCTHERM_SENSOR_PLLX,
+       .name = "pll",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_PLLX_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_PLLX_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_TSENSE_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_TSENSE_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group tegra210_tsensor_group_mem = {
+       .id = TEGRA124_SOCTHERM_SENSOR_MEM,
+       .name = "mem",
+       .sensor_temp_offset = SENSOR_TEMP2,
+       .sensor_temp_mask = SENSOR_TEMP2_MEM_TEMP_MASK,
+       .pdiv = 8,
+       .pdiv_ate = 8,
+       .pdiv_mask = SENSOR_PDIV_MEM_MASK,
+       .pllx_hotspot_diff = 0,
+       .pllx_hotspot_mask = SENSOR_HOTSPOT_MEM_MASK,
+       .thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
+       .thermtrip_enable_mask = TEGRA210_THERMTRIP_MEM_EN_MASK,
+       .thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+};
+
+static const struct tegra_tsensor_group *tegra210_tsensor_groups[] = {
+       &tegra210_tsensor_group_cpu,
+       &tegra210_tsensor_group_gpu,
+       &tegra210_tsensor_group_pll,
+       &tegra210_tsensor_group_mem,
+};
+
+static const struct tegra_tsensor tegra210_tsensors[] = {
+       {
+               .name = "cpu0",
+               .base = 0xc0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x098,
+               .fuse_corr_alpha = 1085000,
+               .fuse_corr_beta = 3244200,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu1",
+               .base = 0xe0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x084,
+               .fuse_corr_alpha = 1126200,
+               .fuse_corr_beta = -67500,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu2",
+               .base = 0x100,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x088,
+               .fuse_corr_alpha = 1098400,
+               .fuse_corr_beta = 2251100,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "cpu3",
+               .base = 0x120,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x12c,
+               .fuse_corr_alpha = 1108000,
+               .fuse_corr_beta = 602700,
+               .group = &tegra210_tsensor_group_cpu,
+       }, {
+               .name = "mem0",
+               .base = 0x140,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x158,
+               .fuse_corr_alpha = 1069200,
+               .fuse_corr_beta = 3549900,
+               .group = &tegra210_tsensor_group_mem,
+       }, {
+               .name = "mem1",
+               .base = 0x160,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x15c,
+               .fuse_corr_alpha = 1173700,
+               .fuse_corr_beta = -6263600,
+               .group = &tegra210_tsensor_group_mem,
+       }, {
+               .name = "gpu",
+               .base = 0x180,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x154,
+               .fuse_corr_alpha = 1074300,
+               .fuse_corr_beta = 2734900,
+               .group = &tegra210_tsensor_group_gpu,
+       }, {
+               .name = "pllx",
+               .base = 0x1a0,
+               .config = &tegra210_tsensor_config,
+               .calib_fuse_offset = 0x160,
+               .fuse_corr_alpha = 1039700,
+               .fuse_corr_beta = 6829100,
+               .group = &tegra210_tsensor_group_pll,
+       },
+};
+
+/*
+ * Mask/shift bits in FUSE_TSENSOR_COMMON and the offset of the
+ * spare realignment fuse register, which are described in
+ * tegra_soctherm_fuse.c
+ */
+static const struct tegra_soctherm_fuse tegra210_soctherm_fuse = {
+       .fuse_base_cp_mask = 0x3ffU << 11,
+       .fuse_base_cp_shift = 11,
+       .fuse_base_ft_mask = 0x7ffU << 21,
+       .fuse_base_ft_shift = 21,
+       .fuse_shift_ft_mask = 0x1fU << 6,
+       .fuse_shift_ft_shift = 6,
+       .fuse_spare_realignment = 0,
+};
+
+const struct tegra_soctherm_soc tegra210_soctherm = {
+       .tsensors = tegra210_tsensors,
+       .num_tsensors = ARRAY_SIZE(tegra210_tsensors),
+       .ttgs = tegra210_tsensor_groups,
+       .num_ttgs = ARRAY_SIZE(tegra210_tsensor_groups),
+       .tfuse = &tegra210_soctherm_fuse,
+       .thresh_grain = TEGRA210_THRESH_GRAIN,
+};
diff --git a/drivers/thermal/tegra_soctherm.c b/drivers/thermal/tegra_soctherm.c
deleted file mode 100644 (file)
index 1369752..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Author:
- *     Mikko Perttunen <mperttunen@nvidia.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/err.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/reset.h>
-#include <linux/thermal.h>
-
-#include <soc/tegra/fuse.h>
-
-#define SENSOR_CONFIG0                         0
-#define SENSOR_CONFIG0_STOP                    BIT(0)
-#define SENSOR_CONFIG0_TALL_SHIFT              8
-#define SENSOR_CONFIG0_TCALC_OVER              BIT(4)
-#define SENSOR_CONFIG0_OVER                    BIT(3)
-#define SENSOR_CONFIG0_CPTR_OVER               BIT(2)
-
-#define SENSOR_CONFIG1                         4
-#define SENSOR_CONFIG1_TSAMPLE_SHIFT           0
-#define SENSOR_CONFIG1_TIDDQ_EN_SHIFT          15
-#define SENSOR_CONFIG1_TEN_COUNT_SHIFT         24
-#define SENSOR_CONFIG1_TEMP_ENABLE             BIT(31)
-
-#define SENSOR_CONFIG2                         8
-#define SENSOR_CONFIG2_THERMA_SHIFT            16
-#define SENSOR_CONFIG2_THERMB_SHIFT            0
-
-#define SENSOR_PDIV                            0x1c0
-#define SENSOR_PDIV_T124                       0x8888
-#define SENSOR_HOTSPOT_OFF                     0x1c4
-#define SENSOR_HOTSPOT_OFF_T124                        0x00060600
-#define SENSOR_TEMP1                           0x1c8
-#define SENSOR_TEMP2                           0x1cc
-
-#define SENSOR_TEMP_MASK                       0xffff
-#define READBACK_VALUE_MASK                    0xff00
-#define READBACK_VALUE_SHIFT                   8
-#define READBACK_ADD_HALF                      BIT(7)
-#define READBACK_NEGATE                                BIT(0)
-
-#define FUSE_TSENSOR8_CALIB                    0x180
-#define FUSE_SPARE_REALIGNMENT_REG_0           0x1fc
-
-#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK     0x1fff
-#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK     (0x1fff << 13)
-#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT    13
-
-#define FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK    0x3ff
-#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK    (0x7ff << 10)
-#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT   10
-
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_CP_MASK 0x3f
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK (0x1f << 21)
-#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT 21
-
-#define NOMINAL_CALIB_FT_T124                  105
-#define NOMINAL_CALIB_CP_T124                  25
-
-struct tegra_tsensor_configuration {
-       u32 tall, tsample, tiddq_en, ten_count, pdiv, tsample_ate, pdiv_ate;
-};
-
-struct tegra_tsensor {
-       const struct tegra_tsensor_configuration *config;
-       u32 base, calib_fuse_offset;
-       /* Correction values used to modify values read from calibration fuses */
-       s32 fuse_corr_alpha, fuse_corr_beta;
-};
-
-struct tegra_thermctl_zone {
-       void __iomem *reg;
-       unsigned int shift;
-};
-
-static const struct tegra_tsensor_configuration t124_tsensor_config = {
-       .tall = 16300,
-       .tsample = 120,
-       .tiddq_en = 1,
-       .ten_count = 1,
-       .pdiv = 8,
-       .tsample_ate = 480,
-       .pdiv_ate = 8
-};
-
-static const struct tegra_tsensor t124_tsensors[] = {
-       {
-               .config = &t124_tsensor_config,
-               .base = 0xc0,
-               .calib_fuse_offset = 0x098,
-               .fuse_corr_alpha = 1135400,
-               .fuse_corr_beta = -6266900,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0xe0,
-               .calib_fuse_offset = 0x084,
-               .fuse_corr_alpha = 1122220,
-               .fuse_corr_beta = -5700700,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x100,
-               .calib_fuse_offset = 0x088,
-               .fuse_corr_alpha = 1127000,
-               .fuse_corr_beta = -6768200,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x120,
-               .calib_fuse_offset = 0x12c,
-               .fuse_corr_alpha = 1110900,
-               .fuse_corr_beta = -6232000,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x140,
-               .calib_fuse_offset = 0x158,
-               .fuse_corr_alpha = 1122300,
-               .fuse_corr_beta = -5936400,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x160,
-               .calib_fuse_offset = 0x15c,
-               .fuse_corr_alpha = 1145700,
-               .fuse_corr_beta = -7124600,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x180,
-               .calib_fuse_offset = 0x154,
-               .fuse_corr_alpha = 1120100,
-               .fuse_corr_beta = -6000500,
-       },
-       {
-               .config = &t124_tsensor_config,
-               .base = 0x1a0,
-               .calib_fuse_offset = 0x160,
-               .fuse_corr_alpha = 1106500,
-               .fuse_corr_beta = -6729300,
-       },
-};
-
-struct tegra_soctherm {
-       struct reset_control *reset;
-       struct clk *clock_tsensor;
-       struct clk *clock_soctherm;
-       void __iomem *regs;
-
-       struct thermal_zone_device *thermctl_tzs[4];
-};
-
-struct tsensor_shared_calibration {
-       u32 base_cp, base_ft;
-       u32 actual_temp_cp, actual_temp_ft;
-};
-
-static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
-{
-       u32 val, shifted_cp, shifted_ft;
-       int err;
-
-       err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
-       if (err)
-               return err;
-       r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
-       r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
-               >> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
-       val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
-               >> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
-       shifted_ft = sign_extend32(val, 4);
-
-       err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
-       if (err)
-               return err;
-       shifted_cp = sign_extend32(val, 5);
-
-       r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
-       r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
-
-       return 0;
-}
-
-static s64 div64_s64_precise(s64 a, s64 b)
-{
-       s64 r, al;
-
-       /* Scale up for increased precision division */
-       al = a << 16;
-
-       r = div64_s64(al * 2 + 1, 2 * b);
-       return r >> 16;
-}
-
-static int
-calculate_tsensor_calibration(const struct tegra_tsensor *sensor,
-                             const struct tsensor_shared_calibration *shared,
-                             u32 *calib)
-{
-       u32 val;
-       s32 actual_tsensor_ft, actual_tsensor_cp, delta_sens, delta_temp,
-           mult, div;
-       s16 therma, thermb;
-       s64 tmp;
-       int err;
-
-       err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
-       if (err)
-               return err;
-
-       actual_tsensor_cp = (shared->base_cp * 64) + sign_extend32(val, 12);
-       val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
-               >> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
-       actual_tsensor_ft = (shared->base_ft * 32) + sign_extend32(val, 12);
-
-       delta_sens = actual_tsensor_ft - actual_tsensor_cp;
-       delta_temp = shared->actual_temp_ft - shared->actual_temp_cp;
-
-       mult = sensor->config->pdiv * sensor->config->tsample_ate;
-       div = sensor->config->tsample * sensor->config->pdiv_ate;
-
-       therma = div64_s64_precise((s64) delta_temp * (1LL << 13) * mult,
-                                  (s64) delta_sens * div);
-
-       tmp = (s64)actual_tsensor_ft * shared->actual_temp_cp -
-             (s64)actual_tsensor_cp * shared->actual_temp_ft;
-       thermb = div64_s64_precise(tmp, (s64)delta_sens);
-
-       therma = div64_s64_precise((s64)therma * sensor->fuse_corr_alpha,
-                                  (s64)1000000LL);
-       thermb = div64_s64_precise((s64)thermb * sensor->fuse_corr_alpha +
-                                  sensor->fuse_corr_beta, (s64)1000000LL);
-
-       *calib = ((u16)therma << SENSOR_CONFIG2_THERMA_SHIFT) |
-                ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
-
-       return 0;
-}
-
-static int enable_tsensor(struct tegra_soctherm *tegra,
-                         const struct tegra_tsensor *sensor,
-                         const struct tsensor_shared_calibration *shared)
-{
-       void __iomem *base = tegra->regs + sensor->base;
-       unsigned int val;
-       u32 calib;
-       int err;
-
-       err = calculate_tsensor_calibration(sensor, shared, &calib);
-       if (err)
-               return err;
-
-       val = sensor->config->tall << SENSOR_CONFIG0_TALL_SHIFT;
-       writel(val, base + SENSOR_CONFIG0);
-
-       val  = (sensor->config->tsample - 1) << SENSOR_CONFIG1_TSAMPLE_SHIFT;
-       val |= sensor->config->tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
-       val |= sensor->config->ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
-       val |= SENSOR_CONFIG1_TEMP_ENABLE;
-       writel(val, base + SENSOR_CONFIG1);
-
-       writel(calib, base + SENSOR_CONFIG2);
-
-       return 0;
-}
-
-/*
- * Translate from soctherm readback format to millicelsius.
- * The soctherm readback format in bits is as follows:
- *   TTTTTTTT H______N
- * where T's contain the temperature in Celsius,
- * H denotes an addition of 0.5 Celsius and N denotes negation
- * of the final value.
- */
-static int translate_temp(u16 val)
-{
-       long t;
-
-       t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
-       if (val & READBACK_ADD_HALF)
-               t += 500;
-       if (val & READBACK_NEGATE)
-               t *= -1;
-
-       return t;
-}
-
-static int tegra_thermctl_get_temp(void *data, int *out_temp)
-{
-       struct tegra_thermctl_zone *zone = data;
-       u32 val;
-
-       val = (readl(zone->reg) >> zone->shift) & SENSOR_TEMP_MASK;
-       *out_temp = translate_temp(val);
-
-       return 0;
-}
-
-static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
-       .get_temp = tegra_thermctl_get_temp,
-};
-
-static const struct of_device_id tegra_soctherm_of_match[] = {
-       { .compatible = "nvidia,tegra124-soctherm" },
-       { },
-};
-MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
-
-struct thermctl_zone_desc {
-       unsigned int offset;
-       unsigned int shift;
-};
-
-static const struct thermctl_zone_desc t124_thermctl_temp_zones[] = {
-       { SENSOR_TEMP1, 16 },
-       { SENSOR_TEMP2, 16 },
-       { SENSOR_TEMP1, 0 },
-       { SENSOR_TEMP2, 0 }
-};
-
-static int tegra_soctherm_probe(struct platform_device *pdev)
-{
-       struct tegra_soctherm *tegra;
-       struct thermal_zone_device *tz;
-       struct tsensor_shared_calibration shared_calib;
-       struct resource *res;
-       unsigned int i;
-       int err;
-
-       const struct tegra_tsensor *tsensors = t124_tsensors;
-
-       tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
-       if (!tegra)
-               return -ENOMEM;
-
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       tegra->regs = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(tegra->regs))
-               return PTR_ERR(tegra->regs);
-
-       tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
-       if (IS_ERR(tegra->reset)) {
-               dev_err(&pdev->dev, "can't get soctherm reset\n");
-               return PTR_ERR(tegra->reset);
-       }
-
-       tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
-       if (IS_ERR(tegra->clock_tsensor)) {
-               dev_err(&pdev->dev, "can't get tsensor clock\n");
-               return PTR_ERR(tegra->clock_tsensor);
-       }
-
-       tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
-       if (IS_ERR(tegra->clock_soctherm)) {
-               dev_err(&pdev->dev, "can't get soctherm clock\n");
-               return PTR_ERR(tegra->clock_soctherm);
-       }
-
-       reset_control_assert(tegra->reset);
-
-       err = clk_prepare_enable(tegra->clock_soctherm);
-       if (err)
-               return err;
-
-       err = clk_prepare_enable(tegra->clock_tsensor);
-       if (err) {
-               clk_disable_unprepare(tegra->clock_soctherm);
-               return err;
-       }
-
-       reset_control_deassert(tegra->reset);
-
-       /* Initialize raw sensors */
-
-       err = calculate_shared_calibration(&shared_calib);
-       if (err)
-               goto disable_clocks;
-
-       for (i = 0; i < ARRAY_SIZE(t124_tsensors); ++i) {
-               err = enable_tsensor(tegra, tsensors + i, &shared_calib);
-               if (err)
-                       goto disable_clocks;
-       }
-
-       writel(SENSOR_PDIV_T124, tegra->regs + SENSOR_PDIV);
-       writel(SENSOR_HOTSPOT_OFF_T124, tegra->regs + SENSOR_HOTSPOT_OFF);
-
-       /* Initialize thermctl sensors */
-
-       for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
-               struct tegra_thermctl_zone *zone =
-                       devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
-               if (!zone) {
-                       err = -ENOMEM;
-                       goto unregister_tzs;
-               }
-
-               zone->reg = tegra->regs + t124_thermctl_temp_zones[i].offset;
-               zone->shift = t124_thermctl_temp_zones[i].shift;
-
-               tz = thermal_zone_of_sensor_register(&pdev->dev, i, zone,
-                                                    &tegra_of_thermal_ops);
-               if (IS_ERR(tz)) {
-                       err = PTR_ERR(tz);
-                       dev_err(&pdev->dev, "failed to register sensor: %d\n",
-                               err);
-                       goto unregister_tzs;
-               }
-
-               tegra->thermctl_tzs[i] = tz;
-       }
-
-       return 0;
-
-unregister_tzs:
-       while (i--)
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 tegra->thermctl_tzs[i]);
-
-disable_clocks:
-       clk_disable_unprepare(tegra->clock_tsensor);
-       clk_disable_unprepare(tegra->clock_soctherm);
-
-       return err;
-}
-
-static int tegra_soctherm_remove(struct platform_device *pdev)
-{
-       struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
-               thermal_zone_of_sensor_unregister(&pdev->dev,
-                                                 tegra->thermctl_tzs[i]);
-       }
-
-       clk_disable_unprepare(tegra->clock_tsensor);
-       clk_disable_unprepare(tegra->clock_soctherm);
-
-       return 0;
-}
-
-static struct platform_driver tegra_soctherm_driver = {
-       .probe = tegra_soctherm_probe,
-       .remove = tegra_soctherm_remove,
-       .driver = {
-               .name = "tegra-soctherm",
-               .of_match_table = tegra_soctherm_of_match,
-       },
-};
-module_platform_driver(tegra_soctherm_driver);
-
-MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
-MODULE_DESCRIPTION("NVIDIA Tegra SOCTHERM thermal management driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/thermal-generic-adc.c b/drivers/thermal/thermal-generic-adc.c
new file mode 100644 (file)
index 0000000..73f55d6
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Generic ADC thermal driver
+ *
+ * Copyright (C) 2016 NVIDIA CORPORATION. All rights reserved.
+ *
+ * Author: Laxman Dewangan <ldewangan@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/iio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+/* Per-device state, allocated in probe and passed to the thermal callbacks. */
+struct gadc_thermal_info {
+       /* Device used for log messages. */
+       struct device *dev;
+       /* Thermal zone returned when the sensor is registered. */
+       struct thermal_zone_device *tz_dev;
+       /* IIO channel the ADC reading is taken from. */
+       struct iio_channel *channel;
+       /* Flat array of pairs: [2 * i] temperature, [2 * i + 1] ADC value. */
+       s32 *lookup_table;
+       /* Number of pairs in lookup_table (half the number of cells). */
+       int nlookup_table;
+};
+
+/*
+ * Convert an ADC reading into a temperature using the lookup table.
+ *
+ * The table holds nlookup_table <temperature, ADC value> pairs and the
+ * search assumes the ADC values decrease from one pair to the next.
+ * Readings outside the table are clamped to the first or last temperature;
+ * readings inside it are linearly interpolated between the two neighbouring
+ * pairs, using their actual temperatures rather than a fixed step.
+ */
+static int gadc_thermal_adc_to_temp(struct gadc_thermal_info *gti, int val)
+{
+       int temp, temp_hi, temp_lo, adc_hi, adc_lo;
+       int i;
+
+       /* Find the first pair whose ADC value is not above the reading. */
+       for (i = 0; i < gti->nlookup_table; i++) {
+               if (val >= gti->lookup_table[2 * i + 1])
+                       break;
+       }
+
+       if (i == 0) {
+               /* At or above the first ADC value: clamp to the first pair. */
+               temp = gti->lookup_table[0];
+       } else if (i >= gti->nlookup_table) {
+               /* Below every ADC value: clamp to the last pair. */
+               temp = gti->lookup_table[2 * (gti->nlookup_table - 1)];
+       } else {
+               /*
+                * adc_lo <= val < adc_hi here, so the divisor is never zero.
+                * The last segment (i == nlookup_table - 1) is interpolated
+                * like any other instead of being clamped.
+                */
+               adc_hi = gti->lookup_table[2 * i - 1];
+               adc_lo = gti->lookup_table[2 * i + 1];
+
+               temp_hi = gti->lookup_table[2 * i - 2];
+               temp_lo = gti->lookup_table[2 * i];
+
+               temp = temp_hi + mult_frac(temp_lo - temp_hi, val - adc_hi,
+                                          adc_lo - adc_hi);
+       }
+
+       return temp;
+}
+
+/*
+ * get_temp callback: read the processed value from the IIO channel and
+ * convert it through the lookup table. Returns 0 with *temp set, or the
+ * negative error from the IIO read (in which case *temp is left untouched).
+ */
+static int gadc_thermal_get_temp(void *data, int *temp)
+{
+       struct gadc_thermal_info *info = data;
+       int adc, err;
+
+       err = iio_read_channel_processed(info->channel, &adc);
+       if (err >= 0) {
+               *temp = gadc_thermal_adc_to_temp(info, adc);
+               return 0;
+       }
+
+       dev_err(info->dev, "IIO channel read failed %d\n", err);
+       return err;
+}
+
+/* Sensor callbacks passed at registration; only temperature reads are set. */
+static const struct thermal_zone_of_device_ops gadc_thermal_ops = {
+       .get_temp = gadc_thermal_get_temp,
+};
+
+/*
+ * Load the "temperature-lookup-table" property into gti->lookup_table.
+ *
+ * The property is a flat list of 32-bit cells forming <temperature, ADC
+ * value> pairs, so the cell count must be even and non-zero. On success
+ * gti->nlookup_table holds the number of pairs.
+ *
+ * Returns 0 on success or a negative errno: the error from the property
+ * helpers, -EINVAL for an empty or odd-sized table, -ENOMEM if the table
+ * cannot be allocated.
+ */
+static int gadc_thermal_read_linear_lookup_table(struct device *dev,
+                                                struct gadc_thermal_info *gti)
+{
+       struct device_node *np = dev->of_node;
+       int ntable;
+       int ret;
+
+       ntable = of_property_count_elems_of_size(np, "temperature-lookup-table",
+                                                sizeof(u32));
+       if (ntable < 0) {
+               dev_err(dev, "Lookup table is not provided\n");
+               return ntable;
+       }
+
+       /*
+        * A count of zero would pass the sign and parity checks; reject it
+        * so the conversion code never indexes a zero-length table.
+        */
+       if (!ntable) {
+               dev_err(dev, "Lookup table is empty\n");
+               return -EINVAL;
+       }
+
+       if (ntable % 2) {
+               dev_err(dev, "Pair of temperature vs ADC read value missing\n");
+               return -EINVAL;
+       }
+
+       /* Array allocation with the size multiplication checked for overflow. */
+       gti->lookup_table = devm_kcalloc(dev, ntable,
+                                        sizeof(*gti->lookup_table),
+                                        GFP_KERNEL);
+       if (!gti->lookup_table)
+               return -ENOMEM;
+
+       ret = of_property_read_u32_array(np, "temperature-lookup-table",
+                                        (u32 *)gti->lookup_table, ntable);
+       if (ret < 0) {
+               dev_err(dev, "Failed to read temperature lookup table: %d\n",
+                       ret);
+               return ret;
+       }
+
+       /* Two cells per entry. */
+       gti->nlookup_table = ntable / 2;
+
+       return 0;
+}
+
+/*
+ * Probe: load the lookup table from the device tree, acquire the
+ * "sensor-channel" IIO channel and register sensor 0 with the thermal
+ * framework. The IIO channel is released again if registration fails.
+ *
+ * Returns 0 on success or a negative errno. -EPROBE_DEFER is passed back
+ * without an error message, since it is a retry request, not a failure.
+ */
+static int gadc_thermal_probe(struct platform_device *pdev)
+{
+       struct gadc_thermal_info *gti;
+       int ret;
+
+       if (!pdev->dev.of_node) {
+               dev_err(&pdev->dev, "Only DT based supported\n");
+               return -ENODEV;
+       }
+
+       gti = devm_kzalloc(&pdev->dev, sizeof(*gti), GFP_KERNEL);
+       if (!gti)
+               return -ENOMEM;
+
+       ret = gadc_thermal_read_linear_lookup_table(&pdev->dev, gti);
+       if (ret < 0)
+               return ret;
+
+       gti->dev = &pdev->dev;
+       platform_set_drvdata(pdev, gti);
+
+       gti->channel = iio_channel_get(&pdev->dev, "sensor-channel");
+       if (IS_ERR(gti->channel)) {
+               ret = PTR_ERR(gti->channel);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(&pdev->dev, "IIO channel not found: %d\n",
+                               ret);
+               return ret;
+       }
+
+       gti->tz_dev = thermal_zone_of_sensor_register(&pdev->dev, 0,
+                                                     gti, &gadc_thermal_ops);
+       if (IS_ERR(gti->tz_dev)) {
+               ret = PTR_ERR(gti->tz_dev);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(&pdev->dev,
+                               "Thermal zone sensor register failed: %d\n",
+                               ret);
+               goto sensor_fail;
+       }
+
+       return 0;
+
+sensor_fail:
+       /* The channel was taken with a plain get, so drop it by hand. */
+       iio_channel_release(gti->channel);
+
+       return ret;
+}
+
+/* Undo probe: unregister the thermal sensor, then release the IIO channel. */
+static int gadc_thermal_remove(struct platform_device *pdev)
+{
+       struct gadc_thermal_info *info = platform_get_drvdata(pdev);
+
+       thermal_zone_of_sensor_unregister(&pdev->dev, info->tz_dev);
+       iio_channel_release(info->channel);
+
+       return 0;
+}
+
+/* Device-tree compatible strings handled by this driver. */
+static const struct of_device_id of_adc_thermal_match[] = {
+       { .compatible = "generic-adc-thermal", },
+       {},     /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, of_adc_thermal_match);
+
+/* Platform driver glue: matched through the OF table above the driver. */
+static struct platform_driver gadc_thermal_driver = {
+       .driver = {
+               .name = "generic-adc-thermal",
+               .of_match_table = of_adc_thermal_match,
+       },
+       .probe = gadc_thermal_probe,
+       .remove = gadc_thermal_remove,
+};
+
+module_platform_driver(gadc_thermal_driver);
+
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_DESCRIPTION("Generic ADC thermal driver using IIO framework with DT");
+MODULE_LICENSE("GPL v2");
index b213a12222956185677e11208224514a4468217c..15c0a9ac2209eab4a19e358370f53bcf3288a5a5 100644 (file)
@@ -337,7 +337,7 @@ int ti_thermal_expose_sensor(struct ti_bandgap *bgp, int id,
                return -EINVAL;
 
        /* in case this is specified by DT */
-       data->ti_thermal = thermal_zone_of_sensor_register(bgp->dev, id,
+       data->ti_thermal = devm_thermal_zone_of_sensor_register(bgp->dev, id,
                                        data, &ti_of_thermal_ops);
        if (IS_ERR(data->ti_thermal)) {
                /* Create thermal zone */
@@ -368,9 +368,6 @@ int ti_thermal_remove_sensor(struct ti_bandgap *bgp, int id)
        if (data && data->ti_thermal) {
                if (data->our_zone)
                        thermal_zone_device_unregister(data->ti_thermal);
-               else
-                       thermal_zone_of_sensor_unregister(bgp->dev,
-                                                         data->ti_thermal);
        }
 
        return 0;
index 7fc919f7da4de1878c7617c0eb46e6a7950b06b6..97f0a2bd93edfb64bc326adf29724bf456548ad8 100644 (file)
@@ -555,7 +555,7 @@ static int pkg_temp_thermal_cpu_callback(struct notifier_block *nfb,
 {
        unsigned int cpu = (unsigned long) hcpu;
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
                get_core_online(cpu);
index 712a84978e972bf29082575cdb083ec9450be657..188b1ff03f5f23ef837c3ef2135e6bacdf9d1394 100644 (file)
@@ -113,6 +113,35 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
 static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
+/*
+ * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
+ * _and_ the ability to detect when the device is asserting INTx via
+ * PCI_STATUS.  If a device implements the former but not the latter we would
+ * typically expect broken_intx_masking to be set and require an exclusive
+ * interrupt.  However, since we do have control of the device's ability to
+ * assert INTx, we can instead pretend that the device does not implement
+ * INTx, virtualizing the pin register to report zero and maintaining DisINTx
+ * set on the host.
+ *
+ * Returns true for the devices that get this treatment.
+ */
+static bool vfio_pci_nointx(struct pci_dev *pdev)
+{
+       /* Only Intel parts are listed so far. */
+       if (pdev->vendor != PCI_VENDOR_ID_INTEL)
+               return false;
+
+       switch (pdev->device) {
+       /* All i40e (XL710/X710) 10/20/40GbE NICs */
+       case 0x1572:
+       case 0x1574:
+       case 0x1580 ... 0x1581:
+       case 0x1583 ... 0x1589:
+       case 0x37d0 ... 0x37d2:
+               return true;
+       }
+
+       return false;
+}
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
@@ -136,23 +165,29 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
                pr_debug("%s: Couldn't store %s saved state\n",
                         __func__, dev_name(&pdev->dev));
 
-       ret = vfio_config_init(vdev);
-       if (ret) {
-               kfree(vdev->pci_saved_state);
-               vdev->pci_saved_state = NULL;
-               pci_disable_device(pdev);
-               return ret;
+       if (likely(!nointxmask)) {
+               if (vfio_pci_nointx(pdev)) {
+                       dev_info(&pdev->dev, "Masking broken INTx support\n");
+                       vdev->nointx = true;
+                       pci_intx(pdev, 0);
+               } else
+                       vdev->pci_2_3 = pci_intx_mask_supported(pdev);
        }
 
-       if (likely(!nointxmask))
-               vdev->pci_2_3 = pci_intx_mask_supported(pdev);
-
        pci_read_config_word(pdev, PCI_COMMAND, &cmd);
        if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
                cmd &= ~PCI_COMMAND_INTX_DISABLE;
                pci_write_config_word(pdev, PCI_COMMAND, cmd);
        }
 
+       ret = vfio_config_init(vdev);
+       if (ret) {
+               kfree(vdev->pci_saved_state);
+               vdev->pci_saved_state = NULL;
+               pci_disable_device(pdev);
+               return ret;
+       }
+
        msix_pos = pdev->msix_cap;
        if (msix_pos) {
                u16 flags;
@@ -304,7 +339,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
        if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
                u8 pin;
                pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
-               if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
+               if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && !vdev->nointx && pin)
                        return 1;
 
        } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
index 142c533efec7592be65bf8354db2c0c0d16b05c1..93601407dab8a04b44c8a379b9ff0d480c12303d 100644 (file)
@@ -408,6 +408,7 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev)
 {
        struct pci_dev *pdev = vdev->pdev;
        u32 *rbar = vdev->rbar;
+       u16 cmd;
        int i;
 
        if (pdev->is_virtfn)
@@ -420,6 +421,12 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev)
                pci_user_write_config_dword(pdev, i, *rbar);
 
        pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
+
+       if (vdev->nointx) {
+               pci_user_read_config_word(pdev, PCI_COMMAND, &cmd);
+               cmd |= PCI_COMMAND_INTX_DISABLE;
+               pci_user_write_config_word(pdev, PCI_COMMAND, cmd);
+       }
 }
 
 static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
@@ -515,6 +522,23 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
        return count;
 }
 
+/*
+ * Report whether any BAR with a non-zero saved value in vdev->rbar no longer
+ * reads back as that value.  A failed config read also counts as a mismatch.
+ */
+static bool vfio_need_bar_restore(struct vfio_pci_device *vdev)
+{
+       int idx = 0, offset;
+       u32 cur;
+
+       for (offset = PCI_BASE_ADDRESS_0; offset <= PCI_BASE_ADDRESS_5;
+            offset += 4, idx++) {
+               /* BARs with no saved value are not checked. */
+               if (!vdev->rbar[idx])
+                       continue;
+
+               if (pci_user_read_config_dword(vdev->pdev, offset, &cur))
+                       return true;
+
+               if (vdev->rbar[idx] != cur)
+                       return true;
+       }
+
+       return false;
+}
+
 static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
                                   int count, struct perm_bits *perm,
                                   int offset, __le32 val)
@@ -553,7 +577,8 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
                 * SR-IOV devices will trigger this, but we catch them later
                 */
                if ((new_mem && virt_mem && !phys_mem) ||
-                   (new_io && virt_io && !phys_io))
+                   (new_io && virt_io && !phys_io) ||
+                   vfio_need_bar_restore(vdev))
                        vfio_bar_restore(vdev);
        }
 
@@ -1124,9 +1149,12 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
                        return pcibios_err_to_errno(ret);
 
                if (PCI_X_CMD_VERSION(word)) {
-                       /* Test for extended capabilities */
-                       pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
-                       vdev->extended_caps = (dword != 0);
+                       if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+                               /* Test for extended capabilities */
+                               pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE,
+                                                     &dword);
+                               vdev->extended_caps = (dword != 0);
+                       }
                        return PCI_CAP_PCIX_SIZEOF_V2;
                } else
                        return PCI_CAP_PCIX_SIZEOF_V0;
@@ -1138,9 +1166,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
 
                return byte;
        case PCI_CAP_ID_EXP:
-               /* Test for extended capabilities */
-               pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
-               vdev->extended_caps = (dword != 0);
+               if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) {
+                       /* Test for extended capabilities */
+                       pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
+                       vdev->extended_caps = (dword != 0);
+               }
 
                /* length based on version */
                if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
@@ -1545,7 +1575,7 @@ int vfio_config_init(struct vfio_pci_device *vdev)
                *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
        }
 
-       if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX))
+       if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
                vconfig[PCI_INTERRUPT_PIN] = 0;
 
        ret = vfio_cap_init(vdev);
index 8a7d546d18a0d91be62a6836df508d38cdf57b3a..016c14a1b454bfe5a4528a243ef400d539d9b05b 100644 (file)
@@ -83,6 +83,7 @@ struct vfio_pci_device {
        bool                    bardirty;
        bool                    has_vga;
        bool                    needs_reset;
+       bool                    nointx;
        struct pci_saved_state  *pci_saved_state;
        int                     refcnt;
        struct eventfd_ctx      *err_trigger;
index 3054e3fa63ac0f30d52b9507c405d436df1bc750..80378ddadc5ca4581bcba2792dc49f56209f2902 100644 (file)
@@ -331,14 +331,12 @@ static void tce_iommu_free_table(struct iommu_table *tbl);
 static void tce_iommu_release(void *iommu_data)
 {
        struct tce_container *container = iommu_data;
-       struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;
        long i;
 
        while (tce_groups_attached(container)) {
                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
-               table_group = iommu_group_get_iommudata(tcegrp->grp);
                tce_iommu_detach_group(iommu_data, tcegrp->grp);
        }
 
index 35fe4825a4546012702a56efac3bc57803b93a0c..60d6c2ac87aa526b77d308532dc7289ddef85ab5 100644 (file)
@@ -162,7 +162,7 @@ static int lm3630a_intr_config(struct lm3630a_chip *pchip)
 
 static void lm3630a_pwm_ctrl(struct lm3630a_chip *pchip, int br, int br_max)
 {
-       unsigned int period = pwm_get_period(pchip->pwmd);
+       unsigned int period = pchip->pdata->pwm_period;
        unsigned int duty = br * period / br_max;
 
        pwm_config(pchip->pwmd, duty, period);
@@ -424,8 +424,13 @@ static int lm3630a_probe(struct i2c_client *client,
                        dev_err(&client->dev, "fail : get pwm device\n");
                        return PTR_ERR(pchip->pwmd);
                }
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pchip->pwmd);
        }
-       pchip->pwmd->period = pdata->pwm_period;
 
        /* interrupt enable  : irq 0 is not allowed */
        pchip->irq = client->irq;
index daca9e6a2bb31590071cfc7100e4c1ee8ca4f1ec..e5b14f52628fdd365ac1a9318cc8ad8be7fe89ca 100644 (file)
@@ -246,6 +246,12 @@ static void lp855x_pwm_ctrl(struct lp855x *lp, int br, int max_br)
                        return;
 
                lp->pwm = pwm;
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pwm);
        }
 
        pwm_config(lp->pwm, duty, period);
index 5d583d7a517bc45ece6463a12f2b897ef513c1e2..cf869ec90cce89be06ce7884a684993c3ecaf4fe 100644 (file)
@@ -145,6 +145,12 @@ static void lp8788_pwm_ctrl(struct lp8788_bl *bl, int br, int max_br)
                }
 
                bl->pwm = pwm;
+
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(pwm);
        }
 
        pwm_config(bl->pwm, duty, period);
index 64f9e1b8655f4b29f64f95c6cf9b3a0e9cc04c6d..b2b366bb0f9784294a287ea220777cb9cc5a9e19 100644 (file)
@@ -201,6 +201,7 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        struct device_node *node = pdev->dev.of_node;
        struct pwm_bl_data *pb;
        int initial_blank = FB_BLANK_UNBLANK;
+       struct pwm_args pargs;
        int ret;
 
        if (!data) {
@@ -306,17 +307,22 @@ static int pwm_backlight_probe(struct platform_device *pdev)
 
        dev_dbg(&pdev->dev, "got pwm for backlight\n");
 
+       /*
+        * FIXME: pwm_apply_args() should be removed when switching to
+        * the atomic PWM API.
+        */
+       pwm_apply_args(pb->pwm);
+
        /*
         * The DT case will set the pwm_period_ns field to 0 and store the
         * period, parsed from the DT, in the PWM device. For the non-DT case,
         * set the period from platform data if it has not already been set
         * via the PWM lookup table.
         */
-       pb->period = pwm_get_period(pb->pwm);
-       if (!pb->period && (data->pwm_period_ns > 0)) {
+       pwm_get_args(pb->pwm, &pargs);
+       pb->period = pargs.period;
+       if (!pb->period && (data->pwm_period_ns > 0))
                pb->period = data->pwm_period_ns;
-               pwm_set_period(pb->pwm, data->pwm_period_ns);
-       }
 
        pb->lth_brightness = data->lth_brightness * (pb->period / pb->scale);
 
index 21dafe53ca492421fd6fe747e32ab3f7aa643dbd..a9c45c89b15eaca20f6a62957ff92dd20775a6de 100644 (file)
@@ -286,6 +286,7 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
 {
        int ret;
        u32 precharge, dclk, com_invdir, compins;
+       struct pwm_args pargs;
 
        if (par->device_info->need_pwm) {
                par->pwm = pwm_get(&par->client->dev, NULL);
@@ -294,7 +295,15 @@ static int ssd1307fb_init(struct ssd1307fb_par *par)
                        return PTR_ERR(par->pwm);
                }
 
-               par->pwm_period = pwm_get_period(par->pwm);
+               /*
+                * FIXME: pwm_apply_args() should be removed when switching to
+                * the atomic PWM API.
+                */
+               pwm_apply_args(par->pwm);
+
+               pwm_get_args(par->pwm, &pargs);
+
+               par->pwm_period = pargs.period;
                /* Enable the PWM */
                pwm_config(par->pwm, par->pwm_period / 2, par->pwm_period);
                pwm_enable(par->pwm);
index 7b6d74f0c72f630835f52c370f2fb3c5a113e612..476c0e3a7150694b258e40787f303d6b46c29c17 100644 (file)
@@ -75,7 +75,7 @@ struct virtio_balloon {
 
        /* The array of pfns we tell the Host about. */
        unsigned int num_pfns;
-       u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
+       __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
        /* Memory statistics */
        struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
@@ -127,14 +127,16 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 }
 
-static void set_page_pfns(u32 pfns[], struct page *page)
+static void set_page_pfns(struct virtio_balloon *vb,
+                         __virtio32 pfns[], struct page *page)
 {
        unsigned int i;
 
        /* Set balloon pfns pointing at this page.
         * Note that the first pfn points at start of the page. */
        for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
-               pfns[i] = page_to_balloon_pfn(page) + i;
+               pfns[i] = cpu_to_virtio32(vb->vdev,
+                                         page_to_balloon_pfn(page) + i);
 }
 
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
@@ -158,7 +160,7 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
                        msleep(200);
                        break;
                }
-               set_page_pfns(vb->pfns + vb->num_pfns, page);
+               set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
                vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
                if (!virtio_has_feature(vb->vdev,
                                        VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
@@ -177,10 +179,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 static void release_pages_balloon(struct virtio_balloon *vb)
 {
        unsigned int i;
+       struct page *page;
 
        /* Find pfns pointing at start of each page, get pages and free them. */
        for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               struct page *page = balloon_pfn_to_page(vb->pfns[i]);
+               page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev,
+                                                          vb->pfns[i]));
                if (!virtio_has_feature(vb->vdev,
                                        VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
                        adjust_managed_page_count(page, 1);
@@ -203,7 +207,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
                page = balloon_page_dequeue(vb_dev_info);
                if (!page)
                        break;
-               set_page_pfns(vb->pfns + vb->num_pfns, page);
+               set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
                vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
        }
 
@@ -471,13 +475,13 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
        __count_vm_event(BALLOON_MIGRATE);
        spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
        vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-       set_page_pfns(vb->pfns, newpage);
+       set_page_pfns(vb, vb->pfns, newpage);
        tell_host(vb, vb->inflate_vq);
 
        /* balloon's page migration 2nd step -- deflate "page" */
        balloon_page_delete(page);
        vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-       set_page_pfns(vb->pfns, page);
+       set_page_pfns(vb, vb->pfns, page);
        tell_host(vb, vb->deflate_vq);
 
        mutex_unlock(&vb->balloon_lock);
index 5b45e277697be7d1695f92b0de2c4d934db192a2..b54f26c55dfd1df46fcd5114660406edfd7e2820 100644 (file)
@@ -661,6 +661,14 @@ config ATLAS7_WATCHDOG
          To compile this driver as a module, choose M here: the
          module will be called atlas7_wdt.
 
+config RENESAS_WDT
+       tristate "Renesas WDT Watchdog"
+       depends on ARCH_RENESAS || COMPILE_TEST
+       select WATCHDOG_CORE
+       help
+         This driver adds watchdog support for the integrated watchdogs in the
+         Renesas R-Car and other SH-Mobile SoCs (usually named RWDT or SWDT).
+
 # AVR32 Architecture
 
 config AT32AP700X_WDT
index 9bde095ff691b0ebe5f98201084b81cedf81da77..a46e7c1380ac44c6a3429405711751a8118f4aeb 100644 (file)
@@ -73,6 +73,7 @@ obj-$(CONFIG_DIGICOLOR_WATCHDOG) += digicolor_wdt.o
 obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o
 obj-$(CONFIG_BCM7038_WDT) += bcm7038_wdt.o
 obj-$(CONFIG_ATLAS7_WATCHDOG) += atlas7_wdt.o
+obj-$(CONFIG_RENESAS_WDT) += renesas_wdt.o
 
 # AVR32 Architecture
 obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
index 02007689e9ca29976a2c9c8a2474558802beec27..71ee07950e630492a29f007cab84ddc04525cb69 100644 (file)
@@ -611,9 +611,7 @@ static int cpwd_probe(struct platform_device *op)
        }
 
        if (p->broken) {
-               init_timer(&cpwd_timer);
-               cpwd_timer.function     = cpwd_brokentimer;
-               cpwd_timer.data         = (unsigned long) p;
+               setup_timer(&cpwd_timer, cpwd_brokentimer, (unsigned long)p);
                cpwd_timer.expires      = WD_BTIMEOUT;
 
                pr_info("PLD defect workaround enabled for model %s\n",
index 016bd9355190bcf8c80cb1cd66e384bd2719f498..d4ba262da7ba2221741758d1378160c1dbe7a958 100644 (file)
@@ -38,7 +38,7 @@
 
 #define SIO_F71808FG_LD_WDT    0x07    /* Watchdog timer logical device */
 #define SIO_UNLOCK_KEY         0x87    /* Key to enable Super-I/O */
-#define SIO_LOCK_KEY           0xAA    /* Key to diasble Super-I/O */
+#define SIO_LOCK_KEY           0xAA    /* Key to disable Super-I/O */
 
 #define SIO_REG_LDSEL          0x07    /* Logical device select */
 #define SIO_REG_DEVID          0x20    /* Device ID (2 bytes) */
@@ -59,6 +59,7 @@
 #define SIO_F71869A_ID         0x1007  /* Chipset ID */
 #define SIO_F71882_ID          0x0541  /* Chipset ID */
 #define SIO_F71889_ID          0x0723  /* Chipset ID */
+#define SIO_F81865_ID          0x0704  /* Chipset ID */
 
 #define F71808FG_REG_WDO_CONF          0xf0
 #define F71808FG_REG_WDT_CONF          0xf5
 
 #define F71808FG_FLAG_WDOUT_EN         7
 
-#define F71808FG_FLAG_WDTMOUT_STS      5
+#define F71808FG_FLAG_WDTMOUT_STS      6
 #define F71808FG_FLAG_WD_EN            5
 #define F71808FG_FLAG_WD_PULSE         4
 #define F71808FG_FLAG_WD_UNIT          3
 
+#define F81865_REG_WDO_CONF            0xfa
+#define F81865_FLAG_WDOUT_EN           0
+
 /* Default values */
 #define WATCHDOG_TIMEOUT       60      /* 1 minute default timeout */
 #define WATCHDOG_MAX_TIMEOUT   (60 * 255)
@@ -112,7 +116,7 @@ module_param(start_withtimeout, uint, 0);
 MODULE_PARM_DESC(start_withtimeout, "Start watchdog timer on module load with"
        " given initial timeout. Zero (default) disables this feature.");
 
-enum chips { f71808fg, f71858fg, f71862fg, f71869, f71882fg, f71889fg };
+enum chips { f71808fg, f71858fg, f71862fg, f71869, f71882fg, f71889fg, f81865 };
 
 static const char *f71808e_names[] = {
        "f71808fg",
@@ -121,6 +125,7 @@ static const char *f71808e_names[] = {
        "f71869",
        "f71882fg",
        "f71889fg",
+       "f81865",
 };
 
 /* Super-I/O Function prototypes */
@@ -360,6 +365,11 @@ static int watchdog_start(void)
                        superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf);
                break;
 
+       case f81865:
+               /* Set pin 70 to WDTRST# */
+               superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5);
+               break;
+
        default:
                /*
                 * 'default' label to shut up the compiler and catch
@@ -371,8 +381,13 @@ static int watchdog_start(void)
 
        superio_select(watchdog.sioaddr, SIO_F71808FG_LD_WDT);
        superio_set_bit(watchdog.sioaddr, SIO_REG_ENABLE, 0);
-       superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDO_CONF,
-                       F71808FG_FLAG_WDOUT_EN);
+
+       if (watchdog.type == f81865)
+               superio_set_bit(watchdog.sioaddr, F81865_REG_WDO_CONF,
+                               F81865_FLAG_WDOUT_EN);
+       else
+               superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDO_CONF,
+                               F71808FG_FLAG_WDOUT_EN);
 
        superio_set_bit(watchdog.sioaddr, F71808FG_REG_WDT_CONF,
                        F71808FG_FLAG_WD_EN);
@@ -655,7 +670,7 @@ static int __init watchdog_init(int sioaddr)
        superio_select(watchdog.sioaddr, SIO_F71808FG_LD_WDT);
 
        wdt_conf = superio_inb(sioaddr, F71808FG_REG_WDT_CONF);
-       watchdog.caused_reboot = wdt_conf & F71808FG_FLAG_WDTMOUT_STS;
+       watchdog.caused_reboot = wdt_conf & BIT(F71808FG_FLAG_WDTMOUT_STS);
 
        superio_exit(sioaddr);
 
@@ -770,6 +785,9 @@ static int __init f71808e_find(int sioaddr)
                /* Confirmed (by datasheet) not to have a watchdog. */
                err = -ENODEV;
                goto exit;
+       case SIO_F81865_ID:
+               watchdog.type = f81865;
+               break;
        default:
                pr_info("Unrecognized Fintek device: %04x\n",
                        (unsigned int)devid);
index 331aed831dac8419198d78e77b895b2d5ec883e0..62f346bb43484282aa7bd2665ddf62efb473eda0 100644 (file)
@@ -37,6 +37,8 @@
 
 #define IMX2_WDT_WCR           0x00            /* Control Register */
 #define IMX2_WDT_WCR_WT                (0xFF << 8)     /* -> Watchdog Timeout Field */
+#define IMX2_WDT_WCR_WDA       (1 << 5)        /* -> External Reset WDOG_B */
+#define IMX2_WDT_WCR_SRS       (1 << 4)        /* -> Software Reset Signal */
 #define IMX2_WDT_WCR_WRE       (1 << 3)        /* -> WDOG Reset Enable */
 #define IMX2_WDT_WCR_WDE       (1 << 2)        /* -> Watchdog Enable */
 #define IMX2_WDT_WCR_WDZST     (1 << 0)        /* -> Watchdog timer Suspend */
@@ -59,6 +61,7 @@ struct imx2_wdt_device {
        struct clk *clk;
        struct regmap *regmap;
        struct watchdog_device wdog;
+       bool ext_reset;
 };
 
 static bool nowayout = WATCHDOG_NOWAYOUT;
@@ -83,6 +86,12 @@ static int imx2_wdt_restart(struct watchdog_device *wdog, unsigned long action,
        struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
        unsigned int wcr_enable = IMX2_WDT_WCR_WDE;
 
+       /* Use internal reset or external - not both */
+       if (wdev->ext_reset)
+               wcr_enable |= IMX2_WDT_WCR_SRS; /* do not assert int reset */
+       else
+               wcr_enable |= IMX2_WDT_WCR_WDA; /* do not assert ext-reset */
+
        /* Assert SRS signal */
        regmap_write(wdev->regmap, IMX2_WDT_WCR, wcr_enable);
        /*
@@ -112,8 +121,12 @@ static inline void imx2_wdt_setup(struct watchdog_device *wdog)
        val |= IMX2_WDT_WCR_WDZST;
        /* Strip the old watchdog Time-Out value */
        val &= ~IMX2_WDT_WCR_WT;
-       /* Generate reset if WDOG times out */
-       val &= ~IMX2_WDT_WCR_WRE;
+       /* Generate internal chip-level reset if WDOG times out */
+       if (!wdev->ext_reset)
+               val &= ~IMX2_WDT_WCR_WRE;
+       /* Or if external-reset assert WDOG_B reset only on time-out */
+       else
+               val |= IMX2_WDT_WCR_WRE;
        /* Keep Watchdog Disabled */
        val &= ~IMX2_WDT_WCR_WDE;
        /* Set the watchdog's Time-Out value */
@@ -230,6 +243,8 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
        regmap_read(wdev->regmap, IMX2_WDT_WRSR, &val);
        wdog->bootstatus = val & IMX2_WDT_WRSR_TOUT ? WDIOF_CARDRESET : 0;
 
+       wdev->ext_reset = of_property_read_bool(pdev->dev.of_node,
+                                               "fsl,ext-reset-output");
        wdog->timeout = clamp_t(unsigned, timeout, 1, IMX2_WDT_MAX_TIME);
        if (wdog->timeout != timeout)
                dev_warn(&pdev->dev, "Initial timeout out of range! Clamped from %u to %u\n",
index 6a7d5c365438120d5a31d59038f1aaea777b691c..c8d51ddb26d5d26acb3c14251a60cdeb21b0be2c 100644 (file)
@@ -160,10 +160,8 @@ static int jz4740_wdt_probe(struct platform_device *pdev)
 
        drvdata = devm_kzalloc(&pdev->dev, sizeof(struct jz4740_wdt_drvdata),
                               GFP_KERNEL);
-       if (!drvdata) {
-               dev_err(&pdev->dev, "Unable to alloacate watchdog device\n");
+       if (!drvdata)
                return -ENOMEM;
-       }
 
        if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT)
                heartbeat = DEFAULT_HEARTBEAT;
index 14521c8b3d5a81916cc04d9b6e7d242837c2e678..b55981f88a08334328d409eff8badba7606908be 100644 (file)
@@ -431,7 +431,7 @@ static int octeon_wdt_cpu_callback(struct notifier_block *nfb,
 {
        unsigned int cpu = (unsigned long)hcpu;
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                octeon_wdt_disable_interrupt(cpu);
                break;
index 20563ccb7be0f5400c1166b0895adc73da758f30..a043fa4f60e5feea2944acad9302091879540706 100644 (file)
@@ -21,6 +21,7 @@
 
 #define WDT_RST                0x38
 #define WDT_EN         0x40
+#define WDT_STS                0x44
 #define WDT_BITE_TIME  0x5C
 
 struct qcom_wdt {
@@ -108,7 +109,8 @@ static const struct watchdog_ops qcom_wdt_ops = {
 static const struct watchdog_info qcom_wdt_info = {
        .options        = WDIOF_KEEPALIVEPING
                        | WDIOF_MAGICCLOSE
-                       | WDIOF_SETTIMEOUT,
+                       | WDIOF_SETTIMEOUT
+                       | WDIOF_CARDRESET,
        .identity       = KBUILD_MODNAME,
 };
 
@@ -171,6 +173,9 @@ static int qcom_wdt_probe(struct platform_device *pdev)
        wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
        wdt->wdd.parent = &pdev->dev;
 
+       if (readl(wdt->base + WDT_STS) & 1)
+               wdt->wdd.bootstatus = WDIOF_CARDRESET;
+
        /*
         * If 'timeout-sec' unspecified in devicetree, assume a 30 second
         * default, unless the max timeout is less than 30 seconds, then use
diff --git a/drivers/watchdog/renesas_wdt.c b/drivers/watchdog/renesas_wdt.c
new file mode 100644 (file)
index 0000000..cf61c92
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * Watchdog driver for Renesas WDT watchdog
+ *
+ * Copyright (C) 2015-16 Wolfram Sang, Sang Engineering <wsa@sang-engineering.com>
+ * Copyright (C) 2015-16 Renesas Electronics Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/watchdog.h>
+
+#define RWTCNT         0
+#define RWTCSRA                4
+#define RWTCSRA_WOVF   BIT(4)
+#define RWTCSRA_WRFLG  BIT(5)
+#define RWTCSRA_TME    BIT(7)
+
+#define RWDT_DEFAULT_TIMEOUT 60U
+
+static const unsigned int clk_divs[] = { 1, 4, 16, 32, 64, 128, 1024 };
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+                               __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+/*
+ * Per-device driver state.
+ *
+ * base:         register base returned by devm_ioremap_resource()
+ * wdev:         watchdog core device embedded in this structure
+ * clk:          clock obtained with devm_clk_get()
+ * clks_per_sec: clock rate divided (rounding up) by the selected clk_divs[]
+ *               entry
+ * cks:          index of the selected clk_divs[] entry; this is the value
+ *               written to RWTCSRA when starting and stopping
+ */
+struct rwdt_priv {
+       void __iomem *base;
+       struct watchdog_device wdev;
+       struct clk *clk;
+       unsigned int clks_per_sec;
+       u8 cks;
+};
+
+/*
+ * Write @val to the register at offset @reg after OR-ing in a fixed upper
+ * code: 0x5a5a0000 for RWTCNT, 0xa5a5a500 for every other register.
+ * NOTE(review): presumably these are the write-enable codes the hardware
+ * expects - confirm against the datasheet.
+ */
+static void rwdt_write(struct rwdt_priv *priv, u32 val, unsigned int reg)
+{
+       u32 key = (reg == RWTCNT) ? 0x5a5a0000 : 0xa5a5a500;
+
+       writel_relaxed(val | key, priv->base + reg);
+}
+
+/*
+ * Load RWTCNT with 65536 minus the number of ticks in the configured
+ * timeout. Used both when starting the watchdog and as the .ping callback.
+ * Always returns 0.
+ */
+static int rwdt_init_timeout(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+       u32 preload = 65536 - wdev->timeout * priv->clks_per_sec;
+
+       rwdt_write(priv, preload, RWTCNT);
+
+       return 0;
+}
+
+/*
+ * Watchdog core .start callback: enable the clock, program the divider
+ * select, preload the counter and finally set RWTCSRA_TME.
+ *
+ * Returns 0 on success or the negative errno from clk_prepare_enable().
+ */
+static int rwdt_start(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+       int ret;
+
+       /*
+        * Bail out if the clock cannot be enabled: the register writes and
+        * the WRFLG poll below should not run without it.
+        */
+       ret = clk_prepare_enable(priv->clk);
+       if (ret)
+               return ret;
+
+       /* Divider select first, with RWTCSRA_TME still clear */
+       rwdt_write(priv, priv->cks, RWTCSRA);
+       rwdt_init_timeout(wdev);
+
+       /* Busy-wait until RWTCSRA_WRFLG reads back as clear */
+       while (readb_relaxed(priv->base + RWTCSRA) & RWTCSRA_WRFLG)
+               cpu_relax();
+
+       rwdt_write(priv, priv->cks | RWTCSRA_TME, RWTCSRA);
+
+       return 0;
+}
+
+/*
+ * Watchdog core .stop callback: rewrite RWTCSRA with only the divider select
+ * (RWTCSRA_TME left clear), then release the clock enabled in rwdt_start().
+ * Always returns 0.
+ */
+static int rwdt_stop(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+
+       rwdt_write(priv, priv->cks, RWTCSRA);
+       clk_disable_unprepare(priv->clk);
+
+       return 0;
+}
+
+/*
+ * Watchdog core .get_timeleft callback: read the 16-bit RWTCNT value and
+ * convert its distance to 65536 from ticks into seconds, rounded to the
+ * nearest second.
+ */
+static unsigned int rwdt_get_timeleft(struct watchdog_device *wdev)
+{
+       struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
+       u16 val = readw_relaxed(priv->base + RWTCNT);
+
+       return DIV_ROUND_CLOSEST(65536 - val, priv->clks_per_sec);
+}
+
+static const struct watchdog_info rwdt_ident = {
+       .options = WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT,
+       .identity = "Renesas WDT Watchdog",
+};
+
+static const struct watchdog_ops rwdt_ops = {
+       .owner = THIS_MODULE,
+       .start = rwdt_start,
+       .stop = rwdt_stop,
+       .ping = rwdt_init_timeout,
+       .get_timeleft = rwdt_get_timeleft,
+};
+
+/*
+ * Platform driver .probe callback: map the registers, derive the counter
+ * tick rate from the clock, fill in the watchdog_device and register it.
+ *
+ * Returns 0 on success or a negative errno. If watchdog_register_device()
+ * fails, the runtime PM reference taken here is dropped again before
+ * returning.
+ */
+static int rwdt_probe(struct platform_device *pdev)
+{
+       struct rwdt_priv *priv;
+       struct resource *res;
+       unsigned long rate;
+       unsigned int clks_per_sec;
+       int ret, i;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       priv->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       priv->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(priv->clk))
+               return PTR_ERR(priv->clk);
+
+       rate = clk_get_rate(priv->clk);
+       if (!rate)
+               return -ENOENT;
+
+       /*
+        * Walk the dividers from largest to smallest and take the first one
+        * that yields a non-zero tick rate.
+        * NOTE(review): rate is non-zero here and DIV_ROUND_UP() rounds up,
+        * so the first (largest) divider looks like it always matches and
+        * the -ERANGE path below looks unreachable - confirm this is intended.
+        */
+       for (i = ARRAY_SIZE(clk_divs) - 1; i >= 0; i--) {
+               clks_per_sec = DIV_ROUND_UP(rate, clk_divs[i]);
+               if (clks_per_sec) {
+                       priv->clks_per_sec = clks_per_sec;
+                       priv->cks = i;
+                       break;
+               }
+       }
+
+       if (!clks_per_sec) {
+               dev_err(&pdev->dev, "Can't find suitable clock divider\n");
+               return -ERANGE;
+       }
+
+       pm_runtime_enable(&pdev->dev);
+       pm_runtime_get_sync(&pdev->dev);
+
+       priv->wdev.info = &rwdt_ident;
+       priv->wdev.ops = &rwdt_ops;
+       priv->wdev.parent = &pdev->dev;
+       priv->wdev.min_timeout = 1;
+       /*
+        * rwdt_init_timeout() loads 65536 - timeout * clks_per_sec, so the
+        * timeout in seconds is capped at 65536 / clks_per_sec.
+        */
+       priv->wdev.max_timeout = 65536 / clks_per_sec;
+       priv->wdev.timeout = min(priv->wdev.max_timeout, RWDT_DEFAULT_TIMEOUT);
+
+       platform_set_drvdata(pdev, priv);
+       watchdog_set_drvdata(&priv->wdev, priv);
+       watchdog_set_nowayout(&priv->wdev, nowayout);
+
+       /* This overrides the default timeout only if DT configuration was found */
+       ret = watchdog_init_timeout(&priv->wdev, 0, &pdev->dev);
+       if (ret)
+               dev_warn(&pdev->dev, "Specified timeout value invalid, using default\n");
+
+       ret = watchdog_register_device(&priv->wdev);
+       if (ret < 0) {
+               /* Undo the runtime PM setup done above */
+               pm_runtime_put(&pdev->dev);
+               pm_runtime_disable(&pdev->dev);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Platform driver .remove callback: unregister the watchdog device, then drop
+ * the runtime PM reference and disable runtime PM, mirroring the
+ * pm_runtime_enable()/pm_runtime_get_sync() pair in rwdt_probe().
+ * Always returns 0.
+ */
+static int rwdt_remove(struct platform_device *pdev)
+{
+       struct rwdt_priv *priv = platform_get_drvdata(pdev);
+
+       watchdog_unregister_device(&priv->wdev);
+       pm_runtime_put(&pdev->dev);
+       pm_runtime_disable(&pdev->dev);
+
+       return 0;
+}
+
+/*
+ * This driver does also fit for R-Car Gen2 (r8a779[0-4]) WDT. However, for SMP
+ * to work there, one also needs a RESET (RST) driver which does not exist yet
+ * due to HW issues. This needs to be solved before adding compatibles here.
+ */
+static const struct of_device_id rwdt_ids[] = {
+       { .compatible = "renesas,rcar-gen3-wdt", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, rwdt_ids);
+
+static struct platform_driver rwdt_driver = {
+       .driver = {
+               .name = "renesas_wdt",
+               .of_match_table = rwdt_ids,
+       },
+       .probe = rwdt_probe,
+       .remove = rwdt_remove,
+};
+module_platform_driver(rwdt_driver);
+
+MODULE_DESCRIPTION("Renesas WDT Watchdog Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Wolfram Sang <wsa@sang-engineering.com>");
index f90812170657988b2089093765967b0dd5958965..517a733175ef84c8f8a7be3f79d8b97a3b6eed9e 100644 (file)
@@ -275,9 +275,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
                return rc;
        }
 
-       init_timer(&wdt->timer);
-       wdt->timer.function     = sh_wdt_ping;
-       wdt->timer.data         = (unsigned long)wdt;
+       setup_timer(&wdt->timer, sh_wdt_ping, (unsigned long)wdt);
        wdt->timer.expires      = next_ping_period(clock_division_ratio);
 
        dev_info(&pdev->dev, "initialized.\n");
index 6467b91f2245c6012d47f40d67179b74e92f0e8d..028618c5eebacfee0cbe59abbae4caad069068ec 100644 (file)
@@ -73,6 +73,13 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started."
 /*
  * Some TCO specific functions
  */
+
+/*
+ * Return true when @dev is an ATI SBx00 SMBus device with a revision below
+ * 0x40. Callers use this to pick the SP5100 register definitions instead of
+ * the SB800 ones.
+ */
+static bool tco_has_sp5100_reg_layout(struct pci_dev *dev)
+{
+       if (dev->device != PCI_DEVICE_ID_ATI_SBX00_SMBUS)
+               return false;
+
+       return dev->revision < 0x40;
+}
+
 static void tco_timer_start(void)
 {
        u32 val;
@@ -129,7 +136,7 @@ static void tco_timer_enable(void)
 {
        int val;
 
-       if (sp5100_tco_pci->revision >= 0x40) {
+       if (!tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                /* For SB800 or later */
                /* Set the Watchdog timer resolution to 1 sec */
                outb(SB800_PM_WATCHDOG_CONFIG, SB800_IO_PM_INDEX_REG);
@@ -342,8 +349,7 @@ static unsigned char sp5100_tco_setupdevice(void)
        /*
         * Determine type of southbridge chipset.
         */
-       if (sp5100_tco_pci->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
-           sp5100_tco_pci->revision < 0x40) {
+       if (tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                dev_name = SP5100_DEVNAME;
                index_reg = SP5100_IO_PM_INDEX_REG;
                data_reg = SP5100_IO_PM_DATA_REG;
@@ -388,8 +394,7 @@ static unsigned char sp5100_tco_setupdevice(void)
         * Secondly, Find the watchdog timer MMIO address
         * from SBResource_MMIO register.
         */
-       if (sp5100_tco_pci->device == PCI_DEVICE_ID_ATI_SBX00_SMBUS &&
-           sp5100_tco_pci->revision < 0x40) {
+       if (tco_has_sp5100_reg_layout(sp5100_tco_pci)) {
                /* Read SBResource_MMIO from PCI config(PCI_Reg: 9Ch) */
                pci_read_config_dword(sp5100_tco_pci,
                                      SP5100_SB_RESOURCE_MMIO_BASE, &val);
index 981a668b17e331ce47aafb95b87aabd8422a3cb6..7c3ba58ae1bee52022a0a9a2c06ab2a68f2d6ee9 100644 (file)
@@ -104,7 +104,7 @@ static void watchdog_check_min_max_timeout(struct watchdog_device *wdd)
  * timeout module parameter (if it is valid value) or the timeout-sec property
  * (only if it is a valid value and the timeout_parm is out of bounds).
  * If none of them are valid then we keep the old value (which should normally
- * be the default timeout value.
+ * be the default timeout value).
  *
  * A zero is returned on success and -EINVAL for failure.
  */
index e2c5abbb45ffd77d0b838fcebdab0536a637af8e..3595cffa24ea49877e05f21dc254589f67909553 100644 (file)
@@ -736,7 +736,6 @@ static int watchdog_release(struct inode *inode, struct file *file)
                watchdog_ping(wdd);
        }
 
-       cancel_delayed_work_sync(&wd_data->work);
        watchdog_update_worker(wdd);
 
        /* make sure that /dev/watchdog can be re-opened */
index 9b7a35c9e51ddd5a9d3bdb3319cfb9f1a35ff627..030e91b38e32bcf077713e31b7cc56a5a35e5924 100644 (file)
@@ -8,6 +8,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o                      := $(nostackp)
 
 CFLAGS_efi.o                           += -fshort-wchar
+LDFLAGS                                        += $(call ld-option, --no-wchar-size-warning)
 
 dom0-$(CONFIG_PCI) += pci.o
 dom0-$(CONFIG_USB_SUPPORT) += dbgp.o
index cb7138c97c692da92b71af1e6adc6539020702b4..71d49a95f8c0244f30c937b5d9275f0a4caede28 100644 (file)
@@ -487,7 +487,8 @@ static void eoi_pirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data))) {
+       if (unlikely(irqd_is_setaffinity_pending(data)) &&
+           likely(!irqd_irq_disabled(data))) {
                int masked = test_and_set_mask(evtchn);
 
                clear_evtchn(evtchn);
@@ -1370,7 +1371,8 @@ static void ack_dynirq(struct irq_data *data)
        if (!VALID_EVTCHN(evtchn))
                return;
 
-       if (unlikely(irqd_is_setaffinity_pending(data))) {
+       if (unlikely(irqd_is_setaffinity_pending(data)) &&
+           likely(!irqd_irq_disabled(data))) {
                int masked = test_and_set_mask(evtchn);
 
                clear_evtchn(evtchn);
index dc495383ad7335b2022f65df051700ca180de7d3..67939578cd6d0744bfc800e92bfc8f642b2cc7f5 100644 (file)
@@ -748,7 +748,7 @@ static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
        return rc;
 }
 
-#define GNTDEV_COPY_BATCH 24
+#define GNTDEV_COPY_BATCH 16
 
 struct gntdev_copy_batch {
        struct gnttab_copy ops[GNTDEV_COPY_BATCH];
index 6725f59c18e6b5f6b2ddcabaf1c83bd3d7c48406..b8fcb416be72983e77a11d033f044d14685f8e07 100644 (file)
@@ -52,6 +52,7 @@ config FS_DAX_PMD
        depends on FS_DAX
        depends on ZONE_DEVICE
        depends on TRANSPARENT_HUGEPAGE
+       depends on BROKEN
 
 endif # BLOCK
 
index 1089dbf259255e3c572ee1a15b6f16a509a20dfb..71ccab1d22c6133623ac640dffe30ad858afabe4 100644 (file)
@@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+/*
+ * Emit a rate-limited kernel log line of the form
+ * "<prefix>VFS (<sb->s_id>): <formatted message>".
+ *
+ * @sb:     superblock whose s_id identifies the filesystem in the message
+ * @prefix: string printed verbatim at the very start of the line
+ * @fmt:    printf-style format for the message body, followed by its arguments
+ */
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+       struct va_format vaf;
+       va_list args;
+
+       va_start(args, fmt);
+       /* Bundle fmt and args so %pV can expand them inside a single printk */
+       vaf.fmt = fmt;
+       vaf.va = &args;
+       printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+       va_end(args);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
        struct inode *inode = bdev->bd_inode;
@@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
-       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+       avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
        if (!avail)
                return -ERANGE;
        if (avail > 0 && avail & ~PAGE_MASK)
@@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+/**
+ * bdev_dax_supported() - Check if the device supports dax for a filesystem
+ * @sb: The superblock of the filesystem; sb->s_bdev is the device probed
+ * @blocksize: The filesystem block size; must equal PAGE_SIZE for dax
+ *
+ * This is a library function for filesystems to check whether the block
+ * device can be mounted with the dax option; failures are reported via vfs_msg().
+ *
+ * Return: 0 if supported; -EINVAL or the bdev_direct_access() errno if not.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+       struct blk_dax_ctl dax = {
+               .sector = 0,
+               .size = PAGE_SIZE,
+       };
+       int err;
+
+       if (blocksize != PAGE_SIZE) {
+               vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+               return -EINVAL;
+       }
+
+       err = bdev_direct_access(sb->s_bdev, &dax);
+       if (err < 0) {
+               switch (err) {
+               case -EOPNOTSUPP:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: device does not support dax");
+                       break;
+               case -EINVAL:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: unaligned partition for dax");
+                       break;
+               default:
+                       vfs_msg(sb, KERN_ERR,
+                               "error: dax access failed (%d)", err);
+               }
+               return err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return whether the raw device is capable of dax
+ * @bdev: The raw block device; its first and last pages are probed for dax
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+       struct blk_dax_ctl dax = {
+               .size = PAGE_SIZE,
+       };
+
+       if (!IS_ENABLED(CONFIG_FS_DAX))
+               return false;
+
+       dax.sector = 0;
+       if (bdev_direct_access(bdev, &dax) < 0)
+               return false;
+
+       dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+       if (bdev_direct_access(bdev, &dax) < 0)
+               return false;
+
+       return true;
+}
+
 /*
  * pseudo-fs
  */
@@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static bool blkdev_dax_capable(struct block_device *bdev)
-{
-       struct gendisk *disk = bdev->bd_disk;
-
-       if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
-               return false;
-
-       /*
-        * If the partition is not aligned on a page boundary, we can't
-        * do dax I/O to it.
-        */
-       if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-                       || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-               return false;
-
-       /*
-        * If the device has known bad blocks, force all I/O through the
-        * driver / page cache.
-        *
-        * TODO: support finer grained dax error handling
-        */
-       if (disk->bb && disk->bb->count)
-               return false;
-
-       return true;
-}
-
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
                        if (!ret) {
                                bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-                               if (!blkdev_dax_capable(bdev))
+                               if (!bdev_dax_capable(bdev))
                                        bdev->bd_inode->i_flags &= ~S_DAX;
                        }
 
@@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                goto out_clear;
                        }
                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-                       if (!blkdev_dax_capable(bdev))
+                       if (!bdev_dax_capable(bdev))
                                bdev->bd_inode->i_flags &= ~S_DAX;
                }
        } else {
index 43098cd9602bfcebab1095e03fe033533db5e8ba..eeb71e5de27aca7ef1dca3f3a22e3abaa64e467b 100644 (file)
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
        struct inode *inode = req->r_inode;
        struct ceph_osd_data *osd_data;
-       int rc = req->r_result;
-       int bytes = le32_to_cpu(msg->hdr.data_len);
+       int rc = req->r_result <= 0 ? req->r_result : 0;
+       int bytes = req->r_result >= 0 ? req->r_result : 0;
        int num_pages;
        int i;
 
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
        req->r_callback = finish_read;
        req->r_inode = inode;
 
-       ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
        dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
        ret = ceph_osdc_start_request(osdc, req, false);
        if (ret < 0)
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                                   truncate_seq, truncate_size,
                                   &inode->i_mtime, &page, 1);
        if (err < 0) {
-               dout("writepage setting page/mapping error %d %p\n", err, page);
+               struct writeback_control tmp_wbc;
+               if (!wbc)
+                       wbc = &tmp_wbc;
+               if (err == -ERESTARTSYS) {
+                       /* killed by SIGKILL */
+                       dout("writepage interrupted page %p\n", page);
+                       redirty_page_for_writepage(wbc, page);
+                       end_page_writeback(page);
+                       goto out;
+               }
+               dout("writepage setting page/mapping error %d %p\n",
+                    err, page);
                SetPageError(page);
                mapping_set_error(&inode->i_data, err);
-               if (wbc)
-                       wbc->pages_skipped++;
+               wbc->pages_skipped++;
        } else {
                dout("writepage cleaned page %p\n", page);
                err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
        BUG_ON(!inode);
        ihold(inode);
        err = writepage_nounlock(page, wbc);
+       if (err == -ERESTARTSYS) {
+               /* direct memory reclaimer was killed by SIGKILL. return 0
+                * to prevent caller from setting mapping/page error */
+               err = 0;
+       }
        unlock_page(page);
        iput(inode);
        return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-                             struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
        struct inode *inode = req->r_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        bool remove_page;
 
-
        dout("writepages_finish %p rc %d\n", inode, rc);
        if (rc < 0)
                mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
                                clear_bdi_congested(&fsc->backing_dev_info,
                                                    BLK_RW_ASYNC);
 
+                       if (rc < 0)
+                               SetPageError(page);
+
                        ceph_put_snap_context(page_snap_context(page));
                        page->private = 0;
                        ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
        if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-               pr_warn("writepage_start %p on forced umount\n", inode);
-               truncate_pagecache(inode, 0);
+               if (ci->i_wrbuffer_ref > 0) {
+                       pr_warn_ratelimited(
+                               "writepage_start %p %lld forced umount\n",
+                               inode, ceph_ino(inode));
+               }
                mapping_set_error(mapping, -EIO);
                return -EIO; /* we're in a forced umount, don't write! */
        }
@@ -1063,10 +1079,7 @@ new_request:
                        pages = NULL;
                }
 
-               vino = ceph_vino(inode);
-               ceph_osdc_build_request(req, offset, snapc, vino.snap,
-                                       &inode->i_mtime);
-
+               req->r_mtime = inode->i_mtime;
                rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
                BUG_ON(rc);
                req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
                mapping->writeback_index = index;
 
 out:
-       if (req)
-               ceph_osdc_put_request(req);
+       ceph_osdc_put_request(req);
        ceph_put_snap_context(snapc);
        dout("writepages done, rc = %d\n", rc);
        return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
                            struct page *page)
 {
        struct inode *inode = file_inode(file);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        loff_t page_off = pos & PAGE_MASK;
        int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
        int r;
        struct ceph_snap_context *snapc, *oldest;
 
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               dout(" page %p forced umount\n", page);
+               unlock_page(page);
+               return -EIO;
+       }
+
 retry_locked:
        /* writepages currently holds page lock, but if we change that later, */
        wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
                        snapc = ceph_get_snap_context(snapc);
                        unlock_page(page);
                        ceph_queue_writeback(inode);
-                       r = wait_event_interruptible(ci->i_cap_wq,
+                       r = wait_event_killable(ci->i_cap_wq,
                               context_is_writeable_or_written(inode, snapc));
                        ceph_put_snap_context(snapc);
                        if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
        .direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+       sigset_t mask;
+       siginitsetinv(&mask, sigmask(SIGKILL));
+       sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+       sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct page *pinned_page = NULL;
        loff_t off = vmf->pgoff << PAGE_SHIFT;
        int want, got, ret;
+       sigset_t oldset;
+
+       ceph_block_sigs(&oldset);
 
        dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
             inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_CACHE;
-       while (1) {
-               got = 0;
-               ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-                                   -1, &got, &pinned_page);
-               if (ret == 0)
-                       break;
-               if (ret != -ERESTARTSYS) {
-                       WARN_ON(1);
-                       return VM_FAULT_SIGBUS;
-               }
-       }
+
+       got = 0;
+       ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+       if (ret < 0)
+               goto out_restore;
+
        dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
             inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        ceph_put_cap_refs(ci, got);
 
        if (ret != -EAGAIN)
-               return ret;
+               goto out_restore;
 
        /* read inline data */
        if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                                                ~__GFP_FS));
                if (!page) {
                        ret = VM_FAULT_OOM;
-                       goto out;
+                       goto out_inline;
                }
                ret1 = __ceph_do_getattr(inode, page,
                                         CEPH_STAT_CAP_INLINE_DATA, true);
                if (ret1 < 0 || off >= i_size_read(inode)) {
                        unlock_page(page);
                        put_page(page);
-                       ret = VM_FAULT_SIGBUS;
-                       goto out;
+                       if (ret1 < 0)
+                               ret = ret1;
+                       else
+                               ret = VM_FAULT_SIGBUS;
+                       goto out_inline;
                }
                if (ret1 < PAGE_SIZE)
                        zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                SetPageUptodate(page);
                vmf->page = page;
                ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+               dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+                    inode, off, (size_t)PAGE_SIZE, ret);
        }
-out:
-       dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-            inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+       ceph_restore_sigs(&oldset);
+       if (ret < 0)
+               ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
        return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        loff_t size = i_size_read(inode);
        size_t len;
        int want, got, ret;
+       sigset_t oldset;
 
        prealloc_cf = ceph_alloc_cap_flush();
        if (!prealloc_cf)
-               return VM_FAULT_SIGBUS;
+               return VM_FAULT_OOM;
+
+       ceph_block_sigs(&oldset);
 
        if (ci->i_inline_version != CEPH_INLINE_NONE) {
                struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                ret = ceph_uninline_data(vma->vm_file, locked_page);
                if (locked_page)
                        unlock_page(locked_page);
-               if (ret < 0) {
-                       ret = VM_FAULT_SIGBUS;
+               if (ret < 0)
                        goto out_free;
-               }
        }
 
        if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
        else
                want = CEPH_CAP_FILE_BUFFER;
-       while (1) {
-               got = 0;
-               ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-                                   &got, NULL);
-               if (ret == 0)
-                       break;
-               if (ret != -ERESTARTSYS) {
-                       WARN_ON(1);
-                       ret = VM_FAULT_SIGBUS;
-                       goto out_free;
-               }
-       }
+
+       got = 0;
+       ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+                           &got, NULL);
+       if (ret < 0)
+               goto out_free;
+
        dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
             inode, off, len, ceph_cap_string(got));
 
        /* Update time before taking page lock */
        file_update_time(vma->vm_file);
 
-       lock_page(page);
+       do {
+               lock_page(page);
 
-       ret = VM_FAULT_NOPAGE;
-       if ((off > size) ||
-           (page->mapping != inode->i_mapping)) {
-               unlock_page(page);
-               goto out;
-       }
+               if ((off > size) || (page->mapping != inode->i_mapping)) {
+                       unlock_page(page);
+                       ret = VM_FAULT_NOPAGE;
+                       break;
+               }
+
+               ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+               if (ret >= 0) {
+                       /* success.  we'll keep the page locked. */
+                       set_page_dirty(page);
+                       ret = VM_FAULT_LOCKED;
+               }
+       } while (ret == -EAGAIN);
 
-       ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-       if (ret >= 0) {
-               /* success.  we'll keep the page locked. */
-               set_page_dirty(page);
-               ret = VM_FAULT_LOCKED;
-       } else {
-               if (ret == -ENOMEM)
-                       ret = VM_FAULT_OOM;
-               else
-                       ret = VM_FAULT_SIGBUS;
-       }
-out:
        if (ret == VM_FAULT_LOCKED ||
            ci->i_inline_version != CEPH_INLINE_NONE) {
                int dirty;
@@ -1495,8 +1523,10 @@ out:
             inode, off, len, ceph_cap_string(got), ret);
        ceph_put_cap_refs(ci, got);
 out_free:
+       ceph_restore_sigs(&oldset);
        ceph_free_cap_flush(prealloc_cf);
-
+       if (ret < 0)
+               ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
        return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                goto out;
        }
 
-       ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+       req->r_mtime = inode->i_mtime;
        err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                        goto out_put;
        }
 
-       ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+       req->r_mtime = inode->i_mtime;
        err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!err)
                err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
        rd_req->r_flags = CEPH_OSD_FLAG_READ;
        osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
        rd_req->r_base_oloc.pool = pool;
-       snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-                "%llx.00000000", ci->i_vino.ino);
-       rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+       ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+       err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+       if (err)
+               goto out_unlock;
 
        wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
                                         1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
                goto out_unlock;
        }
 
-       wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-                         CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
        osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-       wr_req->r_base_oloc.pool = pool;
-       wr_req->r_base_oid = rd_req->r_base_oid;
+       ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+       ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+       err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+       if (err)
+               goto out_unlock;
 
        /* one page should be large enough for STAT data */
        pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
        osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
                                     0, false, true);
-       ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-                               &ci->vfs_inode.i_mtime);
        err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-       ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-                               &ci->vfs_inode.i_mtime);
+       wr_req->r_mtime = ci->vfs_inode.i_mtime;
        err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
        if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 out_unlock:
        up_write(&mdsc->pool_perm_rwsem);
 
-       if (rd_req)
-               ceph_osdc_put_request(rd_req);
-       if (wr_req)
-               ceph_osdc_put_request(wr_req);
+       ceph_osdc_put_request(rd_req);
+       ceph_osdc_put_request(wr_req);
 out:
        if (!err)
                err = have;
index a351480dbabc95891e4b61f83fb92485f8ea7b18..c052b5bf219b54d3c3cd0b15f85a3f153c710fbf 100644 (file)
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
        unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
        return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
                (ci->i_fscache_gen == ci->i_rdcache_gen));
index cfaeef18cbcabc4baa818c54fb17ef4576f9368d..c17b5d76d75ee96515717ce29cc62971f80acca4 100644 (file)
@@ -1656,7 +1656,7 @@ retry_locked:
         */
        if ((!is_delayed || mdsc->stopping) &&
            !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
-           ci->i_wrbuffer_ref == 0 &&          /* no dirty pages... */
+           !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
            inode->i_data.nrpages &&            /* have cached pages */
            (revoking & (CEPH_CAP_FILE_CACHE|
                         CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
@@ -1698,8 +1698,8 @@ retry_locked:
 
                revoking = cap->implemented & ~cap->issued;
                dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-                    cap->mds, cap, ceph_cap_string(cap->issued),
-                    ceph_cap_string(cap_used),
+                    cap->mds, cap, ceph_cap_string(cap_used),
+                    ceph_cap_string(cap->issued),
                     ceph_cap_string(cap->implemented),
                     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
        /* make sure file is actually open */
        file_wanted = __ceph_caps_file_wanted(ci);
-       if ((file_wanted & need) == 0) {
+       if ((file_wanted & need) != need) {
                dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
                     ceph_cap_string(need), ceph_cap_string(file_wanted));
                *err = -EBADF;
@@ -2412,12 +2412,26 @@ again:
                        goto out_unlock;
                }
 
-               if (!__ceph_is_any_caps(ci) &&
-                   ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-                       dout("get_cap_refs %p forced umount\n", inode);
-                       *err = -EIO;
-                       ret = 1;
-                       goto out_unlock;
+               if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+                       int mds_wanted;
+                       if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+                           CEPH_MOUNT_SHUTDOWN) {
+                               dout("get_cap_refs %p forced umount\n", inode);
+                               *err = -EIO;
+                               ret = 1;
+                               goto out_unlock;
+                       }
+                       mds_wanted = __ceph_caps_mds_wanted(ci);
+                       if ((mds_wanted & need) != need) {
+                               dout("get_cap_refs %p caps were dropped"
+                                    " (session killed?)\n", inode);
+                               *err = -ESTALE;
+                               ret = 1;
+                               goto out_unlock;
+                       }
+                       if ((mds_wanted & file_wanted) ==
+                           (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+                               ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
                }
 
                dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                        if (err == -EAGAIN)
                                continue;
                        if (err < 0)
-                               return err;
+                               ret = err;
                } else {
                        ret = wait_event_interruptible(ci->i_cap_wq,
                                        try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                                continue;
                        if (err < 0)
                                ret = err;
-                       if (ret < 0)
-                               return ret;
+               }
+               if (ret < 0) {
+                       if (err == -ESTALE) {
+                               /* session was killed, try renew caps */
+                               ret = ceph_renew_caps(&ci->vfs_inode);
+                               if (ret == 0)
+                                       continue;
+                       }
+                       return ret;
                }
 
                if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
            ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-           !ci->i_wrbuffer_ref) {
+           !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
                if (try_nonblocking_invalidate(inode)) {
                        /* there were locked pages.. invalidate later
                           in a separate thread. */
@@ -3226,6 +3247,8 @@ retry:
 
        if (target < 0) {
                __ceph_remove_cap(cap, false);
+               if (!ci->i_auth_cap)
+                       ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
                goto out_unlock;
        }
 
index 31f831471ed287e470bc38fc8632b19d6aed61d2..39ff678e567fcb5c31d9729081119adaa4578def 100644 (file)
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
                                   path ? path : "");
                        spin_unlock(&req->r_old_dentry->d_lock);
                        kfree(path);
-               } else if (req->r_path2) {
+               } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
                        if (req->r_ino2.ino)
                                seq_printf(s, " #%llx/%s", req->r_ino2.ino,
                                           req->r_path2);
index 3ab1192d2029301da26ce411987287aa3b309f1d..6e0fedf6713b5130af5c79cdb14649e90ba8427c 100644 (file)
@@ -70,16 +70,42 @@ out_unlock:
 }
 
 /*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * f_pos encoding for readdir:
+ * - hash order:
+ *     (0xff << 52) | ((24-bit hash) << 28) |
+ *     (n, for the nth entry with a colliding hash);
+ * - frag+name order:
+ *     ((frag value) << 28) | (n, for the nth entry in the frag);
  */
+#define OFFSET_BITS    28
+#define OFFSET_MASK    ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER     (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+       loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+       if (hash_order)
+               fpos |= HASH_ORDER;
+       return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+       return (p & HASH_ORDER) == HASH_ORDER;
+}
+
 static unsigned fpos_frag(loff_t p)
 {
-       return p >> 32;
+       return p >> OFFSET_BITS;
 }
+
+static unsigned fpos_hash(loff_t p)
+{
+       return ceph_frag_value(fpos_frag(p));
+}
+
 static unsigned fpos_off(loff_t p)
 {
-       return p & 0xffffffff;
+       return p & OFFSET_MASK;
 }
 
 static int fpos_cmp(loff_t l, loff_t r)
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
        return 0;
 }
 
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+                       struct ceph_readdir_cache_control *cache_ctl)
+{
+       struct inode *dir = d_inode(parent);
+       struct dentry *dentry;
+       unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+       loff_t ptr_pos = idx * sizeof(struct dentry *);
+       pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+
+       if (ptr_pos >= i_size_read(dir))
+               return NULL;
+
+       if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+               ceph_readdir_cache_release(cache_ctl);
+               cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
+               if (!cache_ctl->page) {
+                       dout(" page %lu not found\n", ptr_pgoff);
+                       return ERR_PTR(-EAGAIN);
+               }
+               /* reading/filling the cache are serialized by
+                  i_mutex, no need to use page lock */
+               unlock_page(cache_ctl->page);
+               cache_ctl->dentries = kmap(cache_ctl->page);
+       }
+
+       cache_ctl->index = idx & idx_mask;
+
+       rcu_read_lock();
+       spin_lock(&parent->d_lock);
+       /* check i_size again here, because empty directory can be
+        * marked as complete while not holding the i_mutex. */
+       if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+               dentry = cache_ctl->dentries[cache_ctl->index];
+       else
+               dentry = NULL;
+       spin_unlock(&parent->d_lock);
+       if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+               dentry = NULL;
+       rcu_read_unlock();
+       return dentry ? : ERR_PTR(-EAGAIN);
+}
+
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        struct inode *dir = d_inode(parent);
        struct dentry *dentry, *last = NULL;
        struct ceph_dentry_info *di;
-       unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
-       int err = 0;
-       loff_t ptr_pos = 0;
        struct ceph_readdir_cache_control cache_ctl = {};
+       u64 idx = 0;
+       int err = 0;
 
-       dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
+       dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
+
+       /* search start position */
+       if (ctx->pos > 2) {
+               u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+               while (count > 0) {
+                       u64 step = count >> 1;
+                       dentry = __dcache_find_get_entry(parent, idx + step,
+                                                        &cache_ctl);
+                       if (!dentry) {
+                               /* use linear search */
+                               idx = 0;
+                               break;
+                       }
+                       if (IS_ERR(dentry)) {
+                               err = PTR_ERR(dentry);
+                               goto out;
+                       }
+                       di = ceph_dentry(dentry);
+                       spin_lock(&dentry->d_lock);
+                       if (fpos_cmp(di->offset, ctx->pos) < 0) {
+                               idx += step + 1;
+                               count -= step + 1;
+                       } else {
+                               count = step;
+                       }
+                       spin_unlock(&dentry->d_lock);
+                       dput(dentry);
+               }
 
-       /* we can calculate cache index for the first dirfrag */
-       if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
-               cache_ctl.index = fpos_off(ctx->pos) - 2;
-               BUG_ON(cache_ctl.index < 0);
-               ptr_pos = cache_ctl.index * sizeof(struct dentry *);
+               dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
        }
 
-       while (true) {
-               pgoff_t pgoff;
-               bool emit_dentry;
 
-               if (ptr_pos >= i_size_read(dir)) {
+       for (;;) {
+               bool emit_dentry = false;
+               dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+               if (!dentry) {
                        fi->flags |= CEPH_F_ATEND;
                        err = 0;
                        break;
                }
-
-               err = -EAGAIN;
-               pgoff = ptr_pos >> PAGE_SHIFT;
-               if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
-                       ceph_readdir_cache_release(&cache_ctl);
-                       cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
-                       if (!cache_ctl.page) {
-                               dout(" page %lu not found\n", pgoff);
-                               break;
-                       }
-                       /* reading/filling the cache are serialized by
-                        * i_mutex, no need to use page lock */
-                       unlock_page(cache_ctl.page);
-                       cache_ctl.dentries = kmap(cache_ctl.page);
+               if (IS_ERR(dentry)) {
+                       err = PTR_ERR(dentry);
+                       goto out;
                }
 
-               rcu_read_lock();
-               spin_lock(&parent->d_lock);
-               /* check i_size again here, because empty directory can be
-                * marked as complete while not holding the i_mutex. */
-               if (ceph_dir_is_complete_ordered(dir) &&
-                   ptr_pos < i_size_read(dir))
-                       dentry = cache_ctl.dentries[cache_ctl.index % nsize];
-               else
-                       dentry = NULL;
-               spin_unlock(&parent->d_lock);
-               if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
-                       dentry = NULL;
-               rcu_read_unlock();
-               if (!dentry)
-                       break;
-
-               emit_dentry = false;
                di = ceph_dentry(dentry);
                spin_lock(&dentry->d_lock);
                if (di->lease_shared_gen == shared_gen &&
                    d_really_is_positive(dentry) &&
-                   ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
-                   ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
                    fpos_cmp(ctx->pos, di->offset) <= 0) {
                        emit_dentry = true;
                }
                spin_unlock(&dentry->d_lock);
 
                if (emit_dentry) {
-                       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+                       dout(" %llx dentry %p %pd %p\n", di->offset,
                             dentry, dentry, d_inode(dentry));
                        ctx->pos = di->offset;
                        if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
                } else {
                        dput(dentry);
                }
-
-               cache_ctl.index++;
-               ptr_pos += sizeof(struct dentry *);
        }
+out:
        ceph_readdir_cache_release(&cache_ctl);
        if (last) {
                int ret;
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        return err;
 }
 
+/*
+ * Report whether a new readdir request has to be sent for @pos: true
+ * when no reply is buffered in @fi, or when the buffered frag does not
+ * cover @pos (by hash value for hash-ordered positions, by frag id
+ * otherwise).
+ */
+static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
+{
+       bool same_frag;
+
+       if (fi->last_readdir == NULL)
+               return true;
+
+       same_frag = is_hash_order(pos) ?
+                   ceph_frag_contains_value(fi->frag, fpos_hash(pos)) :
+                   fi->frag == fpos_frag(pos);
+       return !same_frag;
+}
+
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
        struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_mds_client *mdsc = fsc->mdsc;
-       unsigned frag = fpos_frag(ctx->pos);
-       int off = fpos_off(ctx->pos);
+       int i;
        int err;
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;
 
-       dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
+       dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
        if (fi->flags & CEPH_F_ATEND)
                return 0;
 
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                            inode->i_mode >> 12))
                        return 0;
                ctx->pos = 1;
-               off = 1;
        }
        if (ctx->pos == 1) {
                ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                            inode->i_mode >> 12))
                        return 0;
                ctx->pos = 2;
-               off = 2;
        }
 
        /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                err = __dcache_readdir(file, ctx, shared_gen);
                if (err != -EAGAIN)
                        return err;
-               frag = fpos_frag(ctx->pos);
-               off = fpos_off(ctx->pos);
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        /* proceed with a normal readdir */
 more:
        /* do we have the correct frag content buffered? */
-       if (fi->frag != frag || fi->last_readdir == NULL) {
+       if (need_send_readdir(fi, ctx->pos)) {
                struct ceph_mds_request *req;
+               unsigned frag;
                int op = ceph_snap(inode) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
@@ -305,6 +372,13 @@ more:
                        fi->last_readdir = NULL;
                }
 
+               if (is_hash_order(ctx->pos)) {
+                       frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+                                               NULL, NULL);
+               } else {
+                       frag = fpos_frag(ctx->pos);
+               }
+
                dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
                     ceph_vinop(inode), frag, fi->last_name);
                req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
                req->r_readdir_cache_idx = fi->readdir_cache_idx;
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
+               req->r_args.readdir.flags =
+                               cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
 
                req->r_inode = inode;
                ihold(inode);
@@ -340,22 +416,26 @@ more:
                        ceph_mdsc_put_request(req);
                        return err;
                }
-               dout("readdir got and parsed readdir result=%d"
-                    " on frag %x, end=%d, complete=%d\n", err, frag,
+               dout("readdir got and parsed readdir result=%d on "
+                    "frag %x, end=%d, complete=%d, hash_order=%d\n",
+                    err, frag,
                     (int)req->r_reply_info.dir_end,
-                    (int)req->r_reply_info.dir_complete);
-
+                    (int)req->r_reply_info.dir_complete,
+                    (int)req->r_reply_info.hash_order);
 
-               /* note next offset and last dentry name */
                rinfo = &req->r_reply_info;
                if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                        frag = le32_to_cpu(rinfo->dir_dir->frag);
-                       off = req->r_readdir_offset;
-                       fi->next_offset = off;
+                       if (!rinfo->hash_order) {
+                               fi->next_offset = req->r_readdir_offset;
+                               /* adjust ctx->pos to beginning of frag */
+                               ctx->pos = ceph_make_fpos(frag,
+                                                         fi->next_offset,
+                                                         false);
+                       }
                }
 
                fi->frag = frag;
-               fi->offset = fi->next_offset;
                fi->last_readdir = req;
 
                if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
                                fi->dir_ordered_count = 0;
-                       } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+                       } else if (ceph_frag_is_leftmost(frag) &&
+                                  fi->next_offset == 2) {
                                /* note dir version at start of readdir so
                                 * we can tell if any dentries get dropped */
                                fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
                        fi->dir_release_count = 0;
                }
 
-               if (req->r_reply_info.dir_end) {
-                       kfree(fi->last_name);
-                       fi->last_name = NULL;
-                       if (ceph_frag_is_rightmost(frag))
-                               fi->next_offset = 2;
-                       else
-                               fi->next_offset = 0;
-               } else {
-                       err = note_last_dentry(fi,
-                                      rinfo->dir_dname[rinfo->dir_nr-1],
-                                      rinfo->dir_dname_len[rinfo->dir_nr-1],
-                                      fi->next_offset + rinfo->dir_nr);
+               /* note next offset and last dentry name */
+               if (rinfo->dir_nr > 0) {
+                       struct ceph_mds_reply_dir_entry *rde =
+                                       rinfo->dir_entries + (rinfo->dir_nr-1);
+                       unsigned next_offset = req->r_reply_info.dir_end ?
+                                       2 : (fpos_off(rde->offset) + 1);
+                       err = note_last_dentry(fi, rde->name, rde->name_len,
+                                              next_offset);
                        if (err)
                                return err;
+               } else if (req->r_reply_info.dir_end) {
+                       fi->next_offset = 2;
+                       /* keep last name */
                }
        }
 
        rinfo = &fi->last_readdir->r_reply_info;
-       dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
-            rinfo->dir_nr, off, fi->offset);
-
-       ctx->pos = ceph_make_fpos(frag, off);
-       while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
-               struct ceph_mds_reply_inode *in =
-                       rinfo->dir_in[off - fi->offset].in;
+       dout("readdir frag %x num %d pos %llx chunk first %llx\n",
+            fi->frag, rinfo->dir_nr, ctx->pos,
+            rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+       i = 0;
+       /* search start position */
+       if (rinfo->dir_nr > 0) {
+               int step, nr = rinfo->dir_nr;
+               while (nr > 0) {
+                       step = nr >> 1;
+                       if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+                               i +=  step + 1;
+                               nr -= step + 1;
+                       } else {
+                               nr = step;
+                       }
+               }
+       }
+       for (; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                ino_t ino;
 
-               dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
-                    off, off - fi->offset, rinfo->dir_nr, ctx->pos,
-                    rinfo->dir_dname_len[off - fi->offset],
-                    rinfo->dir_dname[off - fi->offset], in);
-               BUG_ON(!in);
-               ftype = le32_to_cpu(in->mode) >> 12;
-               vino.ino = le64_to_cpu(in->ino);
-               vino.snap = le64_to_cpu(in->snapid);
+               BUG_ON(rde->offset < ctx->pos);
+
+               ctx->pos = rde->offset;
+               dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
+                    i, rinfo->dir_nr, ctx->pos,
+                    rde->name_len, rde->name, &rde->inode.in);
+
+               BUG_ON(!rde->inode.in);
+               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
                ino = ceph_vino_to_ino(vino);
-               if (!dir_emit(ctx,
-                           rinfo->dir_dname[off - fi->offset],
-                           rinfo->dir_dname_len[off - fi->offset],
-                           ceph_translate_ino(inode->i_sb, ino), ftype)) {
+
+               if (!dir_emit(ctx, rde->name, rde->name_len,
+                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
                        dout("filldir stopping us...\n");
                        return 0;
                }
-               off++;
                ctx->pos++;
        }
 
-       if (fi->last_name) {
+       if (fi->next_offset > 2) {
                ceph_mdsc_put_request(fi->last_readdir);
                fi->last_readdir = NULL;
                goto more;
        }
 
        /* more frags? */
-       if (!ceph_frag_is_rightmost(frag)) {
-               frag = ceph_frag_next(frag);
-               off = 0;
-               ctx->pos = ceph_make_fpos(frag, off);
+       if (!ceph_frag_is_rightmost(fi->frag)) {
+               unsigned frag = ceph_frag_next(fi->frag);
+               if (is_hash_order(ctx->pos)) {
+                       loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+                                                       fi->next_offset, true);
+                       if (new_pos > ctx->pos)
+                               ctx->pos = new_pos;
+                       /* keep last_name */
+               } else {
+                       ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
+                       kfree(fi->last_name);
+                       fi->last_name = NULL;
+               }
                dout("readdir next frag is %x\n", frag);
                goto more;
        }
@@ -467,7 +570,7 @@ more:
        return 0;
 }
 
-static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
+static void reset_readdir(struct ceph_file_info *fi)
 {
        if (fi->last_readdir) {
                ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
        fi->last_name = NULL;
        fi->dir_release_count = 0;
        fi->readdir_cache_idx = -1;
-       if (ceph_frag_is_leftmost(frag))
-               fi->next_offset = 2;  /* compensate for . and .. */
-       else
-               fi->next_offset = 0;
+       fi->next_offset = 2;  /* compensate for . and .. */
        fi->flags &= ~CEPH_F_ATEND;
 }
 
+/*
+ * Decide whether buffered readdir content must be discarded for a seek
+ * to @new_pos.  Returns true on seekdir(0), on a seek to a different
+ * frag (frag-ordered positions only), when no non-empty reply is
+ * buffered, on a seek prior to the current chunk, or when @new_pos and
+ * the buffered chunk disagree on hash order.  @fi is not modified.
+ */
+static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
+{
+       struct ceph_mds_reply_info_parsed *rinfo;
+       loff_t chunk_offset;
+       if (new_pos == 0)
+               return true;
+       if (is_hash_order(new_pos)) {
+               /* no need to reset last_name for a forward seek when
+                * dentries are sorted in hash order */
+       } else if (fi->frag != fpos_frag(new_pos)) {
+               /* compare only; '|=' here would overwrite fi->frag */
+               return true;
+       }
+       /* without a buffered, non-empty reply there is nothing to reuse */
+       rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+       if (!rinfo || !rinfo->dir_nr)
+               return true;
+       /* offset of the first entry in the buffered chunk */
+       chunk_offset = rinfo->dir_entries[0].offset;
+       return new_pos < chunk_offset ||
+              is_hash_order(new_pos) != is_hash_order(chunk_offset);
+}
+
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 {
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_mapping->host;
-       loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
        loff_t retval;
 
        inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        }
 
        if (offset >= 0) {
+               if (need_reset_readdir(fi, offset)) {
+                       dout("dir_llseek dropping %p content\n", file);
+                       reset_readdir(fi);
+               } else if (is_hash_order(offset) && offset > file->f_pos) {
+                       /* for hash offset, we don't know if a forward seek
+                        * is within same frag */
+                       fi->dir_release_count = 0;
+                       fi->readdir_cache_idx = -1;
+               }
+
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                        fi->flags &= ~CEPH_F_ATEND;
                }
                retval = offset;
-
-               if (offset == 0 ||
-                   fpos_frag(offset) != fi->frag ||
-                   fpos_off(offset) < fi->offset) {
-                       /* discard buffered readdir content on seekdir(0), or
-                        * seek to new frag, or seek prior to current chunk */
-                       dout("dir_llseek dropping %p content\n", file);
-                       reset_readdir(fi, fpos_frag(offset));
-               } else if (fpos_cmp(offset, old_offset) > 0) {
-                       /* reset dir_release_count if we did a forward seek */
-                       fi->dir_release_count = 0;
-                       fi->readdir_cache_idx = -1;
-               }
        }
 out:
        inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
        return dentry;
 }
 
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
 {
        return ceph_ino(inode) == CEPH_INO_ROOT &&
                strncmp(dentry->d_name.name, ".ceph", 5) == 0;
index 4f1dc7120916be699939ce03d46374bde72f6cca..a888df6f2d71b99a2e01890c25ff40d48c4dd649 100644 (file)
@@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
        return ret;
 }
 
+/*
+ * Try to renew caps after the session gets killed.
+ *
+ * If the inode still holds real caps and either no write caps are
+ * wanted or an auth cap is present, only ceph_check_caps() is called.
+ * Otherwise an open request whose flags are derived from the wanted
+ * file caps is issued through ceph_mdsc_do_request().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ceph_renew_caps(struct inode *inode)
+{
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_request *req;
+       int err, flags, wanted;
+
+       spin_lock(&ci->i_ceph_lock);
+       wanted = __ceph_caps_file_wanted(ci);
+       /* "!(x) == 0" would invert the test and match when write caps
+        * ARE wanted; the intent is "no write caps wanted". */
+       if (__ceph_is_any_real_caps(ci) &&
+           (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
+               int issued = __ceph_caps_issued(ci, NULL);
+               spin_unlock(&ci->i_ceph_lock);
+               dout("renew caps %p want %s issued %s updating mds_wanted\n",
+                    inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+               ceph_check_caps(ci, 0, NULL);
+               return 0;
+       }
+       spin_unlock(&ci->i_ceph_lock);
+
+       /* translate the wanted file caps into open(2)-style flags */
+       flags = 0;
+       if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+               flags = O_RDWR;
+       else if (wanted & CEPH_CAP_FILE_RD)
+               flags = O_RDONLY;
+       else if (wanted & CEPH_CAP_FILE_WR)
+               flags = O_WRONLY;
+#ifdef O_LAZY
+       if (wanted & CEPH_CAP_FILE_LAZYIO)
+               flags |= O_LAZY;
+#endif
+
+       req = prepare_open_request(inode->i_sb, flags, 0);
+       if (IS_ERR(req)) {
+               err = PTR_ERR(req);
+               goto out;
+       }
+
+       /* the request holds its own inode reference */
+       req->r_inode = inode;
+       ihold(inode);
+       req->r_num_caps = 1;
+       req->r_fmode = -1;
+
+       err = ceph_mdsc_do_request(mdsc, NULL, req);
+       ceph_mdsc_put_request(req);
+out:
+       dout("renew caps %p open result=%d\n", inode, err);
+       /* positive results are reported to the caller as success */
+       return err < 0 ? err : 0;
+}
+
 /*
  * If we already have the requisite capabilities, we can satisfy
  * the open request locally (no need to request new caps from the
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
        kfree(aio_req);
 }
 
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
-                                 struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
 {
        int rc = req->r_result;
        struct inode *inode = req->r_inode;
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
        req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
                        CEPH_OSD_FLAG_ONDISK |
                        CEPH_OSD_FLAG_WRITE;
-       req->r_base_oloc = orig_req->r_base_oloc;
-       req->r_base_oid = orig_req->r_base_oid;
+       ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+       ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+
+       ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+       if (ret) {
+               ceph_osdc_put_request(req);
+               req = orig_req;
+               goto out;
+       }
 
        req->r_ops[0] = orig_req->r_ops[0];
        osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
-       ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
-                               snapc, CEPH_NOSNAP, &aio_req->mtime);
+       req->r_mtime = aio_req->mtime;
+       req->r_data_offset = req->r_ops[0].extent.offset;
 
        ceph_osdc_put_request(orig_req);
 
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 out:
        if (ret < 0) {
                req->r_result = ret;
-               ceph_aio_complete_req(req, NULL);
+               ceph_aio_complete_req(req);
        }
 
        ceph_put_snap_context(snapc);
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
                list_add_tail(&req->r_unsafe_item,
                              &ci->i_unsafe_writes);
                spin_unlock(&ci->i_unsafe_lock);
+
+               complete_all(&req->r_completion);
        } else {
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_item);
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                                        (pos+len) | (PAGE_SIZE - 1));
 
                        osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+                       req->r_mtime = mtime;
                }
 
-
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
                                                 false, false);
 
-               ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
                if (aio_req) {
                        aio_req->total_len += len;
                        aio_req->num_reqs++;
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                                                              req, false);
                        if (ret < 0) {
                                req->r_result = ret;
-                               ceph_aio_complete_req(req, NULL);
+                               ceph_aio_complete_req(req);
                        }
                }
                return -EIOCBQUEUED;
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
                                                false, true);
 
-               /* BUG_ON(vino.snap != CEPH_NOSNAP); */
-               ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
-
+               req->r_mtime = mtime;
                ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
                if (!ret)
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
                goto out;
        }
 
-       ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap,
-                               &inode->i_mtime);
-
+       req->r_mtime = inode->i_mtime;
        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
        if (!ret) {
                ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
index e669cfa9d79371186fbecffd0ac0f384faa74a2f..f059b5997072c399fcf6cdccbfcc806d4605a581 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/random.h>
+#include <linux/sort.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
                diri_auth = ci->i_auth_cap->mds;
        spin_unlock(&ci->i_ceph_lock);
 
+       if (mds == -1) /* CDIR_AUTH_PARENT */
+               mds = diri_auth;
+
        mutex_lock(&ci->i_fragtree_mutex);
        if (ndist == 0 && mds == diri_auth) {
                /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
        return err;
 }
 
+/*
+ * sort() comparison callback: order two ceph_frag_tree_split records
+ * by frag id.  The frag field is stored little-endian (other code in
+ * this file reads it with le32_to_cpu()), so convert to host order
+ * before comparing; otherwise the order is wrong on big-endian hosts.
+ */
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+       const struct ceph_frag_tree_split *ls = l;
+       const struct ceph_frag_tree_split *rs = r;
+
+       return ceph_frag_compare(le32_to_cpu(ls->frag),
+                                le32_to_cpu(rs->frag));
+}
+
+/*
+ * Check whether frag id @f is a direct child of split node @frag:
+ * its bit count must equal the parent's bit count plus the parent's
+ * split_by, and its value must be contained in the parent.  With no
+ * parent (@frag == NULL) only ceph_frag_make(0, 0) qualifies.
+ */
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+       if (frag == NULL)
+               return ceph_frag_make(0, 0) == f;
+
+       return ceph_frag_bits(f) ==
+                       ceph_frag_bits(frag->frag) + frag->split_by &&
+              ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
 static int ceph_fill_fragtree(struct inode *inode,
                              struct ceph_frag_tree_head *fragtree,
                              struct ceph_mds_reply_dirfrag *dirinfo)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_inode_frag *frag;
+       struct ceph_inode_frag *frag, *prev_frag = NULL;
        struct rb_node *rb_node;
-       int i;
-       u32 id, nsplits;
+       unsigned i, split_by, nsplits;
+       u32 id;
        bool update = false;
 
        mutex_lock(&ci->i_fragtree_mutex);
        nsplits = le32_to_cpu(fragtree->nsplits);
-       if (nsplits) {
+       if (nsplits != ci->i_fragtree_nsplits) {
+               update = true;
+       } else if (nsplits) {
                i = prandom_u32() % nsplits;
                id = le32_to_cpu(fragtree->splits[i].frag);
                if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
        if (!update)
                goto out_unlock;
 
+       if (nsplits > 1) {
+               sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+                    frag_tree_split_cmp, NULL);
+       }
+
        dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
        rb_node = rb_first(&ci->i_fragtree);
        for (i = 0; i < nsplits; i++) {
                id = le32_to_cpu(fragtree->splits[i].frag);
+               split_by = le32_to_cpu(fragtree->splits[i].by);
+               if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+                       pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
+                              "frag %x split by %d\n", ceph_vinop(inode),
+                              i, nsplits, id, split_by);
+                       continue;
+               }
                frag = NULL;
                while (rb_node) {
                        frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
                                break;
                        }
                        rb_node = rb_next(rb_node);
-                       rb_erase(&frag->node, &ci->i_fragtree);
-                       kfree(frag);
+                       /* delete stale split/leaf node */
+                       if (frag->split_by > 0 ||
+                           !is_frag_child(frag->frag, prev_frag)) {
+                               rb_erase(&frag->node, &ci->i_fragtree);
+                               if (frag->split_by > 0)
+                                       ci->i_fragtree_nsplits--;
+                               kfree(frag);
+                       }
                        frag = NULL;
                }
                if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
                        if (IS_ERR(frag))
                                continue;
                }
-               frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+               if (frag->split_by == 0)
+                       ci->i_fragtree_nsplits++;
+               frag->split_by = split_by;
                dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+               prev_frag = frag;
        }
        while (rb_node) {
                frag = rb_entry(rb_node, struct ceph_inode_frag, node);
                rb_node = rb_next(rb_node);
-               rb_erase(&frag->node, &ci->i_fragtree);
-               kfree(frag);
+               /* delete stale split/leaf node */
+               if (frag->split_by > 0 ||
+                   !is_frag_child(frag->frag, prev_frag)) {
+                       rb_erase(&frag->node, &ci->i_fragtree);
+                       if (frag->split_by > 0)
+                               ci->i_fragtree_nsplits--;
+                       kfree(frag);
+               }
        }
 out_unlock:
        mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
                rb_erase(n, &ci->i_fragtree);
                kfree(frag);
        }
+       ci->i_fragtree_nsplits = 0;
 
        __ceph_destroy_xattrs(ci);
        if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+       return (size + (1<<9) - 1) >> 9;
+}
+
 /*
  * Helpers to fill in size, ctime, mtime, and atime.  We have to be
  * careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                        size = 0;
                }
                i_size_write(inode, size);
-               inode->i_blocks = (size + (1<<9) - 1) >> 9;
+               inode->i_blocks = calc_inode_blocks(size);
                ci->i_reported_size = size;
                if (truncate_seq != ci->i_truncate_seq) {
                        dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
                        spin_unlock(&ci->i_ceph_lock);
 
-                       err = -EINVAL;
-                       if (WARN_ON(symlen != i_size_read(inode)))
-                               goto out;
+                       if (symlen != i_size_read(inode)) {
+                               pr_err("fill_inode %llx.%llx BAD symlink "
+                                       "size %lld\n", ceph_vinop(inode),
+                                       i_size_read(inode));
+                               i_size_write(inode, symlen);
+                               inode->i_blocks = calc_inode_blocks(symlen);
+                       }
 
                        err = -ENOMEM;
                        sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
        int i, err = 0;
 
        for (i = 0; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                struct inode *in;
                int rc;
 
-               vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
-               vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                in = ceph_get_inode(req->r_dentry->d_sb, vino);
                if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
                        dout("new_inode badness got %d\n", err);
                        continue;
                }
-               rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+               rc = fill_inode(in, NULL, &rde->inode, NULL, session,
                                req->r_request_started, -1,
                                &req->r_caps_reservation);
                if (rc < 0) {
                        pr_err("fill_inode badness on %p got %d\n", in, rc);
                        err = rc;
-                       continue;
                }
+               iput(in);
        }
 
        return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                             struct ceph_mds_session *session)
 {
        struct dentry *parent = req->r_dentry;
+       struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct qstr dname;
        struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        int err = 0, skipped = 0, ret, i;
        struct inode *snapdir = NULL;
        struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-       struct ceph_dentry_info *di;
        u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+       u32 last_hash = 0;
+       u32 fpos_offset;
        struct ceph_readdir_cache_control cache_ctl = {};
 
        if (req->r_aborted)
                return readdir_prepopulate_inodes_only(req, session);
 
+       if (rinfo->hash_order && req->r_path2) {
+               last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+                                         req->r_path2, strlen(req->r_path2));
+               last_hash = ceph_frag_value(last_hash);
+       }
+
        if (rinfo->dir_dir &&
            le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                dout("readdir_prepopulate got new frag %x -> %x\n",
                     frag, le32_to_cpu(rinfo->dir_dir->frag));
                frag = le32_to_cpu(rinfo->dir_dir->frag);
-               if (ceph_frag_is_leftmost(frag))
+               if (!rinfo->hash_order)
                        req->r_readdir_offset = 2;
-               else
-                       req->r_readdir_offset = 0;
        }
 
        if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
                /* note dir version at start of readdir so we can tell
                 * if any dentries get dropped */
-               struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
                req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
                req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
                req->r_readdir_cache_idx = 0;
        }
 
        cache_ctl.index = req->r_readdir_cache_idx;
+       fpos_offset = req->r_readdir_offset;
 
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
+               struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
 
-               dname.name = rinfo->dir_dname[i];
-               dname.len = rinfo->dir_dname_len[i];
+               dname.name = rde->name;
+               dname.len = rde->name_len;
                dname.hash = full_name_hash(dname.name, dname.len);
 
-               vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
-               vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+               vino.ino = le64_to_cpu(rde->inode.in->ino);
+               vino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+               if (rinfo->hash_order) {
+                       u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+                                                rde->name, rde->name_len);
+                       hash = ceph_frag_value(hash);
+                       if (hash != last_hash)
+                               fpos_offset = 2;
+                       last_hash = hash;
+                       rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+               } else {
+                       rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+               }
 
 retry_lookup:
                dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
                        }
                }
 
-               ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+               ret = fill_inode(in, NULL, &rde->inode, NULL, session,
                                 req->r_request_started, -1,
                                 &req->r_caps_reservation);
                if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
                        dn = realdn;
                }
 
-               di = dn->d_fsdata;
-               di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+               ceph_dentry(dn)->offset = rde->offset;
 
-               update_dentry_lease(dn, rinfo->dir_dlease[i],
-                                   req->r_session,
+               update_dentry_lease(dn, rde->lease, req->r_session,
                                    req->r_request_started);
 
                if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
        spin_lock(&ci->i_ceph_lock);
        dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
        i_size_write(inode, size);
-       inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+       inode->i_blocks = calc_inode_blocks(size);
 
        /* tell the MDS if we are approaching max_size */
        if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
        struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
                                                  i_pg_inv_work);
        struct inode *inode = &ci->vfs_inode;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        u32 orig_gen;
        int check = 0;
 
        mutex_lock(&ci->i_truncate_mutex);
+
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
+                                   inode, ceph_ino(inode));
+               mapping_set_error(inode->i_mapping, -EIO);
+               truncate_pagecache(inode, 0);
+               mutex_unlock(&ci->i_truncate_mutex);
+               goto out;
+       }
+
        spin_lock(&ci->i_ceph_lock);
        dout("invalidate_pages %p gen %d revoking %d\n", inode,
             ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
        orig_gen = ci->i_rdcache_gen;
        spin_unlock(&ci->i_ceph_lock);
 
-       truncate_pagecache(inode, 0);
+       if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+               pr_err("invalidate_pages %p fails\n", inode);
+       }
 
        spin_lock(&ci->i_ceph_lock);
        if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
                if ((issued & CEPH_CAP_FILE_EXCL) &&
                    attr->ia_size > inode->i_size) {
                        i_size_write(inode, attr->ia_size);
-                       inode->i_blocks =
-                               (attr->ia_size + (1 << 9) - 1) >> 9;
+                       inode->i_blocks = calc_inode_blocks(attr->ia_size);
                        inode->i_ctime = attr->ia_ctime;
                        ci->i_reported_size = attr->ia_size;
                        dirtied |= CEPH_CAP_FILE_EXCL;
index f851d8d70158ea0aaf3dd87945cc7f256570d58d..be6b1657b1af2aa98eb70d614c45b2e2c556b009 100644 (file)
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        if (copy_from_user(&dl, arg, sizeof(dl)))
                return -EFAULT;
 
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
                                          &dl.object_no, &dl.object_offset,
                                          &olen);
        if (r < 0) {
-               up_read(&osdc->map_sem);
+               up_read(&osdc->lock);
                return -EIO;
        }
        dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
                 ceph_ino(inode), dl.object_no);
 
        oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
-       ceph_oid_set_name(&oid, dl.object_name);
+       ceph_oid_printf(&oid, "%s", dl.object_name);
 
-       r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
+       r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
        if (r < 0) {
-               up_read(&osdc->map_sem);
+               up_read(&osdc->lock);
                return r;
        }
 
-       dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+       dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
        if (dl.osd >= 0) {
                struct ceph_entity_addr *a =
                        ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        } else {
                memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
        }
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
 
        /* send result back to user */
        if (copy_to_user(arg, &dl, sizeof(dl)))
index 85b8517f17a09bb051f36f102329258a822ec4f5..2103b823bec0786aa60e48fa3957f252d5010017 100644 (file)
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
 
        ceph_decode_need(p, end, sizeof(num) + 2, bad);
        num = ceph_decode_32(p);
-       info->dir_end = ceph_decode_8(p);
-       info->dir_complete = ceph_decode_8(p);
+       {
+               u16 flags = ceph_decode_16(p);
+               info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+               info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+               info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+       }
        if (num == 0)
                goto done;
 
-       BUG_ON(!info->dir_in);
-       info->dir_dname = (void *)(info->dir_in + num);
-       info->dir_dname_len = (void *)(info->dir_dname + num);
-       info->dir_dlease = (void *)(info->dir_dname_len + num);
-       if ((unsigned long)(info->dir_dlease + num) >
-           (unsigned long)info->dir_in + info->dir_buf_size) {
+       BUG_ON(!info->dir_entries);
+       if ((unsigned long)(info->dir_entries + num) >
+           (unsigned long)info->dir_entries + info->dir_buf_size) {
                pr_err("dir contents are larger than expected\n");
                WARN_ON(1);
                goto bad;
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
 
        info->dir_nr = num;
        while (num) {
+               struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
                /* dentry */
                ceph_decode_need(p, end, sizeof(u32)*2, bad);
-               info->dir_dname_len[i] = ceph_decode_32(p);
-               ceph_decode_need(p, end, info->dir_dname_len[i], bad);
-               info->dir_dname[i] = *p;
-               *p += info->dir_dname_len[i];
-               dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
-                    info->dir_dname[i]);
-               info->dir_dlease[i] = *p;
+               rde->name_len = ceph_decode_32(p);
+               ceph_decode_need(p, end, rde->name_len, bad);
+               rde->name = *p;
+               *p += rde->name_len;
+               dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+               rde->lease = *p;
                *p += sizeof(struct ceph_mds_reply_lease);
 
                /* inode */
-               err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+               err = parse_reply_info_in(p, end, &rde->inode, features);
                if (err < 0)
                        goto out_bad;
+               /* ceph_readdir_prepopulate() will update it */
+               rde->offset = 0;
                i++;
                num--;
        }
@@ -345,9 +348,9 @@ out_bad:
 
 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
 {
-       if (!info->dir_in)
+       if (!info->dir_entries)
                return;
-       free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
+       free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
 }
 
 
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
        kfree(req);
 }
 
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
 /*
  * lookup session, bump ref if found.
  *
  * called under mdsc->mutex.
  */
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
-                                            u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
 {
        struct ceph_mds_request *req;
-       struct rb_node *n = mdsc->request_tree.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_mds_request, r_node);
-               if (tid < req->r_tid)
-                       n = n->rb_left;
-               else if (tid > req->r_tid)
-                       n = n->rb_right;
-               else {
-                       ceph_mdsc_get_request(req);
-                       return req;
-               }
-       }
-       return NULL;
-}
 
-static void __insert_request(struct ceph_mds_client *mdsc,
-                            struct ceph_mds_request *new)
-{
-       struct rb_node **p = &mdsc->request_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_mds_request *req = NULL;
+       req = lookup_request(&mdsc->request_tree, tid);
+       if (req)
+               ceph_mdsc_get_request(req);
 
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_mds_request, r_node);
-               if (new->r_tid < req->r_tid)
-                       p = &(*p)->rb_left;
-               else if (new->r_tid > req->r_tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->r_node, parent, p);
-       rb_insert_color(&new->r_node, &mdsc->request_tree);
+       return req;
 }
 
 /*
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
                                  req->r_num_caps);
        dout("__register_request %p tid %lld\n", req, req->r_tid);
        ceph_mdsc_get_request(req);
-       __insert_request(mdsc, req);
+       insert_request(&mdsc->request_tree, req);
 
        req->r_uid = current_fsuid();
        req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
                }
        }
 
-       rb_erase(&req->r_node, &mdsc->request_tree);
-       RB_CLEAR_NODE(&req->r_node);
+       erase_request(&mdsc->request_tree, req);
 
        if (req->r_unsafe_dir && req->r_got_unsafe) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
        int metadata_bytes = 0;
        int metadata_key_count = 0;
        struct ceph_options *opt = mdsc->fsc->client->options;
+       struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
        void *p;
 
        const char* metadata[][2] = {
                {"hostname", utsname()->nodename},
                {"kernel_version", utsname()->release},
-               {"entity_id", opt->name ? opt->name : ""},
+               {"entity_id", opt->name ? : ""},
+               {"root", fsopt->server_path ? : "/"},
                {NULL, NULL}
        };
 
@@ -1149,9 +1125,11 @@ out:
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                  void *arg)
 {
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
        struct ceph_inode_info *ci = ceph_inode(inode);
        LIST_HEAD(to_remove);
-       int drop = 0;
+       bool drop = false;
+       bool invalidate = false;
 
        dout("removing cap %p, ci is %p, inode is %p\n",
             cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        __ceph_remove_cap(cap, false);
        if (!ci->i_auth_cap) {
                struct ceph_cap_flush *cf;
-               struct ceph_mds_client *mdsc =
-                       ceph_sb_to_client(inode->i_sb)->mdsc;
+               struct ceph_mds_client *mdsc = fsc->mdsc;
+
+               ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
+
+               if (ci->i_wrbuffer_ref > 0 &&
+                   ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+                       invalidate = true;
 
                while (true) {
                        struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                inode, ceph_ino(inode));
                        ci->i_dirty_caps = 0;
                        list_del_init(&ci->i_dirty_item);
-                       drop = 1;
+                       drop = true;
                }
                if (!list_empty(&ci->i_flushing_item)) {
                        pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                        ci->i_flushing_caps = 0;
                        list_del_init(&ci->i_flushing_item);
                        mdsc->num_cap_flushing--;
-                       drop = 1;
+                       drop = true;
                }
                spin_unlock(&mdsc->cap_dirty_lock);
 
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                list_del(&cf->list);
                ceph_free_cap_flush(cf);
        }
-       while (drop--)
+
+       wake_up_all(&ci->i_cap_wq);
+       if (invalidate)
+               ceph_queue_invalidate(inode);
+       if (drop)
                iput(inode);
        return 0;
 }
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
  */
 static void remove_session_caps(struct ceph_mds_session *session)
 {
+       struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+       struct super_block *sb = fsc->sb;
        dout("remove_session_caps on %p\n", session);
-       iterate_session_caps(session, remove_session_caps_cb, NULL);
+       iterate_session_caps(session, remove_session_caps_cb, fsc);
 
        spin_lock(&session->s_cap_lock);
        if (session->s_nr_caps > 0) {
-               struct super_block *sb = session->s_mdsc->fsc->sb;
                struct inode *inode;
                struct ceph_cap *cap, *prev = NULL;
                struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
 
-       wake_up_all(&ci->i_cap_wq);
        if (arg) {
                spin_lock(&ci->i_ceph_lock);
                ci->i_wanted_max_size = 0;
                ci->i_requested_max_size = 0;
                spin_unlock(&ci->i_ceph_lock);
        }
+       wake_up_all(&ci->i_cap_wq);
        return 0;
 }
 
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
        struct ceph_inode_info *ci = ceph_inode(dir);
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
-       size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
-                     sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+       size_t size = sizeof(struct ceph_mds_reply_dir_entry);
        int order, num_entries;
 
        spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
        order = get_order(size * num_entries);
        while (order >= 0) {
-               rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
-                                                       __GFP_NOWARN,
-                                                       order);
-               if (rinfo->dir_in)
+               rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+                                                            __GFP_NOWARN,
+                                                            order);
+               if (rinfo->dir_entries)
                        break;
                order--;
        }
-       if (!rinfo->dir_in)
+       if (!rinfo->dir_entries)
                return -ENOMEM;
 
        num_entries = (PAGE_SIZE << order) / size;
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
        INIT_LIST_HEAD(&req->r_unsafe_target_item);
        req->r_fmode = -1;
        kref_init(&req->r_kref);
+       RB_CLEAR_NODE(&req->r_node);
        INIT_LIST_HEAD(&req->r_wait);
        init_completion(&req->r_completion);
        init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* get request, session */
        tid = le64_to_cpu(msg->hdr.tid);
        mutex_lock(&mdsc->mutex);
-       req = __lookup_request(mdsc, tid);
+       req = lookup_get_request(mdsc, tid);
        if (!req) {
                dout("handle_reply on unknown tid %llu\n", tid);
                mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
        fwd_seq = ceph_decode_32(&p);
 
        mutex_lock(&mdsc->mutex);
-       req = __lookup_request(mdsc, tid);
+       req = lookup_get_request(mdsc, tid);
        if (!req) {
                dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
                goto out;  /* dup reply? */
index ee69a537dba53bc7770ac92997263d1a1ec190ad..e7d38aac71093f41bc4c50bfc339cffa3c7069b7 100644 (file)
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
        u32 pool_ns_len;
 };
 
+struct ceph_mds_reply_dir_entry {
+       char                          *name;
+       u32                           name_len;
+       struct ceph_mds_reply_lease   *lease;
+       struct ceph_mds_reply_info_in inode;
+       loff_t                        offset;
+};
+
 /*
  * parsed info about an mds reply, including information about
  * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
                        struct ceph_mds_reply_dirfrag *dir_dir;
                        size_t                        dir_buf_size;
                        int                           dir_nr;
-                       char                          **dir_dname;
-                       u32                           *dir_dname_len;
-                       struct ceph_mds_reply_lease   **dir_dlease;
-                       struct ceph_mds_reply_info_in *dir_in;
-                       u8                            dir_complete, dir_end;
+                       bool                          dir_complete;
+                       bool                          dir_end;
+                       bool                          hash_order;
+                       struct ceph_mds_reply_dir_entry  *dir_entries;
                };
 
                /* for create results */
index 261531e55e9d0f9af3707f937d6efe7cfe612cf6..8c3591a7fbaeef67244f9ad678ebeaf005a64734 100644 (file)
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
        const void *start = *p;
        int i, j, n;
        int err = -EINVAL;
-       u16 version;
+       u8 mdsmap_v, mdsmap_cv;
 
        m = kzalloc(sizeof(*m), GFP_NOFS);
        if (m == NULL)
                return ERR_PTR(-ENOMEM);
 
-       ceph_decode_16_safe(p, end, version, bad);
-       if (version > 3) {
-               pr_warn("got mdsmap version %d > 3, failing", version);
-               goto bad;
+       ceph_decode_need(p, end, 1 + 1, bad);
+       mdsmap_v = ceph_decode_8(p);
+       mdsmap_cv = ceph_decode_8(p);
+       if (mdsmap_v >= 4) {
+              u32 mdsmap_len;
+              ceph_decode_32_safe(p, end, mdsmap_len, bad);
+              if (end < *p + mdsmap_len)
+                      goto bad;
+              end = *p + mdsmap_len;
        }
 
        ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                u32 namelen;
                s32 mds, inc, state;
                u64 state_seq;
-               u8 infoversion;
+               u8 info_v;
+               void *info_end = NULL;
                struct ceph_entity_addr addr;
                u32 num_export_targets;
                void *pexport_targets = NULL;
                struct ceph_timespec laggy_since;
                struct ceph_mds_info *info;
 
-               ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+               ceph_decode_need(p, end, sizeof(u64) + 1, bad);
                global_id = ceph_decode_64(p);
-               infoversion = ceph_decode_8(p);
+               info_v= ceph_decode_8(p);
+               if (info_v >= 4) {
+                       u32 info_len;
+                       u8 info_cv;
+                       ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+                       info_cv = ceph_decode_8(p);
+                       info_len = ceph_decode_32(p);
+                       info_end = *p + info_len;
+                       if (info_end > end)
+                               goto bad;
+               }
+
+               ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
                *p += sizeof(u64);
                namelen = ceph_decode_32(p);  /* skip mds name */
                *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                *p += sizeof(u32);
                ceph_decode_32_safe(p, end, namelen, bad);
                *p += namelen;
-               if (infoversion >= 2) {
+               if (info_v >= 2) {
                        ceph_decode_32_safe(p, end, num_export_targets, bad);
                        pexport_targets = *p;
                        *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                        num_export_targets = 0;
                }
 
+               if (info_end && *p != info_end) {
+                       if (*p > info_end)
+                               goto bad;
+                       *p = info_end;
+               }
+
                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
                     i+1, n, global_id, mds, inc,
                     ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
        m->m_cas_pg_pool = ceph_decode_64(p);
 
        /* ok, we don't care about the rest. */
+       *p = end;
        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
        return m;
 
index f12d5e2955c223c35145bdf550d5669b823a14bc..91e02481ce06843cb231580458ebd3dc4ad7d111 100644 (file)
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
  * mount options
  */
 enum {
+       Opt_mds_namespace,
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
@@ -143,6 +144,7 @@ enum {
 };
 
 static match_table_t fsopt_tokens = {
+       {Opt_mds_namespace, "mds_namespace=%d"},
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
                break;
 
                /* misc */
+       case Opt_mds_namespace:
+               fsopt->mds_namespace = intval;
+               break;
        case Opt_wsize:
                fsopt->wsize = intval;
                break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 {
        dout("destroy_mount_options %p\n", args);
        kfree(args->snapdir_name);
+       kfree(args->server_path);
        kfree(args);
 }
 
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
        if (ret)
                return ret;
 
+       ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+       if (ret)
+               return ret;
+
        return ceph_compare_options(new_opt, fsc->client);
 }
 
 static int parse_mount_options(struct ceph_mount_options **pfsopt,
                               struct ceph_options **popt,
                               int flags, char *options,
-                              const char *dev_name,
-                              const char **path)
+                              const char *dev_name)
 {
        struct ceph_mount_options *fsopt;
        const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
        fsopt->congestion_kb = default_congestion_kb();
+       fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
 
        /*
         * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
         */
        dev_name_end = strchr(dev_name, '/');
        if (dev_name_end) {
-               /* skip over leading '/' for path */
-               *path = dev_name_end + 1;
+               fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+               if (!fsopt->server_path) {
+                       err = -ENOMEM;
+                       goto out;
+               }
        } else {
-               /* path is empty */
                dev_name_end = dev_name + strlen(dev_name);
-               *path = dev_name_end;
        }
        err = -EINVAL;
        dev_name_end--;         /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
                goto out;
        }
        dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
-       dout("server path '%s'\n", *path);
+       if (fsopt->server_path)
+               dout("server path '%s'\n", fsopt->server_path);
 
        *popt = ceph_parse_options(options, dev_name, dev_name_end,
                                 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_puts(m, ",noacl");
 #endif
 
+       if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
+               seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
        if (fsopt->wsize)
                seq_printf(m, ",wsize=%d", fsopt->wsize);
        if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 {
        struct ceph_fs_client *fsc;
        const u64 supported_features =
-               CEPH_FEATURE_FLOCK |
-               CEPH_FEATURE_DIRLAYOUTHASH |
-               CEPH_FEATURE_MDS_INLINE_DATA;
+               CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
+               CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
        const u64 required_features = 0;
        int page_count;
        size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail;
        }
        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+       fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
        ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
 
        fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
 /*
  * mount: join the ceph cluster, and open root directory.
  */
-static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
-                     const char *path)
+static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 {
        int err;
        unsigned long started = jiffies;  /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
                        goto fail;
        }
 
-       if (path[0] == 0) {
+       if (!fsc->mount_options->server_path) {
                root = fsc->sb->s_root;
                dget(root);
        } else {
-               dout("mount opening base mountpoint\n");
+               const char *path = fsc->mount_options->server_path + 1;
+               dout("mount opening path %s\n", path);
                root = open_root_dentry(fsc, path, started);
                if (IS_ERR(root)) {
                        err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
        struct dentry *res;
        int err;
        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
-       const char *path = NULL;
        struct ceph_mount_options *fsopt = NULL;
        struct ceph_options *opt = NULL;
 
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        flags |= MS_POSIXACL;
 #endif
-       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
        if (err < 0) {
                res = ERR_PTR(err);
                goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
                }
        }
 
-       res = ceph_real_mount(fsc, path);
+       res = ceph_real_mount(fsc);
        if (IS_ERR(res))
                goto out_splat;
        dout("root %p inode %p ino %llx.%llx\n", res,
index 7b99eb756477ead228ae5e06611d6f3c9a9ce39b..0130a85921917faa6b5cf40c263468460921eab6 100644 (file)
@@ -62,6 +62,7 @@ struct ceph_mount_options {
        int cap_release_safety;
        int max_readdir;       /* max readdir result (entires) */
        int max_readdir_bytes; /* max readdir result (bytes) */
+       int mds_namespace;
 
        /*
         * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
         */
 
        char *snapdir_name;   /* default ".snap" */
+       char *server_path;    /* default  "/" */
 };
 
 struct ceph_fs_client {
@@ -295,6 +297,7 @@ struct ceph_inode_info {
        u64 i_files, i_subdirs;
 
        struct rb_root i_fragtree;
+       int i_fragtree_nsplits;
        struct mutex i_fragtree_mutex;
 
        struct ceph_inode_xattrs_info i_xattrs;
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_POOL_RD         (1 << 5)  /* can read from pool */
 #define CEPH_I_POOL_WR         (1 << 6)  /* can write to pool */
 #define CEPH_I_SEC_INITED      (1 << 7)  /* security initialized */
+#define CEPH_I_CAP_DROPPED     (1 << 8)  /* caps were forcibly dropped */
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
                                           long long release_count,
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
        return (struct ceph_dentry_info *)dentry->d_fsdata;
 }
 
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
-       return ((loff_t)frag << 32) | (loff_t)off;
-}
-
 /*
  * caps helpers
  */
@@ -632,7 +631,6 @@ struct ceph_file_info {
        struct ceph_mds_request *last_readdir;
 
        /* readdir: position within a frag */
-       unsigned offset;       /* offset of last chunk, adjusted for . and .. */
        unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
        char *last_name;       /* last entry in previous chunk */
        long long dir_release_count;
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 
+extern int ceph_renew_caps(struct inode *inode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                            struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
        ceph_snapdir_dentry_ops;
 
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
 extern int ceph_handle_snapdir(struct ceph_mds_request *req,
                               struct dentry *dentry, int err);
index 0d66722c6a52b4162253baed1db9dd9cb1bb23e3..dacc1bd8562974022befe05446db1b50afc0f6ad 100644 (file)
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
        char buf[128];
 
        dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
        if (pool_name) {
                size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
                                ret = -ERANGE;
                }
        }
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
        return ret;
 }
 
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
        s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
        const char *pool_name;
 
-       down_read(&osdc->map_sem);
+       down_read(&osdc->lock);
        pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
        if (pool_name)
                ret = snprintf(val, size, "%s", pool_name);
        else
                ret = snprintf(val, size, "%lld", (unsigned long long)pool);
-       up_read(&osdc->map_sem);
+       up_read(&osdc->lock);
        return ret;
 }
 
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_pagelist *pagelist = NULL;
+       int op = CEPH_MDS_OP_SETXATTR;
        int err;
 
        if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
                if (err)
                        goto out;
        } else if (!value) {
-               flags |= CEPH_XATTR_REMOVE;
+               if (flags & CEPH_XATTR_REPLACE)
+                       op = CEPH_MDS_OP_RMXATTR;
+               else
+                       flags |= CEPH_XATTR_REMOVE;
        }
 
        dout("setxattr value=%.*s\n", (int)size, value);
 
        /* do request */
-       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
-                                      USE_AUTH_MDS);
+       req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
        }
 
-       req->r_args.setxattr.flags = cpu_to_le32(flags);
        req->r_path2 = kstrdup(name, GFP_NOFS);
        if (!req->r_path2) {
                ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
                goto out;
        }
 
-       req->r_pagelist = pagelist;
-       pagelist = NULL;
+       if (op == CEPH_MDS_OP_SETXATTR) {
+               req->r_args.setxattr.flags = cpu_to_le32(flags);
+               req->r_pagelist = pagelist;
+               pagelist = NULL;
+       }
 
        req->r_inode = inode;
        ihold(inode);
index 8754e9aa14ad2ee9b37598ffd517235a9266b572..be6e48b0a46c269d55b4befcdedaf1750e1e233f 100644 (file)
@@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
        }
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
@@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name,
        dirent = buf->previous;
 
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user_unaligned(offset, &dirent->d_off))
                        goto efault;
        }
index a345c168acaae4956dad1cba8707061b5d22f534..761495bf5eb91d97c6483646d89a67f10933864f 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
-#define RADIX_DAX_MASK 0xf
-#define RADIX_DAX_SHIFT        4
-#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+/*
+ * We use lowest available bit in exceptional entry for locking, other two
+ * bits to determine entry type. In total 3 special bits.
+ */
+#define RADIX_DAX_SHIFT        (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
+#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
 #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
+               RADIX_TREE_EXCEPTIONAL_ENTRY))
+
+/* We choose 4096 wait queue heads - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; /* file-local */
+
+static int __init init_dax_wait_table(void)
+{
+       int i;
+
+       for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+               init_waitqueue_head(wait_table + i);
+       return 0;
+}
+fs_initcall(init_dax_wait_table);       /* initialise every head in the table */
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+                                             pgoff_t index)
+{
+       unsigned long hash = hash_long((unsigned long)mapping ^ index,
+                                      DAX_WAIT_TABLE_BITS);
+       return wait_table + hash; /* head may be shared by other entries */
+}
 
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
@@ -87,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
        return page;
 }
 
-/*
- * dax_clear_sectors() is called from within transaction context from XFS,
- * and hence this means the stack from this point must follow GFP_NOFS
- * semantics for all operations.
- */
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
-{
-       struct blk_dax_ctl dax = {
-               .sector = _sector,
-               .size = _size,
-       };
-
-       might_sleep();
-       do {
-               long count, sz;
-
-               count = dax_map_atomic(bdev, &dax);
-               if (count < 0)
-                       return count;
-               sz = min_t(long, count, SZ_128K);
-               clear_pmem(dax.addr, sz);
-               dax.size -= sz;
-               dax.sector += sz / 512;
-               dax_unmap_atomic(bdev, &dax);
-               cond_resched();
-       } while (dax.size);
-
-       wmb_pmem();
-       return 0;
-}
-EXPORT_SYMBOL_GPL(dax_clear_sectors);
-
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
-               loff_t pos, loff_t end)
-{
-       loff_t final = end - pos + first; /* The final byte of the buffer */
-
-       if (first > 0)
-               clear_pmem(addr, first);
-       if (final < size)
-               clear_pmem(addr + final, size - final);
-}
-
 static bool buffer_written(struct buffer_head *bh)
 {
        return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
        struct blk_dax_ctl dax = {
                .addr = (void __pmem *) ERR_PTR(-EIO),
        };
+       unsigned blkbits = inode->i_blkbits;
+       sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+                                                               >> blkbits;
 
        if (rw == READ)
                end = min(end, i_size_read(inode));
@@ -176,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
        while (pos < end) {
                size_t len;
                if (pos == max) {
-                       unsigned blkbits = inode->i_blkbits;
                        long page = pos >> PAGE_SHIFT;
                        sector_t block = page << (PAGE_SHIFT - blkbits);
                        unsigned first = pos - (block << blkbits);
@@ -192,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                        bh->b_size = 1 << blkbits;
                                bh_max = pos - first + bh->b_size;
                                bdev = bh->b_bdev;
+                               /*
+                                * We allow uninitialized buffers for writes
+                                * beyond EOF as those cannot race with faults
+                                */
+                               WARN_ON_ONCE(
+                                       (buffer_new(bh) && block < file_blks) ||
+                                       (rw == WRITE && buffer_unwritten(bh)));
                        } else {
                                unsigned done = bh->b_size -
                                                (bh_max - (pos - first));
@@ -211,11 +205,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                        rc = map_len;
                                        break;
                                }
-                               if (buffer_unwritten(bh) || buffer_new(bh)) {
-                                       dax_new_buf(dax.addr, map_len, first,
-                                                       pos, end);
-                                       need_wmb = true;
-                               }
                                dax.addr += first;
                                size = map_len - first;
                        }
@@ -276,15 +265,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
        memset(&bh, 0, sizeof(bh));
        bh.b_bdev = inode->i_sb->s_bdev;
 
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
-               struct address_space *mapping = inode->i_mapping;
+       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
                inode_lock(inode);
-               retval = filemap_write_and_wait_range(mapping, pos, end - 1);
-               if (retval) {
-                       inode_unlock(inode);
-                       goto out;
-               }
-       }
 
        /* Protects against truncate */
        if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -305,11 +287,267 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 
        if (!(flags & DIO_SKIP_DIO_COUNT))
                inode_dio_end(inode);
- out:
        return retval;
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
+/*
+ * DAX radix tree locking: key and waiter types for hashed entry wait queues
+ */
+struct exceptional_entry_key {
+       struct address_space *mapping;  /* mapping the entry belongs to */
+       unsigned long index;            /* radix tree index of the entry */
+};
+
+struct wait_exceptional_entry_queue {
+       wait_queue_t wait;                /* queued on a wait_table head */
+       struct exceptional_entry_key key; /* entry this waiter sleeps on */
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+                                      int sync, void *keyp)
+{
+       struct exceptional_entry_key *key = keyp; /* entry being woken */
+       struct wait_exceptional_entry_queue *ewait =
+               container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+       if (key->mapping != ewait->key.mapping ||
+           key->index != ewait->key.index)
+               return 0; /* waiter sleeps on another entry: skip */
+       return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Return non-zero if the entry in @slot has RADIX_DAX_ENTRY_LOCK set, 0
+ * otherwise. The function must be called with mapping->tree_lock held.
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+       return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot as locked and return the updated entry. The function
+ * must be called with mapping->tree_lock held.
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+       entry |= RADIX_DAX_ENTRY_LOCK;
+       radix_tree_replace_slot(slot, (void *)entry);
+       return (void *)entry;
+}
+
+/*
+ * Mark the given slot as unlocked and return the updated entry. The function
+ * must be called with mapping->tree_lock held.
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+       unsigned long entry = (unsigned long)
+               radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+       entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+       radix_tree_replace_slot(slot, (void *)entry);
+       return (void *)entry;
+}
+
+/*
+ * Look up the entry at @index in the radix tree. If it is a locked
+ * exceptional entry, sleep until woken and look it up again. Returns the
+ * entry (possibly NULL) and, if @slotp is non-NULL, stores its slot there.
+ * The caller must later call put_unlocked_mapping_entry() if it did not lock
+ * the entry, or put_locked_mapping_entry() once it wants to unlock it.
+ *
+ * Must be called with mapping->tree_lock held; it is dropped while sleeping.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+                                       pgoff_t index, void ***slotp)
+{
+       void *ret, **slot;
+       struct wait_exceptional_entry_queue ewait;
+       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+       init_wait(&ewait.wait);
+       ewait.wait.func = wake_exceptional_entry_func;
+       ewait.key.mapping = mapping;
+       ewait.key.index = index;
+
+       for (;;) {
+               ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+                                         &slot);
+               if (!ret || !radix_tree_exceptional_entry(ret) ||
+                   !slot_locked(mapping, slot)) {
+                       if (slotp)
+                               *slotp = slot;
+                       return ret;
+               }
+               prepare_to_wait_exclusive(wq, &ewait.wait,
+                                         TASK_UNINTERRUPTIBLE);
+               spin_unlock_irq(&mapping->tree_lock);
+               schedule();
+               finish_wait(wq, &ewait.wait);
+               spin_lock_irq(&mapping->tree_lock);
+       }
+}
+
+/*
+ * Find radix tree entry at given index. If it points to a page, return with
+ * the page locked and a reference held. If it points to an exceptional entry,
+ * return with the radix tree entry locked. If there is no entry, insert an
+ * empty locked exceptional entry and return it. Returns ERR_PTR() if the
+ * radix tree preload or insert fails (other than -EEXIST, which is retried).
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful; add that later if it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *ret, **slot;
+
+restart:
+       spin_lock_irq(&mapping->tree_lock);
+       ret = get_unlocked_mapping_entry(mapping, index, &slot);
+       /* No entry for given index? Make sure radix tree is big enough. */
+       if (!ret) {
+               int err;
+
+               spin_unlock_irq(&mapping->tree_lock);
+               err = radix_tree_preload(
+                               mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+               if (err)
+                       return ERR_PTR(err);
+               ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+                              RADIX_DAX_ENTRY_LOCK);
+               spin_lock_irq(&mapping->tree_lock);
+               err = radix_tree_insert(&mapping->page_tree, index, ret);
+               radix_tree_preload_end();
+               if (err) {
+                       spin_unlock_irq(&mapping->tree_lock);
+                       /* Someone already created the entry? */
+                       if (err == -EEXIST)
+                               goto restart;
+                       return ERR_PTR(err);
+               }
+               /* Good, we have inserted empty locked entry into the tree. */
+               mapping->nrexceptional++;
+               spin_unlock_irq(&mapping->tree_lock);
+               return ret;
+       }
+       /* Normal page in radix tree? */
+       if (!radix_tree_exceptional_entry(ret)) {
+               struct page *page = ret;
+
+               get_page(page);
+               spin_unlock_irq(&mapping->tree_lock);
+               lock_page(page);
+               /* Page got truncated? Retry... */
+               if (unlikely(page->mapping != mapping)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto restart;
+               }
+               return page;
+       }
+       ret = lock_slot(mapping, slot);
+       spin_unlock_irq(&mapping->tree_lock);
+       return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+                                  pgoff_t index, bool wake_all)
+{
+       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+       /*
+        * Checking for locked entry and prepare_to_wait_exclusive() happens
+        * under mapping->tree_lock, ditto for entry handling in our callers.
+        * So at this point all tasks that could have seen our entry locked
+        * must be in the waitqueue and the following check will see them.
+        */
+       if (waitqueue_active(wq)) {
+               struct exceptional_entry_key key;
+
+               key.mapping = mapping;
+               key.index = index; /* key filters waiters of other entries */
+               __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+       }
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *ret, **slot;
+
+       spin_lock_irq(&mapping->tree_lock);
+       ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+       if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+                        !slot_locked(mapping, slot))) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return; /* no locked exceptional entry at @index */
+       }
+       unlock_slot(mapping, slot); /* clear the lock bit under tree_lock */
+       spin_unlock_irq(&mapping->tree_lock);
+       dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+                                    pgoff_t index, void *entry)
+{
+       if (!radix_tree_exceptional_entry(entry)) { /* entry is a page */
+               unlock_page(entry);
+               put_page(entry);
+       } else {
+               dax_unlock_mapping_entry(mapping, index);
+       }
+}
+
+/*
+ * Called when done with an entry from get_unlocked_mapping_entry() that we
+ * did not lock in the end. Does nothing unless @entry is exceptional.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+                                      pgoff_t index, void *entry)
+{
+       if (!radix_tree_exceptional_entry(entry))
+               return;
+
+       /* We have to wake up next waiter for the radix tree entry lock */
+       dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping, waiting for it to get
+ * unlocked first. Returns 1 if deleted, 0 (with a warning) if none was found.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       void *entry;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+       /*
+        * This gets called from truncate / punch_hole path. As such, the caller
+        * must hold locks protecting against concurrent modifications of the
+        * radix tree (usually fs-private i_mmap_sem for writing). Since the
+        * caller has seen exceptional entry for this index, we better find it
+        * at that index as well...
+        */
+       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+               spin_unlock_irq(&mapping->tree_lock);
+               return 0;
+       }
+       radix_tree_delete(&mapping->page_tree, index);
+       mapping->nrexceptional--;
+       spin_unlock_irq(&mapping->tree_lock);
+       dax_wake_mapping_entry_waiter(mapping, index, true);
+
+       return 1;
+}
+
 /*
  * The user has performed a load from a hole in the file.  Allocating
  * a new page in the file would cause excessive storage usage for
@@ -318,24 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-                                                       struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+                        struct vm_fault *vmf)
 {
-       unsigned long size;
-       struct inode *inode = mapping->host;
-       if (!page)
-               page = find_or_create_page(mapping, vmf->pgoff,
-                                               GFP_KERNEL | __GFP_ZERO);
-       if (!page)
-               return VM_FAULT_OOM;
-       /* Recheck i_size under page lock to avoid truncate race */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size) {
-               unlock_page(page);
-               put_page(page);
-               return VM_FAULT_SIGBUS;
+       struct page *page;
+
+       /* Hole page already exists? Return it...  */
+       if (!radix_tree_exceptional_entry(entry)) {
+               vmf->page = entry;
+               return VM_FAULT_LOCKED;
        }
 
+       /* This will replace locked radix tree entry with a hole page */
+       page = find_or_create_page(mapping, vmf->pgoff,
+                                  vmf->gfp_mask | __GFP_ZERO);
+       if (!page) {
+               put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+               return VM_FAULT_OOM;
+       }
        vmf->page = page;
        return VM_FAULT_LOCKED;
 }
@@ -359,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
        return 0;
 }
 
-#define NO_SECTOR -1
 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-               sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+                                     struct vm_fault *vmf,
+                                     void *entry, sector_t sector)
 {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       pgoff_t pmd_index = DAX_PMD_INDEX(index);
-       int type, error = 0;
-       void *entry;
+       int error = 0;
+       bool hole_fill = false;
+       void *new_entry;
+       pgoff_t index = vmf->pgoff;
 
-       WARN_ON_ONCE(pmd_entry && !dirty);
-       if (dirty)
+       if (vmf->flags & FAULT_FLAG_WRITE)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-       spin_lock_irq(&mapping->tree_lock);
-
-       entry = radix_tree_lookup(page_tree, pmd_index);
-       if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-               index = pmd_index;
-               goto dirty;
+       /* Replacing hole page with block mapping? */
+       if (!radix_tree_exceptional_entry(entry)) {
+               hole_fill = true;
+               /*
+                * Unmap the page now before we remove it from page cache below.
+                * The page is locked so it cannot be faulted in again.
+                */
+               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                   PAGE_SIZE, 0);
+               error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+               if (error)
+                       return ERR_PTR(error);
        }
 
-       entry = radix_tree_lookup(page_tree, index);
-       if (entry) {
-               type = RADIX_DAX_TYPE(entry);
-               if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-                                       type != RADIX_DAX_PMD)) {
-                       error = -EIO;
+       spin_lock_irq(&mapping->tree_lock);
+       new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+                      RADIX_DAX_ENTRY_LOCK);
+       if (hole_fill) {
+               __delete_from_page_cache(entry, NULL);
+               /* Drop pagecache reference */
+               put_page(entry);
+               error = radix_tree_insert(page_tree, index, new_entry);
+               if (error) {
+                       new_entry = ERR_PTR(error);
                        goto unlock;
                }
+               mapping->nrexceptional++;
+       } else {
+               void **slot;
+               void *ret;
 
-               if (!pmd_entry || type == RADIX_DAX_PMD)
-                       goto dirty;
-
-               /*
-                * We only insert dirty PMD entries into the radix tree.  This
-                * means we don't need to worry about removing a dirty PTE
-                * entry and inserting a clean PMD entry, thus reducing the
-                * range we would flush with a follow-up fsync/msync call.
-                */
-               radix_tree_delete(&mapping->page_tree, index);
-               mapping->nrexceptional--;
-       }
-
-       if (sector == NO_SECTOR) {
-               /*
-                * This can happen during correct operation if our pfn_mkwrite
-                * fault raced against a hole punch operation.  If this
-                * happens the pte that was hole punched will have been
-                * unmapped and the radix tree entry will have been removed by
-                * the time we are called, but the call will still happen.  We
-                * will return all the way up to wp_pfn_shared(), where the
-                * pte_same() check will fail, eventually causing page fault
-                * to be retried by the CPU.
-                */
-               goto unlock;
+               ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+               WARN_ON_ONCE(ret != entry);
+               radix_tree_replace_slot(slot, new_entry);
        }
-
-       error = radix_tree_insert(page_tree, index,
-                       RADIX_DAX_ENTRY(sector, pmd_entry));
-       if (error)
-               goto unlock;
-
-       mapping->nrexceptional++;
- dirty:
-       if (dirty)
+       if (vmf->flags & FAULT_FLAG_WRITE)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  unlock:
        spin_unlock_irq(&mapping->tree_lock);
-       return error;
+       if (hole_fill) {
+               radix_tree_preload_end();
+               /*
+                * We don't need hole page anymore, it has been replaced with
+                * locked radix tree entry now.
+                */
+               if (mapping->a_ops->freepage)
+                       mapping->a_ops->freepage(entry);
+               unlock_page(entry);
+               put_page(entry);
+       }
+       return new_entry;
 }
 
 static int dax_writeback_one(struct block_device *bdev,
@@ -555,56 +788,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+                       struct buffer_head *bh, void **entryp,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       struct address_space *mapping = inode->i_mapping;
        struct block_device *bdev = bh->b_bdev;
        struct blk_dax_ctl dax = {
-               .sector = to_sector(bh, inode),
+               .sector = to_sector(bh, mapping->host),
                .size = bh->b_size,
        };
-       pgoff_t size;
-       int error;
-
-       i_mmap_lock_read(mapping);
-
-       /*
-        * Check truncate didn't happen while we were allocating a block.
-        * If it did, this block may or may not be still allocated to the
-        * file.  We can't tell the filesystem to free it because we can't
-        * take i_mutex here.  In the worst case, the file still has blocks
-        * allocated past the end of the file.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (unlikely(vmf->pgoff >= size)) {
-               error = -EIO;
-               goto out;
-       }
+       void *ret;
+       void *entry = *entryp;
 
-       if (dax_map_atomic(bdev, &dax) < 0) {
-               error = PTR_ERR(dax.addr);
-               goto out;
-       }
-
-       if (buffer_unwritten(bh) || buffer_new(bh)) {
-               clear_pmem(dax.addr, PAGE_SIZE);
-               wmb_pmem();
-       }
+       if (dax_map_atomic(bdev, &dax) < 0)
+               return PTR_ERR(dax.addr);
        dax_unmap_atomic(bdev, &dax);
 
-       error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-                       vmf->flags & FAULT_FLAG_WRITE);
-       if (error)
-               goto out;
-
-       error = vm_insert_mixed(vma, vaddr, dax.pfn);
+       ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+       *entryp = ret;
 
- out:
-       i_mmap_unlock_read(mapping);
-
-       return error;
+       return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
 /**
@@ -612,24 +818,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
  * @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- *     to written so the data written to them is exposed. This is required for
- *     required by write faults for filesystems that will return unwritten
- *     extent mappings from @get_block, but it is optional for reads as
- *     dax_insert_mapping() will always zero unwritten blocks. If the fs does
- *     not support unwritten extents, the it should pass NULL.
  *
  * When a page fault occurs, filesystems may call this helper in their
  * fault handler for DAX files. __dax_fault() assumes the caller has done all
  * the necessary locking for the page fault to proceed successfully.
  */
 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block, dax_iodone_t complete_unwritten)
+                       get_block_t get_block)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
-       struct page *page;
+       void *entry;
        struct buffer_head bh;
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        unsigned blkbits = inode->i_blkbits;
@@ -638,6 +838,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        int error;
        int major = 0;
 
+       /*
+        * Check whether offset isn't beyond end of file now. Caller is supposed
+        * to hold locks serializing us with truncate / punch hole so this is
+        * a reliable test.
+        */
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;
@@ -647,49 +852,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
 
- repeat:
-       page = find_get_page(mapping, vmf->pgoff);
-       if (page) {
-               if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-                       put_page(page);
-                       return VM_FAULT_RETRY;
-               }
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto repeat;
-               }
-               size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               if (unlikely(vmf->pgoff >= size)) {
-                       /*
-                        * We have a struct page covering a hole in the file
-                        * from a read fault and we've raced with a truncate
-                        */
-                       error = -EIO;
-                       goto unlock_page;
-               }
+       entry = grab_mapping_entry(mapping, vmf->pgoff);
+       if (IS_ERR(entry)) {
+               error = PTR_ERR(entry);
+               goto out;
        }
 
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
-               goto unlock_page;
-
-       if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
-               if (vmf->flags & FAULT_FLAG_WRITE) {
-                       error = get_block(inode, block, &bh, 1);
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
-                       if (!error && (bh.b_size < PAGE_SIZE))
-                               error = -EIO;
-                       if (error)
-                               goto unlock_page;
-               } else {
-                       return dax_load_hole(mapping, page, vmf);
-               }
-       }
+               goto unlock_entry;
 
        if (vmf->cow_page) {
                struct page *new_page = vmf->cow_page;
@@ -698,53 +871,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
-                       goto unlock_page;
-               vmf->page = page;
-               if (!page) {
-                       i_mmap_lock_read(mapping);
-                       /* Check we didn't race with truncate */
-                       size = (i_size_read(inode) + PAGE_SIZE - 1) >>
-                                                               PAGE_SHIFT;
-                       if (vmf->pgoff >= size) {
-                               i_mmap_unlock_read(mapping);
-                               error = -EIO;
-                               goto out;
-                       }
+                       goto unlock_entry;
+               if (!radix_tree_exceptional_entry(entry)) {
+                       vmf->page = entry;
+                       return VM_FAULT_LOCKED;
                }
-               return VM_FAULT_LOCKED;
+               vmf->entry = entry;
+               return VM_FAULT_DAX_LOCKED;
        }
 
-       /* Check we didn't race with a read fault installing a new page */
-       if (!page && major)
-               page = find_lock_page(mapping, vmf->pgoff);
-
-       if (page) {
-               unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-                                                       PAGE_SIZE, 0);
-               delete_from_page_cache(page);
-               unlock_page(page);
-               put_page(page);
-               page = NULL;
-       }
-
-       /*
-        * If we successfully insert the new mapping over an unwritten extent,
-        * we need to ensure we convert the unwritten extent. If there is an
-        * error inserting the mapping, the filesystem needs to leave it as
-        * unwritten to prevent exposure of the stale underlying data to
-        * userspace, but we still need to call the completion function so
-        * the private resources on the mapping buffer can be released. We
-        * indicate what the callback should do via the uptodate variable, same
-        * as for normal BH based IO completions.
-        */
-       error = dax_insert_mapping(inode, &bh, vma, vmf);
-       if (buffer_unwritten(&bh)) {
-               if (complete_unwritten)
-                       complete_unwritten(&bh, !error);
-               else
-                       WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+       if (!buffer_mapped(&bh)) {
+               if (vmf->flags & FAULT_FLAG_WRITE) {
+                       error = get_block(inode, block, &bh, 1);
+                       count_vm_event(PGMAJFAULT);
+                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                       major = VM_FAULT_MAJOR;
+                       if (!error && (bh.b_size < PAGE_SIZE))
+                               error = -EIO;
+                       if (error)
+                               goto unlock_entry;
+               } else {
+                       return dax_load_hole(mapping, entry, vmf);
+               }
        }
 
+       /* Filesystem should not return unwritten buffers to us! */
+       WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
+       error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
@@ -752,13 +907,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        if ((error < 0) && (error != -EBUSY))
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
-
- unlock_page:
-       if (page) {
-               unlock_page(page);
-               put_page(page);
-       }
-       goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
 
@@ -772,7 +920,7 @@ EXPORT_SYMBOL(__dax_fault);
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-             get_block_t get_block, dax_iodone_t complete_unwritten)
+             get_block_t get_block)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +929,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+       result = __dax_fault(vma, vmf, get_block);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
@@ -789,7 +937,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 /*
  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
  * more often than one might expect in the below function.
@@ -815,8 +963,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 #define dax_pmd_dbg(bh, address, reason)       __dax_dbg(bh, address, reason, "dax_pmd")
 
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd, unsigned int flags, get_block_t get_block,
-               dax_iodone_t complete_unwritten)
+               pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
@@ -828,7 +975,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        struct block_device *bdev;
        pgoff_t size, pgoff;
        sector_t block;
-       int error, result = 0;
+       int result = 0;
        bool alloc = false;
 
        /* dax pmd mappings require pfn_t_devmap() */
@@ -875,6 +1022,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                if (get_block(inode, block, &bh, 1) != 0)
                        return VM_FAULT_SIGBUS;
                alloc = true;
+               WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
        }
 
        bdev = bh.b_bdev;
@@ -900,26 +1048,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                truncate_pagecache_range(inode, lstart, lend);
        }
 
-       i_mmap_lock_read(mapping);
-
-       /*
-        * If a truncate happened while we were allocating blocks, we may
-        * leave blocks allocated to the file that are beyond EOF.  We can't
-        * take i_mutex here, so just leave them hanging; they'll be freed
-        * when the file is deleted.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (pgoff >= size) {
-               result = VM_FAULT_SIGBUS;
-               goto out;
-       }
-       if ((pgoff | PG_PMD_COLOUR) >= size) {
-               dax_pmd_dbg(&bh, address,
-                               "offset + huge page size > file size");
-               goto fallback;
-       }
-
-       if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+       if (!write && !buffer_mapped(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
                struct page *zero_page = get_huge_zero_page();
@@ -954,8 +1083,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                long length = dax_map_atomic(bdev, &dax);
 
                if (length < 0) {
-                       result = VM_FAULT_SIGBUS;
-                       goto out;
+                       dax_pmd_dbg(&bh, address, "dax-error fallback");
+                       goto fallback;
                }
                if (length < PMD_SIZE) {
                        dax_pmd_dbg(&bh, address, "dax-length too small");
@@ -973,14 +1102,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                        dax_pmd_dbg(&bh, address, "pfn not in memmap");
                        goto fallback;
                }
-
-               if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-                       clear_pmem(dax.addr, PMD_SIZE);
-                       wmb_pmem();
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       result |= VM_FAULT_MAJOR;
-               }
                dax_unmap_atomic(bdev, &dax);
 
                /*
@@ -999,13 +1120,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                 * the write to insert a dirty entry.
                 */
                if (write) {
-                       error = dax_radix_entry(mapping, pgoff, dax.sector,
-                                       true, true);
-                       if (error) {
-                               dax_pmd_dbg(&bh, address,
-                                               "PMD radix insertion failed");
-                               goto fallback;
-                       }
+                       /*
+                        * We should insert radix-tree entry and dirty it here.
+                        * For now this is broken...
+                        */
                }
 
                dev_dbg(part_to_dev(bdev->bd_part),
@@ -1018,11 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        }
 
  out:
-       i_mmap_unlock_read(mapping);
-
-       if (buffer_unwritten(&bh))
-               complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
        return result;
 
  fallback:
@@ -1042,8 +1155,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
  * pmd_fault handler for DAX files.
  */
 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-                       pmd_t *pmd, unsigned int flags, get_block_t get_block,
-                       dax_iodone_t complete_unwritten)
+                       pmd_t *pmd, unsigned int flags, get_block_t get_block)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +1164,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
-                               complete_unwritten);
+       result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
        if (flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
@@ -1070,27 +1181,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
-       int error;
-
-       /*
-        * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-        * RADIX_DAX_PTE entry already exists in the radix tree from a
-        * previous call to __dax_fault().  We just want to look up that PTE
-        * entry using vmf->pgoff and make sure the dirty tag is set.  This
-        * saves us from having to make a call to get_block() here to look
-        * up the sector.
-        */
-       error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
-                       true);
+       struct address_space *mapping = file->f_mapping;
+       void *entry;
+       pgoff_t index = vmf->pgoff;
 
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM;
-       if (error)
-               return VM_FAULT_SIGBUS;
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+       if (!entry || !radix_tree_exceptional_entry(entry))
+               goto out;
+       radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+       put_unlocked_mapping_entry(mapping, index, entry);
+out:
+       spin_unlock_irq(&mapping->tree_lock);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 
+static bool dax_range_is_aligned(struct block_device *bdev,
+                                unsigned int offset, unsigned int length)
+{
+       unsigned short sector_size = bdev_logical_block_size(bdev);
+
+       if (!IS_ALIGNED(offset, sector_size))
+               return false;
+       if (!IS_ALIGNED(length, sector_size))
+               return false;
+
+       return true;
+}
+
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+               unsigned int offset, unsigned int length)
+{
+       struct blk_dax_ctl dax = {
+               .sector         = sector,
+               .size           = PAGE_SIZE,
+       };
+
+       if (dax_range_is_aligned(bdev, offset, length)) {
+               sector_t start_sector = dax.sector + (offset >> 9);
+
+               return blkdev_issue_zeroout(bdev, start_sector,
+                               length >> 9, GFP_NOFS, true);
+       } else {
+               if (dax_map_atomic(bdev, &dax) < 0)
+                       return PTR_ERR(dax.addr);
+               clear_pmem(dax.addr + offset, length);
+               wmb_pmem();
+               dax_unmap_atomic(bdev, &dax);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+
 /**
  * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
@@ -1102,12 +1245,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
  * page in a DAX file.  This is intended for hole-punch operations.  If
  * you are truncating a file, the helper function dax_truncate_page() may be
  * more convenient.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks.  Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
                                                        get_block_t get_block)
@@ -1126,23 +1263,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
        err = get_block(inode, index, &bh, 0);
-       if (err < 0)
+       if (err < 0 || !buffer_written(&bh))
                return err;
-       if (buffer_written(&bh)) {
-               struct block_device *bdev = bh.b_bdev;
-               struct blk_dax_ctl dax = {
-                       .sector = to_sector(&bh, inode),
-                       .size = PAGE_SIZE,
-               };
 
-               if (dax_map_atomic(bdev, &dax) < 0)
-                       return PTR_ERR(dax.addr);
-               clear_pmem(dax.addr + offset, length);
-               wmb_pmem();
-               dax_unmap_atomic(bdev, &dax);
-       }
-
-       return 0;
+       return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
+                       offset, length);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
@@ -1154,12 +1279,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
  *
  * Similar to block_truncate_page(), this function can be called by a
  * filesystem when it is truncating a DAX file to handle the partial page.
- *
- * We work in terms of PAGE_SIZE here for commonality with
- * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
- * took care of disposing of the unnecessary blocks.  Even if the filesystem
- * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
- * since the file might be mmapped.
  */
 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 {
index c1400b109805b8778d8ee947ecad3060539f5ebe..868c02317b05c00746e62c1a33d0d45c860e8811 100644 (file)
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
        down_read(&ei->dax_sem);
 
-       ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+       ret = __dax_fault(vma, vmf, ext2_get_block);
 
        up_read(&ei->dax_sem);
        if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
        }
        down_read(&ei->dax_sem);
 
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
 
        up_read(&ei->dax_sem);
        if (flags & FAULT_FLAG_WRITE)
index b675610391b8b87d8c4699de9533cead8bffc477..fcbe58641e401bf79597a6d3ca0fdea0b5136d87 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/dax.h>
+#include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
@@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode,
                 * so that it's not found by another thread before it's
                 * initialised
                 */
-               err = dax_clear_sectors(inode->i_sb->s_bdev,
-                               le32_to_cpu(chain[depth-1].key) <<
-                               (inode->i_blkbits - 9),
-                               1 << inode->i_blkbits);
+               err = sb_issue_zeroout(inode->i_sb,
+                               le32_to_cpu(chain[depth-1].key), count,
+                               GFP_NOFS);
                if (err) {
                        mutex_unlock(&ei->truncate_mutex);
                        goto cleanup;
                }
-       }
+       } else
+               set_buffer_new(bh_result);
 
        ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
        mutex_unlock(&ei->truncate_mutex);
-       set_buffer_new(bh_result);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
index b78caf25f746220ce635a3bd946bedeaf7e3e750..1d9379568aa833b1fdd48a8ff112d343e3a7f773 100644 (file)
@@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
        if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-               if (blocksize != PAGE_SIZE) {
-                       ext2_msg(sb, KERN_ERR,
-                                       "error: unsupported blocksize for dax");
+               err = bdev_dax_supported(sb, blocksize);
+               if (err)
                        goto failed_mount;
-               }
-               if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                       ext2_msg(sb, KERN_ERR,
-                                       "error: device does not support dax");
-                       goto failed_mount;
-               }
        }
 
        /* If the blocksize doesn't match, re-read the thing.. */
index fe1f50fe764ff9238354e2e30491c6e1e6d149b9..3020fd70c392d1f2b55913e374ed11831b2aa967 100644 (file)
@@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
 
-       return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+       jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+       return 1;
 }
 
 /*
index 5d00bf0602545f9e60e7b27790df2bd868285dbc..68323e3da3fa889fdbfa6e3e0b6e8736da11ca36 100644 (file)
@@ -150,6 +150,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
        while (ctx->pos < inode->i_size) {
                struct ext4_map_blocks map;
 
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       goto errout;
+               }
+               cond_resched();
                map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                map.m_len = 1;
                err = ext4_map_blocks(NULL, inode, &map, 0);
index 72f4c9e00e975974c7a20a4e24ff9b9ea2f8e03d..b84aa1ca480a9564e51d61bc2a0d782c8de104ef 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -581,6 +582,9 @@ enum {
 #define EXT4_GET_BLOCKS_ZERO                   0x0200
 #define EXT4_GET_BLOCKS_CREATE_ZERO            (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
+       /* Caller will submit data before dropping transaction handle. This
+        * allows jbd2 to avoid submitting data before commit. */
+#define EXT4_GET_BLOCKS_IO_SUBMIT              0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -1505,6 +1509,9 @@ struct ext4_sb_info {
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
+
+       /* Barrier between changing inodes' journal flags and writepages ops. */
+       struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1549,7 +1556,6 @@ enum {
        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
-       EXT4_STATE_ORDERED_MODE,        /* data=ordered mode */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
 };
 
@@ -2521,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-                           struct buffer_head *bh_result, int create);
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
 int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2581,7 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -3329,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
        }
 }
 
+static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
+{
+       int blksize = 1 << inode->i_blkbits;
+
+       return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
+}
+
 #endif /* __KERNEL__ */
 
 #define EFSBADCRC      EBADMSG         /* Bad CRC detected */
index 5f58462110953dc61c9bd85101acd69c33a51331..09c1ef38cbe6aaff2c03185b31da20255df8990b 100644 (file)
@@ -359,10 +359,21 @@ static inline int ext4_journal_force_commit(journal_t *journal)
        return 0;
 }
 
-static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+static inline int ext4_jbd2_inode_add_write(handle_t *handle,
+                                           struct inode *inode)
 {
        if (ext4_handle_valid(handle))
-               return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+               return jbd2_journal_inode_add_write(handle,
+                                                   EXT4_I(inode)->jinode);
+       return 0;
+}
+
+static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
+                                          struct inode *inode)
+{
+       if (ext4_handle_valid(handle))
+               return jbd2_journal_inode_add_wait(handle,
+                                                  EXT4_I(inode)->jinode);
        return 0;
 }
 
index 95bf4679ac5485ef35240495806a034b1fdf86bf..2a2eef9c14e4b66212597b651bd87aa9f3ed6b7c 100644 (file)
@@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
 
        if (!ext4_handle_valid(handle))
                return 0;
-       if (handle->h_buffer_credits > needed)
+       if (handle->h_buffer_credits >= needed)
                return 0;
-       err = ext4_journal_extend(handle, needed);
+       /*
+        * If we need to extend the journal get a few extra blocks
+        * while we're at it for efficiency's sake.
+        */
+       needed += 3;
+       err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
        if (err <= 0)
                return err;
        err = ext4_truncate_restart_trans(handle, inode, needed);
@@ -907,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 
                eh = ext_block_hdr(bh);
                ppos++;
-               if (unlikely(ppos > depth)) {
-                       put_bh(bh);
-                       EXT4_ERROR_INODE(inode,
-                                        "ppos %d > depth %d", ppos, depth);
-                       ret = -EFSCORRUPTED;
-                       goto err;
-               }
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
        }
@@ -2583,7 +2581,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                }
        } else
                ext4_error(sbi->s_sb, "strange request: removal(2) "
-                          "%u-%u from %u:%u\n",
+                          "%u-%u from %u:%u",
                           from, to, le32_to_cpu(ex->ee_block), ee_len);
        return 0;
 }
@@ -3738,7 +3736,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
        if (ee_block != map->m_lblk || ee_len > map->m_len) {
 #ifdef EXT4_DEBUG
                ext4_warning("Inode (%ld) finished: extent logical block %llu,"
-                            " len %u; IO logical block %llu, len %u\n",
+                            " len %u; IO logical block %llu, len %u",
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
index e38b987ac7f5f709abb8ea1c8fb9f83ca32ce723..37e059202cd2fa333dff053b327d8faf85f9d4ac 100644 (file)
@@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
            (status & EXTENT_STATUS_WRITTEN)) {
                ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
                                " delayed and written which can potentially "
-                               " cause data loss.\n", lblk, len);
+                               " cause data loss.", lblk, len);
                WARN_ON(1);
        }
 
index 00ff6912adb305f1b939e695257abd10f74a3616..df44c877892a542f89c528a6c8fe0a93a79dceea 100644 (file)
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
        else
-               result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
+               result = __dax_fault(vma, vmf, ext4_dax_get_block);
 
        if (write) {
                if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                result = VM_FAULT_SIGBUS;
        else
                result = __dax_pmd_fault(vma, addr, pmd, flags,
-                               ext4_dax_mmap_get_block, NULL);
+                                        ext4_dax_get_block);
 
        if (write) {
                if (!IS_ERR(handle))
@@ -373,7 +373,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
        if (ext4_encrypted_inode(d_inode(dir)) &&
            !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) {
                ext4_warning(inode->i_sb,
-                            "Inconsistent encryption contexts: %lu/%lu\n",
+                            "Inconsistent encryption contexts: %lu/%lu",
                             (unsigned long) d_inode(dir)->i_ino,
                             (unsigned long) inode->i_ino);
                dput(dir);
index 237b877d316d1174687341abb34d49f05b56b127..3da4cf8d18b68ccae8b93984ee1d0d154903a863 100644 (file)
@@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        ext4_group_t block_group;
        int bit;
-       struct buffer_head *bitmap_bh;
+       struct buffer_head *bitmap_bh = NULL;
        struct inode *inode = NULL;
-       long err = -EIO;
+       int err = -EFSCORRUPTED;
 
-       /* Error cases - e2fsck has already cleaned up for us */
-       if (ino > max_ino) {
-               ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
-               err = -EFSCORRUPTED;
-               goto error;
-       }
+       if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+               goto bad_orphan;
 
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        if (IS_ERR(bitmap_bh)) {
-               err = PTR_ERR(bitmap_bh);
-               ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
-                            ino, err);
-               goto error;
+               ext4_error(sb, "inode bitmap error %ld for orphan %lu",
+                          PTR_ERR(bitmap_bh), ino);
+               return (struct inode *) bitmap_bh;
        }
 
        /* Having the inode bit set should be a 100% indicator that this
@@ -1179,15 +1174,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
                goto bad_orphan;
 
        inode = ext4_iget(sb, ino);
-       if (IS_ERR(inode))
-               goto iget_failed;
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
+                          ino, err);
+               return inode;
+       }
 
        /*
-        * If the orphans has i_nlinks > 0 then it should be able to be
-        * truncated, otherwise it won't be removed from the orphan list
-        * during processing and an infinite loop will result.
+        * If the orphans has i_nlinks > 0 then it should be able to
+        * be truncated, otherwise it won't be removed from the orphan
+        * list during processing and an infinite loop will result.
+        * Similarly, it must not be a bad inode.
         */
-       if (inode->i_nlink && !ext4_can_truncate(inode))
+       if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
+           is_bad_inode(inode))
                goto bad_orphan;
 
        if (NEXT_ORPHAN(inode) > max_ino)
@@ -1195,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
        brelse(bitmap_bh);
        return inode;
 
-iget_failed:
-       err = PTR_ERR(inode);
-       inode = NULL;
 bad_orphan:
-       ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
-       printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
-              bit, (unsigned long long)bitmap_bh->b_blocknr,
-              ext4_test_bit(bit, bitmap_bh->b_data));
-       printk(KERN_WARNING "inode=%p\n", inode);
+       ext4_error(sb, "bad orphan inode %lu", ino);
+       if (bitmap_bh)
+               printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+                      bit, (unsigned long long)bitmap_bh->b_blocknr,
+                      ext4_test_bit(bit, bitmap_bh->b_data));
        if (inode) {
-               printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
+               printk(KERN_ERR "is_bad_inode(inode)=%d\n",
                       is_bad_inode(inode));
-               printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
+               printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(inode));
-               printk(KERN_WARNING "max_ino=%lu\n", max_ino);
-               printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
+               printk(KERN_ERR "max_ino=%lu\n", max_ino);
+               printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode->i_nlink == 0)
                        inode->i_blocks = 0;
                iput(inode);
        }
        brelse(bitmap_bh);
-error:
        return ERR_PTR(err);
 }
 
index 627b7e8f9ef3714b4ed310c4fd98faae03869dee..bc15c2c17633079a54de855baf1272b0124f19eb 100644 (file)
@@ -648,133 +648,6 @@ out:
        return err;
 }
 
-/*
- * O_DIRECT for ext3 (or indirect map) based files
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       loff_t offset = iocb->ki_pos;
-       handle_t *handle;
-       ssize_t ret;
-       int orphan = 0;
-       size_t count = iov_iter_count(iter);
-       int retries = 0;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               loff_t final_size = offset + count;
-
-               if (final_size > inode->i_size) {
-                       /* Credits for sb + inode write */
-                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-                       if (IS_ERR(handle)) {
-                               ret = PTR_ERR(handle);
-                               goto out;
-                       }
-                       ret = ext4_orphan_add(handle, inode);
-                       if (ret) {
-                               ext4_journal_stop(handle);
-                               goto out;
-                       }
-                       orphan = 1;
-                       ei->i_disksize = inode->i_size;
-                       ext4_journal_stop(handle);
-               }
-       }
-
-retry:
-       if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
-               /*
-                * Nolock dioread optimization may be dynamically disabled
-                * via ext4_inode_block_unlocked_dio(). Check inode's state
-                * while holding extra i_dio_count ref.
-                */
-               inode_dio_begin(inode);
-               smp_mb();
-               if (unlikely(ext4_test_inode_state(inode,
-                                                   EXT4_STATE_DIOREAD_LOCK))) {
-                       inode_dio_end(inode);
-                       goto locked;
-               }
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter,
-                                       ext4_dio_get_block, NULL, 0);
-               else
-                       ret = __blockdev_direct_IO(iocb, inode,
-                                                  inode->i_sb->s_bdev, iter,
-                                                  ext4_dio_get_block,
-                                                  NULL, NULL, 0);
-               inode_dio_end(inode);
-       } else {
-locked:
-               if (IS_DAX(inode))
-                       ret = dax_do_io(iocb, inode, iter,
-                                       ext4_dio_get_block, NULL, DIO_LOCKING);
-               else
-                       ret = blockdev_direct_IO(iocb, inode, iter,
-                                                ext4_dio_get_block);
-
-               if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
-                       loff_t isize = i_size_read(inode);
-                       loff_t end = offset + count;
-
-                       if (end > isize)
-                               ext4_truncate_failed_write(inode);
-               }
-       }
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-
-       if (orphan) {
-               int err;
-
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /* This is really bad luck. We've written the data
-                        * but cannot extend i_size. Bail out and pretend
-                        * the write failed... */
-                       ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
-
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size) {
-                               ei->i_disksize = end;
-                               i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
-       }
-out:
-       return ret;
-}
-
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate a new block at @lblocks for non extent file based file
index 7bc6c855cc18ca992ac3830d3b3010e63597bc5d..ff7538c26992ea34f429de3d4d38a9a7f8f61262 100644 (file)
@@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
                        ext4_warning(dir->i_sb,
                                     "bad inline directory (dir #%lu) - "
                                     "inode %u, rec_len %u, name_len %d"
-                                    "inline size %d\n",
+                                    "inline size %d",
                                     dir->i_ino, le32_to_cpu(de->inode),
                                     le16_to_cpu(de->rec_len), de->name_len,
                                     inline_size);
index 79b298d397b43b1fadabe82fdeebe2cf3e0cb735..f7140ca66e3bf2751eb8103a37f249b49521b6ff 100644 (file)
@@ -684,6 +684,24 @@ out_sem:
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
+
+               /*
+                * Inodes with freshly allocated blocks where contents will be
+                * visible after transaction commit must be on transaction's
+                * ordered data list.
+                */
+               if (map->m_flags & EXT4_MAP_NEW &&
+                   !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+                   !(flags & EXT4_GET_BLOCKS_ZERO) &&
+                   !IS_NOQUOTA(inode) &&
+                   ext4_should_order_data(inode)) {
+                       if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+                               ret = ext4_jbd2_inode_add_wait(handle, inode);
+                       else
+                               ret = ext4_jbd2_inode_add_write(handle, inode);
+                       if (ret)
+                               return ret;
+               }
        }
        return retval;
 }
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
        int i_size_changed = 0;
 
        trace_ext4_write_end(inode, pos, len, copied);
-       if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
-               ret = ext4_jbd2_file_inode(handle, inode);
-               if (ret) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto errout;
-               }
-       }
-
        if (ext4_has_inline_data(inode)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
         * the data was copied into the page cache.
         */
        get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
+                          EXT4_GET_BLOCKS_METADATA_NOFAIL |
+                          EXT4_GET_BLOCKS_IO_SUBMIT;
        dioread_nolock = ext4_should_dioread_nolock(inode);
        if (dioread_nolock)
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
        struct blk_plug plug;
        bool give_up_on_write = false;
 
+       percpu_down_read(&sbi->s_journal_flag_rwsem);
        trace_ext4_writepages(inode, wbc);
 
-       if (dax_mapping(mapping))
-               return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-                                                  wbc);
+       if (dax_mapping(mapping)) {
+               ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+                                                 wbc);
+               goto out_writepages;
+       }
 
        /*
         * No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
 out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
+       percpu_up_read(&sbi->s_journal_flag_rwsem);
        return ret;
 }
 
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 #ifdef CONFIG_FS_DAX
-int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
-                           struct buffer_head *bh_result, int create)
+/*
+ * Get block function for DAX IO and mmap faults. It takes care of converting
+ * unwritten extents to written ones and initializes new / converted blocks
+ * to zeros.
+ */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
 {
-       int ret, err;
-       int credits;
-       struct ext4_map_blocks map;
-       handle_t *handle = NULL;
-       int flags = 0;
-
-       ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       map.m_lblk = iblock;
-       map.m_len = bh_result->b_size >> inode->i_blkbits;
-       credits = ext4_chunk_trans_blocks(inode, map.m_len);
-       if (create) {
-               flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       return ret;
-               }
-       }
+       int ret;
 
-       ret = ext4_map_blocks(handle, inode, &map, flags);
-       if (create) {
-               err = ext4_journal_stop(handle);
-               if (ret >= 0 && err < 0)
-                       ret = err;
-       }
-       if (ret <= 0)
-               goto out;
-       if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-               int err2;
+       ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
+       if (!create)
+               return _ext4_get_block(inode, iblock, bh_result, 0);
 
-               /*
-                * We are protected by i_mmap_sem so we know block cannot go
-                * away from under us even though we dropped i_data_sem.
-                * Convert extent to written and write zeros there.
-                *
-                * Note: We may get here even when create == 0.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
+       ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                  EXT4_GET_BLOCKS_PRE_IO |
+                                  EXT4_GET_BLOCKS_CREATE_ZERO);
+       if (ret < 0)
+               return ret;
 
-               err = ext4_map_blocks(handle, inode, &map,
-                     EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (err < 0)
-                       ret = err;
-               err2 = ext4_journal_stop(handle);
-               if (err2 < 0 && ret > 0)
-                       ret = err2;
-       }
-out:
-       WARN_ON_ONCE(ret == 0 && create);
-       if (ret > 0) {
-               map_bh(bh_result, inode->i_sb, map.m_pblk);
+       if (buffer_unwritten(bh_result)) {
                /*
-                * At least for now we have to clear BH_New so that DAX code
-                * doesn't attempt to zero blocks again in a racy way.
+                * We are protected by i_mmap_sem or i_mutex so we know block
+                * cannot go away from under us even though we dropped
+                * i_data_sem. Convert extent to written and write zeros there.
                 */
-               map.m_flags &= ~EXT4_MAP_NEW;
-               ext4_update_bh_state(bh_result, map.m_flags);
-               bh_result->b_size = map.m_len << inode->i_blkbits;
-               ret = 0;
+               ret = ext4_get_block_trans(inode, iblock, bh_result,
+                                          EXT4_GET_BLOCKS_CONVERT |
+                                          EXT4_GET_BLOCKS_CREATE_ZERO);
+               if (ret < 0)
+                       return ret;
        }
-       return ret;
+       /*
+        * At least for now we have to clear BH_New so that DAX code
+        * doesn't attempt to zero blocks again in a racy way.
+        */
+       clear_buffer_new(bh_result);
+       return 0;
+}
+#else
+/* Just define empty function, it will never get called. */
+int ext4_dax_get_block(struct inode *inode, sector_t iblock,
+                      struct buffer_head *bh_result, int create)
+{
+       BUG();
+       return 0;
 }
 #endif
 
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 }
 
 /*
- * For ext4 extent files, ext4 will do direct-io write to holes,
+ * Handling of direct IO writes.
+ *
+ * For ext4 extent files, ext4 will do direct-io write even to holes,
  * preallocated extents, and those write extend the file, no need to
  * fall back to buffered IO.
  *
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * if the machine crashes during the write.
  *
  */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        ssize_t ret;
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(iter);
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        get_block_t *get_block_func = NULL;
        int dio_flags = 0;
        loff_t final_size = offset + count;
+       int orphan = 0;
+       handle_t *handle;
 
-       /* Use the old path for reads and writes beyond i_size. */
-       if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-               return ext4_ind_direct_IO(iocb, iter);
+       if (final_size > inode->i_size) {
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
+               orphan = 1;
+               ei->i_disksize = inode->i_size;
+               ext4_journal_stop(handle);
+       }
 
        BUG_ON(iocb->private == NULL);
 
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
         * conversion. This also disallows race between truncate() and
         * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
         */
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_begin(inode);
+       inode_dio_begin(inode);
 
        /* If we do a overwrite dio, i_mutex locking can be released */
        overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                inode_unlock(inode);
 
        /*
-        * We could direct write to holes and fallocate.
+        * For extent mapped files we could direct write to holes and fallocate.
         *
         * Allocated blocks to fill the hole are marked as unwritten to prevent
         * parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        iocb->private = NULL;
        if (overwrite)
                get_block_func = ext4_dio_get_block_overwrite;
-       else if (is_sync_kiocb(iocb)) {
+       else if (IS_DAX(inode)) {
+               /*
+                * We can avoid zeroing for aligned DAX writes beyond EOF. Other
+                * writes need zeroing either because they can race with page
+                * faults or because they use partial blocks.
+                */
+               if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
+                   ext4_aligned_io(inode, offset, count))
+                       get_block_func = ext4_dio_get_block;
+               else
+                       get_block_func = ext4_dax_get_block;
+               dio_flags = DIO_LOCKING;
+       } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+                  round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
+               get_block_func = ext4_dio_get_block;
+               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
+       } else if (is_sync_kiocb(iocb)) {
                get_block_func = ext4_dio_get_block_unwritten_sync;
                dio_flags = DIO_LOCKING;
        } else {
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
        BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
-       if (IS_DAX(inode))
+       if (IS_DAX(inode)) {
                ret = dax_do_io(iocb, inode, iter, get_block_func,
                                ext4_end_io_dio, dio_flags);
-       else
+       } else
                ret = __blockdev_direct_IO(iocb, inode,
                                           inode->i_sb->s_bdev, iter,
                                           get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
        }
 
-       if (iov_iter_rw(iter) == WRITE)
-               inode_dio_end(inode);
+       inode_dio_end(inode);
        /* take i_mutex locking again if we do a ovewrite dio */
        if (overwrite)
                inode_lock(inode);
 
+       if (ret < 0 && final_size > inode->i_size)
+               ext4_truncate_failed_write(inode);
+
+       /* Handle extending of i_size after direct IO write */
+       if (orphan) {
+               int err;
+
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+
+                       goto out;
+               }
+               if (inode->i_nlink)
+                       ext4_orphan_del(handle, inode);
+               if (ret > 0) {
+                       loff_t end = offset + ret;
+                       if (end > inode->i_size) {
+                               ei->i_disksize = end;
+                               i_size_write(inode, end);
+                               /*
+                                * We're going to return a positive `ret'
+                                * here due to non-zero-length I/O, so there's
+                                * no way of reporting error returns from
+                                * ext4_mark_inode_dirty() to userspace.  So
+                                * ignore it.
+                                */
+                               ext4_mark_inode_dirty(handle, inode);
+                       }
+               }
+               err = ext4_journal_stop(handle);
+               if (ret == 0)
+                       ret = err;
+       }
+out:
+       return ret;
+}
+
+static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+       int unlocked = 0;
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       ssize_t ret;
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * Nolock dioread optimization may be dynamically disabled
+                * via ext4_inode_block_unlocked_dio(). Check inode's state
+                * while holding extra i_dio_count ref.
+                */
+               inode_dio_begin(inode);
+               smp_mb();
+               if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK)))
+                       inode_dio_end(inode);
+               else
+                       unlocked = 1;
+       }
+       if (IS_DAX(inode)) {
+               ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
+                               NULL, unlocked ? 0 : DIO_LOCKING);
+       } else {
+               ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+                                          iter, ext4_dio_get_block,
+                                          NULL, NULL,
+                                          unlocked ? 0 : DIO_LOCKING);
+       }
+       if (unlocked)
+               inode_dio_end(inode);
        return ret;
 }
 
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                return 0;
 
        trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               ret = ext4_ext_direct_IO(iocb, iter);
+       if (iov_iter_rw(iter) == READ)
+               ret = ext4_direct_IO_read(iocb, iter);
        else
-               ret = ext4_ind_direct_IO(iocb, iter);
+               ret = ext4_direct_IO_write(iocb, iter);
        trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
        return ret;
 }
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
 {
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_ORDERED_DATA_MODE:
-               ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
-               break;
        case EXT4_INODE_WRITEBACK_DATA_MODE:
-               ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
                break;
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
        } else {
                err = 0;
                mark_buffer_dirty(bh);
-               if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
-                       err = ext4_jbd2_file_inode(handle, inode);
+               if (ext4_should_order_data(inode))
+                       err = ext4_jbd2_inode_add_write(handle, inode);
        }
 
 unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        journal_t *journal;
        handle_t *handle;
        int err;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
        /*
         * We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;
-       /* We have to allocate physical blocks for delalloc blocks
-        * before flushing journal. otherwise delalloc blocks can not
-        * be allocated any more. even more truncate on delalloc blocks
-        * could trigger BUG by flushing delalloc blocks in journal.
-        * There is no delalloc block in non-journal data mode.
-        */
-       if (val && test_opt(inode->i_sb, DELALLOC)) {
-               err = ext4_alloc_da_blocks(inode);
-               if (err < 0)
-                       return err;
-       }
 
        /* Wait for all existing dio workers */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Before flushing the journal and switching inode's aops, we have
+        * to flush all dirty data the inode has. There can be outstanding
+        * delayed allocations, there can be unwritten extents created by
+        * fallocate or buffered writes in dioread_nolock mode covered by
+        * dirty data which can be converted only after flushing the dirty
+        * data (and journalled aops don't know how to handle these cases).
+        */
+       if (val) {
+               down_write(&EXT4_I(inode)->i_mmap_sem);
+               err = filemap_write_and_wait(inode->i_mapping);
+               if (err < 0) {
+                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       ext4_inode_resume_unlocked_dio(inode);
+                       return err;
+               }
+       }
+
+       percpu_down_write(&sbi->s_journal_flag_rwsem);
        jbd2_journal_lock_updates(journal);
 
        /*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                err = jbd2_journal_flush(journal);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
+                       percpu_up_write(&sbi->s_journal_flag_rwsem);
                        ext4_inode_resume_unlocked_dio(inode);
                        return err;
                }
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        ext4_set_aops(inode);
 
        jbd2_journal_unlock_updates(journal);
+       percpu_up_write(&sbi->s_journal_flag_rwsem);
+
+       if (val)
+               up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 
        /* Finally we can mark the inode as dirty. */
index 7497f50cb29336add4aa7ef17ddf5176193493d1..28cc412852afba41650ffeee2e713e2345b60511 100644 (file)
@@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
                struct dquot *transfer_to[MAXQUOTAS] = { };
 
                transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
-               if (transfer_to[PRJQUOTA]) {
+               if (!IS_ERR(transfer_to[PRJQUOTA])) {
                        err = __dquot_transfer(inode, transfer_to);
                        dqput(transfer_to[PRJQUOTA]);
                        if (err)
index eeeade76012ecf66f59340332d6838451b4e6eda..c1ab3ec30423f65878789c6602cb63dc82b6746f 100644 (file)
@@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 {
        int order = 1;
+       int bb_incr = 1 << (e4b->bd_blkbits - 1);
        void *bb;
 
        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
-               bb += 1 << (e4b->bd_blkbits - order);
+               bb += bb_incr;
+               bb_incr >>= 1;
                order++;
        }
        return 0;
@@ -2583,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned i, j;
-       unsigned offset;
+       unsigned offset, offset_incr;
        unsigned max;
        int ret;
 
@@ -2612,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb)
 
        i = 1;
        offset = 0;
+       offset_incr = 1 << (sb->s_blocksize_bits - 1);
        max = sb->s_blocksize << 2;
        do {
                sbi->s_mb_offsets[i] = offset;
                sbi->s_mb_maxs[i] = max;
-               offset += 1 << (sb->s_blocksize_bits - i);
+               offset += offset_incr;
+               offset_incr = offset_incr >> 1;
                max = max >> 1;
                i++;
        } while (i <= sb->s_blocksize_bits + 1);
@@ -4935,7 +4939,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
         * boundary.
         */
        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               ext4_warning(sb, "too much blocks added to group %u\n",
+               ext4_warning(sb, "too much blocks added to group %u",
                             block_group);
                err = -EINVAL;
                goto error_return;
index 24445275d330e07cb38623e0d548989a2a7621c4..23d436d6f8b8fe1c0e69cbe2a957dcd141448abb 100644 (file)
@@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
        __ext4_warning(sb, function, line, "%s", msg);
        __ext4_warning(sb, function, line,
                       "MMP failure info: last update time: %llu, last update "
-                      "node: %s, last update device: %s\n",
+                      "node: %s, last update device: %s",
                       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
                       mmp->mmp_nodename, mmp->mmp_bdevname);
 }
@@ -353,7 +353,7 @@ skip:
         * wait for MMP interval and check mmp_seq.
         */
        if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
-               ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+               ext4_warning(sb, "MMP startup interrupted, failing mount");
                goto failed;
        }
 
index 325cef48b39a8d23788dc17ef056ccf79a7b58f6..a920c5d29fac0a5b5ef83c9cca18feeb3dcc0fe7 100644 (file)
@@ -400,7 +400,7 @@ data_copy:
 
        /* Even in case of data=writeback it is reasonable to pin
         * inode to transaction, to prevent unexpected data loss */
-       *err = ext4_jbd2_file_inode(handle, orig_inode);
+       *err = ext4_jbd2_inode_add_write(handle, orig_inode);
 
 unlock_pages:
        unlock_page(pagep[0]);
index 5611ec9348d7368abe2574b28899311375202118..ec4c39952e847462c9a4a62f7d4bcc1afb74fd37 100644 (file)
@@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        }
 
        while (1) {
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       goto errout;
+               }
+               cond_resched();
                block = dx_get_block(frame->at);
                ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
                                             start_hash, start_minor_hash);
@@ -1613,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
                        if (nokey)
                                return ERR_PTR(-ENOKEY);
                        ext4_warning(inode->i_sb,
-                                    "Inconsistent encryption contexts: %lu/%lu\n",
+                                    "Inconsistent encryption contexts: %lu/%lu",
                                     (unsigned long) dir->i_ino,
                                     (unsigned long) inode->i_ino);
                        return ERR_PTR(-EPERM);
@@ -2828,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
                         * list entries can cause panics at unmount time.
                         */
                        mutex_lock(&sbi->s_orphan_lock);
-                       list_del(&EXT4_I(inode)->i_orphan);
+                       list_del_init(&EXT4_I(inode)->i_orphan);
                        mutex_unlock(&sbi->s_orphan_lock);
                }
        }
index e4fc8ea45d7888fe3677f052e1af8dff39d443ab..2a01df9cc1c3214ee0e106eee262e7a3d1cee284 100644 (file)
@@ -342,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
        if (bio) {
                int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
                            WRITE_SYNC : WRITE;
-               bio_get(io->io_bio);
                submit_bio(io_op, io->io_bio);
-               bio_put(io->io_bio);
        }
        io->io_bio = NULL;
 }
index 34038e3598d59fa2b4bcaf2304d31602e803d5e0..cf681004b1965fba00be6e97ee9a4cbe57eb3654 100644 (file)
@@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb)
         */
        if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                ext4_warning(sb, "There are errors in the filesystem, "
-                            "so online resizing is not allowed\n");
+                            "so online resizing is not allowed");
                return -EPERM;
        }
 
index 304c712dbe12e8ba7d5c5abfaca30f7ef3f5e5a6..3822a5aedc61b241d937c474192bfa57e5b3b2c5 100644 (file)
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+       percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
        brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3416,16 +3417,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
-               if (blocksize != PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR,
-                                       "error: unsupported blocksize for dax");
-                       goto failed_mount;
-               }
-               if (!sb->s_bdev->bd_disk->fops->direct_access) {
-                       ext4_msg(sb, KERN_ERR,
-                                       "error: device does not support dax");
+               err = bdev_dax_supported(sb, blocksize);
+               if (err)
                        goto failed_mount;
-               }
        }
 
        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
@@ -3930,6 +3924,9 @@ no_journal:
        if (!err)
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
+       if (!err)
+               err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount6;
index 2ad98d6e19f43c369d9eb1f65640374e1e133b80..70078096117d3e956e86be7ba358b724286989d9 100644 (file)
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
 
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               if (!(jinode->i_flags & JI_WRITE_DATA))
+                       continue;
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               if (!(jinode->i_flags & JI_WAIT_DATA))
+                       continue;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
index 435f0b26ac2038e4f8037b5b1f2e4d15dbc9b2d2..b31852f76f46585137021df6266a18ba743b8528 100644 (file)
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_inode_add_write);
+EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
index 2c56c3e3219463b165828c7c9c1d2a4ab8ef608e..1749519b362fa6ca7385de310d5c67096746278d 100644 (file)
@@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 /*
  * File inode in the inode list of the handle's transaction
  */
-int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
+                                  unsigned long flags)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
@@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
         * and if jinode->i_next_transaction == transaction, commit code
         * will only file the inode where we want it.
         */
-       if (jinode->i_transaction == transaction ||
-           jinode->i_next_transaction == transaction)
+       if ((jinode->i_transaction == transaction ||
+           jinode->i_next_transaction == transaction) &&
+           (jinode->i_flags & flags) == flags)
                return 0;
 
        spin_lock(&journal->j_list_lock);
-
+       jinode->i_flags |= flags;
+       /* Is inode already attached where we need it? */
        if (jinode->i_transaction == transaction ||
            jinode->i_next_transaction == transaction)
                goto done;
@@ -2523,6 +2526,17 @@ done:
        return 0;
 }
 
+int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
+{
+       return jbd2_journal_file_inode(handle, jinode,
+                                      JI_WRITE_DATA | JI_WAIT_DATA);
+}
+
+int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
+{
+       return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
+}
+
 /*
  * File truncate and transaction commit interact with each other in a
  * non-trivial way.  If a transaction writing data block A is
index 5375571cf6e19587c911c497d4be5089a2862727..15b124c18ed8235e9aec4ad8bd0edd36b969759c 100644 (file)
@@ -4542,7 +4542,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link)
 out:
        return len;
 }
-EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
index 618ced381a1405ff7c31f704409781c7ef1e00f6..aaa2e8d3df6f214d92e8ecd24a48c6e4eb029e69 100644 (file)
@@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        }
 
        if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
-                                       &args->cbl_range)) {
+                               &args->cbl_range,
+                               be32_to_cpu(args->cbl_stateid.seqid))) {
                rv = NFS4_OK;
                goto unlock;
        }
@@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        cps->slot = slot;
 
        /* The ca_maxresponsesize_cached is 0 with no DRC */
-       if (args->csa_cachethis != 0)
-               return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+       if (args->csa_cachethis != 0) {
+               status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+               goto out_unlock;
+       }
 
        /*
         * Check for pending referring calls.  If a match is found, a
index 976c90608e5618103d385a3436cafe461873a867..d81f96aacd51e71b1da710b477e7d44ff9a90b24 100644 (file)
@@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
        p = read_buf(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return htonl(NFS4ERR_RESOURCE);
-       memcpy(stateid, p, NFS4_STATEID_SIZE);
+       memcpy(stateid->data, p, NFS4_STATEID_SIZE);
        return 0;
 }
 
+static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
        __be32 *p;
@@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
        __be32 *p;
        __be32 status;
 
-       status = decode_stateid(xdr, &args->stateid);
+       status = decode_delegation_stateid(xdr, &args->stateid);
        if (unlikely(status != 0))
                goto out;
        p = read_buf(xdr, 4);
@@ -227,6 +233,11 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
 
 static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
                                       struct xdr_stream *xdr,
@@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
                }
                p = xdr_decode_hyper(p, &args->cbl_range.offset);
                p = xdr_decode_hyper(p, &args->cbl_range.length);
-               status = decode_stateid(xdr, &args->cbl_stateid);
+               status = decode_layout_stateid(xdr, &args->cbl_stateid);
                if (unlikely(status != 0))
                        goto out;
        } else if (args->cbl_recall_type == RETURN_FSID) {
index 5166adcfc0fb2379ed75c37b0aa09c8da7550607..322c2585bc341eb49d5045e0898101ae7a1abab9 100644 (file)
@@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp)
 
 /**
  * nfs4_copy_delegation_stateid - Copy inode's state ID information
- * @dst: stateid data structure to fill in
  * @inode: inode to check
  * @flags: delegation type requirement
+ * @dst: stateid data structure to fill in
+ * @cred: optional argument to retrieve credential
  *
  * Returns "true" and fills in "dst->data" * if inode had a delegation,
  * otherwise "false" is returned.
  */
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
-               fmode_t flags)
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
+               nfs4_stateid *dst, struct rpc_cred **cred)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
@@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
        if (ret) {
                nfs4_stateid_copy(dst, &delegation->stateid);
                nfs_mark_delegation_referenced(delegation);
+               if (cred)
+                       *cred = get_rpccred(delegation->cred);
        }
        rcu_read_unlock();
        return ret;
index 333063e032f01813762bc89f6d6db23ddc696c92..64724d252a7973074fcff861a31164c9df7415f9 100644 (file)
@@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
-bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
+bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
index 741a92c470bb5f173e12f6c7f63c103b49f356e2..979b3c4dee6aedbb354faefd1aca8cf1b3be8789 100644 (file)
@@ -87,6 +87,7 @@ struct nfs_direct_req {
        int                     mirror_count;
 
        ssize_t                 count,          /* bytes actually processed */
+                               max_count,      /* max expected count */
                                bytes_left,     /* bytes left to be sent */
                                io_start,       /* start of IO */
                                error;          /* any reported error */
@@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
        int i;
        ssize_t count;
 
+       WARN_ON_ONCE(dreq->count >= dreq->max_count);
+
        if (dreq->mirror_count == 1) {
                dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
                dreq->count += hdr->good_bytes;
@@ -275,7 +278,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
                              struct nfs_direct_req *dreq)
 {
-       cinfo->lock = &dreq->inode->i_lock;
+       cinfo->inode = dreq->inode;
        cinfo->mds = &dreq->mds_cinfo;
        cinfo->ds = &dreq->ds_cinfo;
        cinfo->dreq = dreq;
@@ -591,7 +594,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
                goto out_unlock;
 
        dreq->inode = inode;
-       dreq->bytes_left = count;
+       dreq->bytes_left = dreq->max_count = count;
        dreq->io_start = iocb->ki_pos;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -630,13 +633,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
                                  struct list_head *list,
                                  struct nfs_commit_info *cinfo)
 {
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
 #ifdef CONFIG_NFS_V4_1
        if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
                NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 #endif
        nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 }
 
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
@@ -671,13 +674,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                if (!nfs_pageio_add_request(&desc, req)) {
                        nfs_list_remove_request(req);
                        nfs_list_add_request(req, &failed);
-                       spin_lock(cinfo.lock);
+                       spin_lock(&cinfo.inode->i_lock);
                        dreq->flags = 0;
                        if (desc.pg_error < 0)
                                dreq->error = desc.pg_error;
                        else
                                dreq->error = -EIO;
-                       spin_unlock(cinfo.lock);
+                       spin_unlock(&cinfo.inode->i_lock);
                }
                nfs_release_request(req);
        }
@@ -1023,7 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
                goto out_unlock;
 
        dreq->inode = inode;
-       dreq->bytes_left = iov_iter_count(iter);
+       dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
        dreq->io_start = pos;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
index 3384dc8e66836c1fe4bc4e0a507a7e9250c5e719..aa59757389dc8b38887b4cce4dcc5a287a2ba42a 100644 (file)
@@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
        }
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        if (cinfo->ds->nbuckets >= size)
                goto out;
        for (i = 0; i < cinfo->ds->nbuckets; i++) {
@@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
        swap(cinfo->ds->buckets, buckets);
        cinfo->ds->nbuckets = size;
 out:
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        kfree(buckets);
        return 0;
 }
@@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_READ,
+                                          false,
                                           GFP_KERNEL);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                           0,
                                           NFS4_MAX_UINT64,
                                           IOMODE_RW,
+                                          false,
                                           GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
index 0cb1abd535e38469201478ee597e74045e593bc5..0e8018bc98808bf29828e4861aee0c1ed9df3b31 100644 (file)
@@ -26,6 +26,8 @@
 
 #define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
 
+static struct group_info       *ff_zero_group;
+
 static struct pnfs_layout_hdr *
 ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 {
@@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
        kfree(FF_LAYOUT_FROM_HDR(lo));
 }
 
-static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 {
        __be32 *p;
 
        p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return -ENOBUFS;
-       memcpy(stateid, p, NFS4_STATEID_SIZE);
+       stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
+       memcpy(stateid->data, p, NFS4_STATEID_SIZE);
        dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
                p[0], p[1], p[2], p[3]);
        return 0;
@@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 
 static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 {
+       struct rpc_cred *cred;
+
        ff_layout_remove_mirror(mirror);
        kfree(mirror->fh_versions);
-       if (mirror->cred)
-               put_rpccred(mirror->cred);
+       cred = rcu_access_pointer(mirror->ro_cred);
+       if (cred)
+               put_rpccred(cred);
+       cred = rcu_access_pointer(mirror->rw_cred);
+       if (cred)
+               put_rpccred(cred);
        nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
        kfree(mirror);
 }
@@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
 {
        u64 new_end, old_end;
 
+       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+               return false;
        if (new->pls_range.iomode != old->pls_range.iomode)
                return false;
        old_end = pnfs_calc_offset_end(old->pls_range.offset,
@@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
                        new_end);
        if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
                set_bit(NFS_LSEG_ROC, &new->pls_flags);
-       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
-               set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
        return true;
 }
 
@@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                struct nfs4_ff_layout_mirror *mirror;
                struct nfs4_deviceid devid;
                struct nfs4_deviceid_node *idnode;
-               u32 ds_count;
-               u32 fh_count;
+               struct auth_cred acred = { .group_info = ff_zero_group };
+               struct rpc_cred __rcu *cred;
+               u32 ds_count, fh_count, id;
                int j;
 
                rc = -EIO;
@@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                fls->mirror_array[i]->efficiency = be32_to_cpup(p);
 
                /* stateid */
-               rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+               rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
                if (rc)
                        goto out_err_free;
 
@@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                fls->mirror_array[i]->fh_versions_cnt = fh_count;
 
                /* user */
-               rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+               rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;
 
+               acred.uid = make_kuid(&init_user_ns, id);
+
                /* group */
-               rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+               rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;
 
+               acred.gid = make_kgid(&init_user_ns, id);
+
+               /* find the cred for it */
+               rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags));
+               if (IS_ERR(cred)) {
+                       rc = PTR_ERR(cred);
+                       goto out_err_free;
+               }
+
+               if (lgr->range.iomode == IOMODE_READ)
+                       rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+               else
+                       rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+
                mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
                if (mirror != fls->mirror_array[i]) {
+                       /* swap cred ptrs so free_mirror will clean up old */
+                       if (lgr->range.iomode == IOMODE_READ) {
+                               cred = xchg(&mirror->ro_cred, cred);
+                               rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
+                       } else {
+                               cred = xchg(&mirror->rw_cred, cred);
+                               rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
+                       }
                        ff_layout_free_mirror(fls->mirror_array[i]);
                        fls->mirror_array[i] = mirror;
                }
 
-               dprintk("%s: uid %d gid %d\n", __func__,
-                       fls->mirror_array[i]->uid,
-                       fls->mirror_array[i]->gid);
+               dprintk("%s: iomode %s uid %u gid %u\n", __func__,
+                       lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
+                       from_kuid(&init_user_ns, acred.uid),
+                       from_kgid(&init_user_ns, acred.gid));
        }
 
        p = xdr_inline_decode(&stream, 4);
@@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
        else {
                int i;
 
-               spin_lock(cinfo->lock);
+               spin_lock(&cinfo->inode->i_lock);
                if (cinfo->ds->nbuckets != 0)
                        kfree(buckets);
                else {
@@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                                        NFS_INVALID_STABLE_HOW;
                        }
                }
-               spin_unlock(cinfo->lock);
+               spin_unlock(&cinfo->inode->i_lock);
                return 0;
        }
 }
@@ -785,6 +820,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
        return NULL;
 }
 
+static void
+ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
+                     struct nfs_page *req,
+                     bool strict_iomode)
+{
+retry_strict:
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+                                          req->wb_context,
+                                          0,
+                                          NFS4_MAX_UINT64,
+                                          IOMODE_READ,
+                                          strict_iomode,
+                                          GFP_KERNEL);
+       if (IS_ERR(pgio->pg_lseg)) {
+               pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+               pgio->pg_lseg = NULL;
+       }
+
+       /* If we did not ask for strict iomode checking, got an
+        * IOMODE_RW segment, and the server wants to avoid READs
+        * on it, then retry with strict checking!
+        */
+       if (pgio->pg_lseg && !strict_iomode &&
+           ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
+               strict_iomode = true;
+               goto retry_strict;
+       }
+}
+
 static void
 ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
@@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        int ds_idx;
 
        /* Use full layout for now */
-       if (!pgio->pg_lseg) {
-               pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-                                                  req->wb_context,
-                                                  0,
-                                                  NFS4_MAX_UINT64,
-                                                  IOMODE_READ,
-                                                  GFP_KERNEL);
-               if (IS_ERR(pgio->pg_lseg)) {
-                       pgio->pg_error = PTR_ERR(pgio->pg_lseg);
-                       pgio->pg_lseg = NULL;
-                       return;
-               }
-       }
+       if (!pgio->pg_lseg)
+               ff_layout_pg_get_read(pgio, req, false);
+       else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg))
+               ff_layout_pg_get_read(pgio, req, true);
+
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;
 
        ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
-       if (!ds)
-               goto out_mds;
+       if (!ds) {
+               if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                       goto out_pnfs;
+               else
+                       goto out_mds;
+       }
+
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
 
        pgio->pg_mirror_idx = ds_idx;
@@ -828,6 +890,12 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_read_mds(pgio);
+       return;
+
+out_pnfs:
+       pnfs_set_lo_fail(pgio->pg_lseg);
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = NULL;
 }
 
 static void
@@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 
        for (i = 0; i < pgio->pg_mirror_count; i++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
-               if (!ds)
-                       goto out_mds;
+               if (!ds) {
+                       if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
+                               goto out_pnfs;
+                       else
+                               goto out_mds;
+               }
                pgm = &pgio->pg_mirrors[i];
                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
                pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
@@ -883,6 +956,12 @@ out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
+       return;
+
+out_pnfs:
+       pnfs_set_lo_fail(pgio->pg_lseg);
+       pnfs_put_lseg(pgio->pg_lseg);
+       pgio->pg_lseg = NULL;
 }
 
 static unsigned int
@@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
-               if (ff_layout_no_fallback_to_mds(lseg) ||
-                   ff_layout_has_available_ds(lseg))
+               if (ff_layout_avoid_mds_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
 reset:
                dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
                                        hdr->pgio_mirror_idx + 1,
                                        &hdr->pgio_mirror_idx))
                        goto out_eagain;
-               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-                       &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
@@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 
 static bool
-ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx)
 {
        /* No mirroring for now */
        struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
                rpc_exit(task, -EIO);
                return -EIO;
        }
-       if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-               dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
-               if (ff_layout_has_available_ds(hdr->lseg))
-                       pnfs_read_resend_pnfs(hdr);
-               else
-                       ff_layout_reset_read(hdr);
-               rpc_exit(task, 0);
+       if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+               rpc_exit(task, -EHOSTDOWN);
                return -EAGAIN;
        }
-       hdr->pgio_done_cb = ff_layout_read_done_cb;
 
        ff_layout_read_record_layoutstats_start(task, hdr);
        return 0;
@@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
                return -EIO;
        }
 
-       if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
-               bool retry_pnfs;
-
-               retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
-               dprintk("%s task %u reset io to %s\n", __func__,
-                       task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
-               ff_layout_reset_write(hdr, retry_pnfs);
-               rpc_exit(task, 0);
+       if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) {
+               rpc_exit(task, -EHOSTDOWN);
                return -EAGAIN;
        }
 
@@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
                goto out_failed;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                goto out_failed;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
                ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
 
+       hdr->pgio_done_cb = ff_layout_read_done_cb;
        atomic_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
@@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
                          vers == 3 ? &ff_layout_read_call_ops_v3 :
                                      &ff_layout_read_call_ops_v4,
                          0, RPC_TASK_SOFTCONN);
-
+       put_rpccred(ds_cred);
        return PNFS_ATTEMPTED;
 
 out_failed:
-       if (ff_layout_has_available_ds(lseg))
+       if (ff_layout_avoid_mds_available_ds(lseg))
                return PNFS_TRY_AGAIN;
        return PNFS_NOT_ATTEMPTED;
 }
@@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
                return PNFS_NOT_ATTEMPTED;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                return PNFS_NOT_ATTEMPTED;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
                          vers == 3 ? &ff_layout_write_call_ops_v3 :
                                      &ff_layout_write_call_ops_v4,
                          sync, RPC_TASK_SOFTCONN);
+       put_rpccred(ds_cred);
        return PNFS_ATTEMPTED;
 }
 
@@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        struct rpc_clnt *ds_clnt;
        struct rpc_cred *ds_cred;
        u32 idx;
-       int vers;
+       int vers, ret;
        struct nfs_fh *fh;
 
        idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
@@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
                goto out_err;
 
        ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
-       if (IS_ERR(ds_cred))
+       if (!ds_cred)
                goto out_err;
 
        vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        if (fh)
                data->args.fh = fh;
 
-       return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+       ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                               &ff_layout_commit_call_ops_v4,
                                   how, RPC_TASK_SOFTCONN);
+       put_rpccred(ds_cred);
+       return ret;
 out_err:
        pnfs_generic_prepare_to_resend_writes(data);
        pnfs_generic_commit_release(data);
@@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void)
 {
        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
               __func__);
+       if (!ff_zero_group) {
+               ff_zero_group = groups_alloc(0);
+               if (!ff_zero_group)
+                       return -ENOMEM;
+       }
        return pnfs_register_layoutdriver(&flexfilelayout_type);
 }
 
@@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void)
        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
               __func__);
        pnfs_unregister_layoutdriver(&flexfilelayout_type);
+       if (ff_zero_group) {
+               put_group_info(ff_zero_group);
+               ff_zero_group = NULL;
+       }
 }
 
 MODULE_ALIAS("nfs-layouttype4-4");
index dd353bb7dc0a01ddb76eabdca99a77b75193fe82..1bcdb15d0c41a74d139ce7db378c3e19844052a0 100644 (file)
@@ -10,7 +10,8 @@
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
 #define FF_FLAGS_NO_LAYOUTCOMMIT 1
-#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_IO_THRU_MDS  2
+#define FF_FLAGS_NO_READ_IO      4
 
 #include "../pnfs.h"
 
@@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror {
        u32                             fh_versions_cnt;
        struct nfs_fh                   *fh_versions;
        nfs4_stateid                    stateid;
-       u32                             uid;
-       u32                             gid;
-       struct rpc_cred                 *cred;
+       struct rpc_cred __rcu           *ro_cred;
+       struct rpc_cred __rcu           *rw_cred;
        atomic_t                        ref;
        spinlock_t                      lock;
        struct nfs4_ff_layoutstat       read_stat;
@@ -153,6 +153,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
        return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
 }
 
+static inline bool
+ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+       return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO;
+}
+
 static inline bool
 ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
 {
@@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
 struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
                                       u32 ds_idx, struct rpc_cred *mdscred);
 bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
+
 #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
index add0e5a70bd60f70479452ddc8a0ccf853c2c87f..0aa36be71fceaaf3ec532af442358b5ca80828ab 100644 (file)
@@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
                return e1->opnum < e2->opnum ? -1 : 1;
        if (e1->status != e2->status)
                return e1->status < e2->status ? -1 : 1;
-       ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+       ret = memcmp(e1->stateid.data, e2->stateid.data,
+                       sizeof(e1->stateid.data));
        if (ret != 0)
                return ret;
        ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
@@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
        return 0;
 }
 
-/* currently we only support AUTH_NONE and AUTH_SYS */
-static rpc_authflavor_t
-nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+static struct rpc_cred *
+ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
 {
-       if (mirror->uid == (u32)-1)
-               return RPC_AUTH_NULL;
-       return RPC_AUTH_UNIX;
-}
+       struct rpc_cred *cred, __rcu **pcred;
 
-/* fetch cred for NFSv3 DS */
-static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
-                                     struct nfs4_pnfs_ds *ds)
-{
-       if (ds->ds_clp && !mirror->cred &&
-           mirror->mirror_ds->ds_versions[0].version == 3) {
-               struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
-               struct rpc_cred *cred;
-               struct auth_cred acred = {
-                       .uid = make_kuid(&init_user_ns, mirror->uid),
-                       .gid = make_kgid(&init_user_ns, mirror->gid),
-               };
-
-               /* AUTH_NULL ignores acred */
-               cred = auth->au_ops->lookup_cred(auth, &acred, 0);
-               if (IS_ERR(cred)) {
-                       dprintk("%s: lookup_cred failed with %ld\n",
-                               __func__, PTR_ERR(cred));
-                       return PTR_ERR(cred);
-               } else {
-                       if (cmpxchg(&mirror->cred, NULL, cred))
-                               put_rpccred(cred);
-               }
-       }
-       return 0;
+       if (iomode == IOMODE_READ)
+               pcred = &mirror->ro_cred;
+       else
+               pcred = &mirror->rw_cred;
+
+       rcu_read_lock();
+       do {
+               cred = rcu_dereference(*pcred);
+               if (!cred)
+                       break;
+
+               cred = get_rpccred_rcu(cred);
+       } while(!cred);
+       rcu_read_unlock();
+       return cred;
 }
 
 struct nfs_fh *
@@ -356,7 +343,23 @@ out:
        return fh;
 }
 
-/* Upon return, either ds is connected, or ds is NULL */
+/**
+ * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
+ * @lseg: the layout segment we're operating on
+ * @ds_idx: index of the DS to use
+ * @fail_return: return layout on connect failure?
+ *
+ * Try to prepare a DS connection to accept an RPC call. This involves
+ * selecting a mirror to use and connecting the client to it if it's not
+ * already connected.
+ *
+ * Since we only need a single functioning mirror to satisfy a read, we don't
+ * want to return the layout if there is one. For writes though, any down
+ * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
+ * between the two cases.
+ *
+ * Returns a pointer to a connected DS object on success or NULL on failure.
+ */
 struct nfs4_pnfs_ds *
 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                          bool fail_return)
@@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        struct inode *ino = lseg->pls_layout->plh_inode;
        struct nfs_server *s = NFS_SERVER(ino);
        unsigned int max_payload;
-       rpc_authflavor_t flavor;
 
        if (!ff_layout_mirror_valid(lseg, mirror)) {
                pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
@@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
        smp_rmb();
        if (ds->ds_clp)
-               goto out_update_creds;
-
-       flavor = nfs4_ff_layout_choose_authflavor(mirror);
+               goto out;
 
        /* FIXME: For now we assume the server sent only one version of NFS
         * to use for the DS.
@@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                             dataserver_retrans,
                             mirror->mirror_ds->ds_versions[0].version,
                             mirror->mirror_ds->ds_versions[0].minor_version,
-                            flavor);
+                            RPC_AUTH_UNIX);
 
        /* connect success, check rsize/wsize limit */
        if (ds->ds_clp) {
@@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                                         mirror, lseg->pls_range.offset,
                                         lseg->pls_range.length, NFS4ERR_NXIO,
                                         OP_ILLEGAL, GFP_NOIO);
-               if (!fail_return) {
-                       if (ff_layout_has_available_ds(lseg))
-                               set_bit(NFS_LAYOUT_RETURN_REQUESTED,
-                                       &lseg->pls_layout->plh_flags);
-                       else
-                               pnfs_error_mark_layout_for_return(ino, lseg);
-               } else
+               if (fail_return || !ff_layout_has_available_ds(lseg))
                        pnfs_error_mark_layout_for_return(ino, lseg);
                ds = NULL;
-               goto out;
        }
-out_update_creds:
-       if (ff_layout_update_mirror_cred(mirror, ds))
-               ds = NULL;
 out:
        return ds;
 }
@@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
                      struct rpc_cred *mdscred)
 {
        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
-       struct rpc_cred *cred = ERR_PTR(-EINVAL);
-
-       if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
-               goto out;
+       struct rpc_cred *cred;
 
-       if (mirror && mirror->cred)
-               cred = mirror->cred;
-       else
-               cred = mdscred;
-out:
+       if (mirror) {
+               cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode);
+               if (!cred)
+                       cred = get_rpccred(mdscred);
+       } else {
+               cred = get_rpccred(mdscred);
+       }
        return cred;
 }
 
@@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
        return ff_rw_layout_has_available_ds(lseg);
 }
 
+bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
+{
+       return ff_layout_no_fallback_to_mds(lseg) ||
+              ff_layout_has_available_ds(lseg);
+}
+
+bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
+{
+       return lseg->pls_range.iomode == IOMODE_RW &&
+              ff_layout_no_read_on_rw(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
                        "retries a request before it attempts further "
index f1d1d2c472e99fbde4bfa5f9c30dfedd05c28b90..5154fa65a2f2a20efad5eeb9667bd10974f8aaed 100644 (file)
@@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
                             u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
                            int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
index b587ccd3108344b3c29c75e2fa40d2e884f3ed1c..b6cd15314bab4e3260c5fdc2809ebf54a02b4291 100644 (file)
@@ -13,6 +13,7 @@
 
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
index dff83460e5a63ff0b6578f419a793d70720c3997..aa03ed09ba06f79763146b0a6f8cb171c9778b0c 100644 (file)
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        return err;
 }
 
+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+                               struct nfs_lock_context *src_lock,
+                               struct file *dst, loff_t pos_dst,
+                               struct nfs_lock_context *dst_lock,
+                               size_t count)
+{
+       struct nfs42_copy_args args = {
+               .src_fh         = NFS_FH(file_inode(src)),
+               .src_pos        = pos_src,
+               .dst_fh         = NFS_FH(file_inode(dst)),
+               .dst_pos        = pos_dst,
+               .count          = count,
+       };
+       struct nfs42_copy_res res;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+               .rpc_argp = &args,
+               .rpc_resp = &res,
+       };
+       struct inode *dst_inode = file_inode(dst);
+       struct nfs_server *server = NFS_SERVER(dst_inode);
+       int status;
+
+       status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+                                    src_lock, FMODE_READ);
+       if (status)
+               return status;
+
+       status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+                                    dst_lock, FMODE_WRITE);
+       if (status)
+               return status;
+
+       status = nfs4_call_sync(server->client, server, &msg,
+                               &args.seq_args, &res.seq_res, 0);
+       if (status == -ENOTSUPP)
+               server->caps &= ~NFS_CAP_COPY;
+       if (status)
+               return status;
+
+       if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+               status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+               if (status)
+                       return status;
+       }
+
+       truncate_pagecache_range(dst_inode, pos_dst,
+                                pos_dst + res.write_res.count);
+
+       return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+                       struct file *dst, loff_t pos_dst,
+                       size_t count)
+{
+       struct nfs_server *server = NFS_SERVER(file_inode(dst));
+       struct nfs_lock_context *src_lock;
+       struct nfs_lock_context *dst_lock;
+       struct nfs4_exception src_exception = { };
+       struct nfs4_exception dst_exception = { };
+       ssize_t err, err2;
+
+       if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+               return -EOPNOTSUPP;
+
+       src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+       if (IS_ERR(src_lock))
+               return PTR_ERR(src_lock);
+
+       src_exception.inode = file_inode(src);
+       src_exception.state = src_lock->open_context->state;
+
+       dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+       if (IS_ERR(dst_lock)) {
+               err = PTR_ERR(dst_lock);
+               goto out_put_src_lock;
+       }
+
+       dst_exception.inode = file_inode(dst);
+       dst_exception.state = dst_lock->open_context->state;
+
+       do {
+               inode_lock(file_inode(dst));
+               err = _nfs42_proc_copy(src, pos_src, src_lock,
+                                      dst, pos_dst, dst_lock, count);
+               inode_unlock(file_inode(dst));
+
+               if (err == -ENOTSUPP) {
+                       err = -EOPNOTSUPP;
+                       break;
+               }
+
+               err2 = nfs4_handle_exception(server, err, &src_exception);
+               err  = nfs4_handle_exception(server, err, &dst_exception);
+               if (!err)
+                       err = err2;
+       } while (src_exception.retry || dst_exception.retry);
+
+       nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+       nfs_put_lock_context(src_lock);
+       return err;
+}
+
 static loff_t _nfs42_proc_llseek(struct file *filep,
                struct nfs_lock_context *lock, loff_t offset, int whence)
 {
@@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
                         * with the current stateid.
                         */
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
                        spin_unlock(&inode->i_lock);
                        pnfs_free_lseg_list(&head);
                } else
index 0ca482a51e53200c3a3de15e347e8ba138b78c72..6dc6f2aea0d6c5380d42369ae44f48cfbdcf7622 100644 (file)
@@ -9,9 +9,22 @@
 #define encode_fallocate_maxsz         (encode_stateid_maxsz + \
                                         2 /* offset */ + \
                                         2 /* length */)
+#define NFS42_WRITE_RES_SIZE           (1 /* wr_callback_id size */ +\
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        2 /* wr_count */ + \
+                                        1 /* wr_committed */ + \
+                                        XDR_QUADLEN(NFS4_VERIFIER_SIZE))
 #define encode_allocate_maxsz          (op_encode_hdr_maxsz + \
                                         encode_fallocate_maxsz)
 #define decode_allocate_maxsz          (op_decode_hdr_maxsz)
+#define encode_copy_maxsz              (op_encode_hdr_maxsz +          \
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+                                        2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz              (op_decode_hdr_maxsz + \
+                                        NFS42_WRITE_RES_SIZE + \
+                                        1 /* cr_consecutive */ + \
+                                        1 /* cr_synchronous */)
 #define encode_deallocate_maxsz                (op_encode_hdr_maxsz + \
                                         encode_fallocate_maxsz)
 #define decode_deallocate_maxsz                (op_decode_hdr_maxsz)
                                         decode_putfh_maxsz + \
                                         decode_allocate_maxsz + \
                                         decode_getattr_maxsz)
+#define NFS4_enc_copy_sz               (compound_encode_hdr_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        encode_savefh_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        encode_copy_maxsz)
+#define NFS4_dec_copy_sz               (compound_decode_hdr_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        decode_savefh_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        decode_copy_maxsz)
 #define NFS4_enc_deallocate_sz         (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
                                         encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
        encode_fallocate(xdr, args);
 }
 
+static void encode_copy(struct xdr_stream *xdr,
+                       struct nfs42_copy_args *args,
+                       struct compound_hdr *hdr)
+{
+       encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+       encode_nfs4_stateid(xdr, &args->src_stateid);
+       encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+       encode_uint64(xdr, args->src_pos);
+       encode_uint64(xdr, args->dst_pos);
+       encode_uint64(xdr, args->count);
+
+       encode_uint32(xdr, 1); /* consecutive = true */
+       encode_uint32(xdr, 1); /* synchronous = true */
+       encode_uint32(xdr, 0); /* src server list */
+}
+
 static void encode_deallocate(struct xdr_stream *xdr,
                              struct nfs42_falloc_args *args,
                              struct compound_hdr *hdr)
@@ -181,6 +221,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
        encode_nops(&hdr);
 }
 
+/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+                             struct xdr_stream *xdr,
+                             struct nfs42_copy_args *args)
+{
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &hdr);
+       encode_sequence(xdr, &args->seq_args, &hdr);
+       encode_putfh(xdr, args->src_fh, &hdr);
+       encode_savefh(xdr, &hdr);
+       encode_putfh(xdr, args->dst_fh, &hdr);
+       encode_copy(xdr, args, &hdr);
+       encode_nops(&hdr);
+}
+
 /*
  * Encode DEALLOCATE request
  */
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
        return decode_op_hdr(xdr, OP_ALLOCATE);
 }
 
+static int decode_write_response(struct xdr_stream *xdr,
+                                struct nfs42_write_res *res)
+{
+       __be32 *p;
+       int stateids;
+
+       p = xdr_inline_decode(xdr, 4 + 8 + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+
+       stateids = be32_to_cpup(p++);
+       p = xdr_decode_hyper(p, &res->count);
+       res->verifier.committed = be32_to_cpup(p);
+       return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+                                   struct nfs42_copy_res *res) {
+       __be32 *p;
+
+       p = xdr_inline_decode(xdr, 4 + 4);
+       if (unlikely(!p))
+               goto out_overflow;
+
+       res->consecutive = be32_to_cpup(p++);
+       res->synchronous = be32_to_cpup(p++);
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+       int status;
+
+       status = decode_op_hdr(xdr, OP_COPY);
+       if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+               status = decode_copy_requirements(xdr, res);
+               if (status)
+                       return status;
+               return NFS4ERR_OFFLOAD_NO_REQS;
+       } else if (status)
+               return status;
+
+       status = decode_write_response(xdr, &res->write_res);
+       if (status)
+               return status;
+
+       return decode_copy_requirements(xdr, res);
+}
+
 static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
        return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -330,6 +446,36 @@ out:
        return status;
 }
 
+/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+                            struct xdr_stream *xdr,
+                            struct nfs42_copy_res *res)
+{
+       struct compound_hdr hdr;
+       int status;
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       status = decode_savefh(xdr);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       status = decode_copy(xdr, res);
+out:
+       return status;
+}
+
 /*
  * Decode DEALLOCATE request
  */
index 4afdee420d253862e7dded0d84f6743b00de337f..768456fa1b177a2f3b35c9352d1949c9fc44a03b 100644 (file)
@@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
                                      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
-               fmode_t, const struct nfs_lockowner *);
+extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
+               const struct nfs_lockowner *, nfs4_stateid *,
+               struct rpc_cred **);
 
 extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4;
 
 static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
 {
-       memcpy(dst, src, sizeof(*dst));
+       memcpy(dst->data, src->data, sizeof(dst->data));
+       dst->type = src->type;
 }
 
 static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
 {
-       return memcmp(dst, src, sizeof(*dst)) == 0;
+       if (dst->type != src->type)
+               return false;
+       return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
 }
 
 static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
index d0390516467c00f7f7506e3832b561338cd5bb79..014b0e41ace5a8f5b17510fd32d9403faa0c4084 100644 (file)
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
 }
 
 #ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+                                   struct file *file_out, loff_t pos_out,
+                                   size_t count, unsigned int flags)
+{
+       struct inode *in_inode = file_inode(file_in);
+       struct inode *out_inode = file_inode(file_out);
+       int ret;
+
+       if (in_inode == out_inode)
+               return -EINVAL;
+
+       /* flush any pending writes */
+       ret = nfs_sync_inode(in_inode);
+       if (ret)
+               return ret;
+       ret = nfs_sync_inode(out_inode);
+       if (ret)
+               return ret;
+
+       return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
 static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
 {
        loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
        .check_flags    = nfs_check_flags,
        .setlease       = simple_nosetlease,
 #ifdef CONFIG_NFS_V4_2
+       .copy_file_range = nfs4_copy_file_range,
        .llseek         = nfs4_file_llseek,
        .fallocate      = nfs42_fallocate,
        .clone_file_range = nfs42_clone_file_range,
index 084e8570da18bbdac42416a353fa1e4786403f51..223982eb38c943aa4fbde960b889eb9dbc4853c8 100644 (file)
 #define NFS4_POLL_RETRY_MIN    (HZ/10)
 #define NFS4_POLL_RETRY_MAX    (15*HZ)
 
+/* file attributes which can be mapped to nfs attributes */
+#define NFS4_VALID_ATTRS (ATTR_MODE \
+       | ATTR_UID \
+       | ATTR_GID \
+       | ATTR_SIZE \
+       | ATTR_ATIME \
+       | ATTR_MTIME \
+       | ATTR_CTIME \
+       | ATTR_ATIME_SET \
+       | ATTR_MTIME_SET)
+
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
                case -NFS4ERR_DELAY:
                        nfs_inc_server_stats(server, NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
+               case -NFS4ERR_RECALLCONFLICT:
                        exception->delay = 1;
                        return 0;
 
@@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir,
        if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
            (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
                nfs4_exclusive_attrset(opendata, sattr, &label);
-
-               nfs_fattr_init(opendata->o_res.f_attr);
-               status = nfs4_do_setattr(state->inode, cred,
-                               opendata->o_res.f_attr, sattr,
-                               state, label, olabel);
-               if (status == 0) {
-                       nfs_setattr_update_inode(state->inode, sattr,
-                                       opendata->o_res.f_attr);
-                       nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+               /*
+                * send create attributes which were not set by open
+                * with an extra setattr.
+                */
+               if (sattr->ia_valid & NFS4_VALID_ATTRS) {
+                       nfs_fattr_init(opendata->o_res.f_attr);
+                       status = nfs4_do_setattr(state->inode, cred,
+                                       opendata->o_res.f_attr, sattr,
+                                       state, label, olabel);
+                       if (status == 0) {
+                               nfs_setattr_update_inode(state->inode, sattr,
+                                               opendata->o_res.f_attr);
+                               nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+                       }
                }
        }
        if (opened && opendata->file_created)
@@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                .rpc_resp       = &res,
                .rpc_cred       = cred,
         };
+       struct rpc_cred *delegation_cred = NULL;
        unsigned long timestamp = jiffies;
        fmode_t fmode;
        bool truncate;
@@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
        fmode = truncate ? FMODE_WRITE : FMODE_READ;
 
-       if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+       if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
                /* Use that stateid */
        } else if (truncate && state != NULL) {
                struct nfs_lockowner lockowner = {
@@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                };
                if (!nfs4_valid_open_stateid(state))
                        return -EBADF;
-               if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
-                               &lockowner) == -EIO)
+               if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
+                               &arg.stateid, &delegation_cred) == -EIO)
                        return -EBADF;
        } else
                nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+       if (delegation_cred)
+               msg.rpc_cred = delegation_cred;
 
        status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+
+       put_rpccred(delegation_cred);
        if (status == 0 && state != NULL)
                renew_lease(server, timestamp);
        trace_nfs4_setattr(inode, &arg.stateid, status);
@@ -4285,7 +4307,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
 
        if (l_ctx != NULL)
                lockowner = &l_ctx->lockowner;
-       return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+       return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
 }
 EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
 
@@ -6054,6 +6076,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
        struct nfs_inode *nfsi = NFS_I(state->inode);
+       struct nfs4_state_owner *sp = state->owner;
        unsigned char fl_flags = request->fl_flags;
        int status = -ENOLCK;
 
@@ -6068,6 +6091,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
        status = do_vfs_lock(state->inode, request);
        if (status < 0)
                goto out;
+       mutex_lock(&sp->so_delegreturn_mutex);
        down_read(&nfsi->rwsem);
        if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
                /* Yes: cache locks! */
@@ -6075,9 +6099,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
                request->fl_flags = fl_flags & ~FL_SLEEP;
                status = do_vfs_lock(state->inode, request);
                up_read(&nfsi->rwsem);
+               mutex_unlock(&sp->so_delegreturn_mutex);
                goto out;
        }
        up_read(&nfsi->rwsem);
+       mutex_unlock(&sp->so_delegreturn_mutex);
        status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 out:
        request->fl_flags = fl_flags;
@@ -7351,9 +7377,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
  * always set csa_cachethis to FALSE because the current implementation
  * of the back channel DRC only supports caching the CB_SEQUENCE operation.
  */
-static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
+static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
+                                   struct rpc_clnt *clnt)
 {
        unsigned int max_rqst_sz, max_resp_sz;
+       unsigned int max_bc_payload = rpc_max_bc_payload(clnt);
 
        max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
        max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
@@ -7371,8 +7399,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
 
        /* Back channel attributes */
-       args->bc_attrs.max_rqst_sz = PAGE_SIZE;
-       args->bc_attrs.max_resp_sz = PAGE_SIZE;
+       args->bc_attrs.max_rqst_sz = max_bc_payload;
+       args->bc_attrs.max_resp_sz = max_bc_payload;
        args->bc_attrs.max_resp_sz_cached = 0;
        args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
        args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
@@ -7476,7 +7504,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
        };
        int status;
 
-       nfs4_init_channel_attrs(&args);
+       nfs4_init_channel_attrs(&args, clp->cl_rpcclient);
        args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN);
 
        status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -7820,40 +7848,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
        struct nfs4_layoutget *lgp = calldata;
        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
        struct nfs4_session *session = nfs4_get_session(server);
-       int ret;
 
        dprintk("--> %s\n", __func__);
-       /* Note the is a race here, where a CB_LAYOUTRECALL can come in
-        * right now covering the LAYOUTGET we are about to send.
-        * However, that is not so catastrophic, and there seems
-        * to be no way to prevent it completely.
-        */
-       if (nfs41_setup_sequence(session, &lgp->args.seq_args,
-                               &lgp->res.seq_res, task))
-               return;
-       ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
-                                         NFS_I(lgp->args.inode)->layout,
-                                         &lgp->args.range,
-                                         lgp->args.ctx->state);
-       if (ret < 0)
-               rpc_exit(task, ret);
+       nfs41_setup_sequence(session, &lgp->args.seq_args,
+                               &lgp->res.seq_res, task);
+       dprintk("<-- %s\n", __func__);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_layoutget *lgp = calldata;
+
+       dprintk("--> %s\n", __func__);
+       nfs41_sequence_done(task, &lgp->res.seq_res);
+       dprintk("<-- %s\n", __func__);
+}
+
+static int
+nfs4_layoutget_handle_exception(struct rpc_task *task,
+               struct nfs4_layoutget *lgp, struct nfs4_exception *exception)
+{
        struct inode *inode = lgp->args.inode;
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layout_hdr *lo;
-       struct nfs4_state *state = NULL;
-       unsigned long timeo, now, giveup;
+       int status = task->tk_status;
 
        dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
 
-       if (!nfs41_sequence_done(task, &lgp->res.seq_res))
-               goto out;
-
-       switch (task->tk_status) {
+       switch (status) {
        case 0:
                goto out;
 
@@ -7863,57 +7885,43 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
         * retry go inband.
         */
        case -NFS4ERR_LAYOUTUNAVAILABLE:
-               task->tk_status = -ENODATA;
+               status = -ENODATA;
                goto out;
        /*
         * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
         * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
         */
        case -NFS4ERR_BADLAYOUT:
-               goto out_overflow;
+               status = -EOVERFLOW;
+               goto out;
        /*
         * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
         * (or clients) writing to the same RAID stripe except when
         * the minlength argument is 0 (see RFC5661 section 18.43.3).
+        *
+        * Treat it like we would RECALLCONFLICT -- we retry for a little
+        * while, and then eventually give up.
         */
        case -NFS4ERR_LAYOUTTRYLATER:
-               if (lgp->args.minlength == 0)
-                       goto out_overflow;
-       /*
-        * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
-        * existing layout before getting a new one).
-        */
-       case -NFS4ERR_RECALLCONFLICT:
-               timeo = rpc_get_timeout(task->tk_client);
-               giveup = lgp->args.timestamp + timeo;
-               now = jiffies;
-               if (time_after(giveup, now)) {
-                       unsigned long delay;
-
-                       /* Delay for:
-                        * - Not less then NFS4_POLL_RETRY_MIN.
-                        * - One last time a jiffie before we give up
-                        * - exponential backoff (time_now minus start_attempt)
-                        */
-                       delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
-                                   min((giveup - now - 1),
-                                       now - lgp->args.timestamp));
-
-                       dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
-                               __func__, delay);
-                       rpc_delay(task, delay);
-                       /* Do not call nfs4_async_handle_error() */
-                       goto out_restart;
+               if (lgp->args.minlength == 0) {
+                       status = -EOVERFLOW;
+                       goto out;
                }
-               break;
+               /* Fallthrough */
+       case -NFS4ERR_RECALLCONFLICT:
+               nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
+                                       exception);
+               status = -ERECALLCONFLICT;
+               goto out;
        case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
+               exception->timeout = 0;
                spin_lock(&inode->i_lock);
                if (nfs4_stateid_match(&lgp->args.stateid,
                                        &lgp->args.ctx->state->stateid)) {
                        spin_unlock(&inode->i_lock);
                        /* If the open stateid was bad, then recover it. */
-                       state = lgp->args.ctx->state;
+                       exception->state = lgp->args.ctx->state;
                        break;
                }
                lo = NFS_I(inode)->layout;
@@ -7926,25 +7934,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
                         * with the current stateid.
                         */
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
                        spin_unlock(&inode->i_lock);
                        pnfs_free_lseg_list(&head);
                } else
                        spin_unlock(&inode->i_lock);
-               goto out_restart;
+               status = -EAGAIN;
+               goto out;
        }
-       if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
-               goto out_restart;
+
+       status = nfs4_handle_exception(server, status, exception);
+       if (exception->retry)
+               status = -EAGAIN;
 out:
        dprintk("<-- %s\n", __func__);
-       return;
-out_restart:
-       task->tk_status = 0;
-       rpc_restart_call_prepare(task);
-       return;
-out_overflow:
-       task->tk_status = -EOVERFLOW;
-       goto out;
+       return status;
 }
 
 static size_t max_response_pages(struct nfs_server *server)
@@ -8013,7 +8017,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 };
 
 struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
 {
        struct inode *inode = lgp->args.inode;
        struct nfs_server *server = NFS_SERVER(inode);
@@ -8033,6 +8037,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
                .flags = RPC_TASK_ASYNC,
        };
        struct pnfs_layout_segment *lseg = NULL;
+       struct nfs4_exception exception = { .timeout = *timeout };
        int status = 0;
 
        dprintk("--> %s\n", __func__);
@@ -8046,7 +8051,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
                return ERR_PTR(-ENOMEM);
        }
        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
-       lgp->args.timestamp = jiffies;
 
        lgp->res.layoutp = &lgp->args.layout;
        lgp->res.seq_res.sr_slot = NULL;
@@ -8056,13 +8060,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
        if (IS_ERR(task))
                return ERR_CAST(task);
        status = nfs4_wait_for_completion_rpc_task(task);
-       if (status == 0)
-               status = task->tk_status;
+       if (status == 0) {
+               status = nfs4_layoutget_handle_exception(task, lgp, &exception);
+               *timeout = exception.timeout;
+       }
+
        trace_nfs4_layoutget(lgp->args.ctx,
                        &lgp->args.range,
                        &lgp->res.range,
                        &lgp->res.stateid,
                        status);
+
        /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
        if (status == 0 && lgp->res.layoutp->len)
                lseg = pnfs_layout_process(lgp);
@@ -8118,7 +8126,8 @@ static void nfs4_layoutreturn_release(void *calldata)
 
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
-       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+       pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
+                       be32_to_cpu(lrp->args.stateid.seqid));
        pnfs_mark_layout_returned_if_empty(lo);
        if (lrp->res.lrs_present)
                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
@@ -8653,6 +8662,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
                const nfs4_stateid *s2)
 {
+       if (s1->type != s2->type)
+               return false;
+
        if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
                return false;
 
@@ -8793,6 +8805,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
                | NFS_CAP_STATEID_NFSV41
                | NFS_CAP_ATOMIC_OPEN_V1
                | NFS_CAP_ALLOCATE
+               | NFS_CAP_COPY
                | NFS_CAP_DEALLOCATE
                | NFS_CAP_SEEK
                | NFS_CAP_LAYOUTSTATS
index d854693a15b0e2443779986552d29d9db3f6cdc2..5075592df145a9af6d18df74a0292ff862334576 100644 (file)
 
 #define OPENOWNER_POOL_SIZE    8
 
-const nfs4_stateid zero_stateid;
+const nfs4_stateid zero_stateid = {
+       .data = { 0 },
+       .type = NFS4_SPECIAL_STATEID_TYPE,
+};
 static DEFINE_MUTEX(nfs_clid_init_mutex);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
@@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
-               fmode_t fmode, const struct nfs_lockowner *lockowner)
+int nfs4_select_rw_stateid(struct nfs4_state *state,
+               fmode_t fmode, const struct nfs_lockowner *lockowner,
+               nfs4_stateid *dst, struct rpc_cred **cred)
 {
-       int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+       int ret;
+
+       if (cred != NULL)
+               *cred = NULL;
+       ret = nfs4_copy_lock_stateid(dst, state, lockowner);
        if (ret == -EIO)
                /* A lost lock - don't even consider delegations */
                goto out;
        /* returns true if delegation stateid found and copied */
-       if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+       if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) {
                ret = 0;
                goto out;
        }
index 2c8d05dae5b16a384c088a2e0fd21ada3cf55f96..9c150b15378223697b8b088dd58e9e7a7ae53562 100644 (file)
@@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
                { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" },    \
                { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" },          \
                { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },      \
+               { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" },    \
+               { PNFS_UPDATE_LAYOUT_RETRY, "retrying" },       \
                { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
 
 TRACE_EVENT(pnfs_update_layout,
@@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout,
                        u64 count,
                        enum pnfs_iomode iomode,
                        struct pnfs_layout_hdr *lo,
+                       struct pnfs_layout_segment *lseg,
                        enum pnfs_update_layout_reason reason
                ),
-               TP_ARGS(inode, pos, count, iomode, lo, reason),
+               TP_ARGS(inode, pos, count, iomode, lo, lseg, reason),
                TP_STRUCT__entry(
                        __field(dev_t, dev)
                        __field(u64, fileid)
@@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout,
                        __field(enum pnfs_iomode, iomode)
                        __field(int, layoutstateid_seq)
                        __field(u32, layoutstateid_hash)
+                       __field(long, lseg)
                        __field(enum pnfs_update_layout_reason, reason)
                ),
                TP_fast_assign(
@@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout,
                                __entry->layoutstateid_seq = 0;
                                __entry->layoutstateid_hash = 0;
                        }
+                       __entry->lseg = (long)lseg;
                ),
                TP_printk(
                        "fileid=%02x:%02x:%llu fhandle=0x%08x "
                        "iomode=%s pos=%llu count=%llu "
-                       "layoutstateid=%d:0x%08x (%s)",
+                       "layoutstateid=%d:0x%08x lseg=0x%lx (%s)",
                        MAJOR(__entry->dev), MINOR(__entry->dev),
                        (unsigned long long)__entry->fileid,
                        __entry->fhandle,
@@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout,
                        (unsigned long long)__entry->pos,
                        (unsigned long long)__entry->count,
                        __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+                       __entry->lseg,
                        show_pnfs_update_layout_reason(__entry->reason)
                )
 );
index 88474a4fc669053ab078e71f49d818a9e161a2f1..661e753fe1c93d0c6be59fd26c0339316d50ee12 100644 (file)
@@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
        return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
 }
 
+static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_OPEN_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
+static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LOCK_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
+static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_DELEGATION_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
        int status;
@@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
        if (status == -EIO)
                goto out;
        if (status == 0) {
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_lock_stateid(xdr, &res->stateid);
                if (unlikely(status))
                        goto out;
        } else if (status == -NFS4ERR_DENIED)
@@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
        if (status != -EIO)
                nfs_increment_lock_seqid(status, res->seqid);
        if (status == 0)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_lock_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
        __be32 *p;
        int status;
 
-       status = decode_stateid(xdr, &res->delegation);
+       status = decode_delegation_stateid(xdr, &res->delegation);
        if (unlikely(status))
                return status;
        p = xdr_inline_decode(xdr, 4);
@@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
        nfs_increment_open_seqid(status, res->seqid);
        if (status)
                return status;
-       status = decode_stateid(xdr, &res->stateid);
+       status = decode_open_stateid(xdr, &res->stateid);
        if (unlikely(status))
                return status;
 
@@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
        if (status != -EIO)
                nfs_increment_open_seqid(status, res->seqid);
        if (!status)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_open_stateid(xdr, &res->stateid);
        return status;
 }
 
@@ -5838,6 +5856,12 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+       stateid->type = NFS4_LAYOUT_STATEID_TYPE;
+       return decode_stateid(xdr, stateid);
+}
+
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
                                struct nfs4_getdeviceinfo_res *res)
 {
@@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
        if (unlikely(!p))
                goto out_overflow;
        res->return_on_close = be32_to_cpup(p);
-       decode_stateid(xdr, &res->stateid);
+       decode_layout_stateid(xdr, &res->stateid);
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                goto out_overflow;
@@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr,
                goto out_overflow;
        res->lrs_present = be32_to_cpup(p);
        if (res->lrs_present)
-               status = decode_stateid(xdr, &res->stateid);
+               status = decode_layout_stateid(xdr, &res->stateid);
        return status;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -7515,6 +7539,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(DEALLOCATE,        enc_deallocate,         dec_deallocate),
        PROC(LAYOUTSTATS,       enc_layoutstats,        dec_layoutstats),
        PROC(CLONE,             enc_clone,              dec_clone),
+       PROC(COPY,              enc_copy,               dec_copy),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
index 1f6db42310575b15785ee8cf43083b7d134ba872..174dd4cf5747f50afaa4d7217fdee16a1e6de11a 100644 (file)
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
         * long write-back delay. This will be adjusted in
         * update_nfs_request below if the region is not locked. */
        req->wb_page    = page;
-       req->wb_index   = page_file_index(page);
-       get_page(page);
+       if (page) {
+               req->wb_index = page_file_index(page);
+               get_page(page);
+       }
        req->wb_offset  = offset;
        req->wb_pgbase  = offset;
        req->wb_bytes   = count;
index 89a5ef4df08a3a548af9a8e16a25eff5b50095c2..0c7e0d45a4de6ee1fba11c40417d3fb01678049b 100644 (file)
@@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
        };
 
        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-       return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+       return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
 }
 
 static int
@@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 
        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
-       pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+       pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
@@ -522,13 +522,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
        return rv;
 }
 
-/* Returns count of number of matching invalid lsegs remaining in list
- * after call.
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+       return (s32)(s1 - s2) > 0;
+}
+
+/**
+ * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
+ * @lo: layout header containing the lsegs
+ * @tmp_list: list head where doomed lsegs should go
+ * @recall_range: optional recall range argument to match (may be NULL)
+ * @seq: only invalidate lsegs obtained at or before this sequence (may be 0)
+ *
+ * Walk the list of lsegs in the layout header, and tear down any that should
+ * be destroyed. If "recall_range" is specified then the segment must match
+ * that range. If "seq" is non-zero, then only match segments that were handed
+ * out at or before that sequence.
+ *
+ * Returns number of matching invalid lsegs remaining in list after scanning
+ * it and purging them.
  */
 int
 pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
-                           const struct pnfs_layout_range *recall_range)
+                           const struct pnfs_layout_range *recall_range,
+                           u32 seq)
 {
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;
@@ -540,10 +562,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
-                       dprintk("%s: freeing lseg %p iomode %d "
+                       if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+                               continue;
+                       dprintk("%s: freeing lseg %p iomode %d seq %u"
                                "offset %llu length %llu\n", __func__,
-                               lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
-                               lseg->pls_range.length);
+                               lseg, lseg->pls_range.iomode, lseg->pls_seq,
+                               lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
@@ -730,15 +754,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        pnfs_destroy_layouts_byclid(clp, false);
 }
 
-/*
- * Compare 2 layout stateid sequence ids, to see which is newer,
- * taking into account wraparound issues.
- */
-static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
-{
-       return (s32)(s1 - s2) > 0;
-}
-
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -781,50 +796,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 
-int
-pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-                             const struct pnfs_layout_range *range,
-                             struct nfs4_state *open_state)
-{
-       int status = 0;
-
-       dprintk("--> %s\n", __func__);
-       spin_lock(&lo->plh_inode->i_lock);
-       if (pnfs_layoutgets_blocked(lo)) {
-               status = -EAGAIN;
-       } else if (!nfs4_valid_open_stateid(open_state)) {
-               status = -EBADF;
-       } else if (list_empty(&lo->plh_segs) ||
-                  test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
-               int seq;
-
-               do {
-                       seq = read_seqbegin(&open_state->seqlock);
-                       nfs4_stateid_copy(dst, &open_state->stateid);
-               } while (read_seqretry(&open_state->seqlock, seq));
-       } else
-               nfs4_stateid_copy(dst, &lo->plh_stateid);
-       spin_unlock(&lo->plh_inode->i_lock);
-       dprintk("<-- %s\n", __func__);
-       return status;
-}
-
 /*
-* Get layout from server.
-*    for now, assume that whole file layouts are requested.
-*    arg->offset: 0
-*    arg->length: all ones
-*/
+ * Get layout from server.
+ *    for now, assume that whole file layouts are requested.
+ *    arg->offset: 0
+ *    arg->length: all ones
+ */
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
+          nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
-          gfp_t gfp_flags)
+          long *timeout, gfp_t gfp_flags)
 {
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
-       struct pnfs_layout_segment *lseg;
        loff_t i_size;
 
        dprintk("--> %s\n", __func__);
@@ -834,40 +821,31 @@ send_layoutget(struct pnfs_layout_hdr *lo,
         * store in lseg. If we race with a concurrent seqid morphing
         * op, then re-send the LAYOUTGET.
         */
-       do {
-               lgp = kzalloc(sizeof(*lgp), gfp_flags);
-               if (lgp == NULL)
-                       return NULL;
-
-               i_size = i_size_read(ino);
-
-               lgp->args.minlength = PAGE_SIZE;
-               if (lgp->args.minlength > range->length)
-                       lgp->args.minlength = range->length;
-               if (range->iomode == IOMODE_READ) {
-                       if (range->offset >= i_size)
-                               lgp->args.minlength = 0;
-                       else if (i_size - range->offset < lgp->args.minlength)
-                               lgp->args.minlength = i_size - range->offset;
-               }
-               lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-               pnfs_copy_range(&lgp->args.range, range);
-               lgp->args.type = server->pnfs_curr_ld->id;
-               lgp->args.inode = ino;
-               lgp->args.ctx = get_nfs_open_context(ctx);
-               lgp->gfp_flags = gfp_flags;
-               lgp->cred = lo->plh_lc_cred;
-
-               lseg = nfs4_proc_layoutget(lgp, gfp_flags);
-       } while (lseg == ERR_PTR(-EAGAIN));
-
-       if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
-               lseg = NULL;
-       else
-               pnfs_layout_clear_fail_bit(lo,
-                               pnfs_iomode_to_fail_bit(range->iomode));
+       lgp = kzalloc(sizeof(*lgp), gfp_flags);
+       if (lgp == NULL)
+               return ERR_PTR(-ENOMEM);
 
-       return lseg;
+       i_size = i_size_read(ino);
+
+       lgp->args.minlength = PAGE_SIZE;
+       if (lgp->args.minlength > range->length)
+               lgp->args.minlength = range->length;
+       if (range->iomode == IOMODE_READ) {
+               if (range->offset >= i_size)
+                       lgp->args.minlength = 0;
+               else if (i_size - range->offset < lgp->args.minlength)
+                       lgp->args.minlength = i_size - range->offset;
+       }
+       lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+       pnfs_copy_range(&lgp->args.range, range);
+       lgp->args.type = server->pnfs_curr_ld->id;
+       lgp->args.inode = ino;
+       lgp->args.ctx = get_nfs_open_context(ctx);
+       nfs4_stateid_copy(&lgp->args.stateid, stateid);
+       lgp->gfp_flags = gfp_flags;
+       lgp->cred = lo->plh_lc_cred;
+
+       return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
 }
 
 static void pnfs_clear_layoutcommit(struct inode *inode,
@@ -899,6 +877,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        lo->plh_return_iomode = 0;
+       lo->plh_return_seq = 0;
        pnfs_get_layout_hdr(lo);
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        return true;
@@ -969,6 +948,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
                bool send;
 
                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+               stateid.seqid = cpu_to_be32(lo->plh_return_seq);
                iomode = lo->plh_return_iomode;
                send = pnfs_prepare_layoutreturn(lo);
                spin_unlock(&inode->i_lock);
@@ -1012,7 +992,7 @@ _pnfs_return_layout(struct inode *ino)
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        pnfs_clear_layoutcommit(ino, &tmp_list);
-       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
 
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
@@ -1341,23 +1321,28 @@ out_existing:
 
 /*
  * iomode matching rules:
- * iomode      lseg    match
- * -----       -----   -----
- * ANY         READ    true
- * ANY         RW      true
- * RW          READ    false
- * RW          RW      true
- * READ                READ    true
- * READ                RW      true
+ * iomode      lseg    strict match
+ *                      iomode
+ * -----       -----   ------ -----
+ * ANY         READ    N/A    true
+ * ANY         RW      N/A    true
+ * RW          READ    N/A    false
+ * RW          RW      N/A    true
+ * READ                READ    N/A    true
+ * READ                RW      true   false
+ * READ                RW      false  true
  */
 static bool
 pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
-                const struct pnfs_layout_range *range)
+                const struct pnfs_layout_range *range,
+                bool strict_iomode)
 {
        struct pnfs_layout_range range1;
 
        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
+           (range->iomode != ls_range->iomode &&
+            strict_iomode == true) ||
            !pnfs_lseg_range_intersecting(ls_range, range))
                return 0;
 
@@ -1372,7 +1357,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
  */
 static struct pnfs_layout_segment *
 pnfs_find_lseg(struct pnfs_layout_hdr *lo,
-               struct pnfs_layout_range *range)
+               struct pnfs_layout_range *range,
+               bool strict_iomode)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
 
@@ -1381,7 +1367,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
-                   pnfs_lseg_range_match(&lseg->pls_range, range)) {
+                   pnfs_lseg_range_match(&lseg->pls_range, range,
+                                         strict_iomode)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
@@ -1498,6 +1485,7 @@ pnfs_update_layout(struct inode *ino,
                   loff_t pos,
                   u64 count,
                   enum pnfs_iomode iomode,
+                  bool strict_iomode,
                   gfp_t gfp_flags)
 {
        struct pnfs_layout_range arg = {
@@ -1505,27 +1493,30 @@ pnfs_update_layout(struct inode *ino,
                .offset = pos,
                .length = count,
        };
-       unsigned pg_offset;
+       unsigned pg_offset, seq;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs_client *clp = server->nfs_client;
-       struct pnfs_layout_hdr *lo;
+       struct pnfs_layout_hdr *lo = NULL;
        struct pnfs_layout_segment *lseg = NULL;
+       nfs4_stateid stateid;
+       long timeout = 0;
+       unsigned long giveup = jiffies + rpc_get_timeout(server->client);
        bool first;
 
        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_NO_PNFS);
                goto out;
        }
 
        if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
                goto out;
        }
 
        if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_MDSTHRESH);
                goto out;
        }
@@ -1536,14 +1527,14 @@ lookup_again:
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
                spin_unlock(&ino->i_lock);
-               trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_NOMEM);
                goto out;
        }
 
        /* Do we even need to bother with this? */
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_BULK_RECALL);
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
@@ -1551,14 +1542,34 @@ lookup_again:
 
        /* if LAYOUTGET already failed once we don't try again */
        if (pnfs_layout_io_test_failed(lo, iomode)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
                goto out_unlock;
        }
 
-       first = list_empty(&lo->plh_segs);
-       if (first) {
-               /* The first layoutget for the file. Need to serialize per
+       lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
+       if (lseg) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+                               PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+               goto out_unlock;
+       }
+
+       if (!nfs4_valid_open_stateid(ctx->state)) {
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
+                               PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+               goto out_unlock;
+       }
+
+       /*
+        * Choose a stateid for the LAYOUTGET. If we don't have a layout
+        * stateid, or it has been invalidated, then we must use the open
+        * stateid.
+        */
+       if (lo->plh_stateid.seqid == 0 ||
+           test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
+
+               /*
+                * The first layoutget for the file. Need to serialize per
                 * RFC 5661 Errata 3208.
                 */
                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
@@ -1567,18 +1578,17 @@ lookup_again:
                        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
                                    TASK_UNINTERRUPTIBLE);
                        pnfs_put_layout_hdr(lo);
+                       dprintk("%s retrying\n", __func__);
                        goto lookup_again;
                }
+
+               first = true;
+               do {
+                       seq = read_seqbegin(&ctx->state->seqlock);
+                       nfs4_stateid_copy(&stateid, &ctx->state->stateid);
+               } while (read_seqretry(&ctx->state->seqlock, seq));
        } else {
-               /* Check to see if the layout for the given range
-                * already exists
-                */
-               lseg = pnfs_find_lseg(lo, &arg);
-               if (lseg) {
-                       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
-                                       PNFS_UPDATE_LAYOUT_FOUND_CACHED);
-                       goto out_unlock;
-               }
+               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        }
 
        /*
@@ -1593,15 +1603,17 @@ lookup_again:
                                pnfs_clear_first_layoutget(lo);
                        pnfs_put_layout_hdr(lo);
                        dprintk("%s retrying\n", __func__);
+                       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+                                       lseg, PNFS_UPDATE_LAYOUT_RETRY);
                        goto lookup_again;
                }
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                PNFS_UPDATE_LAYOUT_RETURN);
                goto out_put_layout_hdr;
        }
 
        if (pnfs_layoutgets_blocked(lo)) {
-               trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+               trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                PNFS_UPDATE_LAYOUT_BLOCKED);
                goto out_unlock;
        }
@@ -1626,10 +1638,36 @@ lookup_again:
        if (arg.length != NFS4_MAX_UINT64)
                arg.length = PAGE_ALIGN(arg.length);
 
-       lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-       atomic_dec(&lo->plh_outstanding);
-       trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+       lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
+       trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+       if (IS_ERR(lseg)) {
+               switch(PTR_ERR(lseg)) {
+               case -ERECALLCONFLICT:
+                       if (time_after(jiffies, giveup))
+                               lseg = NULL;
+                       /* Fallthrough */
+               case -EAGAIN:
+                       pnfs_put_layout_hdr(lo);
+                       if (first)
+                               pnfs_clear_first_layoutget(lo);
+                       if (lseg) {
+                               trace_pnfs_update_layout(ino, pos, count,
+                                       iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+                               goto lookup_again;
+                       }
+                       /* Fallthrough */
+               default:
+                       if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
+                               pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+                               lseg = NULL;
+                       }
+               }
+       } else {
+               pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+       }
+
+       atomic_dec(&lo->plh_outstanding);
 out_put_layout_hdr:
        if (first)
                pnfs_clear_first_layoutget(lo);
@@ -1678,38 +1716,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        LIST_HEAD(free_me);
-       int status = -EINVAL;
 
        if (!pnfs_sanity_check_layout_range(&res->range))
-               goto out;
+               return ERR_PTR(-EINVAL);
 
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
-       if (!lseg || IS_ERR(lseg)) {
+       if (IS_ERR_OR_NULL(lseg)) {
                if (!lseg)
-                       status = -ENOMEM;
-               else
-                       status = PTR_ERR(lseg);
-               dprintk("%s: Could not allocate layout: error %d\n",
-                      __func__, status);
-               goto out;
+                       lseg = ERR_PTR(-ENOMEM);
+
+               dprintk("%s: Could not allocate layout: error %ld\n",
+                      __func__, PTR_ERR(lseg));
+               return lseg;
        }
 
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
+       lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
 
        spin_lock(&ino->i_lock);
        if (pnfs_layoutgets_blocked(lo)) {
                dprintk("%s forget reply due to state\n", __func__);
-               goto out_forget_reply;
+               goto out_forget;
        }
 
        if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
                /* existing state ID, make sure the sequence number matches. */
                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
                        dprintk("%s forget reply due to sequence\n", __func__);
-                       status = -EAGAIN;
-                       goto out_forget_reply;
+                       goto out_forget;
                }
                pnfs_set_layout_stateid(lo, &res->stateid, false);
        } else {
@@ -1718,7 +1754,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                 * inode invalid, and don't bother validating the stateid
                 * sequence number.
                 */
-               pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+               pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
 
                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
@@ -1735,18 +1771,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me);
        return lseg;
-out:
-       return ERR_PTR(status);
 
-out_forget_reply:
+out_forget:
        spin_unlock(&ino->i_lock);
        lseg->pls_layout = lo;
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-       goto out;
+       return ERR_PTR(-EAGAIN);
 }
 
 static void
-pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+                        u32 seq)
 {
        if (lo->plh_return_iomode == iomode)
                return;
@@ -1754,6 +1789,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+       if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+               lo->plh_return_seq = seq;
 }
 
 /**
@@ -1769,7 +1806,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
 int
 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *return_range)
+                               const struct pnfs_layout_range *return_range,
+                               u32 seq)
 {
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;
@@ -1792,8 +1830,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                continue;
                        remaining++;
                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
-                       pnfs_set_plh_return_iomode(lo, return_range->iomode);
                }
+
+       if (remaining)
+               pnfs_set_plh_return_info(lo, return_range->iomode, seq);
+
        return remaining;
 }
 
@@ -1810,13 +1851,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
        bool return_now = false;
 
        spin_lock(&inode->i_lock);
-       pnfs_set_plh_return_iomode(lo, range.iomode);
+       pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
        /*
         * mark all matching lsegs so that we are sure to have no live
         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
         * for how it works.
         */
-       if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+       if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
+                                               &range, lseg->pls_seq)) {
                nfs4_stateid stateid;
                enum pnfs_iomode iomode = lo->plh_return_iomode;
 
@@ -1849,6 +1891,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
                                                   req_offset(req),
                                                   rd_size,
                                                   IOMODE_READ,
+                                                  false,
                                                   GFP_KERNEL);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -1873,6 +1916,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
                                                   req_offset(req),
                                                   wb_size,
                                                   IOMODE_RW,
+                                                  false,
                                                   GFP_NOFS);
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
@@ -2143,12 +2187,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
 }
 
 /* Resend all requests through pnfs. */
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
 {
        struct nfs_pageio_descriptor pgio;
 
-       nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
-       return nfs_pageio_resend(&pgio, hdr);
+       if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+               nfs_pageio_init_read(&pgio, hdr->inode, false,
+                                       hdr->completion_ops);
+               hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
+       }
 }
 EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
 
@@ -2158,12 +2205,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;
-       int err = 0;
 
        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
        if (trypnfs == PNFS_TRY_AGAIN)
-               err = pnfs_read_resend_pnfs(hdr);
-       if (trypnfs == PNFS_NOT_ATTEMPTED || err)
+               pnfs_read_resend_pnfs(hdr);
+       if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
                pnfs_read_through_mds(desc, hdr);
 }
 
@@ -2405,7 +2451,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
        spin_lock(&inode->i_lock);
        if (!NFS_I(inode)->layout) {
                spin_unlock(&inode->i_lock);
-               goto out;
+               goto out_clear_layoutstats;
        }
        hdr = NFS_I(inode)->layout;
        pnfs_get_layout_hdr(hdr);
@@ -2434,6 +2480,7 @@ out_free:
        kfree(data);
 out_put:
        pnfs_put_layout_hdr(hdr);
+out_clear_layoutstats:
        smp_mb__before_atomic();
        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
        smp_mb__after_atomic();
index 1ac1db5f6dadb6508cf8658a53385add279eb9fa..b21bd0bee784391b0b88483d9b9d3ee7cce91bde 100644 (file)
@@ -64,6 +64,7 @@ struct pnfs_layout_segment {
        struct list_head pls_lc_list;
        struct pnfs_layout_range pls_range;
        atomic_t pls_refcount;
+       u32 pls_seq;
        unsigned long pls_flags;
        struct pnfs_layout_hdr *pls_layout;
        struct work_struct pls_work;
@@ -194,6 +195,7 @@ struct pnfs_layout_hdr {
        unsigned long           plh_flags;
        nfs4_stateid            plh_stateid;
        u32                     plh_barrier; /* ignore lower seqids */
+       u32                     plh_return_seq;
        enum pnfs_iomode        plh_return_iomode;
        loff_t                  plh_lwb; /* last write byte for layoutcommit */
        struct rpc_cred         *plh_lc_cred; /* layoutcommit cred */
@@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev,
                                   struct rpc_cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
 
 /* pnfs.c */
@@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
                             const nfs4_stateid *new,
                             bool update_barrier);
-int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
-                                 struct pnfs_layout_hdr *lo,
-                                 const struct pnfs_layout_range *range,
-                                 struct nfs4_state *open_state);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *recall_range);
+                               const struct pnfs_layout_range *recall_range,
+                               u32 seq);
 int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                               const struct pnfs_layout_range *recall_range);
+                               const struct pnfs_layout_range *recall_range,
+                               u32 seq);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
 void pnfs_ld_write_done(struct nfs_pgio_header *);
 void pnfs_ld_read_done(struct nfs_pgio_header *);
-int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
+void pnfs_read_resend_pnfs(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               struct nfs_open_context *ctx,
                                               loff_t pos,
                                               u64 count,
                                               enum pnfs_iomode iomode,
+                                              bool strict_iomode,
                                               gfp_t gfp_flags);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 
index 4aaed890048fd34d1fdda141a6479e4fec7ba59b..0dfc476da3e10918bcf0242d779cda343e6dac61 100644 (file)
@@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
 
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
- * Note this must be called holding the inode (/cinfo) lock
+ * Note this must be called holding i_lock
  */
 void
 pnfs_generic_clear_request_commit(struct nfs_page *req,
@@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
                if (!nfs_lock_request(req))
                        continue;
                kref_get(&req->wb_kref);
-               if (cond_resched_lock(cinfo->lock))
+               if (cond_resched_lock(&cinfo->inode->i_lock))
                        list_safe_reset_next(req, tmp, wb_list);
                nfs_request_remove_commit_list(req, cinfo);
                clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
@@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
        struct list_head *dst = &bucket->committing;
        int ret;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
        ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
        if (ret) {
                cinfo->ds->nwritten -= ret;
@@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
 {
        int i, rv = 0, cnt;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
        for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
                cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
                                                       cinfo, max);
@@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
        struct pnfs_layout_segment *freeme;
        int i;
 
-       lockdep_assert_held(cinfo->lock);
+       lockdep_assert_held(&cinfo->inode->i_lock);
 restart:
        for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
                if (pnfs_generic_transfer_commit_list(&b->written, dst,
                                                      cinfo, 0)) {
                        freeme = b->wlseg;
                        b->wlseg = NULL;
-                       spin_unlock(cinfo->lock);
+                       spin_unlock(&cinfo->inode->i_lock);
                        pnfs_put_lseg(freeme);
-                       spin_lock(cinfo->lock);
+                       spin_lock(&cinfo->inode->i_lock);
                        goto restart;
                }
        }
@@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
        LIST_HEAD(pages);
        int i;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        for (i = idx; i < fl_cinfo->nbuckets; i++) {
                bucket = &fl_cinfo->buckets[i];
                if (list_empty(&bucket->committing))
@@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
                freeme = bucket->clseg;
                bucket->clseg = NULL;
                list_splice_init(&bucket->committing, &pages);
-               spin_unlock(cinfo->lock);
+               spin_unlock(&cinfo->inode->i_lock);
                nfs_retry_commit(&pages, freeme, cinfo, i);
                pnfs_put_lseg(freeme);
-               spin_lock(cinfo->lock);
+               spin_lock(&cinfo->inode->i_lock);
        }
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 }
 
 static unsigned int
@@ -238,14 +238,31 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages,
        struct pnfs_commit_bucket *bucket;
 
        bucket = &cinfo->ds->buckets[data->ds_commit_index];
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        list_splice_init(&bucket->committing, pages);
        data->lseg = bucket->clseg;
        bucket->clseg = NULL;
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
 
 }
 
+/* Helper function for pnfs_generic_commit_pagelist to catch an empty
+ * page list. This can happen when two commits race. */
+static bool
+pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
+                                         struct nfs_commit_data *data,
+                                         struct nfs_commit_info *cinfo)
+{
+       if (list_empty(pages)) {
+               if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
+                       wake_up_atomic_t(&cinfo->mds->rpcs_out);
+               nfs_commitdata_release(data);
+               return true;
+       }
+
+       return false;
+}
+
 /* This follows nfs_commit_list pretty closely */
 int
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -280,6 +297,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        list_for_each_entry_safe(data, tmp, &list, pages) {
                list_del_init(&data->pages);
                if (data->ds_commit_index < 0) {
+                       /* another commit raced with us */
+                       if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
+                               data, cinfo))
+                               continue;
+
                        nfs_init_commit(data, mds_pages, NULL, cinfo);
                        nfs_initiate_commit(NFS_CLIENT(inode), data,
                                            NFS_PROTO(data->inode),
@@ -288,6 +310,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                        LIST_HEAD(pages);
 
                        pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+
+                       /* another commit raced with us */
+                       if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
+                               data, cinfo))
+                               continue;
+
                        nfs_init_commit(data, &pages, data->lseg, cinfo);
                        initiate_commit(data, how);
                }
@@ -874,12 +902,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        struct list_head *list;
        struct pnfs_commit_bucket *buckets;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        buckets = cinfo->ds->buckets;
        list = &buckets[ds_commit_idx].written;
        if (list_empty(list)) {
                if (!pnfs_is_valid_lseg(lseg)) {
-                       spin_unlock(cinfo->lock);
+                       spin_unlock(&cinfo->inode->i_lock);
                        cinfo->completion_ops->resched_write(cinfo, req);
                        return;
                }
@@ -896,7 +924,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        cinfo->ds->nwritten++;
 
        nfs_request_add_commit_list_locked(req, list, cinfo);
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
index f1268280244e4504c175db1844a306eefb1f0d00..2137e0202f2560d39dc383a01ecf33fcfcf4958d 100644 (file)
@@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = {
 
 enum {
        Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma,
+       Opt_xprt_rdma6,
 
        Opt_xprt_err
 };
@@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = {
        { Opt_xprt_tcp, "tcp" },
        { Opt_xprt_tcp6, "tcp6" },
        { Opt_xprt_rdma, "rdma" },
+       { Opt_xprt_rdma6, "rdma6" },
 
        { Opt_xprt_err, NULL }
 };
@@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw,
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
                                break;
+                       case Opt_xprt_rdma6:
+                               protofamily = AF_INET6;
                        case Opt_xprt_rdma:
                                /* vector side protocols to TCP */
                                mnt->flags |= NFS_MOUNT_TCP;
@@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1,
                                     struct nfs_server *server2)
 {
        struct sockaddr *sap1, *sap2;
+       struct rpc_xprt *xprt1 = server1->client->cl_xprt;
+       struct rpc_xprt *xprt2 = server2->client->cl_xprt;
+
+       if (!net_eq(xprt1->xprt_net, xprt2->xprt_net))
+               return 0;
 
        sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
        sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
index 5f4fd53e5764884391569d010661fe1639b35053..e1c74d3db64de48851c0d758ade6df2741da9d30 100644 (file)
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 static int wb_priority(struct writeback_control *wbc)
 {
        int ret = 0;
-       if (wbc->for_reclaim)
-               return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
+
        if (wbc->sync_mode == WB_SYNC_ALL)
                ret = FLUSH_COND_STABLE;
        return ret;
@@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
                head = req->wb_head;
 
                spin_lock(&inode->i_lock);
-               if (likely(!PageSwapCache(head->wb_page))) {
+               if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
                        set_page_private(head->wb_page, 0);
                        ClearPagePrivate(head->wb_page);
                        smp_mb__after_atomic();
@@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
-       __set_page_dirty_nobuffers(req->wb_page);
+       if (req->wb_page)
+               __set_page_dirty_nobuffers(req->wb_page);
 }
 
 /*
@@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
  */
 void
 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 void
 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
-       spin_unlock(cinfo->lock);
-       nfs_mark_page_unstable(req->wb_page, cinfo);
+       spin_unlock(&cinfo->inode->i_lock);
+       if (req->wb_page)
+               nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
@@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
                                      struct inode *inode)
 {
-       cinfo->lock = &inode->i_lock;
+       cinfo->inode = inode;
        cinfo->mds = &NFS_I(inode)->commit_info;
        cinfo->ds = pnfs_get_ds_info(inode);
        cinfo->dreq = NULL;
@@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
        return cinfo->mds->ncommit;
 }
 
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
 int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                     struct nfs_commit_info *cinfo, int max)
@@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
                if (!nfs_lock_request(req))
                        continue;
                kref_get(&req->wb_kref);
-               if (cond_resched_lock(cinfo->lock))
+               if (cond_resched_lock(&cinfo->inode->i_lock))
                        list_safe_reset_next(req, tmp, wb_list);
                nfs_request_remove_commit_list(req, cinfo);
                nfs_list_add_request(req, dst);
@@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
        int ret = 0;
 
-       spin_lock(cinfo->lock);
+       spin_lock(&cinfo->inode->i_lock);
        if (cinfo->mds->ncommit > 0) {
                const int max = INT_MAX;
 
@@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
                                           cinfo, max);
                ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
        }
-       spin_unlock(cinfo->lock);
+       spin_unlock(&cinfo->inode->i_lock);
        return ret;
 }
 
@@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 {
        struct nfs_commit_data  *data;
 
+       /* another commit raced with us */
+       if (list_empty(head))
+               return 0;
+
        data = nfs_commitdata_alloc();
 
        if (!data)
@@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
        return -ENOMEM;
 }
 
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+       struct inode *inode = file_inode(file);
+       struct nfs_open_context *open;
+       struct nfs_commit_info cinfo;
+       struct nfs_page *req;
+       int ret;
+
+       open = get_nfs_open_context(nfs_file_open_context(file));
+       req  = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+       if (IS_ERR(req)) {
+               ret = PTR_ERR(req);
+               goto out_put;
+       }
+
+       nfs_init_cinfo_from_inode(&cinfo, inode);
+
+       memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+       nfs_request_add_commit_list(req, &cinfo);
+       ret = nfs_commit_inode(inode, FLUSH_SYNC);
+       if (ret > 0)
+               ret = 0;
+
+       nfs_free_request(req);
+out_put:
+       put_nfs_open_context(open);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
 /*
  * COMMIT call returned
  */
@@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
-               nfs_clear_page_commit(req->wb_page);
+               if (req->wb_page)
+                       nfs_clear_page_commit(req->wb_page);
 
                dprintk("NFS:       commit (%s/%llu %d@%lld)",
                        req->wb_context->dentry->d_sb->s_id,
index 93d5853f8c99b84e5b0c530eeaf044396699ecc6..dba2ff8eaa68e3365deac3d61df3da5641099d9e 100644 (file)
@@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         */
        hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
        dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-               - hdr;
+               + rqstp->rq_arg.tail[0].iov_len - hdr;
        /*
         * Round the length of the data which was specified up to
         * the next multiple of XDR units and then compare that
index 825c7bc8d789716749138583953c26e630294a93..953c0755cb37e23697a2308800ccaf7bf85232cf 100644 (file)
@@ -289,7 +289,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 
                status = nfserr_bad_stateid;
                mutex_lock(&ls->ls_mutex);
-               if (stateid->si_generation > stid->sc_stateid.si_generation)
+               if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid))
                        goto out_unlock_stid;
                if (layout_type != ls->ls_layout_type)
                        goto out_unlock_stid;
index 0462eeddfff9997f9de2fa0fb53a100deeff941d..f5f82e145018059bbeb265cd42a279a3009dfd6b 100644 (file)
@@ -4651,12 +4651,6 @@ grace_disallows_io(struct net *net, struct inode *inode)
        return opens_in_grace(net) && mandatory_lock(inode);
 }
 
-/* Returns true iff a is later than b: */
-static bool stateid_generation_after(stateid_t *a, stateid_t *b)
-{
-       return (s32)(a->si_generation - b->si_generation) > 0;
-}
-
 static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
 {
        /*
@@ -4670,7 +4664,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
                return nfs_ok;
 
        /* If the client sends us a stateid from the future, it's buggy: */
-       if (stateid_generation_after(in, ref))
+       if (nfsd4_stateid_generation_after(in, ref))
                return nfserr_bad_stateid;
        /*
         * However, we could see a stateid from the past, even from a
index c050c53036a62f5dfd596755b426d5fbd5538501..986e51e5ceac882f703e1d0262fdc66d356a453f 100644 (file)
@@ -573,6 +573,11 @@ enum nfsd4_cb_op {
        NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
+/* Returns true iff a is later than b: */
+static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
+{
+       return (s32)(a->si_generation - b->si_generation) > 0;
+}
 
 struct nfsd4_compound_state;
 struct nfsd_net;
index 0748777f2e2a0dcef7e269897e8d5694e05f56f8..c56a7679df93a2bc3df147fe8f408b74221988cb 100644 (file)
@@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
        }
        if (is_bad_inode(inode)) {
                iput(inode);
-               if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
-                   (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
-                       /* Return OCFS2_FILECHECK_ERR_XXX related errno */
-                       inode = ERR_PTR(rc);
-               else
-                       inode = ERR_PTR(-ESTALE);
+               inode = ERR_PTR(rc);
                goto bail;
        }
 
index f4cd3c3e9fb70d708d57a3d8dc15f92492e4ea12..497a4171ef61f6209a32303530b79c72439962cf 100644 (file)
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 
 static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-       return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+       return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
index a86c6c04b9bcf06797e27e17075b4cc6112d0e62..68ef06efe6bc4a530133d54e57833330a1b40470 100644 (file)
@@ -182,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
        }
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
@@ -261,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
                return -EINVAL;
        dirent = buf->previous;
        if (dirent) {
+               if (signal_pending(current))
+                       return -EINTR;
                if (__put_user(offset, &dirent->d_off))
                        goto efault;
        }
index b11945e15fde2b62d59c5c81c5aead48fb0e5d83..fc81e771488a670ebeac9606cc815dab1ded6446 100644 (file)
@@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
  * operations to the correct xattr_handler.
  */
 #define for_each_xattr_handler(handlers, handler)              \
+       if (handlers)                                           \
                for ((handler) = *(handlers)++;                 \
                        (handler) != NULL;                      \
                        (handler) = *(handlers)++)
@@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
        const struct xattr_handler *handler;
 
        if (!*name)
-               return NULL;
+               return ERR_PTR(-EINVAL);
 
        for_each_xattr_handler(handlers, handler) {
                const char *n;
index 686ba6fb20ddc942638f6e698b98bf4c1bb8785f..339c696bbc0186b9a0064ec0146d5e143bbe2569 100644 (file)
@@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 }
 
 void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-            xfs_km_flags_t flags)
+kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
 {
-       void    *new;
+       int     retries = 0;
+       gfp_t   lflags = kmem_flags_convert(flags);
+       void    *ptr;
 
-       new = kmem_alloc(newsize, flags);
-       if (ptr) {
-               if (new)
-                       memcpy(new, ptr,
-                               ((oldsize < newsize) ? oldsize : newsize));
-               kmem_free(ptr);
-       }
-       return new;
+       do {
+               ptr = krealloc(old, newsize, lflags);
+               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+                       return ptr;
+               if (!(++retries % 100))
+                       xfs_err(NULL,
+       "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
+                               current->comm, current->pid,
+                               newsize, __func__, lflags);
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+       } while (1);
 }
 
 void *
index d1c66e465ca5629fe3e330b9f6b87b07a5f9a1fa..689f746224e7ab8a0fbf3d2f9acb4f1dd68a9a16 100644 (file)
@@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
 static inline void  kmem_free(const void *ptr)
 {
        kvfree(ptr);
index fa3b948ef9c2561e8b34aae282ca8c0f02a22abc..4e126f41a0aa97d1f73773ea8efb89ef7a52746c 100644 (file)
@@ -242,37 +242,21 @@ xfs_attr_set(
                        return error;
        }
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 
        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
-
-       if (rsvd)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
-                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
-       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
-       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
-       if (error) {
-               xfs_trans_cancel(args.trans);
+       error = xfs_trans_alloc(mp, &tres, args.total, 0,
+                       rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+       if (error)
                return error;
-       }
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
 
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                                       XFS_QMOPT_RES_REGBLKS);
@@ -428,32 +412,16 @@ xfs_attr_remove(
        if (error)
                return error;
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
-
        /*
         * Root fork attributes can use reserved data blocks for this
         * operation if necessary
         */
-
-       if (flags & ATTR_ROOT)
-               args.trans->t_flags |= XFS_TRANS_RESERVE;
-
-       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
-                                 XFS_ATTRRM_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(args.trans);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
+                       XFS_ATTRRM_SPACE_RES(mp), 0,
+                       (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
+                       &args.trans);
+       if (error)
                return error;
-       }
 
        xfs_ilock(dp, XFS_ILOCK_EXCL);
        /*
index ce41d7fe753c5dcbb7edfe85771e1a121c4e580b..932381caef1bc421cb9a41e79191845d6fc60346 100644 (file)
@@ -1121,15 +1121,14 @@ xfs_bmap_add_attrfork(
 
        mp = ip->i_mount;
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-       tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+
        blks = XFS_ADDAFORK_SPACE_RES(mp);
-       if (rsvd)
-               tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
+                       rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+       if (error)
                return error;
-       }
+
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
@@ -6026,13 +6025,10 @@ xfs_bmap_split_extent(
        xfs_fsblock_t           firstfsb;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                       XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+                       XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
index 974d62e677f45ada4ef7ca4e5ac916c9dbe6ffbe..e5bb9cc3b243b9fd451db6dcc429edb6a24644bf 100644 (file)
@@ -257,15 +257,12 @@ xfs_dir2_block_to_sf(
         *
         * Convert the inode to local format and copy the data in.
         */
-       dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-       dp->i_df.if_flags |= XFS_IFINLINE;
-       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
        ASSERT(dp->i_df.if_bytes == 0);
-       xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+       xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+       dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+       dp->i_d.di_size = size;
 
        logflags |= XFS_ILOG_DDATA;
-       memcpy(dp->i_df.if_u1.if_data, dst, size);
-       dp->i_d.di_size = size;
        xfs_dir2_sf_check(args);
 out:
        xfs_trans_log_inode(args->trans, dp, logflags);
index 11faf7df14c8099e49759f51f0315dd5caec6632..bbcc8c7a44b3ffde66bf152583e87ebbd9763da7 100644 (file)
@@ -231,6 +231,48 @@ xfs_iformat_fork(
        return error;
 }
 
+void
+xfs_init_local_fork(
+       struct xfs_inode        *ip,
+       int                     whichfork,
+       const void              *data,
+       int                     size)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       int                     mem_size = size, real_size = 0;
+       bool                    zero_terminate;
+
+       /*
+        * If we are using the local fork to store a symlink body we need to
+        * zero-terminate it so that we can pass it back to the VFS directly.
+        * Overallocate the in-memory fork by one for that and add a zero
+        * to terminate it below.
+        */
+       zero_terminate = S_ISLNK(VFS_I(ip)->i_mode);
+       if (zero_terminate)
+               mem_size++;
+
+       if (size == 0)
+               ifp->if_u1.if_data = NULL;
+       else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
+               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+       else {
+               real_size = roundup(mem_size, 4);
+               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+       }
+
+       if (size) {
+               memcpy(ifp->if_u1.if_data, data, size);
+               if (zero_terminate)
+                       ifp->if_u1.if_data[size] = '\0';
+       }
+
+       ifp->if_bytes = size;
+       ifp->if_real_bytes = real_size;
+       ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+       ifp->if_flags |= XFS_IFINLINE;
+}
+
 /*
  * The file is in-lined in the on-disk inode.
  * If it fits into if_inline_data, then copy
@@ -248,8 +290,6 @@ xfs_iformat_local(
        int             whichfork,
        int             size)
 {
-       xfs_ifork_t     *ifp;
-       int             real_size;
 
        /*
         * If the size is unreasonable, then something
@@ -265,22 +305,8 @@ xfs_iformat_local(
                                     ip->i_mount, dip);
                return -EFSCORRUPTED;
        }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       real_size = 0;
-       if (size == 0)
-               ifp->if_u1.if_data = NULL;
-       else if (size <= sizeof(ifp->if_u2.if_inline_data))
-               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-       else {
-               real_size = roundup(size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-       }
-       ifp->if_bytes = size;
-       ifp->if_real_bytes = real_size;
-       if (size)
-               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFINLINE;
+
+       xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
        return 0;
 }
 
@@ -516,7 +542,6 @@ xfs_iroot_realloc(
                new_max = cur_max + rec_diff;
                new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
                                KM_SLEEP | KM_NOFS);
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
@@ -660,7 +685,6 @@ xfs_idata_realloc(
                                ifp->if_u1.if_data =
                                        kmem_realloc(ifp->if_u1.if_data,
                                                        real_size,
-                                                       ifp->if_real_bytes,
                                                        KM_SLEEP | KM_NOFS);
                        }
                } else {
@@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct(
                if (rnew_size != ifp->if_real_bytes) {
                        ifp->if_u1.if_extents =
                                kmem_realloc(ifp->if_u1.if_extents,
-                                               rnew_size,
-                                               ifp->if_real_bytes, KM_NOFS);
+                                               rnew_size, KM_NOFS);
                }
                if (rnew_size > ifp->if_real_bytes) {
                        memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect(
        if (new_size == 0) {
                xfs_iext_destroy(ifp);
        } else {
-               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-                       kmem_realloc(ifp->if_u1.if_ext_irec,
-                               new_size, size, KM_NOFS);
+               ifp->if_u1.if_ext_irec =
+                       kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
        }
 }
 
@@ -1496,6 +1518,24 @@ xfs_iext_indirect_to_direct(
        }
 }
 
+/*
+ * Remove all records from the indirection array.
+ */
+STATIC void
+xfs_iext_irec_remove_all(
+       struct xfs_ifork *ifp)
+{
+       int             nlists;
+       int             i;
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       for (i = 0; i < nlists; i++)
+               kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
+       kmem_free(ifp->if_u1.if_ext_irec);
+       ifp->if_flags &= ~XFS_IFEXTIREC;
+}
+
 /*
  * Free incore file extents.
  */
@@ -1504,14 +1544,7 @@ xfs_iext_destroy(
        xfs_ifork_t     *ifp)           /* inode fork pointer */
 {
        if (ifp->if_flags & XFS_IFEXTIREC) {
-               int     erp_idx;
-               int     nlists;
-
-               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-               }
-               ifp->if_flags &= ~XFS_IFEXTIREC;
+               xfs_iext_irec_remove_all(ifp);
        } else if (ifp->if_real_bytes) {
                kmem_free(ifp->if_u1.if_extents);
        } else if (ifp->if_bytes) {
index 7d3b1ed6dcbe934dcea8d7b92ccb3342f784bff6..f95e072ae6468240a6ae0cb8d1dc094eafbf59fc 100644 (file)
@@ -134,6 +134,7 @@ void                xfs_iroot_realloc(struct xfs_inode *, int, int);
 int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
 int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
                                  int);
+void           xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
 
 struct xfs_bmbt_rec_host *
                xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
index d54a8018b079dd3f0c078e5fdf56cf48a151a545..e8f49c029ff05098ddc91eeeffe5ba7102eff77f 100644 (file)
@@ -211,6 +211,11 @@ typedef struct xfs_trans_header {
 
 #define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
 
+/*
+ * The only type valid for th_type in CIL-enabled file system logs:
+ */
+#define XFS_TRANS_CHECKPOINT   40
+
 /*
  * Log item types.
  */
index 8a53eaa349f44884354139fcde0a9c17b35e741a..12ca86778e023e4261998660f39b482927a16c02 100644 (file)
@@ -838,12 +838,10 @@ xfs_sync_sb(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0,
+                       XFS_TRANS_NO_WRITECOUNT, &tp);
+       if (error)
                return error;
-       }
 
        xfs_log_sb(tp);
        if (wait)
index 81ac870834da9e63515553e3fa291318acd1e73a..16002b5ec4eb82c2988fc6f559f0e3ed995ec1e9 100644 (file)
@@ -55,103 +55,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 
-/*
- * Transaction types.  Used to distinguish types of buffers. These never reach
- * the log.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE     1
-#define XFS_TRANS_SETATTR_SIZE         2
-#define XFS_TRANS_INACTIVE             3
-#define XFS_TRANS_CREATE               4
-#define XFS_TRANS_CREATE_TRUNC         5
-#define XFS_TRANS_TRUNCATE_FILE                6
-#define XFS_TRANS_REMOVE               7
-#define XFS_TRANS_LINK                 8
-#define XFS_TRANS_RENAME               9
-#define XFS_TRANS_MKDIR                        10
-#define XFS_TRANS_RMDIR                        11
-#define XFS_TRANS_SYMLINK              12
-#define XFS_TRANS_SET_DMATTRS          13
-#define XFS_TRANS_GROWFS               14
-#define XFS_TRANS_STRAT_WRITE          15
-#define XFS_TRANS_DIOSTRAT             16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define        XFS_TRANS_WRITEID               18
-#define        XFS_TRANS_ADDAFORK              19
-#define        XFS_TRANS_ATTRINVAL             20
-#define        XFS_TRANS_ATRUNCATE             21
-#define        XFS_TRANS_ATTR_SET              22
-#define        XFS_TRANS_ATTR_RM               23
-#define        XFS_TRANS_ATTR_FLAG             24
-#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_SB_CHANGE            26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1               27
-#define XFS_TRANS_DUMMY2               28
-#define XFS_TRANS_QM_QUOTAOFF          29
-#define XFS_TRANS_QM_DQALLOC           30
-#define XFS_TRANS_QM_SETQLIM           31
-#define XFS_TRANS_QM_DQCLUSTER         32
-#define XFS_TRANS_QM_QINOCREATE                33
-#define XFS_TRANS_QM_QUOTAOFF_END      34
-#define XFS_TRANS_FSYNC_TS             35
-#define        XFS_TRANS_GROWFSRT_ALLOC        36
-#define        XFS_TRANS_GROWFSRT_ZERO         37
-#define        XFS_TRANS_GROWFSRT_FREE         38
-#define        XFS_TRANS_SWAPEXT               39
-#define        XFS_TRANS_CHECKPOINT            40
-#define        XFS_TRANS_ICREATE               41
-#define        XFS_TRANS_CREATE_TMPFILE        42
-#define        XFS_TRANS_TYPE_MAX              43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-       { XFS_TRANS_CREATE,             "CREATE" }, \
-       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-       { XFS_TRANS_REMOVE,             "REMOVE" }, \
-       { XFS_TRANS_LINK,               "LINK" }, \
-       { XFS_TRANS_RENAME,             "RENAME" }, \
-       { XFS_TRANS_MKDIR,              "MKDIR" }, \
-       { XFS_TRANS_RMDIR,              "RMDIR" }, \
-       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-       { XFS_TRANS_GROWFS,             "GROWFS" }, \
-       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-       { XFS_TRANS_WRITEID,            "WRITEID" }, \
-       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-       { XFS_TRANS_SB_CHANGE,          "SBCHANGE" }, \
-       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-       { XFS_TRANS_ICREATE,            "ICREATE" }, \
-       { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
-       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -181,8 +84,9 @@ int  xfs_log_calc_minimum_size(struct xfs_mount *);
 #define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
 #define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
-                                          count in superblock */
+#define XFS_TRANS_NO_WRITECOUNT 0x40   /* do not elevate SB writecount */
+#define XFS_TRANS_NOFS         0x80    /* pass KM_NOFS to kmem_alloc */
+
 /*
  * Field values for xfs_trans_mod_sb.
  */
index c535887c60a8e613d8bdc69725ecd5d891115fca..4c463b99fe574341043cb1f6ab612a59df881d31 100644 (file)
@@ -84,23 +84,71 @@ xfs_find_bdev_for_inode(
 }
 
 /*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory.  Do not use the ioend after this.
+ * We're now finished for good with this page.  Update the page state via the
+ * associated buffer_heads, paying attention to the start and end offsets that
+ * we need to process on the page.
+ */
+static void
+xfs_finish_page_writeback(
+       struct inode            *inode,
+       struct bio_vec          *bvec,
+       int                     error)
+{
+       unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
+       struct buffer_head      *head, *bh;
+       unsigned int            off = 0;
+
+       ASSERT(bvec->bv_offset < PAGE_SIZE);
+       ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
+       ASSERT(end < PAGE_SIZE);
+       ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0);
+
+       bh = head = page_buffers(bvec->bv_page);
+
+       do {
+               if (off < bvec->bv_offset)
+                       goto next_bh;
+               if (off > end)
+                       break;
+               bh->b_end_io(bh, !error);
+next_bh:
+               off += bh->b_size;
+       } while ((bh = bh->b_this_page) != head);
+}
+
+/*
+ * We're now finished for good with this ioend structure.  Update the page
+ * state, release holds on bios, and finally free up memory.  Do not use the
+ * ioend after this.
  */
 STATIC void
 xfs_destroy_ioend(
-       xfs_ioend_t             *ioend)
+       struct xfs_ioend        *ioend,
+       int                     error)
 {
-       struct buffer_head      *bh, *next;
+       struct inode            *inode = ioend->io_inode;
+       struct bio              *last = ioend->io_bio;
+       struct bio              *bio, *next;
 
-       for (bh = ioend->io_buffer_head; bh; bh = next) {
-               next = bh->b_private;
-               bh->b_end_io(bh, !ioend->io_error);
-       }
+       for (bio = &ioend->io_inline_bio; bio; bio = next) {
+               struct bio_vec  *bvec;
+               int             i;
+
+               /*
+                * For the last bio, bi_private points to the ioend, so we
+                * need to explicitly end the iteration here.
+                */
+               if (bio == last)
+                       next = NULL;
+               else
+                       next = bio->bi_private;
 
-       mempool_free(ioend, xfs_ioend_pool);
+               /* walk each page on bio, ending page IO on them */
+               bio_for_each_segment_all(bvec, bio, i)
+                       xfs_finish_page_writeback(inode, bvec, error);
+
+               bio_put(bio);
+       }
 }
 
 /*
@@ -120,13 +168,9 @@ xfs_setfilesize_trans_alloc(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        ioend->io_append_trans = tp;
 
@@ -174,7 +218,8 @@ xfs_setfilesize(
 
 STATIC int
 xfs_setfilesize_ioend(
-       struct xfs_ioend        *ioend)
+       struct xfs_ioend        *ioend,
+       int                     error)
 {
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_trans        *tp = ioend->io_append_trans;
@@ -188,36 +233,14 @@ xfs_setfilesize_ioend(
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
        /* we abort the update if there was an IO error */
-       if (ioend->io_error) {
+       if (error) {
                xfs_trans_cancel(tp);
-               return ioend->io_error;
+               return error;
        }
 
        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
 
-/*
- * Schedule IO completion handling on the final put of an ioend.
- *
- * If there is no work to do we might as well call it a day and free the
- * ioend right now.
- */
-STATIC void
-xfs_finish_ioend(
-       struct xfs_ioend        *ioend)
-{
-       if (atomic_dec_and_test(&ioend->io_remaining)) {
-               struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
-
-               if (ioend->io_type == XFS_IO_UNWRITTEN)
-                       queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-               else if (ioend->io_append_trans)
-                       queue_work(mp->m_data_workqueue, &ioend->io_work);
-               else
-                       xfs_destroy_ioend(ioend);
-       }
-}
-
 /*
  * IO write completion.
  */
@@ -225,16 +248,17 @@ STATIC void
 xfs_end_io(
        struct work_struct *work)
 {
-       xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
-       struct xfs_inode *ip = XFS_I(ioend->io_inode);
-       int             error = 0;
+       struct xfs_ioend        *ioend =
+               container_of(work, struct xfs_ioend, io_work);
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       int                     error = ioend->io_bio->bi_error;
 
        /*
         * Set an error if the mount has shut down and proceed with end I/O
         * processing so it can perform whatever cleanups are necessary.
         */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               ioend->io_error = -EIO;
+               error = -EIO;
 
        /*
         * For unwritten extents we need to issue transactions to convert a
@@ -244,55 +268,33 @@ xfs_end_io(
         * on error.
         */
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
-               if (ioend->io_error)
+               if (error)
                        goto done;
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_append_trans) {
-               error = xfs_setfilesize_ioend(ioend);
+               error = xfs_setfilesize_ioend(ioend, error);
        } else {
                ASSERT(!xfs_ioend_is_append(ioend));
        }
 
 done:
-       if (error)
-               ioend->io_error = error;
-       xfs_destroy_ioend(ioend);
+       xfs_destroy_ioend(ioend, error);
 }
 
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-       struct inode            *inode,
-       unsigned int            type)
+STATIC void
+xfs_end_bio(
+       struct bio              *bio)
 {
-       xfs_ioend_t             *ioend;
-
-       ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
-       /*
-        * Set the count to 1 initially, which will prevent an I/O
-        * completion callback from happening before we have started
-        * all the I/O from calling the completion routine too early.
-        */
-       atomic_set(&ioend->io_remaining, 1);
-       ioend->io_error = 0;
-       INIT_LIST_HEAD(&ioend->io_list);
-       ioend->io_type = type;
-       ioend->io_inode = inode;
-       ioend->io_buffer_head = NULL;
-       ioend->io_buffer_tail = NULL;
-       ioend->io_offset = 0;
-       ioend->io_size = 0;
-       ioend->io_append_trans = NULL;
+       struct xfs_ioend        *ioend = bio->bi_private;
+       struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 
-       INIT_WORK(&ioend->io_work, xfs_end_io);
-       return ioend;
+       if (ioend->io_type == XFS_IO_UNWRITTEN)
+               queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+       else if (ioend->io_append_trans)
+               queue_work(mp->m_data_workqueue, &ioend->io_work);
+       else
+               xfs_destroy_ioend(ioend, bio->bi_error);
 }
 
 STATIC int
@@ -364,50 +366,6 @@ xfs_imap_valid(
                offset < imap->br_startoff + imap->br_blockcount;
 }
 
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
-       struct bio              *bio)
-{
-       xfs_ioend_t             *ioend = bio->bi_private;
-
-       if (!ioend->io_error)
-               ioend->io_error = bio->bi_error;
-
-       /* Toss bio and pass work off to an xfsdatad thread */
-       bio->bi_private = NULL;
-       bio->bi_end_io = NULL;
-       bio_put(bio);
-
-       xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
-       struct writeback_control *wbc,
-       xfs_ioend_t             *ioend,
-       struct bio              *bio)
-{
-       atomic_inc(&ioend->io_remaining);
-       bio->bi_private = ioend;
-       bio->bi_end_io = xfs_end_bio;
-       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-       struct buffer_head      *bh)
-{
-       struct bio              *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-
-       ASSERT(bio->bi_private == NULL);
-       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
-       return bio;
-}
-
 STATIC void
 xfs_start_buffer_writeback(
        struct buffer_head      *bh)
@@ -452,28 +410,35 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for an ioend. We are only passed a single ioend at a
- * time; the caller is responsible for chaining prior to submission.
+ * Submit the bio for an ioend. We are passed an ioend with a bio attached to
+ * it, and we submit that bio. The ioend may be used for multiple bio
+ * submissions, so we only want to allocate an append transaction for the ioend
+ * once. In the case of multiple bio submission, each bio will take an IO
+ * reference to the ioend to ensure that the ioend completion is only done once
+ * all bios have been submitted and the ioend is really done.
  *
  * If @fail is non-zero, it means that we have a situation where some part of
  * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the ioend chain rather
- * than submit it to IO. This typically only happens on a filesystem shutdown.
+ * and unlocked them. In this situation, we need to fail the bio and ioend
+ * rather than submit it to IO. This typically only happens on a filesystem
+ * shutdown.
  */
 STATIC int
 xfs_submit_ioend(
        struct writeback_control *wbc,
-       xfs_ioend_t             *ioend,
+       struct xfs_ioend        *ioend,
        int                     status)
 {
-       struct buffer_head      *bh;
-       struct bio              *bio;
-       sector_t                lastblock = 0;
-
        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
-            ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+           ioend->io_type != XFS_IO_UNWRITTEN &&
+           xfs_ioend_is_append(ioend) &&
+           !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);
+
+       ioend->io_bio->bi_private = ioend;
+       ioend->io_bio->bi_end_io = xfs_end_bio;
+
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
@@ -481,33 +446,73 @@ xfs_submit_ioend(
         * time.
         */
        if (status) {
-               ioend->io_error = status;
-               xfs_finish_ioend(ioend);
+               ioend->io_bio->bi_error = status;
+               bio_endio(ioend->io_bio);
                return status;
        }
 
-       bio = NULL;
-       for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+                  ioend->io_bio);
+       return 0;
+}
 
-               if (!bio) {
-retry:
-                       bio = xfs_alloc_ioend_bio(bh);
-               } else if (bh->b_blocknr != lastblock + 1) {
-                       xfs_submit_ioend_bio(wbc, ioend, bio);
-                       goto retry;
-               }
+static void
+xfs_init_bio_from_bh(
+       struct bio              *bio,
+       struct buffer_head      *bh)
+{
+       bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_bdev = bh->b_bdev;
+}
 
-               if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
-                       xfs_submit_ioend_bio(wbc, ioend, bio);
-                       goto retry;
-               }
+static struct xfs_ioend *
+xfs_alloc_ioend(
+       struct inode            *inode,
+       unsigned int            type,
+       xfs_off_t               offset,
+       struct buffer_head      *bh)
+{
+       struct xfs_ioend        *ioend;
+       struct bio              *bio;
 
-               lastblock = bh->b_blocknr;
-       }
-       if (bio)
-               xfs_submit_ioend_bio(wbc, ioend, bio);
-       xfs_finish_ioend(ioend);
-       return 0;
+       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
+       xfs_init_bio_from_bh(bio, bh);
+
+       ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+       INIT_LIST_HEAD(&ioend->io_list);
+       ioend->io_type = type;
+       ioend->io_inode = inode;
+       ioend->io_size = 0;
+       ioend->io_offset = offset;
+       INIT_WORK(&ioend->io_work, xfs_end_io);
+       ioend->io_append_trans = NULL;
+       ioend->io_bio = bio;
+       return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in xfs_destroy_ioend().
+ */
+static void
+xfs_chain_bio(
+       struct xfs_ioend        *ioend,
+       struct writeback_control *wbc,
+       struct buffer_head      *bh)
+{
+       struct bio *new;
+
+       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+       xfs_init_bio_from_bh(new, bh);
+
+       bio_chain(ioend->io_bio, new);
+       bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
+       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
+                  ioend->io_bio);
+       ioend->io_bio = new;
 }
 
 /*
@@ -523,27 +528,24 @@ xfs_add_to_ioend(
        struct buffer_head      *bh,
        xfs_off_t               offset,
        struct xfs_writepage_ctx *wpc,
+       struct writeback_control *wbc,
        struct list_head        *iolist)
 {
        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
            bh->b_blocknr != wpc->last_block + 1 ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
-               struct xfs_ioend        *new;
-
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
-
-               new = xfs_alloc_ioend(inode, wpc->io_type);
-               new->io_offset = offset;
-               new->io_buffer_head = bh;
-               new->io_buffer_tail = bh;
-               wpc->ioend = new;
-       } else {
-               wpc->ioend->io_buffer_tail->b_private = bh;
-               wpc->ioend->io_buffer_tail = bh;
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
        }
 
-       bh->b_private = NULL;
+       /*
+        * If the buffer doesn't fit into the bio we need to allocate a new
+        * one.  This shouldn't happen more than once for a given buffer.
+        */
+       while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
+               xfs_chain_bio(wpc->ioend, wbc, bh);
+
        wpc->ioend->io_size += bh->b_size;
        wpc->last_block = bh->b_blocknr;
        xfs_start_buffer_writeback(bh);
@@ -803,7 +805,7 @@ xfs_writepage_map(
                        lock_buffer(bh);
                        if (wpc->io_type != XFS_IO_OVERWRITE)
                                xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+                       xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
                        count++;
                }
 
@@ -1391,13 +1393,10 @@ xfs_end_io_direct_write(
 
                trace_xfs_end_io_direct_write_append(ip, offset, size);
 
-               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
-                       return error;
-               }
-               error = xfs_setfilesize(ip, tp, offset, size);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0,
+                               &tp);
+               if (!error)
+                       error = xfs_setfilesize(ip, tp, offset, size);
        }
 
        return error;
index b4421177b68dc1ba619625876c05ee8d22caf570..814aab7907134e14cbc3ccb7f72d14001a7834d1 100644 (file)
@@ -18,7 +18,7 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
-extern mempool_t *xfs_ioend_pool;
+extern struct bio_set *xfs_ioend_bioset;
 
 /*
  * Types of I/O for bmap clustering and I/O completion tracking.
@@ -37,22 +37,19 @@ enum {
        { XFS_IO_OVERWRITE,             "overwrite" }
 
 /*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
+ * Structure for buffered I/O completions.
  */
-typedef struct xfs_ioend {
+struct xfs_ioend {
        struct list_head        io_list;        /* next ioend in chain */
        unsigned int            io_type;        /* delalloc / unwritten */
-       int                     io_error;       /* I/O error code */
-       atomic_t                io_remaining;   /* hold count */
        struct inode            *io_inode;      /* file being written to */
-       struct buffer_head      *io_buffer_head;/* buffer linked list head */
-       struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
        struct work_struct      io_work;        /* xfsdatad work queue */
        struct xfs_trans        *io_append_trans;/* xact. for size update */
-} xfs_ioend_t;
+       struct bio              *io_bio;        /* bio being built */
+       struct bio              io_inline_bio;  /* MUST BE LAST! */
+};
 
 extern const struct address_space_operations xfs_address_space_operations;
 
index dd4824589470eb106a2b5a764da6039d56121726..e3da5d448bcff5189aa15b2e04b95ac4cd01d9f1 100644 (file)
@@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern {
  *========================================================================*/
 
 
+/* Return 0 on success, or -errno; other state communicated via *context */
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-                             unsigned char *, int, int, unsigned char *);
+                             unsigned char *, int, int);
 
 typedef struct xfs_attr_list_context {
        struct xfs_inode                *dp;            /* inode */
@@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context {
        int                             firstu;         /* first used byte in buffer */
        int                             flags;          /* from VOP call */
        int                             resynch;        /* T/F: resynch with cursor */
-       int                             put_value;      /* T/F: need value for listent */
        put_listent_func_t              put_listent;    /* list output fmt function */
        int                             index;          /* index into output buffer */
 } xfs_attr_list_context_t;
index 2bb959ada45bb5444830373a102fc4d7aced7273..55d214981ed27e6bb85490767939d25058815201 100644 (file)
@@ -405,21 +405,11 @@ xfs_attr_inactive(
                goto out_destroy_fork;
        xfs_iunlock(dp, lock_mode);
 
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
        lock_mode = 0;
-       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-       error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans);
        if (error)
-               goto out_cancel;
+               goto out_destroy_fork;
 
        lock_mode = XFS_ILOCK_EXCL;
        xfs_ilock(dp, lock_mode);
index 4fa14820e2e22b687ef852b81e1d6b9f9028caf3..d25f26b22ac92821cb2dbc95973e0c1804cebe10 100644 (file)
@@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                           sfe->flags,
                                           sfe->nameval,
                                           (int)sfe->namelen,
-                                          (int)sfe->valuelen,
-                                          &sfe->nameval[sfe->namelen]);
-
+                                          (int)sfe->valuelen);
+                       if (error)
+                               return error;
                        /*
                         * Either search callback finished early or
                         * didn't fit it all in the buffer after all.
                         */
                        if (context->seen_enough)
                                break;
-
-                       if (error)
-                               return error;
                        sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
                }
                trace_xfs_attr_list_sf_all(context);
@@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
                                        sbp->flags,
                                        sbp->name,
                                        sbp->namelen,
-                                       sbp->valuelen,
-                                       &sbp->name[sbp->namelen]);
+                                       sbp->valuelen);
                if (error) {
                        kmem_free(sbuf);
                        return error;
@@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int(
         */
        retval = 0;
        for (; i < ichdr.count; entry++, i++) {
+               char *name;
+               int namelen, valuelen;
+
                if (be32_to_cpu(entry->hashval) != cursor->hashval) {
                        cursor->hashval = be32_to_cpu(entry->hashval);
                        cursor->offset = 0;
@@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int(
                        continue;               /* skip incomplete entries */
 
                if (entry->flags & XFS_ATTR_LOCAL) {
-                       xfs_attr_leaf_name_local_t *name_loc =
-                               xfs_attr3_leaf_name_local(leaf, i);
-
-                       retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_loc->nameval,
-                                               (int)name_loc->namelen,
-                                               be16_to_cpu(name_loc->valuelen),
-                                               &name_loc->nameval[name_loc->namelen]);
-                       if (retval)
-                               return retval;
+                       xfs_attr_leaf_name_local_t *name_loc;
+
+                       name_loc = xfs_attr3_leaf_name_local(leaf, i);
+                       name = name_loc->nameval;
+                       namelen = name_loc->namelen;
+                       valuelen = be16_to_cpu(name_loc->valuelen);
                } else {
-                       xfs_attr_leaf_name_remote_t *name_rmt =
-                               xfs_attr3_leaf_name_remote(leaf, i);
-
-                       int valuelen = be32_to_cpu(name_rmt->valuelen);
-
-                       if (context->put_value) {
-                               xfs_da_args_t args;
-
-                               memset((char *)&args, 0, sizeof(args));
-                               args.geo = context->dp->i_mount->m_attr_geo;
-                               args.dp = context->dp;
-                               args.whichfork = XFS_ATTR_FORK;
-                               args.valuelen = valuelen;
-                               args.rmtvaluelen = valuelen;
-                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                       args.dp->i_mount, valuelen);
-                               retval = xfs_attr_rmtval_get(&args);
-                               if (!retval)
-                                       retval = context->put_listent(context,
-                                                       entry->flags,
-                                                       name_rmt->name,
-                                                       (int)name_rmt->namelen,
-                                                       valuelen,
-                                                       args.value);
-                               kmem_free(args.value);
-                       } else {
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               NULL);
-                       }
-                       if (retval)
-                               return retval;
+                       xfs_attr_leaf_name_remote_t *name_rmt;
+
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       name = name_rmt->name;
+                       namelen = name_rmt->namelen;
+                       valuelen = be32_to_cpu(name_rmt->valuelen);
                }
+
+               retval = context->put_listent(context, entry->flags,
+                                             name, namelen, valuelen);
+               if (retval)
+                       break;
                if (context->seen_enough)
                        break;
                cursor->offset++;
@@ -551,8 +519,7 @@ xfs_attr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-       int             valuelen,
-       unsigned char   *value)
+       int             valuelen)
 {
        struct attrlist *alist = (struct attrlist *)context->alist;
        attrlist_ent_t *aep;
@@ -581,7 +548,7 @@ xfs_attr_put_listent(
                trace_xfs_attr_list_full(context);
                alist->al_more = 1;
                context->seen_enough = 1;
-               return 1;
+               return 0;
        }
 
        aep = (attrlist_ent_t *)&context->alist[context->firstu];
index 3b6309865c65336793a7ac84009188b44b394a32..586bb64e674bac152c54e7de2255400c7e00e965 100644 (file)
@@ -72,18 +72,11 @@ xfs_zero_extent(
        struct xfs_mount *mp = ip->i_mount;
        xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
        sector_t        block = XFS_BB_TO_FSBT(mp, sector);
-       ssize_t         size = XFS_FSB_TO_B(mp, count_fsb);
-
-       if (IS_DAX(VFS_I(ip)))
-               return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
-                               sector, size);
-
-       /*
-        * let the block layer decide on the fastest method of
-        * implementing the zeroing.
-        */
-       return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
 
+       return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+               block << (mp->m_super->s_blocksize_bits - 9),
+               count_fsb << (mp->m_super->s_blocksize_bits - 9),
+               GFP_NOFS, true);
 }
 
 /*
@@ -900,19 +893,15 @@ xfs_free_eofblocks(
                 * Free them up now by truncating the file to
                 * its current size.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
                if (need_iolock) {
-                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp);
+                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
                                return -EAGAIN;
-                       }
                }
 
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+                               &tp);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        if (need_iolock)
                                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return error;
@@ -1037,9 +1026,9 @@ xfs_alloc_file_space(
                /*
                 * Allocate and setup the transaction.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                         resblks, resrtextents);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+                               resrtextents, 0, &tp);
+
                /*
                 * Check for running out of space
                 */
@@ -1048,7 +1037,6 @@ xfs_alloc_file_space(
                         * Free the transaction structure.
                         */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1311,18 +1299,10 @@ xfs_free_file_space(
                 * transaction to dip into the reserve blocks to ensure
                 * the freeing of the space succeeds at ENOSPC.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
-
-               /*
-                * check for running out of space
-                */
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+                               &tp);
                if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
                        ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp);
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1482,19 +1462,16 @@ xfs_shift_file_space(
        }
 
        while (!error && !done) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                /*
                 * We would need to reserve permanent block for transaction.
                 * This will come into picture when after shifting extent into
                 * hole we found that adjacent extents can be merged which
                 * may lead to freeing of a block during record update.
                 */
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                               XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+                               XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+               if (error)
                        break;
-               }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
@@ -1747,12 +1724,9 @@ xfs_swap_extents(
        if (error)
                goto out_unlock;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                goto out_unlock;
-       }
 
        /*
         * Lock and join the inodes to the tansaction so that transaction commit
index 9a2191b911377f94e38d81d57d5d037a7e19ae8b..e71cfbd5acb3c74df23be024c91420a07291a493 100644 (file)
@@ -1100,22 +1100,18 @@ xfs_bwrite(
        return error;
 }
 
-STATIC void
+static void
 xfs_buf_bio_end_io(
        struct bio              *bio)
 {
-       xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
+       struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
 
        /*
         * don't overwrite existing errors - otherwise we can lose errors on
         * buffers that require multiple bios to complete.
         */
-       if (bio->bi_error) {
-               spin_lock(&bp->b_lock);
-               if (!bp->b_io_error)
-                       bp->b_io_error = bio->bi_error;
-               spin_unlock(&bp->b_lock);
-       }
+       if (bio->bi_error)
+               cmpxchg(&bp->b_io_error, 0, bio->bi_error);
 
        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
index 4eb89bd4ee73b6f4265eb63b8238e9571150bf26..8bfb974f0772844af09b11373fe11337d50685d7 100644 (file)
@@ -183,6 +183,26 @@ typedef struct xfs_buf {
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
        int                     b_error;        /* error code on I/O */
+
+       /*
+        * async write failure retry count. Initialised to zero on the first
+        * failure, then when it exceeds the maximum configured without a
+        * success the write is considered to be failed permanently and the
+        * iodone handler will take appropriate action.
+        *
+        * For retry timeouts, we record the jiffies of the first failure. This
+        * means that we can change the retry timeout for buffers already under
+        * I/O and thus avoid getting stuck in a retry loop with a long timeout.
+        *
+        * last_error is used to ensure that we are getting repeated errors, not
+        * different errors. e.g. a block device might change ENOSPC to EIO when
+        * a failure timeout occurs, so we want to re-initialise the error
+        * retry behaviour appropriately when that happens.
+        */
+       int                     b_retries;
+       unsigned long           b_first_retry_time; /* in jiffies */
+       int                     b_last_error;
+
        const struct xfs_buf_ops        *b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
index 99e91a0e554ea6512ce5eb43cb8a338804f550ae..34257992934c4184feba8505c8342e2e0a1e9edc 100644 (file)
@@ -1042,35 +1042,22 @@ xfs_buf_do_callbacks(
        }
 }
 
-/*
- * This is the iodone() function for buffers which have had callbacks
- * attached to them by xfs_buf_attach_iodone().  It should remove each
- * log item from the buffer's list and call the callback of each in turn.
- * When done, the buffer's fsprivate field is set to NULL and the buffer
- * is unlocked with a call to iodone().
- */
-void
-xfs_buf_iodone_callbacks(
+static bool
+xfs_buf_iodone_callback_error(
        struct xfs_buf          *bp)
 {
        struct xfs_log_item     *lip = bp->b_fspriv;
        struct xfs_mount        *mp = lip->li_mountp;
        static ulong            lasttime;
        static xfs_buftarg_t    *lasttarg;
-
-       if (likely(!bp->b_error))
-               goto do_callbacks;
+       struct xfs_error_cfg    *cfg;
 
        /*
         * If we've already decided to shutdown the filesystem because of
         * I/O errors, there's no point in giving this a retry.
         */
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               xfs_buf_stale(bp);
-               bp->b_flags |= XBF_DONE;
-               trace_xfs_buf_item_iodone(bp, _RET_IP_);
-               goto do_callbacks;
-       }
+       if (XFS_FORCED_SHUTDOWN(mp))
+               goto out_stale;
 
        if (bp->b_target != lasttarg ||
            time_after(jiffies, (lasttime + 5*HZ))) {
@@ -1079,45 +1066,93 @@ xfs_buf_iodone_callbacks(
        }
        lasttarg = bp->b_target;
 
+       /* synchronous writes will have callers process the error */
+       if (!(bp->b_flags & XBF_ASYNC))
+               goto out_stale;
+
+       trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+       ASSERT(bp->b_iodone != NULL);
+
        /*
         * If the write was asynchronous then no one will be looking for the
-        * error.  Clear the error state and write the buffer out again.
-        *
-        * XXX: This helps against transient write errors, but we need to find
-        * a way to shut the filesystem down if the writes keep failing.
-        *
-        * In practice we'll shut the filesystem down soon as non-transient
-        * errors tend to affect the whole device and a failing log write
-        * will make us give up.  But we really ought to do better here.
+        * error.  If this is the first failure of this type, clear the error
+        * state and write the buffer out again. This means we always retry an
+        * async write failure at least once, but we also need to set the buffer
+        * up to behave correctly now for repeated failures.
         */
-       if (bp->b_flags & XBF_ASYNC) {
-               ASSERT(bp->b_iodone != NULL);
+       if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+            bp->b_last_error != bp->b_error) {
+               bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
+                               XBF_DONE | XBF_WRITE_FAIL);
+               bp->b_last_error = bp->b_error;
+               bp->b_retries = 0;
+               bp->b_first_retry_time = jiffies;
+
+               xfs_buf_ioerror(bp, 0);
+               xfs_buf_submit(bp);
+               return true;
+       }
 
-               trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+       /*
+        * Repeated failure on an async write. Take action according to the
+        * error configuration we have been set up to use.
+        */
+       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
 
-               xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
+       if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+           ++bp->b_retries > cfg->max_retries)
+                       goto permanent_error;
+       if (cfg->retry_timeout &&
+           time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+                       goto permanent_error;
 
-               if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
-                       bp->b_flags |= XBF_WRITE | XBF_ASYNC |
-                                      XBF_DONE | XBF_WRITE_FAIL;
-                       xfs_buf_submit(bp);
-               } else {
-                       xfs_buf_relse(bp);
-               }
+       /* At unmount we may treat errors differently */
+       if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+               goto permanent_error;
 
-               return;
-       }
+       /* still a transient error, higher layers will retry */
+       xfs_buf_ioerror(bp, 0);
+       xfs_buf_relse(bp);
+       return true;
 
        /*
-        * If the write of the buffer was synchronous, we want to make
-        * sure to return the error to the caller of xfs_bwrite().
+        * Permanent error - we need to trigger a shutdown if we haven't already
+        * to indicate that inconsistency will result from this action.
         */
+permanent_error:
+       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+out_stale:
        xfs_buf_stale(bp);
        bp->b_flags |= XBF_DONE;
-
        trace_xfs_buf_error_relse(bp, _RET_IP_);
+       return false;
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks attached
+ * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
+ * callback list, mark the buffer as having no more callbacks and then push the
+ * buffer through IO completion processing.
+ */
+void
+xfs_buf_iodone_callbacks(
+       struct xfs_buf          *bp)
+{
+       /*
+        * If there is an error, process it. Some errors require us
+        * to run callbacks after failure processing is done so we
+        * detect that and take appropriate action.
+        */
+       if (bp->b_error && xfs_buf_iodone_callback_error(bp))
+               return;
+
+       /*
+        * Successful IO or permanent error. Either way, we can clear the
+        * retry state here in preparation for the next error that may occur.
+        */
+       bp->b_last_error = 0;
+       bp->b_retries = 0;
 
-do_callbacks:
        xfs_buf_do_callbacks(bp);
        bp->b_fspriv = NULL;
        bp->b_iodone = NULL;
index 316b2a1bdba5f6da82f1bad0dcbc0708151a59d2..e0646659ce16eaafa7b35833fac127fd13b80582 100644 (file)
@@ -614,11 +614,10 @@ xfs_qm_dqread(
        trace_xfs_dqread(dqp);
 
        if (flags & XFS_QMOPT_DQALLOC) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
-                                         XFS_QM_DQALLOC_SPACE_RES(mp), 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+                               XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
                if (error)
-                       goto error1;
+                       goto error0;
        }
 
        /*
@@ -692,7 +691,7 @@ error0:
  * end of the chunk, skip ahead to first id in next allocated chunk
  * using the SEEK_DATA interface.
  */
-int
+static int
 xfs_dq_get_next_id(
        xfs_mount_t             *mp,
        uint                    type,
index 85ce3032f815ebc0ef1e360eeda9ce67eef2f9a1..47fc632954228febc4e3a5bac611dbc121137348 100644 (file)
@@ -145,12 +145,10 @@ xfs_update_prealloc_flags(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-       error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+                       0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1553,7 +1551,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
                ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                ret = block_page_mkwrite_return(ret);
@@ -1587,7 +1585,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1624,8 +1622,7 @@ xfs_filemap_pmd_fault(
        }
 
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
-                             NULL);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (flags & FAULT_FLAG_WRITE)
index ee3aaa0a53179f761ffffe6257a58b637a6b7dc8..b4d75825ae3732d98ee19c8760719d92bd83a9cf 100644 (file)
@@ -198,14 +198,10 @@ xfs_growfs_data_private(
                        return error;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
-       tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
-                                 XFS_GROWFS_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+                       XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+       if (error)
                return error;
-       }
 
        /*
         * Write new AG headers to disk. Non-transactional, but written
@@ -243,8 +239,8 @@ xfs_growfs_data_private(
                agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
                agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
                agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-               agf->agf_flfirst = 0;
-               agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+               agf->agf_flfirst = cpu_to_be32(1);
+               agf->agf_fllast = 0;
                agf->agf_flcount = 0;
                tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
                agf->agf_freeblks = cpu_to_be32(tmpsize);
index bf2d60749278602b5b4afcda09ede7d3dd89fd1e..99ee6eee5e0b0d5af6e2e5124c4878a1fe2c4cb4 100644 (file)
@@ -37,9 +37,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
-                               struct xfs_perag *pag, struct xfs_inode *ip);
-
 /*
  * Allocate and initialise an xfs_inode.
  */
@@ -94,13 +91,6 @@ xfs_inode_free_callback(
        struct inode            *inode = container_of(head, struct inode, i_rcu);
        struct xfs_inode        *ip = XFS_I(inode);
 
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-       struct xfs_inode        *ip)
-{
        switch (VFS_I(ip)->i_mode & S_IFMT) {
        case S_IFREG:
        case S_IFDIR:
@@ -118,6 +108,25 @@ xfs_inode_free(
                ip->i_itemp = NULL;
        }
 
+       kmem_zone_free(xfs_inode_zone, ip);
+}
+
+static void
+__xfs_inode_free(
+       struct xfs_inode        *ip)
+{
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!xfs_isiflocked(ip));
+       XFS_STATS_DEC(ip->i_mount, vn_active);
+
+       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+void
+xfs_inode_free(
+       struct xfs_inode        *ip)
+{
        /*
         * Because we use RCU freeing we need to ensure the inode always
         * appears to be reclaimed with an invalid inode number when in the
@@ -129,12 +138,123 @@ xfs_inode_free(
        ip->i_ino = 0;
        spin_unlock(&ip->i_flags_lock);
 
-       /* asserts to verify all state is correct here */
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!xfs_isiflocked(ip));
-       XFS_STATS_DEC(ip->i_mount, vn_active);
+       __xfs_inode_free(ip);
+}
 
-       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+       struct xfs_mount        *mp)
+{
+
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+
+       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+       xfs_reclaim_work_queue(mp);
+}
+
+static void
+xfs_perag_set_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (pag->pag_ici_reclaimable++)
+               return;
+
+       /* propagate the reclaim tag up into the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
+                          XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+
+       /* schedule periodic background inode reclaim */
+       xfs_reclaim_work_queue(mp);
+
+       trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+static void
+xfs_perag_clear_reclaim_tag(
+       struct xfs_perag        *pag)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+
+       ASSERT(spin_is_locked(&pag->pag_ici_lock));
+       if (--pag->pag_ici_reclaimable)
+               return;
+
+       /* clear the reclaim tag from the perag radix tree */
+       spin_lock(&mp->m_perag_lock);
+       radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
+                            XFS_ICI_RECLAIM_TAG);
+       spin_unlock(&mp->m_perag_lock);
+       trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_perag        *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+
+       radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                          XFS_ICI_RECLAIM_TAG);
+       xfs_perag_set_reclaim_tag(pag);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+
+       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
+STATIC void
+xfs_inode_clear_reclaim_tag(
+       struct xfs_perag        *pag,
+       xfs_ino_t               ino)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                            XFS_INO_TO_AGINO(pag->pag_mount, ino),
+                            XFS_ICI_RECLAIM_TAG);
+       xfs_perag_clear_reclaim_tag(pag);
 }
 
 /*
@@ -264,7 +384,7 @@ xfs_iget_cache_hit(
                 */
                ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
                ip->i_flags |= XFS_INEW;
-               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+               xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
                inode->i_state = I_NEW;
 
                ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
@@ -722,121 +842,6 @@ xfs_inode_ag_iterator_tag(
        return last_error;
 }
 
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-       struct xfs_mount        *mp)
-{
-
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-       }
-       rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_reclaim_work_queue(mp);
-}
-
-static void
-__xfs_inode_set_reclaim_tag(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip)
-{
-       radix_tree_tag_set(&pag->pag_ici_root,
-                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                          XFS_ICI_RECLAIM_TAG);
-
-       if (!pag->pag_ici_reclaimable) {
-               /* propagate the reclaim tag up into the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-
-               /* schedule periodic background inode reclaim */
-               xfs_reclaim_work_queue(ip->i_mount);
-
-               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-       pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-       xfs_inode_t     *ip)
-{
-       struct xfs_mount *mp = ip->i_mount;
-       struct xfs_perag *pag;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       spin_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       __xfs_inode_set_reclaim_tag(pag, ip);
-       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-       spin_unlock(&ip->i_flags_lock);
-       spin_unlock(&pag->pag_ici_lock);
-       xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       pag->pag_ici_reclaimable--;
-       if (!pag->pag_ici_reclaimable) {
-               /* clear the reclaim tag from the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-}
-
-STATIC void
-__xfs_inode_clear_reclaim_tag(
-       xfs_mount_t     *mp,
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       radix_tree_tag_clear(&pag->pag_ici_root,
-                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-       __xfs_inode_clear_reclaim(pag, ip);
-}
-
 /*
  * Grab the inode for reclaim exclusively.
  * Return 0 if we grabbed it, non-zero otherwise.
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
        int                     sync_mode)
 {
        struct xfs_buf          *bp = NULL;
+       xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
        int                     error;
 
 restart:
@@ -993,6 +999,22 @@ restart:
 
        xfs_iflock(ip);
 reclaim:
+       /*
+        * Because we use RCU freeing we need to ensure the inode always appears
+        * to be reclaimed with an invalid inode number when in the free state.
+        * We do this as early as possible under the ILOCK and flush lock so
+        * that xfs_iflush_cluster() can be guaranteed to detect races with us
+        * here. By doing this, we guarantee that once xfs_iflush_cluster has
+        * locked both the XFS_ILOCK and the flush lock that it will see either
+        * a valid, flushable inode that will serialise correctly against the
+        * locks below, or it will see a clean (and invalid) inode that it can
+        * skip.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
+
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -1006,9 +1028,9 @@ reclaim:
         */
        spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
-                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                               XFS_INO_TO_AGINO(ip->i_mount, ino)))
                ASSERT(0);
-       __xfs_inode_clear_reclaim(pag, ip);
+       xfs_perag_clear_reclaim_tag(pag);
        spin_unlock(&pag->pag_ici_lock);
 
        /*
@@ -1023,7 +1045,7 @@ reclaim:
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-       xfs_inode_free(ip);
+       __xfs_inode_free(ip);
        return error;
 
 out_ifunlock:
index 96f606deee313aed506b7e7ee229fc801ba5de80..ee6799e0476f397b0aba305fee9f7cddef188b8e 100644 (file)
@@ -1030,7 +1030,7 @@ xfs_dir_ialloc(
                        tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
                }
 
-               code = xfs_trans_roll(&tp, 0);
+               code = xfs_trans_roll(&tp, NULL);
                if (committed != NULL)
                        *committed = 1;
 
@@ -1161,11 +1161,9 @@ xfs_create(
                rdev = 0;
                resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_mkdir;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
        } else {
                resblks = XFS_CREATE_SPACE_RES(mp, name->len);
                tres = &M_RES(mp)->tr_create;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        }
 
        /*
@@ -1174,20 +1172,19 @@ xfs_create(
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
-       error = xfs_trans_reserve(tp, tres, resblks, 0);
+       error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
                xfs_flush_inodes(mp);
-               error = xfs_trans_reserve(tp, tres, resblks, 0);
+               error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        }
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-               error = xfs_trans_reserve(tp, tres, 0, 0);
+               error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-               goto out_trans_cancel;
-
+               goto out_release_inode;
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -1337,17 +1334,16 @@ xfs_create_tmpfile(
                return error;
 
        resblks = XFS_IALLOC_SPACE_RES(mp);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
-
        tres = &M_RES(mp)->tr_create_tmpfile;
-       error = xfs_trans_reserve(tp, tres, resblks, 0);
+
+       error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                /* No space at all so try a "no-allocation" reservation */
                resblks = 0;
-               error = xfs_trans_reserve(tp, tres, 0, 0);
+               error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_inode;
 
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
@@ -1432,15 +1428,14 @@ xfs_link(
        if (error)
                goto std_return;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
        resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
        }
        if (error)
-               goto error_return;
+               goto std_return;
 
        xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1710,11 +1705,9 @@ xfs_inactive_truncate(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -1764,8 +1757,6 @@ xfs_inactive_ifree(
        struct xfs_trans        *tp;
        int                     error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
        /*
         * The ifree transaction might need to allocate blocks for record
         * insertion to the finobt. We don't want to fail here at ENOSPC, so
@@ -1781,9 +1772,8 @@ xfs_inactive_ifree(
         * now remains allocated and sits on the unlinked list until the fs is
         * repaired.
         */
-       tp->t_flags |= XFS_TRANS_RESERVE;
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
-                                 XFS_IFREE_SPACE_RES(mp), 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+                       XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
        if (error) {
                if (error == -ENOSPC) {
                        xfs_warn_ratelimited(mp,
@@ -1792,7 +1782,6 @@ xfs_inactive_ifree(
                } else {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                }
-               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -2525,11 +2514,6 @@ xfs_remove(
        if (error)
                goto std_return;
 
-       if (is_dir)
-               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-       else
-               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-
        /*
         * We try to get the real space reservation first,
         * allowing for directory btree deletion(s) implying
@@ -2540,14 +2524,15 @@ xfs_remove(
         * block from the directory.
         */
        resblks = XFS_REMOVE_SPACE_RES(mp);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
        if (error == -ENOSPC) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
+                               &tp);
        }
        if (error) {
                ASSERT(error != -ENOSPC);
-               goto out_trans_cancel;
+               goto std_return;
        }
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
@@ -2855,6 +2840,7 @@ xfs_rename_alloc_whiteout(
         * and flag it as linkable.
         */
        drop_nlink(VFS_I(tmpfile));
+       xfs_setup_iops(tmpfile);
        xfs_finish_inode_setup(tmpfile);
        VFS_I(tmpfile)->i_state |= I_LINKABLE;
 
@@ -2910,15 +2896,15 @@ xfs_rename(
        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
                                inodes, &num_inodes);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
        if (error == -ENOSPC) {
                spaceres = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
+                               &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_wip;
 
        /*
         * Attach the dquots to the inodes
@@ -3155,6 +3141,7 @@ out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
 out_trans_cancel:
        xfs_trans_cancel(tp);
+out_release_wip:
        if (wip)
                IRELE(wip);
        return error;
@@ -3162,16 +3149,16 @@ out_trans_cancel:
 
 STATIC int
 xfs_iflush_cluster(
-       xfs_inode_t     *ip,
-       xfs_buf_t       *bp)
+       struct xfs_inode        *ip,
+       struct xfs_buf          *bp)
 {
-       xfs_mount_t             *mp = ip->i_mount;
+       struct xfs_mount        *mp = ip->i_mount;
        struct xfs_perag        *pag;
        unsigned long           first_index, mask;
        unsigned long           inodes_per_cluster;
-       int                     ilist_size;
-       xfs_inode_t             **ilist;
-       xfs_inode_t             *iq;
+       int                     cilist_size;
+       struct xfs_inode        **cilist;
+       struct xfs_inode        *cip;
        int                     nr_found;
        int                     clcount = 0;
        int                     bufwasdelwri;
@@ -3180,23 +3167,23 @@ xfs_iflush_cluster(
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
        inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-       ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
-       ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
-       if (!ilist)
+       cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+       cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
+       if (!cilist)
                goto out_put;
 
        mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
        rcu_read_lock();
        /* really need a gang lookup range call here */
-       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
                                        first_index, inodes_per_cluster);
        if (nr_found == 0)
                goto out_free;
 
        for (i = 0; i < nr_found; i++) {
-               iq = ilist[i];
-               if (iq == ip)
+               cip = cilist[i];
+               if (cip == ip)
                        continue;
 
                /*
@@ -3205,20 +3192,30 @@ xfs_iflush_cluster(
                 * We need to check under the i_flags_lock for a valid inode
                 * here. Skip it if it is not valid or the wrong inode.
                 */
-               spin_lock(&ip->i_flags_lock);
-               if (!ip->i_ino ||
-                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
-                       spin_unlock(&ip->i_flags_lock);
+               spin_lock(&cip->i_flags_lock);
+               if (!cip->i_ino ||
+                   __xfs_iflags_test(cip, XFS_ISTALE)) {
+                       spin_unlock(&cip->i_flags_lock);
                        continue;
                }
-               spin_unlock(&ip->i_flags_lock);
+
+               /*
+                * Once we fall off the end of the cluster, no point checking
+                * any more inodes in the list because they will also all be
+                * outside the cluster.
+                */
+               if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
+                       spin_unlock(&cip->i_flags_lock);
+                       break;
+               }
+               spin_unlock(&cip->i_flags_lock);
 
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
                 * later after the appropriate locks are acquired.
                 */
-               if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+               if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
                        continue;
 
                /*
@@ -3226,15 +3223,28 @@ xfs_iflush_cluster(
                 * then this inode cannot be flushed and is skipped.
                 */
 
-               if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+               if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
+                       continue;
+               if (!xfs_iflock_nowait(cip)) {
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
-               if (!xfs_iflock_nowait(iq)) {
-                       xfs_iunlock(iq, XFS_ILOCK_SHARED);
+               }
+               if (xfs_ipincount(cip)) {
+                       xfs_ifunlock(cip);
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
-               if (xfs_ipincount(iq)) {
-                       xfs_ifunlock(iq);
-                       xfs_iunlock(iq, XFS_ILOCK_SHARED);
+
+
+               /*
+                * Check the inode number again, just to be certain we are not
+                * racing with freeing in xfs_reclaim_inode(). See the comments
+                * in that function for more information as to why the initial
+                * check is not sufficient.
+                */
+               if (!cip->i_ino) {
+                       xfs_ifunlock(cip);
+                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
                        continue;
                }
 
@@ -3242,18 +3252,18 @@ xfs_iflush_cluster(
                 * arriving here means that this inode can be flushed.  First
                 * re-check that it's dirty before flushing.
                 */
-               if (!xfs_inode_clean(iq)) {
+               if (!xfs_inode_clean(cip)) {
                        int     error;
-                       error = xfs_iflush_int(iq, bp);
+                       error = xfs_iflush_int(cip, bp);
                        if (error) {
-                               xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                               xfs_iunlock(cip, XFS_ILOCK_SHARED);
                                goto cluster_corrupt_out;
                        }
                        clcount++;
                } else {
-                       xfs_ifunlock(iq);
+                       xfs_ifunlock(cip);
                }
-               xfs_iunlock(iq, XFS_ILOCK_SHARED);
+               xfs_iunlock(cip, XFS_ILOCK_SHARED);
        }
 
        if (clcount) {
@@ -3263,7 +3273,7 @@ xfs_iflush_cluster(
 
 out_free:
        rcu_read_unlock();
-       kmem_free(ilist);
+       kmem_free(cilist);
 out_put:
        xfs_perag_put(pag);
        return 0;
@@ -3306,8 +3316,8 @@ cluster_corrupt_out:
        /*
         * Unlocks the flush lock
         */
-       xfs_iflush_abort(iq, false);
-       kmem_free(ilist);
+       xfs_iflush_abort(cip, false);
+       kmem_free(cilist);
        xfs_perag_put(pag);
        return -EFSCORRUPTED;
 }
@@ -3327,7 +3337,7 @@ xfs_iflush(
        struct xfs_buf          **bpp)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_buf          *bp;
+       struct xfs_buf          *bp = NULL;
        struct xfs_dinode       *dip;
        int                     error;
 
@@ -3369,14 +3379,22 @@ xfs_iflush(
        }
 
        /*
-        * Get the buffer containing the on-disk inode.
+        * Get the buffer containing the on-disk inode. We are doing a try-lock
+        * operation here, so we may get an EAGAIN error. In that case, we
+        * simply want to return with the inode still dirty.
+        *
+        * If we get any other error, we effectively have a corruption situation
+        * and we cannot flush the inode, so we treat it the same as failing
+        * xfs_iflush_int().
         */
        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
                               0);
-       if (error || !bp) {
+       if (error == -EAGAIN) {
                xfs_ifunlock(ip);
                return error;
        }
+       if (error)
+               goto corrupt_out;
 
        /*
         * First flush out the inode that xfs_iflush was called with.
@@ -3404,7 +3422,8 @@ xfs_iflush(
        return 0;
 
 corrupt_out:
-       xfs_buf_relse(bp);
+       if (bp)
+               xfs_buf_relse(bp);
        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 cluster_corrupt_out:
        error = -EFSCORRUPTED;
index 43e1d51b15eb84ca34e978166025b74d30e5b573..e52d7c7aeb5b7773558a4d218afbb2a69cdaafd4 100644 (file)
@@ -440,6 +440,9 @@ loff_t      __xfs_seek_hole_data(struct inode *inode, loff_t start,
 
 
 /* from xfs_iops.c */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+extern void xfs_setup_iops(struct xfs_inode *ip);
+
 /*
  * When setting up a newly allocated inode, we need to call
  * xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -447,7 +450,6 @@ loff_t      __xfs_seek_hole_data(struct inode *inode, loff_t start,
  * before we've completed instantiation. Otherwise we can do it
  * the moment the inode lookup is complete.
  */
-extern void xfs_setup_inode(struct xfs_inode *ip);
 static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 {
        xfs_iflags_clear(ip, XFS_INEW);
@@ -458,6 +460,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
 {
        xfs_setup_inode(ip);
+       xfs_setup_iops(ip);
        xfs_finish_inode_setup(ip);
 }
 
index c48b5b18d771fab685e23c03613a1c6e762efcb4..a1b07612224c95ea56391474a4bb790fec07fab6 100644 (file)
@@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork(
                         */
                        data_bytes = roundup(ip->i_df.if_bytes, 4);
                        ASSERT(ip->i_df.if_real_bytes == 0 ||
-                              ip->i_df.if_real_bytes == data_bytes);
+                              ip->i_df.if_real_bytes >= data_bytes);
                        ASSERT(ip->i_df.if_u1.if_data != NULL);
                        ASSERT(ip->i_d.di_size > 0);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
@@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork(
                         */
                        data_bytes = roundup(ip->i_afp->if_bytes, 4);
                        ASSERT(ip->i_afp->if_real_bytes == 0 ||
-                              ip->i_afp->if_real_bytes == data_bytes);
+                              ip->i_afp->if_real_bytes >= data_bytes);
                        ASSERT(ip->i_afp->if_u1.if_data != NULL);
                        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
                                        ip->i_afp->if_u1.if_data,
@@ -479,6 +479,8 @@ STATIC uint
 xfs_inode_item_push(
        struct xfs_log_item     *lip,
        struct list_head        *buffer_list)
+               __releases(&lip->li_ailp->xa_lock)
+               __acquires(&lip->li_ailp->xa_lock)
 {
        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
        struct xfs_inode        *ip = iip->ili_inode;
index bcb6c19ce3ea4fea69c536343c07ea222de3155d..dbca7375deefa3f7d2499f516697ffdd28178546 100644 (file)
@@ -277,7 +277,6 @@ xfs_readlink_by_handle(
 {
        struct dentry           *dentry;
        __u32                   olen;
-       void                    *link;
        int                     error;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -288,7 +287,7 @@ xfs_readlink_by_handle(
                return PTR_ERR(dentry);
 
        /* Restrict this handle operation to symlinks only. */
-       if (!d_is_symlink(dentry)) {
+       if (!d_inode(dentry)->i_op->readlink) {
                error = -EINVAL;
                goto out_dput;
        }
@@ -298,21 +297,8 @@ xfs_readlink_by_handle(
                goto out_dput;
        }
 
-       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-       if (!link) {
-               error = -ENOMEM;
-               goto out_dput;
-       }
-
-       error = xfs_readlink(XFS_I(d_inode(dentry)), link);
-       if (error)
-               goto out_kfree;
-       error = readlink_copy(hreq->ohandle, olen, link);
-       if (error)
-               goto out_kfree;
+       error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
 
- out_kfree:
-       kfree(link);
  out_dput:
        dput(dentry);
        return error;
@@ -334,12 +320,10 @@ xfs_set_dmattrs(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
+
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
@@ -1141,10 +1125,9 @@ xfs_ioctl_setattr_get_trans(
        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_unlock;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
-               goto out_cancel;
+               return ERR_PTR(error);
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
index d81bdc080370e474f52b7fbd68c327cf576e74ea..58391355a44df71b7973ae132bb4b5adc2f77a9b 100644 (file)
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
        int             error;
        int             lockmode;
        int             bmapi_flags = XFS_BMAPI_PREALLOC;
+       uint            tflags = 0;
 
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -191,11 +192,6 @@ xfs_iomap_write_direct(
        if (error)
                return error;
 
-       /*
-        * Allocate and setup the transaction
-        */
-       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
        /*
         * For DAX, we do not allocate unwritten extents, but instead we zero
         * the block before we commit the transaction.  Ideally we'd like to do
@@ -209,23 +205,17 @@ xfs_iomap_write_direct(
         * the reserve block pool for bmbt block allocation if there is no space
         * left but we need to do unwritten extent conversion.
         */
-
        if (IS_DAX(VFS_I(ip))) {
                bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
                if (ISUNWRITTEN(imap)) {
-                       tp->t_flags |= XFS_TRANS_RESERVE;
+                       tflags |= XFS_TRANS_RESERVE;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
                }
        }
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                 resblks, resrtextents);
-       /*
-        * Check for running out of space, note: need lock to return
-        */
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
+                       tflags, &tp);
+       if (error)
                return error;
-       }
 
        lockmode = XFS_ILOCK_EXCL;
        xfs_ilock(ip, lockmode);
@@ -726,15 +716,13 @@ xfs_iomap_write_allocate(
 
                nimaps = 0;
                while (nimaps == 0) {
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-                       tp->t_flags |= XFS_TRANS_RESERVE;
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                                 nres, 0);
-                       if (error) {
-                               xfs_trans_cancel(tp);
+
+                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+                                       0, XFS_TRANS_RESERVE, &tp);
+                       if (error)
                                return error;
-                       }
+
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        xfs_trans_ijoin(tp, ip, 0);
 
@@ -878,25 +866,18 @@ xfs_iomap_write_unwritten(
 
        do {
                /*
-                * set up a transaction to convert the range of extents
+                * Set up a transaction to convert the range of extents
                 * from unwritten to real. Do allocations in a loop until
                 * we have covered the range passed in.
                 *
-                * Note that we open code the transaction allocation here
-                * to pass KM_NOFS--we can't risk to recursing back into
-                * the filesystem here as we might be asked to write out
-                * the same inode that we complete here and might deadlock
-                * on the iolock.
+                * Note that we can't risk to recursing back into the filesystem
+                * here as we might be asked to write out the same inode that we
+                * complete here and might deadlock on the iolock.
                 */
-               sb_start_intwrite(mp->m_super);
-               tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
-               tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
-                                         resblks, 0);
-               if (error) {
-                       xfs_trans_cancel(tp);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+                               XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+               if (error)
                        return error;
-               }
 
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
index fb7dc61f4a29d7cee3d4683c675c4551e7669e52..c5d4eba6972eb17327be763e7bd914c5f4da3a59 100644 (file)
@@ -181,6 +181,8 @@ xfs_generic_create(
        }
 #endif
 
+       xfs_setup_iops(ip);
+
        if (tmpfile)
                d_tmpfile(dentry, inode);
        else
@@ -368,6 +370,8 @@ xfs_vn_symlink(
        if (unlikely(error))
                goto out_cleanup_inode;
 
+       xfs_setup_iops(cip);
+
        d_instantiate(dentry, inode);
        xfs_finish_inode_setup(cip);
        return 0;
@@ -442,6 +446,16 @@ xfs_vn_get_link(
        return ERR_PTR(error);
 }
 
+STATIC const char *
+xfs_vn_get_link_inline(                        /* ->get_link of xfs_inline_symlink_inode_operations */
+       struct dentry           *dentry,        /* not referenced in this function */
+       struct inode            *inode,
+       struct delayed_call     *done)          /* not referenced in this function */
+{
+       ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);     /* i_df must carry XFS_IFINLINE */
+       return XFS_I(inode)->i_df.if_u1.if_data;        /* returns the i_df if_data pointer as-is */
+}
+
 STATIC int
 xfs_vn_getattr(
        struct vfsmount         *mnt,
@@ -599,12 +613,12 @@ xfs_setattr_nonsize(
                        return error;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
        if (error)
-               goto out_trans_cancel;
+               goto out_dqrele;
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
 
        /*
         * Change file ownership.  Must be the owner or privileged.
@@ -633,12 +647,10 @@ xfs_setattr_nonsize(
                                                NULL, capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (error)      /* out of quota */
-                               goto out_unlock;
+                               goto out_cancel;
                }
        }
 
-       xfs_trans_ijoin(tp, ip, 0);
-
        /*
         * Change file ownership.  Must be the owner or privileged.
         */
@@ -722,10 +734,9 @@ xfs_setattr_nonsize(
 
        return 0;
 
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_trans_cancel:
+out_cancel:
        xfs_trans_cancel(tp);
+out_dqrele:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        return error;
@@ -834,7 +845,7 @@ xfs_setattr_size(
         * We have to do all the page cache truncate work outside the
         * transaction context as the "lock" order is page lock->log space
         * reservation as defined by extent allocation in the writeback path.
-        * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+        * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
         * having already truncated the in-memory version of the file (i.e. made
         * user visible changes). There's not much we can do about this, except
         * to hope that the caller sees ENOMEM and retries the truncate
@@ -849,10 +860,9 @@ xfs_setattr_size(
                return error;
        truncate_setsize(inode, newsize);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error)
-               goto out_trans_cancel;
+               return error;
 
        lock_flags |= XFS_ILOCK_EXCL;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -971,12 +981,9 @@ xfs_vn_update_time(
 
        trace_xfs_update_time(ip);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (flags & S_CTIME)
@@ -1167,6 +1174,18 @@ static const struct inode_operations xfs_symlink_inode_operations = {
        .update_time            = xfs_vn_update_time,
 };
 
+static const struct inode_operations xfs_inline_symlink_inode_operations = {
+       .readlink               = generic_readlink,
+       .get_link               = xfs_vn_get_link_inline,       /* returns i_df.if_u1.if_data */
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+       .update_time            = xfs_vn_update_time,
+};     /* xfs_setup_iops() installs this table for S_IFLNK inodes with XFS_IFINLINE set */
+
 STATIC void
 xfs_diflags_to_iflags(
        struct inode            *inode,
@@ -1193,7 +1212,7 @@ xfs_diflags_to_iflags(
 }
 
 /*
- * Initialize the Linux inode and set up the operation vectors.
+ * Initialize the Linux inode.
  *
  * When reading existing inodes from disk this is called directly from xfs_iget,
  * when creating a new inode it is called from xfs_ialloc after setting up the
@@ -1232,32 +1251,12 @@ xfs_setup_inode(
        i_size_write(inode, ip->i_d.di_size);
        xfs_diflags_to_iflags(inode, ip);
 
-       ip->d_ops = ip->i_mount->m_nondir_inode_ops;
-       lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFREG:
-               inode->i_op = &xfs_inode_operations;
-               inode->i_fop = &xfs_file_operations;
-               inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       case S_IFDIR:
+       if (S_ISDIR(inode->i_mode)) {
                lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
-               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
-                       inode->i_op = &xfs_dir_ci_inode_operations;
-               else
-                       inode->i_op = &xfs_dir_inode_operations;
-               inode->i_fop = &xfs_dir_file_operations;
                ip->d_ops = ip->i_mount->m_dir_inode_ops;
-               break;
-       case S_IFLNK:
-               inode->i_op = &xfs_symlink_inode_operations;
-               if (!(ip->i_df.if_flags & XFS_IFINLINE))
-                       inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       default:
-               inode->i_op = &xfs_inode_operations;
-               init_special_inode(inode, inode->i_mode, inode->i_rdev);
-               break;
+       } else {
+               ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+               lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
        }
 
        /*
@@ -1277,3 +1276,35 @@ xfs_setup_inode(
                cache_no_acl(inode);
        }
 }
+
+void
+xfs_setup_iops(                                /* set the VFS operation vectors from the inode's file type */
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = &ip->i_vnode;  /* VFS inode embedded in the xfs_inode */
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFREG:                           /* regular file: i_op, i_fop and a_ops are all set */
+               inode->i_op = &xfs_inode_operations;
+               inode->i_fop = &xfs_file_operations;
+               inode->i_mapping->a_ops = &xfs_address_space_operations;
+               break;
+       case S_IFDIR:                           /* directory: i_op picked by xfs_sb_version_hasasciici() */
+               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+                       inode->i_op = &xfs_dir_ci_inode_operations;
+               else
+                       inode->i_op = &xfs_dir_inode_operations;
+               inode->i_fop = &xfs_dir_file_operations;
+               break;
+       case S_IFLNK:                           /* symlink: only i_op is set; XFS_IFINLINE selects the table */
+               if (ip->i_df.if_flags & XFS_IFINLINE)
+                       inode->i_op = &xfs_inline_symlink_inode_operations;
+               else
+                       inode->i_op = &xfs_symlink_inode_operations;
+               break;
+       default:                                /* every other file type */
+               inode->i_op = &xfs_inode_operations;
+               init_special_inode(inode, inode->i_mode, inode->i_rdev);
+               break;
+       }
+}
index b49ccf5c1d7564402c39671c4e67b7cf92ab8082..bde02f1fba7323a2186edcd8462df8d2b9273fc5 100644 (file)
@@ -435,8 +435,7 @@ xfs_log_reserve(
        int                     cnt,
        struct xlog_ticket      **ticp,
        __uint8_t               client,
-       bool                    permanent,
-       uint                    t_type)
+       bool                    permanent)
 {
        struct xlog             *log = mp->m_log;
        struct xlog_ticket      *tic;
@@ -456,7 +455,6 @@ xfs_log_reserve(
        if (!tic)
                return -ENOMEM;
 
-       tic->t_trans_type = t_type;
        *ticp = tic;
 
        xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
        } while (iclog != first_iclog);
 #endif
        if (! (XLOG_FORCED_SHUTDOWN(log))) {
-               error = xfs_log_reserve(mp, 600, 1, &tic,
-                                       XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+               error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
                if (!error) {
                        /* the data section must be 32 bit size aligned */
                        struct {
@@ -2032,58 +2029,8 @@ xlog_print_tic_res(
            REG_TYPE_STR(ICREATE, "inode create")
        };
 #undef REG_TYPE_STR
-#define TRANS_TYPE_STR(type)   [XFS_TRANS_##type] = #type
-       static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
-           TRANS_TYPE_STR(SETATTR_NOT_SIZE),
-           TRANS_TYPE_STR(SETATTR_SIZE),
-           TRANS_TYPE_STR(INACTIVE),
-           TRANS_TYPE_STR(CREATE),
-           TRANS_TYPE_STR(CREATE_TRUNC),
-           TRANS_TYPE_STR(TRUNCATE_FILE),
-           TRANS_TYPE_STR(REMOVE),
-           TRANS_TYPE_STR(LINK),
-           TRANS_TYPE_STR(RENAME),
-           TRANS_TYPE_STR(MKDIR),
-           TRANS_TYPE_STR(RMDIR),
-           TRANS_TYPE_STR(SYMLINK),
-           TRANS_TYPE_STR(SET_DMATTRS),
-           TRANS_TYPE_STR(GROWFS),
-           TRANS_TYPE_STR(STRAT_WRITE),
-           TRANS_TYPE_STR(DIOSTRAT),
-           TRANS_TYPE_STR(WRITEID),
-           TRANS_TYPE_STR(ADDAFORK),
-           TRANS_TYPE_STR(ATTRINVAL),
-           TRANS_TYPE_STR(ATRUNCATE),
-           TRANS_TYPE_STR(ATTR_SET),
-           TRANS_TYPE_STR(ATTR_RM),
-           TRANS_TYPE_STR(ATTR_FLAG),
-           TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
-           TRANS_TYPE_STR(SB_CHANGE),
-           TRANS_TYPE_STR(DUMMY1),
-           TRANS_TYPE_STR(DUMMY2),
-           TRANS_TYPE_STR(QM_QUOTAOFF),
-           TRANS_TYPE_STR(QM_DQALLOC),
-           TRANS_TYPE_STR(QM_SETQLIM),
-           TRANS_TYPE_STR(QM_DQCLUSTER),
-           TRANS_TYPE_STR(QM_QINOCREATE),
-           TRANS_TYPE_STR(QM_QUOTAOFF_END),
-           TRANS_TYPE_STR(FSYNC_TS),
-           TRANS_TYPE_STR(GROWFSRT_ALLOC),
-           TRANS_TYPE_STR(GROWFSRT_ZERO),
-           TRANS_TYPE_STR(GROWFSRT_FREE),
-           TRANS_TYPE_STR(SWAPEXT),
-           TRANS_TYPE_STR(CHECKPOINT),
-           TRANS_TYPE_STR(ICREATE),
-           TRANS_TYPE_STR(CREATE_TMPFILE)
-       };
-#undef TRANS_TYPE_STR
 
        xfs_warn(mp, "xlog_write: reservation summary:");
-       xfs_warn(mp, "  trans type  = %s (%u)",
-                ((ticket->t_trans_type <= 0 ||
-                  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-                 "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
-                ticket->t_trans_type);
        xfs_warn(mp, "  unit res    = %d bytes",
                 ticket->t_unit_res);
        xfs_warn(mp, "  current res = %d bytes",
@@ -3378,7 +3325,7 @@ xfs_log_force(
 {
        int     error;
 
-       trace_xfs_log_force(mp, 0);
+       trace_xfs_log_force(mp, 0, _RET_IP_);
        error = _xfs_log_force(mp, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3527,7 +3474,7 @@ xfs_log_force_lsn(
 {
        int     error;
 
-       trace_xfs_log_force(mp, lsn);
+       trace_xfs_log_force(mp, lsn, _RET_IP_);
        error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
        if (error)
                xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3709,7 +3656,6 @@ xlog_ticket_alloc(
        tic->t_tid              = prandom_u32();
        tic->t_clientid         = client;
        tic->t_flags            = XLOG_TIC_INITED;
-       tic->t_trans_type       = 0;
        if (permanent)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
 
index aa533a7d50f2186f051b09c7dcfd78b431571d15..80ba0c047090165cb6353bcf32ae560fe82ef418 100644 (file)
@@ -161,8 +161,7 @@ int   xfs_log_reserve(struct xfs_mount *mp,
                          int              count,
                          struct xlog_ticket **ticket,
                          __uint8_t        clientid,
-                         bool             permanent,
-                         uint             t_type);
+                         bool             permanent);
 int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
 int      xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
index 4e7649351f5a25ab062396818861c9d7c2ee0d61..5e54e7955ea638a7c8fbbd885080fbf0005282f7 100644 (file)
@@ -51,7 +51,6 @@ xlog_cil_ticket_alloc(
 
        tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
                                KM_SLEEP|KM_NOFS);
-       tic->t_trans_type = XFS_TRANS_CHECKPOINT;
 
        /*
         * set the current reservation to zero so we know to steal the basic
index ed8896310c00b64ff768a207a81e9f58b7836fc5..765f084759b5d5be555de8f71ec7556d55782f01 100644 (file)
@@ -175,7 +175,6 @@ typedef struct xlog_ticket {
        char               t_cnt;        /* current count                : 1  */
        char               t_clientid;   /* who does this belong to;     : 1  */
        char               t_flags;      /* properties of reservation    : 1  */
-       uint               t_trans_type; /* transaction type             : 4  */
 
         /* reservation array fields */
        uint               t_res_num;                    /* num in array : 4 */
index 396565f4324764058b979cf5e4c5bd96744f8ef8..83599784384686c2cb306bd2c2843422d5a2966c 100644 (file)
@@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans(
        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-       ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
+       ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
        memcpy(&ptr[old_len], dp, len);
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4205,10 +4205,9 @@ xlog_recover_process_efi(
                }
        }
 
-       tp = xfs_trans_alloc(mp, 0);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error)
-               goto abort_error;
+               return error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
 
        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
@@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket(
        int             offset;
        int             error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
        if (error)
-               goto out_abort;
+               goto out_error;
 
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
index cfd4210dd01500203c5c90e2fe064442fe0998ac..e39b02351b4a257e92f9e2506a8af8f31aecc688 100644 (file)
@@ -89,7 +89,6 @@ xfs_uuid_mount(
        if (hole < 0) {
                xfs_uuid_table = kmem_realloc(xfs_uuid_table,
                        (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-                       xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
                        KM_SLEEP);
                hole = xfs_uuid_table_size++;
        }
@@ -681,6 +680,9 @@ xfs_mountfs(
 
        xfs_set_maxicount(mp);
 
+       /* enable fail_at_unmount as default */
+       mp->m_fail_unmount = 1;
+
        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
        if (error)
                goto out;
@@ -690,10 +692,15 @@ xfs_mountfs(
        if (error)
                goto out_remove_sysfs;
 
-       error = xfs_uuid_mount(mp);
+       error = xfs_error_sysfs_init(mp);
        if (error)
                goto out_del_stats;
 
+
+       error = xfs_uuid_mount(mp);
+       if (error)
+               goto out_remove_error_sysfs;
+
        /*
         * Set the minimum read and write sizes
         */
@@ -957,6 +964,7 @@ xfs_mountfs(
        cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
  out_log_dealloc:
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
        xfs_log_mount_cancel(mp);
  out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -968,6 +976,8 @@ xfs_mountfs(
        xfs_da_unmount(mp);
  out_remove_uuid:
        xfs_uuid_unmount(mp);
+ out_remove_error_sysfs:
+       xfs_error_sysfs_del(mp);
  out_del_stats:
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
  out_remove_sysfs:
@@ -1005,6 +1015,14 @@ xfs_unmountfs(
         */
        xfs_log_force(mp, XFS_LOG_SYNC);
 
+       /*
+        * We now need to tell the world we are unmounting. This will allow
+        * us to detect that the filesystem is going away and we should error
+        * out anything that we have been retrying in the background. This will
+        * prevent neverending retries in AIL pushing from hanging the unmount.
+        */
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
        /*
         * Flush all pending changes from the AIL.
         */
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
 #endif
        xfs_free_perag(mp);
 
+       xfs_error_sysfs_del(mp);
        xfs_sysfs_del(&mp->m_stats.xs_kobj);
        xfs_sysfs_del(&mp->m_kobj);
 }
index eafe257b357addf83152f0c0ca53128e320abc94..c1b798c7212618462ee2130814eac8544c691bf0 100644 (file)
@@ -37,6 +37,32 @@ enum {
        XFS_LOWSP_MAX,
 };
 
+/*
+ * Error Configuration
+ *
+ * Error classes define the subsystem the configuration belongs to.
+ * Error numbers define the errors that are configurable.
+ */
+enum {
+       XFS_ERR_METADATA,
+       XFS_ERR_CLASS_MAX,      /* number of classes; first dimension of m_error_cfg[][] */
+};
+enum {
+       XFS_ERR_DEFAULT,
+       XFS_ERR_EIO,
+       XFS_ERR_ENOSPC,
+       XFS_ERR_ENODEV,
+       XFS_ERR_ERRNO_MAX,      /* number of entries; second dimension of m_error_cfg[][] */
+};
+
+#define XFS_ERR_RETRY_FOREVER  -1      /* NOTE(review): presumably a max_retries value - confirm */
+
+struct xfs_error_cfg {
+       struct xfs_kobj kobj;
+       int             max_retries;    /* NOTE(review): units/limits not shown here - confirm */
+       unsigned long   retry_timeout;  /* in jiffies, 0 = no timeout */
+};
+
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
@@ -127,6 +153,9 @@ typedef struct xfs_mount {
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
        struct xfs_kobj         m_kobj;
+       struct xfs_kobj         m_error_kobj;
+       struct xfs_kobj         m_error_meta_kobj;
+       struct xfs_error_cfg    m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
        struct xstats           m_stats;        /* per-fs stats */
 
        struct workqueue_struct *m_buf_workqueue;
@@ -148,6 +177,7 @@ typedef struct xfs_mount {
         */
        __uint32_t              m_generation;
 
+       bool                    m_fail_unmount;
 #ifdef DEBUG
        /*
         * DEBUG mode instrumentation to test and/or trigger delayed allocation
@@ -166,6 +196,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC                (1ULL << 0)     /* for nfs - all metadata ops
                                                   must be synchronous except
                                                   for space allocations */
+#define XFS_MOUNT_UNMOUNTING   (1ULL << 1)     /* filesystem is unmounting */
 #define XFS_MOUNT_WAS_CLEAN    (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN  (1ULL << 4)     /* atomic stop of all filesystem
                                                   operations, typically for
@@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
 int    xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
                        xfs_off_t count_fsb);
 
+struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
+               int error_class, int error);
+
 #endif /* __XFS_MOUNT_H__ */
index 51ddaf2c2b8c96648d70f364d45d6bb95eca3947..d5b756669fb5f43763b1b634b1348a5d52941743 100644 (file)
@@ -308,12 +308,9 @@ xfs_fs_commit_blocks(
                        goto out_drop_iolock;
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+       if (error)
                goto out_drop_iolock;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
index be125e1758c1a5e4df36cfb8ec6e3e3643adc534..a60d9e2739d14a2ebcff8ee7dcedae7f5177bf3c 100644 (file)
@@ -783,13 +783,10 @@ xfs_qm_qino_alloc(
                }
        }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
-                                 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+                       XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        if (need_alloc) {
                error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
index f4d0e0a8f517c65913b8d45f383450384576b39e..475a3882a81fef1cedaabf33383d04c1ee8d3e4b 100644 (file)
@@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
        if (error) {
-               xfs_trans_cancel(tp);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                goto out_put;
        }
@@ -436,12 +434,9 @@ xfs_qm_scall_setqlim(
        defq = xfs_get_defquota(dqp, q);
        xfs_dqunlock(dqp);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
+       if (error)
                goto out_rele;
-       }
 
        xfs_dqlock(dqp);
        xfs_trans_dqjoin(tp, dqp);
@@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end(
        int                     error;
        xfs_qoff_logitem_t      *qoffi;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        qoffi = xfs_trans_get_qoff_item(tp, startqoff,
                                        flags & XFS_ALL_QUOTA_ACCT);
@@ -603,12 +594,9 @@ xfs_qm_log_quotaoff(
 
        *qoffstartp = NULL;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+       if (error)
                goto out;
-       }
 
        qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
        xfs_trans_log_quotaoff_item(tp, qoffi);
index abf44435d04a3f4b898e21a00e45ee8ae607738a..3938b37d1043bb6fd98879fa4783b6bbec8cfef6 100644 (file)
@@ -780,15 +780,14 @@ xfs_growfs_rt_alloc(
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
                resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
                /*
                 * Reserve space & log for one extent added to the file.
                 */
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
-                                         resblks, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
+                               0, 0, &tp);
                if (error)
-                       goto out_trans_cancel;
+                       return error;
                /*
                 * Lock the inode.
                 */
@@ -823,14 +822,13 @@ xfs_growfs_rt_alloc(
                for (bno = map.br_startoff, fsbno = map.br_startblock;
                     bno < map.br_startoff + map.br_blockcount;
                     bno++, fsbno++) {
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
                        /*
                         * Reserve log for one block zeroing.
                         */
-                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
-                                                 0, 0);
+                       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
+                                       0, 0, 0, &tp);
                        if (error)
-                               goto out_trans_cancel;
+                               return error;
                        /*
                         * Lock the bitmap inode.
                         */
@@ -994,11 +992,10 @@ xfs_growfs_rt(
                /*
                 * Start a transaction, get the log reservation.
                 */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
-                                         0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
+                               &tp);
                if (error)
-                       goto error_cancel;
+                       break;
                /*
                 * Lock out other callers by grabbing the bitmap inode lock.
                 */
index 187e14b696c200bac8d78745fce25c449b6f35f6..11ea5d51db56715cf2815dbda8ab3577ee0fd686 100644 (file)
@@ -58,8 +58,7 @@
 #include <linux/parser.h>
 
 static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
+struct bio_set *xfs_ioend_bioset;
 
 static struct kset *xfs_kset;          /* top-level xfs sysfs dir */
 #ifdef DEBUG
@@ -350,6 +349,7 @@ xfs_parseargs(
                case Opt_pqnoenforce:
                        mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
                        mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+                       break;
                case Opt_gquota:
                case Opt_grpquota:
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
@@ -928,7 +928,7 @@ xfs_fs_alloc_inode(
 
 /*
  * Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
+ * the linux inode, we can inactivate and reclaim the inode.
  */
 STATIC void
 xfs_fs_destroy_inode(
@@ -938,9 +938,14 @@ xfs_fs_destroy_inode(
 
        trace_xfs_destroy_inode(ip);
 
-       XFS_STATS_INC(ip->i_mount, vn_reclaim);
+       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+       XFS_STATS_INC(ip->i_mount, vn_rele);
+       XFS_STATS_INC(ip->i_mount, vn_remove);
+
+       xfs_inactive(ip);
 
        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+       XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
        /*
         * We should never get here with one of the reclaim flags already set.
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once(
                     "xfsino", ip->i_ino);
 }
 
-STATIC void
-xfs_fs_evict_inode(
-       struct inode            *inode)
-{
-       xfs_inode_t             *ip = XFS_I(inode);
-
-       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-
-       trace_xfs_evict_inode(ip);
-
-       truncate_inode_pages_final(&inode->i_data);
-       clear_inode(inode);
-       XFS_STATS_INC(ip->i_mount, vn_rele);
-       XFS_STATS_INC(ip->i_mount, vn_remove);
-
-       xfs_inactive(ip);
-}
-
 /*
  * We do an unlocked check for XFS_IDONTCACHE here because we are already
  * serialised against cache hits here via the inode->i_lock and igrab() in
@@ -1276,6 +1263,16 @@ xfs_fs_remount(
                        return -EINVAL;
                }
 
+               if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+                   xfs_sb_has_ro_compat_feature(sbp,
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+                               (sbp->sb_features_ro_compat &
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                       return -EINVAL;
+               }
+
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
 
                /*
@@ -1558,14 +1555,12 @@ xfs_fs_fill_super(
 
        if (mp->m_flags & XFS_MOUNT_DAX) {
                xfs_warn(mp,
-       "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-               if (sb->s_blocksize != PAGE_SIZE) {
-                       xfs_alert(mp,
-               "Filesystem block size invalid for DAX Turning DAX off.");
-                       mp->m_flags &= ~XFS_MOUNT_DAX;
-               } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+               "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+
+               error = bdev_dax_supported(sb, sb->s_blocksize);
+               if (error) {
                        xfs_alert(mp,
-               "Block device does not support DAX Turning DAX off.");
+                       "DAX unsupported by block device. Turning off DAX.");
                        mp->m_flags &= ~XFS_MOUNT_DAX;
                }
        }
@@ -1663,7 +1658,6 @@ xfs_fs_free_cached_objects(
 static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
-       .evict_inode            = xfs_fs_evict_inode,
        .drop_inode             = xfs_fs_drop_inode,
        .put_super              = xfs_fs_put_super,
        .sync_fs                = xfs_fs_sync_fs,
@@ -1688,20 +1682,15 @@ MODULE_ALIAS_FS("xfs");
 STATIC int __init
 xfs_init_zones(void)
 {
-
-       xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-       if (!xfs_ioend_zone)
+       xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
+                       offsetof(struct xfs_ioend, io_inline_bio));
+       if (!xfs_ioend_bioset)
                goto out;
 
-       xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-                                                 xfs_ioend_zone);
-       if (!xfs_ioend_pool)
-               goto out_destroy_ioend_zone;
-
        xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
                                                "xfs_log_ticket");
        if (!xfs_log_ticket_zone)
-               goto out_destroy_ioend_pool;
+               goto out_free_ioend_bioset;
 
        xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
                                                "xfs_bmap_free_item");
@@ -1797,10 +1786,8 @@ xfs_init_zones(void)
        kmem_zone_destroy(xfs_bmap_free_item_zone);
  out_destroy_log_ticket_zone:
        kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
-       mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
-       kmem_zone_destroy(xfs_ioend_zone);
+ out_free_ioend_bioset:
+       bioset_free(xfs_ioend_bioset);
  out:
        return -ENOMEM;
 }
@@ -1826,9 +1813,7 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_btree_cur_zone);
        kmem_zone_destroy(xfs_bmap_free_item_zone);
        kmem_zone_destroy(xfs_log_ticket_zone);
-       mempool_destroy(xfs_ioend_pool);
-       kmem_zone_destroy(xfs_ioend_zone);
-
+       bioset_free(xfs_ioend_bioset);
 }
 
 STATIC int __init
index b44284c1adda15c647ddcc2feebf0a24302bdcbc..08a46c6181fdb698bf6b6deed28e21fa6c01ce7d 100644 (file)
@@ -131,6 +131,8 @@ xfs_readlink(
 
        trace_xfs_readlink(ip);
 
+       ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
@@ -150,12 +152,7 @@ xfs_readlink(
        }
 
 
-       if (ip->i_df.if_flags & XFS_IFINLINE) {
-               memcpy(link, ip->i_df.if_u1.if_data, pathlen);
-               link[pathlen] = '\0';
-       } else {
-               error = xfs_readlink_bmap(ip, link);
-       }
+       error = xfs_readlink_bmap(ip, link);
 
  out:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -221,7 +218,6 @@ xfs_symlink(
        if (error)
                return error;
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
        /*
         * The symlink will fit into the inode data fork?
         * There can't be any attributes so we get the whole variable part.
@@ -231,13 +227,15 @@ xfs_symlink(
        else
                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
        if (error == -ENOSPC && fs_blocks == 0) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
+               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0,
+                               &tp);
        }
        if (error)
-               goto out_trans_cancel;
+               goto out_release_inode;
 
        xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
                      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
@@ -302,19 +300,11 @@ xfs_symlink(
         * If the symlink will fit into the inode, write it inline.
         */
        if (pathlen <= XFS_IFORK_DSIZE(ip)) {
-               xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
-               memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
-               ip->i_d.di_size = pathlen;
-
-               /*
-                * The inode was initially created in extent format.
-                */
-               ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
-               ip->i_df.if_flags |= XFS_IFINLINE;
+               xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
 
+               ip->i_d.di_size = pathlen;
                ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
        } else {
                int     offset;
 
@@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt(
         */
        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp);
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+       if (error)
                return error;
-       }
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
index 6ced4f1434948d3757077c732a93d6fac88ea0cc..4c2c5508620819ea4075fe65c1b7390a47246add 100644 (file)
  */
 
 #include "xfs.h"
-#include "xfs_sysfs.h"
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_sysfs.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_stats.h"
@@ -362,3 +363,291 @@ struct kobj_type xfs_log_ktype = {
        .sysfs_ops = &xfs_sysfs_ops,
        .default_attrs = xfs_log_attrs,
 };
+
+/*
+ * Metadata IO error configuration
+ *
+ * The sysfs structure here is:
+ *     ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
+ *
+ * where <class> allows us to discriminate between data IO and metadata IO,
+ * and any other future type of IO (e.g. special inode or directory error
+ * handling) we care to support.
+ */
+/* Map a sysfs kobject to the xfs_error_cfg that embeds it as ->kobj. */
+static inline struct xfs_error_cfg *
+to_error_cfg(struct kobject *kobject)
+{
+       return container_of(to_kobj(kobject), struct xfs_error_cfg, kobj);
+}
+
+/* Map the per-mount "error" kobject to the xfs_mount that embeds it. */
+static inline struct xfs_mount *
+err_to_mp(struct kobject *kobject)
+{
+       return container_of(to_kobj(kobject), struct xfs_mount, m_error_kobj);
+}
+
+/* sysfs read: print this error config's retry limit as a decimal line. */
+static ssize_t
+max_retries_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n",
+                       to_error_cfg(kobject)->max_retries);
+}
+
+/*
+ * sysfs write: parse an integer from @buf and store it as the retry limit.
+ * Parse errors are passed through; values below -1 give -EINVAL.
+ */
+static ssize_t
+max_retries_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       int             retries;
+       int             error = kstrtoint(buf, 0, &retries);
+
+       if (error)
+               return error;
+       if (retries < -1)
+               return -EINVAL;
+       to_error_cfg(kobject)->max_retries = retries;
+       return count;
+}
+XFS_SYSFS_ATTR_RW(max_retries);
+
+/* sysfs read: report the retry timeout (kept in jiffies) in whole seconds. */
+static ssize_t
+retry_timeout_seconds_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%ld\n",
+               jiffies_to_msecs(to_error_cfg(kobject)->retry_timeout) /
+                               MSEC_PER_SEC);
+}
+
+/*
+ * sysfs write: parse a timeout in seconds from @buf and store it converted
+ * to jiffies. Accepts 0..86400; anything else gives -EINVAL.
+ */
+static ssize_t
+retry_timeout_seconds_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       int             seconds;
+       int             error = kstrtoint(buf, 0, &seconds);
+
+       if (error)
+               return error;
+       if (seconds < 0 || seconds > 86400)     /* 1 day timeout maximum */
+               return -EINVAL;
+       to_error_cfg(kobject)->retry_timeout =
+                               msecs_to_jiffies(seconds * MSEC_PER_SEC);
+       return count;
+}
+XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
+
+/* sysfs read: print the mount's m_fail_unmount value as a decimal line. */
+static ssize_t
+fail_at_unmount_show(
+       struct kobject  *kobject,
+       char            *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n",
+                       err_to_mp(kobject)->m_fail_unmount);
+}
+
+/*
+ * sysfs write: parse an integer from @buf and store it in the mount's
+ * m_fail_unmount. Only 0 and 1 are accepted; anything else gives -EINVAL.
+ */
+static ssize_t
+fail_at_unmount_store(
+       struct kobject  *kobject,
+       const char      *buf,
+       size_t          count)
+{
+       int             enable;
+       int             error = kstrtoint(buf, 0, &enable);
+
+       if (error)
+               return error;
+       if (enable != 0 && enable != 1)
+               return -EINVAL;
+       err_to_mp(kobject)->m_fail_unmount = enable;
+       return count;
+}
+XFS_SYSFS_ATTR_RW(fail_at_unmount);
+
+static struct attribute *xfs_error_attrs[] = {  /* per-errno config files */
+       ATTR_LIST(max_retries),
+       ATTR_LIST(retry_timeout_seconds),
+       NULL,                           /* end of list */
+};
+
+/* kobject type of each per-errno config directory; carries xfs_error_attrs. */
+struct kobj_type xfs_error_cfg_ktype = {
+       .release = xfs_sysfs_release,
+       .sysfs_ops = &xfs_sysfs_ops,
+       .default_attrs = xfs_error_attrs,
+};
+
+struct kobj_type xfs_error_ktype = {   /* "error" and class dirs: no default attrs */
+       .release = xfs_sysfs_release,
+       .sysfs_ops = &xfs_sysfs_ops,
+};
+
+/*
+ * Error initialization tables. Entries must appear in the same order as
+ * the enum values used to index the array. Every class init table must
+ * define a "default" behaviour as its first entry; all other entries can
+ * be empty.
+ */
+struct xfs_error_init {
+       char            *name;          /* sysfs directory name of the entry */
+       int             max_retries;    /* initial value of the retry limit */
+       int             retry_timeout;  /* in seconds */
+};
+
+static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
+       { .name = "default",            /* errnos without an entry of their own */
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "EIO",
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "ENOSPC",
+         .max_retries = XFS_ERR_RETRY_FOREVER,
+         .retry_timeout = 0,
+       },
+       { .name = "ENODEV",
+         .max_retries = 0,             /* .retry_timeout is implicitly zero */
+       },
+};
+
+/*
+ * Create one error class directory plus a config directory per errno slot,
+ * seeded from @init; on failure, remove whatever was created here.
+ */
+static int
+xfs_error_sysfs_init_class(
+       struct xfs_mount        *mp,
+       int                     class,
+       const char              *parent_name,
+       struct xfs_kobj         *parent_kobj,
+       const struct xfs_error_init init[])
+{
+       struct xfs_error_cfg    *cfgs;
+       int                     error;
+       int                     i;
+
+       ASSERT(class < XFS_ERR_CLASS_MAX);
+       cfgs = mp->m_error_cfg[class];
+       error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
+                               &mp->m_error_kobj, parent_name);
+       if (error)
+               return error;
+
+       for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
+               error = xfs_sysfs_init(&cfgs[i].kobj, &xfs_error_cfg_ktype,
+                                       parent_kobj, init[i].name);
+               if (error)
+                       break;
+               cfgs[i].max_retries = init[i].max_retries;
+               cfgs[i].retry_timeout = msecs_to_jiffies(
+                                       init[i].retry_timeout * MSEC_PER_SEC);
+       }
+       if (!error)
+               return 0;
+
+       /* slot i failed: unwind slots [0, i), then the class directory */
+       while (--i >= 0)
+               xfs_sysfs_del(&cfgs[i].kobj);
+       xfs_sysfs_del(parent_kobj);
+       return error;
+}
+
+/*
+ * Build the per-mount error configuration tree in sysfs: the "error"
+ * directory, its fail_at_unmount file and the "metadata" class below it.
+ * Returns 0 on success; on failure the "error" directory is removed.
+ */
+int
+xfs_error_sysfs_init(
+       struct xfs_mount        *mp)
+{
+       int                     error;
+
+       /* .../xfs/<dev>/error/ */
+       error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
+                               &mp->m_kobj, "error");
+       if (error)
+               return error;
+
+       /* .../xfs/<dev>/error/fail_at_unmount */
+       error = sysfs_create_file(&mp->m_error_kobj.kobject,
+                                 ATTR_LIST(fail_at_unmount));
+       /* .../xfs/<dev>/error/metadata/ */
+       if (!error)
+               error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
+                                       "metadata", &mp->m_error_meta_kobj,
+                                       xfs_error_meta_init);
+       if (!error)
+               return 0;
+
+       xfs_sysfs_del(&mp->m_error_kobj);
+       return error;
+}
+
+/*
+ * Tear down the error configuration tree: every per-errno directory of
+ * every class first, then the metadata class and the "error" directory.
+ */
+void
+xfs_error_sysfs_del(
+       struct xfs_mount        *mp)
+{
+       int                     class, slot;
+
+       for (class = 0; class < XFS_ERR_CLASS_MAX; class++)
+               for (slot = 0; slot < XFS_ERR_ERRNO_MAX; slot++)
+                       xfs_sysfs_del(&mp->m_error_cfg[class][slot].kobj);
+
+       xfs_sysfs_del(&mp->m_error_meta_kobj);
+       xfs_sysfs_del(&mp->m_error_kobj);
+}
+
+/*
+ * Return the retry configuration for @error within @error_class. Both the
+ * positive and the negated (kernel-style) errno select the same entry;
+ * any other value falls back to the XFS_ERR_DEFAULT entry of the class.
+ */
+struct xfs_error_cfg *
+xfs_error_get_cfg(
+       struct xfs_mount        *mp,
+       int                     error_class,
+       int                     error)
+{
+       switch (error) {
+       case EIO:
+       case -EIO:
+               return &mp->m_error_cfg[error_class][XFS_ERR_EIO];
+       case ENOSPC:
+       case -ENOSPC:
+               return &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC];
+       case ENODEV:
+       case -ENODEV:
+               return &mp->m_error_cfg[error_class][XFS_ERR_ENODEV];
+       default:
+               return &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT];
+       }
+}
index be692e59938db7e8cba4ca4186b1a8f90928887c..d04637181ef21709715d6320bf0569e3d715740a 100644 (file)
@@ -58,4 +58,7 @@ xfs_sysfs_del(
        wait_for_completion(&kobj->complete);
 }
 
+int    xfs_error_sysfs_init(struct xfs_mount *mp);
+void   xfs_error_sysfs_del(struct xfs_mount *mp);
+
 #endif /* __XFS_SYSFS_H__ */
index c8d58426008ed7ef49096097904ed13653a8cfe9..ea94ee0fe5ea2b8e9e089b857d82a30057e0b950 100644 (file)
@@ -364,7 +364,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
 DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
@@ -944,7 +943,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        TP_ARGS(log, tic),
        TP_STRUCT__entry(
                __field(dev_t, dev)
-               __field(unsigned, trans_type)
                __field(char, ocnt)
                __field(char, cnt)
                __field(int, curr_res)
@@ -962,7 +960,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
        ),
        TP_fast_assign(
                __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->trans_type = tic->t_trans_type;
                __entry->ocnt = tic->t_ocnt;
                __entry->cnt = tic->t_cnt;
                __entry->curr_res = tic->t_curr_res;
@@ -980,14 +977,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_block = log->l_curr_block;
                __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
        ),
-       TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+       TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
                  "t_unit_res %u t_flags %s reserveq %s "
                  "writeq %s grant_reserve_cycle %d "
                  "grant_reserve_bytes %d grant_write_cycle %d "
                  "grant_write_bytes %d curr_cycle %d curr_block %d "
                  "tail_cycle %d tail_block %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
                  __entry->ocnt,
                  __entry->cnt,
                  __entry->curr_res,
@@ -1053,19 +1049,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
 )
 
 TRACE_EVENT(xfs_log_force,
-       TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
-       TP_ARGS(mp, lsn),
+       TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip),
+       TP_ARGS(mp, lsn, caller_ip),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_lsn_t, lsn)
+               __field(unsigned long, caller_ip)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->lsn = lsn;
+               __entry->caller_ip = caller_ip;
        ),
-       TP_printk("dev %d:%d lsn 0x%llx",
+       TP_printk("dev %d:%d lsn 0x%llx caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->lsn)
+                 __entry->lsn, (void *)__entry->caller_ip)
 )
 
 #define DEFINE_LOG_ITEM_EVENT(name) \
index 20c53666cb4b3272400bc7111d285f104e555d75..5f3d33d16e6706b9db55cb3919c35feea29408ca 100644 (file)
@@ -46,47 +46,6 @@ xfs_trans_init(
        xfs_trans_resv_calc(mp, M_RES(mp));
 }
 
-/*
- * This routine is called to allocate a transaction structure.
- * The type parameter indicates the type of the transaction.  These
- * are enumerated in xfs_trans.h.
- *
- * Dynamically allocate the transaction structure from the transaction
- * zone, initialize it, and return it to the caller.
- */
-xfs_trans_t *
-xfs_trans_alloc(
-       xfs_mount_t     *mp,
-       uint            type)
-{
-       xfs_trans_t     *tp;
-
-       sb_start_intwrite(mp->m_super);
-       tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
-       tp->t_flags |= XFS_TRANS_FREEZE_PROT;
-       return tp;
-}
-
-xfs_trans_t *
-_xfs_trans_alloc(
-       xfs_mount_t     *mp,
-       uint            type,
-       xfs_km_flags_t  memflags)
-{
-       xfs_trans_t     *tp;
-
-       WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
-       atomic_inc(&mp->m_active_trans);
-
-       tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
-       tp->t_type = type;
-       tp->t_mountp = mp;
-       INIT_LIST_HEAD(&tp->t_items);
-       INIT_LIST_HEAD(&tp->t_busy);
-       return tp;
-}
-
 /*
  * Free the transaction structure.  If there is more clean up
  * to do when the structure is freed, add it here.
@@ -99,7 +58,7 @@ xfs_trans_free(
        xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
        atomic_dec(&tp->t_mountp->m_active_trans);
-       if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+       if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
                sb_end_intwrite(tp->t_mountp->m_super);
        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
@@ -125,7 +84,6 @@ xfs_trans_dup(
         * Initialize the new transaction structure.
         */
        ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
-       ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
        INIT_LIST_HEAD(&ntp->t_items);
        INIT_LIST_HEAD(&ntp->t_busy);
@@ -135,9 +93,9 @@ xfs_trans_dup(
 
        ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
                       (tp->t_flags & XFS_TRANS_RESERVE) |
-                      (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+                      (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
        /* We gave our writer reference to the new transaction */
-       tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+       tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
        ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
        ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
        tp->t_blk_res = tp->t_blk_res_used;
@@ -165,7 +123,7 @@ xfs_trans_dup(
  * This does not do quota reservations. That typically is done by the
  * caller afterwards.
  */
-int
+static int
 xfs_trans_reserve(
        struct xfs_trans        *tp,
        struct xfs_trans_res    *resp,
@@ -219,7 +177,7 @@ xfs_trans_reserve(
                                                resp->tr_logres,
                                                resp->tr_logcount,
                                                &tp->t_ticket, XFS_TRANSACTION,
-                                               permanent, tp->t_type);
+                                               permanent);
                }
 
                if (error)
@@ -268,6 +226,42 @@ undo_blocks:
        return error;
 }
 
+/*
+ * Allocate a transaction and reserve its resources; on success it is
+ * returned via @tpp, on failure it is cancelled and the error returned.
+ */
+int
+xfs_trans_alloc(
+       struct xfs_mount        *mp,
+       struct xfs_trans_res    *resp,
+       uint                    blocks,
+       uint                    rtextents,
+       uint                    flags,
+       struct xfs_trans        **tpp)
+{
+       struct xfs_trans        *tp;
+       int                     error;
+
+       if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+               sb_start_intwrite(mp->m_super);
+       WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+       atomic_inc(&mp->m_active_trans);
+
+       tp = kmem_zone_zalloc(xfs_trans_zone,
+               (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+       tp->t_flags = flags;
+       tp->t_mountp = mp;
+       INIT_LIST_HEAD(&tp->t_items);
+       INIT_LIST_HEAD(&tp->t_busy);
+       error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+       if (error)
+               xfs_trans_cancel(tp);
+       else
+               *tpp = tp;
+       return error;
+}
+
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
index e7c49cf43fbc85c183e1728966d4f4506b94eaf6..9a462e892e4f33f9f50508b6ba8f33bfb348a420 100644 (file)
@@ -90,7 +90,6 @@ void  xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
  */
 typedef struct xfs_trans {
        unsigned int            t_magic;        /* magic number */
-       unsigned int            t_type;         /* transaction type */
        unsigned int            t_log_res;      /* amt of log space resvd */
        unsigned int            t_log_count;    /* count for perm log res */
        unsigned int            t_blk_res;      /* # of blocks resvd */
@@ -148,10 +147,9 @@ typedef struct xfs_trans {
 /*
  * XFS transaction mechanism exported interfaces.
  */
-xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
-                                 uint, uint);
+int            xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
+                       uint blocks, uint rtextents, uint flags,
+                       struct xfs_trans **tpp);
 void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 
 struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
index d111f691f313fdc980a9d9e9f6519d9bd30e06a0..ec58ff094b1dfbfcd634f5aad4b0fae2da8cc086 100644 (file)
@@ -146,7 +146,7 @@ __xfs_xattr_put_listent(
        arraytop = context->count + prefix_len + namelen + 1;
        if (arraytop > context->firstu) {
                context->count = -1;    /* insufficient space */
-               return 1;
+               return 0;
        }
        offset = (char *)context->alist + context->count;
        strncpy(offset, prefix, prefix_len);
@@ -166,8 +166,7 @@ xfs_xattr_put_listent(
        int             flags,
        unsigned char   *name,
        int             namelen,
-       int             valuelen,
-       unsigned char   *value)
+       int             valuelen)
 {
        char *prefix;
        int prefix_len;
@@ -221,11 +220,15 @@ xfs_xattr_put_listent(
 }
 
 ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+xfs_vn_listxattr(
+       struct dentry   *dentry,
+       char            *data,
+       size_t          size)
 {
        struct xfs_attr_list_context context;
        struct attrlist_cursor_kern cursor = { 0 };
-       struct inode            *inode = d_inode(dentry);
+       struct inode    *inode = d_inode(dentry);
+       int             error;
 
        /*
         * First read the regular on-disk attributes.
@@ -239,7 +242,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
        context.firstu = context.bufsize;
        context.put_listent = xfs_xattr_put_listent;
 
-       xfs_attr_list_int(&context);
+       error = xfs_attr_list_int(&context);
+       if (error)
+               return error;
        if (context.count < 0)
                return -ERANGE;
 
index 5d8ffa3e6f8c8a4e3715f3da4294abc860390386..c1cde3577551817c9f5581b98f331406e8c61f83 100644 (file)
@@ -7,10 +7,10 @@
 
 static __always_inline int preempt_count(void)
 {
-       return current_thread_info()->preempt_count;
+       return READ_ONCE(current_thread_info()->preempt_count);
 }
 
-static __always_inline int *preempt_count_ptr(void)
+static __always_inline volatile int *preempt_count_ptr(void)
 {
        return &current_thread_info()->preempt_count;
 }
index 85aaf66690f9cc896087a9460a2f1bb265cd2eca..729ab9fc325e2a5d728d929fdc8b3332d4867eef 100644 (file)
@@ -9,5 +9,6 @@
 #define TEGRA124_SOCTHERM_SENSOR_MEM 1
 #define TEGRA124_SOCTHERM_SENSOR_GPU 2
 #define TEGRA124_SOCTHERM_SENSOR_PLLX 3
+#define TEGRA124_SOCTHERM_SENSOR_NUM 4
 
 #endif
index b651aed9dc6b9084ed2dcae8832a033217d4dca4..dda39d8fa189b50d897c7c59bbba5b1f8bdab649 100644 (file)
@@ -24,9 +24,6 @@
 #include <linux/workqueue.h>
 
 struct arch_timer_kvm {
-       /* Is the timer enabled */
-       bool                    enabled;
-
        /* Virtual offset */
        cycle_t                 cntvoff;
 };
@@ -53,15 +50,15 @@ struct arch_timer_cpu {
        /* Timer IRQ */
        struct kvm_irq_level            irq;
 
-       /* VGIC mapping */
-       struct irq_phys_map             *map;
-
        /* Active IRQ state caching */
        bool                            active_cleared_last;
+
+       /* Is the timer enabled */
+       bool                    enabled;
 };
 
 int kvm_timer_hyp_init(void);
-void kvm_timer_enable(struct kvm *kvm);
+int kvm_timer_enable(struct kvm_vcpu *vcpu);
 void kvm_timer_init(struct kvm *kvm);
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
                         const struct kvm_irq_level *irq);
index be6037aa703dfe245199780660b007b94a9a36d5..da0a524802cbf4c4656a5fec7822c323ee415206 100644 (file)
 #ifndef __ASM_ARM_KVM_VGIC_H
 #define __ASM_ARM_KVM_VGIC_H
 
+#ifdef CONFIG_KVM_NEW_VGIC
+#include <kvm/vgic/vgic.h>
+#else
+
 #include <linux/kernel.h>
 #include <linux/kvm.h>
 #include <linux/irqreturn.h>
@@ -158,7 +162,6 @@ struct vgic_io_device {
 struct irq_phys_map {
        u32                     virt_irq;
        u32                     phys_irq;
-       u32                     irq;
 };
 
 struct irq_phys_map_entry {
@@ -305,9 +308,6 @@ struct vgic_cpu {
        unsigned long   *active_shared;
        unsigned long   *pend_act_shared;
 
-       /* Number of list registers on this CPU */
-       int             nr_lr;
-
        /* CPU vif control registers for world switch */
        union {
                struct vgic_v2_cpu_if   vgic_v2;
@@ -342,17 +342,18 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
 int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              struct irq_phys_map *map, bool level);
+                              unsigned int virt_irq, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
-                                          int virt_irq, int irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
 #define vgic_ready(k)          ((k)->arch.vgic.ready)
+#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
+                                ((i) < (k)->arch.vgic.nr_irqs))
 
 int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
                  const struct vgic_ops **ops,
@@ -370,4 +371,5 @@ static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
 }
 #endif
 
+#endif /* old VGIC include */
 #endif
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
new file mode 100644 (file)
index 0000000..3fbd175
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
+#define __ASM_ARM_KVM_VGIC_VGIC_H
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/irqreturn.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <kvm/iodev.h>
+
+#define VGIC_V3_MAX_CPUS       255
+#define VGIC_V2_MAX_CPUS       8
+#define VGIC_NR_IRQS_LEGACY     256
+#define VGIC_NR_SGIS           16
+#define VGIC_NR_PPIS           16
+#define VGIC_NR_PRIVATE_IRQS   (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE       (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI           1019
+#define VGIC_MAX_RESERVED      1023
+#define VGIC_MIN_LPI           8192
+
+enum vgic_type {
+       VGIC_V2,                /* Good ol' GICv2 */
+       VGIC_V3,                /* New fancy GICv3 */
+};
+
+/* Same for all guests, as it depends only on the _host's_ GIC model */
+struct vgic_global {
+       /* type of the host GIC */
+       enum vgic_type          type;
+
+       /* Physical address of vgic virtual cpu interface */
+       phys_addr_t             vcpu_base;
+
+       /* virtual control interface mapping */
+       void __iomem            *vctrl_base;
+
+       /* Number of implemented list registers */
+       int                     nr_lr;
+
+       /* Maintenance IRQ number */
+       unsigned int            maint_irq;
+
+       /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+       int                     max_gic_vcpus;
+
+       /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+       bool                    can_emulate_gicv2;
+};
+
+extern struct vgic_global kvm_vgic_global_state;
+
+#define VGIC_V2_MAX_LRS                (1 << 6)
+#define VGIC_V3_MAX_LRS                16
+#define VGIC_V3_LR_INDEX(lr)   (VGIC_V3_MAX_LRS - 1 - (lr))
+
+enum vgic_irq_config {
+       VGIC_CONFIG_EDGE = 0,
+       VGIC_CONFIG_LEVEL
+};
+
+struct vgic_irq {
+       spinlock_t irq_lock;            /* Protects the content of the struct */
+       struct list_head ap_list;
+
+       struct kvm_vcpu *vcpu;          /* SGIs and PPIs: The VCPU
+                                        * SPIs and LPIs: The VCPU whose ap_list
+                                        * this is queued on.
+                                        */
+
+       struct kvm_vcpu *target_vcpu;   /* The VCPU that this interrupt should
+                                        * be sent to, as a result of the
+                                        * targets reg (v2) or the
+                                        * affinity reg (v3).
+                                        */
+
+       u32 intid;                      /* Guest visible INTID */
+       bool pending;
+       bool line_level;                /* Level only */
+       bool soft_pending;              /* Level only */
+       bool active;                    /* not used for LPIs */
+       bool enabled;
+       bool hw;                        /* Tied to HW IRQ */
+       u32 hwintid;                    /* HW INTID number */
+       union {
+               u8 targets;                     /* GICv2 target VCPUs mask */
+               u32 mpidr;                      /* GICv3 target VCPU */
+       };
+       u8 source;                      /* GICv2 SGIs only */
+       u8 priority;
+       enum vgic_irq_config config;    /* Level or edge */
+};
+
+struct vgic_register_region;
+
+struct vgic_io_device {
+       gpa_t base_addr;
+       struct kvm_vcpu *redist_vcpu;
+       const struct vgic_register_region *regions;
+       int nr_regions;
+       struct kvm_io_device dev;
+};
+
+struct vgic_dist {
+       bool                    in_kernel;
+       bool                    ready;
+       bool                    initialized;
+
+       /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
+       u32                     vgic_model;
+
+       int                     nr_spis;
+
+       /* TODO: Consider moving to global state */
+       /* Virtual control interface mapping */
+       void __iomem            *vctrl_base;
+
+       /* base addresses in guest physical address space: */
+       gpa_t                   vgic_dist_base;         /* distributor */
+       union {
+               /* either a GICv2 CPU interface */
+               gpa_t                   vgic_cpu_base;
+               /* or a number of GICv3 redistributor regions */
+               gpa_t                   vgic_redist_base;
+       };
+
+       /* distributor enabled */
+       bool                    enabled;
+
+       struct vgic_irq         *spis;
+
+       struct vgic_io_device   dist_iodev;
+       struct vgic_io_device   *redist_iodevs;
+};
+
+struct vgic_v2_cpu_if {
+       u32             vgic_hcr;
+       u32             vgic_vmcr;
+       u32             vgic_misr;      /* Saved only */
+       u64             vgic_eisr;      /* Saved only */
+       u64             vgic_elrsr;     /* Saved only */
+       u32             vgic_apr;
+       u32             vgic_lr[VGIC_V2_MAX_LRS];
+};
+
+struct vgic_v3_cpu_if {
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       u32             vgic_hcr;
+       u32             vgic_vmcr;
+       u32             vgic_sre;       /* Restored only, change ignored */
+       u32             vgic_misr;      /* Saved only */
+       u32             vgic_eisr;      /* Saved only */
+       u32             vgic_elrsr;     /* Saved only */
+       u32             vgic_ap0r[4];
+       u32             vgic_ap1r[4];
+       u64             vgic_lr[VGIC_V3_MAX_LRS];
+#endif
+};
+
+struct vgic_cpu {
+       /* CPU vif control registers for world switch */
+       union {
+               struct vgic_v2_cpu_if   vgic_v2;
+               struct vgic_v3_cpu_if   vgic_v3;
+       };
+
+       unsigned int used_lrs;
+       struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
+
+       spinlock_t ap_list_lock;        /* Protects the ap_list */
+
+       /*
+        * List of IRQs that this VCPU should consider because they are either
+        * Active or Pending (hence the name; AP list), or because they recently
+        * were one of the two and need to be migrated off this list to another
+        * VCPU.
+        */
+       struct list_head ap_list_head;
+
+       u64 live_lrs;
+};
+
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
+void kvm_vgic_early_init(struct kvm *kvm);
+int kvm_vgic_create(struct kvm *kvm, u32 type);
+void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                       bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
+#define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
+#define vgic_initialized(k)    ((k)->arch.vgic.initialized)
+#define vgic_ready(k)          ((k)->arch.vgic.ready)
+#define vgic_valid_spi(k, i)   (((i) >= VGIC_NR_PRIVATE_IRQS) && \
+                       ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
+#else
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
+{
+}
+#endif
+
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum number of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+       return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
index 846513c73606bd96997015eb115f55fc59e3e39d..a5ac2cad5cb77424ff699429c006e34a0c7da784 100644 (file)
@@ -587,7 +587,6 @@ struct mtd_info;
 
 struct bcma_sflash {
        bool present;
-       u32 window;
        u32 blocksize;
        u16 numblocks;
        u32 size;
index 1fd8fdff2f813305fd7d4adb37a6d716b59aecc5..3d9cf326574fbb423de0c02617cddb464d6b4156 100644 (file)
@@ -768,6 +768,17 @@ static inline void rq_flush_dcache_pages(struct request *rq)
 }
 #endif
 
+#ifdef CONFIG_PRINTK
+#define vfs_msg(sb, level, fmt, ...)                           \
+       __vfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#else
+#define vfs_msg(sb, level, fmt, ...)                           \
+do {                                                           \
+       no_printk(fmt, ##__VA_ARGS__);                          \
+       __vfs_msg(sb, "", " ");                                 \
+} while (0)
+#endif
+
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
@@ -1660,7 +1671,7 @@ struct block_device_operations {
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        long (*direct_access)(struct block_device *, sector_t, void __pmem **,
-                       pfn_t *);
+                       pfn_t *, long);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1680,6 +1691,8 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
 extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
+extern int bdev_dax_supported(struct super_block *, int);
+extern bool bdev_dax_capable(struct block_device *);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
index b827e066e55a198ec13f41d52b52964bd72edf4d..146507df8650c0a5af41c2eea40705ffcda8424a 100644 (file)
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
        return ceph_frag_make(newbits,
                         ceph_frag_value(f) | (i << (24 - newbits)));
 }
-static inline int ceph_frag_is_leftmost(__u32 f)
+static inline bool ceph_frag_is_leftmost(__u32 f)
 {
        return ceph_frag_value(f) == 0;
 }
-static inline int ceph_frag_is_rightmost(__u32 f)
+static inline bool ceph_frag_is_rightmost(__u32 f)
 {
        return ceph_frag_value(f) == ceph_frag_mask(f);
 }
index 37f28bf55ce426bac3a8a533101f4afed21be53f..dfce616002ad97337d2c9f0ad9cff3d3a62485e2 100644 (file)
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
 
 /* watch-notify operations */
 enum {
-  WATCH_NOTIFY                         = 1, /* notifying watcher */
-  WATCH_NOTIFY_COMPLETE                        = 2, /* notifier notified when done */
+       CEPH_WATCH_EVENT_NOTIFY           = 1, /* notifying watcher */
+       CEPH_WATCH_EVENT_NOTIFY_COMPLETE  = 2, /* notifier notified when done */
+       CEPH_WATCH_EVENT_DISCONNECT       = 3, /* we were disconnected */
 };
 
 
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
        struct ceph_fsid fsid;
 } __attribute__ ((packed));
 
+#define CEPH_FS_CLUSTER_ID_NONE  (-1)
+
 /*
  * mdsmap flags
  */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
 #define CEPH_XATTR_REPLACE (1 << 1)
 #define CEPH_XATTR_REMOVE  (1 << 31)
 
+/*
+ * readdir request flags.
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS    (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END          (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE     (1<<8)
+#define CEPH_READDIR_HASH_ORDER                (1<<9)
+
 union ceph_mds_request_args {
        struct {
                __le32 mask;                 /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
                __le32 frag;                 /* which dir fragment */
                __le32 max_entries;          /* how many dentries to grab */
                __le32 max_bytes;
+               __le16 flags;
        } __attribute__ ((packed)) readdir;
        struct {
                __le32 mode;
index a6ef9cc267ec2cfd3940bab1d8e4f7030e31ef7b..19e9932f3e77194eae3f9be57b66fdfa74afbf51 100644 (file)
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 /*
  * bounds check input.
  */
-static inline int ceph_has_room(void **p, void *end, size_t n)
+static inline bool ceph_has_room(void **p, void *end, size_t n)
 {
        return end >= *p && n <= end - *p;
 }
index db92a8d4926eed9f9bf155564c274b1552477e4b..690985daad1c6e9ed6d18b7515522f96ef3411c6 100644 (file)
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
                (off >> PAGE_SHIFT);
 }
 
+/*
+ * These are not meant to be generic - an integer key is assumed.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)            \
+static void insert_##name(struct rb_root *root, type *t)               \
+{                                                                      \
+       struct rb_node **n = &root->rb_node;                            \
+       struct rb_node *parent = NULL;                                  \
+                                                                       \
+       BUG_ON(!RB_EMPTY_NODE(&t->nodefld));                            \
+                                                                       \
+       while (*n) {                                                    \
+               type *cur = rb_entry(*n, type, nodefld);                \
+                                                                       \
+               parent = *n;                                            \
+               if (t->keyfld < cur->keyfld)                            \
+                       n = &(*n)->rb_left;                             \
+               else if (t->keyfld > cur->keyfld)                       \
+                       n = &(*n)->rb_right;                            \
+               else                                                    \
+                       BUG();                                          \
+       }                                                               \
+                                                                       \
+       rb_link_node(&t->nodefld, parent, n);                           \
+       rb_insert_color(&t->nodefld, root);                             \
+}                                                                      \
+static void erase_##name(struct rb_root *root, type *t)                        \
+{                                                                      \
+       BUG_ON(RB_EMPTY_NODE(&t->nodefld));                             \
+       rb_erase(&t->nodefld, root);                                    \
+       RB_CLEAR_NODE(&t->nodefld);                                     \
+}
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)             \
+static type *lookup_##name(struct rb_root *root,                       \
+                          typeof(((type *)0)->keyfld) key)             \
+{                                                                      \
+       struct rb_node *n = root->rb_node;                              \
+                                                                       \
+       while (n) {                                                     \
+               type *cur = rb_entry(n, type, nodefld);                 \
+                                                                       \
+               if (key < cur->keyfld)                                  \
+                       n = n->rb_left;                                 \
+               else if (key > cur->keyfld)                             \
+                       n = n->rb_right;                                \
+               else                                                    \
+                       return cur;                                     \
+       }                                                               \
+                                                                       \
+       return NULL;                                                    \
+}
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)                   \
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)                    \
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
+
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
 extern struct kmem_cache *ceph_cap_flush_cachep;
index e230e7ed60d369f202020e6c955f50c644b89dfa..e2a92df08b47d2f00d986c3547accf4eeb53cf6c 100644 (file)
@@ -39,20 +39,31 @@ struct ceph_mon_request {
        ceph_monc_request_func_t do_request;
 };
 
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
 /*
  * ceph_mon_generic_request is being used for the statfs and
  * mon_get_version requests which are being done a bit differently
  * because we need to get data back to the caller
  */
 struct ceph_mon_generic_request {
+       struct ceph_mon_client *monc;
        struct kref kref;
        u64 tid;
        struct rb_node node;
        int result;
-       void *buf;
+
        struct completion completion;
+       ceph_monc_callback_t complete_cb;
+       u64 private_data;          /* r_tid/linger_id */
+
        struct ceph_msg *request;  /* original request */
        struct ceph_msg *reply;    /* and reply */
+
+       union {
+               struct ceph_statfs *st;
+               u64 newest;
+       } u;
 };
 
 struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
 
        /* pending generic requests */
        struct rb_root generic_request_tree;
-       int num_generic_requests;
        u64 last_tid;
 
        /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
                bool want;
                u32 have; /* epoch */
        } subs[3];
+       int fs_cluster_id; /* "mdsmap.<id>" sub */
 
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
 bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
                        bool continuous);
 void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
 
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
                                 unsigned long timeout);
 
 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
                               struct ceph_statfs *buf);
 
-extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
-                                   const char *what, u64 *newest);
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                         u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+                               ceph_monc_callback_t cb, u64 private_data);
 
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);
 
index cbf460927c424b26c76665edca16337eb62ef9a9..19b14862d3e0d9afdc2a524e940f62944428ca89 100644 (file)
@@ -20,10 +20,11 @@ struct ceph_osd_client;
 /*
  * completion callback for async writepages
  */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-                                    struct ceph_msg *);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
+#define CEPH_HOMELESS_OSD      (-1)
+
 /* a given osd we're communicating with */
 struct ceph_osd {
        atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
        int o_incarnation;
        struct rb_node o_node;
        struct ceph_connection o_con;
-       struct list_head o_requests;
-       struct list_head o_linger_requests;
+       struct rb_root o_requests;
+       struct rb_root o_linger_requests;
        struct list_head o_osd_lru;
        struct ceph_auth_handshake o_auth;
        unsigned long lru_ttl;
-       int o_marked_for_keepalive;
        struct list_head o_keepalive_item;
+       struct mutex lock;
 };
 
-
 #define CEPH_OSD_SLAB_OPS      2
 #define CEPH_OSD_MAX_OPS       16
 
@@ -104,15 +104,21 @@ struct ceph_osd_req_op {
                        struct ceph_osd_data response_data;
                        __u8 class_len;
                        __u8 method_len;
-                       __u8 argc;
+                       u32 indata_len;
                } cls;
                struct {
                        u64 cookie;
-                       u64 ver;
-                       u32 prot_ver;
-                       u32 timeout;
-                       __u8 flag;
+                       __u8 op;           /* CEPH_OSD_WATCH_OP_ */
+                       u32 gen;
                } watch;
+               struct {
+                       struct ceph_osd_data request_data;
+               } notify_ack;
+               struct {
+                       u64 cookie;
+                       struct ceph_osd_data request_data;
+                       struct ceph_osd_data response_data;
+               } notify;
                struct {
                        u64 expected_object_size;
                        u64 expected_write_size;
@@ -120,60 +126,73 @@ struct ceph_osd_req_op {
        };
 };
 
+struct ceph_osd_request_target {
+       struct ceph_object_id base_oid;
+       struct ceph_object_locator base_oloc;
+       struct ceph_object_id target_oid;
+       struct ceph_object_locator target_oloc;
+
+       struct ceph_pg pgid;
+       u32 pg_num;
+       u32 pg_num_mask;
+       struct ceph_osds acting;
+       struct ceph_osds up;
+       int size;
+       int min_size;
+       bool sort_bitwise;
+
+       unsigned int flags;                /* CEPH_OSD_FLAG_* */
+       bool paused;
+
+       int osd;
+};
+
 /* an in-flight request */
 struct ceph_osd_request {
        u64             r_tid;              /* unique for this client */
        struct rb_node  r_node;
-       struct list_head r_req_lru_item;
-       struct list_head r_osd_item;
-       struct list_head r_linger_item;
-       struct list_head r_linger_osd_item;
+       struct rb_node  r_mc_node;          /* map check */
        struct ceph_osd *r_osd;
-       struct ceph_pg   r_pgid;
-       int              r_pg_osds[CEPH_PG_MAX_SIZE];
-       int              r_num_pg_osds;
+
+       struct ceph_osd_request_target r_t;
+#define r_base_oid     r_t.base_oid
+#define r_base_oloc    r_t.base_oloc
+#define r_flags                r_t.flags
 
        struct ceph_msg  *r_request, *r_reply;
-       int               r_flags;     /* any additional flags for the osd */
        u32               r_sent;      /* >0 if r_request is sending/sent */
 
        /* request osd ops array  */
        unsigned int            r_num_ops;
 
-       /* these are updated on each send */
-       __le32           *r_request_osdmap_epoch;
-       __le32           *r_request_flags;
-       __le64           *r_request_pool;
-       void             *r_request_pgid;
-       __le32           *r_request_attempts;
-       bool              r_paused;
-       struct ceph_eversion *r_request_reassert_version;
-
        int               r_result;
-       int               r_got_reply;
-       int               r_linger;
+       bool              r_got_reply;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
        bool              r_mempool;
-       struct completion r_completion, r_safe_completion;
+       struct completion r_completion;
+       struct completion r_safe_completion;  /* fsync waiter */
        ceph_osdc_callback_t r_callback;
        ceph_osdc_unsafe_callback_t r_unsafe_callback;
-       struct ceph_eversion r_reassert_version;
        struct list_head  r_unsafe_item;
 
        struct inode *r_inode;                /* for use by callbacks */
        void *r_priv;                         /* ditto */
 
-       struct ceph_object_locator r_base_oloc;
-       struct ceph_object_id r_base_oid;
-       struct ceph_object_locator r_target_oloc;
-       struct ceph_object_id r_target_oid;
-
-       u64               r_snapid;
-       unsigned long     r_stamp;            /* send OR check time */
+       /* set by submitter */
+       u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
+       struct ceph_snap_context *r_snapc;    /* for writes */
+       struct timespec r_mtime;              /* ditto */
+       u64 r_data_offset;                    /* ditto */
+       bool r_linger;                        /* don't resend on failure */
 
-       struct ceph_snap_context *r_snapc;    /* snap context for writes */
+       /* internal */
+       unsigned long r_stamp;                /* jiffies, send or check time */
+       int r_attempts;
+       struct ceph_eversion r_replay_version; /* aka reassert_version */
+       u32 r_last_force_resend;
+       u32 r_map_dne_bound;
 
        struct ceph_osd_req_op r_ops[];
 };
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
        struct ceph_object_locator oloc;
 };
 
-struct ceph_osd_event {
-       u64 cookie;
-       int one_shot;
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+                                u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
        struct ceph_osd_client *osdc;
-       void (*cb)(u64, u64, u8, void *);
-       void *data;
-       struct rb_node node;
-       struct list_head osd_node;
+       u64 linger_id;
+       bool committed;
+       bool is_watch;                  /* watch or notify */
+
+       struct ceph_osd *osd;
+       struct ceph_osd_request *reg_req;
+       struct ceph_osd_request *ping_req;
+       unsigned long ping_sent;
+       unsigned long watch_valid_thru;
+       struct list_head pending_lworks;
+
+       struct ceph_osd_request_target t;
+       u32 last_force_resend;
+       u32 map_dne_bound;
+
+       struct timespec mtime;
+
        struct kref kref;
-};
+       struct mutex lock;
+       struct rb_node node;            /* osd */
+       struct rb_node osdc_node;       /* osdc */
+       struct rb_node mc_node;         /* map check */
+       struct list_head scan_item;
+
+       struct completion reg_commit_wait;
+       struct completion notify_finish_wait;
+       int reg_commit_error;
+       int notify_finish_error;
+       int last_error;
+
+       u32 register_gen;
+       u64 notify_id;
+
+       rados_watchcb2_t wcb;
+       rados_watcherrcb_t errcb;
+       void *data;
 
-struct ceph_osd_event_work {
-       struct work_struct work;
-       struct ceph_osd_event *event;
-        u64 ver;
-        u64 notify_id;
-        u8 opcode;
+       struct page ***preply_pages;
+       size_t *preply_len;
 };
 
 struct ceph_osd_client {
        struct ceph_client     *client;
 
        struct ceph_osdmap     *osdmap;       /* current map */
-       struct rw_semaphore    map_sem;
-       struct completion      map_waiters;
-       u64                    last_requested_map;
+       struct rw_semaphore    lock;
 
-       struct mutex           request_mutex;
        struct rb_root         osds;          /* osds */
        struct list_head       osd_lru;       /* idle osds */
-       u64                    timeout_tid;   /* tid of timeout triggering rq */
-       u64                    last_tid;      /* tid of last request */
-       struct rb_root         requests;      /* pending requests */
-       struct list_head       req_lru;       /* in-flight lru */
-       struct list_head       req_unsent;    /* unsent/need-resend queue */
-       struct list_head       req_notarget;  /* map to no osd */
-       struct list_head       req_linger;    /* lingering requests */
-       int                    num_requests;
+       spinlock_t             osd_lru_lock;
+       struct ceph_osd        homeless_osd;
+       atomic64_t             last_tid;      /* tid of last request */
+       u64                    last_linger_id;
+       struct rb_root         linger_requests; /* lingering requests */
+       struct rb_root         map_checks;
+       struct rb_root         linger_map_checks;
+       atomic_t               num_requests;
+       atomic_t               num_homeless;
        struct delayed_work    timeout_work;
        struct delayed_work    osds_timeout_work;
 #ifdef CONFIG_DEBUG_FS
@@ -231,10 +276,6 @@ struct ceph_osd_client {
        struct ceph_msgpool     msgpool_op;
        struct ceph_msgpool     msgpool_op_reply;
 
-       spinlock_t              event_lock;
-       struct rb_root          event_tree;
-       u64                     event_count;
-
        struct workqueue_struct *notify_wq;
 };
 
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
 extern struct ceph_osd_data *osd_req_op_extent_osd_data(
                                        struct ceph_osd_request *osd_req,
                                        unsigned int which);
-extern struct ceph_osd_data *osd_req_op_cls_response_data(
-                                       struct ceph_osd_request *osd_req,
-                                       unsigned int which);
 
 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
                                        unsigned int which,
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
                                 u16 opcode, const char *name, const void *value,
                                 size_t size, u8 cmp_op, u8 cmp_mode);
-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-                                       unsigned int which, u16 opcode,
-                                       u64 cookie, u64 version, int flag);
 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
                                       unsigned int which,
                                       u64 expected_object_size,
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
                                               unsigned int num_ops,
                                               bool use_mempool,
                                               gfp_t gfp_flags);
-
-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-                                   struct ceph_snap_context *snapc,
-                                   u64 snap_id,
-                                   struct timespec *mtime);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
 
 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
                                      struct ceph_file_layout *layout,
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
                                      u32 truncate_seq, u64 truncate_size,
                                      bool use_mempool);
 
-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-                                        struct ceph_osd_request *req);
-
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
 
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                               struct ceph_vino vino,
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
                                struct timespec *mtime,
                                struct page **pages, int nr_pages);
 
-/* watch/notify events */
-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-                                 void (*event_cb)(u64, u64, u8, void *),
-                                 void *data, struct ceph_osd_event **pevent);
-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+               struct ceph_object_id *oid,
+               struct ceph_object_locator *oloc,
+               rados_watchcb2_t wcb,
+               rados_watcherrcb_t errcb,
+               void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+                     struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+                        struct ceph_object_id *oid,
+                        struct ceph_object_locator *oloc,
+                        u64 notify_id,
+                        u64 cookie,
+                        void *payload,
+                        size_t payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+                    struct ceph_object_id *oid,
+                    struct ceph_object_locator *oloc,
+                    void *payload,
+                    size_t payload_len,
+                    u32 timeout,
+                    struct page ***preply_pages,
+                    size_t *preply_len);
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+                         struct ceph_osd_linger_request *lreq);
 #endif
 
index e55c08bc3a960d0b5f0664e5c46ac9cd969ec97d..ddc426b22d8159811a9514f01e09d6758a1cf04c 100644 (file)
@@ -24,21 +24,29 @@ struct ceph_pg {
        uint32_t seed;
 };
 
-#define CEPH_POOL_FLAG_HASHPSPOOL  1
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+
+#define CEPH_POOL_FLAG_HASHPSPOOL      (1ULL << 0) /* hash pg seed and pool id
+                                                      together */
+#define CEPH_POOL_FLAG_FULL            (1ULL << 1) /* pool is full */
 
 struct ceph_pg_pool_info {
        struct rb_node node;
        s64 id;
-       u8 type;
+       u8 type; /* CEPH_POOL_TYPE_* */
        u8 size;
+       u8 min_size;
        u8 crush_ruleset;
        u8 object_hash;
+       u32 last_force_request_resend;
        u32 pg_num, pgp_num;
        int pg_num_mask, pgp_num_mask;
        s64 read_tier;
        s64 write_tier; /* wins for read+write ops */
-       u64 flags;
+       u64 flags; /* CEPH_POOL_FLAG_* */
        char *name;
+
+       bool was_full;  /* for handle_one_map() */
 };
 
 static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
        s64 pool;
 };
 
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+       oloc->pool = -1;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+       return oloc->pool == -1;
+}
+
+static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
+                                 const struct ceph_object_locator *src)
+{
+       dest->pool = src->pool;
+}
+
 /*
  * Maximum supported by kernel client object name length
  *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
  */
 #define CEPH_MAX_OID_NAME_LEN 100
 
+/*
+ * 51-char inline_name is long enough for all cephfs requests and all but
+ * one rbd request: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around.  It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
 struct ceph_object_id {
-       char name[CEPH_MAX_OID_NAME_LEN];
+       char *name;
+       char inline_name[CEPH_OID_INLINE_LEN];
        int name_len;
 };
 
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+       oid->name = oid->inline_name;
+       oid->name_len = 0;
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+       return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+                  const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+                    const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
 struct ceph_pg_mapping {
        struct rb_node node;
        struct ceph_pg pgid;
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
 struct ceph_osdmap {
        struct ceph_fsid fsid;
        u32 epoch;
-       u32 mkfs_epoch;
        struct ceph_timespec created, modified;
 
        u32 flags;         /* CEPH_OSDMAP_* */
@@ -113,43 +172,19 @@ struct ceph_osdmap {
        int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
 };
 
-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
-                                    const char *name)
-{
-       int len;
-
-       len = strlen(name);
-       if (len > sizeof(oid->name)) {
-               WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
-                    name, len, sizeof(oid->name));
-               len = sizeof(oid->name);
-       }
-
-       memcpy(oid->name, name, len);
-       oid->name_len = len;
-}
-
-static inline void ceph_oid_copy(struct ceph_object_id *dest,
-                                struct ceph_object_id *src)
-{
-       BUG_ON(src->name_len > sizeof(dest->name));
-       memcpy(dest->name, src->name, src->name_len);
-       dest->name_len = src->name_len;
-}
-
-static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
 {
        return osd >= 0 && osd < map->max_osd &&
               (map->osd_state[osd] & CEPH_OSD_EXISTS);
 }
 
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
 {
        return ceph_osd_exists(map, osd) &&
               (map->osd_state[osd] & CEPH_OSD_UP);
 }
 
-static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
 {
        return !ceph_osd_is_up(map, osd);
 }
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
        return 0;
 }
 
+struct ceph_osdmap *ceph_osdmap_alloc(void);
 extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                           struct ceph_osdmap *map,
-                                           struct ceph_messenger *msgr);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+                                            struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
+struct ceph_osds {
+       int osds[CEPH_PG_MAX_SIZE];
+       int size;
+       int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+       set->size = 0;
+       set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change);
+
 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
                                         u64 off, u64 len,
                                         u64 *bno, u64 *oxoff, u64 *oxlen);
 
-/* calculate mapping of object to a placement group */
-extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-                              struct ceph_object_locator *oloc,
-                              struct ceph_object_id *oid,
-                              struct ceph_pg *pg_out);
-
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
-                              struct ceph_pg pgid,
-                              int *osds, int *primary);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-                               struct ceph_pg pgid);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+                             struct ceph_object_id *oid,
+                             struct ceph_object_locator *oloc,
+                             struct ceph_pg *raw_pgid);
+
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+                              const struct ceph_pg *raw_pgid,
+                              struct ceph_osds *up,
+                              struct ceph_osds *acting);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+                             const struct ceph_pg *raw_pgid);
 
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
                                                    u64 id);
index 2f822dca10463b796d6a2e5fa3b9bf25e361bf31..5c0da61cb763124c651e3287f4ecaf69f8cdfbc8 100644 (file)
@@ -114,8 +114,8 @@ struct ceph_object_layout {
  * compound epoch+version, used by storage layer to serialize mutations
  */
 struct ceph_eversion {
-       __le32 epoch;
        __le64 version;
+       __le32 epoch;
 } __attribute__ ((packed));
 
 /*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
 #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
 #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB  (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
 
 /*
  * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
        CEPH_OSD_FLAG_SKIPRWLOCKS =   0x10000,  /* skip rw locks */
        CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
        CEPH_OSD_FLAG_FLUSH =         0x40000,  /* this is part of flush */
+       CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000,  /* map snap direct to clone id */
+       CEPH_OSD_FLAG_ENFORCE_SNAPC   = 0x100000,  /* use snapc provided even if
+                                                     pool uses pool snaps */
+       CEPH_OSD_FLAG_REDIRECTED   = 0x200000,  /* op has been redirected */
+       CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000,  /* redirect bit is authoritative */
+       CEPH_OSD_FLAG_FULL_TRY =    0x800000,  /* try op despite full flag */
+       CEPH_OSD_FLAG_FULL_FORCE = 0x1000000,  /* force op despite full flag */
 };
 
 enum {
@@ -415,7 +427,17 @@ enum {
        CEPH_OSD_CMPXATTR_MODE_U64    = 2
 };
 
-#define RADOS_NOTIFY_VER       1
+enum {
+       CEPH_OSD_WATCH_OP_UNWATCH = 0,
+       CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+       /* note: use only ODD ids to prevent pre-giant code from
+          interpreting the op as UNWATCH */
+       CEPH_OSD_WATCH_OP_WATCH = 3,
+       CEPH_OSD_WATCH_OP_RECONNECT = 5,
+       CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
 
 /*
  * an individual object operation.  each may be accompanied by some data
@@ -450,9 +472,13 @@ struct ceph_osd_op {
                } __attribute__ ((packed)) snap;
                struct {
                        __le64 cookie;
-                       __le64 ver;
-                       __u8 flag;      /* 0 = unwatch, 1 = watch */
+                       __le64 ver;     /* no longer used */
+                       __u8 op;        /* CEPH_OSD_WATCH_OP_* */
+                       __le32 gen;     /* registration generation */
                } __attribute__ ((packed)) watch;
+               struct {
+                       __le64 cookie;
+               } __attribute__ ((packed)) notify;
                struct {
                        __le64 offset, length;
                        __le64 src_offset;
index 982a6c4a62f3643e12ae6182e37544acd2b0c05f..43d5f0b799c7664c74593b5fd1ee36e836ac8861 100644 (file)
@@ -3,45 +3,62 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/* We use the lowest available exceptional entry bit for locking */
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+
 ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
                  get_block_t, dio_iodone_t, int flags);
-int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+                                  pgoff_t index, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
+int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+               unsigned int offset, unsigned int length);
 #else
 static inline struct page *read_dax_sector(struct block_device *bdev,
                sector_t n)
 {
        return ERR_PTR(-ENXIO);
 }
+/* Shouldn't ever be called when dax is disabled. */
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+                                           pgoff_t index)
+{
+       BUG();
+}
+static inline int __dax_zero_page_range(struct block_device *bdev,
+               sector_t sector, unsigned int offset, unsigned int length)
+{
+       return -ENXIO;
+}
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                               unsigned int flags, get_block_t, dax_iodone_t);
+                               unsigned int flags, get_block_t);
 int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
-                               unsigned int flags, get_block_t, dax_iodone_t);
+                               unsigned int flags, get_block_t);
 #else
 static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-                               pmd_t *pmd, unsigned int flags, get_block_t gb,
-                               dax_iodone_t di)
+                               pmd_t *pmd, unsigned int flags, get_block_t gb)
 {
        return VM_FAULT_FALLBACK;
 }
 #define __dax_pmd_fault dax_pmd_fault
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb)    __dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
index 89627b9187f945b960a7e3612aad170bbe01b13e..7ce9fb1b7d28f3472be97621e624a4448ead577f 100644 (file)
@@ -28,5 +28,6 @@
 #define EBADTYPE       527     /* Type not supported by server */
 #define EJUKEBOX       528     /* Request initiated, but will not complete before timeout */
 #define EIOCBQUEUED    529     /* iocb queued, will get completion event */
+#define ERECALLCONFLICT        530     /* conflict with recalled state */
 
 #endif
index 96e45ea463e71e19575d1db97da22a0eb72d05c8..2f9ccbe6a6390a4f2b2faea3cac96fd49ce48e1c 100644 (file)
@@ -38,7 +38,7 @@ extern struct module __this_module;
 
 #ifdef CONFIG_MODULES
 
-#ifndef __GENKSYMS__
+#if defined(__KERNEL__) && !defined(__GENKSYMS__)
 #ifdef CONFIG_MODVERSIONS
 /* Mark the CRC weak since genksyms apparently decides not to
  * generate a checksums for some symbols */
@@ -53,7 +53,7 @@ extern struct module __this_module;
 #endif
 
 /* For every exported symbol, place a struct in the __ksymtab section */
-#define __EXPORT_SYMBOL(sym, sec)                              \
+#define ___EXPORT_SYMBOL(sym, sec)                             \
        extern typeof(sym) sym;                                 \
        __CRC_SYMBOL(sym, sec)                                  \
        static const char __kstrtab_##sym[]                     \
@@ -65,6 +65,35 @@ extern struct module __this_module;
        __attribute__((section("___ksymtab" sec "+" #sym), unused))     \
        = { (unsigned long)&sym, __kstrtab_##sym }
 
+#if defined(__KSYM_DEPS__)
+
+/*
+ * For fine grained build dependencies, we want to tell the build system
+ * about each possible exported symbol even if it is not actually exported.
+ * We use a string pattern that is unlikely to be valid code, which the build
+ * system filters out of the preprocessor output (see ksym_dep_filter
+ * in scripts/Kbuild.include).
+ */
+#define __EXPORT_SYMBOL(sym, sec)      === __KSYM_##sym ===
+
+#elif defined(CONFIG_TRIM_UNUSED_KSYMS)
+
+#include <linux/kconfig.h>
+#include <generated/autoksyms.h>
+
+#define __EXPORT_SYMBOL(sym, sec)                              \
+       __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym))
+#define __cond_export_sym(sym, sec, conf)                      \
+       ___cond_export_sym(sym, sec, conf)
+#define ___cond_export_sym(sym, sec, enabled)                  \
+       __cond_export_sym_##enabled(sym, sec)
+#define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec)
+#define __cond_export_sym_0(sym, sec) /* nothing */
+
+#else
+#define __EXPORT_SYMBOL ___EXPORT_SYMBOL
+#endif
+
 #define EXPORT_SYMBOL(sym)                                     \
        __EXPORT_SYMBOL(sym, "")
 
index 5f61431d8673951af6da1bbba1fa443c04de56ce..9ace7f745bcdeef15b7b3e7beed2ed8b18bb991e 100644 (file)
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
 typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC               0x00000001
 #define MAY_WRITE              0x00000002
index 0023088b253b4d928669f9841a1f08dbeaf5307a..3f9778cbc79d0b1bb3b73e0a12695b722fa7a179 100644 (file)
 #define FSL_IFC_VERSION_MASK   0x0F0F0000
 #define FSL_IFC_VERSION_1_0_0  0x01000000
 #define FSL_IFC_VERSION_1_1_0  0x01010000
+#define FSL_IFC_VERSION_2_0_0  0x02000000
+
+#define PGOFFSET_64K   (64*1024)
+#define PGOFFSET_4K    (4*1024)
 
 /*
  * CSPR - Chip Select Property Register
@@ -723,20 +727,26 @@ struct fsl_ifc_nand {
        __be32 nand_evter_en;
        u32 res17[0x2];
        __be32 nand_evter_intr_en;
-       u32 res18[0x2];
+       __be32 nand_vol_addr_stat;
+       u32 res18;
        __be32 nand_erattr0;
        __be32 nand_erattr1;
        u32 res19[0x10];
        __be32 nand_fsr;
-       u32 res20;
-       __be32 nand_eccstat[4];
-       u32 res21[0x20];
+       u32 res20[0x3];
+       __be32 nand_eccstat[6];
+       u32 res21[0x1c];
        __be32 nanndcr;
        u32 res22[0x2];
        __be32 nand_autoboot_trgr;
        u32 res23;
        __be32 nand_mdr;
-       u32 res24[0x5C];
+       u32 res24[0x1C];
+       __be32 nand_dll_lowcfg0;
+       __be32 nand_dll_lowcfg1;
+       u32 res25;
+       __be32 nand_dll_lowstat;
+       u32 res26[0x3c];
 };
 
 /*
@@ -771,13 +781,12 @@ struct fsl_ifc_gpcm {
        __be32 gpcm_erattr1;
        __be32 gpcm_erattr2;
        __be32 gpcm_stat;
-       u32 res4[0x1F3];
 };
 
 /*
  * IFC Controller Registers
  */
-struct fsl_ifc_regs {
+struct fsl_ifc_global {
        __be32 ifc_rev;
        u32 res1[0x2];
        struct {
@@ -803,21 +812,26 @@ struct fsl_ifc_regs {
        } ftim_cs[FSL_IFC_BANK_COUNT];
        u32 res9[0x30];
        __be32 rb_stat;
-       u32 res10[0x2];
+       __be32 rb_map;
+       __be32 wb_map;
        __be32 ifc_gcr;
-       u32 res11[0x2];
+       u32 res10[0x2];
        __be32 cm_evter_stat;
-       u32 res12[0x2];
+       u32 res11[0x2];
        __be32 cm_evter_en;
-       u32 res13[0x2];
+       u32 res12[0x2];
        __be32 cm_evter_intr_en;
-       u32 res14[0x2];
+       u32 res13[0x2];
        __be32 cm_erattr0;
        __be32 cm_erattr1;
-       u32 res15[0x2];
+       u32 res14[0x2];
        __be32 ifc_ccr;
        __be32 ifc_csr;
-       u32 res16[0x2EB];
+       __be32 ddr_ccr_low;
+};
+
+
+struct fsl_ifc_runtime {
        struct fsl_ifc_nand ifc_nand;
        struct fsl_ifc_nor ifc_nor;
        struct fsl_ifc_gpcm ifc_gpcm;
@@ -831,7 +845,8 @@ extern int fsl_ifc_find(phys_addr_t addr_base);
 struct fsl_ifc_ctrl {
        /* device info */
        struct device                   *dev;
-       struct fsl_ifc_regs __iomem     *regs;
+       struct fsl_ifc_global __iomem   *gregs;
+       struct fsl_ifc_runtime __iomem  *rregs;
        int                             irq;
        int                             nand_irq;
        spinlock_t                      lock;
index 92f7177db2ce869a29db8813911c3a8a0c2b86b2..f27bb2c62fca5cf19a74f3f03c64b8cb9036bd19 100644 (file)
 /* iova structure */
 struct iova {
        struct rb_node  node;
-       unsigned long   pfn_hi; /* IOMMU dish out addr hi */
-       unsigned long   pfn_lo; /* IOMMU dish out addr lo */
+       unsigned long   pfn_hi; /* Highest allocated pfn */
+       unsigned long   pfn_lo; /* Lowest allocated pfn */
+};
+
+struct iova_magazine;
+struct iova_cpu_rcache;
+
+#define IOVA_RANGE_CACHE_MAX_SIZE 6    /* log of max cached IOVA range size (in pages) */
+#define MAX_GLOBAL_MAGS 32     /* magazines per bin */
+
+struct iova_rcache {
+       spinlock_t lock;
+       unsigned long depot_size;
+       struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+       struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
 /* holds all the iova translations for a domain */
@@ -31,6 +44,7 @@ struct iova_domain {
        unsigned long   granule;        /* pfn granularity for this domain */
        unsigned long   start_pfn;      /* Lower limit for this domain */
        unsigned long   dma_32bit_pfn;
+       struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];  /* IOVA range caches */
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -78,6 +92,10 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
        unsigned long limit_pfn,
        bool size_aligned);
+void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
+                   unsigned long size);
+unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+                             unsigned long limit_pfn);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
        unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
@@ -87,5 +105,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
        struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi);
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 
 #endif
index 9e6fdd33bdb2c15d3936fb198fd98cf26b2828cf..bfbd707de390df9ea6734406d7ac7fb94c26e335 100644 (file)
 #define ICH_LR_ACTIVE_BIT              (1ULL << 63)
 #define ICH_LR_PHYS_ID_SHIFT           32
 #define ICH_LR_PHYS_ID_MASK            (0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
+#define ICH_LR_PRIORITY_SHIFT          48
+
+/* These are for GICv2 emulation only */
+#define GICH_LR_VIRTUALID              (0x3ffUL << 0)
+#define GICH_LR_PHYSID_CPUID_SHIFT     (10)
+#define GICH_LR_PHYSID_CPUID           (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
 
 #define ICH_MISR_EOI                   (1 << 0)
 #define ICH_MISR_U                     (1 << 1)
index 9c940263ca230f8329bd4eec997d41ffc0d514b4..fd051855539b9ee2f871c1dbe9691af29bfb83d2 100644 (file)
@@ -33,6 +33,7 @@
 
 #define GIC_DIST_CTRL                  0x000
 #define GIC_DIST_CTR                   0x004
+#define GIC_DIST_IIDR                  0x008
 #define GIC_DIST_IGROUP                        0x080
 #define GIC_DIST_ENABLE_SET            0x100
 #define GIC_DIST_ENABLE_CLEAR          0x180
@@ -76,6 +77,7 @@
 #define GICH_LR_VIRTUALID              (0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT     (10)
 #define GICH_LR_PHYSID_CPUID           (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PRIORITY_SHIFT         23
 #define GICH_LR_STATE                  (3 << 28)
 #define GICH_LR_PENDING_BIT            (1 << 28)
 #define GICH_LR_ACTIVE_BIT             (1 << 29)
index fd1083c46c61f0f2163287f4b873968d4778aa07..efb232c5f66867851053da799c83428e576ebb57 100644 (file)
@@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 
 /* Flags in jbd_inode->i_flags */
 #define __JI_COMMIT_RUNNING 0
-/* Commit of the inode data in progress. We use this flag to protect us from
+#define __JI_WRITE_DATA 1
+#define __JI_WAIT_DATA 2
+
+/*
+ * Commit of the inode data in progress. We use this flag to protect us from
  * concurrent deletion of inode. We cannot use reference to inode for this
  * since we cannot afford doing last iput() on behalf of kjournald
  */
 #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
+/* Write allocated dirty buffers in this inode before commit */
+#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
+/* Wait for outstanding data writes for this inode before commit */
+#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
 
 /**
  * struct jbd_inode is the structure linking inodes in ordered mode
@@ -781,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end)
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *     number that will fit in j_blocksize
  * @j_last_sync_writer: most recent pid which did a synchronous write
- * @j_history: Buffer storing the transactions statistics history
- * @j_history_max: Maximum number of transactions in the statistics history
- * @j_history_cur: Current number of transactions in the statistics history
  * @j_history_lock: Protect the transactions statistics history
  * @j_proc_entry: procfs entry for the jbd statistics directory
  * @j_stats: Overall statistics
@@ -1270,7 +1275,8 @@ extern int           jbd2_journal_clear_err  (journal_t *);
 extern int        jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int        jbd2_journal_force_commit(journal_t *);
 extern int        jbd2_journal_force_commit_nested(journal_t *);
-extern int        jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int        jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
+extern int        jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
 extern int        jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                struct jbd2_inode *inode, loff_t new_size);
 extern void       jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
index b1fa8f11c95bac4b547d6b5d7ab2f159c28c8631..1c9c973a7dd9aef4d10abbed638147a539f5dd5b 100644 (file)
@@ -412,6 +412,8 @@ struct kvm {
 #endif
        long tlbs_dirty;
        struct list_head devices;
+       struct dentry *debugfs_dentry;
+       struct kvm_stat_data **debugfs_stat_data;
 };
 
 #define kvm_err(fmt, ...) \
@@ -991,6 +993,11 @@ enum kvm_stat_kind {
        KVM_STAT_VCPU,
 };
 
+struct kvm_stat_data {
+       int offset;
+       struct kvm *kvm;
+};
+
 struct kvm_stats_debugfs_item {
        const char *name;
        int offset;
index 2835d598d2581144a775dc9dada7b698a5c20703..a00ec816233ae8247cb8a409cb7a5546fe231e0e 100644 (file)
@@ -303,6 +303,12 @@ struct vm_fault {
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
+       void *entry;                    /* ->fault handler can alternatively
+                                        * return a locked DAX entry. In that
+                                        * case the handler should return
+                                        * VM_FAULT_DAX_LOCKED and fill in
+                                        * entry here.
+                                        */
        /* for ->map_pages() only */
        pgoff_t max_pgoff;              /* map pages for offset from pgoff till
                                         * max_pgoff inclusive */
@@ -1076,6 +1082,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
+#define VM_FAULT_DAX_LOCKED 0x1000     /* ->fault has locked DAX entry */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
index d553855503e6a8ecf80aad21486aefa44b577770..ca3e517980a0a2d3d58c6566350456f828e4446f 100644 (file)
@@ -514,7 +514,9 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
 #endif
+#ifdef CONFIG_MMU
        struct work_struct async_put_work;
+#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
index 85800b48241fad5599f20793831b3e8e1dd7178f..45cde8cd39f2434f5f9ec02c702a91a7f6b971c9 100644 (file)
@@ -329,6 +329,7 @@ struct mmc_host {
        unsigned int            can_retune:1;   /* re-tuning can be used */
        unsigned int            doing_retune:1; /* re-tuning in progress */
        unsigned int            retune_now:1;   /* do re-tuning at next req */
+       unsigned int            retune_paused:1; /* re-tuning is temporarily disabled */
 
        int                     rescan_disable; /* disable card detection */
        int                     rescan_entered; /* used with nonremovable devices */
@@ -526,4 +527,7 @@ static inline void mmc_retune_recheck(struct mmc_host *host)
                host->retune_now = 1;
 }
 
+void mmc_retune_pause(struct mmc_host *host);
+void mmc_retune_unpause(struct mmc_host *host);
+
 #endif /* LINUX_MMC_HOST_H */
index c8be32e9fc49507702d187a33893f8f3870a38ef..ad3c3488073c7fa77f1fac02927ad1f7c5a9a0dc 100644 (file)
 
 #define FSMC_BUSY_WAIT_TIMEOUT (1 * HZ)
 
-/*
- * There are 13 bytes of ecc for every 512 byte block in FSMC version 8
- * and it has to be read consecutively and immediately after the 512
- * byte data block for hardware to generate the error bit offsets
- * Managing the ecc bytes in the following way is easier. This way is
- * similar to oobfree structure maintained already in u-boot nand driver
- */
-#define MAX_ECCPLACE_ENTRIES   32
-
-struct fsmc_nand_eccplace {
-       uint8_t offset;
-       uint8_t length;
-};
-
-struct fsmc_eccplace {
-       struct fsmc_nand_eccplace eccplace[MAX_ECCPLACE_ENTRIES];
-};
-
 struct fsmc_nand_timings {
        uint8_t tclr;
        uint8_t tar;
index 5e0eb7ccabd4213a948338c7a86198fa1f9429eb..3aa56e3104bb747da8fedd70da2d3570cfa54103 100644 (file)
 #endif
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_32
-# ifdef map_bankwidth
-#  undef map_bankwidth
-#  define map_bankwidth(map) ((map)->bankwidth)
-#  undef map_bankwidth_is_large
-#  define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
-#  undef map_words
-#  define map_words(map) map_calc_words(map)
-# else
-#  define map_bankwidth(map) 32
-#  define map_bankwidth_is_large(map) (1)
-#  define map_words(map) map_calc_words(map)
-# endif
+/* always use indirect access for 256-bit to preserve kernel stack */
+# undef map_bankwidth
+# define map_bankwidth(map) ((map)->bankwidth)
+# undef map_bankwidth_is_large
+# define map_bankwidth_is_large(map) (map_bankwidth(map) > BITS_PER_LONG/8)
+# undef map_words
+# define map_words(map) map_calc_words(map)
 #define map_bankwidth_is_32(map) (map_bankwidth(map) == 32)
 #undef MAX_MAP_BANKWIDTH
 #define MAX_MAP_BANKWIDTH 32
index ef9fea4fc40080dd5832d70405bbd87e2a605339..29a1706122035b74a52e77bfa3a1f6b60f115ee2 100644 (file)
@@ -96,16 +96,35 @@ struct mtd_oob_ops {
 
 #define MTD_MAX_OOBFREE_ENTRIES_LARGE  32
 #define MTD_MAX_ECCPOS_ENTRIES_LARGE   640
+/**
+ * struct mtd_oob_region - oob region definition
+ * @offset: region offset
+ * @length: region length
+ *
+ * This structure describes a region of the OOB area, and is used
+ * to retrieve ECC or free bytes sections.
+ * Each section is defined by an offset within the OOB area and a
+ * length.
+ */
+struct mtd_oob_region {
+       u32 offset;
+       u32 length;
+};
+
 /*
- * Internal ECC layout control structure. For historical reasons, there is a
- * similar, smaller struct nand_ecclayout_user (in mtd-abi.h) that is retained
- * for export to user-space via the ECCGETLAYOUT ioctl.
- * nand_ecclayout should be expandable in the future simply by the above macros.
+ * struct mtd_ooblayout_ops - NAND OOB layout operations
+ * @ecc: function returning an ECC region in the OOB area.
+ *      Should return -ERANGE if %section exceeds the total number of
+ *      ECC sections.
+ * @free: function returning a free region in the OOB area.
+ *       Should return -ERANGE if %section exceeds the total number of
+ *       free sections.
  */
-struct nand_ecclayout {
-       __u32 eccbytes;
-       __u32 eccpos[MTD_MAX_ECCPOS_ENTRIES_LARGE];
-       struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES_LARGE];
+struct mtd_ooblayout_ops {
+       int (*ecc)(struct mtd_info *mtd, int section,
+                  struct mtd_oob_region *oobecc);
+       int (*free)(struct mtd_info *mtd, int section,
+                   struct mtd_oob_region *oobfree);
 };
 
 struct module; /* only needed for owner field in mtd_info */
@@ -166,8 +185,8 @@ struct mtd_info {
        const char *name;
        int index;
 
-       /* ECC layout structure pointer - read only! */
-       struct nand_ecclayout *ecclayout;
+       /* OOB layout description */
+       const struct mtd_ooblayout_ops *ooblayout;
 
        /* the ecc step size. */
        unsigned int ecc_step_size;
@@ -253,6 +272,30 @@ struct mtd_info {
        int usecount;
 };
 
+int mtd_ooblayout_ecc(struct mtd_info *mtd, int section,
+                     struct mtd_oob_region *oobecc);
+int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte,
+                                int *section,
+                                struct mtd_oob_region *oobregion);
+int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf,
+                              const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf,
+                              u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_free(struct mtd_info *mtd, int section,
+                      struct mtd_oob_region *oobfree);
+int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf,
+                               const u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_set_databytes(struct mtd_info *mtd, const u8 *databuf,
+                               u8 *oobbuf, int start, int nbytes);
+int mtd_ooblayout_count_freebytes(struct mtd_info *mtd);
+int mtd_ooblayout_count_eccbytes(struct mtd_info *mtd);
+
+/**
+ * mtd_set_ooblayout - attach OOB layout operations to an MTD device
+ * @mtd: MTD device to update
+ * @ooblayout: layout operations to store in @mtd->ooblayout
+ *
+ * Only the pointer is stored; the operations structure is not copied.
+ */
+static inline void mtd_set_ooblayout(struct mtd_info *mtd,
+                                    const struct mtd_ooblayout_ops *ooblayout)
+{
+       mtd->ooblayout = ooblayout;
+}
+
 static inline void mtd_set_of_node(struct mtd_info *mtd,
                                   struct device_node *np)
 {
index 56574ba36555260fda145a059b5346836947e17d..fbe8e164a4ee93db99fab879c7284721546204e8 100644 (file)
@@ -116,9 +116,14 @@ typedef enum {
        NAND_ECC_HW,
        NAND_ECC_HW_SYNDROME,
        NAND_ECC_HW_OOB_FIRST,
-       NAND_ECC_SOFT_BCH,
 } nand_ecc_modes_t;
 
+enum nand_ecc_algo {
+       NAND_ECC_UNKNOWN,
+       NAND_ECC_HAMMING,
+       NAND_ECC_BCH,
+};
+
 /*
  * Constants for Hardware ECC
  */
@@ -458,6 +463,7 @@ struct nand_hw_control {
 /**
  * struct nand_ecc_ctrl - Control structure for ECC
  * @mode:      ECC mode
+ * @algo:      ECC algorithm
  * @steps:     number of ECC steps per page
  * @size:      data bytes per ECC step
  * @bytes:     ECC bytes per step
@@ -466,7 +472,6 @@ struct nand_hw_control {
  * @prepad:    padding information for syndrome based ECC generators
  * @postpad:   padding information for syndrome based ECC generators
  * @options:   ECC specific options (see NAND_ECC_XXX flags defined above)
- * @layout:    ECC layout control struct pointer
  * @priv:      pointer to private ECC control data
  * @hwctl:     function to control hardware ECC generator. Must only
  *             be provided if an hardware ECC is available
@@ -508,6 +513,7 @@ struct nand_hw_control {
  */
 struct nand_ecc_ctrl {
        nand_ecc_modes_t mode;
+       enum nand_ecc_algo algo;
        int steps;
        int size;
        int bytes;
@@ -516,7 +522,6 @@ struct nand_ecc_ctrl {
        int prepad;
        int postpad;
        unsigned int options;
-       struct nand_ecclayout   *layout;
        void *priv;
        void (*hwctl)(struct mtd_info *mtd, int mode);
        int (*calculate)(struct mtd_info *mtd, const uint8_t *dat,
@@ -740,6 +745,9 @@ struct nand_chip {
        void *priv;
 };
 
+extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops;
+extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops;
+
 static inline void nand_set_flash_node(struct nand_chip *chip,
                                       struct device_node *np)
 {
@@ -1070,4 +1078,18 @@ int nand_check_erased_ecc_chunk(void *data, int datalen,
                                void *ecc, int ecclen,
                                void *extraoob, int extraooblen,
                                int threshold);
+
+/* Default write_oob implementation */
+int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default write_oob syndrome implementation */
+int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                           int page);
+
+/* Default read_oob implementation */
+int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
+
+/* Default read_oob syndrome implementation */
+int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
+                          int page);
 #endif /* __LINUX_MTD_NAND_H */
index 4596503c9da9e0952d4953fa856572898aabc9be..0aaa98b219a461711306913674d5d582b40a617b 100644 (file)
@@ -80,7 +80,6 @@ struct onenand_bufferram {
  * @page_buf:          [INTERN] page main data buffer
  * @oob_buf:           [INTERN] page oob data buffer
  * @subpagesize:       [INTERN] holds the subpagesize
- * @ecclayout:         [REPLACEABLE] the default ecc placement scheme
  * @bbm:               [REPLACEABLE] pointer to Bad Block Management
  * @priv:              [OPTIONAL] pointer to private chip date
  */
@@ -134,7 +133,6 @@ struct onenand_chip {
 #endif
 
        int                     subpagesize;
-       struct nand_ecclayout   *ecclayout;
 
        void                    *bbm;
 
index 25f4d2a845c1a95f81185be9d12a2c4a540f21bd..65e91d0fa9817b8d66d126754ccec5ec3f3b40c0 100644 (file)
@@ -14,7 +14,7 @@
 
 struct sharpsl_nand_platform_data {
        struct nand_bbt_descr   *badblock_pattern;
-       struct nand_ecclayout   *ecc_layout;
+       const struct mtd_ooblayout_ops *ecc_layout;
        struct mtd_partition    *partitions;
        unsigned int            nr_partitions;
 };
index 3c36113a88e1d8e57f4a27032bb17130b3df3f7a..7f041bd88b8244f8fa243c71d508e55dd08b4160 100644 (file)
@@ -21,6 +21,7 @@
  * Sometimes these are the same as CFI IDs, but sometimes they aren't.
  */
 #define SNOR_MFR_ATMEL         CFI_MFR_ATMEL
+#define SNOR_MFR_GIGADEVICE    0xc8
 #define SNOR_MFR_INTEL         CFI_MFR_INTEL
 #define SNOR_MFR_MICRON                CFI_MFR_ST /* ST Micro <--> Micron */
 #define SNOR_MFR_MACRONIX      CFI_MFR_MACRONIX
index 011433478a14811dbac1faaed622d3b5599aeb96..bfed6b367350cdd34fe4e87598ec5cb94aac60f8 100644 (file)
@@ -50,12 +50,27 @@ struct nfs4_label {
 
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
 
-struct nfs_stateid4 {
-       __be32 seqid;
-       char other[NFS4_STATEID_OTHER_SIZE];
-} __attribute__ ((packed));
+struct nfs4_stateid_struct {
+       union {
+               char data[NFS4_STATEID_SIZE];
+               struct {
+                       __be32 seqid;
+                       char other[NFS4_STATEID_OTHER_SIZE];
+               } __attribute__ ((packed));
+       };
+
+       enum {
+               NFS4_INVALID_STATEID_TYPE = 0,
+               NFS4_SPECIAL_STATEID_TYPE,
+               NFS4_OPEN_STATEID_TYPE,
+               NFS4_LOCK_STATEID_TYPE,
+               NFS4_DELEGATION_STATEID_TYPE,
+               NFS4_LAYOUT_STATEID_TYPE,
+               NFS4_PNFS_DS_STATEID_TYPE,
+       } type;
+};
 
-typedef struct nfs_stateid4 nfs4_stateid;
+typedef struct nfs4_stateid_struct nfs4_stateid;
 
 enum nfs_opnum4 {
        OP_ACCESS = 3,
@@ -504,6 +519,7 @@ enum {
        NFSPROC4_CLNT_DEALLOCATE,
        NFSPROC4_CLNT_LAYOUTSTATS,
        NFSPROC4_CLNT_CLONE,
+       NFSPROC4_CLNT_COPY,
 };
 
 /* nfs41 types */
@@ -621,7 +637,9 @@ enum pnfs_update_layout_reason {
        PNFS_UPDATE_LAYOUT_IO_TEST_FAIL,
        PNFS_UPDATE_LAYOUT_FOUND_CACHED,
        PNFS_UPDATE_LAYOUT_RETURN,
+       PNFS_UPDATE_LAYOUT_RETRY,
        PNFS_UPDATE_LAYOUT_BLOCKED,
+       PNFS_UPDATE_LAYOUT_INVALID_OPEN,
        PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET,
 };
 
index 7fcc13c8cf1f3c465949453ee055a57cc4e79b0a..14a762d2734d91fff13fd17ebcf76978eb9feb2a 100644 (file)
@@ -246,5 +246,6 @@ struct nfs_server {
 #define NFS_CAP_DEALLOCATE     (1U << 21)
 #define NFS_CAP_LAYOUTSTATS    (1U << 22)
 #define NFS_CAP_CLONE          (1U << 23)
+#define NFS_CAP_COPY           (1U << 24)
 
 #endif
index ee8491dadbf385aa53b689efb1912d685790fbc9..c304a11b5b1aae93d5bb5a18ad995534f157b028 100644 (file)
@@ -233,7 +233,6 @@ struct nfs4_layoutget_args {
        struct inode *inode;
        struct nfs_open_context *ctx;
        nfs4_stateid stateid;
-       unsigned long timestamp;
        struct nfs4_layoutdriver_data layout;
 };
 
@@ -251,7 +250,6 @@ struct nfs4_layoutget {
        struct nfs4_layoutget_res res;
        struct rpc_cred *cred;
        gfp_t gfp_flags;
-       long timeout;
 };
 
 struct nfs4_getdeviceinfo_args {
@@ -1343,6 +1341,32 @@ struct nfs42_falloc_res {
        const struct nfs_server         *falloc_server;
 };
 
+struct nfs42_copy_args {
+       struct nfs4_sequence_args       seq_args;
+
+       struct nfs_fh                   *src_fh;
+       nfs4_stateid                    src_stateid;
+       u64                             src_pos;
+
+       struct nfs_fh                   *dst_fh;
+       nfs4_stateid                    dst_stateid;
+       u64                             dst_pos;
+
+       u64                             count;
+};
+
+struct nfs42_write_res {
+       u64                     count;
+       struct nfs_writeverf    verifier;
+};
+
+struct nfs42_copy_res {
+       struct nfs4_sequence_res        seq_res;
+       struct nfs42_write_res          write_res;
+       bool                            consecutive;
+       bool                            synchronous;
+};
+
 struct nfs42_seek_args {
        struct nfs4_sequence_args       seq_args;
 
@@ -1431,7 +1455,7 @@ struct nfs_commit_completion_ops {
 };
 
 struct nfs_commit_info {
-       spinlock_t                      *lock;  /* inode->i_lock */
+       struct inode                    *inode; /* Needed for inode->i_lock */
        struct nfs_mds_commit_info      *mds;
        struct pnfs_ds_commit_info      *ds;
        struct nfs_direct_req           *dreq;  /* O_DIRECT request */
diff --git a/include/linux/of_mtd.h b/include/linux/of_mtd.h
deleted file mode 100644 (file)
index e266caa..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2012 Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
- *
- * OF helpers for mtd.
- *
- * This file is released under the GPLv2
- */
-
-#ifndef __LINUX_OF_MTD_H
-#define __LINUX_OF_MTD_H
-
-#ifdef CONFIG_OF_MTD
-
-#include <linux/of.h>
-int of_get_nand_ecc_mode(struct device_node *np);
-int of_get_nand_ecc_step_size(struct device_node *np);
-int of_get_nand_ecc_strength(struct device_node *np);
-int of_get_nand_bus_width(struct device_node *np);
-bool of_get_nand_on_flash_bbt(struct device_node *np);
-
-#else /* CONFIG_OF_MTD */
-
-static inline int of_get_nand_ecc_mode(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_step_size(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_ecc_strength(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline int of_get_nand_bus_width(struct device_node *np)
-{
-       return -ENOSYS;
-}
-
-static inline bool of_get_nand_on_flash_bbt(struct device_node *np)
-{
-       return false;
-}
-
-#endif /* CONFIG_OF_MTD */
-
-#endif /* __LINUX_OF_MTD_H */
index d833eb4dd44633da82623df4e09564e934e6fa19..9e9d79e8efa5fd7cd3e38e6aa7d74bb5abb52809 100644 (file)
  *  option) any later version.
  */
 
-/* Maximum Number of Chip Selects */
-#define GPMC_CS_NUM            8
+#include <linux/platform_data/gpmc-omap.h>
 
 #define GPMC_CONFIG_WP         0x00000005
 
-#define GPMC_IRQ_FIFOEVENTENABLE       0x01
-#define GPMC_IRQ_COUNT_EVENT           0x02
-
-#define GPMC_BURST_4                   4       /* 4 word burst */
-#define GPMC_BURST_8                   8       /* 8 word burst */
-#define GPMC_BURST_16                  16      /* 16 word burst */
-#define GPMC_DEVWIDTH_8BIT             1       /* 8-bit device width */
-#define GPMC_DEVWIDTH_16BIT            2       /* 16-bit device width */
-#define GPMC_MUX_AAD                   1       /* Addr-Addr-Data multiplex */
-#define GPMC_MUX_AD                    2       /* Addr-Data multiplex */
-
-/* bool type time settings */
-struct gpmc_bool_timings {
-       bool cycle2cyclediffcsen;
-       bool cycle2cyclesamecsen;
-       bool we_extra_delay;
-       bool oe_extra_delay;
-       bool adv_extra_delay;
-       bool cs_extra_delay;
-       bool time_para_granularity;
-};
+/* IRQ numbers in GPMC IRQ domain for legacy boot use */
+#define GPMC_IRQ_FIFOEVENTENABLE       0
+#define GPMC_IRQ_COUNT_EVENT           1
 
-/*
- * Note that all values in this struct are in nanoseconds except sync_clk
- * (which is in picoseconds), while the register values are in gpmc_fck cycles.
+/**
+ * struct gpmc_nand_ops - Interface between NAND and GPMC
+ * @nand_writebuffer_empty: get the NAND write buffer empty status.
  */
-struct gpmc_timings {
-       /* Minimum clock period for synchronous mode (in picoseconds) */
-       u32 sync_clk;
-
-       /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
-       u32 cs_on;              /* Assertion time */
-       u32 cs_rd_off;          /* Read deassertion time */
-       u32 cs_wr_off;          /* Write deassertion time */
-
-       /* ADV signal timings corresponding to GPMC_CONFIG3 */
-       u32 adv_on;             /* Assertion time */
-       u32 adv_rd_off;         /* Read deassertion time */
-       u32 adv_wr_off;         /* Write deassertion time */
-       u32 adv_aad_mux_on;     /* ADV assertion time for AAD */
-       u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */
-       u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */
-
-       /* WE signals timings corresponding to GPMC_CONFIG4 */
-       u32 we_on;              /* WE assertion time */
-       u32 we_off;             /* WE deassertion time */
-
-       /* OE signals timings corresponding to GPMC_CONFIG4 */
-       u32 oe_on;              /* OE assertion time */
-       u32 oe_off;             /* OE deassertion time */
-       u32 oe_aad_mux_on;      /* OE assertion time for AAD */
-       u32 oe_aad_mux_off;     /* OE deassertion time for AAD */
-
-       /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
-       u32 page_burst_access;  /* Multiple access word delay */
-       u32 access;             /* Start-cycle to first data valid delay */
-       u32 rd_cycle;           /* Total read cycle time */
-       u32 wr_cycle;           /* Total write cycle time */
-
-       u32 bus_turnaround;
-       u32 cycle2cycle_delay;
-
-       u32 wait_monitoring;
-       u32 clk_activation;
-
-       /* The following are only on OMAP3430 */
-       u32 wr_access;          /* WRACCESSTIME */
-       u32 wr_data_mux_bus;    /* WRDATAONADMUXBUS */
-
-       struct gpmc_bool_timings bool_timings;
+struct gpmc_nand_ops {
+       bool (*nand_writebuffer_empty)(void);
 };
 
-/* Device timings in picoseconds */
-struct gpmc_device_timings {
-       u32 t_ceasu;    /* address setup to CS valid */
-       u32 t_avdasu;   /* address setup to ADV valid */
-       /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
-        * of tusb using these timings even for sync whilst
-        * ideally for adv_rd/(wr)_off it should have considered
-        * t_avdh instead. This indirectly necessitates r/w
-        * variations of t_avdp as it is possible to have one
-        * sync & other async
-        */
-       u32 t_avdp_r;   /* ADV low time (what about t_cer ?) */
-       u32 t_avdp_w;
-       u32 t_aavdh;    /* address hold time */
-       u32 t_oeasu;    /* address setup to OE valid */
-       u32 t_aa;       /* access time from ADV assertion */
-       u32 t_iaa;      /* initial access time */
-       u32 t_oe;       /* access time from OE assertion */
-       u32 t_ce;       /* access time from CS asertion */
-       u32 t_rd_cycle; /* read cycle time */
-       u32 t_cez_r;    /* read CS deassertion to high Z */
-       u32 t_cez_w;    /* write CS deassertion to high Z */
-       u32 t_oez;      /* OE deassertion to high Z */
-       u32 t_weasu;    /* address setup to WE valid */
-       u32 t_wpl;      /* write assertion time */
-       u32 t_wph;      /* write deassertion time */
-       u32 t_wr_cycle; /* write cycle time */
-
-       u32 clk;
-       u32 t_bacc;     /* burst access valid clock to output delay */
-       u32 t_ces;      /* CS setup time to clk */
-       u32 t_avds;     /* ADV setup time to clk */
-       u32 t_avdh;     /* ADV hold time from clk */
-       u32 t_ach;      /* address hold time from clk */
-       u32 t_rdyo;     /* clk to ready valid */
-
-       u32 t_ce_rdyz;  /* XXX: description ?, or use t_cez instead */
-       u32 t_ce_avd;   /* CS on to ADV on delay */
-
-       /* XXX: check the possibility of combining
-        * cyc_aavhd_oe & cyc_aavdh_we
-        */
-       u8 cyc_aavdh_oe;/* read address hold time in cycles */
-       u8 cyc_aavdh_we;/* write address hold time in cycles */
-       u8 cyc_oe;      /* access time from OE assertion in cycles */
-       u8 cyc_wpl;     /* write deassertion time in cycles */
-       u32 cyc_iaa;    /* initial access time in cycles */
-
-       /* extra delays */
-       bool ce_xdelay;
-       bool avd_xdelay;
-       bool oe_xdelay;
-       bool we_xdelay;
-};
+struct gpmc_nand_regs;
 
-struct gpmc_settings {
-       bool burst_wrap;        /* enables wrap bursting */
-       bool burst_read;        /* enables read page/burst mode */
-       bool burst_write;       /* enables write page/burst mode */
-       bool device_nand;       /* device is NAND */
-       bool sync_read;         /* enables synchronous reads */
-       bool sync_write;        /* enables synchronous writes */
-       bool wait_on_read;      /* monitor wait on reads */
-       bool wait_on_write;     /* monitor wait on writes */
-       u32 burst_len;          /* page/burst length */
-       u32 device_width;       /* device bus width (8 or 16 bit) */
-       u32 mux_add_data;       /* multiplex address & data */
-       u32 wait_pin;           /* wait-pin to be used */
-};
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+                                            int cs);
+#else
+/*
+ * Stub for builds without the GPMC driver: always returns NULL.
+ * The return type must carry the "struct" keyword (gpmc_nand_ops is a
+ * struct tag, not a typedef) so that it matches the prototype above and
+ * compiles when CONFIG_OMAP_GPMC is disabled.
+ */
+static inline struct gpmc_nand_ops *gpmc_omap_get_nand_ops(struct gpmc_nand_regs *regs,
+                                                          int cs)
+{
+       return NULL;
+}
+#endif /* CONFIG_OMAP_GPMC */
+
+/*--------------------------------*/
+
+/* deprecated APIs */
+#if IS_ENABLED(CONFIG_OMAP_GPMC)
+void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
+#else
+static inline void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs)
+{
+}
+#endif /* CONFIG_OMAP_GPMC */
+/*--------------------------------*/
 
 extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t,
                             struct gpmc_settings *gpmc_s,
                             struct gpmc_device_timings *dev_t);
 
-struct gpmc_nand_regs;
 struct device_node;
 
-extern void gpmc_update_nand_reg(struct gpmc_nand_regs *reg, int cs);
 extern int gpmc_get_client_irq(unsigned irq_config);
 
 extern unsigned int gpmc_ticks_to_ns(unsigned int ticks);
index 44f33834ad787dd07408aed96398add798df4be9..1a827cecd62fa36b8f7cec08a11deb7d65d000e2 100644 (file)
@@ -61,6 +61,14 @@ struct perf_callchain_entry {
        __u64                           ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
+/*
+ * Bookkeeping wrapped around a perf_callchain_entry while it is filled:
+ * @entry:          entry whose ip[] array and nr the store helpers update
+ * @max_stack:      bound that perf_callchain_store() checks @nr against
+ * @nr:             count bumped by each successful perf_callchain_store()
+ * @contexts:       count bumped by perf_callchain_store_context(), bounded
+ *                  by sysctl_perf_event_max_contexts_per_stack
+ * @contexts_maxed: set once that contexts bound is hit; while it is set,
+ *                  perf_callchain_store() refuses further entries
+ */
+struct perf_callchain_entry_ctx {
+       struct perf_callchain_entry *entry;
+       u32                         max_stack;
+       u32                         nr;
+       short                       contexts;
+       bool                        contexts_maxed;
+};
+
 struct perf_raw_record {
        u32                             size;
        void                            *data;
@@ -1061,20 +1069,36 @@ extern void perf_event_fork(struct task_struct *tsk);
 /* Callchains */
 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
-extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
-extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-                  bool crosstask, bool add_mark);
+                  u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
+
+/*
+ * Append @ip to ctx->entry->ip[], counting it against the per-stack
+ * contexts limit rather than against ctx->max_stack.
+ *
+ * Returns 0 while ctx->contexts is below
+ * sysctl_perf_event_max_contexts_per_stack, bumping entry->nr and
+ * ctx->contexts (ctx->nr is left untouched).  Once the limit is reached,
+ * sets ctx->contexts_maxed and returns -1 without storing anything.
+ */
+static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
+{
+       if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
+               struct perf_callchain_entry *entry = ctx->entry;
+               entry->ip[entry->nr++] = ip;
+               ++ctx->contexts;
+               return 0;
+       } else {
+               ctx->contexts_maxed = true;
+               return -1; /* no more room, stop walking the stack */
+       }
+}
 
-static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-       if (entry->nr < sysctl_perf_event_max_stack) {
+       if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
+               struct perf_callchain_entry *entry = ctx->entry;
                entry->ip[entry->nr++] = ip;
+               ++ctx->nr;
                return 0;
        } else {
                return -1; /* no more room, stop walking the stack */
diff --git a/include/linux/platform_data/gpmc-omap.h b/include/linux/platform_data/gpmc-omap.h
new file mode 100644 (file)
index 0000000..67ccdb0
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * OMAP GPMC Platform data
+ *
+ * Copyright (C) 2014 Texas Instruments, Inc. - http://www.ti.com
+ *     Roger Quadros <rogerq@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#ifndef _GPMC_OMAP_H_
+#define _GPMC_OMAP_H_
+
+/* Maximum Number of Chip Selects */
+#define GPMC_CS_NUM            8
+
+/* bool type time settings */
+struct gpmc_bool_timings {
+       bool cycle2cyclediffcsen;
+       bool cycle2cyclesamecsen;
+       bool we_extra_delay;
+       bool oe_extra_delay;
+       bool adv_extra_delay;
+       bool cs_extra_delay;
+       bool time_para_granularity;
+};
+
+/*
+ * Note that all values in this struct are in nanoseconds except sync_clk
+ * (which is in picoseconds), while the register values are in gpmc_fck cycles.
+ */
+struct gpmc_timings {
+       /* Minimum clock period for synchronous mode (in picoseconds) */
+       u32 sync_clk;
+
+       /* Chip-select signal timings corresponding to GPMC_CS_CONFIG2 */
+       u32 cs_on;              /* Assertion time */
+       u32 cs_rd_off;          /* Read deassertion time */
+       u32 cs_wr_off;          /* Write deassertion time */
+
+       /* ADV signal timings corresponding to GPMC_CONFIG3 */
+       u32 adv_on;             /* Assertion time */
+       u32 adv_rd_off;         /* Read deassertion time */
+       u32 adv_wr_off;         /* Write deassertion time */
+       u32 adv_aad_mux_on;     /* ADV assertion time for AAD */
+       u32 adv_aad_mux_rd_off; /* ADV read deassertion time for AAD */
+       u32 adv_aad_mux_wr_off; /* ADV write deassertion time for AAD */
+
+       /* WE signals timings corresponding to GPMC_CONFIG4 */
+       u32 we_on;              /* WE assertion time */
+       u32 we_off;             /* WE deassertion time */
+
+       /* OE signals timings corresponding to GPMC_CONFIG4 */
+       u32 oe_on;              /* OE assertion time */
+       u32 oe_off;             /* OE deassertion time */
+       u32 oe_aad_mux_on;      /* OE assertion time for AAD */
+       u32 oe_aad_mux_off;     /* OE deassertion time for AAD */
+
+       /* Access time and cycle time timings corresponding to GPMC_CONFIG5 */
+       u32 page_burst_access;  /* Multiple access word delay */
+       u32 access;             /* Start-cycle to first data valid delay */
+       u32 rd_cycle;           /* Total read cycle time */
+       u32 wr_cycle;           /* Total write cycle time */
+
+       u32 bus_turnaround;
+       u32 cycle2cycle_delay;
+
+       u32 wait_monitoring;
+       u32 clk_activation;
+
+       /* The following are only on OMAP3430 */
+       u32 wr_access;          /* WRACCESSTIME */
+       u32 wr_data_mux_bus;    /* WRDATAONADMUXBUS */
+
+       struct gpmc_bool_timings bool_timings;
+};
+
+/* Device timings in picoseconds */
+struct gpmc_device_timings {
+       u32 t_ceasu;    /* address setup to CS valid */
+       u32 t_avdasu;   /* address setup to ADV valid */
+       /* XXX: try to combine t_avdp_r & t_avdp_w. Issue is
+        * of tusb using these timings even for sync whilst
+        * ideally for adv_rd/(wr)_off it should have considered
+        * t_avdh instead. This indirectly necessitates r/w
+        * variations of t_avdp as it is possible to have one
+        * sync & other async
+        */
+       u32 t_avdp_r;   /* ADV low time (what about t_cer ?) */
+       u32 t_avdp_w;
+       u32 t_aavdh;    /* address hold time */
+       u32 t_oeasu;    /* address setup to OE valid */
+       u32 t_aa;       /* access time from ADV assertion */
+       u32 t_iaa;      /* initial access time */
+       u32 t_oe;       /* access time from OE assertion */
+       u32 t_ce;       /* access time from CS assertion */
+       u32 t_rd_cycle; /* read cycle time */
+       u32 t_cez_r;    /* read CS deassertion to high Z */
+       u32 t_cez_w;    /* write CS deassertion to high Z */
+       u32 t_oez;      /* OE deassertion to high Z */
+       u32 t_weasu;    /* address setup to WE valid */
+       u32 t_wpl;      /* write assertion time */
+       u32 t_wph;      /* write deassertion time */
+       u32 t_wr_cycle; /* write cycle time */
+
+       u32 clk;
+       u32 t_bacc;     /* burst access valid clock to output delay */
+       u32 t_ces;      /* CS setup time to clk */
+       u32 t_avds;     /* ADV setup time to clk */
+       u32 t_avdh;     /* ADV hold time from clk */
+       u32 t_ach;      /* address hold time from clk */
+       u32 t_rdyo;     /* clk to ready valid */
+
+       u32 t_ce_rdyz;  /* XXX: description ?, or use t_cez instead */
+       u32 t_ce_avd;   /* CS on to ADV on delay */
+
+       /* XXX: check the possibility of combining
+        * cyc_aavdh_oe & cyc_aavdh_we
+        */
+       u8 cyc_aavdh_oe;/* read address hold time in cycles */
+       u8 cyc_aavdh_we;/* write address hold time in cycles */
+       u8 cyc_oe;      /* access time from OE assertion in cycles */
+       u8 cyc_wpl;     /* write deassertion time in cycles */
+       u32 cyc_iaa;    /* initial access time in cycles */
+
+       /* extra delays */
+       bool ce_xdelay;
+       bool avd_xdelay;
+       bool oe_xdelay;
+       bool we_xdelay;
+};
+
+#define GPMC_BURST_4                   4       /* 4 word burst */
+#define GPMC_BURST_8                   8       /* 8 word burst */
+#define GPMC_BURST_16                  16      /* 16 word burst */
+#define GPMC_DEVWIDTH_8BIT             1       /* 8-bit device width */
+#define GPMC_DEVWIDTH_16BIT            2       /* 16-bit device width */
+#define GPMC_MUX_AAD                   1       /* Addr-Addr-Data multiplex */
+#define GPMC_MUX_AD                    2       /* Addr-Data multiplex */
+
+struct gpmc_settings {
+       bool burst_wrap;        /* enables wrap bursting */
+       bool burst_read;        /* enables read page/burst mode */
+       bool burst_write;       /* enables write page/burst mode */
+       bool device_nand;       /* device is NAND */
+       bool sync_read;         /* enables synchronous reads */
+       bool sync_write;        /* enables synchronous writes */
+       bool wait_on_read;      /* monitor wait on reads */
+       bool wait_on_write;     /* monitor wait on writes */
+       u32 burst_len;          /* page/burst length */
+       u32 device_width;       /* device bus width (8 or 16 bit) */
+       u32 mux_add_data;       /* multiplex address & data */
+       u32 wait_pin;           /* wait-pin to be used */
+};
+
+/* Data for each chip select */
+struct gpmc_omap_cs_data {
+       bool valid;                     /* data is valid */
+       bool is_nand;                   /* device within this CS is NAND */
+       struct gpmc_settings *settings;
+       struct gpmc_device_timings *device_timings;
+       struct gpmc_timings *gpmc_timings;
+       struct platform_device *pdev;   /* device within this CS region */
+       unsigned int pdata_size;
+};
+
+struct gpmc_omap_platform_data {
+       struct gpmc_omap_cs_data cs[GPMC_CS_NUM];
+};
+
+#endif /* _GPMC_OMAP_H_ */
index 090bbab0130a3d31b226b8fce8b50f9f6b231be9..17d57a18bac575b94f8aac1a372e642160b0b684 100644 (file)
@@ -45,7 +45,6 @@ enum omap_ecc {
 };
 
 struct gpmc_nand_regs {
-       void __iomem    *gpmc_status;
        void __iomem    *gpmc_nand_command;
        void __iomem    *gpmc_nand_address;
        void __iomem    *gpmc_nand_data;
@@ -64,21 +63,24 @@ struct gpmc_nand_regs {
        void __iomem    *gpmc_bch_result4[GPMC_BCH_NUM_REMAINDER];
        void __iomem    *gpmc_bch_result5[GPMC_BCH_NUM_REMAINDER];
        void __iomem    *gpmc_bch_result6[GPMC_BCH_NUM_REMAINDER];
+       /* Deprecated. Do not use */
+       void __iomem    *gpmc_status;
 };
 
 struct omap_nand_platform_data {
        int                     cs;
        struct mtd_partition    *parts;
        int                     nr_parts;
-       bool                    dev_ready;
        bool                    flash_bbt;
        enum nand_io            xfer_type;
        int                     devsize;
        enum omap_ecc           ecc_opt;
-       struct gpmc_nand_regs   reg;
 
-       /* for passing the partitions */
-       struct device_node      *of_node;
        struct device_node      *elm_of_node;
+
+       /* deprecated */
+       struct gpmc_nand_regs   reg;
+       struct device_node      *of_node;
+       bool                    dev_ready;
 };
 #endif
index b78d27c426290089b76546a8fc27d8b95f7216d0..17018f3c066ed5a44c76ca77ce63b6203725ef34 100644 (file)
@@ -5,59 +5,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 
-struct pwm_device;
 struct seq_file;
-
-#if IS_ENABLED(CONFIG_PWM)
-/*
- * pwm_request - request a PWM device
- */
-struct pwm_device *pwm_request(int pwm_id, const char *label);
-
-/*
- * pwm_free - free a PWM device
- */
-void pwm_free(struct pwm_device *pwm);
-
-/*
- * pwm_config - change a PWM device configuration
- */
-int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
-
-/*
- * pwm_enable - start a PWM output toggling
- */
-int pwm_enable(struct pwm_device *pwm);
-
-/*
- * pwm_disable - stop a PWM output toggling
- */
-void pwm_disable(struct pwm_device *pwm);
-#else
-static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
-{
-       return ERR_PTR(-ENODEV);
-}
-
-static inline void pwm_free(struct pwm_device *pwm)
-{
-}
-
-static inline int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
-{
-       return -EINVAL;
-}
-
-static inline int pwm_enable(struct pwm_device *pwm)
-{
-       return -EINVAL;
-}
-
-static inline void pwm_disable(struct pwm_device *pwm)
-{
-}
-#endif
-
 struct pwm_chip;
 
 /**
@@ -94,8 +42,21 @@ struct pwm_args {
 
 enum {
        PWMF_REQUESTED = 1 << 0,
-       PWMF_ENABLED = 1 << 1,
-       PWMF_EXPORTED = 1 << 2,
+       PWMF_EXPORTED = 1 << 1,
+};
+
+/*
+ * struct pwm_state - state of a PWM channel
+ * @period: PWM period (in nanoseconds)
+ * @duty_cycle: PWM duty cycle (in nanoseconds)
+ * @polarity: PWM polarity
+ * @enabled: PWM enabled status
+ */
+struct pwm_state {
+       unsigned int period;
+       unsigned int duty_cycle;
+       enum pwm_polarity polarity;
+       bool enabled;
 };
 
 /**
@@ -106,11 +67,8 @@ enum {
  * @pwm: global index of the PWM device
  * @chip: PWM chip providing this PWM device
  * @chip_data: chip-private data associated with the PWM device
- * @lock: used to serialize accesses to the PWM device where necessary
- * @period: period of the PWM signal (in nanoseconds)
- * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
- * @polarity: polarity of the PWM signal
  * @args: PWM arguments
+ * @state: current PWM channel state
  */
 struct pwm_device {
        const char *label;
@@ -119,50 +77,68 @@ struct pwm_device {
        unsigned int pwm;
        struct pwm_chip *chip;
        void *chip_data;
-       struct mutex lock;
-
-       unsigned int period;
-       unsigned int duty_cycle;
-       enum pwm_polarity polarity;
 
        struct pwm_args args;
+       struct pwm_state state;
 };
 
+/**
+ * pwm_get_state() - retrieve the current PWM state
+ * @pwm: PWM device
+ * @state: state to fill with the current PWM state
+ */
+static inline void pwm_get_state(const struct pwm_device *pwm,
+                                struct pwm_state *state)
+{
+       *state = pwm->state;
+}
+
 static inline bool pwm_is_enabled(const struct pwm_device *pwm)
 {
-       return test_bit(PWMF_ENABLED, &pwm->flags);
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.enabled;
 }
 
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 {
        if (pwm)
-               pwm->period = period;
+               pwm->state.period = period;
 }
 
 static inline unsigned int pwm_get_period(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->period : 0;
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.period;
 }
 
 static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
 {
        if (pwm)
-               pwm->duty_cycle = duty;
+               pwm->state.duty_cycle = duty;
 }
 
 static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->duty_cycle : 0;
-}
+       struct pwm_state state;
 
-/*
- * pwm_set_polarity - configure the polarity of a PWM signal
- */
-int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
+       pwm_get_state(pwm, &state);
+
+       return state.duty_cycle;
+}
 
 static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
 {
-       return pwm ? pwm->polarity : PWM_POLARITY_NORMAL;
+       struct pwm_state state;
+
+       pwm_get_state(pwm, &state);
+
+       return state.polarity;
 }
 
 static inline void pwm_get_args(const struct pwm_device *pwm,
@@ -171,12 +147,6 @@ static inline void pwm_get_args(const struct pwm_device *pwm,
        *args = pwm->args;
 }
 
-static inline void pwm_apply_args(struct pwm_device *pwm)
-{
-       pwm_set_period(pwm, pwm->args.period);
-       pwm_set_polarity(pwm, pwm->args.polarity);
-}
-
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
@@ -185,6 +155,13 @@ static inline void pwm_apply_args(struct pwm_device *pwm)
  * @set_polarity: configure the polarity of this PWM
  * @enable: enable PWM output toggling
  * @disable: disable PWM output toggling
+ * @apply: atomically apply a new PWM config. The state argument
+ *        should be adjusted with the real hardware config (if the
+ *        hardware approximates the period or duty_cycle value,
+ *        state should reflect it)
+ * @get_state: get the current PWM state. This function is only
+ *            called once per PWM device when the PWM chip is
+ *            registered.
  * @dbg_show: optional routine to show contents in debugfs
  * @owner: helps prevent removal of modules exporting active PWMs
  */
@@ -197,6 +174,10 @@ struct pwm_ops {
                            enum pwm_polarity polarity);
        int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
        void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
+       int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
+                    struct pwm_state *state);
+       void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
+                         struct pwm_state *state);
 #ifdef CONFIG_DEBUG_FS
        void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
 #endif
@@ -232,6 +213,115 @@ struct pwm_chip {
 };
 
 #if IS_ENABLED(CONFIG_PWM)
+/* PWM user APIs */
+struct pwm_device *pwm_request(int pwm_id, const char *label);
+void pwm_free(struct pwm_device *pwm);
+int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+int pwm_adjust_config(struct pwm_device *pwm);
+
+/**
+ * pwm_config() - change a PWM device configuration
+ * @pwm: PWM device
+ * @duty_ns: "on" time (in nanoseconds)
+ * @period_ns: duration (in nanoseconds) of one cycle
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+                            int period_ns)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.duty_cycle == duty_ns && state.period == period_ns)
+               return 0;
+
+       state.duty_cycle = duty_ns;
+       state.period = period_ns;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_set_polarity() - configure the polarity of a PWM signal
+ * @pwm: PWM device
+ * @polarity: new polarity of the PWM signal
+ *
+ * Note that the polarity cannot be configured while the PWM device is
+ * enabled.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+                                  enum pwm_polarity polarity)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.polarity == polarity)
+               return 0;
+
+       /*
+        * Changing the polarity of a running PWM without adjusting the
+        * dutycycle/period value is a bit risky (can introduce glitches).
+        * Return -EBUSY in this case.
+        * Note that this is allowed when using pwm_apply_state() because
+        * the user specifies all the parameters.
+        */
+       if (state.enabled)
+               return -EBUSY;
+
+       state.polarity = polarity;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_enable() - start a PWM output toggling
+ * @pwm: PWM device
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return -EINVAL;
+
+       pwm_get_state(pwm, &state);
+       if (state.enabled)
+               return 0;
+
+       state.enabled = true;
+       return pwm_apply_state(pwm, &state);
+}
+
+/**
+ * pwm_disable() - stop a PWM output toggling
+ * @pwm: PWM device
+ */
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+       struct pwm_state state;
+
+       if (!pwm)
+               return;
+
+       pwm_get_state(pwm, &state);
+       if (!state.enabled)
+               return;
+
+       state.enabled = false;
+       pwm_apply_state(pwm, &state);
+}
+
+
+/* PWM provider APIs */
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
@@ -257,6 +347,47 @@ void devm_pwm_put(struct device *dev, struct pwm_device *pwm);
 
 bool pwm_can_sleep(struct pwm_device *pwm);
 #else
+static inline struct pwm_device *pwm_request(int pwm_id, const char *label)
+{
+       return ERR_PTR(-ENODEV);
+}
+
+static inline void pwm_free(struct pwm_device *pwm)
+{
+}
+
+static inline int pwm_apply_state(struct pwm_device *pwm,
+                                 const struct pwm_state *state)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_adjust_config(struct pwm_device *pwm)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_config(struct pwm_device *pwm, int duty_ns,
+                            int period_ns)
+{
+       return -EINVAL;
+}
+
+static inline int pwm_set_polarity(struct pwm_device *pwm,
+                                  enum pwm_polarity polarity)
+{
+       return -ENOTSUPP;
+}
+
+static inline int pwm_enable(struct pwm_device *pwm)
+{
+       return -EINVAL;
+}
+
+static inline void pwm_disable(struct pwm_device *pwm)
+{
+}
+
 static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data)
 {
        return -EINVAL;
@@ -328,6 +459,34 @@ static inline bool pwm_can_sleep(struct pwm_device *pwm)
 }
 #endif
 
+static inline void pwm_apply_args(struct pwm_device *pwm)
+{
+       /*
+        * PWM users calling pwm_apply_args() expect to have a fresh config
+        * where the polarity and period are set according to pwm_args info.
+        * The problem is, polarity can only be changed when the PWM is
+        * disabled.
+        *
+        * PWM drivers supporting hardware readout may declare the PWM device
+        * as enabled, and prevent polarity setting, which changes from the
+        * existing behavior, where all PWM devices are declared as disabled
+        * at startup (even if they are actually enabled), thus authorizing
+        * polarity setting.
+        *
+        * Instead of setting ->enabled to false, we call pwm_disable()
+        * before pwm_set_polarity() to ensure that everything is configured
+        * as expected, and the PWM is really disabled when the user requests
+        * it.
+        *
+        * Note that PWM users requiring a smooth handover between the
+        * bootloader and the kernel (like critical regulators controlled by
+        * PWM devices) will have to switch to the atomic API and avoid calling
+        * pwm_apply_args().
+        */
+       pwm_disable(pwm);
+       pwm_set_polarity(pwm, pwm->args.polarity);
+}
+
 struct pwm_lookup {
        struct list_head list;
        const char *provider;
index 21c26e78aec5a36a018e8c95b7ea5221e8381eb1..6e42ada26345d507ffdec856107382a9fb67656a 100644 (file)
@@ -1539,6 +1539,7 @@ struct task_struct {
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
+       unsigned sched_remote_wakeup:1;
        unsigned :0; /* force alignment to the next boundary */
 
        /* unserialized, strictly 'current' */
@@ -2744,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-/* same as above but performs the slow path from the async kontext. Can
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
 extern void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
index e0582106ef4faba81db1ff7912246623c3f7f1c2..7973a821ac58877a56ff5214793786948170ae8e 100644 (file)
@@ -277,7 +277,7 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 
 static inline int raw_read_seqcount_latch(seqcount_t *s)
 {
-       return lockless_dereference(s->sequence);
+       return lockless_dereference(s)->sequence;
 }
 
 /**
@@ -331,7 +331,7 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
  *     unsigned seq, idx;
  *
  *     do {
- *             seq = lockless_dereference(latch->seq);
+ *             seq = lockless_dereference(latch)->seq;
  *
  *             idx = seq & 0x01;
  *             entry = data_query(latch->data[idx], ...);
index 665cd0cd18b8b1f3d9933dcda96bd5624e7c0a3b..d1faa019c02ab37005745758cd54055f5bd18a7a 100644 (file)
@@ -111,22 +111,6 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 }
 #endif
 
-
-/**
- * virt_to_obj - returns address of the beginning of object.
- * @s: object's kmem_cache
- * @slab_page: address of slab page
- * @x: address within object memory range
- *
- * Returns address of the beginning of object
- */
-static inline void *virt_to_obj(struct kmem_cache *s,
-                               const void *slab_page,
-                               const void *x)
-{
-       return (void *)x - ((x - slab_page) % s->size);
-}
-
 void object_err(struct kmem_cache *s, struct page *page,
                u8 *object, char *reason);
 
index 857a9a1d82b58ad3569c7daeb9fffb17f690926a..1f03483f61e5714b1c07708fee2fe3d7ffa946e3 100644 (file)
@@ -372,6 +372,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  * @unprepare_message: undo any work done by prepare_message().
  * @spi_flash_read: to support spi-controller hardwares that provide
  *                  accelerated interface to read from flash devices.
+ * @flash_read_supported: spi device supports flash read
  * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS
  *     number. Any individual value may be -ENOENT for CS lines that
  *     are not GPIOs (driven by the SPI controller itself).
@@ -529,6 +530,7 @@ struct spi_master {
                                 struct spi_message *message);
        int (*spi_flash_read)(struct  spi_device *spi,
                              struct spi_flash_read_message *msg);
+       bool (*flash_read_supported)(struct spi_device *spi);
 
        /*
         * These hooks are for drivers that use a generic implementation
@@ -1158,7 +1160,9 @@ struct spi_flash_read_message {
 /* SPI core interface for flash read support */
 static inline bool spi_flash_read_supported(struct spi_device *spi)
 {
-       return spi->master->spi_flash_read ? true : false;
+       return spi->master->spi_flash_read &&
+              (!spi->master->flash_read_supported ||
+              spi->master->flash_read_supported(spi));
 }
 
 int spi_flash_read(struct spi_device *spi,
index 6a241a277249c989990771a76bbb554b86514ddc..899791573a403ba8434c12a05b4594f23ac6c33b 100644 (file)
@@ -127,7 +127,7 @@ struct rpc_authops {
        void                    (*destroy)(struct rpc_auth *);
 
        struct rpc_cred *       (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
-       struct rpc_cred *       (*crcreate)(struct rpc_auth*, struct auth_cred *, int);
+       struct rpc_cred *       (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
        int                     (*list_pseudoflavors)(rpc_authflavor_t *, int);
        rpc_authflavor_t        (*info2flavor)(struct rpcsec_gss_info *);
        int                     (*flavor2info)(rpc_authflavor_t,
@@ -167,6 +167,7 @@ void                        rpc_destroy_authunix(void);
 
 struct rpc_cred *      rpc_lookup_cred(void);
 struct rpc_cred *      rpc_lookup_cred_nonblock(void);
+struct rpc_cred *      rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t);
 struct rpc_cred *      rpc_lookup_machine_cred(const char *service_name);
 int                    rpcauth_register(const struct rpc_authops *);
 int                    rpcauth_unregister(const struct rpc_authops *);
@@ -178,7 +179,7 @@ rpc_authflavor_t    rpcauth_get_pseudoflavor(rpc_authflavor_t,
 int                    rpcauth_get_gssinfo(rpc_authflavor_t,
                                struct rpcsec_gss_info *);
 int                    rpcauth_list_flavors(rpc_authflavor_t *, int);
-struct rpc_cred *      rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
+struct rpc_cred *      rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t);
 void                   rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *      rpcauth_lookupcred(struct rpc_auth *, int);
 struct rpc_cred *      rpcauth_generic_bind_cred(struct rpc_task *, struct rpc_cred *, int);
@@ -201,9 +202,28 @@ char *                     rpcauth_stringify_acceptor(struct rpc_cred *);
 static inline
 struct rpc_cred *      get_rpccred(struct rpc_cred *cred)
 {
-       atomic_inc(&cred->cr_count);
+       if (cred != NULL)
+               atomic_inc(&cred->cr_count);
        return cred;
 }
 
+/**
+ * get_rpccred_rcu - get a reference to a cred using rcu-protected pointer
+ * @cred: cred of which to take a reference
+ *
+ * In some cases, we may have a pointer to a credential to which we
+ * want to take a reference, but don't already have one. Because these
+ * objects are freed using RCU, we can access the cr_count while it's
+ * on its way to destruction and only take a reference if it's not already
+ * zero.
+ */
+static inline struct rpc_cred *
+get_rpccred_rcu(struct rpc_cred *cred)
+{
+       if (atomic_inc_not_zero(&cred->cr_count))
+               return cred;
+       return NULL;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_AUTH_H */
index 9a7ddbaf116e56036ad74997773ffe178f2e6c18..19c659d1c0f86cd40fb9b90f48dac0cb7071b560 100644 (file)
@@ -176,6 +176,7 @@ void                rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 int            rpc_protocol(struct rpc_clnt *);
 struct net *   rpc_net_ns(struct rpc_clnt *);
 size_t         rpc_max_payload(struct rpc_clnt *);
+size_t         rpc_max_bc_payload(struct rpc_clnt *);
 unsigned long  rpc_get_timeout(struct rpc_clnt *clnt);
 void           rpc_force_rebind(struct rpc_clnt *);
 size_t         rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
index 807371357160ae97f6d6120ba83b26bbc48b8e66..59cbf16eaeb56c9093d67f38d831526b42251605 100644 (file)
@@ -158,9 +158,9 @@ typedef __be32      rpc_fraghdr;
 
 /*
  * Note that RFC 1833 does not put any size restrictions on the
- * netid string, but all currently defined netid's fit in 4 bytes.
+ * netid string, but all currently defined netid's fit in 5 bytes.
  */
-#define RPCBIND_MAXNETIDLEN    (4u)
+#define RPCBIND_MAXNETIDLEN    (5u)
 
 /*
  * Universal addresses are introduced in RFC 1833 and further spelled
index 3081339968c3b7e3224248e9bd91745bf2ec72b5..d6917b896d3a75bfb4f26652903d710471a4726b 100644 (file)
@@ -199,7 +199,7 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
                                    struct xdr_buf *rcvbuf);
 
 /* svc_rdma_marshal.c */
-extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg *, struct svc_rqst *);
+extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
                                     struct rpcrdma_msg *,
                                     enum rpcrdma_errcode, __be32 *);
index fb0d212e0d3af2dd3ed78d4ec181e63f8e72dac1..5aa3834619a8c3222be6530f927ef30349e2fa74 100644 (file)
@@ -142,6 +142,7 @@ struct rpc_xprt_ops {
        int             (*bc_setup)(struct rpc_xprt *xprt,
                                    unsigned int min_reqs);
        int             (*bc_up)(struct svc_serv *serv, struct net *net);
+       size_t          (*bc_maxpayload)(struct rpc_xprt *xprt);
        void            (*bc_free_rqst)(struct rpc_rqst *rqst);
        void            (*bc_destroy)(struct rpc_xprt *xprt,
                                      unsigned int max_reqs);
index 767190b013638f2aa0c5e70a3f338adbc6db10a8..39267dc3486af5417f1f0642d353ec65d1e78aa5 100644 (file)
@@ -52,7 +52,9 @@
 #define RPCRDMA_DEF_SLOT_TABLE (128U)
 #define RPCRDMA_MAX_SLOT_TABLE (256U)
 
-#define RPCRDMA_DEF_INLINE  (1024)     /* default inline max */
+#define RPCRDMA_MIN_INLINE  (1024)     /* min inline thresh */
+#define RPCRDMA_DEF_INLINE  (1024)     /* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (3068)     /* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
index 1b8a5a7876ce67abbc0d6cac4004f816c31bf221..e45abe7db9a627dda034f7d4616fad2d3ae1f920 100644 (file)
@@ -340,6 +340,7 @@ struct thermal_zone_of_device_ops {
        int (*get_temp)(void *, int *);
        int (*get_trend)(void *, long *);
        int (*set_emul_temp)(void *, int);
+       int (*set_trip_temp)(void *, int, int);
 };
 
 /**
index 526fb3d2e43a416fd14f47bc33b8f7e034426145..f28292d73ddba75eddf58c4e909967b77c041bdf 100644 (file)
@@ -108,7 +108,7 @@ TRACE_EVENT(kvm_ioapic_set_irq,
                __entry->coalesced      = coalesced;
        ),
 
-       TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s",
+       TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
                  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
@@ -129,7 +129,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
                __entry->e              = e;
        ),
 
-       TP_printk("dst %x vec=%u (%s|%s|%s%s)",
+       TP_printk("dst %x vec %u (%s|%s|%s%s)",
                  (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
@@ -151,7 +151,7 @@ TRACE_EVENT(kvm_msi_set_irq,
                __entry->data           = data;
        ),
 
-       TP_printk("dst %u vec %x (%s|%s|%s%s)",
+       TP_printk("dst %u vec %u (%s|%s|%s%s)",
                  (u8)(__entry->address >> 12), (u8)__entry->data,
                  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->address & (1<<2)) ? "logical" : "physical",
index c51afb71bfabc658b16be2773ad5a0e78ca1d09b..a26415b5151ce1866e805129de53331c76724813 100644 (file)
@@ -127,8 +127,11 @@ __SYSCALL(__NR_unlinkat, sys_unlinkat)
 __SYSCALL(__NR_symlinkat, sys_symlinkat)
 #define __NR_linkat 37
 __SYSCALL(__NR_linkat, sys_linkat)
+#ifdef __ARCH_WANT_RENAMEAT
+/* renameat is superseded with flags by renameat2 */
 #define __NR_renameat 38
 __SYSCALL(__NR_renameat, sys_renameat)
+#endif /* __ARCH_WANT_RENAMEAT */
 
 /* fs/namespace.c */
 #define __NR_umount2 39
index 43fc8d21347246fbc5f9c8c2213b7fda86632da8..36ce552cf6a928b7c045db60c3eacdf8e0ae8204 100644 (file)
@@ -862,6 +862,7 @@ enum perf_event_type {
 };
 
 #define PERF_MAX_STACK_DEPTH           127
+#define PERF_MAX_CONTEXTS_PER_STACK      8
 
 enum perf_callchain_context {
        PERF_CONTEXT_HV                 = (__u64)-32,
index 763bb6950402ba44ac2d2375589f5c4c08a37a58..0ec1da2ef6521658adc2656f120508242348042b 100644 (file)
@@ -228,7 +228,7 @@ struct nand_oobfree {
  * complete set of ECC information. The ioctl truncates the larger internal
  * structure to retain binary compatibility with the static declaration of the
  * ioctl. Note that the "MTD_MAX_..._ENTRIES" macros represent the max size of
- * the user struct, not the MAX size of the internal struct nand_ecclayout.
+ * the user struct, not the MAX size of the internal OOB layout representation.
  */
 struct nand_ecclayout_user {
        __u32 eccbytes;
index a9c4aefd5436edf12d6bda5ab775fbee3e7a8fd8..f755a602d4a176e006dc2cb5830d1c4812bac81f 100644 (file)
@@ -1306,6 +1306,17 @@ source "usr/Kconfig"
 
 endif
 
+choice
+       prompt "Compiler optimization level"
+       default CC_OPTIMIZE_FOR_PERFORMANCE
+
+config CC_OPTIMIZE_FOR_PERFORMANCE
+       bool "Optimize for performance"
+       help
+         This is the default optimization level for the kernel, building
+         with the "-O2" compiler flag for best performance and most
+         helpful compile-time warnings.
+
 config CC_OPTIMIZE_FOR_SIZE
        bool "Optimize for size"
        help
@@ -1314,6 +1325,8 @@ config CC_OPTIMIZE_FOR_SIZE
 
          If unsure, say N.
 
+endchoice
+
 config SYSCTL
        bool
 
@@ -2049,6 +2062,22 @@ config MODULE_COMPRESS_XZ
 
 endchoice
 
+config TRIM_UNUSED_KSYMS
+       bool "Trim unused exported kernel symbols"
+       depends on MODULES && !UNUSED_SYMBOLS
+       help
+         The kernel and some modules make many symbols available for
+         other modules to use via EXPORT_SYMBOL() and variants. Depending
+         on the set of modules being selected in your kernel configuration,
+         many of those exported symbols might never be used.
+
+         This option allows for unused exported symbols to be dropped from
+         the build. In turn, this provides the compiler more opportunities
+         (especially when using LTO) for optimizing the code and reducing
+         binary size.  This might have some security advantages as well.
+
+         If unsure say N.
+
 endif # MODULES
 
 config MODULES_TREE_LOOKUP
index c8ee35287bfea0e4673b117eaf96b68f676ecf2b..080a2dfb58004c06de7320c05e3404baab46562e 100644 (file)
@@ -136,7 +136,8 @@ u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;
 
-       trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+       trace = get_perf_callchain(regs, init_nr, kernel, user,
+                                  sysctl_perf_event_max_stack, false, false);
 
        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
index b9325e7dcba1088d74e2502177d2a22ececce4dc..179ef46409646fd98ed9528ebd92786ae63dfd71 100644 (file)
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
 };
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
 
 static inline size_t perf_callchain_entry__sizeof(void)
 {
        return (sizeof(struct perf_callchain_entry) +
-               sizeof(__u64) * sysctl_perf_event_max_stack);
+               sizeof(__u64) * (sysctl_perf_event_max_stack +
+                                sysctl_perf_event_max_contexts_per_stack));
 }
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -32,12 +34,12 @@ static DEFINE_MUTEX(callchain_mutex);
 static struct callchain_cpus_entries *callchain_cpus_entries;
 
 
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
                                  struct pt_regs *regs)
 {
 }
 
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
                                struct pt_regs *regs)
 {
 }
@@ -176,14 +178,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        if (!kernel && !user)
                return NULL;
 
-       return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+       return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-                  bool crosstask, bool add_mark)
+                  u32 max_stack, bool crosstask, bool add_mark)
 {
        struct perf_callchain_entry *entry;
+       struct perf_callchain_entry_ctx ctx;
        int rctx;
 
        entry = get_callchain_entry(&rctx);
@@ -193,12 +196,16 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
        if (!entry)
                goto exit_put;
 
-       entry->nr = init_nr;
+       ctx.entry     = entry;
+       ctx.max_stack = max_stack;
+       ctx.nr        = entry->nr = init_nr;
+       ctx.contexts       = 0;
+       ctx.contexts_maxed = false;
 
        if (kernel && !user_mode(regs)) {
                if (add_mark)
-                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-               perf_callchain_kernel(entry, regs);
+                       perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
+               perf_callchain_kernel(&ctx, regs);
        }
 
        if (user) {
@@ -214,8 +221,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                                goto exit_put;
 
                        if (add_mark)
-                               perf_callchain_store(entry, PERF_CONTEXT_USER);
-                       perf_callchain_user(entry, regs);
+                               perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+                       perf_callchain_user(&ctx, regs);
                }
        }
 
@@ -225,10 +232,15 @@ exit_put:
        return entry;
 }
 
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int new_value = sysctl_perf_event_max_stack, ret;
+       int *value = table->data;
+       int new_value = *value, ret;
        struct ctl_table new_table = *table;
 
        new_table.data = &new_value;
@@ -240,7 +252,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
        if (atomic_read(&nr_callchain_events))
                ret = -EBUSY;
        else
-               sysctl_perf_event_max_stack = new_value;
+               *value = new_value;
 
        mutex_unlock(&callchain_mutex);
 
index 47887bba944fe61f3284057ead7f9f9637ec3d19..5c2c355aa97ff4552c151f796d18cc8732017677 100644 (file)
@@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
 static void mmput_async_fn(struct work_struct *work)
 {
        struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
@@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm)
                schedule_work(&mm->async_put_work);
        }
 }
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
index c92e44855ddda18843a2b7c9102a323811d27917..1276aabaab5504508ce783d539c3be8acac84fee 100644 (file)
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
 
 config GCOV_PROFILE_ALL
        bool "Profile entire Kernel"
+       depends on !COMPILE_TEST
        depends on GCOV_KERNEL
        depends on ARCH_HAS_GCOV_PROFILE_ALL
        default n
index f231e0bb311ce0827d281d34f737a3a06405c072..bec0b647f9cc5291df0211c6532edd5ffe167eb2 100644 (file)
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
        free_percpu(brw->fast_read_ctr);
        brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely
index 404c0784b1fc32e7e5cdcf7d17010da638753067..7f2cae4620c7a949fdc294c06211884e2ca83afb 100644 (file)
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
        cookie = lockdep_pin_lock(&rq->lock);
 
        while (llist) {
+               int wake_flags = 0;
+
                p = llist_entry(llist, struct task_struct, wake_entry);
                llist = llist_next(llist);
-               /*
-                * See ttwu_queue(); we only call ttwu_queue_remote() when
-                * its a x-cpu wakeup.
-                */
-               ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+               if (p->sched_remote_wakeup)
+                       wake_flags = WF_MIGRATED;
+
+               ttwu_do_activate(rq, p, wake_flags, cookie);
        }
 
        lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
        irq_exit();
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
        struct rq *rq = cpu_rq(cpu);
 
+       p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
        if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
                if (!set_nr_if_polling(rq->idle))
                        smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 #if defined(CONFIG_SMP)
        if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                sched_clock_cpu(cpu); /* sync clocks x-cpu */
-               ttwu_queue_remote(p, cpu);
+               ttwu_queue_remote(p, cpu, wake_flags);
                return;
        }
 #endif
index 154ae3a51e8664edd68959611947a808d3c6475f..14c4aa25cc45d4591c339aede161be14ab388a22 100644 (file)
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cpufreq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy)
        mutex_unlock(&global_tunables_lock);
 
        sugov_policy_free(sg_policy);
-       pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+       pr_err("initialization failed (error %d)\n", ret);
        return ret;
 }
 
index 2effd84d83e3f5f8d8c2b0496d77cbe1a61b590e..87b2fc38398b1550921b694b12767b652d596e84 100644 (file)
@@ -1149,13 +1149,22 @@ static struct ctl_table kern_table[] = {
        },
        {
                .procname       = "perf_event_max_stack",
-               .data           = NULL, /* filled in by handler */
+               .data           = &sysctl_perf_event_max_stack,
                .maxlen         = sizeof(sysctl_perf_event_max_stack),
                .mode           = 0644,
                .proc_handler   = perf_event_max_stack_handler,
                .extra1         = &zero,
                .extra2         = &six_hundred_forty_kb,
        },
+       {
+               .procname       = "perf_event_max_contexts_per_stack",
+               .data           = &sysctl_perf_event_max_contexts_per_stack,
+               .maxlen         = sizeof(sysctl_perf_event_max_contexts_per_stack),
+               .mode           = 0644,
+               .proc_handler   = perf_event_max_stack_handler,
+               .extra1         = &zero,
+               .extra2         = &one_thousand,
+       },
 #endif
 #ifdef CONFIG_KMEMCHECK
        {
index 4a1515f4b452bd283d18ade33fe24d50ced6eff7..51a76af25c66355f3833965eb3c3edd8056d25e5 100644 (file)
@@ -657,9 +657,9 @@ static struct dma_debug_entry *dma_entry_alloc(void)
        spin_lock_irqsave(&free_entries_lock, flags);
 
        if (list_empty(&free_entries)) {
-               pr_err("DMA-API: debugging out of memory - disabling\n");
                global_disable = true;
                spin_unlock_irqrestore(&free_entries_lock, flags);
+               pr_err("DMA-API: debugging out of memory - disabling\n");
                return NULL;
        }
 
index 28cb4315fe57374119559fc186f72373b8457535..0cd522753ff5c6830673e6d9f0cb46d70b4c9c23 100644 (file)
 #define iterate_and_advance(i, n, v, I, B, K) {                        \
        if (unlikely(i->count < n))                             \
                n = i->count;                                   \
-       if (n) {                                                \
+       if (i->count) {                                         \
                size_t skip = i->iov_offset;                    \
                if (unlikely(i->type & ITER_BVEC)) {            \
                        const struct bio_vec *bvec;             \
index 2664c118b5d2eea6eee0aae18b9f62c333790a4e..22fa8189e4fc8bce3073ca4db3421aaadb5428e5 100644 (file)
@@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT
        default n
        depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        depends on MEMORY_HOTPLUG
+       depends on !FLATMEM
        help
          Ordinarily all struct pages are initialised during early boot in a
          single thread. On very large machines this can take a considerable
index 9665b1d4f318653942f80d46bd1b180c74f68174..00ae878b2a3860ce5b4bfe41967430e781ebabc1 100644 (file)
@@ -143,13 +143,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
                        return;
 
        /*
-        * Track node that only contains shadow entries.
+        * Track node that only contains shadow entries. DAX mappings contain
+        * no shadow entries and may contain other exceptional entries so skip
+        * those.
         *
         * Avoid acquiring the list_lru lock if already tracked.  The
         * list_empty() test is safe as node->private_list is
         * protected by mapping->tree_lock.
         */
-       if (!workingset_node_pages(node) &&
+       if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
            list_empty(&node->private_list)) {
                node->private_data = mapping;
                list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -580,14 +582,24 @@ static int page_cache_tree_insert(struct address_space *mapping,
                if (!radix_tree_exceptional_entry(p))
                        return -EEXIST;
 
-               if (WARN_ON(dax_mapping(mapping)))
-                       return -EINVAL;
-
-               if (shadowp)
-                       *shadowp = p;
                mapping->nrexceptional--;
-               if (node)
-                       workingset_node_shadows_dec(node);
+               if (!dax_mapping(mapping)) {
+                       if (shadowp)
+                               *shadowp = p;
+                       if (node)
+                               workingset_node_shadows_dec(node);
+               } else {
+                       /* DAX can replace empty locked entry with a hole */
+                       WARN_ON_ONCE(p !=
+                               (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+                                        RADIX_DAX_ENTRY_LOCK));
+                       /* DAX accounts exceptional entries as normal pages */
+                       if (node)
+                               workingset_node_pages_dec(node);
+                       /* Wakeup waiters for exceptional entry lock */
+                       dax_wake_mapping_entry_waiter(mapping, page->index,
+                                                     false);
+               }
        }
        radix_tree_replace_slot(slot, page);
        mapping->nrpages++;
index 7f7ac51d7faf678ec1f4686f4450e1adedb778c4..fb87923552ef2b90c65847fdb5c56644560ca7ec 100644 (file)
@@ -77,7 +77,6 @@ struct kasan_alloc_meta {
        struct kasan_track track;
        u32 state : 2;  /* enum kasan_state */
        u32 alloc_size : 30;
-       u32 reserved;
 };
 
 struct qlist_node {
index cf428d7b9a03a041516e613563d5d051dfc4c610..f6477a9dbe7ab42c98ff05bc4f2050d897eb6b0a 100644 (file)
@@ -1302,6 +1302,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
+                               /* Set a dummy value to return "true". */
+                               chosen = (void *) 1;
                                goto unlock;
                        case OOM_SCAN_OK:
                                break;
index a1b93d9e444981445e1b8e5f50ff914492156b8f..15322b73636b4272d6c84af6742a0822822301db 100644 (file)
@@ -63,6 +63,7 @@
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2492,8 +2493,6 @@ void unmap_mapping_range(struct address_space *mapping,
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
 
-
-       /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -2825,7 +2824,8 @@ oom:
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
                        pgoff_t pgoff, unsigned int flags,
-                       struct page *cow_page, struct page **page)
+                       struct page *cow_page, struct page **page,
+                       void **entry)
 {
        struct vm_fault vmf;
        int ret;
@@ -2840,8 +2840,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        ret = vma->vm_ops->fault(vma, &vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
-       if (!vmf.page)
-               goto out;
+       if (ret & VM_FAULT_DAX_LOCKED) {
+               *entry = vmf.entry;
+               return ret;
+       }
 
        if (unlikely(PageHWPoison(vmf.page))) {
                if (ret & VM_FAULT_LOCKED)
@@ -2855,7 +2857,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
- out:
        *page = vmf.page;
        return ret;
 }
@@ -3048,7 +3049,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
@@ -3071,6 +3072,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
        struct page *fault_page, *new_page;
+       void *fault_entry;
        struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
@@ -3088,26 +3090,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_OOM;
        }
 
-       ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
+                        &fault_entry);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
 
-       if (fault_page)
+       if (!(ret & VM_FAULT_DAX_LOCKED))
                copy_user_highpage(new_page, fault_page, address, vma);
        __SetPageUptodate(new_page);
 
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
-               if (fault_page) {
+               if (!(ret & VM_FAULT_DAX_LOCKED)) {
                        unlock_page(fault_page);
                        put_page(fault_page);
                } else {
-                       /*
-                        * The fault handler has no page to lock, so it holds
-                        * i_mmap_lock for read to protect against truncate.
-                        */
-                       i_mmap_unlock_read(vma->vm_file->f_mapping);
+                       dax_unlock_mapping_entry(vma->vm_file->f_mapping,
+                                                pgoff);
                }
                goto uncharge_out;
        }
@@ -3115,15 +3115,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mem_cgroup_commit_charge(new_page, memcg, false, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
-       if (fault_page) {
+       if (!(ret & VM_FAULT_DAX_LOCKED)) {
                unlock_page(fault_page);
                put_page(fault_page);
        } else {
-               /*
-                * The fault handler has no page to lock, so it holds
-                * i_mmap_lock for read to protect against truncate.
-                */
-               i_mmap_unlock_read(vma->vm_file->f_mapping);
+               dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
        }
        return ret;
 uncharge_out:
@@ -3143,7 +3139,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        int dirtied = 0;
        int ret, tmp;
 
-       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+       ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
index b00272810871e1e62cc9eca33a06b15b3576a8c7..4064f8f53daa73aaca001747cbded8d9b451d7f7 100644 (file)
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
        if (shmem_mapping(mapping))
                return;
 
-       spin_lock_irq(&mapping->tree_lock);
-
        if (dax_mapping(mapping)) {
-               if (radix_tree_delete_item(&mapping->page_tree, index, entry))
-                       mapping->nrexceptional--;
-       } else {
-               /*
-                * Regular page slots are stabilized by the page lock even
-                * without the tree itself locked.  These unlocked entries
-                * need verification under the tree lock.
-                */
-               if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
-                                       &slot))
-                       goto unlock;
-               if (*slot != entry)
-                       goto unlock;
-               radix_tree_replace_slot(slot, NULL);
-               mapping->nrexceptional--;
-               if (!node)
-                       goto unlock;
-               workingset_node_shadows_dec(node);
-               /*
-                * Don't track node without shadow entries.
-                *
-                * Avoid acquiring the list_lru lock if already untracked.
-                * The list_empty() test is safe as node->private_list is
-                * protected by mapping->tree_lock.
-                */
-               if (!workingset_node_shadows(node) &&
-                   !list_empty(&node->private_list))
-                       list_lru_del(&workingset_shadow_nodes,
-                                       &node->private_list);
-               __radix_tree_delete_node(&mapping->page_tree, node);
+               dax_delete_mapping_entry(mapping, index);
+               return;
        }
+       spin_lock_irq(&mapping->tree_lock);
+       /*
+        * Regular page slots are stabilized by the page lock even
+        * without the tree itself locked.  These unlocked entries
+        * need verification under the tree lock.
+        */
+       if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+                               &slot))
+               goto unlock;
+       if (*slot != entry)
+               goto unlock;
+       radix_tree_replace_slot(slot, NULL);
+       mapping->nrexceptional--;
+       if (!node)
+               goto unlock;
+       workingset_node_shadows_dec(node);
+       /*
+        * Don't track node without shadow entries.
+        *
+        * Avoid acquiring the list_lru lock if already untracked.
+        * The list_empty() test is safe as node->private_list is
+        * protected by mapping->tree_lock.
+        */
+       if (!workingset_node_shadows(node) &&
+           !list_empty(&node->private_list))
+               list_lru_del(&workingset_shadow_nodes,
+                               &node->private_list);
+       __radix_tree_delete_node(&mapping->page_tree, node);
 unlock:
        spin_unlock_irq(&mapping->tree_lock);
 }
index 72698db958e7e384d51996dfdfd6fa313ee6e2c5..b6d4f258cb53c20ba9d222765ba470d556da396d 100644 (file)
@@ -45,6 +45,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -483,16 +485,16 @@ static inline unsigned long zs_stat_get(struct size_class *class,
 
 #ifdef CONFIG_ZSMALLOC_STAT
 
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-       if (!debugfs_initialized())
-               return -ENODEV;
+       if (!debugfs_initialized()) {
+               pr_warn("debugfs not available, stat dir not created\n");
+               return;
+       }
 
        zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
        if (!zs_stat_root)
-               return -ENOMEM;
-
-       return 0;
+               pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
 }
 
 static void __exit zs_stat_exit(void)
@@ -577,8 +579,10 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
        struct dentry *entry;
 
-       if (!zs_stat_root)
+       if (!zs_stat_root) {
+               pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
                return;
+       }
 
        entry = debugfs_create_dir(name, zs_stat_root);
        if (!entry) {
@@ -592,7 +596,8 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
        if (!entry) {
                pr_warn("%s: debugfs file entry <%s> creation failed\n",
                                name, "classes");
-               return;
+               debugfs_remove_recursive(pool->stat_dentry);
+               pool->stat_dentry = NULL;
        }
 }
 
@@ -602,9 +607,8 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 
 #else /* CONFIG_ZSMALLOC_STAT */
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-       return 0;
 }
 
 static void __exit zs_stat_exit(void)
@@ -2011,17 +2015,10 @@ static int __init zs_init(void)
        zpool_register_driver(&zs_zpool_driver);
 #endif
 
-       ret = zs_stat_init();
-       if (ret) {
-               pr_err("zs stat initialization failed\n");
-               goto stat_fail;
-       }
+       zs_stat_init();
+
        return 0;
 
-stat_fail:
-#ifdef CONFIG_ZPOOL
-       zpool_unregister_driver(&zs_zpool_driver);
-#endif
 notifier_fail:
        zs_unregister_cpu_notifier();
 
index dcc18c6f7cf9b96c303407c5ac5f13f564bb224d..55d2bfee16d798f6f01ae44c1052e824e590a0d8 100644 (file)
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static bool have_mon_and_osd_map(struct ceph_client *client)
 {
        return client->monc.monmap && client->monc.monmap->epoch &&
               client->osdc.osdmap && client->osdc.osdmap->epoch;
index 139a9cb19b0c6ca9b07e1184241c33b08cdd141f..3773a4fa11e35ba89b5325fd18609308c267ee52 100644 (file)
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
        }
 }
 
+const char *ceph_osd_watch_op_name(int o)
+{
+       switch (o) {
+       case CEPH_OSD_WATCH_OP_UNWATCH:
+               return "unwatch";
+       case CEPH_OSD_WATCH_OP_WATCH:
+               return "watch";
+       case CEPH_OSD_WATCH_OP_RECONNECT:
+               return "reconnect";
+       case CEPH_OSD_WATCH_OP_PING:
+               return "ping";
+       default:
+               return "???";
+       }
+}
+
 const char *ceph_osd_state_name(int s)
 {
        switch (s) {
index b902fbc7863ef893222401a5f2b9bdcc8db87431..e77b04ca7802c046af213abd25650a11bcc6a6a3 100644 (file)
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
 {
        int i;
        struct ceph_client *client = s->private;
-       struct ceph_osdmap *map = client->osdc.osdmap;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
        struct rb_node *n;
 
        if (map == NULL)
                return 0;
 
-       seq_printf(s, "epoch %d\n", map->epoch);
-       seq_printf(s, "flags%s%s\n",
-                  (map->flags & CEPH_OSDMAP_NEARFULL) ?  " NEARFULL" : "",
-                  (map->flags & CEPH_OSDMAP_FULL) ?  " FULL" : "");
+       down_read(&osdc->lock);
+       seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
 
        for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
-               struct ceph_pg_pool_info *pool =
+               struct ceph_pg_pool_info *pi =
                        rb_entry(n, struct ceph_pg_pool_info, node);
 
-               seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
-                          pool->id, pool->pg_num, pool->pg_num_mask,
-                          pool->read_tier, pool->write_tier);
+               seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
+                          pi->id, pi->name, pi->type, pi->size, pi->min_size,
+                          pi->pg_num, pi->pg_num_mask, pi->flags,
+                          pi->last_force_request_resend, pi->read_tier,
+                          pi->write_tier);
        }
        for (i = 0; i < map->max_osd; i++) {
                struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
                           pg->pgid.seed, pg->primary_temp.osd);
        }
 
+       up_read(&osdc->lock);
        return 0;
 }
 
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
                                        CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
                seq_putc(s, '\n');
        }
+       seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
 
        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
                __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
        return 0;
 }
 
-static int osdc_show(struct seq_file *s, void *pp)
+static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 {
-       struct ceph_client *client = s->private;
-       struct ceph_osd_client *osdc = &client->osdc;
-       struct rb_node *p;
+       int i;
 
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               struct ceph_osd_request *req;
-               unsigned int i;
-               int opcode;
+       seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+       for (i = 0; i < t->up.size; i++)
+               seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
+       seq_printf(s, "]/%d\t[", t->up.primary);
+       for (i = 0; i < t->acting.size; i++)
+               seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
+       seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
+                  t->target_oid.name_len, t->target_oid.name, t->flags);
+       if (t->paused)
+               seq_puts(s, "\tP");
+}
 
-               req = rb_entry(p, struct ceph_osd_request, r_node);
+static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
+{
+       int i;
 
-               seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
-                          req->r_osd ? req->r_osd->o_osd : -1,
-                          req->r_pgid.pool, req->r_pgid.seed);
+       seq_printf(s, "%llu\t", req->r_tid);
+       dump_target(s, &req->r_t);
 
-               seq_printf(s, "%.*s", req->r_base_oid.name_len,
-                          req->r_base_oid.name);
+       seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
+                  le32_to_cpu(req->r_replay_version.epoch),
+                  le64_to_cpu(req->r_replay_version.version));
 
-               if (req->r_reassert_version.epoch)
-                       seq_printf(s, "\t%u'%llu",
-                          (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
-                          le64_to_cpu(req->r_reassert_version.version));
-               else
-                       seq_printf(s, "\t");
+       for (i = 0; i < req->r_num_ops; i++) {
+               struct ceph_osd_req_op *op = &req->r_ops[i];
+
+               seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+                          ceph_osd_op_name(op->op));
+               if (op->op == CEPH_OSD_OP_WATCH)
+                       seq_printf(s, "-%s",
+                                  ceph_osd_watch_op_name(op->watch.op));
+       }
+
+       seq_putc(s, '\n');
+}
+
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       mutex_lock(&osd->lock);
+       for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               dump_request(s, req);
+       }
+
+       mutex_unlock(&osd->lock);
+}
 
-               for (i = 0; i < req->r_num_ops; i++) {
-                       opcode = req->r_ops[i].op;
-                       seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
-                                  ceph_osd_op_name(opcode));
-               }
+static void dump_linger_request(struct seq_file *s,
+                               struct ceph_osd_linger_request *lreq)
+{
+       seq_printf(s, "%llu\t", lreq->linger_id);
+       dump_target(s, &lreq->t);
+
+       seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
+                  lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
+                  lreq->last_error);
+}
+
+static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       mutex_lock(&osd->lock);
+       for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               dump_linger_request(s, lreq);
+       }
+
+       mutex_unlock(&osd->lock);
+}
 
-               seq_printf(s, "\n");
+static int osdc_show(struct seq_file *s, void *pp)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct rb_node *n;
+
+       down_read(&osdc->lock);
+       seq_printf(s, "REQUESTS %d homeless %d\n",
+                  atomic_read(&osdc->num_requests),
+                  atomic_read(&osdc->num_homeless));
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               dump_requests(s, osd);
        }
-       mutex_unlock(&osdc->request_mutex);
+       dump_requests(s, &osdc->homeless_osd);
+
+       seq_puts(s, "LINGER REQUESTS\n");
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               dump_linger_requests(s, osd);
+       }
+       dump_linger_requests(s, &osdc->homeless_osd);
+
+       up_read(&osdc->lock);
        return 0;
 }
 
index cf638c009cfabe9e95d114c1e187785db46f958c..37c38a7fb5c586bf7698daaf71308b15bd973edb 100644 (file)
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
        BUG_ON(num < 1); /* monmap sub is always there */
        ceph_encode_32(&p, num);
        for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
-               const char *s = ceph_sub_str[i];
+               char buf[32];
+               int len;
 
                if (!monc->subs[i].want)
                        continue;
 
-               dout("%s %s start %llu flags 0x%x\n", __func__, s,
+               len = sprintf(buf, "%s", ceph_sub_str[i]);
+               if (i == CEPH_SUB_MDSMAP &&
+                   monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
+                       len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
+
+               dout("%s %s start %llu flags 0x%x\n", __func__, buf,
                     le64_to_cpu(monc->subs[i].item.start),
                     monc->subs[i].item.flags);
-               ceph_encode_string(&p, end, s, strlen(s));
+               ceph_encode_string(&p, end, buf, len);
                memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
                p += sizeof(monc->subs[i].item);
        }
 
-       BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+       BUG_ON(p > end);
        msg->front.iov_len = p - msg->front.iov_base;
        msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
        ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 }
 EXPORT_SYMBOL(ceph_monc_got_map);
 
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+void ceph_monc_renew_subs(struct ceph_mon_client *monc)
 {
-       dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
        mutex_lock(&monc->mutex);
-       if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
-                                monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
-               __send_subscribe(monc);
+       __send_subscribe(monc);
        mutex_unlock(&monc->mutex);
 }
-EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+EXPORT_SYMBOL(ceph_monc_renew_subs);
 
 /*
  * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
 /*
  * generic requests (currently statfs, mon_get_version)
  */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-       struct ceph_mon_client *monc, u64 tid)
-{
-       struct ceph_mon_generic_request *req;
-       struct rb_node *n = monc->generic_request_tree.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_mon_generic_request, node);
-               if (tid < req->tid)
-                       n = n->rb_left;
-               else if (tid > req->tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-                           struct ceph_mon_generic_request *new)
-{
-       struct rb_node **p = &monc->generic_request_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_mon_generic_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_mon_generic_request, node);
-               if (new->tid < req->tid)
-                       p = &(*p)->rb_left;
-               else if (new->tid > req->tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &monc->generic_request_tree);
-}
+DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
 
 static void release_generic_request(struct kref *kref)
 {
        struct ceph_mon_generic_request *req =
                container_of(kref, struct ceph_mon_generic_request, kref);
 
+       dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
+            req->reply);
+       WARN_ON(!RB_EMPTY_NODE(&req->node));
+
        if (req->reply)
                ceph_msg_put(req->reply);
        if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
 
 static void put_generic_request(struct ceph_mon_generic_request *req)
 {
-       kref_put(&req->kref, release_generic_request);
+       if (req)
+               kref_put(&req->kref, release_generic_request);
 }
 
 static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
        kref_get(&req->kref);
 }
 
+static struct ceph_mon_generic_request *
+alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
+{
+       struct ceph_mon_generic_request *req;
+
+       req = kzalloc(sizeof(*req), gfp);
+       if (!req)
+               return NULL;
+
+       req->monc = monc;
+       kref_init(&req->kref);
+       RB_CLEAR_NODE(&req->node);
+       init_completion(&req->completion);
+
+       dout("%s greq %p\n", __func__, req);
+       return req;
+}
+
+static void register_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+
+       WARN_ON(req->tid);
+
+       get_generic_request(req);
+       req->tid = ++monc->last_tid;
+       insert_generic_request(&monc->generic_request_tree, req);
+}
+
+static void send_generic_request(struct ceph_mon_client *monc,
+                                struct ceph_mon_generic_request *req)
+{
+       WARN_ON(!req->tid);
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       req->request->hdr.tid = cpu_to_le64(req->tid);
+       ceph_con_send(&monc->con, ceph_msg_get(req->request));
+}
+
+static void __finish_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       erase_generic_request(&monc->generic_request_tree, req);
+
+       ceph_msg_revoke(req->request);
+       ceph_msg_revoke_incoming(req->reply);
+}
+
+static void finish_generic_request(struct ceph_mon_generic_request *req)
+{
+       __finish_generic_request(req);
+       put_generic_request(req);
+}
+
+static void complete_generic_request(struct ceph_mon_generic_request *req)
+{
+       if (req->complete_cb)
+               req->complete_cb(req);
+       else
+               complete_all(&req->completion);
+       put_generic_request(req);
+}
+
+static void cancel_generic_request(struct ceph_mon_generic_request *req)
+{
+       struct ceph_mon_client *monc = req->monc;
+       struct ceph_mon_generic_request *lookup_req;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       /* Under monc->mutex: unregister @req if it is still in the tree. */
+       mutex_lock(&monc->mutex);
+       lookup_req = lookup_generic_request(&monc->generic_request_tree,
+                                           req->tid);
+       if (lookup_req) {
+               WARN_ON(lookup_req != req);
+               finish_generic_request(req);
+       }
+
+       mutex_unlock(&monc->mutex);
+}
+
+static int wait_generic_request(struct ceph_mon_generic_request *req)
+{
+       int ret;
+
+       dout("%s greq %p tid %llu\n", __func__, req, req->tid);
+       ret = wait_for_completion_interruptible(&req->completion);
+       if (ret)
+               cancel_generic_request(req);
+       else
+               ret = req->result; /* completed */
+
+       return ret;
+}
+
 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
                                         struct ceph_msg_header *hdr,
                                         int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
        struct ceph_msg *m;
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
+       req = lookup_generic_request(&monc->generic_request_tree, tid);
        if (!req) {
                dout("get_generic_reply %lld dne\n", tid);
                *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
        return m;
 }
 
-static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
-                               struct ceph_mon_generic_request *req)
-{
-       int err;
-
-       /* register request */
-       req->tid = tid != 0 ? tid : ++monc->last_tid;
-       req->request->hdr.tid = cpu_to_le64(req->tid);
-       __insert_generic_request(monc, req);
-       monc->num_generic_requests++;
-       ceph_con_send(&monc->con, ceph_msg_get(req->request));
-       mutex_unlock(&monc->mutex);
-
-       err = wait_for_completion_interruptible(&req->completion);
-
-       mutex_lock(&monc->mutex);
-       rb_erase(&req->node, &monc->generic_request_tree);
-       monc->num_generic_requests--;
-
-       if (!err)
-               err = req->result;
-       return err;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-                             struct ceph_mon_generic_request *req)
-{
-       int err;
-
-       mutex_lock(&monc->mutex);
-       err = __do_generic_request(monc, 0, req);
-       mutex_unlock(&monc->mutex);
-
-       return err;
-}
-
 /*
  * statfs
  */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
        struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
        u64 tid = le64_to_cpu(msg->hdr.tid);
 
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
        if (msg->front.iov_len != sizeof(*reply))
                goto bad;
-       dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
-       if (req) {
-               *(struct ceph_statfs *)req->buf = reply->st;
-               req->result = 0;
-               get_generic_request(req);
+       req = lookup_generic_request(&monc->generic_request_tree, tid);
+       if (!req) {
+               mutex_unlock(&monc->mutex);
+               return;
        }
+
+       req->result = 0;
+       *req->u.st = reply->st; /* struct */
+       __finish_generic_request(req);
        mutex_unlock(&monc->mutex);
-       if (req) {
-               complete_all(&req->completion);
-               put_generic_request(req);
-       }
+
+       complete_generic_request(req);
        return;
 
 bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 {
        struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs *h;
-       int err;
+       int ret = -ENOMEM;
 
-       req = kzalloc(sizeof(*req), GFP_NOFS);
+       req = alloc_generic_request(monc, GFP_NOFS);
        if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = buf;
-       init_completion(&req->completion);
+               goto out;
 
-       err = -ENOMEM;
        req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
                                    true);
        if (!req->request)
                goto out;
-       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
-                                 true);
+
+       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
        if (!req->reply)
                goto out;
 
+       req->u.st = buf;
+
+       mutex_lock(&monc->mutex);
+       register_generic_request(req);
        /* fill out request */
        h = req->request->front.iov_base;
        h->monhdr.have_version = 0;
        h->monhdr.session_mon = cpu_to_le16(-1);
        h->monhdr.session_mon_tid = 0;
        h->fsid = monc->monmap->fsid;
+       send_generic_request(monc, req);
+       mutex_unlock(&monc->mutex);
 
-       err = do_generic_request(monc, req);
-
+       ret = wait_generic_request(req);
 out:
        put_generic_request(req);
-       return err;
+       return ret;
 }
 EXPORT_SYMBOL(ceph_monc_do_statfs);
 
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
        void *end = p + msg->front_alloc_len;
        u64 handle;
 
-       dout("%s %p tid %llu\n", __func__, msg, tid);
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
        ceph_decode_need(&p, end, 2*sizeof(u64), bad);
        handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
                goto bad;
 
        mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, handle);
-       if (req) {
-               *(u64 *)req->buf = ceph_decode_64(&p);
-               req->result = 0;
-               get_generic_request(req);
+       req = lookup_generic_request(&monc->generic_request_tree, handle);
+       if (!req) {
+               mutex_unlock(&monc->mutex);
+               return;
        }
+
+       req->result = 0;
+       req->u.newest = ceph_decode_64(&p);
+       __finish_generic_request(req);
        mutex_unlock(&monc->mutex);
-       if (req) {
-               complete_all(&req->completion);
-               put_generic_request(req);
-       }
 
+       complete_generic_request(req);
        return;
+
 bad:
        pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
        ceph_msg_dump(msg);
 }
 
-/*
- * Send MMonGetVersion and wait for the reply.
- *
- * @what: one of "mdsmap", "osdmap" or "monmap"
- */
-int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
-                            u64 *newest)
+static struct ceph_mon_generic_request *
+__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                       ceph_monc_callback_t cb, u64 private_data)
 {
        struct ceph_mon_generic_request *req;
-       void *p, *end;
-       u64 tid;
-       int err;
 
-       req = kzalloc(sizeof(*req), GFP_NOFS);
+       req = alloc_generic_request(monc, GFP_NOIO);
        if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = newest;
-       init_completion(&req->completion);
+               goto err_put_req;
 
        req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
                                    sizeof(u64) + sizeof(u32) + strlen(what),
-                                   GFP_NOFS, true);
-       if (!req->request) {
-               err = -ENOMEM;
-               goto out;
-       }
+                                   GFP_NOIO, true);
+       if (!req->request)
+               goto err_put_req;
 
-       req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
-                                 GFP_NOFS, true);
-       if (!req->reply) {
-               err = -ENOMEM;
-               goto out;
-       }
+       req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
+                                 true);
+       if (!req->reply)
+               goto err_put_req;
 
-       p = req->request->front.iov_base;
-       end = p + req->request->front_alloc_len;
+       req->complete_cb = cb;
+       req->private_data = private_data;
 
-       /* fill out request */
        mutex_lock(&monc->mutex);
-       tid = ++monc->last_tid;
-       ceph_encode_64(&p, tid); /* handle */
-       ceph_encode_string(&p, end, what, strlen(what));
+       register_generic_request(req);
+       {
+               void *p = req->request->front.iov_base;
+               void *const end = p + req->request->front_alloc_len;
+
+               ceph_encode_64(&p, req->tid); /* handle */
+               ceph_encode_string(&p, end, what, strlen(what));
+               WARN_ON(p != end);
+       }
+       send_generic_request(monc, req);
+       mutex_unlock(&monc->mutex);
 
-       err = __do_generic_request(monc, tid, req);
+       return req;
 
-       mutex_unlock(&monc->mutex);
-out:
+err_put_req:
        put_generic_request(req);
-       return err;
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+                         u64 *newest)
+{
+       struct ceph_mon_generic_request *req;
+       int ret;
+
+       req = __ceph_monc_get_version(monc, what, NULL, 0);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       ret = wait_generic_request(req);
+       if (!ret)
+               *newest = req->u.newest;
+
+       put_generic_request(req);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_monc_do_get_version);
+EXPORT_SYMBOL(ceph_monc_get_version);
+
+/*
+ * Send MMonGetVersion and return without waiting; @cb runs on the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+                               ceph_monc_callback_t cb, u64 private_data)
+{
+       struct ceph_mon_generic_request *req;
+
+       req = __ceph_monc_get_version(monc, what, cb, private_data);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       put_generic_request(req);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_monc_get_version_async);
 
 /*
  * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        if (!monc->m_subscribe_ack)
                goto out_auth;
 
-       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
                                         true);
        if (!monc->m_subscribe)
                goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
        monc->generic_request_tree = RB_ROOT;
-       monc->num_generic_requests = 0;
        monc->last_tid = 0;
 
+       monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
+
        return 0;
 
 out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 
        ceph_auth_destroy(monc->auth);
 
+       WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
+
        ceph_msg_put(monc->m_auth);
        ceph_msg_put(monc->m_auth_reply);
        ceph_msg_put(monc->m_subscribe);
index 40a53a70efdffe89085f4cfce89f8d59c5420521..0160d7d09a1e975b3cb8311d202f38462401efbf 100644 (file)
 #include <linux/ceph/auth.h>
 #include <linux/ceph/pagelist.h>
 
-#define OSD_OP_FRONT_LEN       4096
 #define OSD_OPREPLY_FRONT_LEN  512
 
 static struct kmem_cache       *ceph_osd_request_cache;
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
-                              struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-                                       struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-static void __send_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
  * channel with an OSD is reset.
  */
 
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void link_linger(struct ceph_osd *osd,
+                       struct ceph_osd_linger_request *lreq);
+static void unlink_linger(struct ceph_osd *osd,
+                         struct ceph_osd_linger_request *lreq);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+       bool wrlocked = true;
+
+       if (unlikely(down_read_trylock(sem))) {
+               wrlocked = false;
+               up_read(sem);
+       }
+
+       return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+       WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+       WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       WARN_ON(!(mutex_is_locked(&osd->lock) &&
+                 rwsem_is_locked(&osdc->lock)) &&
+               !rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
+{
+       WARN_ON(!mutex_is_locked(&lreq->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
+#endif
+
 /*
  * calculate the mapping of a file extent onto an object, and fill out the
  * request accordingly.  shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
 }
 EXPORT_SYMBOL(osd_req_op_extent_osd_data);
 
-struct ceph_osd_data *
-osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
-                       unsigned int which)
-{
-       return osd_req_op_data(osd_req, which, cls, response_data);
-}
-EXPORT_SYMBOL(osd_req_op_cls_response_data);   /* ??? */
-
 void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
                        unsigned int which, struct page **pages,
                        u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
 
        osd_data = osd_req_op_data(osd_req, which, cls, request_data);
        ceph_osd_data_pagelist_init(osd_data, pagelist);
+       osd_req->r_ops[which].cls.indata_len += pagelist->length;
+       osd_req->r_ops[which].indata_len += pagelist->length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
 
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
        osd_data = osd_req_op_data(osd_req, which, cls, request_data);
        ceph_osd_data_pages_init(osd_data, pages, length, alignment,
                                pages_from_pool, own_pages);
+       osd_req->r_ops[which].cls.indata_len += length;
+       osd_req->r_ops[which].indata_len += length;
 }
 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
 
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
        case CEPH_OSD_OP_STAT:
                ceph_osd_data_release(&op->raw_data_in);
                break;
+       case CEPH_OSD_OP_NOTIFY_ACK:
+               ceph_osd_data_release(&op->notify_ack.request_data);
+               break;
+       case CEPH_OSD_OP_NOTIFY:
+               ceph_osd_data_release(&op->notify.request_data);
+               ceph_osd_data_release(&op->notify.response_data);
+               break;
        default:
                break;
        }
 }
 
+/*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+       ceph_oid_init(&t->base_oid);
+       ceph_oloc_init(&t->base_oloc);
+       ceph_oid_init(&t->target_oid);
+       ceph_oloc_init(&t->target_oloc);
+
+       ceph_osds_init(&t->acting);
+       ceph_osds_init(&t->up);
+       t->size = -1;
+       t->min_size = -1;
+
+       t->osd = CEPH_HOMELESS_OSD;
+}
+
+static void target_copy(struct ceph_osd_request_target *dest,
+                       const struct ceph_osd_request_target *src)
+{
+       ceph_oid_copy(&dest->base_oid, &src->base_oid);
+       ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
+       ceph_oid_copy(&dest->target_oid, &src->target_oid);
+       ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
+
+       dest->pgid = src->pgid; /* struct */
+       dest->pg_num = src->pg_num;
+       dest->pg_num_mask = src->pg_num_mask;
+       ceph_osds_copy(&dest->acting, &src->acting);
+       ceph_osds_copy(&dest->up, &src->up);
+       dest->size = src->size;
+       dest->min_size = src->min_size;
+       dest->sort_bitwise = src->sort_bitwise;
+
+       dest->flags = src->flags;
+       dest->paused = src->paused;
+
+       dest->osd = src->osd;
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+       ceph_oid_destroy(&t->base_oid);
+       ceph_oid_destroy(&t->target_oid);
+}
+
 /*
  * requests
  */
+static void request_release_checks(struct ceph_osd_request *req)
+{
+       WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+       WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
+       WARN_ON(!list_empty(&req->r_unsafe_item));
+       WARN_ON(req->r_osd);
+}
+
 static void ceph_osdc_release_request(struct kref *kref)
 {
        struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
 
        dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
             req->r_request, req->r_reply);
-       WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-       WARN_ON(!list_empty(&req->r_req_lru_item));
-       WARN_ON(!list_empty(&req->r_osd_item));
-       WARN_ON(!list_empty(&req->r_linger_item));
-       WARN_ON(!list_empty(&req->r_linger_osd_item));
-       WARN_ON(req->r_osd);
+       request_release_checks(req);
 
        if (req->r_request)
                ceph_msg_put(req->r_request);
-       if (req->r_reply) {
-               ceph_msg_revoke_incoming(req->r_reply);
+       if (req->r_reply)
                ceph_msg_put(req->r_reply);
-       }
 
        for (which = 0; which < req->r_num_ops; which++)
                osd_req_op_data_release(req, which);
 
+       target_destroy(&req->r_t);
        ceph_put_snap_context(req->r_snapc);
+
        if (req->r_mempool)
                mempool_free(req, req->r_osdc->req_mempool);
        else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
 
 void ceph_osdc_put_request(struct ceph_osd_request *req)
 {
-       dout("%s %p (was %d)\n", __func__, req,
-            atomic_read(&req->r_kref.refcount));
-       kref_put(&req->r_kref, ceph_osdc_release_request);
+       if (req) {
+               dout("%s %p (was %d)\n", __func__, req,
+                    atomic_read(&req->r_kref.refcount));
+               kref_put(&req->r_kref, ceph_osdc_release_request);
+       }
 }
 EXPORT_SYMBOL(ceph_osdc_put_request);
 
+static void request_init(struct ceph_osd_request *req)
+{
+       /* req only, each op is zeroed in _osd_req_op_init() */
+       memset(req, 0, sizeof(*req));
+
+       kref_init(&req->r_kref);
+       init_completion(&req->r_completion);
+       init_completion(&req->r_safe_completion);
+       RB_CLEAR_NODE(&req->r_node);
+       RB_CLEAR_NODE(&req->r_mc_node);
+       INIT_LIST_HEAD(&req->r_unsafe_item);
+
+       target_init(&req->r_t);
+}
+
+/*
+ * This is ugly, but it allows us to reuse linger registration and ping
+ * requests, keeping the structure of the code around send_linger{_ping}()
+ * reasonable.  Setting up a min_nr=2 mempool for each linger request
+ * and dealing with copying ops (this blasts req only, watch op remains
+ * intact) isn't any better.
+ */
+static void request_reinit(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       bool mempool = req->r_mempool;
+       unsigned int num_ops = req->r_num_ops;
+       u64 snapid = req->r_snapid;
+       struct ceph_snap_context *snapc = req->r_snapc;
+       bool linger = req->r_linger;
+       struct ceph_msg *request_msg = req->r_request;
+       struct ceph_msg *reply_msg = req->r_reply;
+
+       dout("%s req %p\n", __func__, req);
+       WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
+       request_release_checks(req);
+
+       WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
+       WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
+       target_destroy(&req->r_t);
+
+       request_init(req);
+       req->r_osdc = osdc;
+       req->r_mempool = mempool;
+       req->r_num_ops = num_ops;
+       req->r_snapid = snapid;
+       req->r_snapc = snapc;
+       req->r_linger = linger;
+       req->r_request = request_msg;
+       req->r_reply = reply_msg;
+}
+
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                                               struct ceph_snap_context *snapc,
                                               unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                                               gfp_t gfp_flags)
 {
        struct ceph_osd_request *req;
-       struct ceph_msg *msg;
-       size_t msg_size;
 
        if (use_mempool) {
                BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
        if (unlikely(!req))
                return NULL;
 
-       /* req only, each op is zeroed in _osd_req_op_init() */
-       memset(req, 0, sizeof(*req));
-
+       request_init(req);
        req->r_osdc = osdc;
        req->r_mempool = use_mempool;
        req->r_num_ops = num_ops;
+       req->r_snapid = CEPH_NOSNAP;
+       req->r_snapc = ceph_get_snap_context(snapc);
 
-       kref_init(&req->r_kref);
-       init_completion(&req->r_completion);
-       init_completion(&req->r_safe_completion);
-       RB_CLEAR_NODE(&req->r_node);
-       INIT_LIST_HEAD(&req->r_unsafe_item);
-       INIT_LIST_HEAD(&req->r_linger_item);
-       INIT_LIST_HEAD(&req->r_linger_osd_item);
-       INIT_LIST_HEAD(&req->r_req_lru_item);
-       INIT_LIST_HEAD(&req->r_osd_item);
-
-       req->r_base_oloc.pool = -1;
-       req->r_target_oloc.pool = -1;
+       dout("%s req %p\n", __func__, req);
+       return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-       msg_size = OSD_OPREPLY_FRONT_LEN;
-       if (num_ops > CEPH_OSD_SLAB_OPS) {
-               /* ceph_osd_op and rval */
-               msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
-                           (sizeof(struct ceph_osd_op) + 4);
-       }
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_msg *msg;
+       int msg_size;
 
-       /* create reply message */
-       if (use_mempool)
-               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-       else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
-                                  gfp_flags, true);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
-       req->r_reply = msg;
+       WARN_ON(ceph_oid_empty(&req->r_base_oid));
 
+       /* create request message */
        msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
        msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
        msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
        msg_size += 1 + 8 + 4 + 4; /* pgid */
-       msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
-       msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+       msg_size += 4 + req->r_base_oid.name_len; /* oid */
+       msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
        msg_size += 8; /* snapid */
        msg_size += 8; /* snap_seq */
-       msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+       msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
        msg_size += 4; /* retry_attempt */
 
-       /* create request message; allow space for oid */
-       if (use_mempool)
+       if (req->r_mempool)
                msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
        else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
+               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+       if (!msg)
+               return -ENOMEM;
 
        memset(msg->front.iov_base, 0, msg->front.iov_len);
-
        req->r_request = msg;
 
-       return req;
+       /* create reply message */
+       msg_size = OSD_OPREPLY_FRONT_LEN;
+       msg_size += req->r_base_oid.name_len;
+       msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
+
+       if (req->r_mempool)
+               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+       else
+               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+       if (!msg)
+               return -ENOMEM;
+
+       req->r_reply = msg;
+
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_alloc_request);
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
 
 static bool osd_req_opcode_valid(u16 opcode)
 {
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 
        osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
 
-       op->cls.argc = 0;       /* currently unused */
-
        op->indata_len = payload_len;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
-void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
-                               unsigned int which, u16 opcode,
-                               u64 cookie, u64 version, int flag)
+/*
+ * @watch_opcode: CEPH_OSD_WATCH_OP_*
+ */
+static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
+                                 u64 cookie, u8 watch_opcode)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-                                                     opcode, 0);
-
-       BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+       struct ceph_osd_req_op *op;
 
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
        op->watch.cookie = cookie;
-       op->watch.ver = version;
-       if (opcode == CEPH_OSD_OP_WATCH && flag)
-               op->watch.flag = (u8)1;
+       op->watch.op = watch_opcode;
+       op->watch.gen = 0;
 }
-EXPORT_SYMBOL(osd_req_op_watch_init);
 
 void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
                                unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
        }
 }
 
-static u64 osd_req_encode_op(struct ceph_osd_request *req,
-                             struct ceph_osd_op *dst, unsigned int which)
+static u32 osd_req_encode_op(struct ceph_osd_op *dst,
+                            const struct ceph_osd_req_op *src)
 {
-       struct ceph_osd_req_op *src;
-       struct ceph_osd_data *osd_data;
-       u64 request_data_len = 0;
-       u64 data_length;
-
-       BUG_ON(which >= req->r_num_ops);
-       src = &req->r_ops[which];
        if (WARN_ON(!osd_req_opcode_valid(src->op))) {
                pr_err("unrecognized osd opcode %d\n", src->op);
 
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 
        switch (src->op) {
        case CEPH_OSD_OP_STAT:
-               osd_data = &src->raw_data_in;
-               ceph_osdc_msg_data_add(req->r_reply, osd_data);
                break;
        case CEPH_OSD_OP_READ:
        case CEPH_OSD_OP_WRITE:
        case CEPH_OSD_OP_WRITEFULL:
        case CEPH_OSD_OP_ZERO:
        case CEPH_OSD_OP_TRUNCATE:
-               if (src->op == CEPH_OSD_OP_WRITE ||
-                   src->op == CEPH_OSD_OP_WRITEFULL)
-                       request_data_len = src->extent.length;
                dst->extent.offset = cpu_to_le64(src->extent.offset);
                dst->extent.length = cpu_to_le64(src->extent.length);
                dst->extent.truncate_size =
                        cpu_to_le64(src->extent.truncate_size);
                dst->extent.truncate_seq =
                        cpu_to_le32(src->extent.truncate_seq);
-               osd_data = &src->extent.osd_data;
-               if (src->op == CEPH_OSD_OP_WRITE ||
-                   src->op == CEPH_OSD_OP_WRITEFULL)
-                       ceph_osdc_msg_data_add(req->r_request, osd_data);
-               else
-                       ceph_osdc_msg_data_add(req->r_reply, osd_data);
                break;
        case CEPH_OSD_OP_CALL:
                dst->cls.class_len = src->cls.class_len;
                dst->cls.method_len = src->cls.method_len;
-               osd_data = &src->cls.request_info;
-               ceph_osdc_msg_data_add(req->r_request, osd_data);
-               BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
-               request_data_len = osd_data->pagelist->length;
-
-               osd_data = &src->cls.request_data;
-               data_length = ceph_osd_data_length(osd_data);
-               if (data_length) {
-                       BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
-                       dst->cls.indata_len = cpu_to_le32(data_length);
-                       ceph_osdc_msg_data_add(req->r_request, osd_data);
-                       src->indata_len += data_length;
-                       request_data_len += data_length;
-               }
-               osd_data = &src->cls.response_data;
-               ceph_osdc_msg_data_add(req->r_reply, osd_data);
+               dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
                break;
        case CEPH_OSD_OP_STARTSYNC:
                break;
-       case CEPH_OSD_OP_NOTIFY_ACK:
        case CEPH_OSD_OP_WATCH:
                dst->watch.cookie = cpu_to_le64(src->watch.cookie);
-               dst->watch.ver = cpu_to_le64(src->watch.ver);
-               dst->watch.flag = src->watch.flag;
+               dst->watch.ver = cpu_to_le64(0);
+               dst->watch.op = src->watch.op;
+               dst->watch.gen = cpu_to_le32(src->watch.gen);
+               break;
+       case CEPH_OSD_OP_NOTIFY_ACK:
+               break;
+       case CEPH_OSD_OP_NOTIFY:
+               dst->notify.cookie = cpu_to_le64(src->notify.cookie);
                break;
        case CEPH_OSD_OP_SETALLOCHINT:
                dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
                dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
                dst->xattr.cmp_op = src->xattr.cmp_op;
                dst->xattr.cmp_mode = src->xattr.cmp_mode;
-               osd_data = &src->xattr.osd_data;
-               ceph_osdc_msg_data_add(req->r_request, osd_data);
-               request_data_len = osd_data->pagelist->length;
                break;
        case CEPH_OSD_OP_CREATE:
        case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
        dst->flags = cpu_to_le32(src->flags);
        dst->payload_len = cpu_to_le32(src->indata_len);
 
-       return request_data_len;
+       return src->indata_len;
 }
 
 /*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
        req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
                                        GFP_NOFS);
-       if (!req)
-               return ERR_PTR(-ENOMEM);
-
-       req->r_flags = flags;
+       if (!req) {
+               r = -ENOMEM;
+               goto fail;
+       }
 
        /* calculate max write size */
        r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
-       if (r < 0) {
-               ceph_osdc_put_request(req);
-               return ERR_PTR(r);
-       }
+       if (r)
+               goto fail;
 
        if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
                osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                                       truncate_size, truncate_seq);
        }
 
+       req->r_flags = flags;
        req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+       ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
 
-       snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
-                "%llx.%08llx", vino.ino, objnum);
-       req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+       req->r_snapid = vino.snap;
+       if (flags & CEPH_OSD_FLAG_WRITE)
+               req->r_data_offset = off;
+
+       r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+       if (r)
+               goto fail;
 
        return req;
+
+fail:
+       ceph_osdc_put_request(req);
+       return ERR_PTR(r);
 }
 EXPORT_SYMBOL(ceph_osdc_new_request);
 
 /*
  * We keep osd requests in an rbtree, sorted by ->r_tid.
  */
-static void __insert_request(struct ceph_osd_client *osdc,
-                            struct ceph_osd_request *new)
-{
-       struct rb_node **p = &osdc->requests.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_osd_request, r_node);
-               if (new->r_tid < req->r_tid)
-                       p = &(*p)->rb_left;
-               else if (new->r_tid > req->r_tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->r_node, parent, p);
-       rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-                                                u64 tid)
-{
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid)
-                       n = n->rb_left;
-               else if (tid > req->r_tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
+DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
+DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
 
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-                   u64 tid)
+static bool osd_homeless(struct ceph_osd *osd)
 {
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid) {
-                       if (!n->rb_left)
-                               return req;
-                       n = n->rb_left;
-               } else if (tid > req->r_tid) {
-                       n = n->rb_right;
-               } else {
-                       return req;
-               }
-       }
-       return NULL;
+       return osd->o_osd == CEPH_HOMELESS_OSD;
 }
 
-static void __kick_linger_request(struct ceph_osd_request *req)
+static bool osd_registered(struct ceph_osd *osd)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
-       struct ceph_osd *osd = req->r_osd;
-
-       /*
-        * Linger requests need to be resent with a new tid to avoid
-        * the dup op detection logic on the OSDs.  Achieve this with
-        * a re-register dance instead of open-coding.
-        */
-       ceph_osdc_get_request(req);
-       if (!list_empty(&req->r_linger_item))
-               __unregister_linger_request(osdc, req);
-       else
-               __unregister_request(osdc, req);
-       __register_request(osdc, req);
-       ceph_osdc_put_request(req);
-
-       /*
-        * Unless request has been registered as both normal and
-        * lingering, __unregister{,_linger}_request clears r_osd.
-        * However, here we need to preserve r_osd to make sure we
-        * requeue on the same OSD.
-        */
-       WARN_ON(req->r_osd || !osd);
-       req->r_osd = osd;
+       verify_osdc_locked(osd->o_osdc);
 
-       dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
-       __enqueue_request(req);
+       return !RB_EMPTY_NODE(&osd->o_node);
 }
 
 /*
- * Resubmit requests pending on the given osd.
+ * Assumes @osd is zero-initialized.
  */
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
-                               struct ceph_osd *osd)
+static void osd_init(struct ceph_osd *osd)
 {
-       struct ceph_osd_request *req, *nreq;
-       LIST_HEAD(resend);
-       LIST_HEAD(resend_linger);
-       int err;
-
-       dout("%s osd%d\n", __func__, osd->o_osd);
-       err = __reset_osd(osdc, osd);
-       if (err)
-               return;
-
-       /*
-        * Build up a list of requests to resend by traversing the
-        * osd's list of requests.  Requests for a given object are
-        * sent in tid order, and that is also the order they're
-        * kept on this list.  Therefore all requests that are in
-        * flight will be found first, followed by all requests that
-        * have not yet been sent.  And to resend requests while
-        * preserving this order we will want to put any sent
-        * requests back on the front of the osd client's unsent
-        * list.
-        *
-        * So we build a separate ordered list of already-sent
-        * requests for the affected osd and splice it onto the
-        * front of the osd client's unsent list.  Once we've seen a
-        * request that has not yet been sent we're done.  Those
-        * requests are already sitting right where they belong.
-        */
-       list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-               if (!req->r_sent)
-                       break;
-
-               if (!req->r_linger) {
-                       dout("%s requeueing %p tid %llu\n", __func__, req,
-                            req->r_tid);
-                       list_move_tail(&req->r_req_lru_item, &resend);
-                       req->r_flags |= CEPH_OSD_FLAG_RETRY;
-               } else {
-                       list_move_tail(&req->r_req_lru_item, &resend_linger);
-               }
-       }
-       list_splice(&resend, &osdc->req_unsent);
-
-       /*
-        * Both registered and not yet registered linger requests are
-        * enqueued with a new tid on the same OSD.  We add/move them
-        * to req_unsent/o_requests at the end to keep things in tid
-        * order.
-        */
-       list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-                                r_linger_osd_item) {
-               WARN_ON(!list_empty(&req->r_req_lru_item));
-               __kick_linger_request(req);
-       }
-
-       list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
-               __kick_linger_request(req);
+       atomic_set(&osd->o_ref, 1);
+       RB_CLEAR_NODE(&osd->o_node);
+       osd->o_requests = RB_ROOT;
+       osd->o_linger_requests = RB_ROOT;
+       INIT_LIST_HEAD(&osd->o_osd_lru);
+       INIT_LIST_HEAD(&osd->o_keepalive_item);
+       osd->o_incarnation = 1;
+       mutex_init(&osd->lock);
 }
 
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static void osd_cleanup(struct ceph_osd *osd)
 {
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
-
-       if (!osd)
-               return;
-       dout("osd_reset osd%d\n", osd->o_osd);
-       osdc = osd->o_osdc;
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
-       __kick_osd_requests(osdc, osd);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
+       WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+       WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+       WARN_ON(!list_empty(&osd->o_osd_lru));
+       WARN_ON(!list_empty(&osd->o_keepalive_item));
+
+       if (osd->o_auth.authorizer) {
+               WARN_ON(osd_homeless(osd));
+               ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+       }
 }
 
 /*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 {
        struct ceph_osd *osd;
 
-       osd = kzalloc(sizeof(*osd), GFP_NOFS);
-       if (!osd)
-               return NULL;
+       WARN_ON(onum == CEPH_HOMELESS_OSD);
 
-       atomic_set(&osd->o_ref, 1);
+       osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
+       osd_init(osd);
        osd->o_osdc = osdc;
        osd->o_osd = onum;
-       RB_CLEAR_NODE(&osd->o_node);
-       INIT_LIST_HEAD(&osd->o_requests);
-       INIT_LIST_HEAD(&osd->o_linger_requests);
-       INIT_LIST_HEAD(&osd->o_osd_lru);
-       osd->o_incarnation = 1;
 
        ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
-       INIT_LIST_HEAD(&osd->o_keepalive_item);
        return osd;
 }
 
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
        dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
             atomic_read(&osd->o_ref) - 1);
        if (atomic_dec_and_test(&osd->o_ref)) {
-               if (osd->o_auth.authorizer)
-                       ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
+               osd_cleanup(osd);
                kfree(osd);
        }
 }
 
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-       WARN_ON(!list_empty(&osd->o_requests));
-       WARN_ON(!list_empty(&osd->o_linger_requests));
-
-       list_del_init(&osd->o_osd_lru);
-       rb_erase(&osd->o_node, &osdc->osds);
-       RB_CLEAR_NODE(&osd->o_node);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
-       if (!RB_EMPTY_NODE(&osd->o_node)) {
-               ceph_con_close(&osd->o_con);
-               __remove_osd(osdc, osd);
-               put_osd(osd);
-       }
-}
+DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
 
-static void remove_all_osds(struct ceph_osd_client *osdc)
+static void __move_osd_to_lru(struct ceph_osd *osd)
 {
-       dout("%s %p\n", __func__, osdc);
-       mutex_lock(&osdc->request_mutex);
-       while (!RB_EMPTY_ROOT(&osdc->osds)) {
-               struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
-                                               struct ceph_osd, o_node);
-               remove_osd(osdc, osd);
-       }
-       mutex_unlock(&osdc->request_mutex);
-}
+       struct ceph_osd_client *osdc = osd->o_osdc;
 
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-                             struct ceph_osd *osd)
-{
-       dout("%s %p\n", __func__, osd);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
        BUG_ON(!list_empty(&osd->o_osd_lru));
 
+       spin_lock(&osdc->osd_lru_lock);
        list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+       spin_unlock(&osdc->osd_lru_lock);
+
        osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
-static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
-                                 struct ceph_osd *osd)
+static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-       dout("%s %p\n", __func__, osd);
-
-       if (list_empty(&osd->o_requests) &&
-           list_empty(&osd->o_linger_requests))
-               __move_osd_to_lru(osdc, osd);
+       if (RB_EMPTY_ROOT(&osd->o_requests) &&
+           RB_EMPTY_ROOT(&osd->o_linger_requests))
+               __move_osd_to_lru(osd);
 }
 
 static void __remove_osd_from_lru(struct ceph_osd *osd)
 {
-       dout("__remove_osd_from_lru %p\n", osd);
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       spin_lock(&osdc->osd_lru_lock);
        if (!list_empty(&osd->o_osd_lru))
                list_del_init(&osd->o_osd_lru);
+       spin_unlock(&osdc->osd_lru_lock);
 }
 
-static void remove_old_osds(struct ceph_osd_client *osdc)
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
 {
-       struct ceph_osd *osd, *nosd;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct rb_node *n;
 
-       dout("__remove_old_osds %p\n", osdc);
-       mutex_lock(&osdc->request_mutex);
-       list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-               if (time_before(jiffies, osd->lru_ttl))
-                       break;
-               remove_osd(osdc, osd);
+       verify_osdc_wrlocked(osdc);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       ceph_con_close(&osd->o_con);
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               n = rb_next(n); /* unlink_request() */
+
+               dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+               unlink_request(osd, req);
+               link_request(&osdc->homeless_osd, req);
+       }
+       for (n = rb_first(&osd->o_linger_requests); n; ) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               n = rb_next(n); /* unlink_linger() */
+
+               dout(" reassigning lreq %p linger_id %llu\n", lreq,
+                    lreq->linger_id);
+               unlink_linger(osd, lreq);
+               link_linger(&osdc->homeless_osd, lreq);
        }
-       mutex_unlock(&osdc->request_mutex);
+
+       __remove_osd_from_lru(osd);
+       erase_osd(&osdc->osds, osd);
+       put_osd(osd);
 }
 
 /*
  * reset osd connect
  */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
 {
        struct ceph_entity_addr *peer_addr;
 
-       dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-       if (list_empty(&osd->o_requests) &&
-           list_empty(&osd->o_linger_requests)) {
-               remove_osd(osdc, osd);
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       if (RB_EMPTY_ROOT(&osd->o_requests) &&
+           RB_EMPTY_ROOT(&osd->o_linger_requests)) {
+               close_osd(osd);
                return -ENODEV;
        }
 
-       peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+       peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
        if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
                        !ceph_con_opened(&osd->o_con)) {
-               struct ceph_osd_request *req;
+               struct rb_node *n;
 
                dout("osd addr hasn't changed and connection never opened, "
                     "letting msgr retry\n");
                /* touch each r_stamp for handle_timeout()'s benfit */
-               list_for_each_entry(req, &osd->o_requests, r_osd_item)
+               for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(n, struct ceph_osd_request, r_node);
                        req->r_stamp = jiffies;
+               }
 
                return -EAGAIN;
        }
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
        return 0;
 }
 
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
+                                         bool wrlocked)
 {
-       struct rb_node **p = &osdc->osds.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd *osd = NULL;
+       struct ceph_osd *osd;
 
-       dout("__insert_osd %p osd%d\n", new, new->o_osd);
-       while (*p) {
-               parent = *p;
-               osd = rb_entry(parent, struct ceph_osd, o_node);
-               if (new->o_osd < osd->o_osd)
-                       p = &(*p)->rb_left;
-               else if (new->o_osd > osd->o_osd)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
+       if (wrlocked)
+               verify_osdc_wrlocked(osdc);
+       else
+               verify_osdc_locked(osdc);
 
-       rb_link_node(&new->o_node, parent, p);
-       rb_insert_color(&new->o_node, &osdc->osds);
+       if (o != CEPH_HOMELESS_OSD)
+               osd = lookup_osd(&osdc->osds, o);
+       else
+               osd = &osdc->homeless_osd;
+       if (!osd) {
+               if (!wrlocked)
+                       return ERR_PTR(-EAGAIN);
+
+               osd = create_osd(osdc, o);
+               insert_osd(&osdc->osds, osd);
+               ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+                             &osdc->osdmap->osd_addr[osd->o_osd]);
+       }
+
+       dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+       return osd;
 }
 
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+/*
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
+ */
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-       struct ceph_osd *osd;
-       struct rb_node *n = osdc->osds.rb_node;
-
-       while (n) {
-               osd = rb_entry(n, struct ceph_osd, o_node);
-               if (o < osd->o_osd)
-                       n = n->rb_left;
-               else if (o > osd->o_osd)
-                       n = n->rb_right;
-               else
-                       return osd;
-       }
-       return NULL;
+       verify_osd_locked(osd);
+       WARN_ON(!req->r_tid || req->r_osd);
+       dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+            req, req->r_tid);
+
+       if (!osd_homeless(osd))
+               __remove_osd_from_lru(osd);
+       else
+               atomic_inc(&osd->o_osdc->num_homeless);
+
+       get_osd(osd);
+       insert_request(&osd->o_requests, req);
+       req->r_osd = osd;
 }
 
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-       schedule_delayed_work(&osdc->timeout_work,
-                             osdc->client->options->osd_keepalive_timeout);
+       verify_osd_locked(osd);
+       WARN_ON(req->r_osd != osd);
+       dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+            req, req->r_tid);
+
+       req->r_osd = NULL;
+       erase_request(&osd->o_requests, req);
+       put_osd(osd);
+
+       if (!osd_homeless(osd))
+               maybe_move_osd_to_lru(osd);
+       else
+               atomic_dec(&osd->o_osdc->num_homeless);
 }
 
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+static bool __pool_full(struct ceph_pg_pool_info *pi)
 {
-       cancel_delayed_work(&osdc->timeout_work);
+       return pi->flags & CEPH_POOL_FLAG_FULL;
 }
 
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
-                              struct ceph_osd_request *req)
+static bool have_pool_full(struct ceph_osd_client *osdc)
 {
-       req->r_tid = ++osdc->last_tid;
-       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-       dout("__register_request %p tid %lld\n", req, req->r_tid);
-       __insert_request(osdc, req);
-       ceph_osdc_get_request(req);
-       osdc->num_requests++;
-       if (osdc->num_requests == 1) {
-               dout(" first request, scheduling timeout\n");
-               __schedule_osd_timeout(osdc);
+       struct rb_node *n;
+
+       for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pi =
+                   rb_entry(n, struct ceph_pg_pool_info, node);
+
+               if (__pool_full(pi))
+                       return true;
        }
+
+       return false;
+}
+
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+       struct ceph_pg_pool_info *pi;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+       if (!pi)
+               return false;
+
+       return __pool_full(pi);
 }
 
 /*
- * called under osdc->request_mutex
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
  */
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                struct ceph_osd_request *req)
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+                                   const struct ceph_osd_request_target *t,
+                                   struct ceph_pg_pool_info *pi)
 {
-       if (RB_EMPTY_NODE(&req->r_node)) {
-               dout("__unregister_request %p tid %lld not registered\n",
-                       req, req->r_tid);
-               return;
+       bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+       bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+                      ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+                      __pool_full(pi);
+
+       WARN_ON(pi->id != t->base_oloc.pool);
+       return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+              (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+enum calc_target_result {
+       CALC_TARGET_NO_ACTION = 0,
+       CALC_TARGET_NEED_RESEND,
+       CALC_TARGET_POOL_DNE,
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+                                          struct ceph_osd_request_target *t,
+                                          u32 *last_force_resend,
+                                          bool any_change)
+{
+       struct ceph_pg_pool_info *pi;
+       struct ceph_pg pgid, last_pgid;
+       struct ceph_osds up, acting;
+       bool force_resend = false;
+       bool need_check_tiering = false;
+       bool need_resend = false;
+       bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
+                                            CEPH_OSDMAP_SORTBITWISE);
+       enum calc_target_result ct_res;
+       int ret;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+       if (!pi) {
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
        }
 
-       dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-       rb_erase(&req->r_node, &osdc->requests);
-       RB_CLEAR_NODE(&req->r_node);
-       osdc->num_requests--;
+       if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+               if (last_force_resend &&
+                   *last_force_resend < pi->last_force_request_resend) {
+                       *last_force_resend = pi->last_force_request_resend;
+                       force_resend = true;
+               } else if (!last_force_resend) {
+                       force_resend = true;
+               }
+       }
+       if (ceph_oid_empty(&t->target_oid) || force_resend) {
+               ceph_oid_copy(&t->target_oid, &t->base_oid);
+               need_check_tiering = true;
+       }
+       if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
+               ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+               need_check_tiering = true;
+       }
 
-       if (req->r_osd) {
-               /* make sure the original request isn't in flight. */
-               ceph_msg_revoke(req->r_request);
+       if (need_check_tiering &&
+           (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+               if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+                       t->target_oloc.pool = pi->read_tier;
+               if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+                       t->target_oloc.pool = pi->write_tier;
+       }
 
-               list_del_init(&req->r_osd_item);
-               maybe_move_osd_to_lru(osdc, req->r_osd);
-               if (list_empty(&req->r_linger_osd_item))
-                       req->r_osd = NULL;
+       ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+                                       &t->target_oloc, &pgid);
+       if (ret) {
+               WARN_ON(ret != -ENOENT);
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
+       }
+       last_pgid.pool = pgid.pool;
+       last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
+
+       ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+       if (any_change &&
+           ceph_is_new_interval(&t->acting,
+                                &acting,
+                                &t->up,
+                                &up,
+                                t->size,
+                                pi->size,
+                                t->min_size,
+                                pi->min_size,
+                                t->pg_num,
+                                pi->pg_num,
+                                t->sort_bitwise,
+                                sort_bitwise,
+                                &last_pgid))
+               force_resend = true;
+
+       if (t->paused && !target_should_be_paused(osdc, t, pi)) {
+               t->paused = false;
+               need_resend = true;
        }
 
-       list_del_init(&req->r_req_lru_item);
-       ceph_osdc_put_request(req);
+       if (ceph_pg_compare(&t->pgid, &pgid) ||
+           ceph_osds_changed(&t->acting, &acting, any_change) ||
+           force_resend) {
+               t->pgid = pgid; /* struct */
+               ceph_osds_copy(&t->acting, &acting);
+               ceph_osds_copy(&t->up, &up);
+               t->size = pi->size;
+               t->min_size = pi->min_size;
+               t->pg_num = pi->pg_num;
+               t->pg_num_mask = pi->pg_num_mask;
+               t->sort_bitwise = sort_bitwise;
+
+               t->osd = acting.primary;
+               need_resend = true;
+       }
+
+       ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+       dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+       return ct_res;
+}
+
+/*
+ * Attach the data items of each op in @req to the outgoing message @msg
+ * and/or to the reply message req->r_reply, depending on the opcode.
+ *
+ * Returns early if @msg already has something on its data list.  The
+ * indata_len of all ops is summed up and checked (WARN_ON) against
+ * msg->data_length at the end.
+ */
+static void setup_request_data(struct ceph_osd_request *req,
+                              struct ceph_msg *msg)
+{
+       u32 data_len = 0;
+       int i;
+
+       /* msg->data is already populated -- presumably by an earlier send */
+       if (!list_empty(&msg->data))
+               return;
+
+       WARN_ON(msg->data_length);
+       for (i = 0; i < req->r_num_ops; i++) {
+               struct ceph_osd_req_op *op = &req->r_ops[i];
+
+               switch (op->op) {
+               /* request */
+               case CEPH_OSD_OP_WRITE:
+               case CEPH_OSD_OP_WRITEFULL:
+                       WARN_ON(op->indata_len != op->extent.length);
+                       ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+                       break;
+               case CEPH_OSD_OP_SETXATTR:
+               case CEPH_OSD_OP_CMPXATTR:
+                       WARN_ON(op->indata_len != op->xattr.name_len +
+                                                 op->xattr.value_len);
+                       ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+                       break;
+               case CEPH_OSD_OP_NOTIFY_ACK:
+                       ceph_osdc_msg_data_add(msg,
+                                              &op->notify_ack.request_data);
+                       break;
+
+               /* reply */
+               case CEPH_OSD_OP_STAT:
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->raw_data_in);
+                       break;
+               case CEPH_OSD_OP_READ:
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->extent.osd_data);
+                       break;
 
-       if (osdc->num_requests == 0) {
-               dout(" no requests, canceling timeout\n");
-               __cancel_osd_timeout(osdc);
+               /* both */
+               case CEPH_OSD_OP_CALL:
+                       WARN_ON(op->indata_len != op->cls.class_len +
+                                                 op->cls.method_len +
+                                                 op->cls.indata_len);
+                       ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+                       /* optional, can be NONE */
+                       ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+                       /* optional, can be NONE */
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->cls.response_data);
+                       break;
+               case CEPH_OSD_OP_NOTIFY:
+                       ceph_osdc_msg_data_add(msg,
+                                              &op->notify.request_data);
+                       ceph_osdc_msg_data_add(req->r_reply,
+                                              &op->notify.response_data);
+                       break;
+               }
+
+               /* counted for every op, including those not listed above */
+               data_len += op->indata_len;
         }
+
+       WARN_ON(data_len != msg->data_length);
+}
+
+/*
+ * Encode @req into the front of @msg and fill in the message header
+ * (version, front_len, data_len, data_off).
+ *
+ * setup_request_data() is called first to attach the data items.
+ * BUG()s if the encoded front runs past msg->front_alloc_len.
+ */
+static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front_alloc_len;
+       u32 data_len = 0;
+       int i;
+
+       if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
+               /* snapshots aren't writeable */
+               WARN_ON(req->r_snapid != CEPH_NOSNAP);
+       } else {
+               /* mtime, data offset and snap context are write-only fields */
+               WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
+                       req->r_data_offset || req->r_snapc);
+       }
+
+       setup_request_data(req, msg);
+
+       ceph_encode_32(&p, 1); /* client_inc, always 1 */
+       ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
+       ceph_encode_32(&p, req->r_flags);
+       /* takes p by value, so p is advanced by hand */
+       ceph_encode_timespec(p, &req->r_mtime);
+       p += sizeof(struct ceph_timespec);
+       /* aka reassert_version */
+       memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
+       p += sizeof(req->r_replay_version);
+
+       /* oloc */
+       ceph_encode_8(&p, 4);
+       ceph_encode_8(&p, 4);
+       ceph_encode_32(&p, 8 + 4 + 4);
+       ceph_encode_64(&p, req->r_t.target_oloc.pool);
+       ceph_encode_32(&p, -1); /* preferred */
+       ceph_encode_32(&p, 0); /* key len */
+
+       /* pgid */
+       ceph_encode_8(&p, 1);
+       ceph_encode_64(&p, req->r_t.pgid.pool);
+       ceph_encode_32(&p, req->r_t.pgid.seed);
+       ceph_encode_32(&p, -1); /* preferred */
+
+       /* oid */
+       ceph_encode_32(&p, req->r_t.target_oid.name_len);
+       memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+       p += req->r_t.target_oid.name_len;
+
+       /* ops, can imply data */
+       ceph_encode_16(&p, req->r_num_ops);
+       for (i = 0; i < req->r_num_ops; i++) {
+               /* the return values add up to the header's data_len */
+               data_len += osd_req_encode_op(p, &req->r_ops[i]);
+               p += sizeof(struct ceph_osd_op);
+       }
+
+       ceph_encode_64(&p, req->r_snapid); /* snapid */
+       if (req->r_snapc) {
+               ceph_encode_64(&p, req->r_snapc->seq);
+               ceph_encode_32(&p, req->r_snapc->num_snaps);
+               for (i = 0; i < req->r_snapc->num_snaps; i++)
+                       ceph_encode_64(&p, req->r_snapc->snaps[i]);
+       } else {
+               ceph_encode_64(&p, 0); /* snap_seq */
+               ceph_encode_32(&p, 0); /* snaps len */
+       }
+
+       ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+
+       /* checked only after the fact -- an overrun is fatal */
+       BUG_ON(p > end);
+       msg->front.iov_len = p - msg->front.iov_base;
+       msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+       msg->hdr.data_len = cpu_to_le32(data_len);
+       /*
+        * The header "data_off" is a hint to the receiver allowing it
+        * to align received data into its buffers such that there's no
+        * need to re-copy it before writing it to disk (direct I/O).
+        */
+       msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
+
+       dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
+            req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
+            req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
 }
 
 /*
- * Cancel a previously queued request message
+ * Encode @req and queue its request message on the connection of the
+ * OSD session it is linked to (req->r_osd).  The session is checked
+ * with verify_osd_locked().
+ *
+ * @req has to be assigned a tid and registered.
  */
-static void __cancel_request(struct ceph_osd_request *req)
+static void send_request(struct ceph_osd_request *req)
 {
-       if (req->r_sent && req->r_osd) {
+       struct ceph_osd *osd = req->r_osd;
+
+       verify_osd_locked(osd);
+       WARN_ON(osd->o_osd != req->r_t.osd);
+
+       /*
+        * We may have a previously queued request message hanging
+        * around.  Cancel it to avoid corrupting the msgr.
+        */
+       if (req->r_sent)
                 ceph_msg_revoke(req->r_request);
-               req->r_sent = 0;
+
+       req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+       /* r_attempts is bumped below, so it is non-zero on every resend */
+       if (req->r_attempts)
+               req->r_flags |= CEPH_OSD_FLAG_RETRY;
+       else
+               WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
+
+       encode_request(req, req->r_request);
+
+       dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+            __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
+            req->r_t.osd, req->r_flags, req->r_attempts);
+
+       req->r_t.paused = false;
+       req->r_stamp = jiffies;
+       req->r_attempts++;
+
+       /* record the session incarnation the message is sent on */
+       req->r_sent = osd->o_incarnation;
+       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+       ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
+}
+
+/*
+ * Tell the monitor client that we want the osdmap epoch following the
+ * one we currently have.
+ *
+ * The request is flagged continuous while any of the FULL, PAUSERD or
+ * PAUSEWR osdmap flags is set and onetime otherwise.  Subscriptions are
+ * renewed only if ceph_monc_want_map() returns true.
+ */
+static void maybe_request_map(struct ceph_osd_client *osdc)
+{
+       bool continuous = false;
+
+       verify_osdc_locked(osdc);
+       WARN_ON(!osdc->osdmap->epoch);
+
+       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+           ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
+           ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
+               dout("%s osdc %p continuous\n", __func__, osdc);
+               continuous = true;
+       } else {
+               dout("%s osdc %p onetime\n", __func__, osdc);
         }
+
+       if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+                              osdc->osdmap->epoch + 1, continuous))
+               ceph_monc_renew_subs(&osdc->client->monc);
 }
 
-static void __register_linger_request(struct ceph_osd_client *osdc,
-                                   struct ceph_osd_request *req)
+static void send_map_check(struct ceph_osd_request *req);
+
+/*
+ * Map @req to an OSD, assign it a tid, link it to the OSD session and
+ * send it -- unless it has to be paused or the OSD is homeless, in
+ * which case a new osdmap is requested instead.
+ *
+ * @wrlocked says whether osdc->lock is held for write on entry; if not,
+ * it is expected to be held for read.  If the pool does not exist or
+ * lookup_create_osd() fails while only read-locked, the read lock is
+ * dropped, the write lock is taken and everything is redone from
+ * calc_target().  In that case the lock is downgraded before returning.
+ */
+static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 {
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-       WARN_ON(!req->r_linger);
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd *osd;
+       enum calc_target_result ct_res;
+       bool need_send = false;
+       bool promoted = false;
+
+       WARN_ON(req->r_tid || req->r_got_reply);
+       dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
+
+again:
+       ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+       /* send_map_check() below asserts the write lock */
+       if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
+               goto promote;
+
+       osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
+       if (IS_ERR(osd)) {
+               /* the only expected failure is -EAGAIN while read-locked */
+               WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
+               goto promote;
+       }
 
+       if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+           ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
+               dout("req %p pausewr\n", req);
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+                  ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
+               dout("req %p pauserd\n", req);
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+                  !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
+                                    CEPH_OSD_FLAG_FULL_FORCE)) &&
+                  (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+                   pool_full(osdc, req->r_t.base_oloc.pool))) {
+               dout("req %p full/pool_full\n", req);
+               pr_warn_ratelimited("FULL or reached pool quota\n");
+               req->r_t.paused = true;
+               maybe_request_map(osdc);
+       } else if (!osd_homeless(osd)) {
+               need_send = true;
+       } else {
+               /* homeless: nothing to send to until a new map arrives */
+               maybe_request_map(osdc);
+       }
+
+       /* paused and homeless requests are linked too, just not sent */
+       mutex_lock(&osd->lock);
+       /*
+        * Assign the tid atomically with send_request() to protect
+        * multiple writes to the same object from racing with each
+        * other, resulting in out of order ops on the OSDs.
+        */
+       req->r_tid = atomic64_inc_return(&osdc->last_tid);
+       link_request(osd, req);
+       if (need_send)
+               send_request(req);
+       mutex_unlock(&osd->lock);
+
+       if (ct_res == CALC_TARGET_POOL_DNE)
+               send_map_check(req);
+
+       /* give the caller back the lock mode it came in with */
+       if (promoted)
+               downgrade_write(&osdc->lock);
+       return;
+
+promote:
+       /* the lock is dropped in between, so start over from calc_target() */
+       up_read(&osdc->lock);
+       down_write(&osdc->lock);
+       wrlocked = true;
+       promoted = true;
+       goto again;
+}
+
+/*
+ * Sanity check the ACK/ONDISK flags of @req against its READ/WRITE flag
+ * and bump osdc->num_requests.
+ *
+ * Reads must come in with neither flag set and get CEPH_OSD_FLAG_ACK
+ * here; writes must have at least one of them; a request that is
+ * neither a read nor a write triggers a warning.
+ */
+static void account_request(struct ceph_osd_request *req)
+{
+       unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+
+       if (req->r_flags & CEPH_OSD_FLAG_READ) {
+               WARN_ON(req->r_flags & mask);
+               req->r_flags |= CEPH_OSD_FLAG_ACK;
+       } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+               WARN_ON(!(req->r_flags & mask));
+       else
+               WARN_ON(1);
+
+       /* an unsafe callback needs both ACK and ONDISK to be requested */
+       WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+       atomic_inc(&req->r_osdc->num_requests);
+}
+
+/*
+ * Take a reference on @req, account it and hand it over to
+ * __submit_request().  @wrlocked is passed through unchanged.
+ */
+static void submit_request(struct ceph_osd_request *req, bool wrlocked)
+{
         ceph_osdc_get_request(req);
-       list_add_tail(&req->r_linger_item, &osdc->req_linger);
-       if (req->r_osd)
-               list_add_tail(&req->r_linger_osd_item,
-                             &req->r_osd->o_linger_requests);
+       account_request(req);
+       __submit_request(req, wrlocked);
 }
 
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-                                       struct ceph_osd_request *req)
+/*
+ * Unlink @req from its OSD session, take it out of the num_requests
+ * count and revoke both its request and its reply message.
+ *
+ * Warns if @req is still in osdc->map_checks.  Doesn't drop a reference
+ * on @req -- finish_request() and complete_request() do that.
+ */
+static void __finish_request(struct ceph_osd_request *req)
 {
-       WARN_ON(!req->r_linger);
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd *osd = req->r_osd;
 
-       if (list_empty(&req->r_linger_item)) {
-               dout("%s %p tid %llu not registered\n", __func__, req,
-                    req->r_tid);
+       verify_osd_locked(osd);
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+       WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
+       unlink_request(osd, req);
+       atomic_dec(&osdc->num_requests);
+
+       /*
+        * If an OSD has failed or returned and a request has been sent
+        * twice, it's possible to get a reply and end up here while the
+        * request message is queued for delivery.  We will ignore the
+        * reply, so not a big deal, but better to try and catch it.
+        */
+       ceph_msg_revoke(req->r_request);
+       ceph_msg_revoke_incoming(req->r_reply);
+}
+
+/*
+ * __finish_request() followed by dropping a reference on @req.
+ */
+static void finish_request(struct ceph_osd_request *req)
+{
+       __finish_request(req);
+       ceph_osdc_put_request(req);
+}
+
+/*
+ * Invoke the callback of @req if one is set, otherwise wake up
+ * everybody waiting on req->r_completion.
+ */
+static void __complete_request(struct ceph_osd_request *req)
+{
+       if (req->r_callback)
+               req->r_callback(req);
+       else
+               complete_all(&req->r_completion);
+}
+
+/*
+ * Finish @req with @err as its result: unlink it from its OSD session,
+ * run the callback (or complete r_completion), complete
+ * r_safe_completion and drop a reference.
+ *
+ * Note that this is open-coded in handle_reply(), which has to deal
+ * with ack vs commit, dup acks, etc.
+ */
+static void complete_request(struct ceph_osd_request *req, int err)
+{
+       dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
+
+       /* set the result before the callback gets to look at it */
+       req->r_result = err;
+       __finish_request(req);
+       __complete_request(req);
+       complete_all(&req->r_safe_completion);
+       ceph_osdc_put_request(req);
+}
+
+/*
+ * If @req is in osdc->map_checks (looked up by tid), remove it from
+ * there and drop the reference send_map_check() took when inserting
+ * it.  No-op otherwise.
+ */
+static void cancel_map_check(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_request *lookup_req;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+       if (!lookup_req)
                 return;
+
+       WARN_ON(lookup_req != req);
+       erase_request_mc(&osdc->map_checks, req);
+       ceph_osdc_put_request(req);
+}
+
+/*
+ * Cancel the pending map check for @req, if any, then finish @req and
+ * drop a reference on it.  No callback is run and no completion is
+ * signalled.
+ */
+static void cancel_request(struct ceph_osd_request *req)
+{
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+       cancel_map_check(req);
+       finish_request(req);
+}
+
+/*
+ * Pool-does-not-exist handling for @req.
+ *
+ * If @req has been sent before, r_map_dne_bound is set to the current
+ * osdmap epoch.  Then, if r_map_dne_bound is set and the current epoch
+ * has reached it, @req is completed with -ENOENT; if it is set but not
+ * reached yet, nothing happens.  If it isn't set, a map check is sent
+ * to establish it.
+ */
+static void check_pool_dne(struct ceph_osd_request *req)
+{
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
+
+       verify_osdc_wrlocked(osdc);
+       WARN_ON(!map->epoch);
+
+       if (req->r_attempts) {
+               /*
+                * We sent a request earlier, which means that
+                * previously the pool existed, and now it does not
+                * (i.e., it was deleted).
+                */
+               req->r_map_dne_bound = map->epoch;
+               dout("%s req %p tid %llu pool disappeared\n", __func__, req,
+                    req->r_tid);
+       } else {
+               dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
+                    req, req->r_tid, req->r_map_dne_bound, map->epoch);
         }
 
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
-       list_del_init(&req->r_linger_item);
+       if (req->r_map_dne_bound) {
+               if (map->epoch >= req->r_map_dne_bound) {
+                       /* we had a new enough map */
+                       pr_info_ratelimited("tid %llu pool does not exist\n",
+                                           req->r_tid);
+                       complete_request(req, -ENOENT);
+               }
+       } else {
+               send_map_check(req);
+       }
+}
+
+/*
+ * Callback passed to ceph_monc_get_version_async() by send_map_check().
+ * greq->private_data carries the tid of the request the check was sent
+ * for, greq->u.newest the newest osdmap epoch.
+ *
+ * If the request is still in osdc->map_checks, its r_map_dne_bound is
+ * set to the newest epoch (unless already set), it is removed from
+ * map_checks and check_pool_dne() is re-run on it.  Takes osdc->lock
+ * for write.
+ */
+static void map_check_cb(struct ceph_mon_generic_request *greq)
+{
+       struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+       struct ceph_osd_request *req;
+       u64 tid = greq->private_data;
+
+       WARN_ON(greq->result || !greq->u.newest);
 
-       if (req->r_osd) {
-               list_del_init(&req->r_linger_osd_item);
-               maybe_move_osd_to_lru(osdc, req->r_osd);
-               if (list_empty(&req->r_osd_item))
-                       req->r_osd = NULL;
+       down_write(&osdc->lock);
+       req = lookup_request_mc(&osdc->map_checks, tid);
+       if (!req) {
+               /* the map check is no longer pending, e.g. cancel_map_check() */
+               dout("%s tid %llu dne\n", __func__, tid);
+               goto out_unlock;
         }
+
+       dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
+            req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
+       if (!req->r_map_dne_bound)
+               req->r_map_dne_bound = greq->u.newest;
+       erase_request_mc(&osdc->map_checks, req);
+       check_pool_dne(req);
+
+       /* drop the ref send_map_check() took for this map check */
         ceph_osdc_put_request(req);
+out_unlock:
+       up_write(&osdc->lock);
 }
 
-void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
-                                 struct ceph_osd_request *req)
+/*
+ * Ask the monitor client for the newest "osdmap" version on behalf of
+ * @req.
+ *
+ * No-op if @req is already in osdc->map_checks.  Otherwise a reference
+ * is taken on @req, it is inserted into map_checks and an async version
+ * request is issued with map_check_cb() as the callback and the tid of
+ * @req as private data.  A failure to issue it only triggers a warning.
+ */
+static void send_map_check(struct ceph_osd_request *req)
 {
-       if (!req->r_linger) {
-               dout("set_request_linger %p\n", req);
-               req->r_linger = 1;
+       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_request *lookup_req;
+       int ret;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
+       if (lookup_req) {
+               WARN_ON(lookup_req != req);
+               return;
         }
+
+       ceph_osdc_get_request(req);
+       insert_request_mc(&osdc->map_checks, req);
+       ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+                                         map_check_cb, req->r_tid);
+       WARN_ON(ret);
 }
-EXPORT_SYMBOL(ceph_osdc_set_request_linger);
 
 /*
- * Returns whether a request should be blocked from being sent
- * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
+ * lingering requests, watch/notify v2 infrastructure
  */
-static bool __req_should_be_paused(struct ceph_osd_client *osdc,
-                                  struct ceph_osd_request *req)
+/*
+ * kref release function for linger requests, see linger_put().
+ *
+ * Warns if @lreq is still on any of its trees or lists or is still
+ * attached to an OSD session, then drops the references on reg_req and
+ * ping_req (if set), destroys the target and frees @lreq.
+ */
+static void linger_release(struct kref *kref)
 {
-       bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
-       bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
-       return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
-               (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+       struct ceph_osd_linger_request *lreq =
+           container_of(kref, struct ceph_osd_linger_request, kref);
+
+       dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
+            lreq->reg_req, lreq->ping_req);
+       WARN_ON(!RB_EMPTY_NODE(&lreq->node));
+       WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
+       WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
+       WARN_ON(!list_empty(&lreq->scan_item));
+       WARN_ON(!list_empty(&lreq->pending_lworks));
+       WARN_ON(lreq->osd);
+
+       if (lreq->reg_req)
+               ceph_osdc_put_request(lreq->reg_req);
+       if (lreq->ping_req)
+               ceph_osdc_put_request(lreq->ping_req);
+       target_destroy(&lreq->t);
+       kfree(lreq);
 }
 
+/*
+ * Drop a reference on @lreq; linger_release() runs when the last one
+ * goes away.  A NULL @lreq is fine.
+ */
+static void linger_put(struct ceph_osd_linger_request *lreq)
+{
+       if (lreq)
+               kref_put(&lreq->kref, linger_release);
+}
+
+/*
+ * Take a reference on @lreq.  Returns @lreq for convenience.  Unlike
+ * linger_put(), @lreq must not be NULL.
+ */
+static struct ceph_osd_linger_request *
+linger_get(struct ceph_osd_linger_request *lreq)
+{
+       kref_get(&lreq->kref);
+       return lreq;
+}
+
+/*
+ * Allocate (GFP_NOIO, zeroed) and initialize a linger request for
+ * @osdc.
+ *
+ * Returns the new linger request with the initial kref_init()
+ * reference, or NULL if the allocation failed.  It isn't registered or
+ * linked anywhere yet.
+ */
+static struct ceph_osd_linger_request *
+linger_alloc(struct ceph_osd_client *osdc)
+{
+       struct ceph_osd_linger_request *lreq;
+
+       lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
+       if (!lreq)
+               return NULL;
+
+       kref_init(&lreq->kref);
+       mutex_init(&lreq->lock);
+       /* linger_release() checks that all of these are empty again */
+       RB_CLEAR_NODE(&lreq->node);
+       RB_CLEAR_NODE(&lreq->osdc_node);
+       RB_CLEAR_NODE(&lreq->mc_node);
+       INIT_LIST_HEAD(&lreq->scan_item);
+       INIT_LIST_HEAD(&lreq->pending_lworks);
+       init_completion(&lreq->reg_commit_wait);
+       init_completion(&lreq->notify_finish_wait);
+
+       lreq->osdc = osdc;
+       target_init(&lreq->t);
+
+       dout("%s lreq %p\n", __func__, lreq);
+       return lreq;
+}
+
+DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
+DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
+DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
+
 /*
- * Calculate mapping of a request to a PG.  Takes tiering into account.
+ * Create linger request <-> OSD session relation.
+ *
+ * @lreq has to be registered, @osd may be homeless.
  */
-static int __calc_request_pg(struct ceph_osdmap *osdmap,
-                            struct ceph_osd_request *req,
-                            struct ceph_pg *pg_out)
+static void link_linger(struct ceph_osd *osd,
+                       struct ceph_osd_linger_request *lreq)
 {
-       bool need_check_tiering;
+       verify_osd_locked(osd);
+       /* must have a linger_id and must not be linked to a session yet */
+       WARN_ON(!lreq->linger_id || lreq->osd);
+       dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+            osd->o_osd, lreq, lreq->linger_id);
 
-       need_check_tiering = false;
-       if (req->r_target_oloc.pool == -1) {
-               req->r_target_oloc = req->r_base_oloc; /* struct */
-               need_check_tiering = true;
+       /* a real OSD comes off the LRU, a homeless one is only counted */
+       if (!osd_homeless(osd))
+               __remove_osd_from_lru(osd);
+       else
+               atomic_inc(&osd->o_osdc->num_homeless);
+
+       /* paired with put_osd() in unlink_linger() */
+       get_osd(osd);
+       insert_linger(&osd->o_linger_requests, lreq);
+       lreq->osd = osd;
+}
+
+/*
+ * Undo link_linger(): break the linger request <-> OSD session
+ * relation.
+ *
+ * @lreq has to be linked to @osd.
+ */
+static void unlink_linger(struct ceph_osd *osd,
+                         struct ceph_osd_linger_request *lreq)
+{
+       verify_osd_locked(osd);
+       WARN_ON(lreq->osd != osd);
+       dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
+            osd->o_osd, lreq, lreq->linger_id);
+
+       lreq->osd = NULL;
+       erase_linger(&osd->o_linger_requests, lreq);
+       /* paired with get_osd() in link_linger() */
+       put_osd(osd);
+
+       if (!osd_homeless(osd))
+               maybe_move_osd_to_lru(osd);
+       else
+               atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+/*
+ * Is @lreq on a tree via its osdc_node, i.e. has linger_register()
+ * been called on it without a subsequent linger_unregister()?
+ *
+ * osdc->lock is checked with verify_osdc_locked().
+ */
+static bool __linger_registered(struct ceph_osd_linger_request *lreq)
+{
+       verify_osdc_locked(lreq->osdc);
+
+       return !RB_EMPTY_NODE(&lreq->osdc_node);
+}
+
+/*
+ * Same as __linger_registered(), but takes osdc->lock for read around
+ * the check.  The answer may be stale by the time it is returned.
+ */
+static bool linger_registered(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       bool registered;
+
+       down_read(&osdc->lock);
+       registered = __linger_registered(lreq);
+       up_read(&osdc->lock);
+
+       return registered;
+}
+
+/*
+ * Assign @lreq the next linger_id and insert it into
+ * osdc->linger_requests.  A reference is taken for the registration;
+ * linger_unregister() drops it.
+ */
+static void linger_register(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_osdc_wrlocked(osdc);
+       /* must not have been registered before */
+       WARN_ON(lreq->linger_id);
+
+       linger_get(lreq);
+       /* pre-increment: the first id handed out is last_linger_id + 1 */
+       lreq->linger_id = ++osdc->last_linger_id;
+       insert_linger_osdc(&osdc->linger_requests, lreq);
+}
+
+/*
+ * Remove @lreq from osdc->linger_requests and drop the reference taken
+ * in linger_register().  linger_id is left as is.
+ */
+static void linger_unregister(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_osdc_wrlocked(osdc);
+
+       erase_linger_osdc(&osdc->linger_requests, lreq);
+       linger_put(lreq);
+}
+
+/*
+ * Cancel @req, which is expected to be a linger request (r_linger
+ * set), and drop the linger request reference stashed in req->r_priv.
+ */
+static void cancel_linger_request(struct ceph_osd_request *req)
+{
+       /* fetched up front: cancel_request() drops a reference on @req */
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       WARN_ON(!req->r_linger);
+       cancel_request(req);
+       linger_put(lreq);
+}
+
+/*
+ * A unit of deferred work for a linger request: either a notify to be
+ * delivered to the watch callback or an error to be delivered to the
+ * error callback.  Which union member is valid depends on the work
+ * function the item was allocated with (see lwork_alloc()).
+ */
+struct linger_work {
+       struct work_struct work;
+       struct ceph_osd_linger_request *lreq; /* holds a reference */
+       struct list_head pending_item; /* on lreq->pending_lworks */
+       unsigned long queued_stamp; /* jiffies at lwork_queue() time */
+
+       union {
+               struct {
+                       u64 notify_id;
+                       u64 notifier_id;
+                       void *payload; /* points into @msg front */
+                       size_t payload_len;
+
+                       struct ceph_msg *msg; /* for ceph_msg_put() */
+               } notify;
+               struct {
+                       int err;
+               } error;
+       };
+};
+
+/*
+ * Allocate (GFP_NOIO, zeroed) a linger_work for @lreq with @workfn as
+ * its work function.  The lwork holds a reference on @lreq until
+ * lwork_free().
+ *
+ * Returns NULL if the allocation failed, in which case no reference is
+ * taken.
+ */
+static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
+                                      work_func_t workfn)
+{
+       struct linger_work *lwork;
+
+       lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
+       if (!lwork)
+               return NULL;
+
+       INIT_WORK(&lwork->work, workfn);
+       INIT_LIST_HEAD(&lwork->pending_item);
+       lwork->lreq = linger_get(lreq);
+
+       return lwork;
+}
+
+/*
+ * Take @lwork off its linger request's pending list (under lreq->lock),
+ * drop the reference on the linger request and free @lwork.
+ */
+static void lwork_free(struct linger_work *lwork)
+{
+       /* saved because it is still needed for linger_put() below */
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       mutex_lock(&lreq->lock);
+       list_del(&lwork->pending_item);
+       mutex_unlock(&lreq->lock);
+
+       linger_put(lreq);
+       kfree(lwork);
+}
+
+/*
+ * Timestamp @lwork, add it to the tail of lreq->pending_lworks and
+ * queue it on osdc->notify_wq.
+ *
+ * lreq->lock is checked with verify_lreq_locked(); @lwork must not be
+ * on a pending list already.
+ */
+static void lwork_queue(struct linger_work *lwork)
+{
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       verify_lreq_locked(lreq);
+       WARN_ON(!list_empty(&lwork->pending_item));
+
+       lwork->queued_stamp = jiffies;
+       list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
+       queue_work(osdc->notify_wq, &lwork->work);
+}
+
+/*
+ * Work function for notify lworks: invoke the watch callback of the
+ * linger request with the notify details, unless the linger request is
+ * no longer registered.  Either way, the message the payload points
+ * into is put and the lwork is freed.
+ */
+static void do_watch_notify(struct work_struct *w)
+{
+       struct linger_work *lwork = container_of(w, struct linger_work, work);
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       if (!linger_registered(lreq)) {
+               dout("%s lreq %p not registered\n", __func__, lreq);
+               goto out;
         }
-       if (req->r_target_oid.name_len == 0) {
-               ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
-               need_check_tiering = true;
+
+       WARN_ON(!lreq->is_watch);
+       dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
+            __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
+            lwork->notify.payload_len);
+       /* linger_id is what gets passed as the third argument */
+       lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
+                 lwork->notify.notifier_id, lwork->notify.payload,
+                 lwork->notify.payload_len);
+
+out:
+       ceph_msg_put(lwork->notify.msg);
+       lwork_free(lwork);
+}
+
+/*
+ * Work function for error lworks: invoke the error callback of the
+ * linger request with the recorded error, unless the linger request is
+ * no longer registered.  The lwork is freed either way.
+ */
+static void do_watch_error(struct work_struct *w)
+{
+       struct linger_work *lwork = container_of(w, struct linger_work, work);
+       struct ceph_osd_linger_request *lreq = lwork->lreq;
+
+       if (!linger_registered(lreq)) {
+               dout("%s lreq %p not registered\n", __func__, lreq);
+               goto out;
         }
 
-       if (need_check_tiering &&
-           (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-               struct ceph_pg_pool_info *pi;
-
-               pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
-               if (pi) {
-                       if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
-                           pi->read_tier >= 0)
-                               req->r_target_oloc.pool = pi->read_tier;
-                       if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
-                           pi->write_tier >= 0)
-                               req->r_target_oloc.pool = pi->write_tier;
+       dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
+       lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
+
+out:
+       lwork_free(lwork);
+}
+
+/*
+ * Queue an error lwork carrying lreq->last_error, to be delivered by
+ * do_watch_error().
+ *
+ * If the lwork can't be allocated, the error is logged and the
+ * notification is dropped.
+ */
+static void queue_watch_error(struct ceph_osd_linger_request *lreq)
+{
+       struct linger_work *lwork;
+
+       lwork = lwork_alloc(lreq, do_watch_error);
+       if (!lwork) {
+               pr_err("failed to allocate error-lwork\n");
+               return;
+       }
+
+       lwork->error.err = lreq->last_error;
+       lwork_queue(lwork);
+}
+
+/*
+ * Record @result as the registration outcome and wake up everybody
+ * waiting on reg_commit_wait.
+ *
+ * Only takes effect while the completion isn't done yet, so later
+ * results are ignored.  Positive results are recorded as 0.
+ */
+static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
+                                      int result)
+{
+       if (!completion_done(&lreq->reg_commit_wait)) {
+               lreq->reg_commit_error = (result <= 0 ? result : 0);
+               complete_all(&lreq->reg_commit_wait);
+       }
+}
+
+/*
+ * Request callback installed by send_linger() for the initial
+ * registration (watch or notify).
+ *
+ * Completes reg_commit_wait with the result of @req and marks the
+ * linger request committed.  For a notify, the notify_id is picked up
+ * from the response data if the OSD returned enough of it.  Drops the
+ * linger request reference stashed in req->r_priv.
+ */
+static void linger_commit_cb(struct ceph_osd_request *req)
+{
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
+            lreq->linger_id, req->r_result);
+       WARN_ON(!__linger_registered(lreq));
+       linger_reg_commit_complete(lreq, req->r_result);
+       /* set regardless of the result */
+       lreq->committed = true;
+
+       if (!lreq->is_watch) {
+               struct ceph_osd_data *osd_data =
+                   osd_req_op_data(req, 0, notify, response_data);
+               void *p = page_address(osd_data->pages[0]);
+
+               WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
+                       osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+
+               /* make note of the notify_id */
+               if (req->r_ops[0].outdata_len >= sizeof(u64)) {
+                       lreq->notify_id = ceph_decode_64(&p);
+                       dout("lreq %p notify_id %llu\n", lreq,
+                            lreq->notify_id);
+               } else {
+                       dout("lreq %p no notify_id\n", lreq);
                 }
-               /* !pi is caught in ceph_oloc_oid_to_pg() */
         }
 
-       return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
-                                  &req->r_target_oid, pg_out);
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
 }
 
-static void __enqueue_request(struct ceph_osd_request *req)
+/*
+ * Returns @err with -ENOENT replaced by -ENOTCONN; any other value is
+ * passed through untouched.
+ */
+static int normalize_watch_error(int err)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
+       /*
+        * Translate ENOENT -> ENOTCONN so that a delete->disconnection
+        * notification and a failure to reconnect because we raced with
+        * the delete appear the same to the user.
+        */
+       if (err == -ENOENT)
+               err = -ENOTCONN;
 
-       dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
-            req->r_osd ? req->r_osd->o_osd : -1);
+       return err;
+}
 
-       if (req->r_osd) {
-               __remove_osd_from_lru(req->r_osd);
-               list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
-               list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+/*
+ * Request callback installed by send_linger() for watch reconnects.
+ *
+ * On failure, the normalized error is recorded in last_error and a
+ * watch error is queued -- but only if no error was recorded before.
+ * Drops the linger request reference stashed in req->r_priv.
+ */
+static void linger_reconnect_cb(struct ceph_osd_request *req)
+{
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
+            lreq, lreq->linger_id, req->r_result, lreq->last_error);
+       if (req->r_result < 0) {
+               /* only the first error is reported */
+               if (!lreq->last_error) {
+                       lreq->last_error = normalize_watch_error(req->r_result);
+                       queue_watch_error(lreq);
+               }
+       }
+
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
+}
+
+/*
+ * (Re)submit the registration request (lreq->reg_req) of @lreq.
+ *
+ * A committed watch is resent as a reconnect with a bumped
+ * register_gen and linger_reconnect_cb() as the callback; everything
+ * else is sent as a fresh registration with linger_commit_cb().
+ * osdc->lock has to be held for write.
+ */
+static void send_linger(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_request *req = lreq->reg_req;
+       struct ceph_osd_req_op *op = &req->r_ops[0];
+
+       verify_osdc_wrlocked(req->r_osdc);
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+
+       /* still linked to a session -- presumably from an earlier send */
+       if (req->r_osd)
+               cancel_linger_request(req);
+
+       /* start over from lreq's copy of the target and mtime */
+       request_reinit(req);
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+       req->r_flags = lreq->t.flags;
+       req->r_mtime = lreq->mtime;
+
+       mutex_lock(&lreq->lock);
+       if (lreq->is_watch && lreq->committed) {
+               WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+                       op->watch.cookie != lreq->linger_id);
+               op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+               op->watch.gen = ++lreq->register_gen;
+               dout("lreq %p reconnect register_gen %u\n", lreq,
+                    op->watch.gen);
+               req->r_callback = linger_reconnect_cb;
         } else {
-               list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+               /* a notify is being (re)sent, forget the old notify_id */
+               if (!lreq->is_watch)
+                       lreq->notify_id = 0;
+               else
+                       WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+               dout("lreq %p register\n", lreq);
+               req->r_callback = linger_commit_cb;
         }
+       mutex_unlock(&lreq->lock);
+
+       /* this ref is put by the callback or by cancel_linger_request() */
+       req->r_priv = linger_get(lreq);
+       req->r_linger = true;
+
+       /* true: osdc->lock is held for write, see above */
+       submit_request(req, true);
 }
 
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.  Move the request to the appropriate list
- * (unsent, homeless) or leave on in-flight lru.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_request(struct ceph_osd_client *osdc,
-                        struct ceph_osd_request *req, int force_resend)
+static void linger_ping_cb(struct ceph_osd_request *req)
 {
-       struct ceph_pg pgid;
-       int acting[CEPH_PG_MAX_SIZE];
-       int num, o;
-       int err;
-       bool was_paused;
-
-       dout("map_request %p tid %lld\n", req, req->r_tid);
-
-       err = __calc_request_pg(osdc->osdmap, req, &pgid);
-       if (err) {
-               list_move(&req->r_req_lru_item, &osdc->req_notarget);
-               return err;
-       }
-       req->r_pgid = pgid;
-
-       num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
-       if (num < 0)
-               num = 0;
-
-       was_paused = req->r_paused;
-       req->r_paused = __req_should_be_paused(osdc, req);
-       if (was_paused && !req->r_paused)
-               force_resend = 1;
-
-       if ((!force_resend &&
-            req->r_osd && req->r_osd->o_osd == o &&
-            req->r_sent >= req->r_osd->o_incarnation &&
-            req->r_num_pg_osds == num &&
-            memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-           (req->r_osd == NULL && o == -1) ||
-           req->r_paused)
-               return 0;  /* no change */
-
-       dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
-            req->r_tid, pgid.pool, pgid.seed, o,
-            req->r_osd ? req->r_osd->o_osd : -1);
-
-       /* record full pg acting set */
-       memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-       req->r_num_pg_osds = num;
-
-       if (req->r_osd) {
-               __cancel_request(req);
-               list_del_init(&req->r_osd_item);
-               list_del_init(&req->r_linger_osd_item);
-               req->r_osd = NULL;
-       }
-
-       req->r_osd = __lookup_osd(osdc, o);
-       if (!req->r_osd && o >= 0) {
-               err = -ENOMEM;
-               req->r_osd = create_osd(osdc, o);
-               if (!req->r_osd) {
-                       list_move(&req->r_req_lru_item, &osdc->req_notarget);
-                       goto out;
+       struct ceph_osd_linger_request *lreq = req->r_priv;
+
+       mutex_lock(&lreq->lock);
+       dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
+            __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
+            lreq->last_error);
+       if (lreq->register_gen == req->r_ops[0].watch.gen) {
+               if (!req->r_result) {
+                       lreq->watch_valid_thru = lreq->ping_sent;
+               } else if (!lreq->last_error) {
+                       lreq->last_error = normalize_watch_error(req->r_result);
+                       queue_watch_error(lreq);
                }
+       } else {
+               dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
+                    lreq->register_gen, req->r_ops[0].watch.gen);
+       }
 
-               dout("map_request osd %p is osd%d\n", req->r_osd, o);
-               __insert_osd(osdc, req->r_osd);
+       mutex_unlock(&lreq->lock);
+       linger_put(lreq);
+}
 
-               ceph_con_open(&req->r_osd->o_con,
-                             CEPH_ENTITY_TYPE_OSD, o,
-                             &osdc->osdmap->osd_addr[o]);
+static void send_linger_ping(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_request *req = lreq->ping_req;
+       struct ceph_osd_req_op *op = &req->r_ops[0];
+
+       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
+               dout("%s PAUSERD\n", __func__);
+               return;
        }
 
-       __enqueue_request(req);
-       err = 1;   /* osd or pg changed */
+       lreq->ping_sent = jiffies;
+       dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
+            __func__, lreq, lreq->linger_id, lreq->ping_sent,
+            lreq->register_gen);
 
-out:
-       return err;
+       if (req->r_osd)
+               cancel_linger_request(req);
+
+       request_reinit(req);
+       target_copy(&req->r_t, &lreq->t);
+
+       WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
+               op->watch.cookie != lreq->linger_id ||
+               op->watch.op != CEPH_OSD_WATCH_OP_PING);
+       op->watch.gen = lreq->register_gen;
+       req->r_callback = linger_ping_cb;
+       req->r_priv = linger_get(lreq);
+       req->r_linger = true;
+
+       ceph_osdc_get_request(req);
+       account_request(req);
+       req->r_tid = atomic64_inc_return(&osdc->last_tid);
+       link_request(lreq->osd, req);
+       send_request(req);
+}
+
+static void linger_submit(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd *osd;
+
+       calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+       osd = lookup_create_osd(osdc, lreq->t.osd, true);
+       link_linger(osd, lreq);
+
+       send_linger(lreq);
+}
+
+static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_linger_request *lookup_lreq;
+
+       verify_osdc_wrlocked(osdc);
+
+       lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+                                      lreq->linger_id);
+       if (!lookup_lreq)
+               return;
+
+       WARN_ON(lookup_lreq != lreq);
+       erase_linger_mc(&osdc->linger_map_checks, lreq);
+       linger_put(lreq);
 }
 
 /*
- * caller should hold map_sem (for read) and request_mutex
+ * @lreq has to be both registered and linked.
  */
-static void __send_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
+static void __linger_cancel(struct ceph_osd_linger_request *lreq)
 {
-       void *p;
+       if (lreq->is_watch && lreq->ping_req->r_osd)
+               cancel_linger_request(lreq->ping_req);
+       if (lreq->reg_req->r_osd)
+               cancel_linger_request(lreq->reg_req);
+       cancel_linger_map_check(lreq);
+       unlink_linger(lreq->osd, lreq);
+       linger_unregister(lreq);
+}
+
+static void linger_cancel(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+
+       down_write(&osdc->lock);
+       if (__linger_registered(lreq))
+               __linger_cancel(lreq);
+       up_write(&osdc->lock);
+}
+
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
+
+static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osdmap *map = osdc->osdmap;
+
+       verify_osdc_wrlocked(osdc);
+       WARN_ON(!map->epoch);
+
+       if (lreq->register_gen) {
+               lreq->map_dne_bound = map->epoch;
+               dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
+                    lreq, lreq->linger_id);
+       } else {
+               dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
+                    __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+                    map->epoch);
+       }
+
+       if (lreq->map_dne_bound) {
+               if (map->epoch >= lreq->map_dne_bound) {
+                       /* we had a new enough map */
+                       pr_info("linger_id %llu pool does not exist\n",
+                               lreq->linger_id);
+                       linger_reg_commit_complete(lreq, -ENOENT);
+                       __linger_cancel(lreq);
+               }
+       } else {
+               send_linger_map_check(lreq);
+       }
+}
+
+static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
+{
+       struct ceph_osd_client *osdc = &greq->monc->client->osdc;
+       struct ceph_osd_linger_request *lreq;
+       u64 linger_id = greq->private_data;
+
+       WARN_ON(greq->result || !greq->u.newest);
+
+       down_write(&osdc->lock);
+       lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
+       if (!lreq) {
+               dout("%s linger_id %llu dne\n", __func__, linger_id);
+               goto out_unlock;
+       }
 
-       dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
-            req, req->r_tid, req->r_osd->o_osd, req->r_flags,
-            (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+       dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
+            __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
+            greq->u.newest);
+       if (!lreq->map_dne_bound)
+               lreq->map_dne_bound = greq->u.newest;
+       erase_linger_mc(&osdc->linger_map_checks, lreq);
+       check_linger_pool_dne(lreq);
 
-       /* fill in message content that changes each time we send it */
-       put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
-       put_unaligned_le32(req->r_flags, req->r_request_flags);
-       put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
-       p = req->r_request_pgid;
-       ceph_encode_64(&p, req->r_pgid.pool);
-       ceph_encode_32(&p, req->r_pgid.seed);
-       put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */
-       memcpy(req->r_request_reassert_version, &req->r_reassert_version,
-              sizeof(req->r_reassert_version));
+       linger_put(lreq);
+out_unlock:
+       up_write(&osdc->lock);
+}
 
-       req->r_stamp = jiffies;
-       list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       struct ceph_osd_linger_request *lookup_lreq;
+       int ret;
 
-       ceph_msg_get(req->r_request); /* send consumes a ref */
+       verify_osdc_wrlocked(osdc);
 
-       req->r_sent = req->r_osd->o_incarnation;
+       lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
+                                      lreq->linger_id);
+       if (lookup_lreq) {
+               WARN_ON(lookup_lreq != lreq);
+               return;
+       }
 
-       ceph_con_send(&req->r_osd->o_con, req->r_request);
+       linger_get(lreq);
+       insert_linger_mc(&osdc->linger_map_checks, lreq);
+       ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
+                                         linger_map_check_cb, lreq->linger_id);
+       WARN_ON(ret);
 }
 
-/*
- * Send any requests in the queue (req_unsent).
- */
-static void __send_queued(struct ceph_osd_client *osdc)
+static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_request *req, *tmp;
+       int ret;
 
-       dout("__send_queued\n");
-       list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
-               __send_request(osdc, req);
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+       ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+       return ret ?: lreq->reg_commit_error;
 }
 
-/*
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                                    struct ceph_osd_request *req,
-                                    bool nofail)
-{
-       int rc;
-
-       __register_request(osdc, req);
-       req->r_sent = 0;
-       req->r_got_reply = 0;
-       rc = __map_request(osdc, req, 0);
-       if (rc < 0) {
-               if (nofail) {
-                       dout("osdc_start_request failed map, "
-                               " will retry %lld\n", req->r_tid);
-                       rc = 0;
-               } else {
-                       __unregister_request(osdc, req);
-               }
-               return rc;
-       }
-
-       if (req->r_osd == NULL) {
-               dout("send_request %p no up osds in pg\n", req);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-       } else {
-               __send_queued(osdc);
-       }
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+{
+       int ret;
 
-       return 0;
+       dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
+       ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
+       return ret ?: lreq->notify_finish_error;
 }
 
 /*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
+ * Timeout callback, called every N seconds.  When 1 or more OSD
+ * requests have been active for more than N seconds, we send a keepalive
+ * (tag + timestamp) to each such OSD to ensure any communications channel
+ * reset is detected.
  */
 static void handle_timeout(struct work_struct *work)
 {
        struct ceph_osd_client *osdc =
                container_of(work, struct ceph_osd_client, timeout_work.work);
        struct ceph_options *opts = osdc->client->options;
-       struct ceph_osd_request *req;
-       struct ceph_osd *osd;
-       struct list_head slow_osds;
-       dout("timeout\n");
-       down_read(&osdc->map_sem);
-
-       ceph_monc_request_next_osdmap(&osdc->client->monc);
+       unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+       LIST_HEAD(slow_osds);
+       struct rb_node *n, *p;
 
-       mutex_lock(&osdc->request_mutex);
+       dout("%s osdc %p\n", __func__, osdc);
+       down_write(&osdc->lock);
 
        /*
         * ping osds that are a bit slow.  this ensures that if there
         * is a break in the TCP connection we will notice, and reopen
         * a connection with that osd (from the fault callback).
         */
-       INIT_LIST_HEAD(&slow_osds);
-       list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-               if (time_before(jiffies,
-                               req->r_stamp + opts->osd_keepalive_timeout))
-                       break;
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+               bool found = false;
+
+               for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(p, struct ceph_osd_request, r_node);
+
+                       if (time_before(req->r_stamp, cutoff)) {
+                               dout(" req %p tid %llu on osd%d is laggy\n",
+                                    req, req->r_tid, osd->o_osd);
+                               found = true;
+                       }
+               }
+               for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_linger_request *lreq =
+                           rb_entry(p, struct ceph_osd_linger_request, node);
+
+                       dout(" lreq %p linger_id %llu is served by osd%d\n",
+                            lreq, lreq->linger_id, osd->o_osd);
+                       found = true;
+
+                       mutex_lock(&lreq->lock);
+                       if (lreq->is_watch && lreq->committed && !lreq->last_error)
+                               send_linger_ping(lreq);
+                       mutex_unlock(&lreq->lock);
+               }
 
-               osd = req->r_osd;
-               BUG_ON(!osd);
-               dout(" tid %llu is slow, will send keepalive on osd%d\n",
-                    req->r_tid, osd->o_osd);
-               list_move_tail(&osd->o_keepalive_item, &slow_osds);
+               if (found)
+                       list_move_tail(&osd->o_keepalive_item, &slow_osds);
        }
+
+       if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
+               maybe_request_map(osdc);
+
        while (!list_empty(&slow_osds)) {
-               osd = list_entry(slow_osds.next, struct ceph_osd,
-                                o_keepalive_item);
+               struct ceph_osd *osd = list_first_entry(&slow_osds,
+                                                       struct ceph_osd,
+                                                       o_keepalive_item);
                list_del_init(&osd->o_keepalive_item);
                ceph_con_keepalive(&osd->o_con);
        }
 
-       __schedule_osd_timeout(osdc);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       up_write(&osdc->lock);
+       schedule_delayed_work(&osdc->timeout_work,
+                             osdc->client->options->osd_keepalive_timeout);
 }
 
 static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
                container_of(work, struct ceph_osd_client,
                             osds_timeout_work.work);
        unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
+       struct ceph_osd *osd, *nosd;
+
+       dout("%s osdc %p\n", __func__, osdc);
+       down_write(&osdc->lock);
+       list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+               if (time_before(jiffies, osd->lru_ttl))
+                       break;
 
-       dout("osds timeout\n");
-       down_read(&osdc->map_sem);
-       remove_old_osds(osdc);
-       up_read(&osdc->map_sem);
+               WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
+               WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+               close_osd(osd);
+       }
 
+       up_write(&osdc->lock);
        schedule_delayed_work(&osdc->osds_timeout_work,
                              round_jiffies_relative(delay));
 }
@@ -1776,107 +2663,76 @@ e_inval:
        goto out;
 }
 
-static void complete_request(struct ceph_osd_request *req)
-{
-       complete_all(&req->r_safe_completion);  /* fsync waiter */
-}
+struct MOSDOpReply {
+       struct ceph_pg pgid;
+       u64 flags;
+       int result;
+       u32 epoch;
+       int num_ops;
+       u32 outdata_len[CEPH_OSD_MAX_OPS];
+       s32 rval[CEPH_OSD_MAX_OPS];
+       int retry_attempt;
+       struct ceph_eversion replay_version;
+       u64 user_version;
+       struct ceph_request_redirect redirect;
+};
 
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
 {
-       void *p, *end;
-       struct ceph_osd_request *req;
-       struct ceph_request_redirect redir;
-       u64 tid;
-       int object_len;
-       unsigned int numops;
-       int payload_len, flags;
-       s32 result;
-       s32 retry_attempt;
-       struct ceph_pg pg;
-       int err;
-       u32 reassert_epoch;
-       u64 reassert_version;
-       u32 osdmap_epoch;
-       int already_completed;
-       u32 bytes;
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
+       u16 version = le16_to_cpu(msg->hdr.version);
+       struct ceph_eversion bad_replay_version;
        u8 decode_redir;
-       unsigned int i;
-
-       tid = le64_to_cpu(msg->hdr.tid);
-       dout("handle_reply %p tid %llu\n", msg, tid);
+       u32 len;
+       int ret;
+       int i;
 
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       ceph_decode_32_safe(&p, end, len, e_inval);
+       ceph_decode_need(&p, end, len, e_inval);
+       p += len; /* skip oid */
 
-       ceph_decode_need(&p, end, 4, bad);
-       object_len = ceph_decode_32(&p);
-       ceph_decode_need(&p, end, object_len, bad);
-       p += object_len;
+       ret = ceph_decode_pgid(&p, end, &m->pgid);
+       if (ret)
+               return ret;
 
-       err = ceph_decode_pgid(&p, end, &pg);
-       if (err)
-               goto bad;
+       ceph_decode_64_safe(&p, end, m->flags, e_inval);
+       ceph_decode_32_safe(&p, end, m->result, e_inval);
+       ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+       memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+       p += sizeof(bad_replay_version);
+       ceph_decode_32_safe(&p, end, m->epoch, e_inval);
 
-       ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
-       flags = ceph_decode_64(&p);
-       result = ceph_decode_32(&p);
-       reassert_epoch = ceph_decode_32(&p);
-       reassert_version = ceph_decode_64(&p);
-       osdmap_epoch = ceph_decode_32(&p);
-
-       /* lookup */
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
-       if (req == NULL) {
-               dout("handle_reply tid %llu dne\n", tid);
-               goto bad_mutex;
-       }
-       ceph_osdc_get_request(req);
+       ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+       if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+               goto e_inval;
 
-       dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
-            req, result);
-
-       ceph_decode_need(&p, end, 4, bad_put);
-       numops = ceph_decode_32(&p);
-       if (numops > CEPH_OSD_MAX_OPS)
-               goto bad_put;
-       if (numops != req->r_num_ops)
-               goto bad_put;
-       payload_len = 0;
-       ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
-       for (i = 0; i < numops; i++) {
+       ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+                        e_inval);
+       for (i = 0; i < m->num_ops; i++) {
                struct ceph_osd_op *op = p;
-               int len;
 
-               len = le32_to_cpu(op->payload_len);
-               req->r_ops[i].outdata_len = len;
-               dout(" op %d has %d bytes\n", i, len);
-               payload_len += len;
+               m->outdata_len[i] = le32_to_cpu(op->payload_len);
                p += sizeof(*op);
        }
-       bytes = le32_to_cpu(msg->hdr.data_len);
-       if (payload_len != bytes) {
-               pr_warn("sum of op payload lens %d != data_len %d\n",
-                       payload_len, bytes);
-               goto bad_put;
-       }
 
-       ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
-       retry_attempt = ceph_decode_32(&p);
-       for (i = 0; i < numops; i++)
-               req->r_ops[i].rval = ceph_decode_32(&p);
+       ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+       for (i = 0; i < m->num_ops; i++)
+               ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
 
-       if (le16_to_cpu(msg->hdr.version) >= 6) {
-               p += 8 + 4; /* skip replay_version */
-               p += 8; /* skip user_version */
+       if (version >= 5) {
+               ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+               memcpy(&m->replay_version, p, sizeof(m->replay_version));
+               p += sizeof(m->replay_version);
+               ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+       } else {
+               m->replay_version = bad_replay_version; /* struct */
+               m->user_version = le64_to_cpu(m->replay_version.version);
+       }
 
-               if (le16_to_cpu(msg->hdr.version) >= 7)
-                       ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+       if (version >= 6) {
+               if (version >= 7)
+                       ceph_decode_8_safe(&p, end, decode_redir, e_inval);
                else
                        decode_redir = 1;
        } else {
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
        }
 
        if (decode_redir) {
-               err = ceph_redirect_decode(&p, end, &redir);
-               if (err)
-                       goto bad_put;
+               ret = ceph_redirect_decode(&p, end, &m->redirect);
+               if (ret)
+                       return ret;
        } else {
-               redir.oloc.pool = -1;
+               ceph_oloc_init(&m->redirect.oloc);
        }
 
-       if (redir.oloc.pool != -1) {
-               dout("redirect pool %lld\n", redir.oloc.pool);
-
-               __unregister_request(osdc, req);
-
-               req->r_target_oloc = redir.oloc; /* struct */
+       return 0;
 
-               /*
-                * Start redirect requests with nofail=true.  If
-                * mapping fails, request will end up on the notarget
-                * list, waiting for the new osdmap (which can take
-                * a while), even though the original request mapped
-                * successfully.  In the future we might want to follow
-                * original request's nofail setting here.
-                */
-               err = __ceph_osdc_start_request(osdc, req, true);
-               BUG_ON(err);
+e_inval:
+       return -EINVAL;
+}
 
-               goto out_unlock;
-       }
+/*
+ * We are done with @req if
+ *   - @m is a safe reply, or
+ *   - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+                        const struct MOSDOpReply *m)
+{
+       return (m->result < 0 ||
+               (m->flags & CEPH_OSD_FLAG_ONDISK) ||
+               !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
 
-       already_completed = req->r_got_reply;
-       if (!req->r_got_reply) {
-               req->r_result = result;
-               dout("handle_reply result %d bytes %d\n", req->r_result,
-                    bytes);
-               if (req->r_result == 0)
-                       req->r_result = bytes;
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set? yes                     no
+ *
+ * first reply is OK (needed   r_cb/r_completion,      r_cb/r_completion,
+ * any or needed/got safe)     r_safe_completion       r_safe_completion
+ *
+ * first reply is unsafe       r_unsafe_cb(true)       (nothing)
+ *
+ * when we get the safe reply  r_unsafe_cb(false),     r_cb/r_completion,
+ *                             r_safe_completion       r_safe_completion
+ */
+static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct ceph_osd_request *req;
+       struct MOSDOpReply m;
+       u64 tid = le64_to_cpu(msg->hdr.tid);
+       u32 data_len = 0;
+       bool already_acked;
+       int ret;
+       int i;
 
-               /* in case this is a write and we need to replay, */
-               req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
-               req->r_reassert_version.version = cpu_to_le64(reassert_version);
+       dout("%s msg %p tid %llu\n", __func__, msg, tid);
 
-               req->r_got_reply = 1;
-       } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-               dout("handle_reply tid %llu dup ack\n", tid);
-               goto out_unlock;
+       down_read(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown\n", __func__, osd->o_osd);
+               goto out_unlock_osdc;
        }
+       WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
 
-       dout("handle_reply tid %llu flags %d\n", tid, flags);
+       mutex_lock(&osd->lock);
+       req = lookup_request(&osd->o_requests, tid);
+       if (!req) {
+               dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
+               goto out_unlock_session;
+       }
 
-       if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
-               __register_linger_request(osdc, req);
+       ret = decode_MOSDOpReply(msg, &m);
+       if (ret) {
+               pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+                      req->r_tid, ret);
+               ceph_msg_dump(msg);
+               goto fail_request;
+       }
+       dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+            __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+            m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+            le64_to_cpu(m.replay_version.version), m.user_version);
+
+       if (m.retry_attempt >= 0) {
+               if (m.retry_attempt != req->r_attempts - 1) {
+                       dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+                            req, req->r_tid, m.retry_attempt,
+                            req->r_attempts - 1);
+                       goto out_unlock_session;
+               }
+       } else {
+               WARN_ON(1); /* MOSDOpReply v4 is assumed */
+       }
 
-       /* either this is a read, or we got the safe response */
-       if (result < 0 ||
-           (flags & CEPH_OSD_FLAG_ONDISK) ||
-           ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-               __unregister_request(osdc, req);
+       if (!ceph_oloc_empty(&m.redirect.oloc)) {
+               dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+                    m.redirect.oloc.pool);
+               unlink_request(osd, req);
+               mutex_unlock(&osd->lock);
+
+               ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+               req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+               req->r_tid = 0;
+               __submit_request(req, false);
+               goto out_unlock_osdc;
+       }
 
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       if (m.num_ops != req->r_num_ops) {
+               pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+                      req->r_num_ops, req->r_tid);
+               goto fail_request;
+       }
+       for (i = 0; i < req->r_num_ops; i++) {
+               dout(" req %p tid %llu op %d rval %d len %u\n", req,
+                    req->r_tid, i, m.rval[i], m.outdata_len[i]);
+               req->r_ops[i].rval = m.rval[i];
+               req->r_ops[i].outdata_len = m.outdata_len[i];
+               data_len += m.outdata_len[i];
+       }
+       if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+               pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+                      le32_to_cpu(msg->hdr.data_len), req->r_tid);
+               goto fail_request;
+       }
+       dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+            req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+       already_acked = req->r_got_reply;
+       if (!already_acked) {
+               req->r_result = m.result ?: data_len;
+               req->r_replay_version = m.replay_version; /* struct */
+               req->r_got_reply = true;
+       } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+               dout("req %p tid %llu dup ack\n", req, req->r_tid);
+               goto out_unlock_session;
+       }
 
-       if (!already_completed) {
-               if (req->r_unsafe_callback &&
-                   result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
-                       req->r_unsafe_callback(req, true);
-               if (req->r_callback)
-                       req->r_callback(req, msg);
-               else
-                       complete_all(&req->r_completion);
+       if (done_request(req, &m)) {
+               __finish_request(req);
+               if (req->r_linger) {
+                       WARN_ON(req->r_unsafe_callback);
+                       dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
+                       __complete_request(req);
+               }
        }
 
-       if (flags & CEPH_OSD_FLAG_ONDISK) {
-               if (req->r_unsafe_callback && already_completed)
+       mutex_unlock(&osd->lock);
+       up_read(&osdc->lock);
+
+       if (done_request(req, &m)) {
+               if (already_acked && req->r_unsafe_callback) {
+                       dout("req %p tid %llu safe-cb\n", req, req->r_tid);
                        req->r_unsafe_callback(req, false);
-               complete_request(req);
+               } else if (!req->r_linger) {
+                       dout("req %p tid %llu cb\n", req, req->r_tid);
+                       __complete_request(req);
+               }
+       } else {
+               if (req->r_unsafe_callback) {
+                       dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
+                       req->r_unsafe_callback(req, true);
+               } else {
+                       WARN_ON(1);
+               }
        }
+       if (m.flags & CEPH_OSD_FLAG_ONDISK)
+               complete_all(&req->r_safe_completion);
 
-out:
-       dout("req=%p req->r_linger=%d\n", req, req->r_linger);
        ceph_osdc_put_request(req);
        return;
-out_unlock:
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-       goto out;
 
-bad_put:
-       req->r_result = -EIO;
-       __unregister_request(osdc, req);
-       if (req->r_callback)
-               req->r_callback(req, msg);
-       else
-               complete_all(&req->r_completion);
-       complete_request(req);
-       ceph_osdc_put_request(req);
-bad_mutex:
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-bad:
-       pr_err("corrupt osd_op_reply got %d %d\n",
-              (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
-       ceph_msg_dump(msg);
+fail_request:
+       complete_request(req, -EIO);
+out_unlock_session:
+       mutex_unlock(&osd->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
 }
 
-static void reset_changed_osds(struct ceph_osd_client *osdc)
+static void set_pool_was_full(struct ceph_osd_client *osdc)
 {
-       struct rb_node *p, *n;
+       struct rb_node *n;
 
-       dout("%s %p\n", __func__, osdc);
-       for (p = rb_first(&osdc->osds); p; p = n) {
-               struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+       for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pi =
+                   rb_entry(n, struct ceph_pg_pool_info, node);
 
-               n = rb_next(p);
-               if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-                   memcmp(&osd->o_con.peer_addr,
-                          ceph_osd_addr(osdc->osdmap,
-                                        osd->o_osd),
-                          sizeof(struct ceph_entity_addr)) != 0)
-                       __reset_osd(osdc, osd);
+               pi->was_full = __pool_full(pi);
+       }
+}
+
+static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+       struct ceph_pg_pool_info *pi;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+       if (!pi)
+               return false;
+
+       return pi->was_full && !__pool_full(pi);
+}
+
+static enum calc_target_result
+recalc_linger_target(struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_osd_client *osdc = lreq->osdc;
+       enum calc_target_result ct_res;
+
+       ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+       if (ct_res == CALC_TARGET_NEED_RESEND) {
+               struct ceph_osd *osd;
+
+               osd = lookup_create_osd(osdc, lreq->t.osd, true);
+               if (osd != lreq->osd) {
+                       unlink_linger(lreq->osd, lreq);
+                       link_linger(osd, lreq);
+               }
        }
+
+       return ct_res;
 }
 
 /*
- * Requeue requests whose mapping to an OSD has changed.  If requests map to
- * no osd, request a new map.
- *
- * Caller should hold map_sem for read.
+ * Requeue requests whose mapping to an OSD has changed.
  */
-static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
-                         bool force_resend_writes)
+static void scan_requests(struct ceph_osd *osd,
+                         bool force_resend,
+                         bool cleared_full,
+                         bool check_pool_cleared_full,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
 {
-       struct ceph_osd_request *req, *nreq;
-       struct rb_node *p;
-       int needmap = 0;
-       int err;
-       bool force_resend_req;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct rb_node *n;
+       bool force_resend_writes;
+
+       for (n = rb_first(&osd->o_linger_requests); n; ) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+               enum calc_target_result ct_res;
+
+               n = rb_next(n); /* recalc_linger_target() */
+
+               dout("%s lreq %p linger_id %llu\n", __func__, lreq,
+                    lreq->linger_id);
+               ct_res = recalc_linger_target(lreq);
+               switch (ct_res) {
+               case CALC_TARGET_NO_ACTION:
+                       force_resend_writes = cleared_full ||
+                           (check_pool_cleared_full &&
+                            pool_cleared_full(osdc, lreq->t.base_oloc.pool));
+                       if (!force_resend && !force_resend_writes)
+                               break;
+
+                       /* fall through */
+               case CALC_TARGET_NEED_RESEND:
+                       cancel_linger_map_check(lreq);
+                       /*
+                        * scan_requests() for the previous epoch(s)
+                        * may have already added it to the list, since
+                        * it's not unlinked here.
+                        */
+                       if (list_empty(&lreq->scan_item))
+                               list_add_tail(&lreq->scan_item, need_resend_linger);
+                       break;
+               case CALC_TARGET_POOL_DNE:
+                       check_linger_pool_dne(lreq);
+                       break;
+               }
+       }
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+               enum calc_target_result ct_res;
+
+               n = rb_next(n); /* unlink_request(), check_pool_dne() */
+
+               dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+               ct_res = calc_target(osdc, &req->r_t,
+                                    &req->r_last_force_resend, false);
+               switch (ct_res) {
+               case CALC_TARGET_NO_ACTION:
+                       force_resend_writes = cleared_full ||
+                           (check_pool_cleared_full &&
+                            pool_cleared_full(osdc, req->r_t.base_oloc.pool));
+                       if (!force_resend &&
+                           (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
+                            !force_resend_writes))
+                               break;
+
+                       /* fall through */
+               case CALC_TARGET_NEED_RESEND:
+                       cancel_map_check(req);
+                       unlink_request(osd, req);
+                       insert_request(need_resend, req);
+                       break;
+               case CALC_TARGET_POOL_DNE:
+                       check_pool_dne(req);
+                       break;
+               }
+       }
+}
+
+static int handle_one_map(struct ceph_osd_client *osdc,
+                         void *p, void *end, bool incremental,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
+{
+       struct ceph_osdmap *newmap;
+       struct rb_node *n;
+       bool skipped_map = false;
+       bool was_full;
+
+       was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+       set_pool_was_full(osdc);
 
-       dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
-               force_resend_writes ? " (force resend writes)" : "");
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; ) {
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-               p = rb_next(p);
+       if (incremental)
+               newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+       else
+               newmap = ceph_osdmap_decode(&p, end);
+       if (IS_ERR(newmap))
+               return PTR_ERR(newmap);
 
+       if (newmap != osdc->osdmap) {
                /*
-                * For linger requests that have not yet been
-                * registered, move them to the linger list; they'll
-                * be sent to the osd in the loop below.  Unregister
-                * the request before re-registering it as a linger
-                * request to ensure the __map_request() below
-                * will decide it needs to be sent.
+                * Preserve ->was_full before destroying the old map.
+                * For pools that weren't in the old map, ->was_full
+                * should be false.
                 */
-               if (req->r_linger && list_empty(&req->r_linger_item)) {
-                       dout("%p tid %llu restart on osd%d\n",
-                            req, req->r_tid,
-                            req->r_osd ? req->r_osd->o_osd : -1);
-                       ceph_osdc_get_request(req);
-                       __unregister_request(osdc, req);
-                       __register_linger_request(osdc, req);
-                       ceph_osdc_put_request(req);
-                       continue;
+               for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
+                       struct ceph_pg_pool_info *pi =
+                           rb_entry(n, struct ceph_pg_pool_info, node);
+                       struct ceph_pg_pool_info *old_pi;
+
+                       old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
+                       if (old_pi)
+                               pi->was_full = old_pi->was_full;
+                       else
+                               WARN_ON(pi->was_full);
                }
 
-               force_resend_req = force_resend ||
-                       (force_resend_writes &&
-                               req->r_flags & CEPH_OSD_FLAG_WRITE);
-               err = __map_request(osdc, req, force_resend_req);
-               if (err < 0)
-                       continue;  /* error */
-               if (req->r_osd == NULL) {
-                       dout("%p tid %llu maps to no osd\n", req, req->r_tid);
-                       needmap++;  /* request a newer map */
-               } else if (err > 0) {
-                       if (!req->r_linger) {
-                               dout("%p tid %llu requeued on osd%d\n", req,
-                                    req->r_tid,
-                                    req->r_osd ? req->r_osd->o_osd : -1);
-                               req->r_flags |= CEPH_OSD_FLAG_RETRY;
-                       }
+               if (osdc->osdmap->epoch &&
+                   osdc->osdmap->epoch + 1 < newmap->epoch) {
+                       WARN_ON(incremental);
+                       skipped_map = true;
                }
+
+               ceph_osdmap_destroy(osdc->osdmap);
+               osdc->osdmap = newmap;
        }
 
-       list_for_each_entry_safe(req, nreq, &osdc->req_linger,
-                                r_linger_item) {
-               dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
-
-               err = __map_request(osdc, req,
-                                   force_resend || force_resend_writes);
-               dout("__map_request returned %d\n", err);
-               if (err < 0)
-                       continue;  /* hrm! */
-               if (req->r_osd == NULL || err > 0) {
-                       if (req->r_osd == NULL) {
-                               dout("lingering %p tid %llu maps to no osd\n",
-                                    req, req->r_tid);
-                               /*
-                                * A homeless lingering request makes
-                                * no sense, as it's job is to keep
-                                * a particular OSD connection open.
-                                * Request a newer map and kick the
-                                * request, knowing that it won't be
-                                * resent until we actually get a map
-                                * that can tell us where to send it.
-                                */
-                               needmap++;
-                       }
+       was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+       scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
+                     need_resend, need_resend_linger);
+
+       for (n = rb_first(&osdc->osds); n; ) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+               n = rb_next(n); /* close_osd() */
 
-                       dout("kicking lingering %p tid %llu osd%d\n", req,
-                            req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
-                       __register_request(osdc, req);
-                       __unregister_linger_request(osdc, req);
+               scan_requests(osd, skipped_map, was_full, true, need_resend,
+                             need_resend_linger);
+               if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+                   memcmp(&osd->o_con.peer_addr,
+                          ceph_osd_addr(osdc->osdmap, osd->o_osd),
+                          sizeof(struct ceph_entity_addr)))
+                       close_osd(osd);
+       }
+
+       return 0;
+}
+
+static void kick_requests(struct ceph_osd_client *osdc,
+                         struct rb_root *need_resend,
+                         struct list_head *need_resend_linger)
+{
+       struct ceph_osd_linger_request *lreq, *nlreq;
+       struct rb_node *n;
+
+       for (n = rb_first(need_resend); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+               struct ceph_osd *osd;
+
+               n = rb_next(n);
+               erase_request(need_resend, req); /* before link_request() */
+
+               WARN_ON(req->r_osd);
+               calc_target(osdc, &req->r_t, NULL, false);
+               osd = lookup_create_osd(osdc, req->r_t.osd, true);
+               link_request(osd, req);
+               if (!req->r_linger) {
+                       if (!osd_homeless(osd) && !req->r_t.paused)
+                               send_request(req);
+               } else {
+                       cancel_linger_request(req);
                }
        }
-       reset_changed_osds(osdc);
-       mutex_unlock(&osdc->request_mutex);
 
-       if (needmap) {
-               dout("%d requests for down osds, need new map\n", needmap);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
+       list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
+               if (!osd_homeless(lreq->osd))
+                       send_linger(lreq);
+
+               list_del_init(&lreq->scan_item);
        }
 }
 
-
 /*
  * Process updated osd map.
  *
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
  */
 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 {
-       void *p, *end, *next;
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
        u32 nr_maps, maplen;
        u32 epoch;
-       struct ceph_osdmap *newmap = NULL, *oldmap;
-       int err;
        struct ceph_fsid fsid;
-       bool was_full;
+       struct rb_root need_resend = RB_ROOT;
+       LIST_HEAD(need_resend_linger);
+       bool handled_incremental = false;
+       bool was_pauserd, was_pausewr;
+       bool pauserd, pausewr;
+       int err;
 
-       dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       dout("%s have %u\n", __func__, osdc->osdmap->epoch);
+       down_write(&osdc->lock);
 
        /* verify fsid */
        ceph_decode_need(&p, end, sizeof(fsid), bad);
        ceph_decode_copy(&p, &fsid, sizeof(fsid));
        if (ceph_check_fsid(osdc->client, &fsid) < 0)
-               return;
-
-       down_write(&osdc->map_sem);
+               goto bad;
 
-       was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+       was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+       was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+                     ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+                     have_pool_full(osdc);
 
        /* incremental maps */
        ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                epoch = ceph_decode_32(&p);
                maplen = ceph_decode_32(&p);
                ceph_decode_need(&p, end, maplen, bad);
-               next = p + maplen;
-               if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+               if (osdc->osdmap->epoch &&
+                   osdc->osdmap->epoch + 1 == epoch) {
                        dout("applying incremental map %u len %d\n",
                             epoch, maplen);
-                       newmap = osdmap_apply_incremental(&p, next,
-                                                         osdc->osdmap,
-                                                         &osdc->client->msgr);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
+                       err = handle_one_map(osdc, p, p + maplen, true,
+                                            &need_resend, &need_resend_linger);
+                       if (err)
                                goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       if (newmap != osdc->osdmap) {
-                               ceph_osdmap_destroy(osdc->osdmap);
-                               osdc->osdmap = newmap;
-                       }
-                       was_full = was_full ||
-                               ceph_osdmap_flag(osdc->osdmap,
-                                                CEPH_OSDMAP_FULL);
-                       kick_requests(osdc, 0, was_full);
+                       handled_incremental = true;
                } else {
                        dout("ignoring incremental map %u len %d\n",
                             epoch, maplen);
                }
-               p = next;
+               p += maplen;
                nr_maps--;
        }
-       if (newmap)
+       if (handled_incremental)
                goto done;
 
        /* full maps */
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                if (nr_maps > 1) {
                        dout("skipping non-latest full map %u len %d\n",
                             epoch, maplen);
-               } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+               } else if (osdc->osdmap->epoch >= epoch) {
                        dout("skipping full map %u len %d, "
                             "older than our %u\n", epoch, maplen,
                             osdc->osdmap->epoch);
                } else {
-                       int skipped_map = 0;
+                       dout("taking full map %u len %d\n", epoch, maplen);
+                       err = handle_one_map(osdc, p, p + maplen, false,
+                                            &need_resend, &need_resend_linger);
+                       if (err)
+                               goto bad;
+               }
+               p += maplen;
+               nr_maps--;
+       }
+
+done:
+       /*
+        * subscribe to subsequent osdmap updates if full to ensure
+        * we find out when we are no longer full and stop returning
+        * ENOSPC.
+        */
+       pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+       pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+                 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+                 have_pool_full(osdc);
+       if (was_pauserd || was_pausewr || pauserd || pausewr)
+               maybe_request_map(osdc);
+
+       kick_requests(osdc, &need_resend, &need_resend_linger);
+
+       ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+                         osdc->osdmap->epoch);
+       up_write(&osdc->lock);
+       wake_up_all(&osdc->client->auth_wq);
+       return;
+
+bad:
+       pr_err("osdc handle_map corrupt msg\n");
+       ceph_msg_dump(msg);
+       up_write(&osdc->lock);
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ */
+static void kick_osd_requests(struct ceph_osd *osd)
+{
+       struct rb_node *n;
+
+       for (n = rb_first(&osd->o_requests); n; ) {
+               struct ceph_osd_request *req =
+                   rb_entry(n, struct ceph_osd_request, r_node);
+
+               n = rb_next(n); /* cancel_linger_request() */
+
+               if (!req->r_linger) {
+                       if (!req->r_t.paused)
+                               send_request(req);
+               } else {
+                       cancel_linger_request(req);
+               }
+       }
+       for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
+               struct ceph_osd_linger_request *lreq =
+                   rb_entry(n, struct ceph_osd_linger_request, node);
+
+               send_linger(lreq);
+       }
+}
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_fault(struct ceph_connection *con)
+{
+       struct ceph_osd *osd = con->private;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+
+       dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+       down_write(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown\n", __func__, osd->o_osd);
+               goto out_unlock;
+       }
+
+       if (!reopen_osd(osd))
+               kick_osd_requests(osd);
+       maybe_request_map(osdc);
+
+out_unlock:
+       up_write(&osdc->lock);
+}
+
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+                               struct ceph_msg *msg)
+{
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front.iov_len;
+       struct ceph_osd_linger_request *lreq;
+       struct linger_work *lwork;
+       u8 proto_ver, opcode;
+       u64 cookie, notify_id;
+       u64 notifier_id = 0;
+       s32 return_code = 0;
+       void *payload = NULL;
+       u32 payload_len = 0;
+
+       ceph_decode_8_safe(&p, end, proto_ver, bad);
+       ceph_decode_8_safe(&p, end, opcode, bad);
+       ceph_decode_64_safe(&p, end, cookie, bad);
+       p += 8; /* skip ver */
+       ceph_decode_64_safe(&p, end, notify_id, bad);
+
+       if (proto_ver >= 1) {
+               ceph_decode_32_safe(&p, end, payload_len, bad);
+               ceph_decode_need(&p, end, payload_len, bad);
+               payload = p;
+               p += payload_len;
+       }
+
+       if (le16_to_cpu(msg->hdr.version) >= 2)
+               ceph_decode_32_safe(&p, end, return_code, bad);
+
+       if (le16_to_cpu(msg->hdr.version) >= 3)
+               ceph_decode_64_safe(&p, end, notifier_id, bad);
+
+       down_read(&osdc->lock);
+       lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
+       if (!lreq) {
+               dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
+                    cookie);
+               goto out_unlock_osdc;
+       }
 
-                       dout("taking full map %u len %d\n", epoch, maplen);
-                       newmap = ceph_osdmap_decode(&p, p+maplen);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
-                               goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       oldmap = osdc->osdmap;
-                       osdc->osdmap = newmap;
-                       if (oldmap) {
-                               if (oldmap->epoch + 1 < newmap->epoch)
-                                       skipped_map = 1;
-                               ceph_osdmap_destroy(oldmap);
+       mutex_lock(&lreq->lock);
+       dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
+            opcode, cookie, lreq, lreq->is_watch);
+       if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+               if (!lreq->last_error) {
+                       lreq->last_error = -ENOTCONN;
+                       queue_watch_error(lreq);
+               }
+       } else if (!lreq->is_watch) {
+               /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
+               if (lreq->notify_id && lreq->notify_id != notify_id) {
+                       dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
+                            lreq->notify_id, notify_id);
+               } else if (!completion_done(&lreq->notify_finish_wait)) {
+                       struct ceph_msg_data *data =
+                           list_first_entry_or_null(&msg->data,
+                                                    struct ceph_msg_data,
+                                                    links);
+
+                       if (data) {
+                               if (lreq->preply_pages) {
+                                       WARN_ON(data->type !=
+                                                       CEPH_MSG_DATA_PAGES);
+                                       *lreq->preply_pages = data->pages;
+                                       *lreq->preply_len = data->length;
+                               } else {
+                                       ceph_release_page_vector(data->pages,
+                                              calc_pages_for(0, data->length));
+                               }
                        }
-                       was_full = was_full ||
-                               ceph_osdmap_flag(osdc->osdmap,
-                                                CEPH_OSDMAP_FULL);
-                       kick_requests(osdc, skipped_map, was_full);
+                       lreq->notify_finish_error = return_code;
+                       complete_all(&lreq->notify_finish_wait);
+               }
+       } else {
+               /* CEPH_WATCH_EVENT_NOTIFY */
+               lwork = lwork_alloc(lreq, do_watch_notify);
+               if (!lwork) {
+                       pr_err("failed to allocate notify-lwork\n");
+                       goto out_unlock_lreq;
                }
-               p += maplen;
-               nr_maps--;
-       }
 
-       if (!osdc->osdmap)
-               goto bad;
-done:
-       downgrade_write(&osdc->map_sem);
-       ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
-                         osdc->osdmap->epoch);
+               lwork->notify.notify_id = notify_id;
+               lwork->notify.notifier_id = notifier_id;
+               lwork->notify.payload = payload;
+               lwork->notify.payload_len = payload_len;
+               lwork->notify.msg = ceph_msg_get(msg);
+               lwork_queue(lwork);
+       }
 
-       /*
-        * subscribe to subsequent osdmap updates if full to ensure
-        * we find out when we are no longer full and stop returning
-        * ENOSPC.
-        */
-       if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
-               ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-       mutex_lock(&osdc->request_mutex);
-       __send_queued(osdc);
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-       wake_up_all(&osdc->client->auth_wq);
+out_unlock_lreq:
+       mutex_unlock(&lreq->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
        return;
 
 bad:
-       pr_err("osdc handle_map corrupt msg\n");
-       ceph_msg_dump(msg);
-       up_write(&osdc->map_sem);
+       pr_err("osdc handle_watch_notify corrupt msg\n");
 }
 
 /*
- * watch/notify callback event infrastructure
- *
- * These callbacks are used both for watch and notify operations.
+ * Register request, send initial attempt.
  */
-static void __release_event(struct kref *kref)
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+                           struct ceph_osd_request *req,
+                           bool nofail)
 {
-       struct ceph_osd_event *event =
-               container_of(kref, struct ceph_osd_event, kref);
+       down_read(&osdc->lock);
+       submit_request(req, false);
+       up_read(&osdc->lock);
 
-       dout("__release_event %p\n", event);
-       kfree(event);
+       return 0;
 }
+EXPORT_SYMBOL(ceph_osdc_start_request);
 
-static void get_event(struct ceph_osd_event *event)
+/*
+ * Unregister a registered request.  The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 {
-       kref_get(&event->kref);
-}
+       struct ceph_osd_client *osdc = req->r_osdc;
 
-void ceph_osdc_put_event(struct ceph_osd_event *event)
-{
-       kref_put(&event->kref, __release_event);
+       down_write(&osdc->lock);
+       if (req->r_osd)
+               cancel_request(req);
+       up_write(&osdc->lock);
 }
-EXPORT_SYMBOL(ceph_osdc_put_event);
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
-static void __insert_event(struct ceph_osd_client *osdc,
-                            struct ceph_osd_event *new)
+/*
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
+static int wait_request_timeout(struct ceph_osd_request *req,
+                               unsigned long timeout)
 {
-       struct rb_node **p = &osdc->event_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_event *event = NULL;
+       long left;
 
-       while (*p) {
-               parent = *p;
-               event = rb_entry(parent, struct ceph_osd_event, node);
-               if (new->cookie < event->cookie)
-                       p = &(*p)->rb_left;
-               else if (new->cookie > event->cookie)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
+       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+       left = wait_for_completion_killable_timeout(&req->r_completion,
+                                               ceph_timeout_jiffies(timeout));
+       if (left <= 0) {
+               left = left ?: -ETIMEDOUT;
+               ceph_osdc_cancel_request(req);
+
+               /* kludge - need to wake ceph_osdc_sync() */
+               complete_all(&req->r_safe_completion);
+       } else {
+               left = req->r_result; /* completed */
        }
 
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &osdc->event_tree);
+       return left;
 }
 
-static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
-                                               u64 cookie)
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+                          struct ceph_osd_request *req)
 {
-       struct rb_node **p = &osdc->event_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_event *event = NULL;
-
-       while (*p) {
-               parent = *p;
-               event = rb_entry(parent, struct ceph_osd_event, node);
-               if (cookie < event->cookie)
-                       p = &(*p)->rb_left;
-               else if (cookie > event->cookie)
-                       p = &(*p)->rb_right;
-               else
-                       return event;
-       }
-       return NULL;
+       return wait_request_timeout(req, 0);
 }
+EXPORT_SYMBOL(ceph_osdc_wait_request);
 
-static void __remove_event(struct ceph_osd_event *event)
+/*
+ * sync - wait for all in-flight writes to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
 {
-       struct ceph_osd_client *osdc = event->osdc;
+       struct rb_node *n, *p;
+       u64 last_tid = atomic64_read(&osdc->last_tid);
 
-       if (!RB_EMPTY_NODE(&event->node)) {
-               dout("__remove_event removed %p\n", event);
-               rb_erase(&event->node, &osdc->event_tree);
-               ceph_osdc_put_event(event);
-       } else {
-               dout("__remove_event didn't remove %p\n", event);
-       }
-}
+again:
+       down_read(&osdc->lock);
+       for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+               struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 
-int ceph_osdc_create_event(struct ceph_osd_client *osdc,
-                          void (*event_cb)(u64, u64, u8, void *),
-                          void *data, struct ceph_osd_event **pevent)
-{
-       struct ceph_osd_event *event;
+               mutex_lock(&osd->lock);
+               for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+                       struct ceph_osd_request *req =
+                           rb_entry(p, struct ceph_osd_request, r_node);
 
-       event = kmalloc(sizeof(*event), GFP_NOIO);
-       if (!event)
-               return -ENOMEM;
+                       if (req->r_tid > last_tid)
+                               break;
 
-       dout("create_event %p\n", event);
-       event->cb = event_cb;
-       event->one_shot = 0;
-       event->data = data;
-       event->osdc = osdc;
-       INIT_LIST_HEAD(&event->osd_node);
-       RB_CLEAR_NODE(&event->node);
-       kref_init(&event->kref);   /* one ref for us */
-       kref_get(&event->kref);    /* one ref for the caller */
-
-       spin_lock(&osdc->event_lock);
-       event->cookie = ++osdc->event_count;
-       __insert_event(osdc, event);
-       spin_unlock(&osdc->event_lock);
-
-       *pevent = event;
-       return 0;
+                       if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+                               continue;
+
+                       ceph_osdc_get_request(req);
+                       mutex_unlock(&osd->lock);
+                       up_read(&osdc->lock);
+                       dout("%s waiting on req %p tid %llu last_tid %llu\n",
+                            __func__, req, req->r_tid, last_tid);
+                       wait_for_completion(&req->r_safe_completion);
+                       ceph_osdc_put_request(req);
+                       goto again;
+               }
+
+               mutex_unlock(&osd->lock);
+       }
+
+       up_read(&osdc->lock);
+       dout("%s done last_tid %llu\n", __func__, last_tid);
 }
-EXPORT_SYMBOL(ceph_osdc_create_event);
+EXPORT_SYMBOL(ceph_osdc_sync);
 
-void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+static struct ceph_osd_request *
+alloc_linger_request(struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_client *osdc = event->osdc;
+       struct ceph_osd_request *req;
 
-       dout("cancel_event %p\n", event);
-       spin_lock(&osdc->event_lock);
-       __remove_event(event);
-       spin_unlock(&osdc->event_lock);
-       ceph_osdc_put_event(event); /* caller's */
-}
-EXPORT_SYMBOL(ceph_osdc_cancel_event);
+       req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return NULL;
 
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
 
-static void do_event_work(struct work_struct *work)
-{
-       struct ceph_osd_event_work *event_work =
-               container_of(work, struct ceph_osd_event_work, work);
-       struct ceph_osd_event *event = event_work->event;
-       u64 ver = event_work->ver;
-       u64 notify_id = event_work->notify_id;
-       u8 opcode = event_work->opcode;
+       if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
+               ceph_osdc_put_request(req);
+               return NULL;
+       }
 
-       dout("do_event_work completing %p\n", event);
-       event->cb(ver, notify_id, opcode, event->data);
-       dout("do_event_work completed %p\n", event);
-       ceph_osdc_put_event(event);
-       kfree(event_work);
+       return req;
 }
 
-
 /*
- * Process osd watch notifications
+ * Returns a handle, caller owns a ref.
  */
-static void handle_watch_notify(struct ceph_osd_client *osdc,
-                               struct ceph_msg *msg)
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+               struct ceph_object_id *oid,
+               struct ceph_object_locator *oloc,
+               rados_watchcb2_t wcb,
+               rados_watcherrcb_t errcb,
+               void *data)
 {
-       void *p, *end;
-       u8 proto_ver;
-       u64 cookie, ver, notify_id;
-       u8 opcode;
-       struct ceph_osd_event *event;
-       struct ceph_osd_event_work *event_work;
+       struct ceph_osd_linger_request *lreq;
+       int ret;
 
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
+       lreq = linger_alloc(osdc);
+       if (!lreq)
+               return ERR_PTR(-ENOMEM);
 
-       ceph_decode_8_safe(&p, end, proto_ver, bad);
-       ceph_decode_8_safe(&p, end, opcode, bad);
-       ceph_decode_64_safe(&p, end, cookie, bad);
-       ceph_decode_64_safe(&p, end, ver, bad);
-       ceph_decode_64_safe(&p, end, notify_id, bad);
+       lreq->is_watch = true;
+       lreq->wcb = wcb;
+       lreq->errcb = errcb;
+       lreq->data = data;
+       lreq->watch_valid_thru = jiffies;
+
+       ceph_oid_copy(&lreq->t.base_oid, oid);
+       ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+       lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       lreq->mtime = CURRENT_TIME;
+
+       lreq->reg_req = alloc_linger_request(lreq);
+       if (!lreq->reg_req) {
+               ret = -ENOMEM;
+               goto err_put_lreq;
+       }
 
-       spin_lock(&osdc->event_lock);
-       event = __find_event(osdc, cookie);
-       if (event) {
-               BUG_ON(event->one_shot);
-               get_event(event);
-       }
-       spin_unlock(&osdc->event_lock);
-       dout("handle_watch_notify cookie %lld ver %lld event %p\n",
-            cookie, ver, event);
-       if (event) {
-               event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
-               if (!event_work) {
-                       pr_err("couldn't allocate event_work\n");
-                       ceph_osdc_put_event(event);
-                       return;
-               }
-               INIT_WORK(&event_work->work, do_event_work);
-               event_work->event = event;
-               event_work->ver = ver;
-               event_work->notify_id = notify_id;
-               event_work->opcode = opcode;
+       lreq->ping_req = alloc_linger_request(lreq);
+       if (!lreq->ping_req) {
+               ret = -ENOMEM;
+               goto err_put_lreq;
+       }
 
-               queue_work(osdc->notify_wq, &event_work->work);
+       down_write(&osdc->lock);
+       linger_register(lreq); /* before osd_req_op_* */
+       osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_WATCH);
+       osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_PING);
+       linger_submit(lreq);
+       up_write(&osdc->lock);
+
+       ret = linger_reg_commit_wait(lreq);
+       if (ret) {
+               linger_cancel(lreq);
+               goto err_put_lreq;
        }
 
-       return;
+       return lreq;
 
-bad:
-       pr_err("osdc handle_watch_notify corrupt msg\n");
+err_put_lreq:
+       linger_put(lreq);
+       return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(ceph_osdc_watch);
 
 /*
- * build new request AND message
+ * Releases a ref.
  *
+ * Times out after mount_timeout to preserve rbd unmap behaviour
+ * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
+ * with mount_timeout").
  */
-void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
-                               struct ceph_snap_context *snapc, u64 snap_id,
-                               struct timespec *mtime)
-{
-       struct ceph_msg *msg = req->r_request;
-       void *p;
-       size_t msg_size;
-       int flags = req->r_flags;
-       u64 data_len;
-       unsigned int i;
-
-       req->r_snapid = snap_id;
-       req->r_snapc = ceph_get_snap_context(snapc);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+                     struct ceph_osd_linger_request *lreq)
+{
+       struct ceph_options *opts = osdc->client->options;
+       struct ceph_osd_request *req;
+       int ret;
 
-       /* encode request */
-       msg->hdr.version = cpu_to_le16(4);
-
-       p = msg->front.iov_base;
-       ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
-       req->r_request_osdmap_epoch = p;
-       p += 4;
-       req->r_request_flags = p;
-       p += 4;
-       if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-               ceph_encode_timespec(p, mtime);
-       p += sizeof(struct ceph_timespec);
-       req->r_request_reassert_version = p;
-       p += sizeof(struct ceph_eversion); /* will get filled in */
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return -ENOMEM;
 
-       /* oloc */
-       ceph_encode_8(&p, 4);
-       ceph_encode_8(&p, 4);
-       ceph_encode_32(&p, 8 + 4 + 4);
-       req->r_request_pool = p;
-       p += 8;
-       ceph_encode_32(&p, -1);  /* preferred */
-       ceph_encode_32(&p, 0);   /* key len */
+       ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
+       ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
+       req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       req->r_mtime = CURRENT_TIME;
+       osd_req_op_watch_init(req, 0, lreq->linger_id,
+                             CEPH_OSD_WATCH_OP_UNWATCH);
 
-       ceph_encode_8(&p, 1);
-       req->r_request_pgid = p;
-       p += 8 + 4;
-       ceph_encode_32(&p, -1);  /* preferred */
+       ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+       if (ret)
+               goto out_put_req;
 
-       /* oid */
-       ceph_encode_32(&p, req->r_base_oid.name_len);
-       memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
-       dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
-            req->r_base_oid.name, req->r_base_oid.name_len);
-       p += req->r_base_oid.name_len;
-
-       /* ops--can imply data */
-       ceph_encode_16(&p, (u16)req->r_num_ops);
-       data_len = 0;
-       for (i = 0; i < req->r_num_ops; i++) {
-               data_len += osd_req_encode_op(req, p, i);
-               p += sizeof(struct ceph_osd_op);
-       }
+       ceph_osdc_start_request(osdc, req, false);
+       linger_cancel(lreq);
+       linger_put(lreq);
+       ret = wait_request_timeout(req, opts->mount_timeout);
 
-       /* snaps */
-       ceph_encode_64(&p, req->r_snapid);
-       ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
-       ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
-       if (req->r_snapc) {
-               for (i = 0; i < snapc->num_snaps; i++) {
-                       ceph_encode_64(&p, req->r_snapc->snaps[i]);
-               }
-       }
+out_put_req:
+       ceph_osdc_put_request(req);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_unwatch);
 
-       req->r_request_attempts = p;
-       p += 4;
+static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
+                                     u64 notify_id, u64 cookie, void *payload,
+                                     size_t payload_len)
+{
+       struct ceph_osd_req_op *op;
+       struct ceph_pagelist *pl;
+       int ret;
 
-       /* data */
-       if (flags & CEPH_OSD_FLAG_WRITE) {
-               u16 data_off;
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
 
-               /*
-                * The header "data_off" is a hint to the receiver
-                * allowing it to align received data into its
-                * buffers such that there's no need to re-copy
-                * it before writing it to disk (direct I/O).
-                */
-               data_off = (u16) (off & 0xffff);
-               req->r_request->hdr.data_off = cpu_to_le16(data_off);
-       }
-       req->r_request->hdr.data_len = cpu_to_le32(data_len);
+       pl = kmalloc(sizeof(*pl), GFP_NOIO);
+       if (!pl)
+               return -ENOMEM;
 
-       BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-       msg_size = p - msg->front.iov_base;
-       msg->front.iov_len = msg_size;
-       msg->hdr.front_len = cpu_to_le32(msg_size);
+       ceph_pagelist_init(pl);
+       ret = ceph_pagelist_encode_64(pl, notify_id);
+       ret |= ceph_pagelist_encode_64(pl, cookie);
+       if (payload) {
+               ret |= ceph_pagelist_encode_32(pl, payload_len);
+               ret |= ceph_pagelist_append(pl, payload, payload_len);
+       } else {
+               ret |= ceph_pagelist_encode_32(pl, 0);
+       }
+       if (ret) {
+               ceph_pagelist_release(pl);
+               return -ENOMEM;
+       }
 
-       dout("build_request msg_size was %d\n", (int)msg_size);
+       ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
+       op->indata_len = pl->length;
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_build_request);
 
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                           struct ceph_osd_request *req,
-                           bool nofail)
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+                        struct ceph_object_id *oid,
+                        struct ceph_object_locator *oloc,
+                        u64 notify_id,
+                        u64 cookie,
+                        void *payload,
+                        size_t payload_len)
 {
-       int rc;
+       struct ceph_osd_request *req;
+       int ret;
+
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
+       if (!req)
+               return -ENOMEM;
 
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
+       ceph_oid_copy(&req->r_base_oid, oid);
+       ceph_oloc_copy(&req->r_base_oloc, oloc);
+       req->r_flags = CEPH_OSD_FLAG_READ;
 
-       rc = __ceph_osdc_start_request(osdc, req, nofail);
+       ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+       if (ret)
+               goto out_put_req;
+
+       ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+                                        payload_len);
+       if (ret)
+               goto out_put_req;
 
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
+       ceph_osdc_start_request(osdc, req, false);
+       ret = ceph_osdc_wait_request(osdc, req);
 
-       return rc;
+out_put_req:
+       ceph_osdc_put_request(req);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_start_request);
+EXPORT_SYMBOL(ceph_osdc_notify_ack);
 
-/*
- * Unregister a registered request.  The request is not completed (i.e.
- * no callbacks or wakeups) - higher layers are supposed to know what
- * they are canceling.
- */
-void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+                                 u64 cookie, u32 prot_ver, u32 timeout,
+                                 void *payload, size_t payload_len)
 {
-       struct ceph_osd_client *osdc = req->r_osdc;
+       struct ceph_osd_req_op *op;
+       struct ceph_pagelist *pl;
+       int ret;
 
-       mutex_lock(&osdc->request_mutex);
-       if (req->r_linger)
-               __unregister_linger_request(osdc, req);
-       __unregister_request(osdc, req);
-       mutex_unlock(&osdc->request_mutex);
+       op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+       op->notify.cookie = cookie;
 
-       dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+       pl = kmalloc(sizeof(*pl), GFP_NOIO);
+       if (!pl)
+               return -ENOMEM;
+
+       ceph_pagelist_init(pl);
+       ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
+       ret |= ceph_pagelist_encode_32(pl, timeout);
+       ret |= ceph_pagelist_encode_32(pl, payload_len);
+       ret |= ceph_pagelist_append(pl, payload, payload_len);
+       if (ret) {
+               ceph_pagelist_release(pl);
+               return -ENOMEM;
+       }
+
+       ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
+       op->indata_len = pl->length;
+       return 0;
 }
-EXPORT_SYMBOL(ceph_osdc_cancel_request);
 
 /*
- * wait for a request to complete
+ * @timeout: in seconds
+ *
+ * @preply_{pages,len} are initialized both on success and error.
+ * The caller is responsible for:
+ *
+ *     ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
  */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+                    struct ceph_object_id *oid,
+                    struct ceph_object_locator *oloc,
+                    void *payload,
+                    size_t payload_len,
+                    u32 timeout,
+                    struct page ***preply_pages,
+                    size_t *preply_len)
 {
-       int rc;
+       struct ceph_osd_linger_request *lreq;
+       struct page **pages;
+       int ret;
 
-       dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+       WARN_ON(!timeout);
+       if (preply_pages) {
+               *preply_pages = NULL;
+               *preply_len = 0;
+       }
 
-       rc = wait_for_completion_interruptible(&req->r_completion);
-       if (rc < 0) {
-               dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
-               ceph_osdc_cancel_request(req);
-               complete_request(req);
-               return rc;
+       lreq = linger_alloc(osdc);
+       if (!lreq)
+               return -ENOMEM;
+
+       lreq->preply_pages = preply_pages;
+       lreq->preply_len = preply_len;
+
+       ceph_oid_copy(&lreq->t.base_oid, oid);
+       ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+       lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+       lreq->reg_req = alloc_linger_request(lreq);
+       if (!lreq->reg_req) {
+               ret = -ENOMEM;
+               goto out_put_lreq;
+       }
+
+       /* for notify_id */
+       pages = ceph_alloc_page_vector(1, GFP_NOIO);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               goto out_put_lreq;
+       }
+
+       down_write(&osdc->lock);
+       linger_register(lreq); /* before osd_req_op_* */
+       ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
+                                    timeout, payload, payload_len);
+       if (ret) {
+               linger_unregister(lreq);
+               up_write(&osdc->lock);
+               ceph_release_page_vector(pages, 1);
+               goto out_put_lreq;
        }
+       ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
+                                                response_data),
+                                pages, PAGE_SIZE, 0, false, true);
+       linger_submit(lreq);
+       up_write(&osdc->lock);
+
+       ret = linger_reg_commit_wait(lreq);
+       if (!ret)
+               ret = linger_notify_finish_wait(lreq);
+       else
+               dout("lreq %p failed to initiate notify %d\n", lreq, ret);
 
-       dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
-            req->r_result);
-       return req->r_result;
+       linger_cancel(lreq);
+out_put_lreq:
+       linger_put(lreq);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_wait_request);
+EXPORT_SYMBOL(ceph_osdc_notify);
 
 /*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
+ * Return the number of milliseconds since the watch was last
+ * confirmed, or an error.  If there is an error, the watch is no
+ * longer valid, and should be destroyed with ceph_osdc_unwatch().
  */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+                         struct ceph_osd_linger_request *lreq)
 {
-       struct ceph_osd_request *req;
-       u64 last_tid, next_tid = 0;
-
-       mutex_lock(&osdc->request_mutex);
-       last_tid = osdc->last_tid;
-       while (1) {
-               req = __lookup_request_ge(osdc, next_tid);
-               if (!req)
-                       break;
-               if (req->r_tid > last_tid)
-                       break;
-
-               next_tid = req->r_tid + 1;
-               if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-                       continue;
+       unsigned long stamp, age;
+       int ret;
 
-               ceph_osdc_get_request(req);
-               mutex_unlock(&osdc->request_mutex);
-               dout("sync waiting on tid %llu (last is %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_safe_completion);
-               mutex_lock(&osdc->request_mutex);
-               ceph_osdc_put_request(req);
+       down_read(&osdc->lock);
+       mutex_lock(&lreq->lock);
+       stamp = lreq->watch_valid_thru;
+       if (!list_empty(&lreq->pending_lworks)) {
+               struct linger_work *lwork =
+                   list_first_entry(&lreq->pending_lworks,
+                                    struct linger_work,
+                                    pending_item);
+
+               if (time_before(lwork->queued_stamp, stamp))
+                       stamp = lwork->queued_stamp;
        }
-       mutex_unlock(&osdc->request_mutex);
-       dout("sync done (thru tid %llu)\n", last_tid);
+       age = jiffies - stamp;
+       dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
+            lreq, lreq->linger_id, age, lreq->last_error);
+       /* we are truncating to msecs, so return a safe upper bound */
+       ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
+
+       mutex_unlock(&lreq->lock);
+       up_read(&osdc->lock);
+       return ret;
 }
-EXPORT_SYMBOL(ceph_osdc_sync);
 
 /*
  * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
 }
 EXPORT_SYMBOL(ceph_osdc_flush_notifies);
 
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
+{
+       down_read(&osdc->lock);
+       maybe_request_map(osdc);
+       up_read(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
 
 /*
  * init, shutdown
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 
        dout("init\n");
        osdc->client = client;
-       osdc->osdmap = NULL;
-       init_rwsem(&osdc->map_sem);
-       init_completion(&osdc->map_waiters);
-       osdc->last_requested_map = 0;
-       mutex_init(&osdc->request_mutex);
-       osdc->last_tid = 0;
+       init_rwsem(&osdc->lock);
        osdc->osds = RB_ROOT;
        INIT_LIST_HEAD(&osdc->osd_lru);
-       osdc->requests = RB_ROOT;
-       INIT_LIST_HEAD(&osdc->req_lru);
-       INIT_LIST_HEAD(&osdc->req_unsent);
-       INIT_LIST_HEAD(&osdc->req_notarget);
-       INIT_LIST_HEAD(&osdc->req_linger);
-       osdc->num_requests = 0;
+       spin_lock_init(&osdc->osd_lru_lock);
+       osd_init(&osdc->homeless_osd);
+       osdc->homeless_osd.o_osdc = osdc;
+       osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
+       osdc->linger_requests = RB_ROOT;
+       osdc->map_checks = RB_ROOT;
+       osdc->linger_map_checks = RB_ROOT;
        INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
        INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-       spin_lock_init(&osdc->event_lock);
-       osdc->event_tree = RB_ROOT;
-       osdc->event_count = 0;
-
-       schedule_delayed_work(&osdc->osds_timeout_work,
-           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
        err = -ENOMEM;
+       osdc->osdmap = ceph_osdmap_alloc();
+       if (!osdc->osdmap)
+               goto out;
+
        osdc->req_mempool = mempool_create_slab_pool(10,
                                                     ceph_osd_request_cache);
        if (!osdc->req_mempool)
-               goto out;
+               goto out_map;
 
        err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
-                               OSD_OP_FRONT_LEN, 10, true,
-                               "osd_op");
+                               PAGE_SIZE, 10, true, "osd_op");
        if (err < 0)
                goto out_mempool;
        err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
-                               OSD_OPREPLY_FRONT_LEN, 10, true,
-                               "osd_op_reply");
+                               PAGE_SIZE, 10, true, "osd_op_reply");
        if (err < 0)
                goto out_msgpool;
 
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
        if (!osdc->notify_wq)
                goto out_msgpool_reply;
 
+       schedule_delayed_work(&osdc->timeout_work,
+                             osdc->client->options->osd_keepalive_timeout);
+       schedule_delayed_work(&osdc->osds_timeout_work,
+           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
+
        return 0;
 
 out_msgpool_reply:
@@ -2709,6 +3936,8 @@ out_msgpool:
        ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
        mempool_destroy(osdc->req_mempool);
+out_map:
+       ceph_osdmap_destroy(osdc->osdmap);
 out:
        return err;
 }
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
        destroy_workqueue(osdc->notify_wq);
        cancel_delayed_work_sync(&osdc->timeout_work);
        cancel_delayed_work_sync(&osdc->osds_timeout_work);
-       if (osdc->osdmap) {
-               ceph_osdmap_destroy(osdc->osdmap);
-               osdc->osdmap = NULL;
+
+       down_write(&osdc->lock);
+       while (!RB_EMPTY_ROOT(&osdc->osds)) {
+               struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+                                               struct ceph_osd, o_node);
+               close_osd(osd);
        }
-       remove_all_osds(osdc);
+       up_write(&osdc->lock);
+       WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+       osd_cleanup(&osdc->homeless_osd);
+
+       WARN_ON(!list_empty(&osdc->osd_lru));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
+       WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
+       WARN_ON(atomic_read(&osdc->num_requests));
+       WARN_ON(atomic_read(&osdc->num_homeless));
+
+       ceph_osdmap_destroy(osdc->osdmap);
        mempool_destroy(osdc->req_mempool);
        ceph_msgpool_destroy(&osdc->msgpool_op);
        ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                return PTR_ERR(req);
 
        /* it may be a short read due to an object boundary */
-
        osd_req_op_extent_osd_data_pages(req, 0,
                                pages, *plen, page_align, false, false);
 
        dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
             off, *plen, *plen, page_align);
 
-       ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
        rc = ceph_osdc_start_request(osdc, req, false);
        if (!rc)
                rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        int rc = 0;
        int page_align = off & ~PAGE_MASK;
 
-       BUG_ON(vino.snap != CEPH_NOSNAP);       /* snapshots aren't writeable */
        req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
                                    CEPH_OSD_OP_WRITE,
                                    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
                                false, false);
        dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
 
-       ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
-
+       req->r_mtime = *mtime;
        rc = ceph_osdc_start_request(osdc, req, true);
        if (!rc)
                rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 {
        struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
+       struct ceph_osd_client *osdc = osd->o_osdc;
        int type = le16_to_cpu(msg->hdr.type);
 
-       if (!osd)
-               goto out;
-       osdc = osd->o_osdc;
-
        switch (type) {
        case CEPH_MSG_OSD_MAP:
                ceph_osdc_handle_map(osdc, msg);
                break;
        case CEPH_MSG_OSD_OPREPLY:
-               handle_reply(osdc, msg);
+               handle_reply(osd, msg);
                break;
        case CEPH_MSG_WATCH_NOTIFY:
                handle_watch_notify(osdc, msg);
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
                pr_err("received unknown message type %d %s\n", type,
                       ceph_msg_type_name(type));
        }
-out:
+
        ceph_msg_put(msg);
 }
 
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 {
        struct ceph_osd *osd = con->private;
        struct ceph_osd_client *osdc = osd->o_osdc;
-       struct ceph_msg *m;
+       struct ceph_msg *m = NULL;
        struct ceph_osd_request *req;
        int front_len = le32_to_cpu(hdr->front_len);
        int data_len = le32_to_cpu(hdr->data_len);
-       u64 tid;
+       u64 tid = le64_to_cpu(hdr->tid);
+
+       down_read(&osdc->lock);
+       if (!osd_registered(osd)) {
+               dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
+               *skip = 1;
+               goto out_unlock_osdc;
+       }
+       WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
 
-       tid = le64_to_cpu(hdr->tid);
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
+       mutex_lock(&osd->lock);
+       req = lookup_request(&osd->o_requests, tid);
        if (!req) {
                dout("%s osd%d tid %llu unknown, skipping\n", __func__,
                     osd->o_osd, tid);
-               m = NULL;
                *skip = 1;
-               goto out;
+               goto out_unlock_session;
        }
 
        ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
                                 false);
                if (!m)
-                       goto out;
+                       goto out_unlock_session;
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
        }
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                        req->r_reply->data_length);
                m = NULL;
                *skip = 1;
-               goto out;
+               goto out_unlock_session;
        }
 
        m = ceph_msg_get(req->r_reply);
        dout("get_reply tid %lld %p\n", tid, m);
 
-out:
-       mutex_unlock(&osdc->request_mutex);
+out_unlock_session:
+       mutex_unlock(&osd->lock);
+out_unlock_osdc:
+       up_read(&osdc->lock);
+       return m;
+}
+
+/* Allocate a message for @hdr and, if it carries data, a page vector for it.
+ * TODO: switch to a msg-owned pagelist
+ */
+static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
+{
+       struct ceph_msg *m;     /* result; NULL if either allocation fails */
+       int type = le16_to_cpu(hdr->type);
+       u32 front_len = le32_to_cpu(hdr->front_len);
+       u32 data_len = le32_to_cpu(hdr->data_len);
+
+       m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+       if (!m)
+               return NULL;
+
+       if (data_len) { /* header announces a data payload */
+               struct page **pages;
+               struct ceph_osd_data osd_data;
+
+               pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
+                                              GFP_NOIO);
+               if (!pages) {
+                       ceph_msg_put(m);        /* drop the message allocated above */
+                       return NULL;
+               }
+
+               ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
+                                        false);
+               ceph_osdc_msg_data_add(m, &osd_data);
+       }
+
        return m;
 }
 
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 {
        struct ceph_osd *osd = con->private;
        int type = le16_to_cpu(hdr->type);
-       int front = le32_to_cpu(hdr->front_len);
 
        *skip = 0;
        switch (type) {
        case CEPH_MSG_OSD_MAP:
        case CEPH_MSG_WATCH_NOTIFY:
-               return ceph_msg_new(type, front, GFP_NOFS, false);
+               return alloc_msg_with_page_vector(hdr);
        case CEPH_MSG_OSD_OPREPLY:
                return get_reply(con, hdr, skip);
        default:
-               pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-                       osd->o_osd);
+               pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
+                       osd->o_osd, type);
                *skip = 1;
                return NULL;
        }
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
        .alloc_msg = alloc_msg,
        .sign_message = osd_sign_message,
        .check_message_signature = osd_check_message_signature,
-       .fault = osd_reset,
+       .fault = osd_fault,
 };
index 243574c8cf33807fcaf9374530358f1e44080764..cde52e94732f36f7a4738a9c0f56296640560494 100644 (file)
@@ -380,23 +380,24 @@ bad:
        return ERR_PTR(err);
 }
 
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
 {
-       if (l.pool < r.pool)
+       if (lhs->pool < rhs->pool)
                return -1;
-       if (l.pool > r.pool)
+       if (lhs->pool > rhs->pool)
                return 1;
-       if (l.seed < r.seed)
+       if (lhs->seed < rhs->seed)
                return -1;
-       if (l.seed > r.seed)
+       if (lhs->seed > rhs->seed)
                return 1;
+
        return 0;
 }
 
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
                               struct rb_root *root)
 {
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
        while (*p) {
                parent = *p;
                pg = rb_entry(parent, struct ceph_pg_mapping, node);
-               c = pgid_cmp(new->pgid, pg->pgid);
+               c = ceph_pg_compare(&new->pgid, &pg->pgid);
                if (c < 0)
                        p = &(*p)->rb_left;
                else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
        while (n) {
                pg = rb_entry(n, struct ceph_pg_mapping, node);
-               c = pgid_cmp(pgid, pg->pgid);
+               c = ceph_pg_compare(&pgid, &pg->pgid);
                if (c < 0) {
                        n = n->rb_left;
                } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
        *p += 4;  /* skip crash_replay_interval */
 
        if (ev >= 7)
-               *p += 1;  /* skip min_size */
+               pi->min_size = ceph_decode_8(p);
+       else
+               pi->min_size = pi->size - pi->size / 2;
 
        if (ev >= 8)
                *p += 8 + 8;  /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
                pi->write_tier = -1;
        }
 
+       if (ev >= 10) {
+               /* skip properties */
+               num = ceph_decode_32(p);
+               while (num--) {
+                       len = ceph_decode_32(p);
+                       *p += len; /* key */
+                       len = ceph_decode_32(p);
+                       *p += len; /* val */
+               }
+       }
+
+       if (ev >= 11) {
+               /* skip hit_set_params */
+               *p += 1 + 1; /* versions */
+               len = ceph_decode_32(p);
+               *p += len;
+
+               *p += 4; /* skip hit_set_period */
+               *p += 4; /* skip hit_set_count */
+       }
+
+       if (ev >= 12)
+               *p += 4; /* skip stripe_width */
+
+       if (ev >= 13) {
+               *p += 8; /* skip target_max_bytes */
+               *p += 8; /* skip target_max_objects */
+               *p += 4; /* skip cache_target_dirty_ratio_micro */
+               *p += 4; /* skip cache_target_full_ratio_micro */
+               *p += 4; /* skip cache_min_flush_age */
+               *p += 4; /* skip cache_min_evict_age */
+       }
+
+       if (ev >=  14) {
+               /* skip erasure_code_profile */
+               len = ceph_decode_32(p);
+               *p += len;
+       }
+
+       if (ev >= 15)
+               pi->last_force_request_resend = ceph_decode_32(p);
+       else
+               pi->last_force_request_resend = 0;
+
        /* ignore the rest */
 
        *p = pool_end;
@@ -660,6 +707,23 @@ bad:
 /*
  * osd map
  */
+struct ceph_osdmap *ceph_osdmap_alloc(void)
+{
+       struct ceph_osdmap *map;        /* new map; NULL is returned on ENOMEM */
+
+       map = kzalloc(sizeof(*map), GFP_NOIO);  /* everything else starts zeroed */
+       if (!map)
+               return NULL;
+
+       map->pg_pools = RB_ROOT;        /* the three rbtrees start out empty */
+       map->pool_max = -1;     /* presumably "no pools yet" - TODO confirm */
+       map->pg_temp = RB_ROOT;
+       map->primary_temp = RB_ROOT;
+       mutex_init(&map->crush_scratch_mutex);
+
+       return map;
+}
+
 void ceph_osdmap_destroy(struct ceph_osdmap *map)
 {
        dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
        struct ceph_osdmap *map;
        int ret;
 
-       map = kzalloc(sizeof(*map), GFP_NOFS);
+       map = ceph_osdmap_alloc();
        if (!map)
                return ERR_PTR(-ENOMEM);
 
-       map->pg_temp = RB_ROOT;
-       map->primary_temp = RB_ROOT;
-       mutex_init(&map->crush_scratch_mutex);
-
        ret = osdmap_decode(p, end, map);
        if (ret) {
                ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  * decode and apply an incremental map update.
  */
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                            struct ceph_osdmap *map,
-                                            struct ceph_messenger *msgr)
+                                            struct ceph_osdmap *map)
 {
        struct crush_map *newcrush = NULL;
        struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
        return ERR_PTR(err);
 }
 
+void ceph_oid_copy(struct ceph_object_id *dest,
+                  const struct ceph_object_id *src)
+{
+       WARN_ON(!ceph_oid_empty(dest)); /* @dest is expected to be empty */
+
+       if (src->name != src->inline_name) {
+               /* external name: very rare, see ceph_object_id definition */
+               dest->name = kmalloc(src->name_len + 1,
+                                    GFP_NOIO | __GFP_NOFAIL);
+       }
+
+       /* name_len + 1 bytes: the name plus the byte following it */
+       memcpy(dest->name, src->name, src->name_len + 1);
+       dest->name_len = src->name_len;
+}
+EXPORT_SYMBOL(ceph_oid_copy);
+
+static __printf(2, 0)
+int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
+{
+       int len;        /* vsnprintf() result for the formatted name */
+
+       WARN_ON(!ceph_oid_empty(oid));  /* @oid is expected to be empty */
+
+       len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
+       if (len >= sizeof(oid->inline_name))
+               return len;     /* didn't fit inline: hand back the length */
+
+       oid->name_len = len;
+       return 0;       /* name fit into the inline buffer */
+}
+
+/*
+ * Format a name into @oid.  If oid doesn't fit into inline buffer, BUG.
+ */
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       BUG_ON(oid_printf_vargs(oid, fmt, ap)); /* non-zero == didn't fit */
+       va_end(ap);
+}
+EXPORT_SYMBOL(ceph_oid_printf);
+
+static __printf(3, 0)
+int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
+                     const char *fmt, va_list ap)
+{
+       va_list aq;     /* copy of @ap: the arguments may be formatted twice */
+       int len;
+
+       va_copy(aq, ap);
+       len = oid_printf_vargs(oid, fmt, aq);   /* try the inline buffer first */
+       va_end(aq);
+
+       if (len) {      /* didn't fit inline; len is the length reported */
+               char *external_name;
+
+               external_name = kmalloc(len + 1, gfp);
+               if (!external_name)
+                       return -ENOMEM;
+
+               oid->name = external_name;
+               WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
+               oid->name_len = len;
+       }
+
+       return 0;       /* name is set, inline or external */
+}
+
+/* Format a name into @oid.  If oid doesn't fit into inline buffer,
+ * allocate with @gfp.  Returns 0, or -ENOMEM if that allocation fails.
+ */
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+                    const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+
+       va_start(ap, fmt);
+       ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
+       va_end(ap);
+
+       return ret;
+}
+EXPORT_SYMBOL(ceph_oid_aprintf);
+
+void ceph_oid_destroy(struct ceph_object_id *oid)
+{
+       if (oid->name != oid->inline_name)      /* name lives outside @oid */
+               kfree(oid->name);
+}
+EXPORT_SYMBOL(ceph_oid_destroy);
+
+/*
+ * Compare osds only (size and array contents); ->primary is not looked at.
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+                        const struct ceph_osds *rhs)
+{
+       if (lhs->size == rhs->size &&
+           !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+               return true;
+
+       return false;
+}
+
+/*
+ * Compare osds + primary: true only if both the arrays and ->primary match.
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+                      const struct ceph_osds *rhs)
+{
+       if (__osds_equal(lhs, rhs) &&
+           lhs->primary == rhs->primary)
+               return true;
+
+       return false;
+}
+
+static bool osds_valid(const struct ceph_osds *set)
+{
+       /* non-empty set */
+       if (set->size > 0 && set->primary >= 0)
+               return true;
+
+       /* empty can_shift_osds set */
+       if (!set->size && set->primary == -1)
+               return true;
+
+       /* empty !can_shift_osds set - all NONE */
+       if (set->size > 0 && set->primary == -1) {
+               int i;
+
+               for (i = 0; i < set->size; i++) {
+                       if (set->osds[i] != CRUSH_ITEM_NONE)
+                               break;  /* a real osd but no primary */
+               }
+               if (i == set->size)     /* loop ran to the end: all NONE */
+                       return true;
+       }
+
+       return false;   /* none of the three accepted shapes */
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+       memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); /* first ->size entries only */
+       dest->size = src->size;
+       dest->primary = src->primary;
+}
+
+static bool is_split(const struct ceph_pg *pgid,
+                    u32 old_pg_num,
+                    u32 new_pg_num)
+{
+       int old_bits = calc_bits_of(old_pg_num);
+       int old_mask = (1 << old_bits) - 1;
+       int n;
+
+       WARN_ON(pgid->seed >= old_pg_num);      /* seed expected below old pg_num */
+       if (new_pg_num <= old_pg_num)
+               return false;   /* pg_num did not grow */
+
+       for (n = 1; ; n++) {    /* left only via break or return below */
+               int next_bit = n << (old_bits - 1);
+               u32 s = next_bit | pgid->seed;  /* candidate seed to test */
+
+               if (s < old_pg_num || s == pgid->seed)
+                       continue;       /* not a newly introduced seed */
+               if (s >= new_pg_num)
+                       break;          /* at or past the new pg_num */
+
+               s = ceph_stable_mod(s, old_pg_num, old_mask);
+               if (s == pgid->seed)
+                       return true;    /* candidate reduces to @pgid's seed */
+       }
+
+       return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid)
+{
+       /* true if any old/new pair differs or @pgid splits under new_pg_num */
+       return !osds_equal(old_acting, new_acting) ||
+              !osds_equal(old_up, new_up) ||
+              old_size != new_size ||
+              old_min_size != new_min_size ||
+              is_split(pgid, old_pg_num, new_pg_num) ||
+              old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+       int i;
+
+       for (i = 0; i < acting->size; i++) {
+               if (acting->osds[i] == osd)
+                       return i;       /* rank == first index holding @osd */
+       }
+
+       return -1;      /* @osd is not in @acting */
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+                           const struct ceph_osds *new_acting)
+{
+       if (!old_acting->size && !new_acting->size)
+               return false; /* both still empty */
 
+       if (!old_acting->size ^ !new_acting->size)
+               return true; /* was empty, now not, or vice versa */
 
+       if (old_acting->primary != new_acting->primary)
+               return true; /* primary changed */
+
+       if (calc_pg_rank(old_acting->primary, old_acting) !=
+           calc_pg_rank(new_acting->primary, new_acting))
+               return true; /* same primary, but at a different rank */
+
+       return false; /* same primary (though replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change)
+{
+       if (primary_changed(old_acting, new_acting))
+               return true;    /* reported regardless of @any_change */
+
+       if (any_change && !__osds_equal(old_acting, new_acting))
+               return true;    /* caller also wants non-primary changes */
+
+       return false;
+}
 
 /*
  * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
 /*
- * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be
- * called with target's (oloc, oid), since tiering isn't taken into
- * account.
+ * Map an object into a PG.
+ *
+ * Should only be called with target_oid and target_oloc (as opposed to
+ * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
-                       struct ceph_object_locator *oloc,
-                       struct ceph_object_id *oid,
-                       struct ceph_pg *pg_out)
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+                             struct ceph_object_id *oid,
+                             struct ceph_object_locator *oloc,
+                             struct ceph_pg *raw_pgid)
 {
        struct ceph_pg_pool_info *pi;
 
-       pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+       pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
        if (!pi)
-               return -EIO;
+               return -ENOENT;
 
-       pg_out->pool = oloc->pool;
-       pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
-                                    oid->name_len);
+       raw_pgid->pool = oloc->pool;
+       raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+                                      oid->name_len);
 
-       dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
-            pg_out->pool, pg_out->seed);
+       dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
+            oid->name, raw_pgid->pool, raw_pgid->seed);
        return 0;
 }
-EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+EXPORT_SYMBOL(ceph_object_locator_to_pg);
+
+/* Map a raw PG (full precision ps) into an actual PG: the pool is kept
+ * and the seed is reduced with ceph_stable_mod() over pi->pg_num.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+                        const struct ceph_pg *raw_pgid,
+                        struct ceph_pg *pgid)
+{
+       pgid->pool = raw_pgid->pool;
+       pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+                                    pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed): stable-mod the seed by pgp_num, then mix in the pool id -
+ * hashed if CEPH_POOL_FLAG_HASHPSPOOL is set, added otherwise.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+                        const struct ceph_pg *raw_pgid)
+{
+       if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+               /* hash pool id and seed so that pool PGs do not overlap */
+               return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                     ceph_stable_mod(raw_pgid->seed,
+                                                     pi->pgp_num,
+                                                     pi->pgp_num_mask),
+                                     raw_pgid->pool);
+       } else {
+               /*
+                * legacy behavior: add ps and pool together.  this is
+                * not a great approach because the PGs from each pool
+                * will overlap on top of each other: 0.5 == 1.4 ==
+                * 2.3 == ...
+                */
+               return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+                                      pi->pgp_num_mask) +
+                      (unsigned)raw_pgid->pool;
+       }
+}
 
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
                    int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 }
 
 /*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG.  The result may
+ * contain nonexistent OSDs.  ->primary is undefined for a raw set.
  *
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
  */
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
-                         struct ceph_pg_pool_info *pool,
-                         struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+                          struct ceph_pg_pool_info *pi,
+                          const struct ceph_pg *raw_pgid,
+                          struct ceph_osds *raw,
+                          u32 *ppps)
 {
+       u32 pps = raw_pg_to_pps(pi, raw_pgid);
        int ruleno;
        int len;
 
-       /* crush */
-       ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
-                                pool->type, pool->size);
+       ceph_osds_init(raw);
+       if (ppps)
+               *ppps = pps;
+
+       ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+                                pi->size);
        if (ruleno < 0) {
                pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
-                      pgid.pool, pool->crush_ruleset, pool->type,
-                      pool->size);
-               return -ENOENT;
+                      pi->id, pi->crush_ruleset, pi->type, pi->size);
+               return;
        }
 
-       len = do_crush(osdmap, ruleno, pps, osds,
-                      min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+       len = do_crush(osdmap, ruleno, pps, raw->osds,
+                      min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
                       osdmap->osd_weight, osdmap->max_osd);
        if (len < 0) {
                pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
-                      len, ruleno, pgid.pool, pool->crush_ruleset,
-                      pool->type, pool->size);
-               return len;
+                      len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+                      pi->size);
+               return;
        }
 
-       return len;
+       raw->size = len;
 }
 
 /*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary.  By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
  *
- * Return up set length.  *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set.  If it's
+ * empty, ->primary will remain undefined.
  */
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
-                         struct ceph_pg_pool_info *pool,
-                         int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+                          struct ceph_pg_pool_info *pi,
+                          struct ceph_osds *set)
 {
-       int up_primary = -1;
        int i;
 
-       if (ceph_can_shift_osds(pool)) {
+       /* ->primary is undefined for a raw set */
+       BUG_ON(set->primary != -1);
+
+       if (ceph_can_shift_osds(pi)) {
                int removed = 0;
 
-               for (i = 0; i < len; i++) {
-                       if (ceph_osd_is_down(osdmap, osds[i])) {
+               /* shift left */
+               for (i = 0; i < set->size; i++) {
+                       if (ceph_osd_is_down(osdmap, set->osds[i])) {
                                removed++;
                                continue;
                        }
                        if (removed)
-                               osds[i - removed] = osds[i];
+                               set->osds[i - removed] = set->osds[i];
                }
-
-               len -= removed;
-               if (len > 0)
-                       up_primary = osds[0];
+               set->size -= removed;
+               if (set->size > 0)
+                       set->primary = set->osds[0];
        } else {
-               for (i = len - 1; i >= 0; i--) {
-                       if (ceph_osd_is_down(osdmap, osds[i]))
-                               osds[i] = CRUSH_ITEM_NONE;
+               /* set down/dne devices to NONE */
+               for (i = set->size - 1; i >= 0; i--) {
+                       if (ceph_osd_is_down(osdmap, set->osds[i]))
+                               set->osds[i] = CRUSH_ITEM_NONE;
                        else
-                               up_primary = osds[i];
+                               set->primary = set->osds[i];
                }
        }
-
-       *primary = up_primary;
-       return len;
 }
 
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
-                                  struct ceph_pg_pool_info *pool,
-                                  int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+                                  struct ceph_pg_pool_info *pi,
+                                  u32 pps,
+                                  struct ceph_osds *up)
 {
        int i;
        int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
        if (!osdmap->osd_primary_affinity)
                return;
 
-       for (i = 0; i < len; i++) {
-               int osd = osds[i];
+       for (i = 0; i < up->size; i++) {
+               int osd = up->osds[i];
 
                if (osd != CRUSH_ITEM_NONE &&
                    osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
                        break;
                }
        }
-       if (i == len)
+       if (i == up->size)
                return;
 
        /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
         * osd into the hash/rng so that a proportional fraction of an
         * osd's pgs get rejected as primary.
         */
-       for (i = 0; i < len; i++) {
-               int osd = osds[i];
+       for (i = 0; i < up->size; i++) {
+               int osd = up->osds[i];
                u32 aff;
 
                if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
        if (pos < 0)
                return;
 
-       *primary = osds[pos];
+       up->primary = up->osds[pos];
 
-       if (ceph_can_shift_osds(pool) && pos > 0) {
+       if (ceph_can_shift_osds(pi) && pos > 0) {
                /* move the new primary to the front */
                for (i = pos; i > 0; i--)
-                       osds[i] = osds[i - 1];
-               osds[0] = *primary;
+                       up->osds[i] = up->osds[i - 1];
+               up->osds[0] = up->primary;
        }
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length.  *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings.  This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-                      struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-                      int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pi,
+                         const struct ceph_pg *raw_pgid,
+                         struct ceph_osds *temp)
 {
+       struct ceph_pg pgid;
        struct ceph_pg_mapping *pg;
-       int temp_len;
-       int temp_primary;
        int i;
 
-       /* raw_pg -> pg */
-       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-                                   pool->pg_num_mask);
+       raw_pg_to_pg(pi, raw_pgid, &pgid);
+       ceph_osds_init(temp);
 
        /* pg_temp? */
        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
        if (pg) {
-               temp_len = 0;
-               temp_primary = -1;
-
                for (i = 0; i < pg->pg_temp.len; i++) {
                        if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-                               if (ceph_can_shift_osds(pool))
+                               if (ceph_can_shift_osds(pi))
                                        continue;
-                               else
-                                       osds[temp_len++] = CRUSH_ITEM_NONE;
+
+                               temp->osds[temp->size++] = CRUSH_ITEM_NONE;
                        } else {
-                               osds[temp_len++] = pg->pg_temp.osds[i];
+                               temp->osds[temp->size++] = pg->pg_temp.osds[i];
                        }
                }
 
                /* apply pg_temp's primary */
-               for (i = 0; i < temp_len; i++) {
-                       if (osds[i] != CRUSH_ITEM_NONE) {
-                               temp_primary = osds[i];
+               for (i = 0; i < temp->size; i++) {
+                       if (temp->osds[i] != CRUSH_ITEM_NONE) {
+                               temp->primary = temp->osds[i];
                                break;
                        }
                }
-       } else {
-               temp_len = len;
-               temp_primary = *primary;
        }
 
        /* primary_temp? */
        pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
        if (pg)
-               temp_primary = pg->primary_temp.osd;
-
-       *primary = temp_primary;
-       return temp_len;
+               temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error.  *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+                              const struct ceph_pg *raw_pgid,
+                              struct ceph_osds *up,
+                              struct ceph_osds *acting)
 {
-       struct ceph_pg_pool_info *pool;
+       struct ceph_pg_pool_info *pi;
        u32 pps;
-       int len;
 
-       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-       if (!pool) {
-               *primary = -1;
-               return -ENOENT;
+       pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+       if (!pi) {
+               ceph_osds_init(up);
+               ceph_osds_init(acting);
+               goto out;
        }
 
-       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-               /* hash pool id and seed so that pool PGs do not overlap */
-               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                                    pool->pgp_num_mask),
-                                    pgid.pool);
-       } else {
-               /*
-                * legacy behavior: add ps and pool together.  this is
-                * not a great approach because the PGs from each pool
-                * will overlap on top of each other: 0.5 == 1.4 ==
-                * 2.3 == ...
-                */
-               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                     pool->pgp_num_mask) +
-                       (unsigned)pgid.pool;
-       }
-
-       len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-       if (len < 0) {
-               *primary = -1;
-               return len;
+       pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+       raw_to_up_osds(osdmap, pi, up);
+       apply_primary_affinity(osdmap, pi, pps, up);
+       get_temp_osds(osdmap, pi, raw_pgid, acting);
+       if (!acting->size) {
+               memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+               acting->size = up->size;
+               if (acting->primary == -1)
+                       acting->primary = up->primary;
        }
-
-       len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-       apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-       len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-       return len;
+out:
+       WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+                             const struct ceph_pg *raw_pgid)
 {
-       int osds[CEPH_PG_MAX_SIZE];
-       int primary;
-
-       ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+       struct ceph_osds up, acting;
 
-       return primary;
+       ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+       return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
index 02f53674dc39d6973d374f18b43bd0b1b93952c7..040ff627c18a52f463e6a27e06b3794f4b0b5b10 100644 (file)
@@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void)
  */
 struct rpc_cred *
 rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
-               int flags)
+               int flags, gfp_t gfp)
 {
        LIST_HEAD(free);
        struct rpc_cred_cache *cache = auth->au_credcache;
@@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
        if (flags & RPCAUTH_LOOKUP_RCU)
                return ERR_PTR(-ECHILD);
 
-       new = auth->au_ops->crcreate(auth, acred, flags);
+       new = auth->au_ops->crcreate(auth, acred, flags, gfp);
        if (IS_ERR(new)) {
                cred = new;
                goto out;
@@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
                new = rpcauth_bind_new_cred(task, lookupflags);
        if (IS_ERR(new))
                return PTR_ERR(new);
-       if (req->rq_cred != NULL)
-               put_rpccred(req->rq_cred);
+       put_rpccred(req->rq_cred);
        req->rq_cred = new;
        return 0;
 }
@@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 void
 put_rpccred(struct rpc_cred *cred)
 {
+       if (cred == NULL)
+               return;
        /* Fast path for unhashed credentials */
        if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
                if (atomic_dec_and_test(&cred->cr_count))
index 41248b1820c778e2e2dd6dc89c3437692673d7cd..54dd3fdead54b8813bb6cbdf769f96a11d8f49ea 100644 (file)
@@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 
+/*
+ * Look up @acred in the credential cache of the generic auth flavor,
+ * handing @flags and the allocation mask @gfp through unchanged.
+ */
+struct rpc_cred *
+rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp)
+{
+       struct rpc_auth *auth = &generic_auth;
+
+       return rpcauth_lookup_credcache(auth, acred, flags, gfp);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred);
+
 struct rpc_cred *rpc_lookup_cred_nonblock(void)
 {
        return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
@@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
 static struct rpc_cred *
 generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+       return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
-generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct generic_cred *gcred;
 
-       gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+       gcred = kmalloc(sizeof(*gcred), gfp);
        if (gcred == NULL)
                return ERR_PTR(-ENOMEM);
 
index 15612ffa8d57271c2dc8cac81a148d910a3cff99..e64ae93d5b4f618e216f2c55070ceef60bd737d5 100644 (file)
@@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(auth, acred, flags);
+       return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
        struct gss_cred *cred = NULL;
@@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
                __func__, from_kuid(&init_user_ns, acred->uid),
                auth->au_flavor);
 
-       if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+       if (!(cred = kzalloc(sizeof(*cred), gfp)))
                goto out_err;
 
        rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
index 1095be9c80ab809900d2bf0afbde9c63b6034a9d..e085f5ae1548194603de4af635388624daec6516 100644 (file)
@@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
        struct rsc *found;
 
        memset(&rsci, 0, sizeof(rsci));
-       if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
-               return NULL;
+       rsci.handle.data = handle->data;
+       rsci.handle.len = handle->len;
        found = rsc_lookup(cd, &rsci);
-       rsc_free(&rsci);
        if (!found)
                return NULL;
        if (cache_check(cd, &found->h, NULL))
@@ -857,8 +856,8 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g
                goto out;
        if (svc_getnl(&buf->head[0]) != seq)
                goto out;
-       /* trim off the mic at the end before returning */
-       xdr_buf_trim(buf, mic.len + 4);
+       /* trim off the mic and padding at the end before returning */
+       xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4);
        stat = 0;
 out:
        kfree(mic.data);
index 0d3dd364c22f5e1671e1048d8abf4d37b51e268b..9f65452b7cbcbe380a36c0941176b5562a434f88 100644 (file)
@@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-       return rpcauth_lookup_credcache(auth, acred, flags);
+       return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
 }
 
 static struct rpc_cred *
-unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp)
 {
        struct unx_cred *cred;
        unsigned int groups = 0;
@@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
                        from_kuid(&init_user_ns, acred->uid),
                        from_kgid(&init_user_ns, acred->gid));
 
-       if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+       if (!(cred = kmalloc(sizeof(*cred), gfp)))
                return ERR_PTR(-ENOMEM);
 
        rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
index 7e0c9bf22df811475385496007cfac321e0422a5..06b4df9faaa1696b1e21ead9ddacea85cf41f760 100644 (file)
@@ -1413,6 +1413,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_max_payload);
 
+/**
+ * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes
+ * @clnt: RPC client to query
+ *
+ * The client's current transport is sampled under the RCU read lock
+ * and asked for its ->bc_maxpayload value, which is returned as is.
+ */
+size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
+{
+       struct rpc_xprt *transport;
+       size_t max_payload;
+
+       rcu_read_lock();
+       transport = rcu_dereference(clnt->cl_xprt);
+       max_payload = transport->ops->bc_maxpayload(transport);
+       rcu_read_unlock();
+
+       return max_payload;
+}
+EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+
 /**
  * rpc_get_timeout - Get timeout for transport in units of HZ
  * @clnt: RPC client to query
index 7422f28818b24de5e7b36eca01ac2e91f9a7cd4f..f5572e31d518f85b2afbbb9ec1b8e89d36448803 100644 (file)
@@ -244,13 +244,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
        svc_xprt_received(new);
 }
 
-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
                    struct net *net, const int family,
                    const unsigned short port, int flags)
 {
        struct svc_xprt_class *xcl;
 
-       dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
        spin_lock(&svc_xprt_class_lock);
        list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
                struct svc_xprt *newxprt;
@@ -274,12 +273,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
        }
  err:
        spin_unlock(&svc_xprt_class_lock);
-       dprintk("svc: transport %s not found\n", xprt_name);
-
        /* This errno is exposed to user space.  Provide a reasonable
         * perror msg for a bad transport. */
        return -EPROTONOSUPPORT;
 }
+
+/*
+ * Create a transport of class @xprt_name for @serv. If no registered
+ * class matches (-EPROTONOSUPPORT), try to load the "svc<xprt_name>"
+ * module and attempt the creation once more.
+ *
+ * Returns whatever _svc_create_xprt() returned last. Only negative
+ * values are reported as failures: the success value need not be
+ * zero (presumably it is the bound port -- confirm against
+ * _svc_create_xprt()), so it must not trigger the failure message.
+ */
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+                   struct net *net, const int family,
+                   const unsigned short port, int flags)
+{
+       int err;
+
+       dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+       err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+       if (err == -EPROTONOSUPPORT) {
+               request_module("svc%s", xprt_name);
+               err = _svc_create_xprt(serv, xprt_name, net, family, port, flags);
+       }
+       if (err < 0)
+               dprintk("svc: transport %s not found, err %d\n",
+                       xprt_name, err);
+       return err;
+}
 EXPORT_SYMBOL_GPL(svc_create_xprt);
 
 /*
index 6bdb3865212d2edee16454970e21903870fd5d18..c4f3cc0c07752e3fbbe8e72316f2203a065e1646 100644 (file)
@@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
                xdr_set_iov(xdr, buf->head, buf->len);
        else if (buf->page_len != 0)
                xdr_set_page_base(xdr, 0, buf->len);
+       else
+               xdr_set_iov(xdr, buf->head, buf->len);
        if (p != NULL && p > xdr->p && xdr->end >= p) {
                xdr->nwords -= p - xdr->p;
                xdr->p = p;
index 2dcd7640eeb525d6a7f1ba8bd3eab047cda30f75..87762d976b63b9a1ebc5cb7a8b2603ed17b8b010 100644 (file)
@@ -191,6 +191,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
        return 0;
 }
 
+/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns the smaller of the transport's two inline thresholds,
+ * in bytes, less RPCRDMA_HDRLEN_MIN.
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+       struct rpcrdma_xprt *rdma_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_create_data_internal *params = &rdma_xprt->rx_data;
+       size_t inline_limit;
+
+       /* The smaller inline threshold bounds the whole message */
+       inline_limit = min_t(unsigned int, params->inline_rsize,
+                            params->inline_wsize);
+
+       return inline_limit - RPCRDMA_HDRLEN_MIN;
+}
+
 /**
  * rpcrdma_bc_marshal_reply - Send backwards direction reply
  * @rqst: buffer containing RPC reply data
index b289e106540bf9d85d0d1e1e476786af21b6e1ae..6326ebe8b5951a95c3e488bee905f6fa6f00a62b 100644 (file)
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES   (64)
 
+static struct workqueue_struct *fmr_recovery_wq;
+
+#define FMR_RECOVERY_WQ_FLAGS          (WQ_UNBOUND)
+
+/* Allocate the workqueue on which deferred FMR recovery runs.
+ *
+ * The flags come from FMR_RECOVERY_WQ_FLAGS so that the definition
+ * above is the single place to adjust them.
+ *
+ * Returns zero on success, or -ENOMEM if the workqueue could not
+ * be allocated.
+ */
+int
+fmr_alloc_recovery_wq(void)
+{
+       fmr_recovery_wq = alloc_workqueue("fmr_recovery",
+                                         FMR_RECOVERY_WQ_FLAGS, 0);
+       if (!fmr_recovery_wq)
+               return -ENOMEM;
+       return 0;
+}
+
+/* Tear down the FMR recovery workqueue, if one was allocated.
+ * The global pointer is cleared before the workqueue is destroyed.
+ */
+void
+fmr_destroy_recovery_wq(void)
+{
+       struct workqueue_struct *doomed = fmr_recovery_wq;
+
+       if (doomed) {
+               fmr_recovery_wq = NULL;
+               destroy_workqueue(doomed);
+       }
+}
+
+/* Unmap the FMR owned by @mw. ib_unmap_fmr() is handed a list, so
+ * the FMR is placed on a one-entry list that lives on the stack.
+ * Returns the status reported by ib_unmap_fmr().
+ */
+static int
+__fmr_unmap(struct rpcrdma_mw *mw)
+{
+       LIST_HEAD(unmap_list);
+       int rc;
+
+       list_add(&mw->fmr.fmr->list, &unmap_list);
+       rc = ib_unmap_fmr(&unmap_list);
+       return rc;
+}
+
+/* Deferred reset of a single FMR: unmap it, then hand the MW back
+ * with rpcrdma_put_mw(). The unmap status is not examined; there's
+ * no recovery if the unmap fails.
+ */
+static void
+__fmr_recovery_worker(struct work_struct *work)
+{
+       struct rpcrdma_mw *mw;
+       struct rpcrdma_xprt *xprt;
+
+       mw = container_of(work, struct rpcrdma_mw, mw_work);
+       xprt = mw->mw_xprt;
+
+       __fmr_unmap(mw);
+       rpcrdma_put_mw(xprt, mw);
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker: (re)initialize @mw's work
+ * item to run __fmr_recovery_worker and queue it on fmr_recovery_wq.
+ */
+static void
+__fmr_queue_recovery(struct rpcrdma_mw *mw)
+{
+       INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
+       queue_work(fmr_recovery_wq, &mw->mw_work);
+}
+
 static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
            struct rpcrdma_create_data_internal *cdata)
 {
+       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+                                                     RPCRDMA_MAX_DATA_SEGS /
+                                                     RPCRDMA_MAX_FMR_SGES));
        return 0;
 }
 
@@ -48,7 +109,7 @@ static size_t
 fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+                    RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 
 static int
@@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
                if (IS_ERR(r->fmr.fmr))
                        goto out_fmr_err;
 
+               r->mw_xprt = r_xprt;
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
        }
@@ -104,15 +166,6 @@ out:
        return rc;
 }
 
-static int
-__fmr_unmap(struct rpcrdma_mw *r)
-{
-       LIST_HEAD(l);
-
-       list_add(&r->fmr.fmr->list, &l);
-       return ib_unmap_fmr(&l);
-}
-
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -183,15 +236,10 @@ static void
 __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct ib_device *device = r_xprt->rx_ia.ri_device;
-       struct rpcrdma_mw *mw = seg->rl_mw;
        int nsegs = seg->mr_nsegs;
 
-       seg->rl_mw = NULL;
-
        while (nsegs--)
                rpcrdma_unmap_one(device, seg++);
-
-       rpcrdma_put_mw(r_xprt, mw);
 }
 
 /* Invalidate all memory regions that were registered for "req".
@@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                seg = &req->rl_segments[i];
 
                __fmr_dma_unmap(r_xprt, seg);
+               rpcrdma_put_mw(r_xprt, seg->rl_mw);
 
                i += seg->mr_nsegs;
                seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
        }
 
        req->rl_nchunks = 0;
 }
 
-/* Use the ib_unmap_fmr() verb to prevent further remote
- * access via RDMA READ or RDMA WRITE.
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * In the asynchronous case, DMA unmapping occurs first here
+ * because the rpcrdma_mr_seg is released immediately after this
+ * call. Its contents won't be available in __fmr_dma_unmap later.
+ * FIXME.
  */
-static int
-fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+static void
+fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                 bool sync)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       int rc, nsegs = seg->mr_nsegs;
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
+       unsigned int i;
 
-       dprintk("RPC:       %s: FMR %p\n", __func__, mw);
+       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
 
-       seg1->rl_mw = NULL;
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia->ri_device, seg++);
-       rc = __fmr_unmap(mw);
-       if (rc)
-               goto out_err;
-       rpcrdma_put_mw(r_xprt, mw);
-       return nsegs;
+               if (sync) {
+                       /* ORDER */
+                       __fmr_unmap(mw);
+                       __fmr_dma_unmap(r_xprt, seg);
+                       rpcrdma_put_mw(r_xprt, mw);
+               } else {
+                       __fmr_dma_unmap(r_xprt, seg);
+                       __fmr_queue_recovery(mw);
+               }
 
-out_err:
-       /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
-        * will attempt to release it when the transport is destroyed.
-        */
-       dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
-       return nsegs;
+               i += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
+       }
 }
 
 static void
@@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
        .ro_unmap_sync                  = fmr_op_unmap_sync,
-       .ro_unmap                       = fmr_op_unmap,
+       .ro_unmap_safe                  = fmr_op_unmap_safe,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init                        = fmr_op_init,
index 94c3fa910b85e5e4a53109a2121dd9be51067c49..c0947544babeb976eea94ae4698ce955bd1a3c37 100644 (file)
@@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void)
        destroy_workqueue(wq);
 }
 
+/* Give FRMR @r a fresh rkey by replacing its MR.
+ *
+ * The current MR is deregistered and, only if that works, a new one
+ * is allocated from @ia's PD (ri_pd). Either failure is logged and
+ * its error code returned, leaving fr_state untouched. On success
+ * fr_state becomes FRMR_IS_INVALID and zero is returned.
+ */
+static int
+__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
+{
+       struct rpcrdma_frmr *frmr = &r->frmr;
+       int status = ib_dereg_mr(frmr->fr_mr);
+
+       if (status) {
+               pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n",
+                       status, r);
+               return status;
+       }
+
+       frmr->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG,
+                                 ia->ri_max_frmr_depth);
+       if (!IS_ERR(frmr->fr_mr)) {
+               dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+               frmr->fr_state = FRMR_IS_INVALID;
+               return 0;
+       }
+
+       pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n",
+               PTR_ERR(frmr->fr_mr), r);
+       return PTR_ERR(frmr->fr_mr);
+}
+
+/* Reset the MR behind @mw, then DMA-unmap its scatterlist. The
+ * unmap is done whether or not the reset worked; the MW is handed
+ * back with rpcrdma_put_mw() only when the reset succeeded.
+ */
+static void
+__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
+{
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_frmr *frmr = &mw->frmr;
+       int reset_status = __frwr_reset_mr(ia, mw);
+
+       ib_dma_unmap_sg(ia->ri_device, frmr->fr_sg, frmr->fr_nents,
+                       frmr->fr_dir);
+       if (!reset_status)
+               rpcrdma_put_mw(r_xprt, mw);
+}
+
 /* Deferred reset of a single FRMR. Generate a fresh rkey by
  * replacing the MR.
  *
@@ -109,26 +150,10 @@ static void
 __frwr_recovery_worker(struct work_struct *work)
 {
        struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
-                                           frmr.fr_work);
-       struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
-       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-
-       if (ib_dereg_mr(r->frmr.fr_mr))
-               goto out_fail;
+                                           mw_work);
 
-       r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
-       if (IS_ERR(r->frmr.fr_mr))
-               goto out_fail;
-
-       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
-       r->frmr.fr_state = FRMR_IS_INVALID;
-       rpcrdma_put_mw(r_xprt, r);
+       __frwr_reset_and_unmap(r->mw_xprt, r);
        return;
-
-out_fail:
-       pr_warn("RPC:       %s: FRMR %p unrecovered\n",
-               __func__, r);
 }
 
 /* A broken MR was discovered in a context that can't sleep.
@@ -137,8 +162,8 @@ out_fail:
 static void
 __frwr_queue_recovery(struct rpcrdma_mw *r)
 {
-       INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
-       queue_work(frwr_recovery_wq, &r->frmr.fr_work);
+       INIT_WORK(&r->mw_work, __frwr_recovery_worker);
+       queue_work(frwr_recovery_wq, &r->mw_work);
 }
 
 static int
@@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
        if (IS_ERR(f->fr_mr))
                goto out_mr_err;
 
-       f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
-       if (!f->sg)
+       f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
+       if (!f->fr_sg)
                goto out_list_err;
 
-       sg_init_table(f->sg, depth);
+       sg_init_table(f->fr_sg, depth);
 
        init_completion(&f->fr_linv_done);
 
@@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r)
        if (rc)
                dprintk("RPC:       %s: ib_dereg_mr status %i\n",
                        __func__, rc);
-       kfree(r->frmr.sg);
+       kfree(r->frmr.fr_sg);
 }
 
 static int
@@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                                               depth;
        }
 
+       rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
+                                                     RPCRDMA_MAX_DATA_SEGS /
+                                                     ia->ri_max_frmr_depth));
        return 0;
 }
 
@@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+                    RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth);
 }
 
 static void
@@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
                        return rc;
                }
 
+               r->mw_xprt = r_xprt;
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
-               r->frmr.fr_xprt = r_xprt;
        }
 
        return 0;
@@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
        for (i = 0; i < nsegs;) {
                if (seg->mr_page)
-                       sg_set_page(&frmr->sg[i],
+                       sg_set_page(&frmr->fr_sg[i],
                                    seg->mr_page,
                                    seg->mr_len,
                                    offset_in_page(seg->mr_offset));
                else
-                       sg_set_buf(&frmr->sg[i], seg->mr_offset,
+                       sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
                                   seg->mr_len);
 
                ++seg;
@@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
-       frmr->sg_nents = i;
+       frmr->fr_nents = i;
+       frmr->fr_dir = direction;
 
-       dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+       dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
        if (!dma_nents) {
                pr_err("RPC:       %s: failed to dma map sg %p sg_nents %u\n",
-                      __func__, frmr->sg, frmr->sg_nents);
+                      __func__, frmr->fr_sg, frmr->fr_nents);
                return -ENOMEM;
        }
 
-       n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE);
-       if (unlikely(n != frmr->sg_nents)) {
+       n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
+       if (unlikely(n != frmr->fr_nents)) {
                pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
-                      __func__, frmr->fr_mr, n, frmr->sg_nents);
+                      __func__, frmr->fr_mr, n, frmr->fr_nents);
                rc = n < 0 ? n : -EINVAL;
                goto out_senderr;
        }
 
        dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-               __func__, mw, frmr->sg_nents, mr->length);
+               __func__, mw, frmr->fr_nents, mr->length);
 
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
@@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (rc)
                goto out_senderr;
 
-       seg1->mr_dir = direction;
        seg1->rl_mw = mw;
        seg1->mr_rkey = mr->rkey;
        seg1->mr_base = mr->iova;
-       seg1->mr_nsegs = frmr->sg_nents;
+       seg1->mr_nsegs = frmr->fr_nents;
        seg1->mr_len = mr->length;
 
-       return frmr->sg_nents;
+       return frmr->fr_nents;
 
 out_senderr:
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
        __frwr_queue_recovery(mw);
        return rc;
 }
@@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
        return invalidate_wr;
 }
 
-static void
-__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-                int rc)
-{
-       struct ib_device *device = r_xprt->rx_ia.ri_device;
-       struct rpcrdma_mw *mw = seg->rl_mw;
-       struct rpcrdma_frmr *f = &mw->frmr;
-
-       seg->rl_mw = NULL;
-
-       ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
-
-       if (!rc)
-               rpcrdma_put_mw(r_xprt, mw);
-       else
-               __frwr_queue_recovery(mw);
-}
-
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
@@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        struct rpcrdma_mr_seg *seg;
        unsigned int i, nchunks;
        struct rpcrdma_frmr *f;
+       struct rpcrdma_mw *mw;
        int rc;
 
        dprintk("RPC:       %s: req %p\n", __func__, req);
@@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * unless ri_id->qp is a valid pointer.
         */
        rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
-       if (rc) {
-               pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
-               rdma_disconnect(ia->ri_id);
-               goto unmap;
-       }
+       if (rc)
+               goto reset_mrs;
 
        wait_for_completion(&f->fr_linv_done);
 
@@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 unmap:
        for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
                seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
+               seg->rl_mw = NULL;
 
-               __frwr_dma_unmap(r_xprt, seg, rc);
+               ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
+                               f->fr_dir);
+               rpcrdma_put_mw(r_xprt, mw);
 
                i += seg->mr_nsegs;
                seg->mr_nsegs = 0;
        }
 
        req->rl_nchunks = 0;
-}
+       return;
 
-/* Post a LOCAL_INV Work Request to prevent further remote access
- * via RDMA READ or RDMA WRITE.
- */
-static int
-frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-       struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       struct rpcrdma_frmr *frmr = &mw->frmr;
-       struct ib_send_wr *invalidate_wr, *bad_wr;
-       int rc, nsegs = seg->mr_nsegs;
+reset_mrs:
+       pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
 
-       dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
+       /* Find and reset the MRs in the LOCAL_INV WRs that did not
+        * get posted. This is synchronous, and slow.
+        */
+       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
+               f = &mw->frmr;
 
-       seg1->rl_mw = NULL;
-       frmr->fr_state = FRMR_IS_INVALID;
-       invalidate_wr = &mw->frmr.fr_invwr;
+               if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
+                       __frwr_reset_mr(ia, mw);
+                       bad_wr = bad_wr->next;
+               }
 
-       memset(invalidate_wr, 0, sizeof(*invalidate_wr));
-       frmr->fr_cqe.done = frwr_wc_localinv;
-       invalidate_wr->wr_cqe = &frmr->fr_cqe;
-       invalidate_wr->opcode = IB_WR_LOCAL_INV;
-       invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
+               i += seg->mr_nsegs;
+       }
+       goto unmap;
+}
 
-       ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
-       read_lock(&ia->ri_qplock);
-       rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
-       read_unlock(&ia->ri_qplock);
-       if (rc)
-               goto out_err;
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ */
+static void
+frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                  bool sync)
+{
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
+       unsigned int i;
 
-       rpcrdma_put_mw(r_xprt, mw);
-       return nsegs;
+       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
+               seg = &req->rl_segments[i];
+               mw = seg->rl_mw;
 
-out_err:
-       dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       __frwr_queue_recovery(mw);
-       return nsegs;
+               if (sync)
+                       __frwr_reset_and_unmap(r_xprt, mw);
+               else
+                       __frwr_queue_recovery(mw);
+
+               i += seg->mr_nsegs;
+               seg->mr_nsegs = 0;
+               seg->rl_mw = NULL;
+       }
 }
 
 static void
@@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_map                         = frwr_op_map,
        .ro_unmap_sync                  = frwr_op_unmap_sync,
-       .ro_unmap                       = frwr_op_unmap,
+       .ro_unmap_safe                  = frwr_op_unmap_safe,
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
        .ro_init                        = frwr_op_init,
index 481b9b6f4a150e9ed9d76f7cc4b187a59cc43b91..3750596cc432038ca524604e43479eb0af2012c5 100644 (file)
@@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                       __func__, PTR_ERR(mr));
                return -ENOMEM;
        }
-
        ia->ri_dma_mr = mr;
+
+       rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
+                                                     RPCRDMA_MAX_DATA_SEGS,
+                                                     RPCRDMA_MAX_HDR_SEGS));
        return 0;
 }
 
@@ -47,7 +50,7 @@ static size_t
 physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
 {
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    rpcrdma_max_segments(r_xprt));
+                    RPCRDMA_MAX_HDR_SEGS);
 }
 
 static int
@@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        return 1;
 }
 
-/* Unmap a memory region, but leave it registered.
- */
-static int
-physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-       rpcrdma_unmap_one(ia->ri_device, seg);
-       return 1;
-}
-
 /* DMA unmap all memory regions that were mapped for "req".
  */
 static void
@@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                rpcrdma_unmap_one(device, &req->rl_segments[i++]);
 }
 
+/* Use a slow, safe mechanism to invalidate all memory regions
+ * that were registered for "req".
+ *
+ * For physical memory registration, there is no good way to
+ * fence a single MR that has been advertised to the server. The
+ * client has already handed the server an R_key that cannot be
+ * invalidated and is shared by all MRs on this connection.
+ * Tearing down the PD might be the only safe choice, but it's
+ * not clear that a freshly acquired DMA R_key would be different
+ * than the one used by the PD that was just destroyed.
+ * FIXME.
+ *
+ * As written, this simply delegates to physical_op_unmap_sync();
+ * the "sync" argument is not consulted.
+ */
+static void
+physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                      bool sync)
+{
+       physical_op_unmap_sync(r_xprt, req);
+}
+
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
        .ro_map                         = physical_op_map,
        .ro_unmap_sync                  = physical_op_unmap_sync,
-       .ro_unmap                       = physical_op_unmap,
+       .ro_unmap_safe                  = physical_op_unmap_safe,
        .ro_open                        = physical_op_open,
        .ro_maxpages                    = physical_op_maxpages,
        .ro_init                        = physical_op_init,
index 888823bb6dae40d3c3143823110237ce69741a7a..35a81096e83d50bd501726ed1d9376a5e4bcf54d 100644 (file)
@@ -61,26 +61,84 @@ enum rpcrdma_chunktype {
        rpcrdma_replych
 };
 
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
-       "pure inline",  /* no chunks */
-       " read chunk",  /* some argument via rdma read */
-       "*read chunk",  /* entire request via rdma read */
-       "write chunk",  /* some result via rdma write */
+       "inline",       /* no chunks */
+       "read list",    /* some argument via rdma read */
+       "*read list",   /* entire request via rdma read */
+       "write list",   /* some result via rdma write */
        "reply chunk"   /* entire reply via rdma write */
 };
-#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk, on top of the fixed header fields.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+       unsigned int size;
+
+       /* Fixed header fields and list discriminators */
+       size = RPCRDMA_HDRLEN_MIN;
+
+       /* Maximum Read list size (accumulate; keep the fixed part) */
+       maxsegs += 2;   /* segment for head and tail buffers */
+       size += maxsegs * sizeof(struct rpcrdma_read_chunk);
+
+       /* Minimal Reply chunk size */
+       size += sizeof(__be32); /* segment count */
+       size += sizeof(struct rpcrdma_segment);
+       size += sizeof(__be32); /* list discriminator */
+
+       dprintk("RPC:       %s: max call header size = %u\n",
+               __func__, size);
+       return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message.  The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+       unsigned int size;
+
+       /* Fixed header fields and list discriminators */
+       size = RPCRDMA_HDRLEN_MIN;
+
+       /* Maximum Write list size (accumulate; keep the fixed part) */
+       maxsegs += 2;   /* segment for head and tail buffers */
+       size += sizeof(__be32);         /* segment count */
+       size += maxsegs * sizeof(struct rpcrdma_segment);
+       size += sizeof(__be32); /* list discriminator */
+
+       dprintk("RPC:       %s: max reply header size = %u\n",
+               __func__, size);
+       return size;
+}
+
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
+                                 struct rpcrdma_create_data_internal *cdata,
+                                 unsigned int maxsegs)
+{
+       ia->ri_max_inline_write = cdata->inline_wsize -
+                                 rpcrdma_max_call_header_size(maxsegs);
+       ia->ri_max_inline_read = cdata->inline_rsize -
+                                rpcrdma_max_reply_header_size(maxsegs);
+}
 
 /* The client can send a request inline as long as the RPCRDMA header
  * plus the RPC call fit under the transport's inline limit. If the
  * combined call message size exceeds that limit, the client must use
  * the read chunk list for this operation.
  */
-static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+                               struct rpc_rqst *rqst)
 {
-       unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+       return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
 }
 
 /* The client can't know how large the actual reply will be. Thus it
@@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
  * limit, the client must provide a write list or a reply chunk for
  * this request.
  */
-static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+                                  struct rpc_rqst *rqst)
 {
-       unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+       return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
 static int
@@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        return n;
 }
 
-/*
- * Create read/write chunk lists, and reply chunks, for RDMA
- *
- *   Assume check against THRESHOLD has been done, and chunks are required.
- *   Assume only encoding one list entry for read|write chunks. The NFSv3
- *     protocol is simple enough to allow this as it only has a single "bulk
- *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
- *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
- *
- * When used for a single reply chunk (which is a special write
- * chunk used for the entire reply, rather than just the data), it
- * is used primarily for READDIR and READLINK which would otherwise
- * be severely size-limited by a small rdma inline read max. The server
- * response will come back as an RDMA Write, followed by a message
- * of type RDMA_NOMSG carrying the xid and length. As a result, reply
- * chunks do not provide data alignment, however they do not require
- * "fixup" (moving the response to the upper layer buffer) either.
+static inline __be32 *
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+{
+       *iptr++ = cpu_to_be32(seg->mr_rkey);
+       *iptr++ = cpu_to_be32(seg->mr_len);
+       return xdr_encode_hyper(iptr, seg->mr_base);
+}
+
+/* XDR-encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
  *
  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
  *
@@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
  *   N elements, position P (same P for all chunks of same arg!):
  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Read list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+                        struct rpcrdma_req *req, struct rpc_rqst *rqst,
+                        __be32 *iptr, enum rpcrdma_chunktype rtype)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       unsigned int pos;
+       int n, nsegs;
+
+       if (rtype == rpcrdma_noch) {
+               *iptr++ = xdr_zero;     /* item not present */
+               return iptr;
+       }
+
+       pos = rqst->rq_snd_buf.head[0].iov_len;
+       if (rtype == rpcrdma_areadch)
+               pos = 0;
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       if (nsegs < 0)
+               return ERR_PTR(nsegs);
+
+       do {
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
+               if (n <= 0)
+                       return ERR_PTR(n);
+
+               *iptr++ = xdr_one;      /* item present */
+
+               /* All read segments in this chunk
+                * have the same "position".
+                */
+               *iptr++ = cpu_to_be32(pos);
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: read segment pos %u "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__, pos,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.read_chunk_count++;
+               req->rl_nchunks++;
+               seg += n;
+               nsegs -= n;
+       } while (nsegs);
+       req->rl_nextseg = seg;
+
+       /* Finish Read list */
+       *iptr++ = xdr_zero;     /* Next item not present */
+       return iptr;
+}
+
+/* XDR-encode the Write list. Supports encoding a list containing
+ * one array of plain segments that belong to a single write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Write chunklist (a list of (one) counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO - 0
  *
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Write list, or an error pointer.
+ */
+static __be32 *
+rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+                         struct rpc_rqst *rqst, __be32 *iptr,
+                         enum rpcrdma_chunktype wtype)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       int n, nsegs, nchunks;
+       __be32 *segcount;
+
+       if (wtype != rpcrdma_writech) {
+               *iptr++ = xdr_zero;     /* no Write list present */
+               return iptr;
+       }
+
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+                                    rqst->rq_rcv_buf.head[0].iov_len,
+                                    wtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       if (nsegs < 0)
+               return ERR_PTR(nsegs);
+
+       *iptr++ = xdr_one;      /* Write list present */
+       segcount = iptr++;      /* save location of segment count */
+
+       nchunks = 0;
+       do {
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
+               if (n <= 0)
+                       return ERR_PTR(n);
+
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: write segment "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.write_chunk_count++;
+               r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+               req->rl_nchunks++;
+               nchunks++;
+               seg   += n;
+               nsegs -= n;
+       } while (nsegs);
+       req->rl_nextseg = seg;
+
+       /* Update count of segments in this Write chunk */
+       *segcount = cpu_to_be32(nchunks);
+
+       /* Finish Write list */
+       *iptr++ = xdr_zero;     /* Next item not present */
+       return iptr;
+}
+
+/* XDR-encode the Reply chunk. Supports encoding an array of plain
+ * segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
  *  Reply chunk (a counted array):
  *   N elements:
  *    1 - N - HLOO - HLOO - ... - HLOO
  *
- * Returns positive RPC/RDMA header size, or negative errno.
+ * Returns a pointer to the XDR word in the RDMA header following
+ * the end of the Reply chunk, or an error pointer.
  */
-
-static ssize_t
-rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
-               struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+static __be32 *
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+                          struct rpcrdma_req *req, struct rpc_rqst *rqst,
+                          __be32 *iptr, enum rpcrdma_chunktype wtype)
 {
-       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-       int n, nsegs, nchunks = 0;
-       unsigned int pos;
-       struct rpcrdma_mr_seg *seg = req->rl_segments;
-       struct rpcrdma_read_chunk *cur_rchunk = NULL;
-       struct rpcrdma_write_array *warray = NULL;
-       struct rpcrdma_write_chunk *cur_wchunk = NULL;
-       __be32 *iptr = headerp->rm_body.rm_chunks;
-       int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
-
-       if (type == rpcrdma_readch || type == rpcrdma_areadch) {
-               /* a read chunk - server will RDMA Read our memory */
-               cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
-       } else {
-               /* a write or reply chunk - server will RDMA Write our memory */
-               *iptr++ = xdr_zero;     /* encode a NULL read chunk list */
-               if (type == rpcrdma_replych)
-                       *iptr++ = xdr_zero;     /* a NULL write chunk list */
-               warray = (struct rpcrdma_write_array *) iptr;
-               cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
-       }
+       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       int n, nsegs, nchunks;
+       __be32 *segcount;
 
-       if (type == rpcrdma_replych || type == rpcrdma_areadch)
-               pos = 0;
-       else
-               pos = target->head[0].iov_len;
+       if (wtype != rpcrdma_replych) {
+               *iptr++ = xdr_zero;     /* no Reply chunk present */
+               return iptr;
+       }
 
-       nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
        if (nsegs < 0)
-               return nsegs;
+               return ERR_PTR(nsegs);
 
-       map = r_xprt->rx_ia.ri_ops->ro_map;
+       *iptr++ = xdr_one;      /* Reply chunk present */
+       segcount = iptr++;      /* save location of segment count */
+
+       nchunks = 0;
        do {
-               n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
                if (n <= 0)
-                       goto out;
-               if (cur_rchunk) {       /* read */
-                       cur_rchunk->rc_discrim = xdr_one;
-                       /* all read chunks have the same "position" */
-                       cur_rchunk->rc_position = cpu_to_be32(pos);
-                       cur_rchunk->rc_target.rs_handle =
-                                               cpu_to_be32(seg->mr_rkey);
-                       cur_rchunk->rc_target.rs_length =
-                                               cpu_to_be32(seg->mr_len);
-                       xdr_encode_hyper(
-                                       (__be32 *)&cur_rchunk->rc_target.rs_offset,
-                                       seg->mr_base);
-                       dprintk("RPC:       %s: read chunk "
-                               "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
-                               seg->mr_len, (unsigned long long)seg->mr_base,
-                               seg->mr_rkey, pos, n < nsegs ? "more" : "last");
-                       cur_rchunk++;
-                       r_xprt->rx_stats.read_chunk_count++;
-               } else {                /* write/reply */
-                       cur_wchunk->wc_target.rs_handle =
-                                               cpu_to_be32(seg->mr_rkey);
-                       cur_wchunk->wc_target.rs_length =
-                                               cpu_to_be32(seg->mr_len);
-                       xdr_encode_hyper(
-                                       (__be32 *)&cur_wchunk->wc_target.rs_offset,
-                                       seg->mr_base);
-                       dprintk("RPC:       %s: %s chunk "
-                               "elem %d@0x%llx:0x%x (%s)\n", __func__,
-                               (type == rpcrdma_replych) ? "reply" : "write",
-                               seg->mr_len, (unsigned long long)seg->mr_base,
-                               seg->mr_rkey, n < nsegs ? "more" : "last");
-                       cur_wchunk++;
-                       if (type == rpcrdma_replych)
-                               r_xprt->rx_stats.reply_chunk_count++;
-                       else
-                               r_xprt->rx_stats.write_chunk_count++;
-                       r_xprt->rx_stats.total_rdma_request += seg->mr_len;
-               }
+                       return ERR_PTR(n);
+
+               iptr = xdr_encode_rdma_segment(iptr, seg);
+
+               dprintk("RPC: %5u %s: reply segment "
+                       "%d@0x%016llx:0x%08x (%s)\n",
+                       rqst->rq_task->tk_pid, __func__,
+                       seg->mr_len, (unsigned long long)seg->mr_base,
+                       seg->mr_rkey, n < nsegs ? "more" : "last");
+
+               r_xprt->rx_stats.reply_chunk_count++;
+               r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+               req->rl_nchunks++;
                nchunks++;
                seg   += n;
                nsegs -= n;
        } while (nsegs);
+       req->rl_nextseg = seg;
 
-       /* success. all failures return above */
-       req->rl_nchunks = nchunks;
-
-       /*
-        * finish off header. If write, marshal discrim and nchunks.
-        */
-       if (cur_rchunk) {
-               iptr = (__be32 *) cur_rchunk;
-               *iptr++ = xdr_zero;     /* finish the read chunk list */
-               *iptr++ = xdr_zero;     /* encode a NULL write chunk list */
-               *iptr++ = xdr_zero;     /* encode a NULL reply chunk */
-       } else {
-               warray->wc_discrim = xdr_one;
-               warray->wc_nchunks = cpu_to_be32(nchunks);
-               iptr = (__be32 *) cur_wchunk;
-               if (type == rpcrdma_writech) {
-                       *iptr++ = xdr_zero; /* finish the write chunk list */
-                       *iptr++ = xdr_zero; /* encode a NULL reply chunk */
-               }
-       }
-
-       /*
-        * Return header size.
-        */
-       return (unsigned char *)iptr - (unsigned char *)headerp;
+       /* Update count of segments in the Reply chunk */
+       *segcount = cpu_to_be32(nchunks);
 
-out:
-       for (pos = 0; nchunks--;)
-               pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-                                                     &req->rl_segments[pos]);
-       return n;
+       return iptr;
 }
 
 /*
@@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Uses multiple RDMA IOVs for a request:
- *  [0] -- RPC RDMA header, which uses memory from the *start* of the
- *         preregistered buffer that already holds the RPC data in
- *         its middle.
- *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
- *  [2] -- optional padding.
- *  [3] -- if padded, header only in [1] and data here.
+ * Prepares up to two IOVs per Call message:
+ *
+ *  [0] -- RPC RDMA header
+ *  [1] -- the RPC header/data
  *
  * Returns zero on success, otherwise a negative errno.
  */
@@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        struct rpc_xprt *xprt = rqst->rq_xprt;
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-       char *base;
-       size_t rpclen;
-       ssize_t hdrlen;
        enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
+       ssize_t hdrlen;
+       size_t rpclen;
+       __be32 *iptr;
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
                return rpcrdma_bc_marshal_reply(rqst);
 #endif
 
-       /*
-        * rpclen gets amount of data in first buffer, which is the
-        * pre-registered buffer.
-        */
-       base = rqst->rq_svec[0].iov_base;
-       rpclen = rqst->rq_svec[0].iov_len;
-
        headerp = rdmab_to_msg(req->rl_rdmabuf);
        /* don't byte-swap XID, it's already done in request */
        headerp->rm_xid = rqst->rq_xid;
@@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        /*
         * Chunks needed for results?
         *
-        * o Read ops return data as write chunk(s), header as inline.
         * o If the expected result is under the inline threshold, all ops
         *   return as inline.
+        * o Large read ops return data as write chunk(s), header as
+        *   inline.
         * o Large non-read ops return as a single reply chunk.
         */
-       if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-               wtype = rpcrdma_writech;
-       else if (rpcrdma_results_inline(rqst))
+       if (rpcrdma_results_inline(r_xprt, rqst))
                wtype = rpcrdma_noch;
+       else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+               wtype = rpcrdma_writech;
        else
                wtype = rpcrdma_replych;
 
@@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * that both has a data payload, and whose non-data arguments
         * by themselves are larger than the inline threshold.
         */
-       if (rpcrdma_args_inline(rqst)) {
+       if (rpcrdma_args_inline(r_xprt, rqst)) {
                rtype = rpcrdma_noch;
+               rpcrdma_inline_pullup(rqst);
+               rpclen = rqst->rq_svec[0].iov_len;
        } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
                rtype = rpcrdma_readch;
+               rpclen = rqst->rq_svec[0].iov_len;
+               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
        } else {
                r_xprt->rx_stats.nomsg_call_count++;
                headerp->rm_type = htonl(RDMA_NOMSG);
@@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                rpclen = 0;
        }
 
-       /* The following simplification is not true forever */
-       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
-               wtype = rpcrdma_noch;
-       if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
-               dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
-                       __func__);
-               return -EIO;
-       }
-
-       hdrlen = RPCRDMA_HDRLEN_MIN;
-
-       /*
-        * Pull up any extra send data into the preregistered buffer.
-        * When padding is in use and applies to the transfer, insert
-        * it and change the message type.
+       /* This implementation supports the following combinations
+        * of chunk lists in one RPC-over-RDMA Call message:
+        *
+        *   - Read list
+        *   - Write list
+        *   - Reply chunk
+        *   - Read list + Reply chunk
+        *
+        * It might not yet support the following combinations:
+        *
+        *   - Read list + Write list
+        *
+        * It does not support the following combinations:
+        *
+        *   - Write list + Reply chunk
+        *   - Read list + Write list + Reply chunk
+        *
+        * This implementation supports only a single chunk in each
+        * Read or Write list. Thus for example the client cannot
+        * send a Call message with a Position Zero Read chunk and a
+        * regular Read chunk at the same time.
         */
-       if (rtype == rpcrdma_noch) {
-
-               rpcrdma_inline_pullup(rqst);
-
-               headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-               headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-               headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-               /* new length after pullup */
-               rpclen = rqst->rq_svec[0].iov_len;
-       } else if (rtype == rpcrdma_readch)
-               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
-       if (rtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-                                              headerp, rtype);
-               wtype = rtype;  /* simplify dprintk */
-
-       } else if (wtype != rpcrdma_noch) {
-               hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-                                              headerp, wtype);
-       }
-       if (hdrlen < 0)
-               return hdrlen;
+       req->rl_nchunks = 0;
+       req->rl_nextseg = req->rl_segments;
+       iptr = headerp->rm_body.rm_chunks;
+       iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
+       if (IS_ERR(iptr))
+               goto out_unmap;
+       hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
+
+       if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+               goto out_overflow;
+
+       dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+               rqst->rq_task->tk_pid, __func__,
+               transfertypes[rtype], transfertypes[wtype],
+               hdrlen, rpclen);
 
-       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
-               " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[wtype], hdrlen, rpclen,
-               headerp, base, rdmab_lkey(req->rl_rdmabuf));
-
-       /*
-        * initialize send_iov's - normally only two: rdma chunk header and
-        * single preregistered RPC header buffer, but if padding is present,
-        * then use a preregistered (and zeroed) pad buffer between the RPC
-        * header and any write data. In all non-rdma cases, any following
-        * data has been copied into the RPC header buffer.
-        */
        req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
        req->rl_send_iov[0].length = hdrlen;
        req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
@@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 
        req->rl_niovs = 2;
        return 0;
+
+out_overflow:
+       pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
+               hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
+       /* Terminate this RPC. Chunks registered above will be
+        * released by xprt_release -> xprt_rdma_free .
+        */
+       return -EIO;
+
+out_unmap:
+       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+       return PTR_ERR(iptr);
 }
 
 /*
index 765bca47c74d7e9f28ad1be961ce86dacc30dd81..0ba9887f3e22bab9a1e3e809df5c4e2c23a510fe 100644 (file)
@@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
        return (__be32 *)&ary->wc_array[nchunks];
 }
 
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
+/**
+ * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header
+ * @rq_arg: Receive buffer
+ *
+ * On entry, xdr->head[0].iov_base points to first byte in the
+ * RPC-over-RDMA header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ * The length of the RPC-over-RDMA header is returned.
+ */
+int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
 {
+       struct rpcrdma_msg *rmsgp;
        __be32 *va, *vaend;
        unsigned int len;
        u32 hdr_len;
 
        /* Verify that there's enough bytes for header + something */
-       if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
+       if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
                dprintk("svcrdma: header too short = %d\n",
-                       rqstp->rq_arg.len);
+                       rq_arg->len);
                return -EINVAL;
        }
 
+       rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
        if (rmsgp->rm_vers != rpcrdma_version) {
                dprintk("%s: bad version %u\n", __func__,
                        be32_to_cpu(rmsgp->rm_vers));
@@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
                        be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
 
                va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-               rqstp->rq_arg.head[0].iov_base = va;
+               rq_arg->head[0].iov_base = va;
                len = (u32)((unsigned long)va - (unsigned long)rmsgp);
-               rqstp->rq_arg.head[0].iov_len -= len;
-               if (len > rqstp->rq_arg.len)
+               rq_arg->head[0].iov_len -= len;
+               if (len > rq_arg->len)
                        return -EINVAL;
                return len;
        default:
@@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
         * chunk list and a reply chunk list.
         */
        va = &rmsgp->rm_body.rm_chunks[0];
-       vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+       vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
        va = decode_read_list(va, vaend);
        if (!va) {
                dprintk("svcrdma: failed to decode read list\n");
@@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
                return -EINVAL;
        }
 
-       rqstp->rq_arg.head[0].iov_base = va;
+       rq_arg->head[0].iov_base = va;
        hdr_len = (unsigned long)va - (unsigned long)rmsgp;
-       rqstp->rq_arg.head[0].iov_len -= hdr_len;
-
+       rq_arg->head[0].iov_len -= hdr_len;
        return hdr_len;
 }
 
index fbe7444e7de6ab05ee1c6e91d09b03eca7ba1f3f..2c25606f25614da9fb181ead55c6c6d7a1546b62 100644 (file)
@@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        head->arg.len = rqstp->rq_arg.len;
        head->arg.buflen = rqstp->rq_arg.buflen;
 
-       ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
-       position = be32_to_cpu(ch->rc_position);
-
        /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+       position = be32_to_cpu(ch->rc_position);
        if (position == 0) {
                head->arg.pages = &head->pages[0];
                page_offset = head->byte_len;
@@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        if (page_offset & 3) {
                u32 pad = 4 - (page_offset & 3);
 
-               head->arg.page_len += pad;
+               head->arg.tail[0].iov_len += pad;
                head->arg.len += pad;
                head->arg.buflen += pad;
                page_offset += pad;
@@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
        return ret;
 }
 
-static int rdma_read_complete(struct svc_rqst *rqstp,
-                             struct svc_rdma_op_ctxt *head)
+static void rdma_read_complete(struct svc_rqst *rqstp,
+                              struct svc_rdma_op_ctxt *head)
 {
        int page_no;
-       int ret;
 
        /* Copy RPC pages */
        for (page_no = 0; page_no < head->count; page_no++) {
@@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
        rqstp->rq_arg.tail[0] = head->arg.tail[0];
        rqstp->rq_arg.len = head->arg.len;
        rqstp->rq_arg.buflen = head->arg.buflen;
-
-       /* Free the context */
-       svc_rdma_put_context(head, 0);
-
-       /* XXX: What should this be? */
-       rqstp->rq_prot = IPPROTO_MAX;
-       svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
-
-       ret = rqstp->rq_arg.head[0].iov_len
-               + rqstp->rq_arg.page_len
-               + rqstp->rq_arg.tail[0].iov_len;
-       dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
-               "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
-               ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
-               rqstp->rq_arg.head[0].iov_len);
-
-       return ret;
 }
 
 /* By convention, backchannel calls arrive via rdma_msg type
@@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                                  dto_q);
                list_del_init(&ctxt->dto_q);
                spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
-               return rdma_read_complete(rqstp, ctxt);
+               rdma_read_complete(rqstp, ctxt);
+               goto complete;
        } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
                ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
                                  struct svc_rdma_op_ctxt,
@@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
        /* Decode the RDMA header. */
        rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-       ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+       ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
        if (ret < 0)
                goto out_err;
        if (ret == 0)
@@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                return 0;
        }
 
+complete:
        ret = rqstp->rq_arg.head[0].iov_len
                + rqstp->rq_arg.page_len
                + rqstp->rq_arg.tail[0].iov_len;
index 4f1b1c4f45f9d11d1ca3df8767ea76e6b07d99ae..54d53330062030b682fcd8f267e7d79448c6a5ef 100644 (file)
@@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma,
                      struct svc_rqst *rqstp,
                      struct page *page,
                      struct rpcrdma_msg *rdma_resp,
-                     struct svc_rdma_op_ctxt *ctxt,
                      struct svc_rdma_req_map *vec,
                      int byte_count)
 {
+       struct svc_rdma_op_ctxt *ctxt;
        struct ib_send_wr send_wr;
        u32 xdr_off;
        int sge_no;
        int sge_bytes;
        int page_no;
        int pages;
-       int ret;
-
-       ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
-       if (ret) {
-               svc_rdma_put_context(ctxt, 0);
-               return -ENOTCONN;
-       }
+       int ret = -EIO;
 
        /* Prepare the context */
+       ctxt = svc_rdma_get_context(rdma);
+       ctxt->direction = DMA_TO_DEVICE;
        ctxt->pages[0] = page;
        ctxt->count = 1;
 
@@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
  err:
        svc_rdma_unmap_dma(ctxt);
        svc_rdma_put_context(ctxt, 1);
-       pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
-       return -EIO;
+       return ret;
 }
 
 void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
@@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        int ret;
        int inline_bytes;
        struct page *res_page;
-       struct svc_rdma_op_ctxt *ctxt;
        struct svc_rdma_req_map *vec;
 
        dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
 
        /* Build an req vec for the XDR */
-       ctxt = svc_rdma_get_context(rdma);
-       ctxt->direction = DMA_TO_DEVICE;
        vec = svc_rdma_get_req_map(rdma);
        ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
        if (ret)
@@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
                inline_bytes -= ret;
        }
 
-       ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+       /* Post a fresh Receive buffer _before_ sending the reply */
+       ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+       if (ret)
+               goto err1;
+
+       ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
                         inline_bytes);
        if (ret < 0)
                goto err1;
@@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        put_page(res_page);
  err0:
        svc_rdma_put_req_map(rdma, vec);
-       svc_rdma_put_context(ctxt, 0);
+       pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
+              ret);
        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
        return -ENOTCONN;
 }
index 90668969d5596b199c9fad0c188a3d72dccc99e6..dd9440137834c7b1a55b433a1a91bb52efa26acc 100644 (file)
@@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
        int ret;
 
        dprintk("svcrdma: Creating RDMA socket\n");
-       if (sa->sa_family != AF_INET) {
+       if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
                dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
                return ERR_PTR(-EAFNOSUPPORT);
        }
@@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
                goto err0;
        }
 
+       /* Allow both IPv4 and IPv6 sockets to bind a single port
+        * at the same time.
+        */
+#if IS_ENABLED(CONFIG_IPV6)
+       ret = rdma_set_afonly(listen_id, 1);
+       if (ret) {
+               dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+               goto err1;
+       }
+#endif
        ret = rdma_bind_addr(listen_id, sa);
        if (ret) {
                dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
@@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
 
        /* Post receive buffers */
-       for (i = 0; i < newxprt->sc_rq_depth; i++) {
+       for (i = 0; i < newxprt->sc_max_requests; i++) {
                ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
                if (ret) {
                        dprintk("svcrdma: failure posting receive buffers\n");
@@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work)
 
        dprintk("svcrdma: %s(%p)\n", __func__, rdma);
 
+       if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+               ib_drain_qp(rdma->sc_qp);
+
        /* We should only be called from kref_put */
        if (atomic_read(&xprt->xpt_ref.refcount) != 0)
                pr_err("svcrdma: sc_xprt still in use? (%d)\n",
index b1b009f10ea375a3f63148973e58e486df1aa282..99d2e5b72726abd00f1ac5e5732d5fa02119f55a 100644 (file)
@@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
 
 static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
 static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
 static unsigned int zero;
 static unsigned int max_padding = PAGE_SIZE;
 static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
@@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
+               .extra1         = &min_inline_size,
+               .extra2         = &max_inline_size,
        },
        {
                .procname       = "rdma_max_inline_write",
@@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
+               .extra1         = &min_inline_size,
+               .extra2         = &max_inline_size,
        },
        {
                .procname       = "rdma_inline_write_padding",
@@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
 out:
        dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
        req->rl_connect_cookie = 0;     /* our reserved value */
+       req->rl_task = task;
        return req->rl_sendbuf->rg_base;
 
 out_rdmabuf:
@@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer)
        struct rpcrdma_req *req;
        struct rpcrdma_xprt *r_xprt;
        struct rpcrdma_regbuf *rb;
-       int i;
 
        if (buffer == NULL)
                return;
@@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer)
 
        dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-       for (i = 0; req->rl_nchunks;) {
-               --req->rl_nchunks;
-               i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
-                                                   &req->rl_segments[i]);
-       }
+       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
+                                           !RPC_IS_ASYNC(req->rl_task));
 
        rpcrdma_buffer_put(req);
 }
@@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        .bc_setup               = xprt_rdma_bc_setup,
        .bc_up                  = xprt_rdma_bc_up,
+       .bc_maxpayload          = xprt_rdma_bc_maxpayload,
        .bc_free_rqst           = xprt_rdma_bc_free_rqst,
        .bc_destroy             = xprt_rdma_bc_destroy,
 #endif
index f5ed9f982cd71b12606d7f8953a6283447509029..b044d98a1370207422d129689bb43b35766fc76f 100644 (file)
@@ -203,15 +203,6 @@ out_fail:
        goto out_schedule;
 }
 
-static void
-rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
-{
-       struct ib_wc wc;
-
-       while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
-               rpcrdma_receive_wc(NULL, &wc);
-}
-
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -373,23 +364,6 @@ out:
        return ERR_PTR(rc);
 }
 
-/*
- * Drain any cq, prior to teardown.
- */
-static void
-rpcrdma_clean_cq(struct ib_cq *cq)
-{
-       struct ib_wc wc;
-       int count = 0;
-
-       while (1 == ib_poll_cq(cq, 1, &wc))
-               ++count;
-
-       if (count)
-               dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
-                       __func__, count, wc.opcode);
-}
-
 /*
  * Exported functions.
  */
@@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        dprintk("RPC:       %s: memory registration strategy is '%s'\n",
                __func__, ia->ri_ops->ro_displayname);
 
-       rwlock_init(&ia->ri_qplock);
        return 0;
 
 out3:
@@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                        __func__);
                return -ENOMEM;
        }
-       max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+       max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
 
        /* check provider's send/recv wr limits */
        if (cdata->max_requests > max_qp_wr)
@@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.srq = NULL;
        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
        ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+       ep->rep_attr.cap.max_send_wr += 1;      /* drain cqe */
        rc = ia->ri_ops->ro_open(ia, ep, cdata);
        if (rc)
                return rc;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
        ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+       ep->rep_attr.cap.max_recv_wr += 1;      /* drain cqe */
        ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
@@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        ep->rep_attr.recv_cq = recvcq;
 
        /* Initialize cma parameters */
+       memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
        /* RPC/RDMA does not use private data */
        ep->rep_remote_cma.private_data = NULL;
@@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                ep->rep_remote_cma.responder_resources =
                                                ia->ri_device->attrs.max_qp_rd_atom;
 
-       ep->rep_remote_cma.retry_count = 7;
+       /* Limit transport retries so client can detect server
+        * GID changes quickly. RPC layer handles re-establishing
+        * transport connection and retransmission.
+        */
+       ep->rep_remote_cma.retry_count = 6;
+
+       /* RPC-over-RDMA handles its own flow control. In addition,
+        * make all RNR NAKs visible so we know that RPC-over-RDMA
+        * flow control is working correctly (no NAKs should be seen).
+        */
        ep->rep_remote_cma.flow_control = 0;
        ep->rep_remote_cma.rnr_retry_count = 0;
 
@@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 
        cancel_delayed_work_sync(&ep->rep_connect_worker);
 
-       if (ia->ri_id->qp)
-               rpcrdma_ep_disconnect(ep, ia);
-
-       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-       rpcrdma_clean_cq(ep->rep_attr.send_cq);
-
        if (ia->ri_id->qp) {
+               rpcrdma_ep_disconnect(ep, ia);
                rdma_destroy_qp(ia->ri_id);
                ia->ri_id->qp = NULL;
        }
@@ -659,7 +639,6 @@ retry:
                dprintk("RPC:       %s: reconnecting...\n", __func__);
 
                rpcrdma_ep_disconnect(ep, ia);
-               rpcrdma_flush_cqs(ep);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -692,10 +671,8 @@ retry:
                        goto out;
                }
 
-               write_lock(&ia->ri_qplock);
                old = ia->ri_id;
                ia->ri_id = id;
-               write_unlock(&ia->ri_qplock);
 
                rdma_destroy_qp(old);
                rpcrdma_destroy_id(old);
@@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
                ep->rep_connected = rc;
        }
+
+       ib_drain_qp(ia->ri_id->qp);
 }
 
 struct rpcrdma_req *
@@ -1271,25 +1249,3 @@ out_rc:
        rpcrdma_recv_buffer_put(rep);
        return rc;
 }
-
-/* How many chunk list items fit within our inline buffers?
- */
-unsigned int
-rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-       int bytes, segments;
-
-       bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
-       bytes -= RPCRDMA_HDRLEN_MIN;
-       if (bytes < sizeof(struct rpcrdma_segment) * 2) {
-               pr_warn("RPC:       %s: inline threshold too small\n",
-                       __func__);
-               return 0;
-       }
-
-       segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
-       dprintk("RPC:       %s: max chunk list size = %d segments\n",
-               __func__, segments);
-       return segments;
-}
index 2ebc743cb96f4835205550fa210ecef37d55963a..95cdc66225ee1f52542119b7a2e152b888cf46b0 100644 (file)
@@ -65,7 +65,6 @@
  */
 struct rpcrdma_ia {
        const struct rpcrdma_memreg_ops *ri_ops;
-       rwlock_t                ri_qplock;
        struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
@@ -73,6 +72,8 @@ struct rpcrdma_ia {
        struct completion       ri_done;
        int                     ri_async_rc;
        unsigned int            ri_max_frmr_depth;
+       unsigned int            ri_max_inline_write;
+       unsigned int            ri_max_inline_read;
        struct ib_qp_attr       ri_qp_attr;
        struct ib_qp_init_attr  ri_qp_init_attr;
 };
@@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 
 #define RPCRDMA_DEF_GFP                (GFP_NOIO | __GFP_NOWARN)
 
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 8. This prevents less-capable devices and
+ * memory registrations from overrunning the Send buffer
+ * while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 8 read segments means the Read
+ * list (or Write list or Reply chunk) cannot consume more
+ * than
+ *
+ * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ *
+ * And the fixed part of the header is another 24 bytes.
+ *
+ * The smallest inline threshold is 1024 bytes, ensuring that
+ * at least 750 bytes are available for RPC messages.
+ */
+#define RPCRDMA_MAX_HDR_SEGS   (8)
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  */
 
 #define RPCRDMA_MAX_DATA_SEGS  ((1 * 1024 * 1024) / PAGE_SIZE)
-#define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+/* data segments + head/tail for Call + head/tail for Reply */
+#define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 4)
 
 struct rpcrdma_buffer;
 
@@ -198,14 +221,13 @@ enum rpcrdma_frmr_state {
 };
 
 struct rpcrdma_frmr {
-       struct scatterlist              *sg;
-       int                             sg_nents;
+       struct scatterlist              *fr_sg;
+       int                             fr_nents;
+       enum dma_data_direction         fr_dir;
        struct ib_mr                    *fr_mr;
        struct ib_cqe                   fr_cqe;
        enum rpcrdma_frmr_state         fr_state;
        struct completion               fr_linv_done;
-       struct work_struct              fr_work;
-       struct rpcrdma_xprt             *fr_xprt;
        union {
                struct ib_reg_wr        fr_regwr;
                struct ib_send_wr       fr_invwr;
@@ -222,6 +244,8 @@ struct rpcrdma_mw {
                struct rpcrdma_fmr      fmr;
                struct rpcrdma_frmr     frmr;
        };
+       struct work_struct      mw_work;
+       struct rpcrdma_xprt     *mw_xprt;
        struct list_head        mw_list;
        struct list_head        mw_all;
 };
@@ -270,12 +294,14 @@ struct rpcrdma_req {
        unsigned int            rl_niovs;
        unsigned int            rl_nchunks;
        unsigned int            rl_connect_cookie;
+       struct rpc_task         *rl_task;
        struct rpcrdma_buffer   *rl_buffer;
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
        struct ib_sge           rl_send_iov[RPCRDMA_MAX_IOVS];
        struct rpcrdma_regbuf   *rl_rdmabuf;
        struct rpcrdma_regbuf   *rl_sendbuf;
        struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
+       struct rpcrdma_mr_seg   *rl_nextseg;
 
        struct ib_cqe           rl_cqe;
        struct list_head        rl_all;
@@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops {
                                  struct rpcrdma_mr_seg *, int, bool);
        void            (*ro_unmap_sync)(struct rpcrdma_xprt *,
                                         struct rpcrdma_req *);
-       int             (*ro_unmap)(struct rpcrdma_xprt *,
-                                   struct rpcrdma_mr_seg *);
+       void            (*ro_unmap_safe)(struct rpcrdma_xprt *,
+                                        struct rpcrdma_req *, bool);
        int             (*ro_open)(struct rpcrdma_ia *,
                                   struct rpcrdma_ep *,
                                   struct rpcrdma_create_data_internal *);
@@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
                         struct rpcrdma_regbuf *);
 
-unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
 int frwr_alloc_recovery_wq(void);
@@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
 int rpcrdma_marshal_req(struct rpc_rqst *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
+                                 struct rpcrdma_create_data_internal *,
+                                 unsigned int);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
@@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
 int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
 int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
 void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
 int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
index b90c5397b5e137c6cc8accad6eebe2b876363d4e..2d3e0c42361e6190555eb3b0cd9465851c88fd7b 100644 (file)
@@ -1364,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
                return ret;
        return 0;
 }
+
+static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt)
+{
+       return PAGE_SIZE;
+}
 #else
 static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
                                        struct xdr_skb_reader *desc)
@@ -2661,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 #ifdef CONFIG_SUNRPC_BACKCHANNEL
        .bc_setup               = xprt_setup_bc,
        .bc_up                  = xs_tcp_bc_up,
+       .bc_maxpayload          = xs_tcp_bc_maxpayload,
        .bc_free_rqst           = xprt_free_bc_rqst,
        .bc_destroy             = xprt_destroy_bc,
 #endif
index b2ab2a92a3757c8b075f8e2cf05c65c3021886ab..0f82314621f2e826570c4c22040ed74c73174810 100644 (file)
@@ -7,6 +7,7 @@ quote   := "
 squote  := '
 empty   :=
 space   := $(empty) $(empty)
+space_escape := _-_SPACE_-_
 
 ###
 # Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
@@ -226,10 +227,10 @@ objectify = $(foreach o,$(1),$(if $(filter /%,$(o)),$(o),$(obj)/$(o)))
 # See Documentation/kbuild/makefiles.txt for more info
 
 ifneq ($(KBUILD_NOCMDDEP),1)
-# Check if both arguments has same arguments. Result is empty string if equal.
-# User may override this check using make KBUILD_NOCMDDEP=1
-arg-check = $(strip $(filter-out $(cmd_$(1)), $(cmd_$@)) \
-                    $(filter-out $(cmd_$@),   $(cmd_$(1))) )
+# Check if both arguments are the same including their order. Result is empty
+# string if equal. User may override this check using make KBUILD_NOCMDDEP=1
+arg-check = $(filter-out $(subst $(space),$(space_escape),$(strip $(cmd_$@))), \
+                         $(subst $(space),$(space_escape),$(strip $(cmd_$1))))
 else
 arg-check = $(if $(strip $(cmd_$@)),,1)
 endif
@@ -256,10 +257,42 @@ if_changed = $(if $(strip $(any-prereq) $(arg-check)),                       \
 # Execute the command and also postprocess generated .d dependencies file.
 if_changed_dep = $(if $(strip $(any-prereq) $(arg-check) ),                  \
        @set -e;                                                             \
+       $(cmd_and_fixdep), @:)
+
+ifndef CONFIG_TRIM_UNUSED_KSYMS
+
+cmd_and_fixdep =                                                             \
        $(echo-cmd) $(cmd_$(1));                                             \
        scripts/basic/fixdep $(depfile) $@ '$(make-cmd)' > $(dot-target).tmp;\
        rm -f $(depfile);                                                    \
-       mv -f $(dot-target).tmp $(dot-target).cmd, @:)
+       mv -f $(dot-target).tmp $(dot-target).cmd;
+
+else
+
+# Filter out exported kernel symbol names from the preprocessor output.
+# See also __KSYM_DEPS__ in include/linux/export.h.
+# We disable the depfile generation here, so as not to overwrite the existing
+# depfile while fixdep is parsing it.
+flags_nodeps = $(filter-out -Wp$(comma)-M%, $($(1)))
+ksym_dep_filter =                                                            \
+       case "$(1)" in                                                       \
+         cc_*_c|cpp_i_c)                                                    \
+           $(CPP) $(call flags_nodeps,c_flags) -D__KSYM_DEPS__ $< ;;        \
+         as_*_S|cpp_s_S)                                                    \
+           $(CPP) $(call flags_nodeps,a_flags) -D__KSYM_DEPS__ $< ;;        \
+         boot*|build*|*cpp_lds_S|dtc|host*|vdso*) : ;;                      \
+         *) echo "Don't know how to preprocess $(1)" >&2; false ;;          \
+       esac | tr ";" "\n" | sed -rn 's/^.*=== __KSYM_(.*) ===.*$$/KSYM_\1/p'
+
+cmd_and_fixdep =                                                             \
+       $(echo-cmd) $(cmd_$(1));                                             \
+       $(ksym_dep_filter) |                                                 \
+               scripts/basic/fixdep -e $(depfile) $@ '$(make-cmd)'          \
+                       > $(dot-target).tmp;                                 \
+       rm -f $(depfile);                                                    \
+       mv -f $(dot-target).tmp $(dot-target).cmd;
+
+endif
 
 # Usage: $(call if_changed_rule,foo)
 # Will check if $(cmd_foo) or any of the prerequisites changed,
@@ -341,8 +374,6 @@ endif
 #
 ###############################################################################
 #
-space_escape := %%%SPACE%%%
-#
 define config_filename
 ifneq ($$(CONFIG_$(1)),"")
 $(1)_FILENAME := $$(subst \\,\,$$(subst \$$(quote),$$(quote),$$(subst $$(space_escape),\$$(space),$$(patsubst "%",%,$$(subst $$(space),$$(space_escape),$$(CONFIG_$(1)))))))
index e1bc1907090e871eb18fc8cf7fb5ec501eaa560a..0d1ca5bf42fbe4aaa3e82dc611b447195fffd2e1 100644 (file)
@@ -152,11 +152,11 @@ cmd_cc_s_c       = $(CC) $(c_flags) $(DISABLE_LTO) -fverbose-asm -S -o $@ $<
 $(obj)/%.s: $(src)/%.c FORCE
        $(call if_changed_dep,cc_s_c)
 
-quiet_cmd_cc_i_c = CPP $(quiet_modtag) $@
-cmd_cc_i_c       = $(CPP) $(c_flags)   -o $@ $<
+quiet_cmd_cpp_i_c = CPP $(quiet_modtag) $@
+cmd_cpp_i_c       = $(CPP) $(c_flags) -o $@ $<
 
 $(obj)/%.i: $(src)/%.c FORCE
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 cmd_gensymtypes =                                                           \
     $(CPP) -D__GENKSYMS__ $(c_flags) $< |                                   \
@@ -266,26 +266,24 @@ endif # CONFIG_STACK_VALIDATION
 
 define rule_cc_o_c
        $(call echo-cmd,checksrc) $(cmd_checksrc)                         \
-       $(call echo-cmd,cc_o_c) $(cmd_cc_o_c);                            \
+       $(call cmd_and_fixdep,cc_o_c)                                     \
        $(cmd_modversions)                                                \
-       $(cmd_objtool)                                            \
-       $(call echo-cmd,record_mcount)                                    \
-       $(cmd_record_mcount)                                              \
-       scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' >    \
-                                                     $(dot-target).tmp;  \
-       rm -f $(depfile);                                                 \
-       mv -f $(dot-target).tmp $(dot-target).cmd
+       $(cmd_objtool)                                                    \
+       $(call echo-cmd,record_mcount) $(cmd_record_mcount)
 endef
 
 define rule_as_o_S
-       $(call echo-cmd,as_o_S) $(cmd_as_o_S);                            \
-       $(cmd_objtool)                                            \
-       scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,as_o_S)' >    \
-                                                     $(dot-target).tmp;  \
-       rm -f $(depfile);                                                 \
-       mv -f $(dot-target).tmp $(dot-target).cmd
+       $(call cmd_and_fixdep,as_o_S)                                     \
+       $(cmd_objtool)
 endef
 
+# List module undefined symbols (or empty line if not enabled)
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+cmd_undef_syms = $(NM) $@ | sed -n 's/^ \+U //p' | xargs echo
+else
+cmd_undef_syms = echo
+endif
+
 # Built-in and composite module parts
 $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
        $(call cmd,force_checksrc)
@@ -296,7 +294,8 @@ $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
 $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
        $(call cmd,force_checksrc)
        $(call if_changed_rule,cc_o_c)
-       @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod)
+       @{ echo $(@:.o=.ko); echo $@; \
+          $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod)
 
 quiet_cmd_cc_lst_c = MKLST   $@
       cmd_cc_lst_c = $(CC) $(c_flags) -g -c -o $*.o $< && \
@@ -314,11 +313,11 @@ modkern_aflags := $(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL)
 $(real-objs-m)      : modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE)
 $(real-objs-m:.o=.s): modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE)
 
-quiet_cmd_as_s_S = CPP $(quiet_modtag) $@
-cmd_as_s_S       = $(CPP) $(a_flags)   -o $@ $<
+quiet_cmd_cpp_s_S = CPP $(quiet_modtag) $@
+cmd_cpp_s_S       = $(CPP) $(a_flags) -o $@ $<
 
 $(obj)/%.s: $(src)/%.S FORCE
-       $(call if_changed_dep,as_s_S)
+       $(call if_changed_dep,cpp_s_S)
 
 quiet_cmd_as_o_S = AS $(quiet_modtag)  $@
 cmd_as_o_S       = $(CC) $(a_flags) -c -o $@ $<
@@ -426,7 +425,8 @@ $(call multi_depend, $(multi-used-y), .o, -objs -y)
 
 $(multi-used-m): FORCE
        $(call if_changed,link_multi-m)
-       @{ echo $(@:.o=.ko); echo $(link_multi_deps); } > $(MODVERDIR)/$(@F:.o=.mod)
+       @{ echo $(@:.o=.ko); echo $(link_multi_deps); \
+          $(cmd_undef_syms); } > $(MODVERDIR)/$(@F:.o=.mod)
 $(call multi_depend, $(multi-used-m), .o, -objs -y -m)
 
 targets += $(multi-used-y) $(multi-used-m)
index f9e47a70509c9b310ec417b9e1b80a038e606aaa..53449a6ff6aa7de3c5c868f0645f6000b6f551e6 100644 (file)
@@ -24,6 +24,7 @@ warning-1 += $(call cc-option, -Wmissing-prototypes)
 warning-1 += -Wold-style-definition
 warning-1 += $(call cc-option, -Wmissing-include-dirs)
 warning-1 += $(call cc-option, -Wunused-but-set-variable)
+warning-1 += $(call cc-option, -Wunused-const-variable)
 warning-1 += $(call cc-disable-warning, missing-field-initializers)
 warning-1 += $(call cc-disable-warning, sign-compare)
 
index ed1b7c4fb674a92c0c566a47796d013abd1c5275..e7df0f5db7ec273f45b755668815ce96ba1ffe46 100644 (file)
@@ -96,10 +96,10 @@ obj-dirs    := $(addprefix $(obj)/,$(obj-dirs))
 # Note: Files that end up in two or more modules are compiled without the
 #       KBUILD_MODNAME definition. The reason is that any made-up name would
 #       differ in different configs.
-name-fix = $(subst $(comma),_,$(subst -,_,$1))
-basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
+name-fix = $(squote)$(quote)$(subst $(comma),_,$(subst -,_,$1))$(quote)$(squote)
+basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
-                 -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
+                 -DKBUILD_MODNAME=$(call name-fix,$(modname)))
 
 orig_c_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(KBUILD_SUBDIR_CCFLAGS) \
                  $(ccflags-y) $(CFLAGS_$(basetarget).o)
@@ -162,7 +162,7 @@ endif
 
 c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                 $(__c_flags) $(modkern_cflags)                           \
-                -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags)
+                $(basename_flags) $(modname_flags)
 
 a_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                 $(__a_flags) $(modkern_aflags)
diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh
new file mode 100755 (executable)
index 0000000..8dc1918
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/sh
+
+# Script to create/update include/generated/autoksyms.h and dependency files
+#
+# Copyright:   (C) 2016  Linaro Limited
+# Created by:  Nicolas Pitre, January 2016
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+# Create/update the include/generated/autoksyms.h file from the list
+# of all module's needed symbols as recorded on the third line of
+# .tmp_versions/*.mod files.
+#
+# For each symbol being added or removed, the corresponding dependency
+# file's timestamp is updated to force a rebuild of the affected source
+# file. All arguments passed to this script are assumed to be a command
+# to be exec'd to trigger a rebuild of those files.
+
+set -e
+
+cur_ksyms_file="include/generated/autoksyms.h"
+new_ksyms_file="include/generated/autoksyms.h.tmpnew"
+
+info() {
+       if [ "$quiet" != "silent_" ]; then
+               printf "  %-7s %s\n" "$1" "$2"
+       fi
+}
+
+info "CHK" "$cur_ksyms_file"
+
+# Use "make V=1" to debug this script.
+case "$KBUILD_VERBOSE" in
+*1*)
+       set -x
+       ;;
+esac
+
+# We need access to CONFIG_ symbols
+case "${KCONFIG_CONFIG}" in
+*/*)
+       . "${KCONFIG_CONFIG}"
+       ;;
+*)
+       # Force using a file from the current directory
+       . "./${KCONFIG_CONFIG}"
+esac
+
+# In case it doesn't exist yet...
+if [ ! -e "$cur_ksyms_file" ]; then touch "$cur_ksyms_file"; fi
+
+# Generate a new ksym list file with symbols needed by the current
+# set of modules.
+cat > "$new_ksyms_file" << EOT
+/*
+ * Automatically generated file; DO NOT EDIT.
+ */
+
+EOT
+sed -ns -e '3{s/ /\n/g;/^$/!p;}' "$MODVERDIR"/*.mod | sort -u |
+while read sym; do
+       if [ -n "$CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX" ]; then
+               sym="${sym#_}"
+       fi
+       echo "#define __KSYM_${sym} 1"
+done >> "$new_ksyms_file"
+
+# Special case for modversions (see modpost.c)
+if [ -n "$CONFIG_MODVERSIONS" ]; then
+       echo "#define __KSYM_module_layout 1" >> "$new_ksyms_file"
+fi
+
+# Extract changes between old and new list and touch corresponding
+# dependency files.
+changed=$(
+count=0
+sort "$cur_ksyms_file" "$new_ksyms_file" | uniq -u |
+sed -n 's/^#define __KSYM_\(.*\) 1/\1/p' | tr "A-Z_" "a-z/" |
+while read sympath; do
+       if [ -z "$sympath" ]; then continue; fi
+       depfile="include/config/ksym/${sympath}.h"
+       mkdir -p "$(dirname "$depfile")"
+       touch "$depfile"
+       echo $((count += 1))
+done | tail -1 )
+changed=${changed:-0}
+
+if [ $changed -gt 0 ]; then
+       # Replace the old list with the new one
+       old=$(grep -c "^#define __KSYM_" "$cur_ksyms_file" || true)
+       new=$(grep -c "^#define __KSYM_" "$new_ksyms_file" || true)
+       info "KSYMS" "symbols: before=$old, after=$new, changed=$changed"
+       info "UPD" "$cur_ksyms_file"
+       mv -f "$new_ksyms_file" "$cur_ksyms_file"
+       # Then trigger a rebuild of affected source files
+       exec $@
+else
+       rm -f "$new_ksyms_file"
+fi
index caef815d17431c3616ebefb2ac4eef2bde5f1f52..746ec1ece6143fab9eb4a667f521c19fbabdc67c 100644 (file)
 #define INT_NFIG ntohl(0x4e464947)
 #define INT_FIG_ ntohl(0x4649475f)
 
+int insert_extra_deps;
 char *target;
 char *depfile;
 char *cmdline;
 
 static void usage(void)
 {
-       fprintf(stderr, "Usage: fixdep <depfile> <target> <cmdline>\n");
+       fprintf(stderr, "Usage: fixdep [-e] <depfile> <target> <cmdline>\n");
+       fprintf(stderr, " -e  insert extra dependencies given on stdin\n");
        exit(1);
 }
 
@@ -138,6 +140,40 @@ static void print_cmdline(void)
        printf("cmd_%s := %s\n\n", target, cmdline);
 }
 
+/*
+ * Print out a dependency path from a symbol name
+ */
+static void print_config(const char *m, int slen)
+{
+       int c, i;
+
+       printf("    $(wildcard include/config/");
+       for (i = 0; i < slen; i++) {
+               c = m[i];
+               if (c == '_')
+                       c = '/';
+               else
+                       c = tolower(c);
+               putchar(c);
+       }
+       printf(".h) \\\n");
+}
+
+static void do_extra_deps(void)
+{
+       if (insert_extra_deps) {
+               char buf[80];
+               while(fgets(buf, sizeof(buf), stdin)) {
+                       int len = strlen(buf);
+                       if (len < 2 || buf[len-1] != '\n') {
+                               fprintf(stderr, "fixdep: bad data on stdin\n");
+                               exit(1);
+                       }
+                       print_config(buf, len-1);
+               }
+       }
+}
+
 struct item {
        struct item     *next;
        unsigned int    len;
@@ -197,23 +233,12 @@ static void define_config(const char *name, int len, unsigned int hash)
 static void use_config(const char *m, int slen)
 {
        unsigned int hash = strhash(m, slen);
-       int c, i;
 
        if (is_defined_config(m, slen, hash))
            return;
 
        define_config(m, slen, hash);
-
-       printf("    $(wildcard include/config/");
-       for (i = 0; i < slen; i++) {
-               c = m[i];
-               if (c == '_')
-                       c = '/';
-               else
-                       c = tolower(c);
-               putchar(c);
-       }
-       printf(".h) \\\n");
+       print_config(m, slen);
 }
 
 static void parse_config_file(const char *map, size_t len)
@@ -250,7 +275,7 @@ static void parse_config_file(const char *map, size_t len)
        }
 }
 
-/* test is s ends in sub */
+/* test if s ends in sub */
 static int strrcmp(const char *s, const char *sub)
 {
        int slen = strlen(s);
@@ -333,6 +358,7 @@ static void parse_dep_file(void *map, size_t len)
 
                        /* Ignore certain dependencies */
                        if (strrcmp(s, "include/generated/autoconf.h") &&
+                           strrcmp(s, "include/generated/autoksyms.h") &&
                            strrcmp(s, "arch/um/include/uml-config.h") &&
                            strrcmp(s, "include/linux/kconfig.h") &&
                            strrcmp(s, ".ver")) {
@@ -378,6 +404,8 @@ static void parse_dep_file(void *map, size_t len)
                exit(1);
        }
 
+       do_extra_deps();
+
        printf("\n%s: $(deps_%s)\n\n", target, target);
        printf("$(deps_%s):\n", target);
 }
@@ -434,7 +462,10 @@ int main(int argc, char *argv[])
 {
        traps();
 
-       if (argc != 4)
+       if (argc == 5 && !strcmp(argv[1], "-e")) {
+               insert_extra_deps = 1;
+               argv++;
+       } else if (argc != 4)
                usage();
 
        depfile = argv[1];
index b2d758188f2f404b22c8c691b87d0e43a0404f5d..dd85a455b2ba9ec20250f8556904f80d597fb58a 100755 (executable)
@@ -98,7 +98,7 @@ run_cmd() {
 }
 
 kill_running() {
-       for i in $(seq $(( NPROC - 1 )) ); do
+       for i in $(seq $(( NPROC - 1 )) ); do
                if [ $VERBOSE -eq 2 ] ; then
                        echo "Killing ${SPATCH_PID[$i]}"
                fi
index 8ee0ac30e5475a845bfbf10f46c0e62316cb18bd..eb6bd9e4ab1abee4f1385188a2382f730432c3da 100644 (file)
@@ -106,7 +106,7 @@ position j0, j1, j2;
 @match_function_and_data_after_init_timer_context
 depends on !patch &&
 !match_immediate_function_data_after_init_timer_context &&
-(context || org || report)@
+ (context || org || report)@
 expression a, b, e1, e2, e3, e4, e5;
 position j0, j1, j2;
 @@
@@ -127,7 +127,7 @@ position j0, j1, j2;
 @r3_context depends on !patch &&
 !match_immediate_function_data_after_init_timer_context &&
 !match_function_and_data_after_init_timer_context &&
-(context || org || report)@
+ (context || org || report)@
 expression c, e6, e7;
 position r1.p;
 position j0, j1;
diff --git a/scripts/coccinelle/misc/compare_const_fl.cocci b/scripts/coccinelle/misc/compare_const_fl.cocci
deleted file mode 100644 (file)
index b5d4bab..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/// Move constants to the right of binary operators.
-//# Depends on personal taste in some cases.
-///
-// Confidence: Moderate
-// Copyright: (C) 2015 Copyright: (C) 2015 Julia Lawall, Inria. GPLv2.
-// URL: http://coccinelle.lip6.fr/
-// Options: --no-includes --include-headers
-
-virtual patch
-virtual context
-virtual org
-virtual report
-
-@r1 depends on patch && !context && !org && !report
- disable bitor_comm, neg_if_exp@
-constant c,c1;
-local idexpression i;
-expression e,e1,e2;
-binary operator b = {==,!=,&,|};
-type t;
-@@
-
-(
-c b (c1)
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
-i b e1
-|
-c | e1 | e2 | ...
-|
-c | (e ? e1 : e2)
-|
-- c
-+ e
-b
-- e
-+ c
-)
-
-@r2 depends on patch && !context && !org && !report
- disable gtr_lss, gtr_lss_eq, not_int2@
-constant c,c1;
-expression e,e1,e2;
-binary operator b;
-binary operator b1 = {<,<=},b2 = {<,<=};
-binary operator b3 = {>,>=},b4 = {>,>=};
-local idexpression i;
-type t;
-@@
-
-(
-c b c1
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
- (e1 b1 e) && (e b2 e2)
-|
- (e1 b3 e) && (e b4 e2)
-|
-i b e
-|
-- c < e
-+ e > c
-|
-- c <= e
-+ e >= c
-|
-- c > e
-+ e < c
-|
-- c >= e
-+ e <= c
-)
-
-// ----------------------------------------------------------------------------
-
-@r1_context depends on !patch && (context || org || report)
- disable bitor_comm, neg_if_exp exists@
-type t;
-binary operator b = {==,!=,&,|};
-constant c, c1;
-expression e, e1, e2;
-local idexpression i;
-position j0;
-@@
-
-(
-c b (c1)
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
-i b e1
-|
-c | e1 | e2 | ...
-|
-c | (e ? e1 : e2)
-|
-* c@j0 b e
-)
-
-@r2_context depends on !patch && (context || org || report)
- disable gtr_lss, gtr_lss_eq, not_int2 exists@
-type t;
-binary operator b, b1 = {<,<=}, b2 = {<,<=}, b3 = {>,>=}, b4 = {>,>=};
-constant c, c1;
-expression e, e1, e2;
-local idexpression i;
-position j0;
-@@
-
-(
-c b c1
-|
-sizeof(t) b e1
-|
-sizeof e b e1
-|
- (e1 b1 e) && (e b2 e2)
-|
- (e1 b3 e) && (e b4 e2)
-|
-i b e
-|
-* c@j0 < e
-|
-* c@j0 <= e
-|
-* c@j0 > e
-|
-* c@j0 >= e
-)
-
-// ----------------------------------------------------------------------------
-
-@script:python r1_org depends on org@
-j0 << r1_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.org.print_todo(j0[0], msg)
-
-@script:python r2_org depends on org@
-j0 << r2_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.org.print_todo(j0[0], msg)
-
-// ----------------------------------------------------------------------------
-
-@script:python r1_report depends on report@
-j0 << r1_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.report.print_report(j0[0], msg)
-
-@script:python r2_report depends on report@
-j0 << r2_context.j0;
-@@
-
-msg = "Move constant to right."
-coccilib.report.print_report(j0[0], msg)
-
index dafaf96e0a34baea94d4b4e5f171f9ce0e53927e..06121ce524a76006072459d352d727b4aebdf203 100644 (file)
@@ -873,5 +873,8 @@ int main(int argc, char **argv)
                        (double)nsyms / (double)HASH_BUCKETS);
        }
 
+       if (dumpfile)
+               fclose(dumpfile);
+
        return errors != 0;
 }
index dd243d2abd875b535d006535232821eefad3460b..297b079ae4d9f0decbabc76d0f7e833e20aadadb 100644 (file)
@@ -375,7 +375,9 @@ load:
                                continue;
                } else {
                        if (line[0] != '\r' && line[0] != '\n')
-                               conf_warning("unexpected data");
+                               conf_warning("unexpected data: %.*s",
+                                            (int)strcspn(line, "\r\n"), line);
+
                        continue;
                }
 setsym:
index 25cf0c2c0c795ac36658a2bfa061c9f3383630a1..2432298487fb330d04e365fa2b1c81ed6fbba691 100644 (file)
@@ -209,12 +209,26 @@ static void sym_set_all_changed(void)
 static void sym_calc_visibility(struct symbol *sym)
 {
        struct property *prop;
+       struct symbol *choice_sym = NULL;
        tristate tri;
 
        /* any prompt visible? */
        tri = no;
+
+       if (sym_is_choice_value(sym))
+               choice_sym = prop_get_symbol(sym_get_choice_prop(sym));
+
        for_all_prompts(sym, prop) {
                prop->visible.tri = expr_calc_value(prop->visible.expr);
+               /*
+                * Tristate choice_values with visibility 'mod' are
+                * not visible if the corresponding choice's value is
+                * 'yes'.
+                */
+               if (choice_sym && sym->type == S_TRISTATE &&
+                   prop->visible.tri == mod && choice_sym->curr.tri == yes)
+                       prop->visible.tri = no;
+
                tri = EXPR_OR(tri, prop->visible.tri);
        }
        if (tri == mod && (sym->type != S_TRISTATE || modules_val == no))
index c2c7389bfbab7694ec20fbb5168fda902643ddee..71b4a8af9d4dcdf1dcec434b6be98636f468b2dd 100644 (file)
@@ -52,7 +52,7 @@ rpm-pkg rpm: FORCE
        $(call cmd,src_tar,$(KERNELPATH),kernel.spec)
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
-       rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
+       rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
        rm $(KERNELPATH).tar.gz kernel.spec
 
 # binrpm-pkg
@@ -63,7 +63,7 @@ binrpm-pkg: FORCE
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
 
-       rpmbuild --define "_builddir $(objtree)" --target \
+       rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \
                $(UTS_MACHINE) -bb $(objtree)/binkernel.spec
        rm binkernel.spec
 
index 6c3b038ef40d2c761c415f3d060d3b92ee489d02..86e56fef74736a364db4f1b8115a8b43558b038e 100755 (executable)
@@ -322,7 +322,10 @@ fi
 
 # Build kernel header package
 (cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl) > "$objtree/debian/hdrsrcfiles"
-(cd $srctree; find arch/$SRCARCH/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
+if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
+       (cd $srctree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrsrcfiles"
+fi
+(cd $srctree; find arch/*/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles"
 (cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles"
 (cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles"
index b6de63cb3f231e1203cfa00eccd35edd3aee456a..57673bae5597ab9a5078fe9527825aa4456086c9 100755 (executable)
@@ -143,6 +143,11 @@ echo "if [ -x /sbin/new-kernel-pkg ]; then"
 echo "new-kernel-pkg --remove $KERNELRELEASE --rminitrd --initrdfile=/boot/initramfs-$KERNELRELEASE.img"
 echo "fi"
 echo ""
+echo "%postun"
+echo "if [ -x /sbin/update-bootloader ]; then"
+echo "/sbin/update-bootloader --remove $KERNELRELEASE"
+echo "fi"
+echo ""
 echo "%files"
 echo '%defattr (-, root, root)'
 echo "/lib/modules/$KERNELRELEASE"
index 9b756b1f3dc5241cc11eb8ccb3755e0bd126b3c8..0309f2111c703b499134d3217d0a3735cdb5f2a0 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/string_helpers.h>
+#include <linux/task_work.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
 
 #define YAMA_SCOPE_DISABLED    0
 #define YAMA_SCOPE_RELATIONAL  1
@@ -42,20 +45,71 @@ static DEFINE_SPINLOCK(ptracer_relations_lock);
 static void yama_relation_cleanup(struct work_struct *work);
 static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
 
-static void report_access(const char *access, struct task_struct *target,
-                         struct task_struct *agent)
+struct access_report_info {
+       struct callback_head work;
+       const char *access;
+       struct task_struct *target;
+       struct task_struct *agent;
+};
+
+static void __report_access(struct callback_head *work)
 {
+       struct access_report_info *info =
+               container_of(work, struct access_report_info, work);
        char *target_cmd, *agent_cmd;
 
-       target_cmd = kstrdup_quotable_cmdline(target, GFP_ATOMIC);
-       agent_cmd = kstrdup_quotable_cmdline(agent, GFP_ATOMIC);
+       target_cmd = kstrdup_quotable_cmdline(info->target, GFP_KERNEL);
+       agent_cmd = kstrdup_quotable_cmdline(info->agent, GFP_KERNEL);
 
        pr_notice_ratelimited(
                "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n",
-               access, target_cmd, target->pid, agent_cmd, agent->pid);
+               info->access, target_cmd, info->target->pid, agent_cmd,
+               info->agent->pid);
 
        kfree(agent_cmd);
        kfree(target_cmd);
+
+       put_task_struct(info->agent);
+       put_task_struct(info->target);
+       kfree(info);
+}
+
+/* defers execution because cmdline access can sleep */
+static void report_access(const char *access, struct task_struct *target,
+                               struct task_struct *agent)
+{
+       struct access_report_info *info;
+       char agent_comm[sizeof(agent->comm)];
+
+       assert_spin_locked(&target->alloc_lock); /* for target->comm */
+
+       if (current->flags & PF_KTHREAD) {
+               /* I don't think kthreads call task_work_run() before exiting.
+                * Imagine angry ranting about procfs here.
+                */
+               pr_notice_ratelimited(
+                   "ptrace %s of \"%s\"[%d] was attempted by \"%s\"[%d]\n",
+                   access, target->comm, target->pid,
+                   get_task_comm(agent_comm, agent), agent->pid);
+               return;
+       }
+
+       info = kmalloc(sizeof(*info), GFP_ATOMIC);
+       if (!info)
+               return;
+       init_task_work(&info->work, __report_access);
+       get_task_struct(target);
+       get_task_struct(agent);
+       info->access = access;
+       info->target = target;
+       info->agent = agent;
+       if (task_work_add(current, &info->work, true) == 0)
+               return; /* success */
+
+       WARN(1, "report_access called from exiting task");
+       put_task_struct(target);
+       put_task_struct(agent);
+       kfree(info);
 }
 
 /**
@@ -351,8 +405,11 @@ int yama_ptrace_traceme(struct task_struct *parent)
                break;
        }
 
-       if (rc)
+       if (rc) {
+               task_lock(current);
                report_access("traceme", current, parent);
+               task_unlock(current);
+       }
 
        return rc;
 }
index 6bf68fe7dd290dfeb9f51969f6b0d6d0cb38ab4c..f10b64d8c67472b6828e4cb8028e91bebbb6f0af 100644 (file)
@@ -16,6 +16,7 @@ help:
        @echo '  gpio                   - GPIO tools'
        @echo '  hv                     - tools used when in Hyper-V clients'
        @echo '  iio                    - IIO tools'
+       @echo '  kvm_stat               - top-like utility for displaying kvm statistics'
        @echo '  lguest                 - a minimal 32-bit x86 hypervisor'
        @echo '  net                    - misc networking tools'
        @echo '  perf                   - Linux performance measurement and analysis tool'
@@ -110,10 +111,13 @@ tmon_install:
 freefall_install:
        $(call descend,laptop/$(@:_install=),install)
 
+kvm_stat_install:
+       $(call descend,kvm/$(@:_install=),install)
+
 install: acpi_install cgroup_install cpupower_install hv_install firewire_install lguest_install \
                perf_install selftests_install turbostat_install usb_install \
                virtio_install vm_install net_install x86_energy_perf_policy_install \
-               tmon_install freefall_install objtool_install
+               tmon_install freefall_install objtool_install kvm_stat_install
 
 acpi_clean:
        $(call descend,power/acpi,clean)
index ee566e8bd1cff56efde9200bf4a76caa840669f1..27f3583193e69d4f6cd310fa1c94d8883cdf5f1c 100644 (file)
@@ -58,8 +58,8 @@ quiet_cmd_mkdir = MKDIR    $(dir $@)
 quiet_cmd_cc_o_c = CC       $@
       cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
 
-quiet_cmd_cc_i_c = CPP      $@
-      cmd_cc_i_c = $(CC) $(c_flags) -E -o $@ $<
+quiet_cmd_cpp_i_c = CPP      $@
+      cmd_cpp_i_c = $(CC) $(c_flags) -E -o $@ $<
 
 quiet_cmd_cc_s_c = AS       $@
       cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $<
@@ -83,11 +83,11 @@ $(OUTPUT)%.o: %.S FORCE
 
 $(OUTPUT)%.i: %.c FORCE
        $(call rule_mkdir)
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 $(OUTPUT)%.s: %.S FORCE
        $(call rule_mkdir)
-       $(call if_changed_dep,cc_i_c)
+       $(call if_changed_dep,cpp_i_c)
 
 $(OUTPUT)%.s: %.c FORCE
        $(call rule_mkdir)
diff --git a/tools/kvm/kvm_stat/Makefile b/tools/kvm/kvm_stat/Makefile
new file mode 100644 (file)
index 0000000..5b1cba5
--- /dev/null
@@ -0,0 +1,41 @@
+include ../../scripts/Makefile.include
+include ../../scripts/utilities.mak
+BINDIR=usr/bin
+MANDIR=usr/share/man
+MAN1DIR=$(MANDIR)/man1
+
+MAN1=kvm_stat.1
+
+A2X=a2x
+a2x_path := $(call get-executable,$(A2X))
+
+all: man
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+  ifneq ($(V),1)
+     QUIET_A2X = @echo '  A2X     '$@;
+  endif
+endif
+
+%.1: %.txt
+ifeq ($(a2x_path),)
+       $(error "You need to install asciidoc for man pages")
+else
+       $(QUIET_A2X)$(A2X) --doctype manpage --format manpage $<
+endif
+
+clean:
+       rm -f $(MAN1)
+
+man: $(MAN1)
+
+install-man: man
+       install -d -m 755 $(INSTALL_ROOT)/$(MAN1DIR)
+       install -m 644 kvm_stat.1 $(INSTALL_ROOT)/$(MAN1DIR)
+
+install-tools:
+       install -d -m 755 $(INSTALL_ROOT)/$(BINDIR)
+       install -m 755 -p "kvm_stat" "$(INSTALL_ROOT)/$(BINDIR)/$(TARGET)"
+
+install: install-tools install-man
+.PHONY: all clean man install-tools install-man install
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
new file mode 100755 (executable)
index 0000000..581278c
--- /dev/null
@@ -0,0 +1,1127 @@
+#!/usr/bin/python
+#
+# top-like utility for displaying kvm statistics
+#
+# Copyright 2006-2008 Qumranet Technologies
+# Copyright 2008-2011 Red Hat, Inc.
+#
+# Authors:
+#  Avi Kivity <avi@redhat.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2.  See
+# the COPYING file in the top-level directory.
+"""The kvm_stat module outputs statistics about running KVM VMs
+
+Three different ways of output formatting are available:
+- as a top-like text ui
+- in a key -> value format
+- in an all keys, all values format
+
+The data is sampled from the KVM's debugfs entries and its perf events.
+"""
+
+import curses
+import sys
+import os
+import time
+import optparse
+import ctypes
+import fcntl
+import resource
+import struct
+import re
+from collections import defaultdict
+from time import sleep
+
+VMX_EXIT_REASONS = {
+    'EXCEPTION_NMI':        0,
+    'EXTERNAL_INTERRUPT':   1,
+    'TRIPLE_FAULT':         2,
+    'PENDING_INTERRUPT':    7,
+    'NMI_WINDOW':           8,
+    'TASK_SWITCH':          9,
+    'CPUID':                10,
+    'HLT':                  12,
+    'INVLPG':               14,
+    'RDPMC':                15,
+    'RDTSC':                16,
+    'VMCALL':               18,
+    'VMCLEAR':              19,
+    'VMLAUNCH':             20,
+    'VMPTRLD':              21,
+    'VMPTRST':              22,
+    'VMREAD':               23,
+    'VMRESUME':             24,
+    'VMWRITE':              25,
+    'VMOFF':                26,
+    'VMON':                 27,
+    'CR_ACCESS':            28,
+    'DR_ACCESS':            29,
+    'IO_INSTRUCTION':       30,
+    'MSR_READ':             31,
+    'MSR_WRITE':            32,
+    'INVALID_STATE':        33,
+    'MWAIT_INSTRUCTION':    36,
+    'MONITOR_INSTRUCTION':  39,
+    'PAUSE_INSTRUCTION':    40,
+    'MCE_DURING_VMENTRY':   41,
+    'TPR_BELOW_THRESHOLD':  43,
+    'APIC_ACCESS':          44,
+    'EPT_VIOLATION':        48,
+    'EPT_MISCONFIG':        49,
+    'WBINVD':               54,
+    'XSETBV':               55,
+    'APIC_WRITE':           56,
+    'INVPCID':              58,
+}
+
+SVM_EXIT_REASONS = {
+    'READ_CR0':       0x000,
+    'READ_CR3':       0x003,
+    'READ_CR4':       0x004,
+    'READ_CR8':       0x008,
+    'WRITE_CR0':      0x010,
+    'WRITE_CR3':      0x013,
+    'WRITE_CR4':      0x014,
+    'WRITE_CR8':      0x018,
+    'READ_DR0':       0x020,
+    'READ_DR1':       0x021,
+    'READ_DR2':       0x022,
+    'READ_DR3':       0x023,
+    'READ_DR4':       0x024,
+    'READ_DR5':       0x025,
+    'READ_DR6':       0x026,
+    'READ_DR7':       0x027,
+    'WRITE_DR0':      0x030,
+    'WRITE_DR1':      0x031,
+    'WRITE_DR2':      0x032,
+    'WRITE_DR3':      0x033,
+    'WRITE_DR4':      0x034,
+    'WRITE_DR5':      0x035,
+    'WRITE_DR6':      0x036,
+    'WRITE_DR7':      0x037,
+    'EXCP_BASE':      0x040,
+    'INTR':           0x060,
+    'NMI':            0x061,
+    'SMI':            0x062,
+    'INIT':           0x063,
+    'VINTR':          0x064,
+    'CR0_SEL_WRITE':  0x065,
+    'IDTR_READ':      0x066,
+    'GDTR_READ':      0x067,
+    'LDTR_READ':      0x068,
+    'TR_READ':        0x069,
+    'IDTR_WRITE':     0x06a,
+    'GDTR_WRITE':     0x06b,
+    'LDTR_WRITE':     0x06c,
+    'TR_WRITE':       0x06d,
+    'RDTSC':          0x06e,
+    'RDPMC':          0x06f,
+    'PUSHF':          0x070,
+    'POPF':           0x071,
+    'CPUID':          0x072,
+    'RSM':            0x073,
+    'IRET':           0x074,
+    'SWINT':          0x075,
+    'INVD':           0x076,
+    'PAUSE':          0x077,
+    'HLT':            0x078,
+    'INVLPG':         0x079,
+    'INVLPGA':        0x07a,
+    'IOIO':           0x07b,
+    'MSR':            0x07c,
+    'TASK_SWITCH':    0x07d,
+    'FERR_FREEZE':    0x07e,
+    'SHUTDOWN':       0x07f,
+    'VMRUN':          0x080,
+    'VMMCALL':        0x081,
+    'VMLOAD':         0x082,
+    'VMSAVE':         0x083,
+    'STGI':           0x084,
+    'CLGI':           0x085,
+    'SKINIT':         0x086,
+    'RDTSCP':         0x087,
+    'ICEBP':          0x088,
+    'WBINVD':         0x089,
+    'MONITOR':        0x08a,
+    'MWAIT':          0x08b,
+    'MWAIT_COND':     0x08c,
+    'XSETBV':         0x08d,
+    'NPF':            0x400,
+}
+
+# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h)
+AARCH64_EXIT_REASONS = {
+    'UNKNOWN':      0x00,
+    'WFI':          0x01,
+    'CP15_32':      0x03,
+    'CP15_64':      0x04,
+    'CP14_MR':      0x05,
+    'CP14_LS':      0x06,
+    'FP_ASIMD':     0x07,
+    'CP10_ID':      0x08,
+    'CP14_64':      0x0C,
+    'ILL_ISS':      0x0E,
+    'SVC32':        0x11,
+    'HVC32':        0x12,
+    'SMC32':        0x13,
+    'SVC64':        0x15,
+    'HVC64':        0x16,
+    'SMC64':        0x17,
+    'SYS64':        0x18,
+    'IABT':         0x20,
+    'IABT_HYP':     0x21,
+    'PC_ALIGN':     0x22,
+    'DABT':         0x24,
+    'DABT_HYP':     0x25,
+    'SP_ALIGN':     0x26,
+    'FP_EXC32':     0x28,
+    'FP_EXC64':     0x2C,
+    'SERROR':       0x2F,
+    'BREAKPT':      0x30,
+    'BREAKPT_HYP':  0x31,
+    'SOFTSTP':      0x32,
+    'SOFTSTP_HYP':  0x33,
+    'WATCHPT':      0x34,
+    'WATCHPT_HYP':  0x35,
+    'BKPT32':       0x38,
+    'VECTOR32':     0x3A,
+    'BRK64':        0x3C,
+}
+
+# From include/uapi/linux/kvm.h, KVM_EXIT_xxx
+USERSPACE_EXIT_REASONS = {
+    'UNKNOWN':          0,
+    'EXCEPTION':        1,
+    'IO':               2,
+    'HYPERCALL':        3,
+    'DEBUG':            4,
+    'HLT':              5,
+    'MMIO':             6,
+    'IRQ_WINDOW_OPEN':  7,
+    'SHUTDOWN':         8,
+    'FAIL_ENTRY':       9,
+    'INTR':             10,
+    'SET_TPR':          11,
+    'TPR_ACCESS':       12,
+    'S390_SIEIC':       13,
+    'S390_RESET':       14,
+    'DCR':              15,
+    'NMI':              16,
+    'INTERNAL_ERROR':   17,
+    'OSI':              18,
+    'PAPR_HCALL':       19,
+    'S390_UCONTROL':    20,
+    'WATCHDOG':         21,
+    'S390_TSCH':        22,
+    'EPR':              23,
+    'SYSTEM_EVENT':     24,
+}
+
+IOCTL_NUMBERS = {
+    'SET_FILTER':  0x40082406,
+    'ENABLE':      0x00002400,
+    'DISABLE':     0x00002401,
+    'RESET':       0x00002403,
+}
+
+class Arch(object):
+    """Encapsulates global architecture specific data.
+
+    Contains the performance event open syscall and ioctl numbers, as
+    well as the VM exit reasons for the architecture it runs on.
+
+    """
+    @staticmethod
+    def get_arch():
+        machine = os.uname()[4]
+
+        if machine.startswith('ppc'):
+            return ArchPPC()
+        elif machine.startswith('aarch64'):
+            return ArchA64()
+        elif machine.startswith('s390'):
+            return ArchS390()
+        else:
+            # X86_64
+            for line in open('/proc/cpuinfo'):
+                if not line.startswith('flags'):
+                    continue
+
+                flags = line.split()
+                if 'vmx' in flags:
+                    return ArchX86(VMX_EXIT_REASONS)
+                if 'svm' in flags:
+                    return ArchX86(SVM_EXIT_REASONS)
+                return
+
+class ArchX86(Arch):
+    def __init__(self, exit_reasons):
+        self.sc_perf_evt_open = 298
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = exit_reasons
+
+class ArchPPC(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 319
+        self.ioctl_numbers = dict(IOCTL_NUMBERS)  # copy: keep global intact
+        self.ioctl_numbers['ENABLE'] = 0x20002400
+        self.ioctl_numbers['DISABLE'] = 0x20002401
+        self.ioctl_numbers['RESET'] = 0x20002403
+
+        # PPC comes in 32 and 64 bit and some generated ioctl
+        # numbers depend on the wordsize.
+        char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
+        self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
+        self.exit_reasons = {}
+
+class ArchA64(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 241
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = AARCH64_EXIT_REASONS
+
+class ArchS390(Arch):
+    def __init__(self):
+        self.sc_perf_evt_open = 331
+        self.ioctl_numbers = IOCTL_NUMBERS
+        self.exit_reasons = None
+
+ARCH = Arch.get_arch()
+
+
+def walkdir(path):
+    """Returns os.walk() data for specified directory.
+
+    As it is only a wrapper it returns the same 3-tuple of (dirpath,
+    dirnames, filenames).
+    """
+    return next(os.walk(path))
+
+
+def parse_int_list(list_string):
+    """Returns an int list from a string of comma separated integers and
+    integer ranges."""
+    integers = []
+    members = list_string.split(',')
+
+    for member in members:
+        if '-' not in member:
+            integers.append(int(member))
+        else:
+            int_range = member.split('-')
+            integers.extend(range(int(int_range[0]),
+                                  int(int_range[1]) + 1))
+
+    return integers
+
+
+def get_online_cpus():
+    """Returns a list of cpu id integers."""
+    with open('/sys/devices/system/cpu/online') as cpu_list:
+        cpu_string = cpu_list.readline()
+        return parse_int_list(cpu_string)
+
+
+def get_filters():
+    """Returns a dict of trace events, their filter ids and
+    the values that can be filtered.
+
+    Trace events can be filtered for special values by setting a
+    filter string via an ioctl. The string normally has the format
+    identifier==value. For each filter a new event will be created, to
+    be able to distinguish the events.
+
+    """
+    filters = {}
+    filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+    if ARCH.exit_reasons:
+        filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
+    return filters
+
+libc = ctypes.CDLL('libc.so.6', use_errno=True)
+syscall = libc.syscall
+
+class perf_event_attr(ctypes.Structure):
+    """Struct that holds the necessary data to set up a trace event.
+
+    For an extensive explanation see perf_event_open(2) and
+    include/uapi/linux/perf_event.h, struct perf_event_attr
+
+    All fields that are not initialized in the constructor are 0.
+
+    """
+    _fields_ = [('type', ctypes.c_uint32),
+                ('size', ctypes.c_uint32),
+                ('config', ctypes.c_uint64),
+                ('sample_freq', ctypes.c_uint64),
+                ('sample_type', ctypes.c_uint64),
+                ('read_format', ctypes.c_uint64),
+                ('flags', ctypes.c_uint64),
+                ('wakeup_events', ctypes.c_uint32),
+                ('bp_type', ctypes.c_uint32),
+                ('bp_addr', ctypes.c_uint64),
+                ('bp_len', ctypes.c_uint64),
+                ]
+
+    def __init__(self):
+        super(self.__class__, self).__init__()
+        self.type = PERF_TYPE_TRACEPOINT
+        self.size = ctypes.sizeof(self)
+        self.read_format = PERF_FORMAT_GROUP
+
+def perf_event_open(attr, pid, cpu, group_fd, flags):
+    """Wrapper for the sys_perf_evt_open() syscall.
+
+    Used to set up performance events, returns a file descriptor or -1
+    on error.
+
+    Attributes are:
+    - syscall number
+    - struct perf_event_attr *
+    - pid or -1 to monitor all pids
+    - cpu number or -1 to monitor all cpus
+    - The file descriptor of the group leader or -1 to create a group.
+    - flags
+
+    """
+    return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
+                   ctypes.c_int(pid), ctypes.c_int(cpu),
+                   ctypes.c_int(group_fd), ctypes.c_long(flags))
+
+PERF_TYPE_TRACEPOINT = 2
+PERF_FORMAT_GROUP = 1 << 3
+
+PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
+PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
+
+class Group(object):
+    """Represents a perf event group."""
+
+    def __init__(self):
+        self.events = []
+
+    def add_event(self, event):
+        self.events.append(event)
+
+    def read(self):
+        """Returns a dict with 'event name: value' for all events in the
+        group.
+
+        Values are read by reading from the file descriptor of the
+        event that is the group leader. See perf_event_open(2) for
+        details.
+
+        Read format for the used event configuration is:
+        struct read_format {
+            u64 nr; /* The number of events */
+            struct {
+                u64 value; /* The value of the event */
+            } values[nr];
+        };
+
+        """
+        length = 8 * (1 + len(self.events))
+        read_format = 'xxxxxxxx' + 'Q' * len(self.events)
+        return dict(zip([event.name for event in self.events],
+                        struct.unpack(read_format,
+                                      os.read(self.events[0].fd, length))))
+
+class Event(object):
+    """Represents a performance event and manages its life cycle."""
+    def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
+                 trace_filter, trace_set='kvm'):
+        self.name = name
+        self.fd = None
+        self.setup_event(group, trace_cpu, trace_pid, trace_point,
+                         trace_filter, trace_set)
+
+    def __del__(self):
+        """Closes the event's file descriptor.
+
+        As no python file object was created for the file descriptor,
+        python will not reference count the descriptor and will not
+        close it itself automatically, so we do it.
+
+        """
+        if self.fd:
+            os.close(self.fd)
+
+    def setup_event_attribute(self, trace_set, trace_point):
+        """Returns an initialized ctype perf_event_attr struct."""
+
+        id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set,
+                               trace_point, 'id')
+
+        event_attr = perf_event_attr()
+        event_attr.config = int(open(id_path).read())
+        return event_attr
+
+    def setup_event(self, group, trace_cpu, trace_pid, trace_point,
+                    trace_filter, trace_set):
+        """Sets up the perf event in Linux.
+
+        Issues the syscall to register the event in the kernel and
+        then sets the optional filter.
+
+        """
+
+        event_attr = self.setup_event_attribute(trace_set, trace_point)
+
+        # First event will be group leader.
+        group_leader = -1
+
+        # All others have to pass the leader's descriptor instead.
+        if group.events:
+            group_leader = group.events[0].fd
+
+        fd = perf_event_open(event_attr, trace_pid,
+                             trace_cpu, group_leader, 0)
+        if fd == -1:
+            err = ctypes.get_errno()
+            raise OSError(err, os.strerror(err),
+                          'while calling sys_perf_event_open().')
+
+        if trace_filter:
+            fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'],
+                        trace_filter)
+
+        self.fd = fd
+
+    def enable(self):
+        """Enables the trace event in the kernel.
+
+        Enabling the group leader makes reading counters from it and the
+        events under it possible.
+
+        """
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0)
+
+    def disable(self):
+        """Disables the trace event in the kernel.
+
+        Disabling the group leader makes reading all counters under it
+        impossible.
+
+        """
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0)
+
+    def reset(self):
+        """Resets the count of the trace event in the kernel."""
+        fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
+
+class TracepointProvider(object):
+    """Data provider for the stats class.
+
+    Manages the events/groups from which it acquires its data.
+
+    """
+    def __init__(self):
+        """Collects the available fields; no events are opened until
+        the pid property is assigned."""
+        # One Group per monitored cpu or thread, filled by setup_traces().
+        self.group_leaders = []
+        self.filters = get_filters()
+        self._fields = self.get_available_fields()
+        self._pid = 0
+
+    def get_available_fields(self):
+        """Returns a list of available events of format 'event name(filter
+        name)'.
+
+        All available events have directories under
+        /sys/kernel/debug/tracing/events/ which export information
+        about the specific event. Therefore, listing the dirs gives us
+        a list of all available events.
+
+        Some events like the vm exit reasons can be filtered for
+        specific values. To take account for that, the routine below
+        creates special fields with the following format:
+        event name(filter name)
+
+        """
+        path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
+        fields = walkdir(path)[1]
+        extra = []
+        for field in fields:
+            if field in self.filters:
+                filter_name_, filter_dicts = self.filters[field]
+                for name in filter_dicts:
+                    extra.append(field + '(' + name + ')')
+        # The plain event names come first, the filtered variants after.
+        fields += extra
+        return fields
+
+    def setup_traces(self):
+        """Creates all event and group objects needed to be able to retrieve
+        data.
+
+        One group is created per thread of the monitored pid, or per
+        online cpu when no pid is set; each group gets one event per
+        field. Exits the program if the NOFILE rlimit cannot be raised.
+        """
+        if self._pid > 0:
+            # Fetch list of all threads of the monitored pid, as qemu
+            # starts a thread for each vcpu.
+            path = os.path.join('/proc', str(self._pid), 'task')
+            groupids = walkdir(path)[1]
+        else:
+            groupids = get_online_cpus()
+
+        # The constant is needed as a buffer for python libs, std
+        # streams and other files that the script opens.
+        newlim = len(groupids) * len(self._fields) + 50
+        try:
+            softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)
+
+            if hardlim < newlim:
+                # Now we need CAP_SYS_RESOURCE, to increase the hard limit.
+                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim))
+            else:
+                # Raising the soft limit is sufficient.
+                resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim))
+
+        except ValueError:
+            sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim))
+
+        for groupid in groupids:
+            group = Group()
+            for name in self._fields:
+                tracepoint = name
+                tracefilter = None
+                # Fields of the form 'event(value)' are split into the
+                # tracepoint name and the filter value name.
+                match = re.match(r'(.*)\((.*)\)', name)
+                if match:
+                    tracepoint, sub = match.groups()
+                    tracefilter = ('%s==%d\0' %
+                                   (self.filters[tracepoint][0],
+                                    self.filters[tracepoint][1][sub]))
+
+                # From perf_event_open(2):
+                # pid > 0 and cpu == -1
+                # This measures the specified process/thread on any CPU.
+                #
+                # pid == -1 and cpu >= 0
+                # This measures all processes/threads on the specified CPU.
+                trace_cpu = groupid if self._pid == 0 else -1
+                trace_pid = int(groupid) if self._pid != 0 else -1
+
+                # The event is created before it is added, so the first
+                # event sees an empty group and becomes the leader.
+                group.add_event(Event(name=name,
+                                      group=group,
+                                      trace_cpu=trace_cpu,
+                                      trace_pid=trace_pid,
+                                      trace_point=tracepoint,
+                                      trace_filter=tracefilter))
+
+            self.group_leaders.append(group)
+
+    def available_fields(self):
+        """Returns a freshly computed list of available fields."""
+        return self.get_available_fields()
+
+    @property
+    def fields(self):
+        """The list of currently selected field names."""
+        return self._fields
+
+    @fields.setter
+    def fields(self, fields):
+        """Enables/disables the (un)wanted events"""
+        self._fields = fields
+        for group in self.group_leaders:
+            for index, event in enumerate(group.events):
+                if event.name in fields:
+                    event.reset()
+                    event.enable()
+                else:
+                    # Do not disable the group leader.
+                    # It would disable all of its events.
+                    if index != 0:
+                        event.disable()
+
+    @property
+    def pid(self):
+        """The monitored pid; 0 means the online cpus are monitored."""
+        return self._pid
+
+    @pid.setter
+    def pid(self, pid):
+        """Changes the monitored pid by setting new traces."""
+        self._pid = pid
+        # The garbage collector will get rid of all Event/Group
+        # objects and open files after removing the references.
+        self.group_leaders = []
+        self.setup_traces()
+        # Re-assign the fields to reset and enable the new events.
+        self.fields = self._fields
+
+    def read(self):
+        """Returns 'event name: current value' for all enabled events."""
+        # Values of the same event are summed up over all groups.
+        ret = defaultdict(int)
+        for group in self.group_leaders:
+            for name, val in group.read().iteritems():
+                if name in self._fields:
+                    ret[name] += val
+        return ret
+
+class DebugfsProvider(object):
+    """Provides data from the files that KVM creates in the kvm debugfs
+    folder."""
+    def __init__(self):
+        self._fields = self.get_available_fields()
+        self._pid = 0
+        self.do_read = True
+
+    def get_available_fields(self):
+        """"Returns a list of available fields.
+
+        The fields are all available KVM debugfs files
+
+        """
+        return walkdir(PATH_DEBUGFS_KVM)[2]
+
+    @property
+    def fields(self):
+        return self._fields
+
+    @fields.setter
+    def fields(self, fields):
+        self._fields = fields
+
+    @property
+    def pid(self):
+        return self._pid
+
+    @pid.setter
+    def pid(self, pid):
+        if pid != 0:
+            self._pid = pid
+
+            vms = walkdir(PATH_DEBUGFS_KVM)[1]
+            if len(vms) == 0:
+                self.do_read = False
+
+            self.paths = filter(lambda x: "{}-".format(pid) in x, vms)
+
+        else:
+            self.paths = ['']
+            self.do_read = True
+
+    def read(self):
+        """Returns a dict with format:'file name / field -> current value'."""
+        results = {}
+
+        # If no debugfs filtering support is available, then don't read.
+        if not self.do_read:
+            return results
+
+        for path in self.paths:
+            for field in self._fields:
+                results[field] = results.get(field, 0) \
+                                 + self.read_field(field, path)
+
+        return results
+
+    def read_field(self, field, path):
+        """Returns the value of a single field from a specific VM."""
+        try:
+            return int(open(os.path.join(PATH_DEBUGFS_KVM,
+                                         path,
+                                         field))
+                       .read())
+        except IOError:
+            return 0
+
+class Stats(object):
+    """Manages the data providers and the data they provide.
+
+    It is used to set filters on the provider's data and collect all
+    provider data.
+
+    """
+    def __init__(self, providers, pid, fields=None):
+        self.providers = providers
+        self._pid_filter = pid
+        self._fields_filter = fields
+        self.values = {}
+        self.update_provider_pid()
+        self.update_provider_filters()
+
+    def update_provider_filters(self):
+        """Propagates fields filters to providers."""
+        def wanted(key):
+            if not self._fields_filter:
+                return True
+            return re.match(self._fields_filter, key) is not None
+
+        # As we reset the counters when updating the fields we can
+        # also clear the cache of old values.
+        self.values = {}
+        for provider in self.providers:
+            provider_fields = [key for key in provider.get_available_fields()
+                               if wanted(key)]
+            provider.fields = provider_fields
+
+    def update_provider_pid(self):
+        """Propagates pid filters to providers."""
+        for provider in self.providers:
+            provider.pid = self._pid_filter
+
+    @property
+    def fields_filter(self):
+        return self._fields_filter
+
+    @fields_filter.setter
+    def fields_filter(self, fields_filter):
+        self._fields_filter = fields_filter
+        self.update_provider_filters()
+
+    @property
+    def pid_filter(self):
+        return self._pid_filter
+
+    @pid_filter.setter
+    def pid_filter(self, pid):
+        self._pid_filter = pid
+        self.values = {}
+        self.update_provider_pid()
+
+    def get(self):
+        """Returns a dict with field -> (value, delta to last value) of all
+        provider data."""
+        for provider in self.providers:
+            new = provider.read()
+            for key in provider.fields:
+                oldval = self.values.get(key, (0, 0))
+                newval = new.get(key, 0)
+                newdelta = None
+                if oldval is not None:
+                    newdelta = newval - oldval[0]
+                self.values[key] = (newval, newdelta)
+        return self.values
+
+LABEL_WIDTH = 40
+NUMBER_WIDTH = 10
+
+class Tui(object):
+    """Instruments curses to draw a nice text ui."""
+    def __init__(self, stats):
+        """Stores the stats object and applies the initial drilldown
+        filter; curses itself is only set up in __enter__."""
+        self.stats = stats
+        self.screen = None
+        self.drilldown = False
+        self.update_drilldown()
+
+    def __enter__(self):
+        """Initialises curses for later use.  Based on curses.wrapper
+           implementation from the Python standard library."""
+        self.screen = curses.initscr()
+        curses.noecho()
+        curses.cbreak()
+
+        # The try/catch works around a minor bit of
+        # over-conscientiousness in the curses module, the error
+        # return from C start_color() is ignorable.
+        try:
+            curses.start_color()
+        except:
+            pass
+
+        curses.use_default_colors()
+        return self
+
+    def __exit__(self, *exception):
+        """Resets the terminal to its normal state.  Based on curses.wrapper
+           implementation from the Python standard library."""
+        if self.screen:
+            self.screen.keypad(0)
+            curses.echo()
+            curses.nocbreak()
+            curses.endwin()
+
+    def update_drilldown(self):
+        """Sets or removes a filter that only allows fields without braces."""
+        # Only the exact drilldown regex is toggled; any other filter
+        # that is currently set is left untouched.
+        if not self.stats.fields_filter:
+            self.stats.fields_filter = r'^[^\(]*$'
+
+        elif self.stats.fields_filter == r'^[^\(]*$':
+            self.stats.fields_filter = None
+
+    def update_pid(self, pid):
+        """Propagates pid selection to stats object."""
+        self.stats.pid_filter = pid
+
+    def refresh(self, sleeptime):
+        """Refreshes on-screen data.
+
+        sleeptime is the divisor applied to each delta before it is
+        printed in the 'Current' column.
+        """
+        self.screen.erase()
+        if self.stats.pid_filter > 0:
+            self.screen.addstr(0, 0, 'kvm statistics - pid {0}'
+                               .format(self.stats.pid_filter),
+                               curses.A_BOLD)
+        else:
+            self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
+        self.screen.addstr(2, 1, 'Event')
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
+                           len('Total'), 'Total')
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 -
+                           len('Current'), 'Current')
+        row = 3
+        stats = self.stats.get()
+        # Sort by descending delta first and by descending total second.
+        def sortkey(x):
+            if stats[x][1]:
+                return (-stats[x][1], -stats[x][0])
+            else:
+                return (0, -stats[x][0])
+        for key in sorted(stats.keys(), key=sortkey):
+
+            # Stop when the screen is full.
+            if row >= self.screen.getmaxyx()[0]:
+                break
+            values = stats[key]
+            # Stop at the first entry without a total and a delta.
+            if not values[0] and not values[1]:
+                break
+            col = 1
+            self.screen.addstr(row, col, key)
+            col += LABEL_WIDTH
+            self.screen.addstr(row, col, '%10d' % (values[0],))
+            col += NUMBER_WIDTH
+            if values[1] is not None:
+                self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
+            row += 1
+        self.screen.refresh()
+
+    def show_filter_selection(self):
+        """Draws filter selection mask.
+
+        Asks for a valid regex and sets the fields filter accordingly.
+        An empty input leaves the current filter unchanged.
+
+        """
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0,
+                               "Show statistics for events matching a regex.",
+                               curses.A_BOLD)
+            self.screen.addstr(2, 0,
+                               "Current regex: {0}"
+                               .format(self.stats.fields_filter))
+            self.screen.addstr(3, 0, "New regex: ")
+            curses.echo()
+            regex = self.screen.getstr()
+            curses.noecho()
+            if len(regex) == 0:
+                return
+            try:
+                # Compiling only validates the input; ask again if it
+                # is not a valid regex.
+                re.compile(regex)
+                self.stats.fields_filter = regex
+                return
+            except re.error:
+                continue
+
+    def show_vm_selection(self):
+        """Draws PID selection mask.
+
+        Asks for a pid until a valid pid or 0 has been entered.
+
+        """
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0,
+                               'Show statistics for specific pid.',
+                               curses.A_BOLD)
+            self.screen.addstr(1, 0,
+                               'This might limit the shown data to the trace '
+                               'statistics.')
+
+            curses.echo()
+            self.screen.addstr(3, 0, "Pid [0 or pid]: ")
+            pid = self.screen.getstr()
+            curses.noecho()
+
+            try:
+                pid = int(pid)
+
+                if pid == 0:
+                    self.update_pid(pid)
+                    break
+                else:
+                    # A pid only counts as valid if /proc/<pid> exists.
+                    if not os.path.isdir(os.path.join('/proc/', str(pid))):
+                        continue
+                    else:
+                        self.update_pid(pid)
+                        break
+
+            except ValueError:
+                continue
+
+    def show_stats(self):
+        """Refreshes the screen and processes user input.
+
+        Keys: 'x' toggles the drilldown filter, 'f' asks for a fields
+        regex, 'p' asks for a pid and 'q' quits.
+        """
+        # The first refresh uses a short interval, all later ones 3s.
+        sleeptime = 0.25
+        while True:
+            self.refresh(sleeptime)
+            # halfdelay() takes tenths of a second.
+            curses.halfdelay(int(sleeptime * 10))
+            sleeptime = 3
+            try:
+                char = self.screen.getkey()
+                if char == 'x':
+                    self.drilldown = not self.drilldown
+                    self.update_drilldown()
+                if char == 'q':
+                    break
+                if char == 'f':
+                    self.show_filter_selection()
+                if char == 'p':
+                    self.show_vm_selection()
+            except KeyboardInterrupt:
+                break
+            except curses.error:
+                continue
+
+def batch(stats):
+    """Prints statistics in a key, value format."""
+    s = stats.get()
+    time.sleep(1)
+    s = stats.get()
+    for key in sorted(s.keys()):
+        values = s[key]
+        print '%-42s%10d%10d' % (key, values[0], values[1])
+
+def log(stats):
+    """Prints statistics as reiterating key block, multiple value blocks."""
+    keys = sorted(stats.get().iterkeys())
+    def banner():
+        for k in keys:
+            print '%s' % k,
+        print
+    def statline():
+        s = stats.get()
+        for k in keys:
+            print ' %9d' % s[k][1],
+        print
+    line = 0
+    banner_repeat = 20
+    while True:
+        time.sleep(1)
+        if line % banner_repeat == 0:
+            banner()
+        statline()
+        line += 1
+
+def get_options():
+    """Returns processed program arguments."""
+    description_text = """
+This script displays various statistics about VMs running under KVM.
+The statistics are gathered from the KVM debugfs entries and / or the
+currently available perf traces.
+
+The monitoring takes additional cpu cycles and might affect the VM's
+performance.
+
+Requirements:
+- Access to:
+    /sys/kernel/debug/kvm
+    /sys/kernel/debug/trace/events/*
+    /proc/pid/task
+- /proc/sys/kernel/perf_event_paranoid < 1 if user has no
+  CAP_SYS_ADMIN and perf events are used.
+- CAP_SYS_RESOURCE if the hard limit is not high enough to allow
+  the large number of files that are possibly opened.
+"""
+
+    class PlainHelpFormatter(optparse.IndentedHelpFormatter):
+        def format_description(self, description):
+            if description:
+                return description + "\n"
+            else:
+                return ""
+
+    optparser = optparse.OptionParser(description=description_text,
+                                      formatter=PlainHelpFormatter())
+    optparser.add_option('-1', '--once', '--batch',
+                         action='store_true',
+                         default=False,
+                         dest='once',
+                         help='run in batch mode for one second',
+                         )
+    optparser.add_option('-l', '--log',
+                         action='store_true',
+                         default=False,
+                         dest='log',
+                         help='run in logging mode (like vmstat)',
+                         )
+    optparser.add_option('-t', '--tracepoints',
+                         action='store_true',
+                         default=False,
+                         dest='tracepoints',
+                         help='retrieve statistics from tracepoints',
+                         )
+    optparser.add_option('-d', '--debugfs',
+                         action='store_true',
+                         default=False,
+                         dest='debugfs',
+                         help='retrieve statistics from debugfs',
+                         )
+    optparser.add_option('-f', '--fields',
+                         action='store',
+                         default=None,
+                         dest='fields',
+                         help='fields to display (regex)',
+                         )
+    optparser.add_option('-p', '--pid',
+                        action='store',
+                        default=0,
+                        type=int,
+                        dest='pid',
+                        help='restrict statistics to pid',
+                        )
+    (options, _) = optparser.parse_args(sys.argv)
+    return options
+
+def get_providers(options):
+    """Returns a list of data providers depending on the passed options."""
+    providers = []
+
+    if options.tracepoints:
+        providers.append(TracepointProvider())
+    if options.debugfs:
+        providers.append(DebugfsProvider())
+    if len(providers) == 0:
+        providers.append(TracepointProvider())
+
+    return providers
+
+def check_access(options):
+    """Exits if the current user can't access all needed directories."""
+    if not os.path.exists('/sys/kernel/debug'):
+        sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.')
+        sys.exit(1)
+
+    if not os.path.exists(PATH_DEBUGFS_KVM):
+        sys.stderr.write("Please make sure, that debugfs is mounted and "
+                         "readable by the current user:\n"
+                         "('mount -t debugfs debugfs /sys/kernel/debug')\n"
+                         "Also ensure, that the kvm modules are loaded.\n")
+        sys.exit(1)
+
+    if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints
+                                                     or not options.debugfs):
+        sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
+                         "when using the option -t (default).\n"
+                         "If it is enabled, make {0} readable by the "
+                         "current user.\n"
+                         .format(PATH_DEBUGFS_TRACING))
+        if options.tracepoints:
+            sys.exit(1)
+
+        sys.stderr.write("Falling back to debugfs statistics!\n")
+        options.debugfs = True
+        sleep(5)
+
+    return options
+
+def main():
+    options = get_options()
+    options = check_access(options)
+
+    if (options.pid > 0 and
+        not os.path.isdir(os.path.join('/proc/',
+                                       str(options.pid)))):
+        sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
+        sys.exit('Specified pid does not exist.')
+
+    providers = get_providers(options)
+    stats = Stats(providers, options.pid, fields=options.fields)
+
+    if options.log:
+        log(stats)
+    elif not options.once:
+        with Tui(stats) as tui:
+            tui.show_stats()
+    else:
+        batch(stats)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
new file mode 100644 (file)
index 0000000..b92a153
--- /dev/null
@@ -0,0 +1,63 @@
+kvm_stat(1)
+===========
+
+NAME
+----
+kvm_stat - Report KVM kernel module event counters
+
+SYNOPSIS
+--------
+[verse]
+'kvm_stat' [OPTION]...
+
+DESCRIPTION
+-----------
+kvm_stat prints counts of KVM kernel module trace events.  These events signify
+state transitions such as guest mode entry and exit.
+
+This tool is useful for observing guest behavior from the host perspective.
+Often conclusions about performance or buggy behavior can be drawn from the
+output.
+
+The set of KVM kernel module trace events may be specific to the kernel version
+or architecture.  It is best to check the KVM kernel module source code for the
+meaning of events.
+
+OPTIONS
+-------
+-1::
+--once::
+--batch::
+       run in batch mode for one second
+
+-l::
+--log::
+       run in logging mode (like vmstat)
+
+-t::
+--tracepoints::
+       retrieve statistics from tracepoints
+
+-d::
+--debugfs::
+       retrieve statistics from debugfs
+
+-p<pid>::
+--pid=<pid>::
+       limit statistics to one virtual machine (pid)
+
+-f<fields>::
+--fields=<fields>::
+       fields to display (regex)
+
+-h::
+--help::
+       show help message
+
+SEE ALSO
+--------
+'perf'(1), 'trace-cmd'(1)
+
+AUTHOR
+------
+Stefan Hajnoczi <stefanha@redhat.com>
index 6765c7e949f35ca3833265a33719f8c6e5baee94..f094f3c4ed84001557503f6cdd8ed33065c86611 100644 (file)
@@ -30,6 +30,10 @@ INCLUDES := -I$(srctree)/tools/include
 CFLAGS   += -Wall -Werror $(EXTRA_WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
 LDFLAGS  += -lelf $(LIBSUBCMD)
 
+# Allow old libelf to be used:
+elfshdr := $(shell echo '\#include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)
+CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
+
 AWK = awk
 export srctree OUTPUT CFLAGS ARCH AWK
 include $(srctree)/tools/build/Makefile.include
index 7f3e00a2f9078dca0baf0c67b31106d08c2f2a70..aa1ff6596684f9304d0dd4bd3165f819b4dcdaf7 100644 (file)
 #include <linux/list.h>
 #include <linux/hashtable.h>
 
+#ifdef LIBELF_USE_DEPRECATED
+# define elf_getshdrnum    elf_getshnum
+# define elf_getshdrstrndx elf_getshstrndx
+#endif
+
 struct section {
        struct list_head list;
        GElf_Shdr sh;
index ebaf849e30efd15bce04edb37252fb4aed66441f..9cbddc290affdb2ad67c8c76b676e8932513d02f 100644 (file)
@@ -103,12 +103,13 @@ OPTIONS
 
        If --branch-stack option is used, following sort keys are also
        available:
-       dso_from, dso_to, symbol_from, symbol_to, mispredict.
 
        - dso_from: name of library or module branched from
        - dso_to: name of library or module branched to
        - symbol_from: name of function branched from
        - symbol_to: name of function branched to
+       - srcline_from: source file and line branched from
+       - srcline_to: source file and line branched to
        - mispredict: "N" for predicted branch, "Y" for mispredicted branch
        - in_tx: branch in TSX transaction
        - abort: TSX transaction abort.
@@ -248,7 +249,7 @@ OPTIONS
        Note that when using the --itrace option the synthesized callchain size
        will override this value if the synthesized callchain size is bigger.
 
-       Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+       Default: 127
 
 -G::
 --inverted::
index a856a1095893cab0a0d3f7fdff39820090c99422..4fc44c75263fdb803315394e8353d77063f69e6a 100644 (file)
@@ -267,7 +267,7 @@ include::itrace.txt[]
         Note that when using the --itrace option the synthesized callchain size
         will override this value if the synthesized callchain size is bigger.
 
-        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+        Default: 127
 
 --ns::
        Use 9 decimal places when displaying time (i.e. show the nanoseconds)
index 6afe20121bc06d671931a3d22d6eeca2ca35c0a6..1ab0782369b1faa9bbe6758ed0e3439c414eb7e1 100644 (file)
@@ -143,7 +143,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
         Implies '--call-graph dwarf' when --call-graph not present on the
         command line, on systems where DWARF unwinding was built in.
 
-        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+        Default: /proc/sys/kernel/perf_event_max_stack when present for
+                 live sessions (without --input/-i), 127 otherwise.
 
 --min-stack::
         Set the stack depth limit when parsing the callchain, anything
index 814158393656ca5348e6272ebc30f9449e0edfce..25c81734a9505604d8056ef897dd64975998ab66 100644 (file)
@@ -324,8 +324,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
                    "Skip symbols that cannot be annotated"),
        OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                  "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
                    "Interleave source code with assembly code (default)"),
        OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
index 632efc6b79a07e20c4b25f9757bcd9c089050270..d75bded21fe0ce73ef9aea745ff565c9ba61a204 100644 (file)
@@ -119,8 +119,8 @@ static int build_id_cache__add_kcore(const char *filename, bool force)
        if (build_id_cache__kcore_buildid(from_dir, sbuildid) < 0)
                return -1;
 
-       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s",
-                 buildid_dir, sbuildid);
+       scnprintf(to_dir, sizeof(to_dir), "%s/%s/%s",
+                 buildid_dir, DSO__NAME_KCORE, sbuildid);
 
        if (!force &&
            !build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) {
@@ -131,8 +131,8 @@ static int build_id_cache__add_kcore(const char *filename, bool force)
        if (build_id_cache__kcore_dir(dir, sizeof(dir)))
                return -1;
 
-       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s",
-                 buildid_dir, sbuildid, dir);
+       scnprintf(to_dir, sizeof(to_dir), "%s/%s/%s/%s",
+                 buildid_dir, DSO__NAME_KCORE, sbuildid, dir);
 
        if (mkdir_p(to_dir, 0755))
                return -1;
index 9ce354f469dce9e96d078ae5c352732cec59178c..f7645a42708eb2223069b3555df0a325c2c635d8 100644 (file)
@@ -812,8 +812,9 @@ static const struct option options[] = {
        OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator",
                   "separator for columns, no spaces will be added between "
                   "columns '.' is reserved."),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_UINTEGER('o', "order", &sort_compute, "Specify compute sorting."),
        OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
                     "How to display percentage of filtered entries", parse_filter_percentage),
index f3679c44d3f3d4b7c51bbb627375a315099b974d..dc3fcb597e4c10cf091dcad73a53776387baab16 100644 (file)
@@ -40,6 +40,7 @@
 #include <unistd.h>
 #include <sched.h>
 #include <sys/mman.h>
+#include <asm/bug.h>
 
 
 struct record {
@@ -82,27 +83,87 @@ static int process_synthesized_event(struct perf_tool *tool,
        return record__write(rec, event, event->header.size);
 }
 
+static int
+backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end)
+{
+       struct perf_event_header *pheader;
+       u64 evt_head = head;
+       int size = mask + 1;
+
+       pr_debug2("backward_rb_find_range: buf=%p, head=%"PRIx64"\n", buf, head);
+       pheader = (struct perf_event_header *)(buf + (head & mask));
+       *start = head;  /* the reported range always begins at head */
+       while (true) {
+               if (evt_head - head >= (unsigned int)size) {
+                       pr_debug("Finished reading backward ring buffer: rewind\n");
+                       if (evt_head - head > (unsigned int)size)
+                               evt_head -= pheader->size;      /* overshot: drop last record */
+                       *end = evt_head;
+                       return 0;
+               }
+
+               pheader = (struct perf_event_header *)(buf + (evt_head & mask));
+
+               if (pheader->size == 0) {       /* zero-sized header ends the walk */
+                       pr_debug("Finished reading backward ring buffer: get start\n");
+                       *end = evt_head;
+                       return 0;
+               }
+
+               evt_head += pheader->size;
+               pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
+       }
+       WARN_ONCE(1, "Shouldn't get here\n");   /* the loop above only exits via return */
+       return -1;
+}
+
+static int
+rb_find_range(struct perf_evlist *evlist,
+             void *data, int mask, u64 head, u64 old,
+             u64 *start, u64 *end)
+{
+       if (!evlist->backward) {        /* not backward: range is simply old..head */
+               *start = old;
+               *end = head;
+               return 0;
+       }
+
+       return backward_rb_find_range(data, mask, head, start, end);    /* backward: scan from head */
+}
+
 static int record__mmap_read(struct record *rec, int idx)
 {
        struct perf_mmap *md = &rec->evlist->mmap[idx];
        u64 head = perf_mmap__read_head(md);
        u64 old = md->prev;
+       u64 end = head, start = old;
        unsigned char *data = md->base + page_size;
        unsigned long size;
        void *buf;
        int rc = 0;
 
-       if (old == head)
+       if (rb_find_range(rec->evlist, data, md->mask, head,
+                         old, &start, &end))
+               return -1;
+
+       if (start == end)
                return 0;
 
        rec->samples++;
 
-       size = head - old;
+       size = end - start;
+       if (size > (unsigned long)(md->mask) + 1) {
+               WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
+
+               md->prev = head;
+               perf_evlist__mmap_consume(rec->evlist, idx);
+               return 0;
+       }
 
-       if ((old & md->mask) + size != (head & md->mask)) {
-               buf = &data[old & md->mask];
-               size = md->mask + 1 - (old & md->mask);
-               old += size;
+       if ((start & md->mask) + size != (end & md->mask)) {
+               buf = &data[start & md->mask];
+               size = md->mask + 1 - (start & md->mask);
+               start += size;
 
                if (record__write(rec, buf, size) < 0) {
                        rc = -1;
@@ -110,16 +171,16 @@ static int record__mmap_read(struct record *rec, int idx)
                }
        }
 
-       buf = &data[old & md->mask];
-       size = head - old;
-       old += size;
+       buf = &data[start & md->mask];
+       size = end - start;
+       start += size;
 
        if (record__write(rec, buf, size) < 0) {
                rc = -1;
                goto out;
        }
 
-       md->prev = old;
+       md->prev = head;
        perf_evlist__mmap_consume(rec->evlist, idx);
 out:
        return rc;
index 87d40e3c4078ee99740e4563ebff885792ec9aff..a87cb338bdf14b2d49c19b840fb99b3265e68997 100644 (file)
@@ -691,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                        .ordered_events  = true,
                        .ordering_requires_timestamps = true,
                },
-               .max_stack               = sysctl_perf_event_max_stack,
+               .max_stack               = PERF_MAX_STACK_DEPTH,
                .pretty_printing_style   = "normal",
                .socket_filter           = -1,
        };
@@ -770,8 +770,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                   "columns '.' is reserved."),
        OPT_BOOLEAN('U', "hide-unresolved", &symbol_conf.hide_unresolved,
                    "Only display entries resolved to a symbol"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
                   "list of cpus to profile"),
        OPT_BOOLEAN('I', "show-info", &report.show_full_info,
index efca81679bb314624b88d024c52b63f2fc54729c..e3ce2f34d3ad5276cc8f10d78b6590c41b56c179 100644 (file)
@@ -2010,8 +2010,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                   "file", "kallsyms pathname"),
        OPT_BOOLEAN('G', "hide-call-graph", &no_callchain,
                    "When printing symbols do not display call chain"),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_CALLBACK('F', "fields", NULL, "str",
                     "comma separated output fields prepend with 'type:'. "
                     "Valid types: hw,sw,trace,raw. "
@@ -2067,8 +2068,6 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                NULL
        };
 
-       scripting_max_stack = sysctl_perf_event_max_stack;
-
        setup_scripting();
 
        argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,
index e459b685a4e999817ff06e01994b3f30f233c528..ee7ada78d86f81c5f788feed43f42ec49e33bb96 100644 (file)
@@ -66,6 +66,7 @@
 #include <stdlib.h>
 #include <sys/prctl.h>
 #include <locale.h>
+#include <math.h>
 
 #define DEFAULT_SEPARATOR      " "
 #define CNTR_NOT_SUPPORTED     "<not supported>"
@@ -991,12 +992,12 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
        const char *fmt;
 
        if (csv_output) {
-               fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
+               fmt = floor(sc) != sc ?  "%.2f%s" : "%.0f%s";
        } else {
                if (big_num)
-                       fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s";
+                       fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s";
                else
-                       fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s";
+                       fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s";
        }
 
        aggr_printout(evsel, id, nr);
@@ -1909,6 +1910,9 @@ static int add_default_attributes(void)
        }
 
        if (!evsel_list->nr_entries) {
+               if (target__has_cpu(&target))
+                       default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
+
                if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
                        return -1;
                if (pmu_have_event("cpu", "stalled-cycles-frontend")) {
@@ -2000,7 +2004,7 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
                                    union perf_event *event,
                                    struct perf_session *session)
 {
-       struct stat_round_event *round = &event->stat_round;
+       struct stat_round_event *stat_round = &event->stat_round;
        struct perf_evsel *counter;
        struct timespec tsh, *ts = NULL;
        const char **argv = session->header.env.cmdline_argv;
@@ -2009,12 +2013,12 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
        evlist__for_each(evsel_list, counter)
                perf_stat_process_counter(&stat_config, counter);
 
-       if (round->type == PERF_STAT_ROUND_TYPE__FINAL)
-               update_stats(&walltime_nsecs_stats, round->time);
+       if (stat_round->type == PERF_STAT_ROUND_TYPE__FINAL)
+               update_stats(&walltime_nsecs_stats, stat_round->time);
 
-       if (stat_config.interval && round->time) {
-               tsh.tv_sec  = round->time / NSECS_PER_SEC;
-               tsh.tv_nsec = round->time % NSECS_PER_SEC;
+       if (stat_config.interval && stat_round->time) {
+               tsh.tv_sec  = stat_round->time / NSECS_PER_SEC;
+               tsh.tv_nsec = stat_round->time % NSECS_PER_SEC;
                ts = &tsh;
        }
 
index 40cc9bb3506c6d1b1625f2e37da795be36809ce3..733a55422d030037ab1c84bbf4b351c45074cd2c 100644 (file)
@@ -1945,8 +1945,9 @@ int cmd_timechart(int argc, const char **argv,
        OPT_CALLBACK('p', "process", NULL, "process",
                      "process selector. Pass a pid or process name.",
                       parse_process),
-       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-                   "Look for files with symbols relative to this directory"),
+       OPT_CALLBACK(0, "symfs", NULL, "directory",
+                    "Look for files with symbols relative to this directory",
+                    symbol__config_symfs),
        OPT_INTEGER('n', "proc-num", &tchart.proc_num,
                    "min. number of tasks to print"),
        OPT_BOOLEAN('t', "topology", &tchart.topology,
index 1793da5856762afbcc1ad2313d3e69e557089606..2a6cc254ad0c3e09f65f49b5b0169890a5579ac0 100644 (file)
@@ -732,7 +732,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
        if (machine__resolve(machine, &al, sample) < 0)
                return;
 
-       if (!top->kptr_restrict_warned &&
+       if (!machine->kptr_restrict_warned &&
            symbol_conf.kptr_restrict &&
            al.cpumode == PERF_RECORD_MISC_KERNEL) {
                ui__warning(
@@ -743,7 +743,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
                          " modules" : "");
                if (use_browser <= 0)
                        sleep(5);
-               top->kptr_restrict_warned = true;
+               machine->kptr_restrict_warned = true;
        }
 
        if (al.sym == NULL) {
@@ -759,7 +759,7 @@ static void perf_event__process_sample(struct perf_tool *tool,
                 * --hide-kernel-symbols, even if the user specifies an
                 * invalid --vmlinux ;-)
                 */
-               if (!top->kptr_restrict_warned && !top->vmlinux_warned &&
+               if (!machine->kptr_restrict_warned && !top->vmlinux_warned &&
                    al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
                    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
                        if (symbol_conf.vmlinux_name) {
index 6e5c325148e488ec61a880196db9a7294d20156c..5c50fe70d6b37379bd8b89b05d9df7899b9fc27d 100644 (file)
@@ -576,84 +576,54 @@ static struct syscall_fmt {
        bool       hexret;
 } syscall_fmts[] = {
        { .name     = "access",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
-                            [1] = SCA_ACCMODE,  /* mode */ }, },
+         .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
-       { .name     = "chdir",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "chmod",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "chroot",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+       { .name     = "chdir",      .errmsg = true, },
+       { .name     = "chmod",      .errmsg = true, },
+       { .name     = "chroot",     .errmsg = true, },
        { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
        { .name     = "clone",      .errpid = true, },
        { .name     = "close",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
        { .name     = "connect",    .errmsg = true, },
-       { .name     = "creat",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "dup",        .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "dup2",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "dup3",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "creat",      .errmsg = true, },
+       { .name     = "dup",        .errmsg = true, },
+       { .name     = "dup2",       .errmsg = true, },
+       { .name     = "dup3",       .errmsg = true, },
        { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
        { .name     = "eventfd2",   .errmsg = true,
          .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
-       { .name     = "faccessat",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fadvise64",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fallocate",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fchdir",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fchmod",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "faccessat",  .errmsg = true, },
+       { .name     = "fadvise64",  .errmsg = true, },
+       { .name     = "fallocate",  .errmsg = true, },
+       { .name     = "fchdir",     .errmsg = true, },
+       { .name     = "fchmod",     .errmsg = true, },
        { .name     = "fchmodat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fchown",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "fchown",     .errmsg = true, },
        { .name     = "fchownat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "fcntl",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [1] = SCA_STRARRAY, /* cmd */ },
+         .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
          .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
-       { .name     = "fdatasync",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "fdatasync",  .errmsg = true, },
        { .name     = "flock",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [1] = SCA_FLOCK, /* cmd */ }, },
-       { .name     = "fsetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "fstatfs",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "fsync",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "ftruncate", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
+       { .name     = "fsetxattr",  .errmsg = true, },
+       { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
+       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
+       { .name     = "fstatfs",    .errmsg = true, },
+       { .name     = "fsync",    .errmsg = true, },
+       { .name     = "ftruncate", .errmsg = true, },
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
        { .name     = "futimesat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "getdents",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "getdents64", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "getdents",   .errmsg = true, },
+       { .name     = "getdents64", .errmsg = true, },
        { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "getpid",     .errpid = true, },
        { .name     = "getpgid",    .errpid = true, },
@@ -661,12 +631,10 @@ static struct syscall_fmt {
        { .name     = "getrandom",  .errmsg = true,
          .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
        { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-       { .name     = "getxattr",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "inotify_add_watch",          .errmsg = true,
-         .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "getxattr",   .errmsg = true, },
+       { .name     = "inotify_add_watch",          .errmsg = true, },
        { .name     = "ioctl",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
+         .arg_scnprintf = {
 #if defined(__i386__) || defined(__x86_64__)
 /*
  * FIXME: Make this available to all arches.
@@ -680,41 +648,28 @@ static struct syscall_fmt {
        { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
        { .name     = "kill",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-       { .name     = "lchown",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "lgetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "lchown",    .errmsg = true, },
+       { .name     = "lgetxattr",  .errmsg = true, },
        { .name     = "linkat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-       { .name     = "listxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "llistxattr", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "lremovexattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "listxattr",  .errmsg = true, },
+       { .name     = "llistxattr", .errmsg = true, },
+       { .name     = "lremovexattr",  .errmsg = true, },
        { .name     = "lseek",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_STRARRAY, /* whence */ },
+         .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
          .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
-       { .name     = "lsetxattr",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "lstat",      .errmsg = true, .alias = "newlstat",
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "lsxattr",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "lsetxattr",  .errmsg = true, },
+       { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
+       { .name     = "lsxattr",    .errmsg = true, },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
-       { .name     = "mkdir",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "mkdir",    .errmsg = true, },
        { .name     = "mkdirat",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "mknod",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+       { .name     = "mknod",      .errmsg = true, },
        { .name     = "mknodat",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
        { .name     = "mlock",      .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mlockall",   .errmsg = true,
@@ -722,8 +677,7 @@ static struct syscall_fmt {
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
-                            [3] = SCA_MMAP_FLAGS, /* flags */
-                            [4] = SCA_FD,        /* fd */ }, },
+                            [3] = SCA_MMAP_FLAGS, /* flags */ }, },
        { .name     = "mprotect",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* start */
                             [2] = SCA_MMAP_PROT, /* prot */ }, },
@@ -740,17 +694,14 @@ static struct syscall_fmt {
        { .name     = "name_to_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "newfstatat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
        { .name     = "open",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
-                            [1] = SCA_OPEN_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* filename */
                             [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "perf_event_open", .errmsg = true,
          .arg_scnprintf = { [2] = SCA_INT, /* cpu */
@@ -760,39 +711,26 @@ static struct syscall_fmt {
          .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
-       { .name     = "pread",      .errmsg = true, .alias = "pread64",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "preadv",     .errmsg = true, .alias = "pread",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "pread",      .errmsg = true, .alias = "pread64", },
+       { .name     = "preadv",     .errmsg = true, .alias = "pread", },
        { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
-       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "pwritev",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "read",       .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "readlink",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
+       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
+       { .name     = "pwritev",    .errmsg = true, },
+       { .name     = "read",       .errmsg = true, },
+       { .name     = "readlink",   .errmsg = true, },
        { .name     = "readlinkat", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "readv",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "readv",      .errmsg = true, },
        { .name     = "recvfrom",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmmsg",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "recvmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_MSG_FLAGS, /* flags */ }, },
-       { .name     = "removexattr", .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "removexattr", .errmsg = true, },
        { .name     = "renameat",   .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-       { .name     = "rmdir",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "rmdir",    .errmsg = true, },
        { .name     = "rt_sigaction", .errmsg = true,
          .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
@@ -807,22 +745,17 @@ static struct syscall_fmt {
                             [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
        { .name     = "sendmmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendmsg",    .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [2] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "sendto",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */
-                            [3] = SCA_MSG_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
        { .name     = "set_tid_address", .errpid = true, },
        { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
        { .name     = "setpgid",    .errmsg = true, },
        { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-       { .name     = "setxattr",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "shutdown",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "setxattr",   .errmsg = true, },
+       { .name     = "shutdown",   .errmsg = true, },
        { .name     = "socket",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
@@ -831,10 +764,8 @@ static struct syscall_fmt {
          .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
                             [1] = SCA_SK_TYPE, /* type */ },
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
-       { .name     = "stat",       .errmsg = true, .alias = "newstat",
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "statfs",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
+       { .name     = "stat",       .errmsg = true, .alias = "newstat", },
+       { .name     = "statfs",     .errmsg = true, },
        { .name     = "swapoff",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
        { .name     = "swapon",     .errmsg = true,
@@ -845,29 +776,21 @@ static struct syscall_fmt {
          .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "tkill",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-       { .name     = "truncate",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
+       { .name     = "truncate",   .errmsg = true, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
        { .name     = "unlinkat",   .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-                            [1] = SCA_FILENAME, /* pathname */ }, },
-       { .name     = "utime",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "utime",  .errmsg = true, },
        { .name     = "utimensat",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
-                            [1] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "utimes",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
-       { .name     = "vmsplice",  .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
+       { .name     = "utimes",  .errmsg = true, },
+       { .name     = "vmsplice",  .errmsg = true, },
        { .name     = "wait4",      .errpid = true,
          .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
        { .name     = "waitid",     .errpid = true,
          .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
-       { .name     = "write",      .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
-       { .name     = "writev",     .errmsg = true,
-         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+       { .name     = "write",      .errmsg = true, },
+       { .name     = "writev",     .errmsg = true, },
 };
 
 static int syscall_fmt__cmp(const void *name, const void *fmtp)
@@ -1160,6 +1083,24 @@ static int trace__tool_process(struct perf_tool *tool,
        return trace__process_event(trace, machine, event, sample);
 }
 
+static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
+{
+       struct machine *machine = vmachine;
+
+       if (machine->kptr_restrict_warned) /* warning was already printed */
+               return NULL;
+
+       if (symbol_conf.kptr_restrict) { /* warn a single time, then give up */
+               pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
+                          "Check /proc/sys/kernel/kptr_restrict.\n\n"
+                          "Kernel samples will not be resolved.\n");
+               machine->kptr_restrict_warned = true; /* later calls return early */
+               return NULL;
+       }
+
+       return machine__resolve_kernel_addr(vmachine, addrp, modp); /* not restricted: delegate */
+}
+
 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 {
        int err = symbol__init(NULL);
@@ -1171,7 +1112,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
        if (trace->host == NULL)
                return -ENOMEM;
 
-       if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
+       if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
                return -errno;
 
        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
@@ -1186,7 +1127,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 static int syscall__set_arg_fmts(struct syscall *sc)
 {
        struct format_field *field;
-       int idx = 0;
+       int idx = 0, len;
 
        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
@@ -1198,12 +1139,31 @@ static int syscall__set_arg_fmts(struct syscall *sc)
        for (field = sc->args; field; field = field->next) {
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
+               else if (strcmp(field->type, "const char *") == 0 &&
+                        (strcmp(field->name, "filename") == 0 ||
+                         strcmp(field->name, "path") == 0 ||
+                         strcmp(field->name, "pathname") == 0))
+                       sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
+               else if ((strcmp(field->type, "int") == 0 ||
+                         strcmp(field->type, "unsigned int") == 0 ||
+                         strcmp(field->type, "long") == 0) &&
+                        (len = strlen(field->name)) >= 2 &&
+                        strcmp(field->name + len - 2, "fd") == 0) {
+                       /*
+                        * /sys/kernel/tracing/events/syscalls/sys_enter*
+                        * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
+                        * 65 int
+                        * 23 unsigned int
+                        * 7 unsigned long
+                        */
+                       sc->arg_scnprintf[idx] = SCA_FD;
+               }
                ++idx;
        }
 
@@ -1534,7 +1494,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
        if (sc->is_exit) {
                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
                        trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
-                       fprintf(trace->output, "%-70s\n", ttrace->entry_str);
+                       fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
                ttrace->entry_pending = true;
@@ -2887,12 +2847,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                mmap_pages_user_set = false;
 
        if (trace.max_stack == UINT_MAX) {
-               trace.max_stack = sysctl_perf_event_max_stack;
+               trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
                max_stack_user_set = false;
        }
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
-       if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled)
+       if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
                record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
 #endif
 
index 797000842d401640021d4b0035d91a5b94a996be..15982cee5ef3f3b85b51f470777df5a2ac5ee3d1 100644 (file)
@@ -549,6 +549,9 @@ int main(int argc, const char **argv)
        if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
                sysctl_perf_event_max_stack = value;
 
+       if (sysctl__read_int("kernel/perf_event_max_contexts_per_stack", &value) == 0)
+               sysctl_perf_event_max_contexts_per_stack = value;
+
        cmd = extract_argv0_path(argv[0]);
        if (!cmd)
                cmd = "perf-help";
index 4db73d5a0dbc6bf421c57d50ff0db7560caff03f..7e5a1e8874cee4a263d2577f55fd3ddf7dde3a95 100644 (file)
@@ -354,9 +354,6 @@ static struct ins_ops nop_ops = {
        .scnprintf = nop__scnprintf,
 };
 
-/*
- * Must be sorted by name!
- */
 static struct ins instructions[] = {
        { .name = "add",   .ops  = &mov_ops, },
        { .name = "addl",  .ops  = &mov_ops, },
@@ -372,8 +369,8 @@ static struct ins instructions[] = {
        { .name = "bgt",   .ops  = &jump_ops, },
        { .name = "bhi",   .ops  = &jump_ops, },
        { .name = "bl",    .ops  = &call_ops, },
-       { .name = "blt",   .ops  = &jump_ops, },
        { .name = "bls",   .ops  = &jump_ops, },
+       { .name = "blt",   .ops  = &jump_ops, },
        { .name = "blx",   .ops  = &call_ops, },
        { .name = "bne",   .ops  = &jump_ops, },
 #endif
@@ -449,18 +446,39 @@ static struct ins instructions[] = {
        { .name = "xbeginq", .ops  = &jump_ops, },
 };
 
-static int ins__cmp(const void *name, const void *insp)
+static int ins__key_cmp(const void *name, const void *insp)
 {
        const struct ins *ins = insp;
 
        return strcmp(name, ins->name);
 }
 
+static int ins__cmp(const void *a, const void *b)
+{
+       const struct ins *ia = a;
+       const struct ins *ib = b;
+
+       return strcmp(ia->name, ib->name); /* element-vs-element, by name */
+}
+
+static void ins__sort(void)
+{
+       const int nmemb = ARRAY_SIZE(instructions);
+
+       qsort(instructions, nmemb, sizeof(struct ins), ins__cmp); /* in place, by name */
+}
+
 static struct ins *ins__find(const char *name)
 {
        const int nmemb = ARRAY_SIZE(instructions);
+       static bool sorted; /* has instructions[] been sorted yet? */
+
+       if (!sorted) { /* first call: sort so the bsearch() below is valid */
+               ins__sort();
+               sorted = true;
+       }
 
-       return bsearch(name, instructions, nmemb, sizeof(struct ins), ins__cmp);
+       return bsearch(name, instructions, nmemb, sizeof(struct ins), ins__key_cmp);
 }
 
 int symbol__annotate_init(struct map *map __maybe_unused, struct symbol *sym)
@@ -1122,7 +1140,7 @@ int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize)
        } else if (dso__is_kcore(dso)) {
                goto fallback;
        } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
-                  strstr(command, "[kernel.kallsyms]") ||
+                  strstr(command, DSO__NAME_KALLSYMS) ||
                   access(symfs_filename, R_OK)) {
                free(filename);
 fallback:
index bff425e1232cdcb99f362e1b7d5971e5d1121a23..67e5966503b21099688d3f90cf903b2d58d196df 100644 (file)
@@ -256,7 +256,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                size_t name_len;
                bool in_kernel = false;
 
-               if (!pos->hit)
+               if (!pos->hit && !dso__is_vdso(pos))
                        continue;
 
                if (dso__is_vdso(pos)) {
index 8d96c80cc67e629f0ef576ae24c1266e468df91f..c9a6dc173e74eb77f56673c6a609a5a92c788498 100644 (file)
@@ -298,8 +298,7 @@ static struct call_path *call_path_from_sample(struct db_export *dbe,
         */
        callchain_param.order = ORDER_CALLER;
        err = thread__resolve_callchain(thread, &callchain_cursor, evsel,
-                                       sample, NULL, NULL,
-                                       sysctl_perf_event_max_stack);
+                                       sample, NULL, NULL, PERF_MAX_STACK_DEPTH);
        if (err) {
                callchain_param.order = saved_order;
                return NULL;
index 3357479082ca95b9b6cfd4df5015a30b470214fe..5d286f5d7906798a6ffe1e5e91b8a52bf4bc1973 100644 (file)
@@ -7,6 +7,7 @@
 #include "auxtrace.h"
 #include "util.h"
 #include "debug.h"
+#include "vdso.h"
 
 char dso__symtab_origin(const struct dso *dso)
 {
@@ -62,9 +63,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
                }
                break;
        case DSO_BINARY_TYPE__BUILD_ID_CACHE:
-               /* skip the locally configured cache if a symfs is given */
-               if (symbol_conf.symfs[0] ||
-                   (dso__build_id_filename(dso, filename, size) == NULL))
+               if (dso__build_id_filename(dso, filename, size) == NULL)
                        ret = -1;
                break;
 
@@ -1169,7 +1168,7 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
        struct dso *pos;
 
        list_for_each_entry(pos, head, node) {
-               if (with_hits && !pos->hit)
+               if (with_hits && !pos->hit && !dso__is_vdso(pos))
                        continue;
                if (pos->has_build_id) {
                        have_build_id = true;
index c4bfe11479a0e0d7559ff941c63e96557400bce4..e82ba90cc96997b03042c79c180e2de34bec231f 100644 (file)
@@ -44,6 +44,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
        perf_evlist__set_maps(evlist, cpus, threads);
        fdarray__init(&evlist->pollfd, 64);
        evlist->workload.pid = -1;
+       evlist->backward = false;
 }
 
 struct perf_evlist *perf_evlist__new(void)
@@ -679,6 +680,33 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
        return NULL;
 }
 
+static int perf_evlist__set_paused(struct perf_evlist *evlist, bool value)
+{
+       int i;
+
+       for (i = 0; i < evlist->nr_mmaps; i++) {
+               int fd = evlist->mmap[i].fd;
+               int err;
+
+               if (fd < 0) /* negative fd: nothing to ioctl on, skip */
+                       continue;
+               err = ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, value ? 1 : 0);
+               if (err) /* stop at the first failure and hand it back */
+                       return err;
+       }
+       return 0;
+}
+
+int perf_evlist__pause(struct perf_evlist *evlist)
+{
+       return perf_evlist__set_paused(evlist, true);
+}
+
+int perf_evlist__resume(struct perf_evlist *evlist)
+{
+       return perf_evlist__set_paused(evlist, false);
+}
+
 /* When check_messup is true, 'end' must points to a good entry */
 static union perf_event *
 perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start,
@@ -881,6 +909,7 @@ static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
        if (evlist->mmap[idx].base != NULL) {
                munmap(evlist->mmap[idx].base, evlist->mmap_len);
                evlist->mmap[idx].base = NULL;
+               evlist->mmap[idx].fd = -1;
                atomic_set(&evlist->mmap[idx].refcnt, 0);
        }
        auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap);
@@ -901,10 +930,14 @@ void perf_evlist__munmap(struct perf_evlist *evlist)
 
 static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 {
+       int i;
+
        evlist->nr_mmaps = cpu_map__nr(evlist->cpus);
        if (cpu_map__empty(evlist->cpus))
                evlist->nr_mmaps = thread_map__nr(evlist->threads);
        evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
+       for (i = 0; i < evlist->nr_mmaps; i++)
+               evlist->mmap[i].fd = -1;
        return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
@@ -941,6 +974,7 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
                evlist->mmap[idx].base = NULL;
                return -1;
        }
+       evlist->mmap[idx].fd = fd;
 
        if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap,
                                &mp->auxtrace_mp, evlist->mmap[idx].base, fd))
index 85d1b59802e86feafe943f4d823a694776baef15..d740fb877ab6f22ac912c9fdc972fc39f1bedc6a 100644 (file)
@@ -28,6 +28,7 @@ struct record_opts;
 struct perf_mmap {
        void             *base;
        int              mask;
+       int              fd;
        atomic_t         refcnt;
        u64              prev;
        struct auxtrace_mmap auxtrace_mmap;
@@ -43,6 +44,7 @@ struct perf_evlist {
        bool             overwrite;
        bool             enabled;
        bool             has_user_cpus;
+       bool             backward;
        size_t           mmap_len;
        int              id_pos;
        int              is_pos;
@@ -135,6 +137,8 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
 
 void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
 
+int perf_evlist__pause(struct perf_evlist *evlist);
+int perf_evlist__resume(struct perf_evlist *evlist);
 int perf_evlist__open(struct perf_evlist *evlist);
 void perf_evlist__close(struct perf_evlist *evlist);
 
index 52c7d8884741c33b6f00ed938c934bdf900c57b3..5d7037ef7d3b7d43f0b38822e4de50131ab6b4d2 100644 (file)
@@ -37,6 +37,7 @@ static struct {
        bool clockid;
        bool clockid_wrong;
        bool lbr_flags;
+       bool write_backward;
 } perf_missing_features;
 
 static clockid_t clockid;
@@ -1376,6 +1377,8 @@ fallback_missing_features:
        if (perf_missing_features.lbr_flags)
                evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
                                     PERF_SAMPLE_BRANCH_NO_CYCLES);
+       if (perf_missing_features.write_backward)
+               evsel->attr.write_backward = false;
 retry_sample_id:
        if (perf_missing_features.sample_id_all)
                evsel->attr.sample_id_all = 0;
@@ -1438,6 +1441,12 @@ retry_open:
                                err = -EINVAL;
                                goto out_close;
                        }
+
+                       if (evsel->overwrite &&
+                           perf_missing_features.write_backward) {
+                               err = -EINVAL;
+                               goto out_close;
+                       }
                }
        }
 
@@ -1500,6 +1509,10 @@ try_fallback:
                          PERF_SAMPLE_BRANCH_NO_FLAGS))) {
                perf_missing_features.lbr_flags = true;
                goto fallback_missing_features;
+       } else if (!perf_missing_features.write_backward &&
+                       evsel->attr.write_backward) {
+               perf_missing_features.write_backward = true;
+               goto fallback_missing_features;
        }
 
 out_close:
index 8a644fef452c07356530824f1f3af151c797fcd3..c1f10159804ca93768337838fd0ce70aadb35d2c 100644 (file)
@@ -112,6 +112,7 @@ struct perf_evsel {
        bool                    tracking;
        bool                    per_pkg;
        bool                    precise_max;
+       bool                    overwrite;
        /* parse modifier helper */
        int                     exclude_GH;
        int                     nr_members;
index cfab531437c743c4c849d40fd45545fcb2fbcddc..d1f19e0012d44d907a65ac4d1ca281eab38e9b70 100644 (file)
@@ -117,6 +117,13 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
                        hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
                        hists__set_unres_dso_col_len(hists, HISTC_DSO_TO);
                }
+
+               if (h->branch_info->srcline_from)
+                       hists__new_col_len(hists, HISTC_SRCLINE_FROM,
+                                       strlen(h->branch_info->srcline_from));
+               if (h->branch_info->srcline_to)
+                       hists__new_col_len(hists, HISTC_SRCLINE_TO,
+                                       strlen(h->branch_info->srcline_to));
        }
 
        if (h->mem_info) {
@@ -1042,6 +1049,8 @@ void hist_entry__delete(struct hist_entry *he)
        if (he->branch_info) {
                map__zput(he->branch_info->from.map);
                map__zput(he->branch_info->to.map);
+               free_srcline(he->branch_info->srcline_from);
+               free_srcline(he->branch_info->srcline_to);
                zfree(&he->branch_info);
        }
 
index 0f84bfb42bb1378c3b21aabee90ed9cd5e44c02f..7b54ccf1b7370ab2336bd16a1c18a7c932b7e41d 100644 (file)
@@ -52,6 +52,8 @@ enum hist_column {
        HISTC_MEM_IADDR_SYMBOL,
        HISTC_TRANSACTION,
        HISTC_CYCLES,
+       HISTC_SRCLINE_FROM,
+       HISTC_SRCLINE_TO,
        HISTC_TRACE,
        HISTC_NR_COLS, /* Last entry */
 };
index f9644f79686c8cd3c5ff631e685338068d8f071a..b1772180c82078150ff31196e1862e0c76e1d086 100644 (file)
@@ -43,6 +43,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 
        machine->symbol_filter = NULL;
        machine->id_hdr_size = 0;
+       machine->kptr_restrict_warned = false;
        machine->comm_exec = false;
        machine->kernel_start = 0;
 
@@ -709,7 +710,7 @@ static struct dso *machine__get_kernel(struct machine *machine)
        if (machine__is_host(machine)) {
                vmlinux_name = symbol_conf.vmlinux_name;
                if (!vmlinux_name)
-                       vmlinux_name = "[kernel.kallsyms]";
+                       vmlinux_name = DSO__NAME_KALLSYMS;
 
                kernel = machine__findnew_kernel(machine, vmlinux_name,
                                                 "[kernel]", DSO_TYPE_KERNEL);
@@ -1135,10 +1136,10 @@ int machine__create_kernel_maps(struct machine *machine)
 {
        struct dso *kernel = machine__get_kernel(machine);
        const char *name;
-       u64 addr = machine__get_running_kernel_start(machine, &name);
+       u64 addr;
        int ret;
 
-       if (!addr || kernel == NULL)
+       if (kernel == NULL)
                return -1;
 
        ret = __machine__create_kernel_maps(machine, kernel);
@@ -1160,8 +1161,9 @@ int machine__create_kernel_maps(struct machine *machine)
         */
        map_groups__fixup_end(&machine->kmaps);
 
-       if (maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name,
-                                            addr)) {
+       addr = machine__get_running_kernel_start(machine, &name);
+       if (!addr) {
+       } else if (maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, name, addr)) {
                machine__destroy_kernel_maps(machine);
                return -1;
        }
@@ -1769,11 +1771,6 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
                 */
                int mix_chain_nr = i + 1 + lbr_nr + 1;
 
-               if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) {
-                       pr_warning("corrupted callchain. skipping...\n");
-                       return 0;
-               }
-
                for (j = 0; j < mix_chain_nr; j++) {
                        if (callchain_param.order == ORDER_CALLEE) {
                                if (j < i + 1)
@@ -1811,9 +1808,9 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 {
        struct branch_stack *branch = sample->branch_stack;
        struct ip_callchain *chain = sample->callchain;
-       int chain_nr = min(max_stack, (int)chain->nr);
+       int chain_nr = chain->nr;
        u8 cpumode = PERF_RECORD_MISC_USER;
-       int i, j, err;
+       int i, j, err, nr_entries;
        int skip_idx = -1;
        int first_call = 0;
 
@@ -1828,8 +1825,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
         * Based on DWARF debug information, some architectures skip
         * a callchain entry saved by the kernel.
         */
-       if (chain->nr < sysctl_perf_event_max_stack)
-               skip_idx = arch_skip_callchain_idx(thread, chain);
+       skip_idx = arch_skip_callchain_idx(thread, chain);
 
        /*
         * Add branches to call stack for easier browsing. This gives
@@ -1889,12 +1885,8 @@ static int thread__resolve_callchain_sample(struct thread *thread,
        }
 
 check_calls:
-       if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) {
-               pr_warning("corrupted callchain. skipping...\n");
-               return 0;
-       }
-
-       for (i = first_call; i < chain_nr; i++) {
+       for (i = first_call, nr_entries = 0;
+            i < chain_nr && nr_entries < max_stack; i++) {
                u64 ip;
 
                if (callchain_param.order == ORDER_CALLEE)
@@ -1908,6 +1900,9 @@ check_calls:
 #endif
                ip = chain->ips[j];
 
+               if (ip < PERF_CONTEXT_MAX)
+                       ++nr_entries;
+
                err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip);
 
                if (err)
index 83f46790c52f7e3008427577c52288ed5f15e86b..41ac9cfd416b52d5972d8c2f8b5275a62b22ce65 100644 (file)
@@ -28,6 +28,7 @@ struct machine {
        pid_t             pid;
        u16               id_hdr_size;
        bool              comm_exec;
+       bool              kptr_restrict_warned;
        char              *root_dir;
        struct rb_root    threads;
        pthread_rwlock_t  threads_lock;
index 62c7f6988e0e584cb16f6deab30a84b3b8672971..5d1eb1ccd96c3e0cf73449c97aaee9246d1f9619 100644 (file)
@@ -264,8 +264,7 @@ static SV *perl_process_callchain(struct perf_sample *sample,
                goto exit;
 
        if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-                                     sample, NULL, NULL,
-                                     sysctl_perf_event_max_stack) != 0) {
+                                     sample, NULL, NULL, scripting_max_stack) != 0) {
                pr_err("Failed to resolve callchain. Skipping\n");
                goto exit;
        }
index 20e69edd5006bc29abb0d8be20f0c176085b7502..c4e9bd70723c5b26df4499af5d32afdb86109c31 100644 (file)
@@ -353,6 +353,88 @@ struct sort_entry sort_srcline = {
        .se_width_idx   = HISTC_SRCLINE,
 };
 
+/* --sort srcline_from */
+
+static int64_t
+sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       if (!left->branch_info->srcline_from) { /* not resolved yet */
+               struct map *map = left->branch_info->from.map;
+               if (!map) /* no map to look the address up in */
+                       left->branch_info->srcline_from = SRCLINE_UNKNOWN;
+               else
+                       left->branch_info->srcline_from = get_srcline(map->dso,
+                                          map__rip_2objdump(map,
+                                                            left->branch_info->from.al_addr),
+                                                        left->branch_info->from.sym, true);
+       }
+       if (!right->branch_info->srcline_from) { /* same fill-in for right */
+               struct map *map = right->branch_info->from.map;
+               if (!map)
+                       right->branch_info->srcline_from = SRCLINE_UNKNOWN;
+               else
+                       right->branch_info->srcline_from = get_srcline(map->dso,
+                                            map__rip_2objdump(map,
+                                                              right->branch_info->from.al_addr),
+                                                    right->branch_info->from.sym, true);
+       }
+       return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
+}
+
+static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
+                                       size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_from);
+}
+
+struct sort_entry sort_srcline_from = {
+       .se_header      = "From Source:Line",
+       .se_cmp         = sort__srcline_from_cmp,
+       .se_snprintf    = hist_entry__srcline_from_snprintf,
+       .se_width_idx   = HISTC_SRCLINE_FROM,
+};
+
+/* --sort srcline_to */
+
+static int64_t
+sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       /* Fill in a missing srcline_to; map, address and symbol all come from ->to. */
+       if (!left->branch_info->srcline_to) {
+               struct map *map = left->branch_info->to.map;
+               if (!map)
+                       left->branch_info->srcline_to = SRCLINE_UNKNOWN;
+               else
+                       left->branch_info->srcline_to = get_srcline(map->dso,
+                                          map__rip_2objdump(map, left->branch_info->to.al_addr),
+                                          left->branch_info->to.sym, true);
+       }
+       if (!right->branch_info->srcline_to) {
+               struct map *map = right->branch_info->to.map;
+               if (!map)
+                       right->branch_info->srcline_to = SRCLINE_UNKNOWN;
+               else
+                       right->branch_info->srcline_to = get_srcline(map->dso,
+                                          map__rip_2objdump(map, right->branch_info->to.al_addr),
+                                          right->branch_info->to.sym, true);
+       }
+       /* Note the operand order: right is compared against left. */
+       return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
+}
+
+static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
+                                       size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_to);
+}
+
+struct sort_entry sort_srcline_to = {
+       .se_header      = "To Source:Line",
+       .se_cmp         = sort__srcline_to_cmp,
+       .se_snprintf    = hist_entry__srcline_to_snprintf,
+       .se_width_idx   = HISTC_SRCLINE_TO,
+};
+
 /* --sort srcfile */
 
 static char no_srcfile[1];
@@ -1347,6 +1429,8 @@ static struct sort_dimension bstack_sort_dimensions[] = {
        DIM(SORT_IN_TX, "in_tx", sort_in_tx),
        DIM(SORT_ABORT, "abort", sort_abort),
        DIM(SORT_CYCLES, "cycles", sort_cycles),
+       DIM(SORT_SRCLINE_FROM, "srcline_from", sort_srcline_from),
+       DIM(SORT_SRCLINE_TO, "srcline_to", sort_srcline_to),
 };
 
 #undef DIM
index 42927f448bcbc2f5ae39b9cd416948387d898825..ebb59cacd092fa919e45a447a6e3e3ad70a0c880 100644 (file)
@@ -215,6 +215,8 @@ enum sort_type {
        SORT_ABORT,
        SORT_IN_TX,
        SORT_CYCLES,
+       SORT_SRCLINE_FROM,
+       SORT_SRCLINE_TO,
 
        /* memory mode specific sort keys */
        __SORT_MEMORY_MODE,
index fdb71961143e641dce7c45175630f344f8edadd7..aa9efe08762b9620c9d477384e524de81f8b137a 100644 (file)
@@ -94,7 +94,8 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
 {
        int ctx = evsel_context(counter);
 
-       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
+           perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
                update_stats(&runtime_nsecs_stats[cpu], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
                update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
@@ -188,7 +189,7 @@ static void print_stalled_cycles_backend(int cpu,
 
        color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
-       out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio);
+       out->print_metric(out->ctx, color, "%7.2f%%", "backend cycles idle", ratio);
 }
 
 static void print_branch_misses(int cpu,
@@ -444,7 +445,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
                        ratio = total / avg;
 
                print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio);
-       } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) {
+       } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) ||
+                  perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK)) {
                if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0)
                        print_metric(ctxp, NULL, "%8.3f", "CPUs utilized",
                                     avg / ratio);
index 7fb33304fb4ea66f1103c351b696a1e2b5e8bae8..20f9cb32b703cd10330fa80ed328a16126a59a04 100644 (file)
@@ -1662,8 +1662,8 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map)
 
        build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
 
-       scnprintf(path, sizeof(path), "%s/[kernel.kcore]/%s", buildid_dir,
-                 sbuild_id);
+       scnprintf(path, sizeof(path), "%s/%s/%s", buildid_dir,
+                 DSO__NAME_KCORE, sbuild_id);
 
        /* Use /proc/kallsyms if possible */
        if (is_host) {
@@ -1699,8 +1699,8 @@ static char *dso__find_kallsyms(struct dso *dso, struct map *map)
        if (!find_matching_kcore(map, path, sizeof(path)))
                return strdup(path);
 
-       scnprintf(path, sizeof(path), "%s/[kernel.kallsyms]/%s",
-                 buildid_dir, sbuild_id);
+       scnprintf(path, sizeof(path), "%s/%s/%s",
+                 buildid_dir, DSO__NAME_KALLSYMS, sbuild_id);
 
        if (access(path, F_OK)) {
                pr_err("No kallsyms or vmlinux with build-id %s was found\n",
@@ -1769,7 +1769,7 @@ do_kallsyms:
 
        if (err > 0 && !dso__is_kcore(dso)) {
                dso->binary_type = DSO_BINARY_TYPE__KALLSYMS;
-               dso__set_long_name(dso, "[kernel.kallsyms]", false);
+               dso__set_long_name(dso, DSO__NAME_KALLSYMS, false);
                map__fixup_start(map);
                map__fixup_end(map);
        }
@@ -2033,3 +2033,26 @@ void symbol__exit(void)
        symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL;
        symbol_conf.initialized = false;
 }
+
+int symbol__config_symfs(const struct option *opt __maybe_unused,
+                        const char *dir, int unset __maybe_unused)
+{
+       char *bf = NULL;
+       int ret;
+
+       symbol_conf.symfs = strdup(dir);
+       if (symbol_conf.symfs == NULL)
+               return -ENOMEM;
+
+       /* skip the locally configured cache if a symfs is given, and
+        * config buildid dir to symfs/.debug
+        */
+       ret = asprintf(&bf, "%s/%s", dir, ".debug");
+       if (ret < 0)
+               return -ENOMEM;
+
+       set_buildid_dir(bf);
+
+       free(bf);
+       return 0;
+}
index 2b5e4ed76fcb4b53c38ac74269e03317734bd699..b10d558a88032919c6249aa2417477022c3d71e2 100644 (file)
@@ -44,6 +44,9 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 #define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
 #endif
 
+#define DSO__NAME_KALLSYMS     "[kernel.kallsyms]"
+#define DSO__NAME_KCORE                "[kernel.kcore]"
+
 /** struct symbol - symtab entry
  *
  * @ignore - resolvable but tools ignore it (e.g. idle routines)
@@ -183,6 +186,8 @@ struct branch_info {
        struct addr_map_symbol from;
        struct addr_map_symbol to;
        struct branch_flags flags;
+       char                    *srcline_from;
+       char                    *srcline_to;
 };
 
 struct mem_info {
@@ -287,6 +292,8 @@ bool symbol_type__is_a(char symbol_type, enum map_type map_type);
 bool symbol__restricted_filename(const char *filename,
                                 const char *restricted_filename);
 bool symbol__is_idle(struct symbol *sym);
+int symbol__config_symfs(const struct option *opt __maybe_unused,
+                        const char *dir, int unset __maybe_unused);
 
 int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
                  struct symsrc *runtime_ss, symbol_filter_t filter,
index f92c37abb0a80b827c56bc32f2e8aeb3bb39463d..b2940c88734a01e280046f5c806c2a530fafc25a 100644 (file)
@@ -27,7 +27,6 @@ struct perf_top {
        int                max_stack;
        bool               hide_kernel_symbols, hide_user_symbols, zero;
        bool               use_tui, use_stdio;
-       bool               kptr_restrict_warned;
        bool               vmlinux_warned;
        bool               dump_symtab;
        struct hist_entry  *sym_filter_entry;
index eab077ad6ca92f5fac2f570c3c96008203a980de..23504ad5d6dd2154ca04d63ab4aa97e764de6692 100644 (file)
@@ -33,7 +33,8 @@ struct callchain_param        callchain_param = {
 unsigned int page_size;
 int cacheline_size;
 
-unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK;
 
 bool test_attr__enabled;
 
index 7651633a8dc7a2eda51f30750ca1a142ed72fbe8..1e8c3167b9fb3ab070c05974fb028445600c6574 100644 (file)
@@ -261,7 +261,8 @@ void sighandler_dump_stack(int sig);
 
 extern unsigned int page_size;
 extern int cacheline_size;
-extern unsigned int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
 
 struct parse_tag {
        char tag;
index 7947e568e0576905265e58a19caf6886633acaac..2e58549b2f0211ac11747d35b71c05145e81fbe7 100644 (file)
@@ -1234,6 +1234,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS     struct user_pt_regs
 # define SYSCALL_NUM   regs[8]
 # define SYSCALL_RET   regs[0]
+#elif defined(__hppa__)
+# define ARCH_REGS     struct user_regs_struct
+# define SYSCALL_NUM   gr[20]
+# define SYSCALL_RET   gr[28]
 #elif defined(__powerpc__)
 # define ARCH_REGS     struct pt_regs
 # define SYSCALL_NUM   gpr[0]
@@ -1303,7 +1307,7 @@ void change_syscall(struct __test_metadata *_metadata,
        EXPECT_EQ(0, ret);
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
-    defined(__s390__)
+    defined(__s390__) || defined(__hppa__)
        {
                regs.SYSCALL_NUM = syscall;
        }
@@ -1505,6 +1509,8 @@ TEST_F(TRACE_syscall, syscall_dropped)
 #  define __NR_seccomp 383
 # elif defined(__aarch64__)
 #  define __NR_seccomp 277
+# elif defined(__hppa__)
+#  define __NR_seccomp 338
 # elif defined(__powerpc__)
 #  define __NR_seccomp 358
 # elif defined(__s390__)
index c87957295f74ae6830f6348ea0482427901c4493..0bc737a75150bf225f6b3508182341536b66c88e 100644 (file)
@@ -30,7 +30,9 @@
 #define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
 #define MAP_HUGE_SHIFT  26
 #define MAP_HUGE_MASK   0x3f
+#if !defined(MAP_HUGETLB)
 #define MAP_HUGETLB    0x40000
+#endif
 
 #define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
 #define SHM_HUGE_SHIFT  26
index feaa64ac463018f03b9a80e4270281127e9b96c1..6ba7455298338988f1e4454b0fec3c41bb61bc89 100644 (file)
@@ -1,6 +1,6 @@
 all:
 
-all: ring virtio_ring_0_9 virtio_ring_poll
+all: ring virtio_ring_0_9 virtio_ring_poll virtio_ring_inorder
 
 CFLAGS += -Wall
 CFLAGS += -pthread -O2 -ggdb
@@ -10,13 +10,16 @@ main.o: main.c main.h
 ring.o: ring.c main.h
 virtio_ring_0_9.o: virtio_ring_0_9.c main.h
 virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h
+virtio_ring_inorder.o: virtio_ring_inorder.c virtio_ring_0_9.c main.h
 ring: ring.o main.o
 virtio_ring_0_9: virtio_ring_0_9.o main.o
 virtio_ring_poll: virtio_ring_poll.o main.o
+virtio_ring_inorder: virtio_ring_inorder.o main.o
 clean:
        -rm main.o
        -rm ring.o ring
        -rm virtio_ring_0_9.o virtio_ring_0_9
        -rm virtio_ring_poll.o virtio_ring_poll
+       -rm virtio_ring_inorder.o virtio_ring_inorder
 
 .PHONY: all clean
index 3a5ff438bd62f62296f6cbe5cf3a42e7f5bb1078..147abb452a6ccc098bf50338e0c353f4b8896f8a 100644 (file)
@@ -115,7 +115,7 @@ static void run_guest(void)
                do {
                        if (started < bufs &&
                            started - completed < max_outstanding) {
-                               r = add_inbuf(0, NULL, "Hello, world!");
+                               r = add_inbuf(0, "Buffer\n", "Hello, world!");
                                if (__builtin_expect(r == 0, true)) {
                                        ++started;
                                        if (!--tokick) {
index 47c9a1a18d361fa7b72c5ef67545f719327addb6..761866212aacf1149d03ef1151ac7727c944db72 100644 (file)
@@ -26,6 +26,14 @@ struct vring ring;
  * high bits of ring id ^ 0x8000).
  */
 /* #ifdef RING_POLL */
+/* enabling the below activates experimental in-order code
+ * (which skips ring updates and reads and writes len in descriptor).
+ */
+/* #ifdef INORDER */
+
+#if defined(RING_POLL) && defined(INORDER)
+#error "RING_POLL and INORDER are mutually exclusive"
+#endif
 
 /* how much padding is needed to avoid false cache sharing */
 #define HOST_GUEST_PADDING 0x80
@@ -35,7 +43,11 @@ struct guest {
        unsigned short last_used_idx;
        unsigned short num_free;
        unsigned short kicked_avail_idx;
+#ifndef INORDER
        unsigned short free_head;
+#else
+       unsigned short reserved_free_head;
+#endif
        unsigned char reserved[HOST_GUEST_PADDING - 10];
 } guest;
 
@@ -66,8 +78,10 @@ void alloc_ring(void)
        guest.avail_idx = 0;
        guest.kicked_avail_idx = -1;
        guest.last_used_idx = 0;
+#ifndef INORDER
        /* Put everything in free lists. */
        guest.free_head = 0;
+#endif
        for (i = 0; i < ring_size - 1; i++)
                ring.desc[i].next = i + 1;
        host.used_idx = 0;
@@ -84,13 +98,20 @@ void alloc_ring(void)
 /* guest side */
 int add_inbuf(unsigned len, void *buf, void *datap)
 {
-       unsigned head, avail;
+       unsigned head;
+#ifndef INORDER
+       unsigned avail;
+#endif
        struct vring_desc *desc;
 
        if (!guest.num_free)
                return -1;
 
+#ifdef INORDER
+       head = (ring_size - 1) & (guest.avail_idx++);
+#else
        head = guest.free_head;
+#endif
        guest.num_free--;
 
        desc = ring.desc;
@@ -102,7 +123,9 @@ int add_inbuf(unsigned len, void *buf, void *datap)
         * descriptors.
         */
        desc[head].flags &= ~VRING_DESC_F_NEXT;
+#ifndef INORDER
        guest.free_head = desc[head].next;
+#endif
 
        data[head].data = datap;
 
@@ -113,8 +136,12 @@ int add_inbuf(unsigned len, void *buf, void *datap)
        ring.avail->ring[avail & (ring_size - 1)] =
                (head | (avail & ~(ring_size - 1))) ^ 0x8000;
 #else
+#ifndef INORDER
+       /* Barrier A (for pairing) */
+       smp_release();
        avail = (ring_size - 1) & (guest.avail_idx++);
        ring.avail->ring[avail] = head;
+#endif
        /* Barrier A (for pairing) */
        smp_release();
 #endif
@@ -141,15 +168,27 @@ void *get_buf(unsigned *lenp, void **bufp)
                return NULL;
        /* Barrier B (for pairing) */
        smp_acquire();
+#ifdef INORDER
+       head = (ring_size - 1) & guest.last_used_idx;
+       index = head;
+#else
        head = (ring_size - 1) & guest.last_used_idx;
        index = ring.used->ring[head].id;
 #endif
+
+#endif
+#ifdef INORDER
+       *lenp = ring.desc[index].len;
+#else
        *lenp = ring.used->ring[head].len;
+#endif
        datap = data[index].data;
        *bufp = (void*)(unsigned long)ring.desc[index].addr;
        data[index].data = NULL;
+#ifndef INORDER
        ring.desc[index].next = guest.free_head;
        guest.free_head = index;
+#endif
        guest.num_free++;
        guest.last_used_idx++;
        return datap;
@@ -283,16 +322,24 @@ bool use_buf(unsigned *lenp, void **bufp)
        smp_acquire();
 
        used_idx &= ring_size - 1;
+#ifdef INORDER
+       head = used_idx;
+#else
        head = ring.avail->ring[used_idx];
+#endif
        desc = &ring.desc[head];
 #endif
 
        *lenp = desc->len;
        *bufp = (void *)(unsigned long)desc->addr;
 
+#ifdef INORDER
+       desc->len = desc->len - 1;
+#else
        /* now update used ring */
        ring.used->ring[used_idx].id = head;
        ring.used->ring[used_idx].len = desc->len - 1;
+#endif
        /* Barrier B (for pairing) */
        smp_release();
        host.used_idx++;
diff --git a/tools/virtio/ringtest/virtio_ring_inorder.c b/tools/virtio/ringtest/virtio_ring_inorder.c
new file mode 100644 (file)
index 0000000..2438ca5
--- /dev/null
@@ -0,0 +1,2 @@
+#define INORDER 1
+#include "virtio_ring_0_9.c"
index 409db3304471766fb80a5b3843d3ac2310731c8e..e2d5b6f988fb60b00aed69bff82118c0ba787ee7 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
@@ -174,10 +175,10 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
 
        timer->active_cleared_last = false;
        timer->irq.level = new_level;
-       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
+       trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq,
                                   timer->irq.level);
        ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
-                                        timer->map,
+                                        timer->irq.irq,
                                         timer->irq.level);
        WARN_ON(ret);
 }
@@ -196,7 +197,7 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu)
         * because the guest would never see the interrupt.  Instead wait
         * until we call this function from kvm_timer_flush_hwstate.
         */
-       if (!vgic_initialized(vcpu->kvm))
+       if (!vgic_initialized(vcpu->kvm) || !timer->enabled)
                return -ENODEV;
 
        if (kvm_timer_should_fire(vcpu) != timer->irq.level)
@@ -274,10 +275,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
        * to ensure that hardware interrupts from the timer triggers a guest
        * exit.
        */
-       if (timer->irq.level || kvm_vgic_map_is_active(vcpu, timer->map))
-               phys_active = true;
-       else
-               phys_active = false;
+       phys_active = timer->irq.level ||
+                       kvm_vgic_map_is_active(vcpu, timer->irq.irq);
 
        /*
         * We want to avoid hitting the (re)distributor as much as
@@ -302,7 +301,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
        if (timer->active_cleared_last && !phys_active)
                return;
 
-       ret = irq_set_irqchip_state(timer->map->irq,
+       ret = irq_set_irqchip_state(host_vtimer_irq,
                                    IRQCHIP_STATE_ACTIVE,
                                    phys_active);
        WARN_ON(ret);
@@ -334,7 +333,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
                         const struct kvm_irq_level *irq)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-       struct irq_phys_map *map;
 
        /*
         * The vcpu timer irq number cannot be determined in
@@ -353,15 +351,6 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
        timer->cntv_ctl = 0;
        kvm_timer_update_state(vcpu);
 
-       /*
-        * Tell the VGIC that the virtual interrupt is tied to a
-        * physical interrupt. We do that once per VCPU.
-        */
-       map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
-       if (WARN_ON(IS_ERR(map)))
-               return PTR_ERR(map);
-
-       timer->map = map;
        return 0;
 }
 
@@ -487,14 +476,43 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
        timer_disarm(timer);
-       if (timer->map)
-               kvm_vgic_unmap_phys_irq(vcpu, timer->map);
+       kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq);
 }
 
-void kvm_timer_enable(struct kvm *kvm)
+int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
-       if (kvm->arch.timer.enabled)
-               return;
+       struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       int phys_irq;
+       int ret;
+
+       if (timer->enabled)
+               return 0;
+
+       /*
+        * Find the physical IRQ number corresponding to the host_vtimer_irq
+        */
+       desc = irq_to_desc(host_vtimer_irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return -EINVAL;
+       }
+
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       phys_irq = data->hwirq;
+
+       /*
+        * Tell the VGIC that the virtual interrupt is tied to a
+        * physical interrupt. We do that once per VCPU.
+        */
+       ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq);
+       if (ret)
+               return ret;
+
 
        /*
         * There is a potential race here between VCPUs starting for the first
@@ -505,7 +523,9 @@ void kvm_timer_enable(struct kvm *kvm)
         * the arch timers are enabled.
         */
        if (timecounter && wqueue)
-               kvm->arch.timer.enabled = 1;
+               timer->enabled = 1;
+
+       return 0;
 }
 
 void kvm_timer_init(struct kvm *kvm)
index ea00d69e7078ccfaa8420c13a98a0c54b698b243..798866a8d8756b07dd815d7a8bc8dec8f6019e13 100644 (file)
 /* vcpu is already in the HYP VA space */
 void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
 {
-       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
        u64 val;
 
-       if (kvm->arch.timer.enabled) {
+       if (timer->enabled) {
                timer->cntv_ctl = read_sysreg_el0(cntv_ctl);
                timer->cntv_cval = read_sysreg_el0(cntv_cval);
        }
@@ -60,7 +59,7 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
        val |= CNTHCTL_EL1PCTEN;
        write_sysreg(val, cnthctl_el2);
 
-       if (kvm->arch.timer.enabled) {
+       if (timer->enabled) {
                write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2);
                write_sysreg_el0(timer->cntv_cval, cntv_cval);
                isb();
index 674bdf8ecf4f9d1e27a1d74e54cd9053cbbf5818..a3f12b3b277b949970f462f7305d4edce3a5f8c5 100644 (file)
 
 #include <asm/kvm_hyp.h>
 
+#ifdef CONFIG_KVM_NEW_VGIC
+extern struct vgic_global kvm_vgic_global_state;
+#define vgic_v2_params kvm_vgic_global_state
+#else
+extern struct vgic_params vgic_v2_params;
+#endif
+
 static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
                                            void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        u32 eisr0, eisr1;
        int i;
        bool expect_mi;
@@ -67,7 +74,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        u32 elrsr0, elrsr1;
 
        elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -86,7 +93,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-       int nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
        int i;
 
        for (i = 0; i < nr_lr; i++) {
@@ -141,13 +148,13 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
        struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-       int i, nr_lr;
+       int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+       int i;
        u64 live_lrs = 0;
 
        if (!base)
                return;
 
-       nr_lr = vcpu->arch.vgic_cpu.nr_lr;
 
        for (i = 0; i < nr_lr; i++)
                if (cpu_if->vgic_lr[i] & GICH_LR_STATE)
index 575c7aa30d7e64538bb72c11289c360c2fa884a4..a027569facfae948ef562d11604c162dc0d96c1e 100644 (file)
@@ -436,7 +436,14 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
+#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
+
+/*
+ * For one VM the interrupt type must be same for each vcpu.
+ * As a PPI, the interrupt number is the same for all vcpus,
+ * while as an SPI it must be a separate number per vcpu.
+ */
+static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
 {
        int i;
        struct kvm_vcpu *vcpu;
@@ -445,7 +452,7 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
                if (!kvm_arm_pmu_irq_initialized(vcpu))
                        continue;
 
-               if (is_ppi) {
+               if (irq_is_ppi(irq)) {
                        if (vcpu->arch.pmu.irq_num != irq)
                                return false;
                } else {
@@ -457,7 +464,6 @@ static bool irq_is_valid(struct kvm *kvm, int irq, bool is_ppi)
        return true;
 }
 
-
 int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
        switch (attr->attr) {
@@ -471,14 +477,11 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
                if (get_user(irq, uaddr))
                        return -EFAULT;
 
-               /*
-                * The PMU overflow interrupt could be a PPI or SPI, but for one
-                * VM the interrupt type must be same for each vcpu. As a PPI,
-                * the interrupt number is the same for all vcpus, while as an
-                * SPI it must be a separate number per vcpu.
-                */
-               if (irq < VGIC_NR_SGIS || irq >= vcpu->kvm->arch.vgic.nr_irqs ||
-                   !irq_is_valid(vcpu->kvm, irq, irq < VGIC_NR_PRIVATE_IRQS))
+               /* The PMU overflow interrupt can be a PPI or a valid SPI. */
+               if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq)))
+                       return -EINVAL;
+
+               if (!pmu_irq_is_valid(vcpu->kvm, irq))
                        return -EINVAL;
 
                if (kvm_arm_pmu_irq_initialized(vcpu))
index 7e826c9b2b0a3caeee4174531be51413aa6bd747..334cd7a891066d6dbef3af2944204fe3000c3d81 100644 (file)
@@ -171,7 +171,7 @@ static const struct vgic_ops vgic_v2_ops = {
        .enable                 = vgic_v2_enable,
 };
 
-static struct vgic_params vgic_v2_params;
+struct vgic_params __section(.hyp.text) vgic_v2_params;
 
 static void vgic_cpu_init_lrs(void *params)
 {
@@ -201,6 +201,8 @@ int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
        const struct resource *vctrl_res = &gic_kvm_info->vctrl;
        const struct resource *vcpu_res = &gic_kvm_info->vcpu;
 
+       memset(vgic, 0, sizeof(*vgic));
+
        if (!gic_kvm_info->maint_irq) {
                kvm_err("error getting vgic maintenance irq\n");
                ret = -ENXIO;
index c02a1b1cf855a5b17e9d0fc8bcd8811bacc03f11..75b02fa86436ac97d6939269cc2d70c50f181162 100644 (file)
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
 
-/* These are for GICv2 emulation only */
-#define GICH_LR_VIRTUALID              (0x3ffUL << 0)
-#define GICH_LR_PHYSID_CPUID_SHIFT     (10)
-#define GICH_LR_PHYSID_CPUID           (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
-#define ICH_LR_VIRTUALID_MASK          (BIT_ULL(32) - 1)
-
 static u32 ich_vtr_el2;
 
 static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
@@ -43,7 +37,7 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
        u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
 
        if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-               lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
+               lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
        else
                lr_desc.irq = val & GICH_LR_VIRTUALID;
 
index 60668a7f319a8adbeaff9f3ea1518ec9847fa6c1..c3bfbb981e73bfc80aed1e8a96b35e813b6f98b6 100644 (file)
@@ -690,12 +690,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
  */
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 {
-       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        u64 elrsr = vgic_get_elrsr(vcpu);
        unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
        int i;
 
-       for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
+       for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
                struct vgic_lr lr = vgic_get_lr(vcpu, i);
 
                /*
@@ -820,7 +819,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct vgic_io_device *iodev = container_of(this,
                                                    struct vgic_io_device, dev);
-       struct kvm_run *run = vcpu->run;
        const struct vgic_io_range *range;
        struct kvm_exit_mmio mmio;
        bool updated_state;
@@ -849,12 +847,6 @@ static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
                updated_state = false;
        }
        spin_unlock(&dist->lock);
-       run->mmio.is_write      = is_write;
-       run->mmio.len           = len;
-       run->mmio.phys_addr     = addr;
-       memcpy(run->mmio.data, val, len);
-
-       kvm_handle_mmio_return(vcpu, run);
 
        if (updated_state)
                vgic_kick_vcpus(vcpu->kvm);
@@ -1102,18 +1094,18 @@ static bool dist_active_irq(struct kvm_vcpu *vcpu)
        return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
 }
 
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
        int i;
 
-       for (i = 0; i < vcpu->arch.vgic_cpu.nr_lr; i++) {
+       for (i = 0; i < vgic->nr_lr; i++) {
                struct vgic_lr vlr = vgic_get_lr(vcpu, i);
 
-               if (vlr.irq == map->virt_irq && vlr.state & LR_STATE_ACTIVE)
+               if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
                        return true;
        }
 
-       return vgic_irq_is_active(vcpu, map->virt_irq);
+       return vgic_irq_is_active(vcpu, virt_irq);
 }
 
 /*
@@ -1521,7 +1513,6 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                  struct irq_phys_map *map,
                                   unsigned int irq_num, bool level)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
@@ -1660,14 +1651,14 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
        if (map)
                return -EINVAL;
 
-       return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+       return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
 }
 
 /**
  * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @map:     Pointer to a irq_phys_map structure describing the mapping
+ * @virt_irq: The virtual IRQ to be injected
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *                           false: to ignore the call
  *          Level-sensitive  true:  raise the input signal
@@ -1678,7 +1669,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
  * being HIGH and 0 being LOW and all devices being active-HIGH.
  */
 int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
-                              struct irq_phys_map *map, bool level)
+                              unsigned int virt_irq, bool level)
 {
        int ret;
 
@@ -1686,7 +1677,7 @@ int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
        if (ret)
                return ret;
 
-       return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
+       return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
 }
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1712,43 +1703,28 @@ static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
 /**
  * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
  * @vcpu: The VCPU pointer
- * @virt_irq: The virtual irq number
- * @irq: The Linux IRQ number
+ * @virt_irq: The virtual IRQ number for the guest
+ * @phys_irq: The hardware IRQ number of the host
  *
  * Establish a mapping between a guest visible irq (@virt_irq) and a
- * Linux irq (@irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @irq. This mapping can be
+ * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @phys_irq. This mapping can be
  * established multiple times as long as the parameters are the same.
  *
- * Returns a valid pointer on success, and an error pointer otherwise
+ * Returns 0 on success or an error value otherwise.
  */
-struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
-                                          int virt_irq, int irq)
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
        struct irq_phys_map *map;
        struct irq_phys_map_entry *entry;
-       struct irq_desc *desc;
-       struct irq_data *data;
-       int phys_irq;
-
-       desc = irq_to_desc(irq);
-       if (!desc) {
-               kvm_err("%s: no interrupt descriptor\n", __func__);
-               return ERR_PTR(-EINVAL);
-       }
-
-       data = irq_desc_get_irq_data(desc);
-       while (data->parent_data)
-               data = data->parent_data;
-
-       phys_irq = data->hwirq;
+       int ret = 0;
 
        /* Create a new mapping */
        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
 
        spin_lock(&dist->irq_phys_map_lock);
 
@@ -1756,9 +1732,8 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
        map = vgic_irq_map_search(vcpu, virt_irq);
        if (map) {
                /* Make sure this mapping matches */
-               if (map->phys_irq != phys_irq   ||
-                   map->irq      != irq)
-                       map = ERR_PTR(-EINVAL);
+               if (map->phys_irq != phys_irq)
+                       ret = -EINVAL;
 
                /* Found an existing, valid mapping */
                goto out;
@@ -1767,7 +1742,6 @@ struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
        map           = &entry->map;
        map->virt_irq = virt_irq;
        map->phys_irq = phys_irq;
-       map->irq      = irq;
 
        list_add_tail_rcu(&entry->entry, root);
 
@@ -1775,9 +1749,9 @@ out:
        spin_unlock(&dist->irq_phys_map_lock);
        /* If we've found a hit in the existing list, free the useless
         * entry */
-       if (IS_ERR(map) || map != &entry->map)
+       if (ret || map != &entry->map)
                kfree(entry);
-       return map;
+       return ret;
 }
 
 static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
@@ -1813,25 +1787,22 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
 /**
  * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
  * @vcpu: The VCPU pointer
- * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ * @virt_irq: The virtual IRQ number to be unmapped
  *
  * Remove an existing mapping between virtual and physical interrupts.
  */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
 {
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        struct irq_phys_map_entry *entry;
        struct list_head *root;
 
-       if (!map)
-               return -EINVAL;
-
-       root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+       root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
 
        spin_lock(&dist->irq_phys_map_lock);
 
        list_for_each_entry(entry, root, entry) {
-               if (&entry->map == map) {
+               if (entry->map.virt_irq == virt_irq) {
                        list_del_rcu(&entry->entry);
                        call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
                        break;
@@ -1887,13 +1858,6 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
                return -ENOMEM;
        }
 
-       /*
-        * Store the number of LRs per vcpu, so we don't have to go
-        * all the way to the distributor structure to find out. Only
-        * assembly code should use this one.
-        */
-       vgic_cpu->nr_lr = vgic->nr_lr;
-
        return 0;
 }
 
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
new file mode 100644 (file)
index 0000000..a1442f7
--- /dev/null
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_vcpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
+ */
+
+/* EARLY INIT */
+
+/*
+ * Those 2 functions should not be needed anymore but they
+ * still are called from arm.c
+ *
+ * kvm_vgic_early_init: VM-level early init hook, intentionally empty.
+ * @kvm: kvm struct pointer (unused)
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+}
+
+/* VCPU-level early init hook, intentionally empty; @vcpu is unused. */
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+}
+
+/* CREATION */
+
+/**
+ * kvm_vgic_create: triggered by the instantiation of the VGIC device by
+ * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
+ * or through the generic KVM_CREATE_DEVICE API ioctl.
+ * irqchip_in_kernel() tells you if this function succeeded or not.
+ * @kvm: kvm struct pointer
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ *
+ * Holds kvm->lock for the whole call and, while the VM state is checked
+ * and updated, the mutex of every VCPU (taken with mutex_trylock()).
+ *
+ * Return: 0 on success, -EEXIST if an in-kernel irqchip already exists,
+ * -ENODEV if GICv2 emulation was requested but is not available, -EBUSY
+ * if a VCPU mutex could not be taken or a VCPU has already run, -E2BIG if
+ * more VCPUs are online than the chosen model allows.
+ */
+int kvm_vgic_create(struct kvm *kvm, u32 type)
+{
+       /* vcpu_lock_idx: highest VCPU index whose mutex we hold, -1 if none */
+       int i, vcpu_lock_idx = -1, ret;
+       struct kvm_vcpu *vcpu;
+
+       mutex_lock(&kvm->lock);
+
+       if (irqchip_in_kernel(kvm)) {
+               ret = -EEXIST;
+               goto out;
+       }
+
+       /*
+        * This function is also called by the KVM_CREATE_IRQCHIP handler,
+        * which had no chance yet to check the availability of the GICv2
+        * emulation. So check this here again. KVM_CREATE_DEVICE does
+        * the proper checks already.
+        */
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+               !kvm_vgic_global_state.can_emulate_gicv2) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /*
+        * Any time a vcpu is run, vcpu_load is called which tries to grab the
+        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+        * that no other VCPUs are run while we create the vgic.
+        */
+       ret = -EBUSY;
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (!mutex_trylock(&vcpu->mutex))
+                       goto out_unlock;
+               vcpu_lock_idx = i;
+       }
+
+       /* Refuse (still with -EBUSY) once any VCPU has already run. */
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (vcpu->arch.has_run_once)
+                       goto out_unlock;
+       }
+       ret = 0;
+
+       /*
+        * Note: max_vcpus is written before the -E2BIG check below, so it
+        * keeps the new value even when that check fails.
+        */
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+               kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+       else
+               kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+
+       if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+               ret = -E2BIG;
+               goto out_unlock;
+       }
+
+       kvm->arch.vgic.in_kernel = true;
+       kvm->arch.vgic.vgic_model = type;
+
+       /*
+        * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init)
+        * it is stored in distributor struct for asm save/restore purpose
+        */
+       kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base;
+
+       /* No guest addresses assigned yet; user space sets them later. */
+       kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
+       kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+       kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
+
+out_unlock:
+       /* Drop the VCPU mutexes we hold, in reverse order of acquisition. */
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+               mutex_unlock(&vcpu->mutex);
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+/* INIT/DESTROY */
+
+/**
+ * kvm_vgic_dist_init: initialize the dist data structures
+ * @kvm: kvm struct pointer
+ * @nr_spis: number of spis, frozen by caller
+ *
+ * Allocates a zeroed array of @nr_spis interrupt descriptors and gives
+ * every entry its INTID (offset by VGIC_NR_PRIVATE_IRQS), list head, lock
+ * and default target (VCPU 0).
+ *
+ * Return: 0 on success, -ENOMEM if the SPI array cannot be allocated.
+ */
+static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+       unsigned int i; /* same type as nr_spis: no mixed-sign compare */
+
+       dist->spis = kcalloc(nr_spis, sizeof(*dist->spis), GFP_KERNEL);
+       if (!dist->spis)
+               return -ENOMEM;
+
+       /*
+        * In the following code we do not take the irq struct lock since
+        * no other action on irq structs can happen while the VGIC is
+        * not initialized yet:
+        * If someone wants to inject an interrupt or does a MMIO access, we
+        * require prior initialization in case of a virtual GICv3 or trigger
+        * initialization when using a virtual GICv2.
+        */
+       for (i = 0; i < nr_spis; i++) {
+               struct vgic_irq *irq = &dist->spis[i];
+
+               irq->intid = i + VGIC_NR_PRIVATE_IRQS;
+               INIT_LIST_HEAD(&irq->ap_list);
+               spin_lock_init(&irq->irq_lock);
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu0;
+               /* GICv2 routes by target mask, other models by MPIDR. */
+               if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+                       irq->targets = 0;
+               else
+                       irq->mpidr = 0;
+       }
+       return 0;
+}
+
+/**
+ * kvm_vgic_vcpu_init: initialize the vcpu data structures and
+ * enable the VCPU interface
+ * @vcpu: the VCPU whose VGIC should be initialized
+ *
+ * Sets up the VCPU's ap_list and its VGIC_NR_PRIVATE_IRQS private
+ * interrupts, then calls the v2 or v3 enable routine depending on
+ * kvm_vgic_global_state.type.
+ */
+static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       int i;
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       spin_lock_init(&vgic_cpu->ap_list_lock);
+
+       /*
+        * Enable and configure all SGIs to be edge-triggered and
+        * configure all PPIs as level-triggered.
+        */
+       for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+               struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+
+               INIT_LIST_HEAD(&irq->ap_list);
+               spin_lock_init(&irq->irq_lock);
+               irq->intid = i;
+               irq->vcpu = NULL;
+               irq->target_vcpu = vcpu;
+               /*
+                * NOTE(review): this shift is done for every model; if
+                * vcpu_id can reach 32 or more (e.g. with a GICv3 guest)
+                * the shift would be undefined -- confirm the range of
+                * vcpu_id and the width of irq->targets.
+                */
+               irq->targets = 1U << vcpu->vcpu_id;
+               if (vgic_irq_is_sgi(i)) {
+                       /* SGIs */
+                       irq->enabled = 1;
+                       irq->config = VGIC_CONFIG_EDGE;
+               } else {
+                       /* PPIs */
+                       irq->config = VGIC_CONFIG_LEVEL;
+               }
+       }
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_enable(vcpu);
+       else
+               vgic_v3_enable(vcpu);
+}
+
+/*
+ * vgic_init: allocate and set up the distributor and per-VCPU data
+ * structures. Their sizing depends on:
+ * - the number of spis
+ * - the number of vcpus
+ * When nr_spis has not been set beforehand it defaults to
+ * VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS and is frozen from then on.
+ * vgic_initialized() returns true once this function has succeeded; a
+ * second call is a no-op returning 0.
+ * Must be called with kvm->lock held!
+ */
+int vgic_init(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct kvm_vcpu *vcpu;
+       int idx, err;
+
+       if (vgic_initialized(kvm))
+               return 0;
+
+       /* freeze the number of spis */
+       if (dist->nr_spis == 0)
+               dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+
+       err = kvm_vgic_dist_init(kvm, dist->nr_spis);
+       if (err)
+               return err;
+
+       kvm_for_each_vcpu(idx, vcpu, kvm)
+               kvm_vgic_vcpu_init(vcpu);
+
+       dist->initialized = true;
+       return 0;
+}
+
+/*
+ * Tear down the distributor state: mark the VGIC as neither ready nor
+ * initialized and release the SPI and redistributor iodev arrays, all
+ * under kvm->lock.
+ */
+static void kvm_vgic_dist_destroy(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       mutex_lock(&kvm->lock);
+
+       dist->ready = false;
+       dist->initialized = false;
+
+       /*
+        * Clear the pointers after freeing so that no dangling reference
+        * is left behind in the dist structure.
+        */
+       kfree(dist->spis);
+       dist->spis = NULL;
+       kfree(dist->redist_iodevs);
+       dist->redist_iodevs = NULL;
+       dist->nr_spis = 0;
+
+       mutex_unlock(&kvm->lock);
+}
+
+/*
+ * kvm_vgic_vcpu_destroy: reset the VCPU's ap_list head to an empty list.
+ * Nothing is freed here.
+ */
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+}
+
+/*
+ * kvm_vgic_destroy: tear down the VGIC of a VM, distributor first and
+ * then every VCPU.
+ */
+void kvm_vgic_destroy(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_vgic_dist_destroy(kvm);
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_vgic_vcpu_destroy(vcpu);
+}
+
+/**
+ * vgic_lazy_init: initialize the VGIC on demand. This is only permitted
+ * when the GIC exposed to the guest is a GICv2; a GICv3 must be explicitly
+ * initialized through the KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
+ * @kvm: kvm struct pointer
+ *
+ * Return: 0 if the VGIC is (now) initialized, -EBUSY for an uninitialized
+ * non-GICv2 model, otherwise the error returned by vgic_init().
+ */
+int vgic_lazy_init(struct kvm *kvm)
+{
+       bool need_init = unlikely(!vgic_initialized(kvm));
+       int err;
+
+       if (!need_init)
+               return 0;
+
+       /*
+        * Automatic initialization exists only for the legacy GICv2
+        * case; every other model has to be set up explicitly with the
+        * respective KVM device call.
+        */
+       if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+               return -EBUSY;
+
+       mutex_lock(&kvm->lock);
+       err = vgic_init(kvm);
+       mutex_unlock(&kvm->lock);
+
+       return err;
+}
+
+/* RESOURCE MAPPING */
+
+/**
+ * kvm_vgic_map_resources: map the MMIO regions for the VGIC model exposed
+ * to the guest; called on the first VCPU run.
+ * Also maps the virtual CPU interface into the VM.
+ * v2/v3 derivatives call vgic_init if not already done.
+ * vgic_ready() returns true if this function has succeeded.
+ * @kvm: kvm struct pointer
+ *
+ * Return: 0 when there is no in-kernel irqchip, otherwise the result of
+ * the model specific map_resources call.
+ */
+int kvm_vgic_map_resources(struct kvm *kvm)
+{
+       int err = 0;
+
+       mutex_lock(&kvm->lock);
+
+       if (irqchip_in_kernel(kvm)) {
+               if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+                       err = vgic_v2_map_resources(kvm);
+               else
+                       err = vgic_v3_map_resources(kvm);
+       }
+
+       mutex_unlock(&kvm->lock);
+       return err;
+}
+
+/* GENERIC PROBE */
+
+/*
+ * Enable the per-CPU vgic maintenance interrupt on the calling CPU.
+ * @info is unused; the signature matches what on_each_cpu() expects.
+ */
+static void vgic_init_maintenance_interrupt(void *info)
+{
+       enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
+}
+
+/*
+ * CPU notifier callback: enable the maintenance interrupt on a CPU that
+ * is starting and disable it on a CPU that is dying. All other actions
+ * are ignored. Always returns NOTIFY_OK.
+ */
+static int vgic_cpu_notify(struct notifier_block *self,
+                          unsigned long action, void *cpu)
+{
+       switch (action) {
+       case CPU_STARTING:
+       case CPU_STARTING_FROZEN:
+               vgic_init_maintenance_interrupt(NULL);
+               break;
+       case CPU_DYING:
+       case CPU_DYING_FROZEN:
+               disable_percpu_irq(kvm_vgic_global_state.maint_irq);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block that hooks vgic_cpu_notify() into CPU notifications. */
+static struct notifier_block vgic_cpu_nb = {
+       .notifier_call = vgic_cpu_notify,
+};
+
+/*
+ * Handler for the vgic maintenance interrupt. It does no work of its own
+ * and always returns IRQ_HANDLED.
+ */
+static irqreturn_t vgic_maintenance_handler(int irq, void *data)
+{
+       /*
+        * We cannot rely on the vgic maintenance interrupt to be
+        * delivered synchronously. This means we can only use it to
+        * exit the VM, and we perform the handling of EOIed
+        * interrupts on the exit path (see vgic_process_maintenance).
+        */
+       return IRQ_HANDLED;
+}
+
+/**
+ * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
+ * according to the host GIC model. Accordingly calls either
+ * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
+ * instantiated by a guest later on.
+ *
+ * Also requests the per-CPU maintenance interrupt, registers the CPU
+ * notifier and enables the interrupt through on_each_cpu().
+ *
+ * Return: 0 on success; -ENODEV if no GIC info is available or the GIC
+ * type is unknown; -ENXIO if there is no maintenance interrupt; otherwise
+ * the error returned by the probe, the IRQ request or the notifier
+ * registration.
+ */
+int kvm_vgic_hyp_init(void)
+{
+       const struct gic_kvm_info *gic_kvm_info;
+       int ret;
+
+       gic_kvm_info = gic_get_kvm_info();
+       if (!gic_kvm_info)
+               return -ENODEV;
+
+       if (!gic_kvm_info->maint_irq) {
+               kvm_err("No vgic maintenance irq\n");
+               return -ENXIO;
+       }
+
+       switch (gic_kvm_info->type) {
+       case GIC_V2:
+               ret = vgic_v2_probe(gic_kvm_info);
+               break;
+       case GIC_V3:
+               ret = vgic_v3_probe(gic_kvm_info);
+               break;
+       default:
+               ret = -ENODEV;
+               break;
+       }
+
+       if (ret)
+               return ret;
+
+       kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+       ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
+                                vgic_maintenance_handler,
+                                "vgic", kvm_get_running_vcpus());
+       if (ret) {
+               kvm_err("Cannot register interrupt %d\n",
+                       kvm_vgic_global_state.maint_irq);
+               return ret;
+       }
+
+       ret = __register_cpu_notifier(&vgic_cpu_nb);
+       if (ret) {
+               kvm_err("Cannot register vgic CPU notifier\n");
+               goto out_free_irq;
+       }
+
+       on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
+
+       kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
+       return 0;
+
+out_free_irq:
+       /* Undo request_percpu_irq() when the notifier cannot be installed. */
+       free_percpu_irq(kvm_vgic_global_state.maint_irq,
+                       kvm_get_running_vcpus());
+       return ret;
+}
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
new file mode 100644 (file)
index 0000000..c675513
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+
+/* Stub: leaves @entries untouched and always returns 0. */
+int kvm_irq_map_gsi(struct kvm *kvm,
+                   struct kvm_kernel_irq_routing_entry *entries,
+                   int gsi)
+{
+       return 0;
+}
+
+/* Identity mapping: returns @pin unchanged; @kvm and @irqchip are unused. */
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned int irqchip,
+                        unsigned int pin)
+{
+       return pin;
+}
+
+/*
+ * kvm_set_irq: inject a shared peripheral interrupt. @irq is offset by
+ * VGIC_NR_PRIVATE_IRQS to obtain the interrupt number handed to
+ * kvm_vgic_inject_irq(). The VGIC must already be initialized (BUG
+ * otherwise). @line_status is unused.
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id,
+               u32 irq, int level, bool line_status)
+{
+       unsigned int intid;
+
+       intid = irq + VGIC_NR_PRIVATE_IRQS;
+
+       trace_kvm_set_irq(irq, level, irq_source_id);
+
+       BUG_ON(!vgic_initialized(kvm));
+
+       return kvm_vgic_inject_irq(kvm, 0, intid, level);
+}
+
+/* MSI not implemented yet: stub that ignores its arguments and returns 0 */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id,
+               int level, bool line_status)
+{
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
new file mode 100644 (file)
index 0000000..0130c4b
--- /dev/null
@@ -0,0 +1,431 @@
+/*
+ * VGIC: KVM DEVICE API
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/* common helpers */
+
+/*
+ * Validate a guest physical base address before it is stored in *@ioaddr.
+ *
+ * Return: -E2BIG if @addr has bits outside KVM_PHYS_MASK, -EINVAL if it is
+ * not aligned to @alignment, -EEXIST if *@ioaddr has already been set,
+ * 0 otherwise. The checks are made in that order.
+ */
+static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+                            phys_addr_t addr, phys_addr_t alignment)
+{
+       int ret = 0;
+
+       if (addr & ~KVM_PHYS_MASK)
+               ret = -E2BIG;
+       else if (!IS_ALIGNED(addr, alignment))
+               ret = -EINVAL;
+       else if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
+               ret = -EEXIST;
+
+       return ret;
+}
+
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm:   pointer to the vm struct
+ * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
+ * @addr:  pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ *          address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space.  These addresses are properties
+ * of the emulated core/SoC and therefore user space initially knows this
+ * information.
+ * Check them for sanity (alignment, double assignment). We can't check for
+ * overlapping regions in case of a virtual GICv3 here, since we don't know
+ * the number of VCPUs yet, so we defer this check to map_resources().
+ *
+ * Return: 0 on success, -ENODEV if @type is unknown or does not match the
+ * VM's vgic model, or (on write) the error from vgic_check_ioaddr().
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+{
+       int r = 0;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       int type_needed;
+       phys_addr_t *addr_ptr, alignment;
+
+       mutex_lock(&kvm->lock);
+       /* Pick the model, storage slot and alignment for this address type. */
+       switch (type) {
+       case KVM_VGIC_V2_ADDR_TYPE_DIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_4K;
+               break;
+       case KVM_VGIC_V2_ADDR_TYPE_CPU:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
+               addr_ptr = &vgic->vgic_cpu_base;
+               alignment = SZ_4K;
+               break;
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       case KVM_VGIC_V3_ADDR_TYPE_DIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
+               addr_ptr = &vgic->vgic_dist_base;
+               alignment = SZ_64K;
+               break;
+       case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+               type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
+               addr_ptr = &vgic->vgic_redist_base;
+               alignment = SZ_64K;
+               break;
+#endif
+       default:
+               r = -ENODEV;
+               goto out;
+       }
+
+       /* The address type must belong to the model this VM was created with. */
+       if (vgic->vgic_model != type_needed) {
+               r = -ENODEV;
+               goto out;
+       }
+
+       if (write) {
+               r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
+               if (!r)
+                       *addr_ptr = *addr;
+       } else {
+               *addr = *addr_ptr;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+/*
+ * Handle the "set" side of the attribute groups shared by the v2 and v3
+ * devices: base addresses, number of interrupts and the init control.
+ *
+ * Returns -ENXIO for any group/attribute not handled here, which callers
+ * use to fall through to model specific handling.
+ */
+static int vgic_set_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+               /* -ENODEV from kvm_vgic_addr() is reported as -ENXIO. */
+               return (r == -ENODEV) ? -ENXIO : r;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+               u32 val;
+               int ret = 0;
+
+               if (get_user(val, uaddr))
+                       return -EFAULT;
+
+               /*
+                * We require:
+                * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+                * - at most 1024 interrupts
+                * - a multiple of 32 interrupts
+                */
+               if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+                   val > VGIC_MAX_RESERVED ||
+                   (val & 31))
+                       return -EINVAL;
+
+               mutex_lock(&dev->kvm->lock);
+
+               /* The count can be set once only, and not on a ready VGIC. */
+               if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+                       ret = -EBUSY;
+               else
+                       dev->kvm->arch.vgic.nr_spis =
+                               val - VGIC_NR_PRIVATE_IRQS;
+
+               mutex_unlock(&dev->kvm->lock);
+
+               return ret;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+               switch (attr->attr) {
+               case KVM_DEV_ARM_VGIC_CTRL_INIT:
+                       mutex_lock(&dev->kvm->lock);
+                       r = vgic_init(dev->kvm);
+                       mutex_unlock(&dev->kvm->lock);
+                       return r;
+               }
+               break;
+       }
+       }
+
+       return -ENXIO;
+}
+
+/*
+ * Handle the "get" side of the attribute groups shared by the v2 and v3
+ * devices: base addresses and number of interrupts.
+ *
+ * Returns -ENXIO for any group not handled here, which callers use to
+ * fall through to model specific handling.
+ */
+static int vgic_get_common_attr(struct kvm_device *dev,
+                               struct kvm_device_attr *attr)
+{
+       int r = -ENXIO;
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+               u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+               u64 addr;
+               unsigned long type = (unsigned long)attr->attr;
+
+               r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+               /* -ENODEV from kvm_vgic_addr() is reported as -ENXIO. */
+               if (r)
+                       return (r == -ENODEV) ? -ENXIO : r;
+
+               if (copy_to_user(uaddr, &addr, sizeof(addr)))
+                       return -EFAULT;
+               break;
+       }
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+               u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+               /* Report the total: SPIs plus the private interrupts. */
+               r = put_user(dev->kvm->arch.vgic.nr_spis +
+                            VGIC_NR_PRIVATE_IRQS, uaddr);
+               break;
+       }
+       }
+
+       return r;
+}
+
+/* kvm_device_ops.create callback: forwards to kvm_vgic_create(). */
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+       return kvm_vgic_create(dev->kvm, type);
+}
+
+/* kvm_device_ops.destroy callback: frees the kvm_device structure itself. */
+static void vgic_destroy(struct kvm_device *dev)
+{
+       kfree(dev);
+}
+
+/*
+ * Register the kvm_device_ops matching @type (KVM_DEV_TYPE_ARM_VGIC_V2,
+ * or _V3 when CONFIG_KVM_ARM_VGIC_V3 is set). Any other type is ignored.
+ */
+void kvm_register_vgic_device(unsigned long type)
+{
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
+               kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+                                       KVM_DEV_TYPE_ARM_VGIC_V2);
+               return;
+       }
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       if (type == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+                                       KVM_DEV_TYPE_ARM_VGIC_V3);
+               return;
+       }
+#endif
+}
+
+/**
+ * vgic_attr_regs_access: allows user space to read/write VGIC registers
+ *
+ * @dev: kvm device handle
+ * @attr: kvm device attribute
+ * @reg: address the value is read or written
+ * @is_write: write flag
+ *
+ * Initializes the VGIC if needed, then performs the access with kvm->lock
+ * and the mutex of every VCPU held.
+ *
+ * Return: 0 on success, the error from vgic_init(), -EINVAL if the CPU id
+ * encoded in @attr is not an online VCPU or the group is unknown, -EBUSY
+ * if a VCPU mutex could not be taken, or the error from the uaccess
+ * helper.
+ */
+static int vgic_attr_regs_access(struct kvm_device *dev,
+                                struct kvm_device_attr *attr,
+                                u32 *reg, bool is_write)
+{
+       gpa_t addr;
+       int cpuid, ret, c;
+       struct kvm_vcpu *vcpu, *tmp_vcpu;
+       int vcpu_lock_idx = -1;
+
+       cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+                KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+       addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+       mutex_lock(&dev->kvm->lock);
+
+       ret = vgic_init(dev->kvm);
+       if (ret)
+               goto out;
+
+       if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Look the VCPU up only now: cpuid comes straight from user space
+        * and must not be handed to kvm_get_vcpu() before it has been
+        * validated against the number of online VCPUs.
+        */
+       vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+
+       /*
+        * Any time a vcpu is run, vcpu_load is called which tries to grab the
+        * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+        * that no other VCPUs are run and fiddle with the vgic state while we
+        * access it.
+        */
+       ret = -EBUSY;
+       kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
+               if (!mutex_trylock(&tmp_vcpu->mutex))
+                       goto out;
+               vcpu_lock_idx = c;
+       }
+
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+               ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+out:
+       /* Drop the VCPU mutexes we hold, in reverse order of acquisition. */
+       for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+               tmp_vcpu = kvm_get_vcpu(dev->kvm, vcpu_lock_idx);
+               mutex_unlock(&tmp_vcpu->mutex);
+       }
+
+       mutex_unlock(&dev->kvm->lock);
+       return ret;
+}
+
+/* V2 ops */
+
+/*
+ * Set a GICv2 device attribute: the common attributes are tried first,
+ * then the distributor / CPU interface register groups. Returns -ENXIO
+ * for anything else and -EFAULT if the user value cannot be read.
+ */
+static int vgic_v2_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       u32 __user *uaddr;
+       u32 reg;
+       int ret = vgic_set_common_attr(dev, attr);
+
+       /* Anything but -ENXIO means the common code dealt with it. */
+       if (ret != -ENXIO)
+               return ret;
+
+       if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS &&
+           attr->group != KVM_DEV_ARM_VGIC_GRP_CPU_REGS)
+               return -ENXIO;
+
+       uaddr = (u32 __user *)(long)attr->addr;
+       if (get_user(reg, uaddr))
+               return -EFAULT;
+
+       return vgic_attr_regs_access(dev, attr, &reg, true);
+}
+
+/*
+ * Get a GICv2 device attribute: the common attributes are tried first,
+ * then the distributor / CPU interface register groups, whose value is
+ * copied to user space. Returns -ENXIO for anything else.
+ */
+static int vgic_v2_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       u32 __user *uaddr;
+       u32 reg = 0;
+       int ret = vgic_get_common_attr(dev, attr);
+
+       /* Anything but -ENXIO means the common code dealt with it. */
+       if (ret != -ENXIO)
+               return ret;
+
+       if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS &&
+           attr->group != KVM_DEV_ARM_VGIC_GRP_CPU_REGS)
+               return -ENXIO;
+
+       uaddr = (u32 __user *)(long)attr->addr;
+       ret = vgic_attr_regs_access(dev, attr, &reg, false);
+       if (ret)
+               return ret;
+
+       return put_user(reg, uaddr);
+}
+
+/*
+ * Tell whether the GICv2 device supports the given attribute: 0 if it
+ * does, -ENXIO if not. Register groups are delegated to
+ * vgic_v2_has_attr_regs().
+ */
+static int vgic_v2_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               if (attr->attr == KVM_VGIC_V2_ADDR_TYPE_DIST ||
+                   attr->attr == KVM_VGIC_V2_ADDR_TYPE_CPU)
+                       return 0;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+       case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+               return vgic_v2_has_attr_regs(dev, attr);
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               if (attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT)
+                       return 0;
+               break;
+       }
+
+       return -ENXIO;
+}
+
+/* Device callbacks for the GICv2 flavour of the VGIC KVM device. */
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+       .name = "kvm-arm-vgic-v2",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v2_set_attr,
+       .get_attr = vgic_v2_get_attr,
+       .has_attr = vgic_v2_has_attr,
+};
+
+/* V3 ops */
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+
+/* GICv3 "set": only the attributes common to v2 and v3 are handled. */
+static int vgic_v3_set_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       return vgic_set_common_attr(dev, attr);
+}
+
+/* GICv3 "get": only the attributes common to v2 and v3 are handled. */
+static int vgic_v3_get_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       return vgic_get_common_attr(dev, attr);
+}
+
+/*
+ * Tell whether the GICv3 device supports the given attribute: 0 if it
+ * does, -ENXIO if not.
+ */
+static int vgic_v3_has_attr(struct kvm_device *dev,
+                           struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_ARM_VGIC_GRP_ADDR:
+               if (attr->attr == KVM_VGIC_V3_ADDR_TYPE_DIST ||
+                   attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST)
+                       return 0;
+               break;
+       case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+               return 0;
+       case KVM_DEV_ARM_VGIC_GRP_CTRL:
+               if (attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT)
+                       return 0;
+               break;
+       }
+
+       return -ENXIO;
+}
+
+/* Device callbacks for the GICv3 flavour of the VGIC KVM device. */
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+       .name = "kvm-arm-vgic-v3",
+       .create = vgic_create,
+       .destroy = vgic_destroy,
+       .set_attr = vgic_v3_set_attr,
+       .get_attr = vgic_v3_get_attr,
+       .has_attr = vgic_v3_has_attr,
+};
+
+#endif /* CONFIG_KVM_ARM_VGIC_V3 */
+
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
new file mode 100644 (file)
index 0000000..a213936
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * VGICv2 MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Read handler for the words at the start of the GICv2 distributor, picked
+ * by bits [3:2] of the address: GIC_DIST_CTRL, GIC_DIST_CTR and
+ * GIC_DIST_IIDR. Any other word handled here reads as zero.
+ */
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u32 reg = 0;
+
+       switch (addr & 0x0c) {
+       case GIC_DIST_CTRL:
+               if (kvm->arch.vgic.enabled)
+                       reg = GICD_ENABLE;
+               break;
+       case GIC_DIST_CTR:
+               /* Low bits: (total number of IRQs / 32) - 1 ... */
+               reg = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+               reg = (reg >> 5) - 1;
+               /* ... and from bit 5 upwards: number of online VCPUs - 1. */
+               reg |= (atomic_read(&kvm->online_vcpus) - 1) << 5;
+               break;
+       case GIC_DIST_IIDR:
+               reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+               break;
+       }
+
+       return reg;
+}
+
+/*
+ * Write handler matching vgic_mmio_read_v2_misc(). Only GIC_DIST_CTRL is
+ * writable; the other words handled here ignore writes.
+ */
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       bool enabled_before;
+
+       if ((addr & 0x0c) != GIC_DIST_CTRL)
+               return;
+
+       enabled_before = dist->enabled;
+       dist->enabled = val & GICD_ENABLE;
+
+       /* Only a disabled -> enabled transition requires kicking the VCPUs. */
+       if (!enabled_before && dist->enabled)
+               vgic_kick_vcpus(vcpu->kvm);
+}
+
+/*
+ * Write handler for GIC_DIST_SOFTINT: raise a software generated interrupt.
+ *
+ * The written value is decoded as: SGI number in bits [3:0], target VCPU
+ * bitmap in bits [23:16] and the target filter mode in bits [25:24].
+ * For every targeted VCPU the SGI is marked pending and the writing VCPU
+ * is recorded in the interrupt's source bitmap.
+ */
+static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
+                                gpa_t addr, unsigned int len,
+                                unsigned long val)
+{
+       int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
+       int intid = val & 0xf;
+       int targets = (val >> 16) & 0xff;
+       int mode = (val >> 24) & 0x03;
+       int c;
+       struct kvm_vcpu *vcpu;
+
+       /* Turn the filter mode into an explicit bitmap of target VCPUs. */
+       switch (mode) {
+       case 0x0:               /* as specified by targets */
+               break;
+       case 0x1:
+               targets = (1U << nr_vcpus) - 1;                 /* all, ... */
+               targets &= ~(1U << source_vcpu->vcpu_id);       /* but self */
+               break;
+       case 0x2:               /* this very vCPU only */
+               targets = (1U << source_vcpu->vcpu_id);
+               break;
+       case 0x3:               /* reserved */
+               return;
+       }
+
+       kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+               struct vgic_irq *irq;
+
+               if (!(targets & (1U << c)))
+                       continue;
+
+               irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+               irq->source |= 1U << source_vcpu->vcpu_id;
+
+               /*
+                * No spin_unlock() here: presumably vgic_queue_irq_unlock()
+                * releases irq_lock, as its name suggests - confirm in vgic.c.
+                */
+               vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+       }
+}
+
+/*
+ * Read handler for GIC_DIST_TARGET: one byte per interrupt, each holding
+ * that interrupt's target bitmap. @len bytes are assembled, lowest
+ * interrupt number in the least significant byte.
+ */
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 8);
+       u64 result = 0;
+       int byte;
+
+       for (byte = 0; byte < len; byte++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + byte);
+               result |= (u64)irq->targets << (byte * 8);
+       }
+
+       return result;
+}
+
+/*
+ * Write handler for GIC_DIST_TARGET: one byte per interrupt, each holding
+ * the new target bitmap for that interrupt. Writes that start at a private
+ * interrupt are dropped.
+ */
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+
+       /* GICD_ITARGETSR[0-7] are read-only */
+       if (intid < VGIC_NR_PRIVATE_IRQS)
+               return;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+               int target;
+
+               spin_lock(&irq->irq_lock);
+
+               /*
+                * Store the whole bitmap, but deliver to a single VCPU: the
+                * lowest numbered one set in the bitmap, or VCPU 0 if the
+                * bitmap is empty.
+                */
+               irq->targets = (val >> (i * 8)) & 0xff;
+               target = irq->targets ? __ffs(irq->targets) : 0;
+               irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Read handler shared by GIC_DIST_SGI_PENDING_SET and
+ * GIC_DIST_SGI_PENDING_CLEAR: one byte per SGI, each holding the bitmap of
+ * source VCPUs recorded for that SGI on the reading VCPU.
+ */
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       u32 first_sgi = addr & 0x0f;
+       u64 result = 0;
+       int byte;
+
+       for (byte = 0; byte < len; byte++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_sgi + byte);
+               result |= (u64)irq->source << (byte * 8);
+       }
+
+       return result;
+}
+
+/*
+ * Write handler for GIC_DIST_SGI_PENDING_CLEAR: one byte per SGI, each a
+ * bitmap of source VCPUs to remove from that SGI on the writing VCPU.
+ */
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               /* The SGI stays pending as long as any source remains. */
+               irq->source &= ~((val >> (i * 8)) & 0xff);
+               if (!irq->source)
+                       irq->pending = false;
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Write handler for GIC_DIST_SGI_PENDING_SET: one byte per SGI, each a
+ * bitmap of source VCPUs to add to that SGI on the writing VCPU.
+ */
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len,
+                                    unsigned long val)
+{
+       u32 intid = addr & 0x0f;
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               irq->source |= (val >> (i * 8)) & 0xff;
+
+               /*
+                * With at least one source the SGI becomes pending and is
+                * queued; that path has no spin_unlock() of its own, so
+                * vgic_queue_irq_unlock() presumably drops irq_lock (confirm
+                * in vgic.c). Otherwise the lock is released here.
+                */
+               if (irq->source) {
+                       irq->pending = true;
+                       vgic_queue_irq_unlock(vcpu->kvm, irq);
+               } else {
+                       spin_unlock(&irq->irq_lock);
+               }
+       }
+}
+
+/* Write @vmcr through the backend matching the host GIC type (v2 or v3). */
+static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type != VGIC_V2) {
+               vgic_v3_set_vmcr(vcpu, vmcr);
+               return;
+       }
+
+       vgic_v2_set_vmcr(vcpu, vmcr);
+}
+
+/* Fill @vmcr through the backend matching the host GIC type (v2 or v3). */
+static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+       if (kvm_vgic_global_state.type != VGIC_V2) {
+               vgic_v3_get_vmcr(vcpu, vmcr);
+               return;
+       }
+
+       vgic_v2_get_vmcr(vcpu, vmcr);
+}
+
+#define GICC_ARCH_VERSION_V2   0x2
+
+/*
+ * CPU interface register accessors. These are for userland accesses only,
+ * there is no guest-facing emulation.
+ *
+ * The read side fetches the current VMCR state and returns the field that
+ * corresponds to the addressed register; unknown offsets read as zero.
+ */
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+                                          gpa_t addr, unsigned int len)
+{
+       struct vgic_vmcr regs;
+       u32 result = 0;
+
+       vgic_get_vmcr(vcpu, &regs);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               result = regs.ctlr;
+               break;
+       case GIC_CPU_PRIMASK:
+               result = regs.pmr;
+               break;
+       case GIC_CPU_BINPOINT:
+               result = regs.bpr;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               result = regs.abpr;
+               break;
+       case GIC_CPU_IDENT:
+               result = ((PRODUCT_ID_KVM << 20) |
+                         (GICC_ARCH_VERSION_V2 << 16) |
+                         IMPLEMENTER_ARM);
+               break;
+       }
+
+       return result;
+}
+
+/*
+ * Write side of the userland CPU interface accessors: read-modify-write of
+ * the VMCR state. Offsets without a matching field leave the state as read,
+ * but it is still written back.
+ */
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+                                  gpa_t addr, unsigned int len,
+                                  unsigned long val)
+{
+       struct vgic_vmcr regs;
+
+       vgic_get_vmcr(vcpu, &regs);
+
+       switch (addr & 0xff) {
+       case GIC_CPU_CTRL:
+               regs.ctlr = val;
+               break;
+       case GIC_CPU_PRIMASK:
+               regs.pmr = val;
+               break;
+       case GIC_CPU_BINPOINT:
+               regs.bpr = val;
+               break;
+       case GIC_CPU_ALIAS_BINPOINT:
+               regs.abpr = val;
+               break;
+       default:
+               break;
+       }
+
+       vgic_set_vmcr(vcpu, &regs);
+}
+
+/*
+ * Register map of the emulated GICv2 distributor. Each entry names the
+ * register offset, its read and write handlers, either a fixed length in
+ * bytes (REGISTER_DESC_WITH_LENGTH) or the number of bits used per
+ * interrupt (REGISTER_DESC_WITH_BITS_PER_IRQ), and the permitted access
+ * widths.
+ */
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_CTRL,
+               vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
+               vgic_mmio_read_target, vgic_mmio_write_target, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
+               vgic_mmio_read_config, vgic_mmio_write_config, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
+               vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
+               vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+};
+
+/*
+ * Register map of the GICv2 CPU interface. It is referenced only by the
+ * userland attribute paths in this file (vgic_v2_has_attr_regs() and
+ * vgic_v2_cpuif_uaccess()). The active priority registers are read-as-zero
+ * and write-ignored.
+ */
+static const struct vgic_register_region vgic_v2_cpu_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 16,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
+               vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+               VGIC_ACCESS_32bit),
+};
+
+/*
+ * Prepare @dev as the GICv2 distributor I/O device: hook up the common GIC
+ * I/O ops and the v2 distributor register map.
+ * Returns SZ_4K.
+ */
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
+{
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+       dev->regions = vgic_v2_dist_registers;
+
+       return SZ_4K;
+}
+
+/*
+ * Check whether the register addressed by a DIST_REGS or CPU_REGS device
+ * attribute exists. The offset is taken from attr->attr and must be 32-bit
+ * aligned and fall inside one of the regions of the selected register map.
+ * Returns 0 if the register exists, -ENXIO otherwise.
+ */
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+       gpa_t addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+       const struct vgic_register_region *region, *end;
+       int len;
+
+       if (attr->group == KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
+               region = vgic_v2_dist_registers;
+               end = region + ARRAY_SIZE(vgic_v2_dist_registers);
+       } else if (attr->group == KVM_DEV_ARM_VGIC_GRP_CPU_REGS) {
+               region = vgic_v2_cpu_registers;
+               end = region + ARRAY_SIZE(vgic_v2_cpu_registers);
+       } else {
+               return -ENXIO;
+       }
+
+       /* We only support aligned 32-bit accesses. */
+       if (addr & 3)
+               return -ENXIO;
+
+       for (; region < end; region++) {
+               /* Per-IRQ regions are sized by the number of IRQs in use. */
+               if (region->bits_per_irq)
+                       len = (region->bits_per_irq * nr_irqs) / 8;
+               else
+                       len = region->len;
+
+               if (region->reg_offset <= addr &&
+                   region->reg_offset + len > addr)
+                       return 0;
+       }
+
+       return -ENXIO;
+}
+
+/*
+ * Userland accesses reuse the guest MMIO handlers. To do so we hand them a
+ * struct vgic_io_device and a 4-byte buffer laid out as it would be for a
+ * real guest MMIO access, converting between host and MMIO bus byte order
+ * on the way in (writes) and on the way out (reads).
+ *
+ * On a successful read the result is stored in *val; on a failed read *val
+ * is left untouched. Returns whatever the I/O op returned.
+ */
+static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+                       bool is_write, int offset, u32 *val)
+{
+       unsigned int len = 4;
+       u8 buf[4];
+       int ret;
+
+       if (!is_write) {
+               ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf);
+               if (ret)
+                       return ret;
+
+               *val = vgic_data_mmio_bus_to_host(buf, len);
+               return ret;
+       }
+
+       vgic_data_host_to_mmio_bus(buf, len, *val);
+       ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf);
+
+       return ret;
+}
+
+/*
+ * Userland access to a GICv2 CPU interface register at @offset, using a
+ * temporary on-stack I/O device that carries the CPU interface register map.
+ */
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val)
+{
+       struct vgic_io_device cpuif_dev = {
+               .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+               .regions = vgic_v2_cpu_registers,
+       };
+
+       return vgic_uaccess(vcpu, &cpuif_dev, is_write, offset, val);
+}
+
+/*
+ * Userland access to a GICv2 distributor register at @offset, using a
+ * temporary on-stack I/O device that carries the distributor register map.
+ */
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val)
+{
+       struct vgic_io_device dist_dev = {
+               .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+               .regions = vgic_v2_dist_registers,
+       };
+
+       return vgic_uaccess(vcpu, &dist_dev, is_write, offset, val);
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
new file mode 100644 (file)
index 0000000..a0c515a
--- /dev/null
@@ -0,0 +1,455 @@
+/*
+ * VGICv3 MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Extract @num bytes starting @offset bytes into @data.
+ *
+ * @data is taken as a u64 so that 64-bit register values keep their upper
+ * half even where unsigned long is only 32 bits wide; the mask is built
+ * with GENMASK_ULL for the same reason. @num must be between 1 and 8.
+ */
+static unsigned long extract_bytes(u64 data, unsigned int offset,
+                                  unsigned int num)
+{
+       return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
+}
+
+/*
+ * Read handler for the words at the start of the GICv3 distributor, picked
+ * by bits [3:2] of the address: GICD_CTLR, GICD_TYPER and GICD_IIDR. Any
+ * other word handled here reads as zero.
+ */
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       u32 reg = 0;
+
+       switch (addr & 0x0c) {
+       case GICD_CTLR:
+               /* ARE_NS and DS are always reported as set. */
+               reg = GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+               if (dist->enabled)
+                       reg |= GICD_CTLR_ENABLE_SS_G1;
+               break;
+       case GICD_TYPER:
+               /* Low bits: (total number of IRQs / 32) - 1. */
+               reg = dist->nr_spis + VGIC_NR_PRIVATE_IRQS;
+               reg = (reg >> 5) - 1;
+               reg |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+               break;
+       case GICD_IIDR:
+               reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+               break;
+       }
+
+       return reg;
+}
+
+/*
+ * Write handler matching vgic_mmio_read_v3_misc(). Only the enable bit of
+ * GICD_CTLR is acted upon; the other words handled here ignore writes.
+ */
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       bool enabled_before;
+
+       if ((addr & 0x0c) != GICD_CTLR)
+               return;
+
+       enabled_before = dist->enabled;
+       dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+       /* Only a disabled -> enabled transition requires kicking the VCPUs. */
+       if (!enabled_before && dist->enabled)
+               vgic_kick_vcpus(vcpu->kvm);
+}
+
+/*
+ * Read handler for GICD_IROUTER (64 bits per interrupt). Returns the
+ * requested bytes of the affinity value stored for the interrupt; unknown
+ * interrupts and the upper 32-bit word read as zero.
+ */
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+                                           gpa_t addr, unsigned int len)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+       unsigned long mpidr;
+
+       /* No such interrupt, or the upper word, which is RAZ for us. */
+       if (!irq || (addr & 4))
+               return 0;
+
+       mpidr = READ_ONCE(irq->mpidr);
+
+       return extract_bytes(mpidr, addr & 7, len);
+}
+
+/*
+ * Write handler for GICD_IROUTER (64 bits per interrupt): store the new
+ * affinity value for the interrupt and re-derive its target VCPU from it.
+ * Writes to unknown interrupts or to the upper 32-bit word are ignored.
+ */
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len,
+                                   unsigned long val)
+{
+       int intid = VGIC_ADDR_TO_INTID(addr, 64);
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+       if (!irq)
+               return;
+
+       /* The upper word is WI for us since we don't implement Aff3. */
+       if (addr & 4)
+               return;
+
+       /* mpidr and target_vcpu are updated together under irq_lock. */
+       spin_lock(&irq->irq_lock);
+
+       /* We only care about and preserve Aff0, Aff1 and Aff2. */
+       irq->mpidr = val & GENMASK(23, 0);
+       irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+       spin_unlock(&irq->irq_lock);
+}
+
+/*
+ * Read handler for GICR_TYPER. The 64-bit value is assembled from:
+ *  - bits [55:32]: Aff0..Aff2 of the VCPU's MPIDR,
+ *  - from bit 8:   the VCPU id (16 bits),
+ *  - GICR_TYPER_LAST if this is the VCPU with the highest online index.
+ * The requested bytes are then extracted according to @addr and @len.
+ */
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       int target_vcpu_id = vcpu->vcpu_id;
+       u64 value;
+
+       /*
+        * Widen before shifting: shifting an unsigned long by 32 is undefined
+        * where unsigned long is only 32 bits wide.
+        */
+       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+       value |= ((target_vcpu_id & 0xffff) << 8);
+       if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
+               value |= GICR_TYPER_LAST;
+
+       return extract_bytes(value, addr & 7, len);
+}
+
+/* Read handler for GICR_IIDR: a constant built from product and implementer. */
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+                                            gpa_t addr, unsigned int len)
+{
+       unsigned long iidr = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+
+       return iidr;
+}
+
+/*
+ * Read handler for the identification register block. Only GICD_PIDR2 has
+ * a value; every other ID register reads as zero.
+ */
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
+{
+       /* report a GICv3 compliant implementation */
+       if ((addr & 0xffff) == GICD_PIDR2)
+               return 0x3b;
+
+       return 0;
+}
+
+/*
+ * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
+ * redistributors, while SPIs are covered by registers in the distributor
+ * block. Trying to set private IRQs in this block gets ignored.
+ * We take some special care here to fix the calculation of the register
+ * offset.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc)  \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,                \
+               .access_flags = acc,                                    \
+               .read = vgic_mmio_read_raz,                             \
+               .write = vgic_mmio_write_wi,                            \
+       }, {                                                            \
+               .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,   \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,       \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+/*
+ * Register map of the emulated GICv3 distributor. The per-IRQ registers use
+ * REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED, so each of them contributes two
+ * entries: a RAZ/WI part for the private interrupts and a part using the
+ * listed handlers for the rest. GICD_ITARGETSR and GICD_IGRPMODR are
+ * RAZ/WI throughout.
+ */
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICD_CTLR,
+               vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+               vgic_mmio_read_config, vgic_mmio_write_config, 2,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 1,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+               vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+};
+
+/*
+ * Register map of the first 64K frame of each emulated redistributor (the
+ * frame registered at rd_base by vgic_register_redist_iodevs()). Only
+ * GICR_IIDR, GICR_TYPER and the ID registers return data; the remaining
+ * entries are RAZ/WI.
+ */
+static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+               vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+               vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+               VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+               vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+               VGIC_ACCESS_32bit),
+};
+
+/*
+ * Register map of the second 64K frame of each emulated redistributor (the
+ * frame registered at sgi_base by vgic_register_redist_iodevs()). These
+ * entries reuse the generic per-IRQ handlers with fixed lengths;
+ * GICR_IGRPMODR0 and GICR_NSACR are RAZ/WI.
+ */
+static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
+       REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0,
+               vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_senable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0,
+               vgic_mmio_read_enable, vgic_mmio_write_cenable, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_spending, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0,
+               vgic_mmio_read_pending, vgic_mmio_write_cpending, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_sactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0,
+               vgic_mmio_read_active, vgic_mmio_write_cactive, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0,
+               vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+               VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0,
+               vgic_mmio_read_config, vgic_mmio_write_config, 8,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+       REGISTER_DESC_WITH_LENGTH(GICR_NSACR,
+               vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+               VGIC_ACCESS_32bit),
+};
+
+/*
+ * Prepare @dev as the GICv3 distributor I/O device: hook up the common GIC
+ * I/O ops and the v3 distributor register map.
+ * Returns SZ_64K.
+ */
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+       kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+       dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+       dev->regions = vgic_v3_dist_registers;
+
+       return SZ_64K;
+}
+
+/*
+ * Register the redistributor MMIO frames of all VCPUs on the KVM MMIO bus.
+ *
+ * Every VCPU gets two consecutive 64K frames starting at
+ * @redist_base_address + index * 128K: the first uses the rdbase register
+ * map, the second the sgibase register map. The I/O devices for all VCPUs
+ * live in one array which is stored in kvm->arch.vgic.redist_iodevs on
+ * success.
+ *
+ * If any registration fails, everything registered so far is unregistered
+ * again and the array is freed. All bus (un)registrations are done with
+ * kvm->slots_lock held.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
+ * error returned by kvm_io_bus_register_dev().
+ */
+int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
+{
+       int nr_vcpus = atomic_read(&kvm->online_vcpus);
+       struct kvm_vcpu *vcpu;
+       struct vgic_io_device *devices;
+       int c, ret = 0;
+
+       /* Two devices per VCPU; kmalloc_array() checks the multiplication. */
+       devices = kmalloc_array(nr_vcpus * 2, sizeof(*devices), GFP_KERNEL);
+       if (!devices)
+               return -ENOMEM;
+
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
+               gpa_t sgi_base = rd_base + SZ_64K;
+               struct vgic_io_device *rd_dev = &devices[c * 2];
+               struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+
+               kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+               rd_dev->base_addr = rd_base;
+               rd_dev->regions = vgic_v3_rdbase_registers;
+               rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
+               rd_dev->redist_vcpu = vcpu;
+
+               mutex_lock(&kvm->slots_lock);
+               ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+                                             SZ_64K, &rd_dev->dev);
+               mutex_unlock(&kvm->slots_lock);
+
+               if (ret)
+                       break;
+
+               kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
+               sgi_dev->base_addr = sgi_base;
+               sgi_dev->regions = vgic_v3_sgibase_registers;
+               sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
+               sgi_dev->redist_vcpu = vcpu;
+
+               mutex_lock(&kvm->slots_lock);
+               ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base,
+                                             SZ_64K, &sgi_dev->dev);
+               /* Undo this VCPU's first frame while still holding the lock. */
+               if (ret)
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &rd_dev->dev);
+               mutex_unlock(&kvm->slots_lock);
+
+               if (ret)
+                       break;
+       }
+
+       if (ret) {
+               /* The current c failed, so we start with the previous one. */
+               mutex_lock(&kvm->slots_lock);
+               for (c--; c >= 0; c--) {
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &devices[c * 2].dev);
+                       kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+                                                 &devices[c * 2 + 1].dev);
+               }
+               mutex_unlock(&kvm->slots_lock);
+               kfree(devices);
+       } else {
+               kvm->arch.vgic.redist_iodevs = devices;
+       }
+
+       return ret;
+}
+
+/*
+ * Check whether @vcpu is addressed by an SGI request taken from
+ * ICC_SGI1R_EL1, given as the affinity levels 1-3 (@sgi_aff) and a bitmask
+ * of level 0 affinities (@sgi_cpu_mask).
+ * Returns the VCPU's level 0 affinity if it is addressed, -1 otherwise.
+ */
+static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+{
+       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       int aff0 = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+
+       /*
+        * The request matches only if the VCPU's upper three affinity levels
+        * equal the requested ones and its level 0 bit is set in the mask.
+        */
+       if (sgi_aff != (mpidr & ~MPIDR_LEVEL_MASK) ||
+           !(sgi_cpu_mask & BIT(aff0)))
+               return -1;
+
+       return aff0;
+}
+
+/*
+ * The ICC_SGI* registers encode the affinity differently from the MPIDR,
+ * so provide a wrapper to use the existing defines to isolate a certain
+ * affinity level.
+ *
+ * @level is pasted into the ICC_SGI1R_AFFINITY_<level>_MASK/_SHIFT names,
+ * so it must be a literal level number. The extracted field is moved to
+ * the position that level occupies in an MPIDR (MPIDR_LEVEL_SHIFT(level)),
+ * so the results for several levels can simply be ORed together.
+ */
+#define SGI_AFFINITY_LEVEL(reg, level) \
+       ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
+       >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/**
+ * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
+ * @vcpu: The VCPU requesting a SGI
+ * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
+ *
+ * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
+ * This will trap in sys_regs.c and call this function.
+ * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
+ * target processors as well as a bitmask of 16 Aff0 CPUs.
+ * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
+ * check for matching ones. If this bit is set, we signal all, but not the
+ * calling VCPU.
+ */
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *c_vcpu;
+       u16 target_cpus;
+       u64 mpidr;
+       int sgi, c;
+       int vcpu_id = vcpu->vcpu_id;
+       bool broadcast;
+
+       /* Decode the SGI number, routing mode and target list from @reg. */
+       sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
+       broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+       target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
+       /* Rebuild affinity levels 1-3 in MPIDR layout for match_mpidr(). */
+       mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+       mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+
+       /*
+        * We iterate over all VCPUs to find the MPIDRs matching the request.
+        * If we have handled one CPU, we clear its bit to detect early
+        * if we are already finished. This avoids iterating through all
+        * VCPUs when most of the times we just signal a single VCPU.
+        */
+       kvm_for_each_vcpu(c, c_vcpu, kvm) {
+               struct vgic_irq *irq;
+
+               /* Exit early if we have dealt with all requested CPUs */
+               if (!broadcast && target_cpus == 0)
+                       break;
+
+               /* Don't signal the calling VCPU */
+               if (broadcast && c == vcpu_id)
+                       continue;
+
+               if (!broadcast) {
+                       int level0;
+
+                       level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
+                       if (level0 == -1)
+                               continue;
+
+                       /* remove this matching VCPU from the mask */
+                       target_cpus &= ~BIT(level0);
+               }
+
+               irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+
+               /*
+                * No spin_unlock() here: presumably vgic_queue_irq_unlock()
+                * releases irq_lock, as its name suggests - confirm in vgic.c.
+                */
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
new file mode 100644 (file)
index 0000000..059595e
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * VGIC MMIO handling functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/bsearch.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/* Read handler for read-as-zero registers: always returns 0. */
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return 0;
+}
+
+/* Read handler for read-as-one registers: returns a value with all bits set. */
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len)
+{
+       return -1UL;
+}
+
+/* Write handler for write-ignored registers: the written value is discarded. */
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val)
+{
+       /* Ignore */
+}
+
+/*
+ * GICD_ISENABLER and GICD_ICENABLER read back identically (the current
+ * enable state of each interrupt), so a single read handler serves both.
+ *
+ * Returns a bitmap with one bit per interrupt covered by the access;
+ * a set bit means the corresponding interrupt is enabled.
+ */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 enabled_mask = 0;
+       int bit;
+
+       /* One bit per interrupt: visit every IRQ covered by this access */
+       for (bit = 0; bit < len * 8; bit++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + bit);
+               if (!irq->enabled)
+                       continue;
+
+               enabled_mask |= 1U << bit;
+       }
+
+       return enabled_mask;
+}
+
+/*
+ * Set-enable write handler: every bit set in @val enables the corresponding
+ * interrupt; zero bits leave their interrupt untouched.
+ */
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               irq->enabled = true;
+               /*
+                * No explicit unlock: vgic_queue_irq_unlock() presumably
+                * drops irq_lock itself (defined elsewhere).
+                */
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
+
+/*
+ * Clear-enable write handler: every bit set in @val disables the
+ * corresponding interrupt; zero bits leave their interrupt untouched.
+ */
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               irq->enabled = false;
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Read handler for the pending registers: returns a bitmap with one bit
+ * per interrupt covered by the access, set when that interrupt is pending.
+ */
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 pending_mask = 0;
+       int bit;
+
+       /* One bit per interrupt: visit every IRQ covered by this access */
+       for (bit = 0; bit < len * 8; bit++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + bit);
+               if (!irq->pending)
+                       continue;
+
+               pending_mask |= 1U << bit;
+       }
+
+       return pending_mask;
+}
+
+/*
+ * Set-pending write handler: every bit set in @val marks the corresponding
+ * interrupt pending. For level-configured interrupts the software-pending
+ * flag is recorded as well.
+ */
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               irq->pending = true;
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       irq->soft_pending = true;
+
+               /* presumably releases irq_lock (defined elsewhere) */
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       }
+}
+
+/*
+ * Clear-pending write handler: every bit set in @val clears the pending
+ * state of the corresponding interrupt.
+ */
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int i;
+
+       for_each_set_bit(i, &val, len * 8) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+
+               /*
+                * For level interrupts only the software-pending part is
+                * dropped; the interrupt stays pending while line_level is
+                * still set.
+                */
+               if (irq->config == VGIC_CONFIG_LEVEL) {
+                       irq->soft_pending = false;
+                       irq->pending = irq->line_level;
+               } else {
+                       irq->pending = false;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Read handler for the active registers: returns a bitmap with one bit
+ * per interrupt covered by the access, set when that interrupt is active.
+ */
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 1);
+       u32 active_mask = 0;
+       int bit;
+
+       /* One bit per interrupt: visit every IRQ covered by this access */
+       for (bit = 0; bit < len * 8; bit++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + bit);
+               if (!irq->active)
+                       continue;
+
+               active_mask |= 1U << bit;
+       }
+
+       return active_mask;
+}
+
+/*
+ * Set the active state of @irq to @new_active_state, first waiting until
+ * no running VCPU can still hold state for this interrupt. The callers in
+ * this file halt the relevant VCPU(s) beforehand, see
+ * vgic_change_active_prepare().
+ */
+static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+                                   bool new_active_state)
+{
+       spin_lock(&irq->irq_lock);
+       /*
+        * If this virtual IRQ was written into a list register, we
+        * have to make sure the CPU that runs the VCPU thread has
+        * synced back LR state to the struct vgic_irq.  We can only
+        * know this for sure, when either this irq is not assigned to
+        * anyone's AP list anymore, or the VCPU thread is not
+        * running on any CPUs.
+        *
+        * In the opposite case, we know the VCPU thread may be on its
+        * way back from the guest and still has to sync back this
+        * IRQ, so we release and re-acquire the spin_lock to let the
+        * other thread sync back the IRQ.
+        */
+       while (irq->vcpu && /* IRQ may have state in an LR somewhere */
+              irq->vcpu->cpu != -1) { /* VCPU thread is running */
+               /* Reaching this point for a private interrupt is treated as a bug */
+               BUG_ON(irq->intid < VGIC_NR_PRIVATE_IRQS);
+               cond_resched_lock(&irq->irq_lock);
+       }
+
+       irq->active = new_active_state;
+       /*
+        * When activating, hand the locked IRQ to vgic_queue_irq_unlock()
+        * (which presumably drops irq_lock); otherwise unlock directly.
+        */
+       if (new_active_state)
+               vgic_queue_irq_unlock(vcpu->kvm, irq);
+       else
+               spin_unlock(&irq->irq_lock);
+}
+
+/*
+ * Before touching an IRQ's active state we must be sure the IRQ is not
+ * sitting in the LRs of a running VCPU: otherwise our update could be
+ * overwritten when that VCPU syncs its state on the way back from the guest.
+ *
+ * Shared interrupts can migrate between VCPUs while we are not holding the
+ * IRQ locks, so for those the whole guest is stopped rather than chasing a
+ * moving target.
+ *
+ * A private interrupt can only ever be queued on its one VCPU, so stopping
+ * that single VCPU is sufficient.
+ */
+static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (intid >= VGIC_NR_PRIVATE_IRQS) {
+               kvm_arm_halt_guest(vcpu->kvm);
+               return;
+       }
+
+       kvm_arm_halt_vcpu(vcpu);
+}
+
+/*
+ * Counterpart of vgic_change_active_prepare(): resume whatever was halted
+ * there (the single VCPU for private interrupts, the whole guest otherwise).
+ */
+static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+       if (intid >= VGIC_NR_PRIVATE_IRQS) {
+               kvm_arm_resume_guest(vcpu->kvm);
+               return;
+       }
+
+       kvm_arm_resume_vcpu(vcpu);
+}
+
+/*
+ * Clear-active write handler: every bit set in @val deactivates the
+ * corresponding interrupt. The affected VCPU(s) are halted around the
+ * update, see vgic_change_active_prepare().
+ */
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 base_intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int bit;
+
+       vgic_change_active_prepare(vcpu, base_intid);
+
+       for_each_set_bit(bit, &val, len * 8) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, base_intid + bit);
+               vgic_mmio_change_active(vcpu, irq, false);
+       }
+
+       vgic_change_active_finish(vcpu, base_intid);
+}
+
+/*
+ * Set-active write handler: every bit set in @val activates the
+ * corresponding interrupt. The affected VCPU(s) are halted around the
+ * update, see vgic_change_active_prepare().
+ */
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val)
+{
+       u32 base_intid = VGIC_ADDR_TO_INTID(addr, 1);
+       int bit;
+
+       vgic_change_active_prepare(vcpu, base_intid);
+
+       for_each_set_bit(bit, &val, len * 8) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, base_intid + bit);
+               vgic_mmio_change_active(vcpu, irq, true);
+       }
+
+       vgic_change_active_finish(vcpu, base_intid);
+}
+
+/*
+ * Read handler for the priority registers: one byte per interrupt, so an
+ * access of @len bytes returns the priorities of @len consecutive
+ * interrupts, lowest-numbered interrupt in the least significant byte.
+ */
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 8);
+       u64 prios = 0;
+       int byte;
+
+       for (byte = 0; byte < len; byte++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + byte);
+               prios |= (u64)irq->priority << (byte * 8);
+       }
+
+       return prios;
+}
+
+/*
+ * Write handler for the priority registers: byte i of @val holds the new
+ * priority of interrupt (first INTID + i).
+ *
+ * We currently don't handle changing the priority of an interrupt that
+ * is already pending on a VCPU. If there is a need for this, we would
+ * need to make this VCPU exit and re-evaluate the priorities, potentially
+ * leading to this interrupt getting presented now to the guest (if it has
+ * been masked by the priority mask before).
+ */
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+       int i;
+
+       for (i = 0; i < len; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               spin_lock(&irq->irq_lock);
+               /* Narrow the priority range to what we actually support */
+               irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Read handler for the configuration registers: two bits per interrupt,
+ * of which only the upper one is reported (set for edge-configured
+ * interrupts, clear otherwise).
+ */
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len)
+{
+       u32 first_intid = VGIC_ADDR_TO_INTID(addr, 2);
+       u32 cfg = 0;
+       int n;
+
+       /* Four interrupts are described by each byte of the access */
+       for (n = 0; n < len * 4; n++) {
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, first_intid + n);
+               if (irq->config != VGIC_CONFIG_EDGE)
+                       continue;
+
+               cfg |= 2U << (n * 2);
+       }
+
+       return cfg;
+}
+
+/*
+ * Write handler for the configuration registers (two bits per interrupt).
+ * Only the upper bit of each pair is evaluated: set selects edge, clear
+ * selects level configuration. Private interrupts are left unchanged.
+ */
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val)
+{
+       u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+       int i;
+
+       for (i = 0; i < len * 4; i++) {
+               struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+               /*
+                * The configuration cannot be changed for SGIs in general,
+                * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
+                * code relies on PPIs being level triggered, so we also
+                * make them read-only here.
+                */
+               if (intid + i < VGIC_NR_PRIVATE_IRQS)
+                       continue;
+
+               spin_lock(&irq->irq_lock);
+               if (test_bit(i * 2 + 1, &val)) {
+                       irq->config = VGIC_CONFIG_EDGE;
+               } else {
+                       irq->config = VGIC_CONFIG_LEVEL;
+                       /* Recompute pending from the line level and soft-pending state */
+                       irq->pending = irq->line_level | irq->soft_pending;
+               }
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * bsearch() comparator: @key carries the looked-up offset (cast into a
+ * pointer), @elt is a candidate region. Returns 0 when the offset lies
+ * inside the region, -1 when it lies below and 1 when it lies above.
+ */
+static int match_region(const void *key, const void *elt)
+{
+       const struct vgic_register_region *region = elt;
+       const unsigned int offset = (unsigned long)key;
+
+       if (offset < region->reg_offset)
+               return -1;
+
+       return (offset >= region->reg_offset + region->len) ? 1 : 0;
+}
+
+/*
+ * Binary-search the @nr_regions entries of @region for the register
+ * handler entry covering @offset. Returns NULL if no entry matches.
+ */
+static const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions,
+                     unsigned int offset)
+{
+       /* The offset itself travels through bsearch()'s key pointer */
+       void *key = (void *)(uintptr_t)offset;
+
+       return bsearch(key, region, nr_regions, sizeof(*region), match_region);
+}
+
+/*
+ * The value handed back by kvm_mmio_read_buf() is laid out so that, viewed
+ * as a byte array, it shows what the guest meant to appear in memory had it
+ * performed the store itself. For the GIC that layout is little endian, as
+ * the guest knows the GIC is always LE.
+ *
+ * Convert it to the CPU's native format so it can be used as a data value.
+ */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
+{
+       unsigned long raw = kvm_mmio_read_buf(val, len);
+
+       if (len == 1)
+               return raw;
+       if (len == 2)
+               return le16_to_cpu(raw);
+       if (len == 4)
+               return le32_to_cpu(raw);
+
+       /* Every other length is handled as a 64-bit quantity */
+       return le64_to_cpu(raw);
+}
+
+/*
+ * kvm_mmio_write_buf() wants a value which, viewed as a byte array, is what
+ * the guest would observe had it been able to perform the load directly.
+ * The GIC is LE and the guest knows it, so the guest expects little endian.
+ *
+ * Convert the data value from the CPU's native format to LE before it is
+ * written into @buf.
+ */
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data)
+{
+       if (len == 2)
+               data = cpu_to_le16(data);
+       else if (len == 4)
+               data = cpu_to_le32(data);
+       else if (len != 1)
+               data = cpu_to_le64(data);
+       /* single bytes need no conversion */
+
+       kvm_mmio_write_buf(buf, len, data);
+}
+
+/* Map a generic kvm_io_device back to the vgic_io_device embedding it. */
+static
+struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
+{
+       return container_of(dev, struct vgic_io_device, dev);
+}
+
+/*
+ * Check whether an access of @len bytes at @addr is permitted for @region:
+ * the region must allow that access width, and 32-bit/64-bit accesses must
+ * additionally be naturally aligned.
+ */
+static bool check_region(const struct vgic_register_region *region,
+                        gpa_t addr, int len)
+{
+       switch (len) {
+       case 1:
+               return !!(region->access_flags & VGIC_ACCESS_8bit);
+       case sizeof(u32):
+               return (region->access_flags & VGIC_ACCESS_32bit) &&
+                      !(addr & 3);
+       case sizeof(u64):
+               return (region->access_flags & VGIC_ACCESS_64bit) &&
+                      !(addr & 7);
+       default:
+               return false;
+       }
+}
+
+/*
+ * MMIO read entry point: look up the register region for @addr, call its
+ * read handler and store the result into @val in bus format. Accesses that
+ * hit no region, or that violate the region's access rules, read as zero.
+ * Always returns 0.
+ */
+static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                             gpa_t addr, int len, void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *target;
+       unsigned long data;
+
+       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+                                      addr - iodev->base_addr);
+       if (region && check_region(region, addr, len)) {
+               /* Use the device's redist_vcpu if set, else the accessing VCPU */
+               target = iodev->redist_vcpu;
+               if (!target)
+                       target = vcpu;
+
+               data = region->read(target, addr, len);
+               vgic_data_host_to_mmio_bus(val, len, data);
+       } else {
+               memset(val, 0, len);
+       }
+
+       return 0;
+}
+
+/*
+ * MMIO write entry point: convert @val from bus format, look up the
+ * register region for @addr and call its write handler. Accesses that hit
+ * no region, or that violate the region's access rules, are silently
+ * dropped. Always returns 0.
+ */
+static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+                              gpa_t addr, int len, const void *val)
+{
+       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+       unsigned long data = vgic_data_mmio_bus_to_host(val, len);
+       const struct vgic_register_region *region;
+       struct kvm_vcpu *target;
+
+       region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+                                      addr - iodev->base_addr);
+       if (!region || !check_region(region, addr, len))
+               return 0;
+
+       /* Use the device's redist_vcpu if set, else the accessing VCPU */
+       target = iodev->redist_vcpu;
+       if (!target)
+               target = vcpu;
+
+       region->write(target, addr, len, data);
+       return 0;
+}
+
+/* I/O device callbacks routing MMIO accesses to the dispatchers above. */
+struct kvm_io_device_ops kvm_io_gic_ops = {
+       .read = dispatch_mmio_read,
+       .write = dispatch_mmio_write,
+};
+
+/*
+ * Set up the distributor I/O device for the given VGIC @type and register
+ * it on the KVM MMIO bus at @dist_base_address.
+ *
+ * Returns the result of kvm_io_bus_register_dev().
+ */
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type type)
+{
+       struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
+       int ret = 0;
+       unsigned int len;
+
+       /* The per-model init helper returns the length later used for registration */
+       switch (type) {
+       case VGIC_V2:
+               len = vgic_v2_init_dist_iodev(io_device);
+               break;
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+       case VGIC_V3:
+               len = vgic_v3_init_dist_iodev(io_device);
+               break;
+#endif
+       default:
+               /*
+                * NOTE(review): 'len' stays unassigned on this path; this is
+                * only safe as long as BUG_ON() does not return - confirm for
+                * configurations where BUG support is compiled out.
+                */
+               BUG_ON(1);
+       }
+
+       io_device->base_addr = dist_base_address;
+       io_device->redist_vcpu = NULL;
+
+       /* The bus registration is done with kvm->slots_lock held */
+       mutex_lock(&kvm->slots_lock);
+       ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+                                     len, &io_device->dev);
+       mutex_unlock(&kvm->slots_lock);
+
+       return ret;
+}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
new file mode 100644 (file)
index 0000000..8509014
--- /dev/null
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __KVM_ARM_VGIC_MMIO_H__
+#define __KVM_ARM_VGIC_MMIO_H__
+
+/* Describes one contiguous range of emulated registers and its handlers. */
+struct vgic_register_region {
+       /* Offset of the range relative to the I/O device's base address */
+       unsigned int reg_offset;
+       /* Length of the range, in the same units as reg_offset */
+       unsigned int len;
+       /* Bits of state per interrupt; 0 for fixed-length registers */
+       unsigned int bits_per_irq;
+       /* Bitmask of permitted access widths (VGIC_ACCESS_*) */
+       unsigned int access_flags;
+       /* Read handler: returns the register value in host format */
+       unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+                             unsigned int len);
+       /* Write handler: receives the written value in host format */
+       void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
+                     unsigned long val);
+};
+
+extern struct kvm_io_device_ops kvm_io_gic_ops;
+
+/* Access width flags, OR-ed together in vgic_register_region.access_flags */
+#define VGIC_ACCESS_8bit       1
+#define VGIC_ACCESS_32bit      2
+#define VGIC_ACCESS_64bit      4
+
+/*
+ * Generate a mask that covers the number of bytes required to address
+ * up to 1024 interrupts, each represented by <bits> bits. This assumes
+ * that <bits> is a power of two.
+ * For example, <bits> = 1 gives 1024 / 8 - 1 = 127 (0x7f).
+ */
+#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
+
+/*
+ * (addr & mask) gives us the byte offset for the INT ID, so we want to
+ * divide this by 'bytes per irq' to get the INT ID, which is given
+ * by '(bits) / 8'.  But we do this with fixed-point arithmetic and
+ * take advantage of the fact that division by a fraction equals
+ * multiplication with the inverted fraction, and scale up both the
+ * numerator and denominator by 8 to support at most 64 bits per IRQ:
+ */
+#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
+                                       64 / (bits) / 8)
+
+/*
+ * Some VGIC registers store per-IRQ information, with a different number
+ * of bits per IRQ. For those registers this macro is used; the length of
+ * the region is derived from <bpi> so that it covers 1024 interrupts.
+ * The _WITH_LENGTH version instantiates registers with a fixed length
+ * and is mutually exclusive with the _PER_IRQ version.
+ *
+ * <bpi> is parenthesized in the length computation so that an expression
+ * argument (e.g. "a + b") is scaled as a whole.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc)         \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = bpi,                                    \
+               .len = (bpi) * 1024 / 8,                                \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+/*
+ * Instantiate a register region with an explicit <length>; bits_per_irq
+ * is set to 0 for such regions.
+ */
+#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)            \
+       {                                                               \
+               .reg_offset = off,                                      \
+               .bits_per_irq = 0,                                      \
+               .len = length,                                          \
+               .access_flags = acc,                                    \
+               .read = rd,                                             \
+               .write = wr,                                            \
+       }
+
+int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                                 struct vgic_register_region *reg_desc,
+                                 struct vgic_io_device *region,
+                                 int nr_irqs, bool offset_private);
+
+/* Conversion between the MMIO bus data format and the host's native format */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
+
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+                               unsigned long data);
+
+/* Trivial handlers: read-as-zero, read-as-one and write-ignore */
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+                                gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+                       unsigned int len, unsigned long val);
+
+/* Enable state handlers (one bit per interrupt) */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+/* Pending state handlers (one bit per interrupt) */
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+                                    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+/* Active state handlers (one bit per interrupt) */
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+                            gpa_t addr, unsigned int len,
+                            unsigned long val);
+
+/* Priority handlers (eight bits per interrupt) */
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+                                     gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+                             gpa_t addr, unsigned int len,
+                             unsigned long val);
+
+/* Trigger configuration handlers (two bits per interrupt) */
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+                                   gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+                           gpa_t addr, unsigned int len,
+                           unsigned long val);
+
+/* Per-model distributor device setup; the return value is used as a length */
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+
+#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
new file mode 100644 (file)
index 0000000..8ad42c2
--- /dev/null
@@ -0,0 +1,352 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+/*
+ * Call this function to convert a u64 value to an unsigned long * bitmask
+ * in a way that works on both 32-bit and 64-bit LE and BE platforms.
+ *
+ * Warning: Calling this function may modify *val.
+ */
+static unsigned long *u64_to_bitmask(u64 *val)
+{
+#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
+       /* Swap the two 32-bit halves so the low word comes first in memory */
+       *val = (*val >> 32) | (*val << 32);
+#endif
+       return (unsigned long *)val;
+}
+
+/*
+ * Process the saved maintenance interrupt state of @vcpu:
+ * - if the EOI bit is set in the saved MISR, send an ack notification for
+ *   every SPI whose list register is flagged in the saved EISR;
+ * - disable the underflow maintenance interrupt;
+ * - clear the saved EISR so stale bits are not processed again.
+ */
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       if (cpuif->vgic_misr & GICH_MISR_EOI) {
+               u64 eisr = cpuif->vgic_eisr;
+               unsigned long *eisr_bmap = u64_to_bitmask(&eisr);
+               int lr;
+
+               for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) {
+                       u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID;
+
+                       WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE);
+
+                       /*
+                        * Only SPIs are notified: the pin number passed
+                        * below is relative to the first SPI, so for a
+                        * private interrupt the subtraction would wrap
+                        * around to a bogus, huge pin number.
+                        */
+                       if (intid < VGIC_NR_PRIVATE_IRQS)
+                               continue;
+
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+               }
+       }
+
+       /* check and disable underflow maintenance IRQ */
+       cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+       /*
+        * In the next iterations of the vcpu loop, if we sync the
+        * vgic state after flushing it, but before entering the guest
+        * (this happens for pending signals and vmid rollovers), then
+        * make sure we don't pick up any old maintenance interrupts
+        * here.
+        */
+       cpuif->vgic_eisr = 0;
+}
+
+/* Enable the underflow maintenance interrupt in the saved HCR of @vcpu. */
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ *   - transferred as is in case of edge sensitive IRQs
+ *   - set to the line-level (resample time) for level sensitive IRQs
+ *
+ * Only the first used_lrs list registers are examined.
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+       int lr;
+
+       for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+               u32 val = cpuif->vgic_lr[lr];
+               u32 intid = val & GICH_LR_VIRTUALID;
+               struct vgic_irq *irq;
+
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & GICH_LR_PENDING_BIT)) {
+                       irq->pending = true;
+
+                       /* For SGIs, put the source CPU back into the source bitmap */
+                       if (vgic_irq_is_sgi(intid)) {
+                               u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+                               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+                               irq->source |= (1 << cpuid);
+                       }
+               }
+
+               /* Clear soft pending state when level IRQs have been acked */
+               if (irq->config == VGIC_CONFIG_LEVEL &&
+                   !(val & GICH_LR_PENDING_BIT)) {
+                       irq->soft_pending = false;
+                       irq->pending = irq->line_level;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ *   it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 val = irq->intid;
+
+       if (irq->pending) {
+               val |= GICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending = false;
+
+               if (vgic_irq_is_sgi(irq->intid)) {
+                       /* ffs() is 1-based; 0 would mean no source is recorded */
+                       u32 src = ffs(irq->source);
+
+                       BUG_ON(!src);
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       /* Further sources remain: keep the SGI pending */
+                       if (irq->source)
+                               irq->pending = true;
+               }
+       }
+
+       if (irq->active)
+               val |= GICH_LR_ACTIVE_BIT;
+
+       if (irq->hw) {
+               val |= GICH_LR_HW;
+               val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+       } else {
+               /* Non-hw level interrupts get the EOI bit set in their LR */
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       val |= GICH_LR_EOI;
+       }
+
+       /* The GICv2 LR only holds five bits of priority. */
+       val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+/* Reset the saved copy of list register @lr of @vcpu to zero. */
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
+
+/*
+ * Pack the fields of @vmcrp (ctlr, abpr, bpr, pmr) into the saved VMCR
+ * value of @vcpu. Each field is shifted into place and masked to its width.
+ */
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 ctlr, abpr, bpr, pmr;
+
+       ctlr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
+       abpr = (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
+              GICH_VMCR_ALIAS_BINPOINT_MASK;
+       bpr = (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
+             GICH_VMCR_BINPOINT_MASK;
+       pmr = (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) &
+             GICH_VMCR_PRIMASK_MASK;
+
+       vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = ctlr | abpr | bpr | pmr;
+}
+
+/*
+ * Unpack the saved VMCR value of @vcpu into the individual fields of
+ * @vmcrp (ctlr, abpr, bpr, pmr).
+ */
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       const u32 raw = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
+       u32 field;
+
+       field = raw & GICH_VMCR_CTRL_MASK;
+       vmcrp->ctlr = field >> GICH_VMCR_CTRL_SHIFT;
+
+       field = raw & GICH_VMCR_ALIAS_BINPOINT_MASK;
+       vmcrp->abpr = field >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+
+       field = raw & GICH_VMCR_BINPOINT_MASK;
+       vmcrp->bpr = field >> GICH_VMCR_BINPOINT_SHIFT;
+
+       field = raw & GICH_VMCR_PRIMASK_MASK;
+       vmcrp->pmr = field >> GICH_VMCR_PRIMASK_SHIFT;
+}
+
+/* Put the saved GICv2 CPU interface state of @vcpu into its initial state. */
+void vgic_v2_enable(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+       /*
+        * A VMCR of zero makes the GIC restore the binary points to their
+        * reset values; everything else resets to zero anyway.
+        */
+       cpuif->vgic_vmcr = 0;
+       cpuif->vgic_elrsr = ~0;
+
+       /* Get the show on the road... */
+       cpuif->vgic_hcr = GICH_HCR_EN;
+}
+
+/*
+ * Validate the distributor and CPU interface base addresses: neither frame
+ * may wrap past the end of the address space, and the two frames must not
+ * overlap. Returns true when the layout is acceptable.
+ */
+static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
+{
+       gpa_t dist_end = dist_base + KVM_VGIC_V2_DIST_SIZE;
+       gpa_t cpu_end = cpu_base + KVM_VGIC_V2_CPU_SIZE;
+
+       /* An end below its base means the frame wrapped around */
+       if (dist_end < dist_base || cpu_end < cpu_base)
+               return false;
+
+       /* Disjoint iff one frame ends at or before the start of the other */
+       return dist_end <= cpu_base || cpu_end <= dist_base;
+}
+
+int vgic_v2_map_resources(struct kvm *kvm)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       int ret = 0;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
+               kvm_err("Need to set vgic cpu and dist addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
+               kvm_err("VGIC CPU and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Initialize the vgic if this hasn't already been done on demand by
+        * accessing the vgic state from userspace.
+        */
+       ret = vgic_init(kvm);
+       if (ret) {
+               kvm_err("Unable to initialize VGIC dynamic data structures\n");
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
+       if (ret) {
+               kvm_err("Unable to register VGIC MMIO regions\n");
+               goto out;
+       }
+
+       ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+                                   kvm_vgic_global_state.vcpu_base,
+                                   KVM_VGIC_V2_CPU_SIZE, true);
+       if (ret) {
+               kvm_err("Unable to remap VGIC CPU to VCPU\n");
+               goto out;
+       }
+
+       dist->ready = true;
+
+out:
+       if (ret)
+               kvm_vgic_destroy(kvm);
+       return ret;
+}
+
+/**
+ * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
+ * @info:      GIC description (GICH/vctrl and GICV/vcpu resources)
+ *
+ * Returns 0 if a GICv2 has been found, returns an error code otherwise
+ */
+int vgic_v2_probe(const struct gic_kvm_info *info)
+{
+       int ret;
+       u32 vtr;
+
+       if (!info->vctrl.start) {
+               kvm_err("GICH not present in the firmware table\n");
+               return -ENXIO;
+       }
+
+       if (!PAGE_ALIGNED(info->vcpu.start)) {
+               kvm_err("GICV physical address 0x%llx not page aligned\n",
+                       (unsigned long long)info->vcpu.start);
+               return -ENXIO;
+       }
+
+       if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
+                       (unsigned long long)resource_size(&info->vcpu),
+                       PAGE_SIZE);
+               return -ENXIO;
+       }
+
+       kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start,
+                                                  resource_size(&info->vctrl));
+       if (!kvm_vgic_global_state.vctrl_base) {
+               kvm_err("Cannot ioremap GICH\n");
+               return -ENOMEM;
+       }
+
+       vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
+       kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+
+       ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
+                                    kvm_vgic_global_state.vctrl_base +
+                                        resource_size(&info->vctrl),
+                                    info->vctrl.start);
+
+       if (ret) {
+               kvm_err("Cannot map VCTRL into hyp\n");
+               iounmap(kvm_vgic_global_state.vctrl_base);
+               return ret;
+       }
+
+       kvm_vgic_global_state.can_emulate_gicv2 = true;
+       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+
+       kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+       kvm_vgic_global_state.type = VGIC_V2;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
+
+       kvm_info("vgic-v2@%llx\n", info->vctrl.start);
+
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
new file mode 100644 (file)
index 0000000..336a461
--- /dev/null
@@ -0,0 +1,330 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
+
+#include "vgic.h"
+
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+
+       if (cpuif->vgic_misr & ICH_MISR_EOI) {
+               unsigned long eisr_bmap = cpuif->vgic_eisr;
+               int lr;
+
+               for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) {
+                       u32 intid;
+                       u64 val = cpuif->vgic_lr[lr];
+
+                       if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+                               intid = val & ICH_LR_VIRTUAL_ID_MASK;
+                       else
+                               intid = val & GICH_LR_VIRTUALID;
+
+                       WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE);
+
+                       kvm_notify_acked_irq(vcpu->kvm, 0,
+                                            intid - VGIC_NR_PRIVATE_IRQS);
+               }
+
+               /*
+                * In the next iterations of the vcpu loop, if we sync
+                * the vgic state after flushing it, but before
+                * entering the guest (this happens for pending
+                * signals and vmid rollovers), then make sure we
+                * don't pick up any old maintenance interrupts here.
+                */
+               cpuif->vgic_eisr = 0;
+       }
+
+       cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+}
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       int lr;
+
+       for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) {
+               u64 val = cpuif->vgic_lr[lr];
+               u32 intid;
+               struct vgic_irq *irq;
+
+               if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+                       intid = val & ICH_LR_VIRTUAL_ID_MASK;
+               else
+                       intid = val & GICH_LR_VIRTUALID;
+               irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+               spin_lock(&irq->irq_lock);
+
+               /* Always preserve the active bit */
+               irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+               /* Edge is the only case where we preserve the pending bit */
+               if (irq->config == VGIC_CONFIG_EDGE &&
+                   (val & ICH_LR_PENDING_BIT)) {
+                       irq->pending = true;
+
+                       if (vgic_irq_is_sgi(intid) &&
+                           model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+                               u32 cpuid = val & GICH_LR_PHYSID_CPUID;
+
+                               cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+                               irq->source |= (1 << cpuid);
+                       }
+               }
+
+               /* Clear soft pending state when level irqs have been acked */
+               if (irq->config == VGIC_CONFIG_LEVEL &&
+                   !(val & ICH_LR_PENDING_BIT)) {
+                       irq->soft_pending = false;
+                       irq->pending = irq->line_level;
+               }
+
+               spin_unlock(&irq->irq_lock);
+       }
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+       u32 model = vcpu->kvm->arch.vgic.vgic_model;
+       u64 val = irq->intid;
+
+       if (irq->pending) {
+               val |= ICH_LR_PENDING_BIT;
+
+               if (irq->config == VGIC_CONFIG_EDGE)
+                       irq->pending = false;
+
+               if (vgic_irq_is_sgi(irq->intid) &&
+                   model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+                       u32 src = ffs(irq->source);
+
+                       BUG_ON(!src);
+                       val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+                       irq->source &= ~(1 << (src - 1));
+                       if (irq->source)
+                               irq->pending = true;
+               }
+       }
+
+       if (irq->active)
+               val |= ICH_LR_ACTIVE_BIT;
+
+       if (irq->hw) {
+               val |= ICH_LR_HW;
+               val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+       } else {
+               if (irq->config == VGIC_CONFIG_LEVEL)
+                       val |= ICH_LR_EOI;
+       }
+
+       /*
+        * We currently only support Group1 interrupts, which is a
+        * known defect. This needs to be addressed at some point.
+        */
+       if (model == KVM_DEV_TYPE_ARM_VGIC_V3)
+               val |= ICH_LR_GROUP;
+
+       val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
+
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr;
+
+       vmcr  = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
+       vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
+       vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
+       vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
+}
+
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+       u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
+
+       vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
+       vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+       vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+       vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+}
+
+void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+       /*
+        * By forcing VMCR to zero, the GIC will restore the binary
+        * points to their reset values. Anything else resets to zero
+        * anyway.
+        */
+       vgic_v3->vgic_vmcr = 0;
+       vgic_v3->vgic_elrsr = ~0;
+
+       /*
+        * If we are emulating a GICv3, we do it in a non-GICv2-compatible
+        * way, so we force SRE to 1 to demonstrate this to the guest.
+        * This goes with the spec allowing the value to be RAO/WI.
+        */
+       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+               vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
+       else
+               vgic_v3->vgic_sre = 0;
+
+       /* Get the show on the road... */
+       vgic_v3->vgic_hcr = ICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v3_check_base(struct kvm *kvm)
+{
+       struct vgic_dist *d = &kvm->arch.vgic;
+       gpa_t redist_size = KVM_VGIC_V3_REDIST_SIZE;
+
+       redist_size *= atomic_read(&kvm->online_vcpus);
+
+       if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
+               return false;
+       if (d->vgic_redist_base + redist_size < d->vgic_redist_base)
+               return false;
+
+       if (d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE <= d->vgic_redist_base)
+               return true;
+       if (d->vgic_redist_base + redist_size <= d->vgic_dist_base)
+               return true;
+
+       return false;
+}
+
+int vgic_v3_map_resources(struct kvm *kvm)
+{
+       int ret = 0;
+       struct vgic_dist *dist = &kvm->arch.vgic;
+
+       if (vgic_ready(kvm))
+               goto out;
+
+       if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+           IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
+               kvm_err("Need to set vgic distributor addresses first\n");
+               ret = -ENXIO;
+               goto out;
+       }
+
+       if (!vgic_v3_check_base(kvm)) {
+               kvm_err("VGIC redist and dist frames overlap\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * For a VGICv3 we require the userland to explicitly initialize
+        * the VGIC before we need to use it.
+        */
+       if (!vgic_initialized(kvm)) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
+       if (ret) {
+               kvm_err("Unable to register VGICv3 dist MMIO regions\n");
+               goto out;
+       }
+
+       ret = vgic_register_redist_iodevs(kvm, dist->vgic_redist_base);
+       if (ret) {
+               kvm_err("Unable to register VGICv3 redist MMIO regions\n");
+               goto out;
+       }
+
+       dist->ready = true;
+
+out:
+       if (ret)
+               kvm_vgic_destroy(kvm);
+       return ret;
+}
+
+/**
+ * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
+ * @info:      GIC description; only its GICV (vcpu) resource is read here
+ *
+ * Always returns 0; GICv2 emulation is disabled if GICV is unusable
+ */
+int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+       u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+
+       /*
+        * The ListRegs field is 5 bits, but there is an architectural
+        * maximum of 16 list registers. Just ignore bit 4...
+        */
+       kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+       kvm_vgic_global_state.can_emulate_gicv2 = false;
+
+       if (!info->vcpu.start) {
+               kvm_info("GICv3: no GICV resource entry\n");
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else if (!PAGE_ALIGNED(info->vcpu.start)) {
+               pr_warn("GICV physical address 0x%llx not page aligned\n",
+                       (unsigned long long)info->vcpu.start);
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else if (!PAGE_ALIGNED(resource_size(&info->vcpu))) {
+               pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
+                       (unsigned long long)resource_size(&info->vcpu),
+                       PAGE_SIZE);
+               kvm_vgic_global_state.vcpu_base = 0;
+       } else {
+               kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+               kvm_vgic_global_state.can_emulate_gicv2 = true;
+               kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+               kvm_info("vgic-v2@%llx\n", info->vcpu.start);
+       }
+       if (kvm_vgic_global_state.vcpu_base == 0)
+               kvm_info("disabling GICv2 emulation\n");
+       kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+
+       kvm_vgic_global_state.vctrl_base = NULL;
+       kvm_vgic_global_state.type = VGIC_V3;
+       kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+       return 0;
+}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
new file mode 100644 (file)
index 0000000..69b61ab
--- /dev/null
@@ -0,0 +1,619 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/list_sort.h>
+
+#include "vgic.h"
+
+#define CREATE_TRACE_POINTS
+#include "../trace.h"
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
+
+/*
+ * Locking order is always:
+ *   vgic_cpu->ap_list_lock
+ *     vgic_irq->irq_lock
+ *
+ * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ *   vcpuX->vcpu_id < vcpuY->vcpu_id:
+ *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ */
+
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid)
+{
+       /* SGIs and PPIs */
+       if (intid <= VGIC_MAX_PRIVATE)
+               return &vcpu->arch.vgic_cpu.private_irqs[intid];
+
+       /* SPIs */
+       if (intid <= VGIC_MAX_SPI)
+               return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
+
+       /* LPIs are not yet covered */
+       if (intid >= VGIC_MIN_LPI)
+               return NULL;
+
+       WARN(1, "Looking up struct vgic_irq for reserved INTID");
+       return NULL;
+}
+
+/**
+ * vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq:       The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+       /* If the interrupt is active, it must stay on the current vcpu */
+       if (irq->active)
+               return irq->vcpu ? : irq->target_vcpu;
+
+       /*
+        * If the IRQ is not active but enabled and pending, we should direct
+        * it to its configured target VCPU.
+        * If the distributor is disabled, pending interrupts shouldn't be
+        * forwarded.
+        */
+       if (irq->enabled && irq->pending) {
+               if (unlikely(irq->target_vcpu &&
+                            !irq->target_vcpu->kvm->arch.vgic.enabled))
+                       return NULL;
+
+               return irq->target_vcpu;
+       }
+
+       /* Neither active nor (enabled and pending): this IRQ should not
+        * be queued to any VCPU.
+        */
+       return NULL;
+}
+
+/*
+ * The order of items in the ap_lists defines how we'll pack things in LRs as
+ * well, the first items in the list being the first things populated in the
+ * LRs.
+ *
+ * A hard rule is that active interrupts can never be pushed out of the LRs
+ * (and therefore take priority) since we cannot reliably trap on deactivation
+ * of IRQs and therefore they have to be present in the LRs.
+ *
+ * Otherwise things should be sorted by the priority field and the GIC
+ * hardware support will take care of preemption of priority groups etc.
+ *
+ * Return negative if "a" sorts before "b", 0 to preserve order, and positive
+ * to sort "b" before "a".
+ */
+static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
+       struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+       bool penda, pendb;
+       int ret;
+
+       spin_lock(&irqa->irq_lock);
+       spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+
+       if (irqa->active || irqb->active) {
+               ret = (int)irqb->active - (int)irqa->active;
+               goto out;
+       }
+
+       penda = irqa->enabled && irqa->pending;
+       pendb = irqb->enabled && irqb->pending;
+
+       if (!penda || !pendb) {
+               ret = (int)pendb - (int)penda;
+               goto out;
+       }
+
+       /* Both pending and enabled, sort by priority */
+       ret = irqa->priority - irqb->priority;
+out:
+       spin_unlock(&irqb->irq_lock);
+       spin_unlock(&irqa->irq_lock);
+       return ret;
+}
+
+/* Must be called with the ap_list_lock held */
+static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
+       list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+}
+
+/*
+ * Only valid injection if changing level for level-triggered IRQs or for a
+ * rising edge.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level)
+{
+       switch (irq->config) {
+       case VGIC_CONFIG_LEVEL:
+               return irq->line_level != level;
+       case VGIC_CONFIG_EDGE:
+               return level;
+       }
+
+       return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq)
+{
+       struct kvm_vcpu *vcpu;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+retry:
+       vcpu = vgic_target_oracle(irq);
+       if (irq->vcpu || !vcpu) {
+               /*
+                * If this IRQ is already on a VCPU's ap_list, then it
+                * cannot be moved or modified and there is no more work for
+                * us to do.
+                *
+                * Otherwise, if the irq is not pending and enabled, it does
+                * not need to be inserted into an ap_list and there is also
+                * no more work for us to do.
+                */
+               spin_unlock(&irq->irq_lock);
+               return false;
+       }
+
+       /*
+        * We must unlock the irq lock to take the ap_list_lock where
+        * we are going to insert this new pending interrupt.
+        */
+       spin_unlock(&irq->irq_lock);
+
+       /* someone can do stuff here, which we re-check below */
+
+       spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+       spin_lock(&irq->irq_lock);
+
+       /*
+        * Did something change behind our backs?
+        *
+        * There are two cases:
+        * 1) The irq lost its pending state or was disabled behind our
+        *    backs and/or it was queued to another VCPU's ap_list.
+        * 2) Someone changed the affinity on this irq behind our
+        *    backs and we are now holding the wrong ap_list_lock.
+        *
+        * In both cases, drop the locks and retry.
+        */
+
+       if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+               spin_lock(&irq->irq_lock);
+               goto retry;
+       }
+
+       list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+       irq->vcpu = vcpu;
+
+       spin_unlock(&irq->irq_lock);
+       spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+       kvm_vcpu_kick(vcpu);
+
+       return true;
+}
+
+static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
+                                  unsigned int intid, bool level,
+                                  bool mapped_irq)
+{
+       struct kvm_vcpu *vcpu;
+       struct vgic_irq *irq;
+       int ret;
+
+       trace_vgic_update_irq_pending(cpuid, intid, level);
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       vcpu = kvm_get_vcpu(kvm, cpuid);
+       if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+               return -EINVAL;
+
+       irq = vgic_get_irq(kvm, vcpu, intid);
+       if (!irq)
+               return -EINVAL;
+
+       if (irq->hw != mapped_irq)
+               return -EINVAL;
+
+       spin_lock(&irq->irq_lock);
+
+       if (!vgic_validate_injection(irq, level)) {
+               /* Nothing to see here, move along... */
+               spin_unlock(&irq->irq_lock);
+               return 0;
+       }
+
+       if (irq->config == VGIC_CONFIG_LEVEL) {
+               irq->line_level = level;
+               irq->pending = level || irq->soft_pending;
+       } else {
+               irq->pending = true;
+       }
+
+       vgic_queue_irq_unlock(kvm, irq);
+
+       return 0;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                           false: to ignore the call
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                       bool level)
+{
+       return vgic_update_irq_pending(kvm, cpuid, intid, level, false);
+}
+
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+                              bool level)
+{
+       return vgic_update_irq_pending(kvm, cpuid, intid, level, true);
+}
+
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+
+       BUG_ON(!irq);
+
+       spin_lock(&irq->irq_lock);
+
+       irq->hw = true;
+       irq->hwintid = phys_irq;
+
+       spin_unlock(&irq->irq_lock);
+
+       return 0;
+}
+
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+
+       BUG_ON(!irq);
+
+       if (!vgic_initialized(vcpu->kvm))
+               return -EAGAIN;
+
+       spin_lock(&irq->irq_lock);
+
+       irq->hw = false;
+       irq->hwintid = 0;
+
+       spin_unlock(&irq->irq_lock);
+
+       return 0;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq, *tmp;
+
+retry:
+       spin_lock(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+               struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+
+               spin_lock(&irq->irq_lock);
+
+               BUG_ON(vcpu != irq->vcpu);
+
+               target_vcpu = vgic_target_oracle(irq);
+
+               if (!target_vcpu) {
+                       /*
+                        * We don't need to process this interrupt any
+                        * further, move it off the list.
+                        */
+                       list_del(&irq->ap_list);
+                       irq->vcpu = NULL;
+                       spin_unlock(&irq->irq_lock);
+                       continue;
+               }
+
+               if (target_vcpu == vcpu) {
+                       /* We're on the right CPU */
+                       spin_unlock(&irq->irq_lock);
+                       continue;
+               }
+
+               /* This interrupt looks like it has to be migrated. */
+
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vgic_cpu->ap_list_lock);
+
+               /*
+                * Ensure locking order by always locking the smallest
+                * ID first.
+                */
+               if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+                       vcpuA = vcpu;
+                       vcpuB = target_vcpu;
+               } else {
+                       vcpuA = target_vcpu;
+                       vcpuB = vcpu;
+               }
+
+               spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+               spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+                                SINGLE_DEPTH_NESTING);
+               spin_lock(&irq->irq_lock);
+
+               /*
+                * If the affinity has been preserved, move the
+                * interrupt around. Otherwise, it means things have
+                * changed while the interrupt was unlocked, and we
+                * need to replay this.
+                *
+                * In all cases, we cannot trust the list not to have
+                * changed, so we restart from the beginning.
+                */
+               if (target_vcpu == vgic_target_oracle(irq)) {
+                       struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+                       list_del(&irq->ap_list);
+                       irq->vcpu = target_vcpu;
+                       list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+               }
+
+               spin_unlock(&irq->irq_lock);
+               spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+               spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+               goto retry;
+       }
+
+       spin_unlock(&vgic_cpu->ap_list_lock);
+}
+
+static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_process_maintenance(vcpu);
+       else
+               vgic_v3_process_maintenance(vcpu);
+}
+
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_fold_lr_state(vcpu);
+       else
+               vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
+                                   struct vgic_irq *irq, int lr)
+{
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_populate_lr(vcpu, irq, lr);
+       else
+               vgic_v3_populate_lr(vcpu, irq, lr);
+}
+
+static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_clear_lr(vcpu, lr);
+       else
+               vgic_v3_clear_lr(vcpu, lr);
+}
+
+static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
+{
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_set_underflow(vcpu);
+       else
+               vgic_v3_set_underflow(vcpu);
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count = 0;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+               /* GICv2 SGIs can count for more than one... */
+               if (vgic_irq_is_sgi(irq->intid) && irq->source)
+                       count += hweight8(irq->source);
+               else
+                       count++;
+               spin_unlock(&irq->irq_lock);
+       }
+       return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       int count = 0;
+
+       DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+
+       if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) {
+               vgic_set_underflow(vcpu);
+               vgic_sort_ap_list(vcpu);
+       }
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+
+               if (unlikely(vgic_target_oracle(irq) != vcpu))
+                       goto next;
+
+               /*
+                * If we get an SGI with multiple sources, try to get
+                * them in all at once.
+                */
+               do {
+                       vgic_populate_lr(vcpu, irq, count++);
+               } while (irq->source && count < kvm_vgic_global_state.nr_lr);
+
+next:
+               spin_unlock(&irq->irq_lock);
+
+               if (count == kvm_vgic_global_state.nr_lr)
+                       break;
+       }
+
+       vcpu->arch.vgic_cpu.used_lrs = count;
+
+       /* Nuke remaining LRs */
+       for ( ; count < kvm_vgic_global_state.nr_lr; count++)
+               vgic_clear_lr(vcpu, count);
+}
+
+/* Sync back the hardware VGIC state into our emulation after a guest's run. */
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+       vgic_process_maintenance_interrupt(vcpu);
+       vgic_fold_lr_state(vcpu);
+       vgic_prune_ap_list(vcpu);
+}
+
+/* Flush our emulation state into the GIC hardware before entering the guest. */
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+       spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+       vgic_flush_lr_state(vcpu);
+       spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+}
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       struct vgic_irq *irq;
+       bool pending = false;
+
+       if (!vcpu->kvm->arch.vgic.enabled)
+               return false;
+
+       spin_lock(&vgic_cpu->ap_list_lock);
+
+       list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+               spin_lock(&irq->irq_lock);
+               pending = irq->pending && irq->enabled;
+               spin_unlock(&irq->irq_lock);
+
+               if (pending)
+                       break;
+       }
+
+       spin_unlock(&vgic_cpu->ap_list_lock);
+
+       return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+       struct kvm_vcpu *vcpu;
+       int c;
+
+       /*
+        * We've injected an interrupt, time to find out who deserves
+        * a good kick...
+        */
+       kvm_for_each_vcpu(c, vcpu, kvm) {
+               if (kvm_vgic_vcpu_pending_irq(vcpu))
+                       kvm_vcpu_kick(vcpu);
+       }
+}
+
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
+{
+       struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+       bool map_is_active;
+
+       spin_lock(&irq->irq_lock);
+       map_is_active = irq->hw && irq->active;
+       spin_unlock(&irq->irq_lock);
+
+       return map_is_active;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
new file mode 100644 (file)
index 0000000..7b300ca
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __KVM_ARM_VGIC_NEW_H__
+#define __KVM_ARM_VGIC_NEW_H__
+
+#include <linux/irqchip/arm-gic-common.h>
+
+#define PRODUCT_ID_KVM         0x4b    /* ASCII code K */
+#define IMPLEMENTER_ARM                0x43b   /* NOTE(review): looks like ARM's JEP106 implementer code - confirm */
+
+#define VGIC_ADDR_UNDEF                (-1)    /* sentinel tested by IS_VGIC_ADDR_UNDEF() */
+#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
+
+#define INTERRUPT_ID_BITS_SPIS 10
+#define VGIC_PRI_BITS          5
+
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)        /* true for INTIDs below VGIC_NR_SGIS */
+
+/*
+ * Four u32 values exchanged through vgic_v2_{set,get}_vmcr() and
+ * vgic_v3_{set,get}_vmcr().
+ * NOTE(review): field meanings below are inferred from the names - confirm
+ * against the GIC architecture specification.
+ */
+struct vgic_vmcr {
+       u32     ctlr;   /* presumably control register bits */
+       u32     abpr;   /* presumably aliased binary point */
+       u32     bpr;    /* presumably binary point */
+       u32     pmr;    /* presumably priority mask */
+};
+
+/* Core helpers; declarations only, the definitions are not in this header. */
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+                             u32 intid);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+/* vgic_v2_* entry points; always declared, regardless of configuration. */
+void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                        int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+                         int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+                            enum vgic_type);
+
+#ifdef CONFIG_KVM_ARM_VGIC_V3  /* real vgic_v3_* declarations */
+void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+#else  /* !CONFIG_KVM_ARM_VGIC_V3: void stubs are empty, int stubs return -ENODEV */
+static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void vgic_v3_populate_lr(struct kvm_vcpu *vcpu,
+                                      struct vgic_irq *irq, int lr)
+{
+}
+
+static inline void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+}
+
+static inline void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+}
+
+static inline
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+}
+
+static inline void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+       return -ENODEV;
+}
+
+static inline int vgic_v3_map_resources(struct kvm *kvm)
+{
+       return -ENODEV;
+}
+
+static inline int vgic_register_redist_iodevs(struct kvm *kvm,
+                                             gpa_t dist_base_address)
+{
+       return -ENODEV;
+}
+#endif /* CONFIG_KVM_ARM_VGIC_V3 */
+
+void kvm_register_vgic_device(unsigned long type);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+#endif
index dd4ac9d9e8f5c71abd930e1ed079b4c52834cd51..37af23052470eff99b2c3a3fdcf05898c7312602 100644 (file)
@@ -63,6 +63,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+/* Worst case buffer size needed for holding an integer. */
+#define ITOA_MAX_LEN 12
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -100,6 +103,9 @@ static __read_mostly struct preempt_ops kvm_preempt_ops;
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
+static int kvm_debugfs_num_entries;
+static const struct file_operations *stat_fops_per_vm[];
+
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 #ifdef CONFIG_KVM_COMPAT
@@ -542,6 +548,58 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
        kvfree(slots);
 }
 
+/*
+ * Tear down the per-VM debugfs directory and the stat descriptors behind it.
+ *
+ * Does nothing when the VM never got a debugfs directory. The descriptor
+ * array can be NULL even though the directory exists: kvm_create_vm_debugfs()
+ * creates the directory first and may then fail to allocate the array, so
+ * the array is only walked when it is present. Individual slots may still be
+ * NULL after a partial setup (the array comes from kcalloc()); kfree(NULL)
+ * is a no-op.
+ */
+static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+{
+       int i;
+
+       if (!kvm->debugfs_dentry)
+               return;
+
+       debugfs_remove_recursive(kvm->debugfs_dentry);
+
+       if (kvm->debugfs_stat_data) {
+               for (i = 0; i < kvm_debugfs_num_entries; i++)
+                       kfree(kvm->debugfs_stat_data[i]);
+               kfree(kvm->debugfs_stat_data);
+       }
+}
+
+/*
+ * Create the per-VM debugfs directory "<pid>-<fd>" under kvm_debugfs_dir and
+ * add one 0444 file per debugfs_entries[] item, each backed by its own
+ * kvm_stat_data descriptor stored in kvm->debugfs_stat_data[].
+ *
+ * Returns 0 on success or when debugfs is not initialized, -ENOMEM on any
+ * failure. Nothing is unwound here on failure; whatever was set up so far
+ * stays attached to @kvm.
+ */
+static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+{
+       struct kvm_stats_debugfs_item *item;
+       struct kvm_stat_data *sd;
+       char name[ITOA_MAX_LEN * 2];
+       int idx;
+
+       if (!debugfs_initialized())
+               return 0;
+
+       snprintf(name, sizeof(name), "%d-%d", task_pid_nr(current), fd);
+       kvm->debugfs_dentry = debugfs_create_dir(name, kvm_debugfs_dir);
+       if (!kvm->debugfs_dentry)
+               return -ENOMEM;
+
+       kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
+                                        sizeof(*kvm->debugfs_stat_data),
+                                        GFP_KERNEL);
+       if (!kvm->debugfs_stat_data)
+               return -ENOMEM;
+
+       for (idx = 0; debugfs_entries[idx].name; idx++) {
+               item = &debugfs_entries[idx];
+
+               sd = kzalloc(sizeof(*sd), GFP_KERNEL);
+               if (!sd)
+                       return -ENOMEM;
+
+               sd->kvm = kvm;
+               sd->offset = item->offset;
+               /* Record the descriptor before the file exists. */
+               kvm->debugfs_stat_data[idx] = sd;
+
+               if (!debugfs_create_file(item->name, 0444,
+                                        kvm->debugfs_dentry, sd,
+                                        stat_fops_per_vm[item->kind]))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
@@ -647,6 +705,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        int i;
        struct mm_struct *mm = kvm->mm;
 
+       kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
        spin_lock(&kvm_lock);
        list_del(&kvm->vm_list);
@@ -2999,8 +3058,15 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
        }
 #endif
        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
-       if (r < 0)
+       if (r < 0) {
                kvm_put_kvm(kvm);
+               return r;
+       }
+
+       if (kvm_create_vm_debugfs(kvm, r) < 0) {
+               kvm_put_kvm(kvm);
+               return -ENOMEM;
+       }
 
        return r;
 }
@@ -3425,15 +3491,114 @@ static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
 };
 
+/*
+ * Common open helper for the per-VM stat files.
+ *
+ * The debugfs files are a reference to the kvm struct which is still valid
+ * when kvm_destroy_vm is called. To avoid the race between open and the
+ * removal of the debugfs directory, a reference is only taken while
+ * users_count is non-zero; otherwise the open fails with -ENOENT. If
+ * simple_attr_open() fails, the reference is dropped again and -ENOMEM is
+ * returned.
+ */
+static int kvm_debugfs_open(struct inode *inode, struct file *file,
+                          int (*get)(void *, u64 *), int (*set)(void *, u64),
+                          const char *fmt)
+{
+       struct kvm_stat_data *sd = inode->i_private;
+
+       if (!atomic_add_unless(&sd->kvm->users_count, 1, 0))
+               return -ENOENT;
+
+       if (!simple_attr_open(inode, file, get, set, fmt))
+               return 0;
+
+       kvm_put_kvm(sd->kvm);
+       return -ENOMEM;
+}
+
+/*
+ * Release counterpart of kvm_debugfs_open(): releases the simple attribute
+ * state, then drops the VM reference. Always returns 0.
+ */
+static int kvm_debugfs_release(struct inode *inode, struct file *file)
+{
+       struct kvm_stat_data *sd = inode->i_private;
+
+       simple_attr_release(inode, file);
+       kvm_put_kvm(sd->kvm);
+       return 0;
+}
+
+/*
+ * Getter for a per-VM statistic: @data is a struct kvm_stat_data, and the
+ * u32 found at its byte offset inside the struct kvm is stored in *@val.
+ * Always returns 0.
+ */
+static int vm_stat_get_per_vm(void *data, u64 *val)
+{
+       struct kvm_stat_data *sd = data;
+       void *vm_base = sd->kvm;
+       u32 *counter = vm_base + sd->offset;
+
+       *val = *counter;
+       return 0;
+}
+
+/* Open handler for a per-VM stat file: getter only, no setter. */
+static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       int rc;
+
+       __simple_attr_check_format("%llu\n", 0ull);
+       rc = kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
+                             NULL, "%llu\n");
+
+       return rc;
+}
+
+/*
+ * File operations for the per-VM view of a VM-wide statistic. A write
+ * handler is wired up, but the open handler passes no setter.
+ */
+static const struct file_operations vm_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .llseek  = generic_file_llseek,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .open    = vm_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+};
+
+/*
+ * Getter for a per-vcpu statistic summed over one VM: @data is a struct
+ * kvm_stat_data, and the u32 at its byte offset inside every vcpu of that VM
+ * is added up into *@val. Always returns 0.
+ */
+static int vcpu_stat_get_per_vm(void *data, u64 *val)
+{
+       struct kvm_stat_data *sd = data;
+       struct kvm_vcpu *vcpu;
+       u64 total = 0;
+       int idx;
+
+       kvm_for_each_vcpu(idx, vcpu, sd->kvm) {
+               u32 *counter = (void *)vcpu + sd->offset;
+
+               total += *counter;
+       }
+
+       *val = total;
+       return 0;
+}
+
+/* Open handler for a per-VM vcpu stat file: getter only, no setter. */
+static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+{
+       int rc;
+
+       __simple_attr_check_format("%llu\n", 0ull);
+       rc = kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
+                             NULL, "%llu\n");
+
+       return rc;
+}
+
+/*
+ * File operations for the per-VM view of a per-vcpu statistic. A write
+ * handler is wired up, but the open handler passes no setter.
+ */
+static const struct file_operations vcpu_stat_get_per_vm_fops = {
+       .owner   = THIS_MODULE,
+       .llseek  = generic_file_llseek,
+       .read    = simple_attr_read,
+       .write   = simple_attr_write,
+       .open    = vcpu_stat_get_per_vm_open,
+       .release = kvm_debugfs_release,
+};
+
+/* Per-VM file operations, indexed by the stat kind of a debugfs entry. */
+static const struct file_operations *stat_fops_per_vm[] = {
+       [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+       [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
+};
+
 static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               *val += *(u32 *)((void *)kvm + offset);
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
@@ -3444,15 +3609,16 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_vcpu *vcpu;
-       int i;
+       struct kvm_stat_data stat_tmp = {.offset = offset};
+       u64 tmp_val;
 
        *val = 0;
        spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       *val += *(u32 *)((void *)vcpu + offset);
-
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               stat_tmp.kvm = kvm;
+               vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               *val += tmp_val;
+       }
        spin_unlock(&kvm_lock);
        return 0;
 }
@@ -3473,7 +3639,8 @@ static int kvm_init_debug(void)
        if (kvm_debugfs_dir == NULL)
                goto out;
 
-       for (p = debugfs_entries; p->name; ++p) {
+       kvm_debugfs_num_entries = 0;
+       for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
                if (!debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                         (void *)(long)p->offset,
                                         stat_fops[p->kind]))