Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
author		Linus Torvalds <torvalds@linux-foundation.org>
		Thu, 24 Sep 2009 15:57:29 +0000 (08:57 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Thu, 24 Sep 2009 15:57:29 +0000 (08:57 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (42 commits)
  Btrfs: hash the btree inode during fill_super
  Btrfs: relocate file extents in clusters
  Btrfs: don't rename file into dummy directory
  Btrfs: check size of inode backref before adding hardlink
  Btrfs: fix releasepage to avoid unlocking extents we haven't locked
  Btrfs: Fix test_range_bit for whole file extents
  Btrfs: fix errors handling cached state in set/clear_extent_bit
  Btrfs: fix early enospc during balancing
  Btrfs: deal with NULL space info
  Btrfs: account for space used by the super mirrors
  Btrfs: fix extent entry threshold calculation
  Btrfs: remove dead code
  Btrfs: fix bitmap size tracking
  Btrfs: don't keep retrying a block group if we fail to allocate a cluster
  Btrfs: make balance code choose more wisely when relocating
  Btrfs: fix arithmetic error in clone ioctl
  Btrfs: add snapshot/subvolume destroy ioctl
  Btrfs: change how subvolumes are organized
  Btrfs: do not reuse objectid of deleted snapshot/subvol
  Btrfs: speed up snapshot dropping
  ...

294 files changed:
Documentation/auxdisplay/cfag12864b-example.c
Documentation/cgroups/cgroups.txt
Documentation/cgroups/memory.txt
Documentation/crypto/async-tx-api.txt
Documentation/filesystems/sharedsubtree.txt
Documentation/filesystems/vfs.txt
Documentation/ioctl/ioctl-number.txt
Documentation/sysctl/fs.txt
Documentation/sysctl/kernel.txt
Documentation/sysctl/vm.txt
Documentation/vm/.gitignore
Documentation/vm/locking
Documentation/vm/page-types.c
MAINTAINERS
arch/alpha/include/asm/fcntl.h
arch/alpha/kernel/core_marvel.c
arch/alpha/kernel/core_titan.c
arch/alpha/kernel/pci_impl.h
arch/alpha/kernel/pci_iommu.c
arch/arm/include/asm/hardware/iop3xx-adma.h
arch/arm/include/asm/hardware/iop_adma.h
arch/arm/mach-iop13xx/include/mach/adma.h
arch/arm/mach-iop13xx/setup.c
arch/arm/plat-iop/adma.c
arch/frv/kernel/pm.c
arch/mips/lasat/sysctl.c
arch/parisc/include/asm/fcntl.h
arch/powerpc/include/asm/fsldma.h [new file with mode: 0644]
arch/s390/appldata/appldata_base.c
arch/s390/kernel/debug.c
arch/s390/mm/cmm.c
arch/sh/drivers/dma/Kconfig
arch/sh/drivers/dma/Makefile
arch/sh/include/asm/dma-sh.h
arch/x86/include/asm/nmi.h
arch/x86/kernel/apic/nmi.c
arch/x86/kernel/vsyscall_64.c
arch/x86/mm/fault.c
crypto/async_tx/Kconfig
crypto/async_tx/Makefile
crypto/async_tx/async_memcpy.c
crypto/async_tx/async_memset.c
crypto/async_tx/async_pq.c [new file with mode: 0644]
crypto/async_tx/async_raid6_recov.c [new file with mode: 0644]
crypto/async_tx/async_tx.c
crypto/async_tx/async_xor.c
crypto/async_tx/raid6test.c [new file with mode: 0644]
drivers/cdrom/cdrom.c
drivers/char/Kconfig
drivers/char/Makefile
drivers/char/bfin-otp.c
drivers/char/hpet.c
drivers/char/mem.c
drivers/char/mwave/mwavedd.c
drivers/char/random.c
drivers/char/rio/rioctrl.c
drivers/char/uv_mmtimer.c [new file with mode: 0644]
drivers/dca/dca-core.c
drivers/dma/Kconfig
drivers/dma/Makefile
drivers/dma/at_hdmac.c
drivers/dma/at_hdmac_regs.h
drivers/dma/dmaengine.c
drivers/dma/dmatest.c
drivers/dma/dw_dmac.c
drivers/dma/dw_dmac_regs.h
drivers/dma/fsldma.c
drivers/dma/fsldma.h
drivers/dma/ioat.c [deleted file]
drivers/dma/ioat/Makefile [new file with mode: 0644]
drivers/dma/ioat/dca.c [new file with mode: 0644]
drivers/dma/ioat/dma.c [new file with mode: 0644]
drivers/dma/ioat/dma.h [new file with mode: 0644]
drivers/dma/ioat/dma_v2.c [new file with mode: 0644]
drivers/dma/ioat/dma_v2.h [new file with mode: 0644]
drivers/dma/ioat/dma_v3.c [new file with mode: 0644]
drivers/dma/ioat/hw.h [new file with mode: 0644]
drivers/dma/ioat/pci.c [new file with mode: 0644]
drivers/dma/ioat/registers.h [new file with mode: 0644]
drivers/dma/ioat_dca.c [deleted file]
drivers/dma/ioat_dma.c [deleted file]
drivers/dma/ioatdma.h [deleted file]
drivers/dma/ioatdma_hw.h [deleted file]
drivers/dma/ioatdma_registers.h [deleted file]
drivers/dma/iop-adma.c
drivers/dma/iovlock.c
drivers/dma/mv_xor.c
drivers/dma/mv_xor.h
drivers/dma/shdma.c [new file with mode: 0644]
drivers/dma/shdma.h [new file with mode: 0644]
drivers/dma/txx9dmac.c
drivers/dma/txx9dmac.h
drivers/edac/Kconfig
drivers/edac/Makefile
drivers/edac/cpc925_edac.c
drivers/edac/edac_device.c
drivers/edac/edac_mc.c
drivers/edac/edac_pci.c
drivers/edac/i3200_edac.c [new file with mode: 0644]
drivers/edac/mpc85xx_edac.c
drivers/edac/mv64x60_edac.c
drivers/idle/i7300_idle.c
drivers/input/misc/Kconfig
drivers/md/Kconfig
drivers/md/bitmap.c
drivers/md/linear.c
drivers/md/md.c
drivers/md/md.h
drivers/md/multipath.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/md/raid5.h
drivers/media/dvb/dvb-core/dvbdev.h
drivers/media/dvb/dvb-usb/Kconfig
drivers/media/video/saa7164/saa7164-api.c
drivers/media/video/saa7164/saa7164-cmd.c
drivers/media/video/saa7164/saa7164-core.c
drivers/media/video/saa7164/saa7164.h
drivers/memstick/core/memstick.c
drivers/misc/sgi-gru/grukservices.c
drivers/misc/sgi-gru/gruprocfs.c
drivers/mmc/host/atmel-mci.c
drivers/net/wireless/arlan-proc.c
drivers/parport/procfs.c
drivers/staging/go7007/Makefile
drivers/usb/serial/sierra.c
drivers/vlynq/vlynq.c
fs/adfs/inode.c
fs/attr.c
fs/befs/linuxvfs.c
fs/binfmt_elf.c
fs/binfmt_elf_fdpic.c
fs/binfmt_flat.c
fs/block_dev.c
fs/btrfs/inode.c
fs/buffer.c
fs/char_dev.c
fs/cifs/cifsfs.c
fs/cifs/inode.c
fs/coda/coda_int.h
fs/compat.c
fs/drop_caches.c
fs/exec.c
fs/exofs/super.c
fs/ext2/inode.c
fs/ext3/inode.c
fs/ext4/inode.c
fs/fat/inode.c
fs/fcntl.c
fs/file_table.c
fs/fuse/dir.c
fs/fuse/fuse_i.h
fs/fuse/inode.c
fs/gfs2/aops.c
fs/hfs/mdb.c
fs/hfsplus/super.c
fs/hugetlbfs/inode.c
fs/inode.c
fs/internal.h
fs/ioctl.c
fs/isofs/inode.c
fs/jfs/super.c
fs/libfs.c
fs/namespace.c
fs/ncpfs/inode.c
fs/ncpfs/ioctl.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nls/nls_base.c
fs/ntfs/aops.c
fs/ntfs/super.c
fs/ocfs2/aops.c
fs/proc/meminfo.c
fs/proc/proc_sysctl.c
fs/ramfs/file-nommu.c
fs/read_write.c
fs/romfs/super.c
fs/seq_file.c
fs/smbfs/inode.c
fs/super.c
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_sysctl.c
include/asm-generic/fcntl.h
include/asm-generic/mman-common.h
include/asm-generic/siginfo.h
include/linux/async_tx.h
include/linux/binfmts.h
include/linux/cgroup.h
include/linux/configfs.h
include/linux/dca.h
include/linux/debugfs.h
include/linux/dmaengine.h
include/linux/fs.h
include/linux/ftrace.h
include/linux/futex.h
include/linux/hugetlb.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/page-flags.h
include/linux/page_cgroup.h
include/linux/pci_ids.h
include/linux/prctl.h
include/linux/relay.h
include/linux/res_counter.h
include/linux/rmap.h
include/linux/sched.h
include/linux/security.h
include/linux/seq_file.h
include/linux/signal.h
include/linux/swap.h
include/linux/swapops.h
include/linux/sysctl.h
include/linux/time.h
include/linux/tracehook.h
include/linux/tracepoint.h
include/linux/unaligned/be_byteshift.h
include/linux/unaligned/le_byteshift.h
include/linux/writeback.h
include/net/ip.h
include/net/ndisc.h
ipc/ipc_sysctl.c
ipc/mq_sysctl.c
kernel/Makefile
kernel/audit.c
kernel/audit_watch.c
kernel/auditsc.c
kernel/cgroup.c
kernel/cgroup_debug.c [deleted file]
kernel/cgroup_freezer.c
kernel/cpuset.c
kernel/exit.c
kernel/fork.c
kernel/hung_task.c
kernel/ns_cgroup.c
kernel/pid_namespace.c
kernel/ptrace.c
kernel/res_counter.c
kernel/sched.c
kernel/sched_fair.c
kernel/signal.c
kernel/slow-work.c
kernel/softlockup.c
kernel/sys.c
kernel/sysctl.c
kernel/time/Makefile
kernel/time/timeconv.c [new file with mode: 0644]
kernel/trace/ftrace.c
kernel/trace/trace_stack.c
kernel/utsname_sysctl.c
lib/decompress_inflate.c
lib/decompress_unlzma.c
mm/Kconfig
mm/Makefile
mm/filemap.c
mm/hugetlb.c
mm/hwpoison-inject.c [new file with mode: 0644]
mm/ksm.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c [new file with mode: 0644]
mm/memory.c
mm/migrate.c
mm/mremap.c
mm/nommu.c
mm/page-writeback.c
mm/page_alloc.c
mm/rmap.c
mm/shmem.c
mm/swapfile.c
mm/truncate.c
mm/vmscan.c
net/bridge/br_netfilter.c
net/decnet/dn_dev.c
net/decnet/sysctl_net_decnet.c
net/ipv4/devinet.c
net/ipv4/route.c
net/ipv4/sysctl_net_ipv4.c
net/ipv6/addrconf.c
net/ipv6/ndisc.c
net/ipv6/route.c
net/irda/irsysctl.c
net/netfilter/ipvs/ip_vs_ctl.c
net/netfilter/nf_log.c
net/phonet/sysctl.c
net/sunrpc/sysctl.c
net/sunrpc/xprtrdma/svc_rdma.c
security/device_cgroup.c
security/lsm_audit.c
security/min_addr.c
security/selinux/hooks.c

index 1d2c010bae120faacda9f7a40324a3bc3b57a308..e7823ffb1ca0f4f06d8ebbcec85b14d4db9fc10c 100644 (file)
@@ -194,7 +194,6 @@ static void cfag12864b_blit(void)
  */
 
 #include <stdio.h>
-#include <string.h>
 
 #define EXAMPLES       6
 
index 6eb1a97e88ce887c9628843aa664d55aca59071d..455d4e6d346d839eb0bd8b811efed40afd3642fa 100644 (file)
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
 
 # echo 0 > tasks
 
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
 3. Kernel API
 =============
 
@@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-              struct task_struct *task)
+              struct task_struct *task, bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called prior to moving a task into a cgroup; if the subsystem
@@ -509,14 +529,20 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex.
+remain valid while the caller holds cgroup_mutex. If threadgroup is
+true, then a successful result indicates that all threads in the given
+thread's threadgroup can be moved together.
 
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-           struct cgroup *old_cgrp, struct task_struct *task)
+           struct cgroup *old_cgrp, struct task_struct *task,
+           bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+If threadgroup is true, the subsystem should take care of all threads
+in the specified thread's threadgroup. Currently does not support any
+subsystem that might need the old_cgrp for every thread in the group.
 
 void fork(struct cgroup_subsys *ss, struct task_struct *task)
 
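As a rough illustration of the new 'threadgroup' parameter, a subsystem's
can_attach() can vet every thread in the group before the move is allowed.
The sketch below is hypothetical (example_task_allowed() stands in for
whatever per-task policy check a subsystem applies); it only shows the
shape of the check, not any particular in-tree subsystem:

static bool example_task_allowed(struct task_struct *task); /* hypothetical */

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct task_struct *task, bool threadgroup)
{
	struct task_struct *t;

	if (!example_task_allowed(task))
		return -EPERM;

	if (!threadgroup)
		return 0;

	/* the whole thread group moves together: check every thread */
	rcu_read_lock();
	list_for_each_entry_rcu(t, &task->thread_group, thread_group) {
		if (!example_task_allowed(t)) {
			rcu_read_unlock();
			return -EPERM;
		}
	}
	rcu_read_unlock();
	return 0;
}
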
index 23d1262c0775755efa8363d049d008e875776f89..b871f2552b45760e87de534ef119ab5e6dcdd102 100644 (file)
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
 
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
@@ -375,7 +379,42 @@ cgroups created below it.
 
 NOTE2: This feature can be enabled/disabled per subtree.
 
-7. TODO
+7. Soft limits
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits are a best-effort feature; they come with
+no guarantees, but the kernel does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 megabytes)
+
+# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use
+
+# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1: Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups.
+NOTE2: It is recommended to always set the soft limit below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
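
The soft limit interface above is just a cgroup control file, so it can also
be driven from a program.  A minimal sketch, assuming the /cgroups/0 mount
point used in the examples above:

#include <stdio.h>

/* write a soft limit value such as "256M" into a memory cgroup directory */
static int set_soft_limit(const char *cgroup_dir, const char *value)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.soft_limit_in_bytes", cgroup_dir);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", value);
	return fclose(f);
}

int main(void)
{
	return set_soft_limit("/cgroups/0", "256M") ? 1 : 0;
}
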
index 9f59fcbf5d82b9ce3236dd044f9e1d75b4600e79..ba046b8fa92fb4a34a360fe54e6b997c019feb43 100644 (file)
@@ -54,20 +54,23 @@ features surfaced as a result:
 
 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_<operation>(<op specific parameters>,
-                 enum async_tx_flags flags,
-                 struct dma_async_tx_descriptor *dependency,
-                 dma_async_tx_callback callback_routine,
-                 void *callback_parameter);
+async_<operation>(<op specific parameters>, struct async_submit_ctl *submit)
 
 3.2 Supported operations:
-memcpy       - memory copy between a source and a destination buffer
-memset       - fill a destination buffer with a byte value
-xor          - xor a series of source buffers and write the result to a
-              destination buffer
-xor_zero_sum - xor a series of source buffers and set a flag if the
-              result is zero.  The implementation attempts to prevent
-              writes to memory
+memcpy  - memory copy between a source and a destination buffer
+memset  - fill a destination buffer with a byte value
+xor     - xor a series of source buffers and write the result to a
+         destination buffer
+xor_val - xor a series of source buffers and set a flag if the
+         result is zero.  The implementation attempts to prevent
+         writes to memory
+pq     - generate the p+q (raid6 syndrome) from a series of source buffers
+pq_val  - validate that the p and/or q buffers are in sync with a given series of
+         sources
+datap  - (raid6_datap_recov) recover a raid6 data block and the p block
+         from the given sources
+2data  - (raid6_2data_recov) recover 2 raid6 data blocks from the given
+         sources
 
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
@@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor.  A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
-   descriptor of a new operation.
+2/ submitting an unacknowledged descriptor as a dependency to another
+   async_tx call will implicitly set the acknowledged state.
 3/ calling async_tx_ack() on the descriptor.
 
 3.4 When does the operation execute?
@@ -119,30 +122,42 @@ of an operation.
 Perform a xor->copy->xor operation where each operation depends on the
 result from the previous operation:
 
-void complete_xor_copy_xor(void *param)
+void callback(void *param)
 {
-       printk("complete\n");
+       struct completion *cmp = param;
+
+       complete(cmp);
 }
 
-int run_xor_copy_xor(struct page **xor_srcs,
-                    int xor_src_cnt,
-                    struct page *xor_dest,
-                    size_t xor_len,
-                    struct page *copy_src,
-                    struct page *copy_dest,
-                    size_t copy_len)
+void run_xor_copy_xor(struct page **xor_srcs,
+                     int xor_src_cnt,
+                     struct page *xor_dest,
+                     size_t xor_len,
+                     struct page *copy_src,
+                     struct page *copy_dest,
+                     size_t copy_len)
 {
        struct dma_async_tx_descriptor *tx;
+       addr_conv_t addr_conv[xor_src_cnt];
+       struct async_submit_ctl submit;
+       struct completion cmp;
+
+       init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
+                         addr_conv);
+       tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
 
-       tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-                      ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
-       tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
-                         ASYNC_TX_DEP_ACK, tx, NULL, NULL);
-       tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-                      ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
-                      tx, complete_xor_copy_xor, NULL);
+       submit.depend_tx = tx;
+       tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
+
+       init_completion(&cmp);
+       init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
+                         callback, &cmp, addr_conv);
+       tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
 
        async_tx_issue_pending_all();
+
+       wait_for_completion(&cmp);
 }
 
 See include/linux/async_tx.h for more information on the flags.  See the
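
The new pq operation follows the same submit-control pattern as the example
above.  A hedged sketch of generating a RAID6 P/Q syndrome, assuming the
async_gen_syndrome() entry point provided by the new crypto/async_tx/async_pq.c
(NDISKS and the callback are illustrative; by convention blocks[NDISKS-2] and
blocks[NDISKS-1] hold the P and Q destinations):

#include <linux/async_tx.h>
#include <linux/completion.h>

#define NDISKS 8	/* illustrative: 6 data blocks plus P and Q */

static void pq_done(void *param)
{
	complete(param);
}

static void gen_syndrome_example(struct page **blocks, size_t len)
{
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	addr_conv_t addr_conv[NDISKS];
	struct completion cmp;

	init_completion(&cmp);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, pq_done, &cmp, addr_conv);
	tx = async_gen_syndrome(blocks, 0, NDISKS, len, &submit);

	async_tx_issue_pending_all();
	wait_for_completion(&cmp);
}
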
index 736540045dc7b7c9e448fb90bd72fc63346e082d..23a181074f94b5f65ac25c603d65fc749fa5ed78 100644 (file)
@@ -4,7 +4,7 @@ Shared Subtrees
 Contents:
        1) Overview
        2) Features
-       3) smount command
+       3) Setting mount states
        4) Use-case
        5) Detailed semantics
        6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.
 
        Here is an example:
 
-       Lets say /mnt has a mount that is shared.
+       Let's say /mnt has a mount that is shared.
        mount --make-shared /mnt
 
-       note: mount command does not yet support the --make-shared flag.
-       I have included a small C program which does the same by executing
-       'smount /mnt shared'
+       Note: mount(8) command now supports the --make-shared flag,
+       so the sample 'smount' program is no longer needed and has been
+       removed.
 
-       #mount --bind /mnt /tmp
+       # mount --bind /mnt /tmp
        The above command replicates the mount at /mnt to the mountpoint /tmp
        and the contents of both the mounts remain identical.
 
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
        #ls /tmp
        a b c
 
-       Now lets say we mount a device at /tmp/a
-       #mount /dev/sd0  /tmp/a
+       Now let's say we mount a device at /tmp/a
+       # mount /dev/sd0  /tmp/a
 
        #ls /tmp/a
        t1 t2 t2
@@ -80,21 +80,20 @@ replicas continue to be exactly same.
 
        Here is an example:
 
-       Lets say /mnt has a mount which is shared.
-       #mount --make-shared /mnt
+       Let's say /mnt has a mount which is shared.
+       # mount --make-shared /mnt
 
-       Lets bind mount /mnt to /tmp
-       #mount --bind /mnt /tmp
+       Let's bind mount /mnt to /tmp
+       # mount --bind /mnt /tmp
 
        the new mount at /tmp becomes a shared mount and it is a replica of
        the mount at /mnt.
 
-       Now lets make the mount at /tmp; a slave of /mnt
-       #mount --make-slave /tmp
-       [or smount /tmp slave]
+       Now let's make the mount at /tmp; a slave of /mnt
+       # mount --make-slave /tmp
 
-       lets mount /dev/sd0 on /mnt/a
-       #mount /dev/sd0 /mnt/a
+       let's mount /dev/sd0 on /mnt/a
+       # mount /dev/sd0 /mnt/a
 
        #ls /mnt/a
        t1 t2 t3
@@ -104,9 +103,9 @@ replicas continue to be exactly same.
 
        Note the mount event has propagated to the mount at /tmp
 
-       However lets see what happens if we mount something on the mount at /tmp
+       However let's see what happens if we mount something on the mount at /tmp
 
-       #mount /dev/sd1 /tmp/b
+       # mount /dev/sd1 /tmp/b
 
        #ls /tmp/b
        s1 s2 s3
@@ -124,12 +123,11 @@ replicas continue to be exactly same.
 
 2d) A unbindable mount is a unbindable private mount
 
-       lets say we have a mount at /mnt and we make is unbindable
+       let's say we have a mount at /mnt and we make it unbindable
 
-       #mount --make-unbindable /mnt
-        [ smount /mnt  unbindable ]
+       # mount --make-unbindable /mnt
 
-        Lets try to bind mount this mount somewhere else.
+        Let's try to bind mount this mount somewhere else.
         # mount --bind /mnt /tmp
         mount: wrong fs type, bad option, bad superblock on /mnt,
                or too many mounted file systems
@@ -137,149 +135,15 @@ replicas continue to be exactly same.
        Binding a unbindable mount is a invalid operation.
 
 
-3) smount command
+3) Setting mount states
 
-       Currently the mount command is not aware of shared subtree features.
-       Work is in progress to add the support in mount ( util-linux package ).
-       Till then use the following program.
+       The mount command (util-linux package) can be used to set mount
+       states:
 
-       ------------------------------------------------------------------------
-       //
-       //this code was developed my Miklos Szeredi <miklos@szeredi.hu>
-       //and modified by Ram Pai <linuxram@us.ibm.com>
-       // sample usage:
-       //              smount /tmp shared
-       //
-       #include <stdio.h>
-       #include <stdlib.h>
-       #include <unistd.h>
-       #include <string.h>
-       #include <sys/mount.h>
-       #include <sys/fsuid.h>
-
-       #ifndef MS_REC
-       #define MS_REC          0x4000  /* 16384: Recursive loopback */
-       #endif
-
-       #ifndef MS_SHARED
-       #define MS_SHARED               1<<20   /* Shared */
-       #endif
-
-       #ifndef MS_PRIVATE
-       #define MS_PRIVATE              1<<18   /* Private */
-       #endif
-
-       #ifndef MS_SLAVE
-       #define MS_SLAVE                1<<19   /* Slave */
-       #endif
-
-       #ifndef MS_UNBINDABLE
-       #define MS_UNBINDABLE           1<<17   /* Unbindable */
-       #endif
-
-       int main(int argc, char *argv[])
-       {
-               int type;
-               if(argc != 3) {
-                       fprintf(stderr, "usage: %s dir "
-                       "<rshared|rslave|rprivate|runbindable|shared|slave"
-                       "|private|unbindable>\n" , argv[0]);
-                       return 1;
-               }
-
-               fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
-
-               if (strcmp(argv[2],"rshared")==0)
-                       type=(MS_SHARED|MS_REC);
-               else if (strcmp(argv[2],"rslave")==0)
-                       type=(MS_SLAVE|MS_REC);
-               else if (strcmp(argv[2],"rprivate")==0)
-                       type=(MS_PRIVATE|MS_REC);
-               else if (strcmp(argv[2],"runbindable")==0)
-                       type=(MS_UNBINDABLE|MS_REC);
-               else if (strcmp(argv[2],"shared")==0)
-                       type=MS_SHARED;
-               else if (strcmp(argv[2],"slave")==0)
-                       type=MS_SLAVE;
-               else if (strcmp(argv[2],"private")==0)
-                       type=MS_PRIVATE;
-               else if (strcmp(argv[2],"unbindable")==0)
-                       type=MS_UNBINDABLE;
-               else {
-                       fprintf(stderr, "invalid operation: %s\n", argv[2]);
-                       return 1;
-               }
-               setfsuid(getuid());
-
-               if(mount("", argv[1], "dontcare", type, "") == -1) {
-                       perror("mount");
-                       return 1;
-               }
-               return 0;
-       }
-       -----------------------------------------------------------------------
-
-       Copy the above code snippet into smount.c
-       gcc -o smount smount.c
-
-
-       (i) To mark all the mounts under /mnt as shared execute the following
-       command:
-
-               smount /mnt rshared
-               the corresponding syntax planned for mount command is
-               mount --make-rshared /mnt
-
-           just to mark a mount /mnt as shared, execute the following
-           command:
-               smount /mnt shared
-               the corresponding syntax planned for mount command is
-               mount --make-shared /mnt
-
-       (ii) To mark all the shared mounts under /mnt as slave execute the
-       following
-
-            command:
-               smount /mnt rslave
-               the corresponding syntax planned for mount command is
-               mount --make-rslave /mnt
-
-           just to mark a mount /mnt as slave, execute the following
-           command:
-               smount /mnt slave
-               the corresponding syntax planned for mount command is
-               mount --make-slave /mnt
-
-       (iii) To mark all the mounts under /mnt as private execute the
-       following command:
-
-               smount /mnt rprivate
-               the corresponding syntax planned for mount command is
-               mount --make-rprivate /mnt
-
-           just to mark a mount /mnt as private, execute the following
-           command:
-               smount /mnt private
-               the corresponding syntax planned for mount command is
-               mount --make-private /mnt
-
-             NOTE: by default all the mounts are created as private. But if
-             you want to change some shared/slave/unbindable  mount as
-             private at a later point in time, this command can help.
-
-       (iv) To mark all the mounts under /mnt as unbindable execute the
-       following
-
-            command:
-               smount /mnt runbindable
-               the corresponding syntax planned for mount command is
-               mount --make-runbindable /mnt
-
-           just to mark a mount /mnt as unbindable, execute the following
-           command:
-               smount /mnt unbindable
-               the corresponding syntax planned for mount command is
-               mount --make-unbindable /mnt
+       mount --make-shared mountpoint
+       mount --make-slave mountpoint
+       mount --make-private mountpoint
+       mount --make-unbindable mountpoint
 
 
 4) Use cases
@@ -350,7 +214,7 @@ replicas continue to be exactly same.
                mount --rbind / /view/v3
                mount --rbind / /view/v4
 
-               and if /usr has a versioning filesystem mounted, than that
+               and if /usr has a versioning filesystem mounted, then that
                mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
                /view/v4/usr too
 
@@ -390,7 +254,7 @@ replicas continue to be exactly same.
 
                For example:
                        mount --make-shared /mnt
-                       mount --bin /mnt /tmp
+                       mount --bind /mnt /tmp
 
                The mount at /mnt and that at /tmp are both shared and belong
                to the same peer group. Anything mounted or unmounted under
@@ -558,7 +422,7 @@ replicas continue to be exactly same.
        then the subtree under the unbindable mount is pruned in the new
        location.
 
-       eg: lets say we have the following mount tree.
+       eg: let's say we have the following mount tree.
 
                A
              /   \
@@ -566,7 +430,7 @@ replicas continue to be exactly same.
             / \ / \
             D E F G
 
-            Lets say all the mount except the mount C in the tree are
+            Let's say all the mount except the mount C in the tree are
             of a type other than unbindable.
 
             If this tree is rbound to say Z
@@ -683,13 +547,13 @@ replicas continue to be exactly same.
        'b' on mounts that receive propagation from mount 'B' and does not have
        sub-mounts within them are unmounted.
 
-       Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
+       Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
        each other.
 
-       lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+       let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
        'B1', 'B2' and 'B3' respectively.
 
-       lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+       let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
        mount 'B1', 'B2' and 'B3' respectively.
 
        if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -710,7 +574,7 @@ replicas continue to be exactly same.
        A cloned namespace contains all the mounts as that of the parent
        namespace.
 
-       Lets say 'A' and 'B' are the corresponding mounts in the parent and the
+       Let's say 'A' and 'B' are the corresponding mounts in the parent and the
        child namespace.
 
        If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -759,11 +623,11 @@ replicas continue to be exactly same.
                mount --make-slave /mnt
 
                At this point we have the first mount at /tmp and
-               its root dentry is 1. Lets call this mount 'A'
+               its root dentry is 1. Let's call this mount 'A'
                And then we have a second mount at /tmp1 with root
-               dentry 2. Lets call this mount 'B'
+               dentry 2. Let's call this mount 'B'
                Next we have a third mount at /mnt with root dentry
-               mnt. Lets call this mount 'C'
+               mnt. Let's call this mount 'C'
 
                'B' is the slave of 'A' and 'C' is a slave of 'B'
                A -> B -> C
@@ -794,7 +658,7 @@ replicas continue to be exactly same.
 
        Q3 Why is unbindable mount needed?
 
-               Lets say we want to replicate the mount tree at multiple
+               Let's say we want to replicate the mount tree at multiple
                locations within the same subtree.
 
                if one rbind mounts a tree within the same subtree 'n' times
@@ -803,7 +667,7 @@ replicas continue to be exactly same.
                mounts. Here is a example.
 
                step 1:
-                  lets say the root tree has just two directories with
+                  let's say the root tree has just two directories with
                   one vfsmount.
                                    root
                                   /    \
@@ -875,7 +739,7 @@ replicas continue to be exactly same.
                Unclonable mounts come in handy here.
 
                step 1:
-                  lets say the root tree has just two directories with
+                  let's say the root tree has just two directories with
                   one vfsmount.
                                    root
                                   /    \
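
For reference, the --make-* options boil down to a mount(2) call with the
corresponding propagation flag, which is essentially all the removed smount
helper did.  A condensed sketch of the same call:

#include <stdio.h>
#include <sys/mount.h>

#ifndef MS_SHARED		/* older libc headers may lack these flags */
#define MS_SHARED	(1<<20)
#endif
#ifndef MS_SLAVE
#define MS_SLAVE	(1<<19)
#endif
#ifndef MS_PRIVATE
#define MS_PRIVATE	(1<<18)
#endif
#ifndef MS_UNBINDABLE
#define MS_UNBINDABLE	(1<<17)
#endif

/* make_mount("/mnt", MS_SHARED) corresponds to: mount --make-shared /mnt */
static int make_mount(const char *mountpoint, unsigned long type)
{
	if (mount("", mountpoint, "dontcare", type, "") == -1) {
		perror("mount");
		return -1;
	}
	return 0;
}
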
index f49eecf2e57354a7dd94ad73a911ba2f69135c19..623f094c9d8d95a535d3274f60d6a30dc8d71b5f 100644 (file)
@@ -536,6 +536,7 @@ struct address_space_operations {
        /* migrate the contents of a page to the specified target */
        int (*migratepage) (struct page *, struct page *);
        int (*launder_page) (struct page *);
+       int (*error_remove_page) (struct address_space *mapping, struct page *page);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
        prevent redirtying the page, it is kept locked during the whole
        operation.
 
+  error_remove_page: normally set to generic_error_remove_page if truncation
+       is ok for this address space. Used for memory failure handling.
+       Setting this implies you deal with pages going away under you,
+       unless you have them locked or reference counts increased.
+
+
 The File Object
 ===============
 
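A filesystem for which truncating a poisoned page is acceptable would simply
point the new operation at the generic helper named above.  A minimal sketch
(the example_* methods are placeholders for the filesystem's own operations):

static const struct address_space_operations example_aops = {
	.readpage		= example_readpage,	/* hypothetical */
	.writepage		= example_writepage,	/* hypothetical */
	/* pages may be truncated on memory failure in this address space */
	.error_remove_page	= generic_error_remove_page,
};
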
index aafca0a8f66ab9d32b3fc5e3ba1a3c81403e3415..947374977ca5a2ef72e59cdb730d9d50c7f6b0b2 100644 (file)
@@ -135,6 +135,7 @@ Code        Seq#    Include File            Comments
                                        <http://mikonos.dia.unisa.it/tcfs>
 'l'    40-7F   linux/udf_fs_i.h        in development:
                                        <http://sourceforge.net/projects/linux-udf/>
+'m'    00-09   linux/mmtimer.h
 'm'    all     linux/mtio.h            conflict!
 'm'    all     linux/soundcard.h       conflict!
 'm'    all     linux/synclink.h        conflict!
index 1458448436cc4589bc329167ba945eb92719792b..62682500878a97e69a107a99655ca280e72df915 100644 (file)
@@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots
 of error messages about running out of file handles, you might
 want to increase this limit.
 
-The three values in file-nr denote the number of allocated
-file handles, the number of unused file handles and the maximum
-number of file handles. When the allocated file handles come
-close to the maximum, but the number of unused file handles is
-significantly greater than 0, you've encountered a peak in your 
-usage of file handles and you don't need to increase the maximum.
-
+Historically, the three values in file-nr denoted the number of
+allocated file handles, the number of allocated but unused file
+handles, and the maximum number of file handles. Linux 2.6 always
+reports 0 as the number of free file handles -- this is not an
+error, it just means that the number of allocated file handles
+exactly matches the number of used file handles.
+
+Attempts to allocate more file descriptors than file-max are
+reported with printk; look for "VFS: file-max limit <number>
+reached".
 ==============================================================
 
 nr_open:
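
The three file-nr fields are whitespace separated, so they are easy to read
from a program as well as from the shell.  A small sketch:

#include <stdio.h>

int main(void)
{
	unsigned long allocated, unused, max;
	FILE *f = fopen("/proc/sys/fs/file-nr", "r");

	if (!f || fscanf(f, "%lu %lu %lu", &allocated, &unused, &max) != 3) {
		perror("/proc/sys/fs/file-nr");
		return 1;
	}
	fclose(f);
	/* on 2.6 kernels 'unused' is always reported as 0, see above */
	printf("allocated=%lu unused=%lu max=%lu\n", allocated, unused, max);
	return 0;
}
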
index b3d8b492274052c8fbb538b1fba96c16a4d64cff..a028b92001eddca50be93544c8667bd585417401 100644 (file)
@@ -22,6 +22,7 @@ show up in /proc/sys/kernel:
 - callhome                  [ S390 only ]
 - auto_msgmni
 - core_pattern
+- core_pipe_limit
 - core_uses_pid
 - ctrl-alt-del
 - dentry-state
@@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name.
 
 ==============================================================
 
+core_pipe_limit:
+
+This sysctl is only applicable when core_pattern is configured to pipe core
+files to a user space helper (when the first character of core_pattern is a '|',
+see above).  When collecting cores via a pipe to an application, it is
+occasionally useful for the collecting application to gather data about the
+crashing process from its /proc/pid directory.  In order to do this safely, the
+kernel must wait for the collecting process to exit, so as not to remove the
+crashing process's proc files prematurely.  This in turn creates the possibility
+that a misbehaving userspace collecting process can block the reaping of a
+crashed process simply by never exiting.  This sysctl defends against that.  It
+defines how many concurrent crashing processes may be piped to user space
+applications in parallel.  If this value is exceeded, then those crashing
+processes above that value are noted via the kernel log and their cores are
+skipped.  0 is a special value, indicating that unlimited processes may be
+captured in parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crashing pid>/).  This value defaults
+to 0.
+
+==============================================================
+
 core_uses_pid:
 
 The default coredump filename is "core".  By setting
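
To make the core_pipe_limit description above concrete: a core_pattern pipe
helper is just a program that reads the core image from stdin, and because
the kernel now waits for it to exit (subject to core_pipe_limit) it can
safely look at /proc/<pid> of the crashing task while it runs.  A hedged
sketch, assuming core_pattern is set to something like
"|/usr/local/bin/core-helper %p" (path and helper name are illustrative):

#include <stdio.h>

int main(int argc, char *argv[])
{
	char path[128], buf[8192];
	size_t n;
	FILE *out;

	if (argc < 2)
		return 1;

	/* the %p specifier in core_pattern passes the crashing pid as argv[1] */
	snprintf(path, sizeof(path), "/tmp/core.%s", argv[1]);
	out = fopen(path, "w");
	if (!out)
		return 1;

	/* the core image arrives on stdin */
	while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)
		fwrite(buf, 1, n, out);
	fclose(out);

	/*
	 * /proc/<pid>/ of the crashing process is still present here,
	 * because the kernel waits for this helper to exit before
	 * reaping the task (unless core_pipe_limit is 0).
	 */
	snprintf(path, sizeof(path), "/proc/%s/cmdline", argv[1]);
	/* ... gather whatever additional data is wanted from there ... */
	return 0;
}
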
index e6fb1ec2744b180d25bd41e6a672e5627dd8e971..a6e360d2055c561ae347a75e20c015b3f61a8928 100644 (file)
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode
 
-
 ==============================================================
 
 block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
 
 The default value is 65536.
 
+=============================================================
+
+memory_failure_early_kill:
+
+Control how to kill processes when an uncorrected memory error (typically
+a 2-bit error in a memory module) is detected in the background by hardware
+and cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other up-to-date copy of the data, the kernel will kill processes to
+prevent any data corruption from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+==============================================================
+
+memory_failure_recovery
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
 ==============================================================
 
 min_free_kbytes:
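
The PR_MCE_KILL prctl mentioned above lets an individual process request
early kill (or late kill) regardless of the sysctl.  A hedged sketch; the
fallback #defines are only for toolchains whose headers predate these
constants, and the handler simply reports the poisoned address:

#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL		33
#define PR_MCE_KILL_SET		1
#define PR_MCE_KILL_EARLY	1
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO		5
#endif

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AO)
		/* an "action optional" memory failure was found in the background */
		fprintf(stderr, "hwpoison at %p\n", si->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	/* opt this process in to early kill, independent of the sysctl */
	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	/* ... */
	return 0;
}
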
index 33e8a023df02287cbfe69854f41c8d316668f082..09b164a5700ff371615d6c701fc7578c6bc2f233 100644 (file)
@@ -1 +1,2 @@
+page-types
 slabinfo
index f366fa956179505cc602ff79f90adc83f95f7a77..25fadb448760008dc6408ed0cca2a72c66b2efd0 100644 (file)
@@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the
 mm start up ... this is a loose form of stability on mm_users. For
 example, it is used in copy_mm to protect against a racing tlb_gather_mmu
 single address space optimization, so that the zap_page_range (from
-vmtruncate) does not lose sending ipi's to cloned threads that might 
+truncate) does not lose sending ipi's to cloned threads that might
 be spawned underneath it and go to user mode to drag in pte's into tlbs.
 
 swap_lock
index 3eda8ea00852ee7cbeb44ad39a17a74f0d73bcc4..fa1a30d9e9d540a27324affbfe8aae579661c877 100644 (file)
@@ -5,6 +5,7 @@
  * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
  */
 
+#define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
 #include <getopt.h>
 #include <limits.h>
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 
 
+/*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES      sizeof(uint64_t)
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
+
+
 /*
  * kernel page flags
  */
@@ -126,6 +148,14 @@ static int         nr_addr_ranges;
 static unsigned long   opt_offset[MAX_ADDR_RANGES];
 static unsigned long   opt_size[MAX_ADDR_RANGES];
 
+#define MAX_VMAS       10240
+static int             nr_vmas;
+static unsigned long   pg_start[MAX_VMAS];
+static unsigned long   pg_end[MAX_VMAS];
+static unsigned long   voffset;
+
+static int             pagemap_fd;
+
 #define MAX_BIT_FILTERS        64
 static int             nr_bit_filters;
 static uint64_t                opt_mask[MAX_BIT_FILTERS];
@@ -135,7 +165,6 @@ static int          page_size;
 
 #define PAGES_BATCH    (64 << 10)      /* 64k pages */
 static int             kpageflags_fd;
-static uint64_t                kpageflags_buf[KPF_BYTES * PAGES_BATCH];
 
 #define HASH_SHIFT     13
 #define HASH_SIZE      (1 << HASH_SHIFT)
@@ -158,6 +187,11 @@ static uint64_t    page_flags[HASH_SIZE];
        type __min2 = (y);                      \
        __min1 < __min2 ? __min1 : __min2; })
 
+#define max_t(type, x, y) ({                   \
+       type __max1 = (x);                      \
+       type __max2 = (y);                      \
+       __max1 > __max2 ? __max1 : __max2; })
+
 static unsigned long pages2mb(unsigned long pages)
 {
        return (pages * page_size) >> 20;
@@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags)
 static void show_page_range(unsigned long offset, uint64_t flags)
 {
        static uint64_t      flags0;
+       static unsigned long voff;
        static unsigned long index;
        static unsigned long count;
 
-       if (flags == flags0 && offset == index + count) {
+       if (flags == flags0 && offset == index + count &&
+           (!opt_pid || voffset == voff + count)) {
                count++;
                return;
        }
 
-       if (count)
-               printf("%lu\t%lu\t%s\n",
+       if (count) {
+               if (opt_pid)
+                       printf("%lx\t", voff);
+               printf("%lx\t%lx\t%s\n",
                                index, count, page_flag_name(flags0));
+       }
 
        flags0 = flags;
        index  = offset;
+       voff   = voffset;
        count  = 1;
 }
 
 static void show_page(unsigned long offset, uint64_t flags)
 {
-       printf("%lu\t%s\n", offset, page_flag_name(flags));
+       if (opt_pid)
+               printf("%lx\t", voffset);
+       printf("%lx\t%s\n", offset, page_flag_name(flags));
 }
 
 static void show_summary(void)
@@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count)
        lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
 
        while (count) {
+               uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
+
                batch = min_t(unsigned long, count, PAGES_BATCH);
                n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
                if (n == 0)
@@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count)
        }
 }
 
+
+#define PAGEMAP_BATCH  4096
+static unsigned long task_pfn(unsigned long pgoff)
+{
+       static uint64_t buf[PAGEMAP_BATCH];
+       static unsigned long start;
+       static long count;
+       uint64_t pfn;
+
+       if (pgoff < start || pgoff >= start + count) {
+               if (lseek64(pagemap_fd,
+                           (uint64_t)pgoff * PM_ENTRY_BYTES,
+                           SEEK_SET) < 0) {
+                       perror("pagemap seek");
+                       exit(EXIT_FAILURE);
+               }
+               count = read(pagemap_fd, buf, sizeof(buf));
+               if (count == 0)
+                       return 0;
+               if (count < 0) {
+                       perror("pagemap read");
+                       exit(EXIT_FAILURE);
+               }
+               if (count % PM_ENTRY_BYTES) {
+                       fatal("pagemap read not aligned.\n");
+                       exit(EXIT_FAILURE);
+               }
+               count /= PM_ENTRY_BYTES;
+               start = pgoff;
+       }
+
+       pfn = buf[pgoff - start];
+       if (pfn & PM_PRESENT)
+               pfn = PM_PFRAME(pfn);
+       else
+               pfn = 0;
+
+       return pfn;
+}
+
+static void walk_task(unsigned long index, unsigned long count)
+{
+       int i = 0;
+       const unsigned long end = index + count;
+
+       while (index < end) {
+
+               while (pg_end[i] <= index)
+                       if (++i >= nr_vmas)
+                               return;
+               if (pg_start[i] >= end)
+                       return;
+
+               voffset = max_t(unsigned long, pg_start[i], index);
+               index   = min_t(unsigned long, pg_end[i], end);
+
+               assert(voffset < index);
+               for (; voffset < index; voffset++) {
+                       unsigned long pfn = task_pfn(voffset);
+                       if (pfn)
+                               walk_pfn(pfn, 1);
+               }
+       }
+}
+
+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+       if (nr_addr_ranges >= MAX_ADDR_RANGES)
+               fatal("too many addr ranges\n");
+
+       opt_offset[nr_addr_ranges] = offset;
+       opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
+       nr_addr_ranges++;
+}
+
 static void walk_addr_ranges(void)
 {
        int i;
@@ -415,10 +534,13 @@ static void walk_addr_ranges(void)
        }
 
        if (!nr_addr_ranges)
-               walk_pfn(0, ULONG_MAX);
+               add_addr_range(0, ULONG_MAX);
 
        for (i = 0; i < nr_addr_ranges; i++)
-               walk_pfn(opt_offset[i], opt_size[i]);
+               if (!opt_pid)
+                       walk_pfn(opt_offset[i], opt_size[i]);
+               else
+                       walk_task(opt_offset[i], opt_size[i]);
 
        close(kpageflags_fd);
 }
@@ -446,8 +568,8 @@ static void usage(void)
 "            -r|--raw                  Raw mode, for kernel developers\n"
 "            -a|--addr    addr-spec    Walk a range of pages\n"
 "            -b|--bits    bits-spec    Walk pages with specified bits\n"
-#if 0 /* planned features */
 "            -p|--pid     pid          Walk process address space\n"
+#if 0 /* planned features */
 "            -f|--file    filename     Walk file address space\n"
 #endif
 "            -l|--list                 Show page details in ranges\n"
@@ -459,7 +581,7 @@ static void usage(void)
 "            N+M                       pages range from N to N+M-1\n"
 "            N,M                       pages range from N to M-1\n"
 "            N,                        pages range from N to end\n"
-"            ,M                        pages range from 0 to M\n"
+"            ,M                        pages range from 0 to M-1\n"
 "bits-spec:\n"
 "            bit1,bit2                 (flags & (bit1|bit2)) != 0\n"
 "            bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1\n"
@@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str)
 
 static void parse_pid(const char *str)
 {
+       FILE *file;
+       char buf[5000];
+
        opt_pid = parse_number(str);
-}
 
-static void parse_file(const char *name)
-{
+       sprintf(buf, "/proc/%d/pagemap", opt_pid);
+       pagemap_fd = open(buf, O_RDONLY);
+       if (pagemap_fd < 0) {
+               perror(buf);
+               exit(EXIT_FAILURE);
+       }
+
+       sprintf(buf, "/proc/%d/maps", opt_pid);
+       file = fopen(buf, "r");
+       if (!file) {
+               perror(buf);
+               exit(EXIT_FAILURE);
+       }
+
+       while (fgets(buf, sizeof(buf), file) != NULL) {
+               unsigned long vm_start;
+               unsigned long vm_end;
+               unsigned long long pgoff;
+               int major, minor;
+               char r, w, x, s;
+               unsigned long ino;
+               int n;
+
+               n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
+                          &vm_start,
+                          &vm_end,
+                          &r, &w, &x, &s,
+                          &pgoff,
+                          &major, &minor,
+                          &ino);
+               if (n < 10) {
+                       fprintf(stderr, "unexpected line: %s\n", buf);
+                       continue;
+               }
+               pg_start[nr_vmas] = vm_start / page_size;
+               pg_end[nr_vmas] = vm_end / page_size;
+               if (++nr_vmas >= MAX_VMAS) {
+                       fprintf(stderr, "too many VMAs\n");
+                       break;
+               }
+       }
+       fclose(file);
 }
 
-static void add_addr_range(unsigned long offset, unsigned long size)
+static void parse_file(const char *name)
 {
-       if (nr_addr_ranges >= MAX_ADDR_RANGES)
-               fatal("too much addr ranges\n");
-
-       opt_offset[nr_addr_ranges] = offset;
-       opt_size[nr_addr_ranges] = size;
-       nr_addr_ranges++;
 }
 
 static void parse_addr_range(const char *optarg)
@@ -676,8 +834,10 @@ int main(int argc, char *argv[])
                }
        }
 
+       if (opt_list && opt_pid)
+               printf("voffset\t");
        if (opt_list == 1)
-               printf("offset\tcount\tflags\n");
+               printf("offset\tlen\tflags\n");
        if (opt_list == 2)
                printf("offset\tflags\n");
 
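The PM_* layout added at the top of this file also makes one-off lookups easy
outside of page-types.  A minimal user-space sketch that maps a single virtual
address of the calling process to a page frame number via /proc/self/pagemap
(error handling trimmed; 32-bit builds may need -D_FILE_OFFSET_BITS=64 for the
large pread offsets):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>

#define PM_STATUS_BITS      3
#define PM_PSHIFT_BITS      6
#define PM_PSHIFT_OFFSET    (64 - PM_STATUS_BITS - PM_PSHIFT_BITS)
#define PM_PFRAME_MASK      ((1ULL << PM_PSHIFT_OFFSET) - 1)
#define PM_PRESENT          (4ULL << (64 - PM_STATUS_BITS))

static uint64_t vaddr_to_pfn(unsigned long vaddr)
{
	uint64_t entry = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 0;
	/* one 64-bit entry per virtual page, indexed by virtual page number */
	pread(fd, &entry, sizeof(entry),
	      (uint64_t)(vaddr / sysconf(_SC_PAGESIZE)) * sizeof(entry));
	close(fd);
	return (entry & PM_PRESENT) ? (entry & PM_PFRAME_MASK) : 0;
}
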
index 7c1c0b05b298f069756c1dc119083e75f0e74a24..0c138ba86526907a141ca2e94631bb68ab390c5b 100644 (file)
@@ -2331,7 +2331,9 @@ S:        Orphan
 F:     drivers/hwmon/
 
 HARDWARE RANDOM NUMBER GENERATOR CORE
-S:     Orphan
+M:     Matt Mackall <mpm@selenic.com>
+M:     Herbert Xu <herbert@gondor.apana.org.au>
+S:     Odd fixes
 F:     Documentation/hw_random.txt
 F:     drivers/char/hw_random/
 F:     include/linux/hw_random.h
index 25da0017ec87fb2dd675257d7d16a68cc5f89aff..e42823e954aa30a6693e3799749927b111b0a133 100644 (file)
@@ -26,6 +26,8 @@
 #define F_GETOWN       6       /*  for sockets. */
 #define F_SETSIG       10      /*  for sockets. */
 #define F_GETSIG       11      /*  for sockets. */
+#define F_SETOWN_EX    12
+#define F_GETOWN_EX    13
 
 /* for posix fcntl() and lockf() */
 #define F_RDLCK                1
index e302daecbe56cbe5ac5a9cca91046da88fad51da..8e059e58b0acd6eba4aace468da92dd0090e5008 100644 (file)
@@ -1016,7 +1016,7 @@ marvel_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *m
 {
        struct marvel_agp_aperture *aper = agp->aperture.sysdata;
        return iommu_bind(aper->arena, aper->pg_start + pg_start, 
-                         mem->page_count, mem->memory);
+                         mem->page_count, mem->pages);
 }
 
 static int 
index 319fcb74611e57c9990aaba2e691079e8f1aa038..76686497b1e210992fcf0ee51fd86ed800e5dd14 100644 (file)
@@ -680,7 +680,7 @@ titan_agp_bind_memory(alpha_agp_info *agp, off_t pg_start, struct agp_memory *me
 {
        struct titan_agp_aperture *aper = agp->aperture.sysdata;
        return iommu_bind(aper->arena, aper->pg_start + pg_start, 
-                         mem->page_count, mem->memory);
+                         mem->page_count, mem->pages);
 }
 
 static int 
index 00edd04b585ec00724ea42a9aac24fb3b990a449..85457b2d4516dc009ad0407c6b01731add69081b 100644 (file)
@@ -198,7 +198,7 @@ extern unsigned long size_for_memory(unsigned long max);
 
 extern int iommu_reserve(struct pci_iommu_arena *, long, long);
 extern int iommu_release(struct pci_iommu_arena *, long, long);
-extern int iommu_bind(struct pci_iommu_arena *, long, long, unsigned long *);
+extern int iommu_bind(struct pci_iommu_arena *, long, long, struct page **);
 extern int iommu_unbind(struct pci_iommu_arena *, long, long);
 
 
index d15aedfe60661a5dc83b8292e959d23f2d4b8ac0..8449504f5e0b1d826841181d476067028b540a92 100644 (file)
@@ -876,7 +876,7 @@ iommu_release(struct pci_iommu_arena *arena, long pg_start, long pg_count)
 
 int
 iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count, 
-          unsigned long *physaddrs)
+          struct page **pages)
 {
        unsigned long flags;
        unsigned long *ptes;
@@ -896,7 +896,7 @@ iommu_bind(struct pci_iommu_arena *arena, long pg_start, long pg_count,
        }
                
        for(i = 0, j = pg_start; i < pg_count; i++, j++)
-               ptes[j] = mk_iommu_pte(physaddrs[i]);
+               ptes[j] = mk_iommu_pte(page_to_phys(pages[i]));
 
        spin_unlock_irqrestore(&arena->lock, flags);
 
index 83e6ba338e2c4c9c13c6e05cd0aadeea5ce461fc..1a8c7279a28b39eb8473d5e5ffb383cfbc5040ec 100644 (file)
@@ -187,11 +187,74 @@ union iop3xx_desc {
        void *ptr;
 };
 
+/* No support for p+q operations */
+static inline int
+iop_chan_pq_slot_count(size_t len, int src_cnt, int *slots_per_op)
+{
+       BUG();
+       return 0;
+}
+
+static inline void
+iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
+                 unsigned long flags)
+{
+       BUG();
+}
+
+static inline void
+iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
+{
+       BUG();
+}
+
+static inline void
+iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
+                        dma_addr_t addr, unsigned char coef)
+{
+       BUG();
+}
+
+static inline int
+iop_chan_pq_zero_sum_slot_count(size_t len, int src_cnt, int *slots_per_op)
+{
+       BUG();
+       return 0;
+}
+
+static inline void
+iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
+                         unsigned long flags)
+{
+       BUG();
+}
+
+static inline void
+iop_desc_set_pq_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
+{
+       BUG();
+}
+
+#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
+
+static inline void
+iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
+                             dma_addr_t *src)
+{
+       BUG();
+}
+
 static inline int iop_adma_get_max_xor(void)
 {
        return 32;
 }
 
+static inline int iop_adma_get_max_pq(void)
+{
+       BUG();
+       return 0;
+}
+
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
        int id = chan->device->id;
@@ -332,6 +395,11 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
        return slot_cnt;
 }
 
+static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
+{
+       return 0;
+}
+
 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
 {
@@ -349,6 +417,14 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
        return 0;
 }
 
+
+static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
+                                         struct iop_adma_chan *chan)
+{
+       BUG();
+       return 0;
+}
+
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
 {
@@ -756,13 +832,14 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
        hw_desc->src[0] = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
        struct iop3xx_desc_aau *hw_desc = desc->hw_desc;
        struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
 
        iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en));
-       return desc_ctrl.zero_result_err;
+       return desc_ctrl.zero_result_err << SUM_CHECK_P;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
index 385c6e8cbbd214357e798270ccac8845cb672cec..59b8c3892f76731608b346d0d100910e9047c481 100644 (file)
@@ -86,6 +86,7 @@ struct iop_adma_chan {
  * @idx: pool index
  * @unmap_src_cnt: number of xor sources
  * @unmap_len: transaction bytecount
+ * @tx_list: list of descriptors that are associated with one operation
  * @async_tx: support for the async_tx api
  * @group_list: list of slots that make up a multi-descriptor transaction
  *     for example transfer lengths larger than the supported hw max
@@ -102,10 +103,12 @@ struct iop_adma_desc_slot {
        u16 idx;
        u16 unmap_src_cnt;
        size_t unmap_len;
+       struct list_head tx_list;
        struct dma_async_tx_descriptor async_tx;
        union {
                u32 *xor_check_result;
                u32 *crc32_result;
+               u32 *pq_check_result;
        };
 };
 
index 5722e86f2174a93aa4b91975f2c2b608a5b4a1af..6d3782d85a9ff6d2db65a71de2632cc0b1151b33 100644 (file)
@@ -150,6 +150,8 @@ static inline int iop_adma_get_max_xor(void)
        return 16;
 }
 
+#define iop_adma_get_max_pq iop_adma_get_max_xor
+
 static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan)
 {
        return __raw_readl(ADMA_ADAR(chan));
@@ -211,7 +213,10 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
 #define IOP_ADMA_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define IOP_ADMA_XOR_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
+#define IOP_ADMA_PQ_MAX_BYTE_COUNT ADMA_MAX_BYTE_COUNT
 #define iop_chan_zero_sum_slot_count(l, s, o) iop_chan_xor_slot_count(l, s, o)
+#define iop_chan_pq_slot_count iop_chan_xor_slot_count
+#define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count
 
 static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
@@ -220,6 +225,13 @@ static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
        return hw_desc->dest_addr;
 }
 
+static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
+                                         struct iop_adma_chan *chan)
+{
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+       return hw_desc->q_dest_addr;
+}
+
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
 {
@@ -319,6 +331,58 @@ iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
        return 1;
 }
 
+static inline void
+iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
+                 unsigned long flags)
+{
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+       union {
+               u32 value;
+               struct iop13xx_adma_desc_ctrl field;
+       } u_desc_ctrl;
+
+       u_desc_ctrl.value = 0;
+       u_desc_ctrl.field.src_select = src_cnt - 1;
+       u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
+       u_desc_ctrl.field.pq_xfer_en = 1;
+       u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
+       u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
+       hw_desc->desc_ctrl = u_desc_ctrl.value;
+}
+
+static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
+{
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+       union {
+               u32 value;
+               struct iop13xx_adma_desc_ctrl field;
+       } u_desc_ctrl;
+
+       u_desc_ctrl.value = hw_desc->desc_ctrl;
+       return u_desc_ctrl.field.pq_xfer_en;
+}
+
+static inline void
+iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
+                         unsigned long flags)
+{
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+       union {
+               u32 value;
+               struct iop13xx_adma_desc_ctrl field;
+       } u_desc_ctrl;
+
+       u_desc_ctrl.value = 0;
+       u_desc_ctrl.field.src_select = src_cnt - 1;
+       u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */
+       u_desc_ctrl.field.zero_result = 1;
+       u_desc_ctrl.field.status_write_back_en = 1;
+       u_desc_ctrl.field.pq_xfer_en = 1;
+       u_desc_ctrl.field.p_xfer_dis = !!(flags & DMA_PREP_PQ_DISABLE_P);
+       u_desc_ctrl.field.int_en = flags & DMA_PREP_INTERRUPT;
+       hw_desc->desc_ctrl = u_desc_ctrl.value;
+}
+
 static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan,
                                        u32 byte_count)
@@ -351,6 +415,7 @@ iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len)
        }
 }
 
+#define iop_desc_set_pq_zero_sum_byte_count iop_desc_set_zero_sum_byte_count
 
 static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan,
@@ -361,6 +426,16 @@ static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc,
        hw_desc->upper_dest_addr = 0;
 }
 
+static inline void
+iop_desc_set_pq_addr(struct iop_adma_desc_slot *desc, dma_addr_t *addr)
+{
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
+
+       hw_desc->dest_addr = addr[0];
+       hw_desc->q_dest_addr = addr[1];
+       hw_desc->upper_dest_addr = 0;
+}
+
 static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc,
                                        dma_addr_t addr)
 {
@@ -388,6 +463,29 @@ static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc,
        } while (slot_cnt);
 }
 
+static inline void
+iop_desc_set_pq_src_addr(struct iop_adma_desc_slot *desc, int src_idx,
+                        dma_addr_t addr, unsigned char coef)
+{
+       int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op;
+       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter;
+       struct iop13xx_adma_src *src;
+       int i = 0;
+
+       do {
+               iter = iop_hw_desc_slot_idx(hw_desc, i);
+               src = &iter->src[src_idx];
+               src->src_addr = addr;
+               src->pq_upper_src_addr = 0;
+               src->pq_dmlt = coef;
+               slot_cnt -= slots_per_op;
+               if (slot_cnt) {
+                       i += slots_per_op;
+                       addr += IOP_ADMA_PQ_MAX_BYTE_COUNT;
+               }
+       } while (slot_cnt);
+}
+
 static inline void
 iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
        struct iop_adma_chan *chan)
@@ -399,6 +497,15 @@ iop_desc_init_interrupt(struct iop_adma_desc_slot *desc,
 }
 
 #define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr
+#define iop_desc_set_pq_zero_sum_src_addr iop_desc_set_pq_src_addr
+
+static inline void
+iop_desc_set_pq_zero_sum_addr(struct iop_adma_desc_slot *desc, int pq_idx,
+                             dma_addr_t *src)
+{
+       iop_desc_set_xor_src_addr(desc, pq_idx, src[pq_idx]);
+       iop_desc_set_xor_src_addr(desc, pq_idx+1, src[pq_idx+1]);
+}
 
 static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc,
                                        u32 next_desc_addr)
@@ -428,18 +535,20 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc,
        hw_desc->block_fill_data = val;
 }
 
-static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
+static inline enum sum_check_flags
+iop_desc_get_zero_result(struct iop_adma_desc_slot *desc)
 {
        struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
        struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field;
        struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field;
+       enum sum_check_flags flags;
 
        BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result));
 
-       if (desc_ctrl.pq_xfer_en)
-               return byte_count.zero_result_err_q;
-       else
-               return byte_count.zero_result_err;
+       flags = byte_count.zero_result_err_q << SUM_CHECK_Q;
+       flags |= byte_count.zero_result_err << SUM_CHECK_P;
+
+       return flags;
 }
 
 static inline void iop_chan_append(struct iop_adma_chan *chan)
index bee42c609df6c6db3c491e9826fc5c4d5973225d..5c147fb66a013f809dbc92a4180fb2e89f6d4f22 100644 (file)
@@ -477,10 +477,8 @@ void __init iop13xx_platform_init(void)
                        plat_data = &iop13xx_adma_0_data;
                        dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
                        dma_cap_set(DMA_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+                       dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
                        dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-                       dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
                        dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
                        break;
                case IOP13XX_INIT_ADMA_1:
@@ -489,10 +487,8 @@ void __init iop13xx_platform_init(void)
                        plat_data = &iop13xx_adma_1_data;
                        dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
                        dma_cap_set(DMA_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+                       dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
                        dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-                       dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
                        dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
                        break;
                case IOP13XX_INIT_ADMA_2:
@@ -501,14 +497,11 @@ void __init iop13xx_platform_init(void)
                        plat_data = &iop13xx_adma_2_data;
                        dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
                        dma_cap_set(DMA_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask);
+                       dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
                        dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
-                       dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask);
                        dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
-                       dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask);
-                       dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask);
-                       dma_cap_set(DMA_PQ_ZERO_SUM, plat_data->cap_mask);
+                       dma_cap_set(DMA_PQ, plat_data->cap_mask);
+                       dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
                        break;
                }
        }
index 3c127aabe214e4bb8d7a2ff00c37061836a88cff..1ff6a37e893c423da523a945e0cf7c5d8f837152 100644 (file)
@@ -179,7 +179,6 @@ static int __init iop3xx_adma_cap_init(void)
        dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
        #else
        dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask);
-       dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask);
        dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask);
        #endif
 
@@ -188,7 +187,6 @@ static int __init iop3xx_adma_cap_init(void)
        dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
        #else
        dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask);
-       dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask);
        dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask);
        #endif
 
@@ -198,7 +196,7 @@ static int __init iop3xx_adma_cap_init(void)
        dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
        #else
        dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
-       dma_cap_set(DMA_ZERO_SUM, iop3xx_aau_data.cap_mask);
+       dma_cap_set(DMA_XOR_VAL, iop3xx_aau_data.cap_mask);
        dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
        dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
        #endif
index be722fc1acff523986182dcef23da7b15a3c2534..0d4d3e3a4cfcb72c2ad2538c410635f9a799c524 100644 (file)
@@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len)
 /*
  * Send us to sleep.
  */
-static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp,
+static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
                                void __user *buffer, size_t *lenp, loff_t *fpos)
 {
        int retval, mode;
@@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode)
 }
 
 
-static int cmode_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cmode_procctl(ctl_table *ctl, int write,
                         void __user *buffer, size_t *lenp, loff_t *fpos)
 {
        int new_cmode;
 
        if (!write)
-               return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+               return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
        new_cmode = user_atoi(buffer, *lenp);
 
@@ -301,13 +301,13 @@ static int try_set_cm(int new_cm)
        return 0;
 }
 
-static int p0_procctl(ctl_table *ctl, int write, struct file *filp,
+static int p0_procctl(ctl_table *ctl, int write,
                      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
        int new_p0;
 
        if (!write)
-               return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+               return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
        new_p0 = user_atoi(buffer, *lenp);
 
@@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table,
        return 1;
 }
 
-static int cm_procctl(ctl_table *ctl, int write, struct file *filp,
+static int cm_procctl(ctl_table *ctl, int write,
                      void __user *buffer, size_t *lenp, loff_t *fpos)
 {
        int new_cm;
 
        if (!write)
-               return proc_dointvec(ctl, write, filp, buffer, lenp, fpos);
+               return proc_dointvec(ctl, write, buffer, lenp, fpos);
 
        new_cm = user_atoi(buffer, *lenp);
 
index 3f04d4c406b75f397f8c568a730bbcdf118f33f2..b3deed8db619c93979fa626150f2043482df43ff 100644 (file)
@@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table,
 
 
 /* And the same for proc */
-int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
+int proc_dolasatstring(ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
        int r;
 
-       r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+       r = proc_dostring(table, write, buffer, lenp, ppos);
        if ((!write) || r)
                return r;
 
@@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp,
 }
 
 /* proc function to write EEPROM after changing int entry */
-int proc_dolasatint(ctl_table *table, int write, struct file *filp,
+int proc_dolasatint(ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
        int r;
 
-       r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       r = proc_dointvec(table, write, buffer, lenp, ppos);
        if ((!write) || r)
                return r;
 
@@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp,
 static int rtctmp;
 
 /* proc function to read/write RealTime Clock */
-int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
+int proc_dolasatrtc(ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
        struct timespec ts;
@@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp,
                if (rtctmp < 0)
                        rtctmp = 0;
        }
-       r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       r = proc_dointvec(table, write, buffer, lenp, ppos);
        if (r)
                return r;
 
@@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table,
 #endif
 
 #ifdef CONFIG_INET
-int proc_lasat_ip(ctl_table *table, int write, struct file *filp,
+int proc_lasat_ip(ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
        unsigned int ip;
@@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table,
        return 0;
 }
 
-int proc_lasat_prid(ctl_table *table, int write, struct file *filp,
+int proc_lasat_prid(ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
 {
        int r;
 
-       r = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       r = proc_dointvec(table, write, buffer, lenp, ppos);
        if (r < 0)
                return r;
        if (write) {
index 1e1c824764ee1f0202c9012090e06f532eb75e42..5f39d5597cedea76f43c3470bcb9998c9dd07b12 100644 (file)
@@ -28,6 +28,8 @@
 #define F_SETOWN       12      /*  for sockets. */
 #define F_SETSIG       13      /*  for sockets. */
 #define F_GETSIG       14      /*  for sockets. */
+#define F_GETOWN_EX    15
+#define F_SETOWN_EX    16
 
 /* for posix fcntl() and lockf() */
 #define F_RDLCK                01
diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h
new file mode 100644 (file)
index 0000000..a67aeed
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+ * Freescale MPC83XX / MPC85XX DMA Controller
+ *
+ * Copyright (c) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
+#define __ARCH_POWERPC_ASM_FSLDMA_H__
+
+#include <linux/dmaengine.h>
+
+/*
+ * Definitions for the Freescale DMA controller's DMA_SLAVE implementation
+ *
+ * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
+ * transfers. One example is an accelerated copy between two scatterlists;
+ * another is an accelerated copy from multiple non-contiguous device
+ * buffers into a single scatterlist.
+ *
+ * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
+ * structure contains a list of hardware addresses that should be copied
+ * to/from the scatterlist passed into device_prep_slave_sg(). The structure
+ * also has some fields to enable hardware-specific features.
+ */
+
+/**
+ * struct fsl_dma_hw_addr
+ * @entry: linked list entry
+ * @address: the hardware address
+ * @length: length to transfer
+ *
+ * Holds a single physical hardware address / length pair for use
+ * with the DMAEngine DMA_SLAVE API.
+ */
+struct fsl_dma_hw_addr {
+       struct list_head entry;
+
+       dma_addr_t address;
+       size_t length;
+};
+
+/**
+ * struct fsl_dma_slave
+ * @addresses: a linked list of struct fsl_dma_hw_addr structures
+ * @request_count: value for DMA request count
+ * @src_loop_size: set up and enable constant source-address DMA transfers
+ * @dst_loop_size: set up and enable constant destination-address DMA transfers
+ * @external_start: enable externally started DMA transfers
+ * @external_pause: enable externally paused DMA transfers
+ *
+ * Holds a list of address / length pairs for use with the DMAEngine
+ * DMA_SLAVE API implementation for the Freescale DMA controller.
+ */
+struct fsl_dma_slave {
+
+       /* List of hardware address/length pairs */
+       struct list_head addresses;
+
+       /* Support for extra controller features */
+       unsigned int request_count;
+       unsigned int src_loop_size;
+       unsigned int dst_loop_size;
+       bool external_start;
+       bool external_pause;
+};
+
+/**
+ * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
+ * @slave: the &struct fsl_dma_slave to add to
+ * @address: the hardware address to add
+ * @length: the length of bytes to transfer from @address
+ *
+ * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
+ * success, -ERRNO otherwise.
+ */
+static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave,
+                                      dma_addr_t address, size_t length)
+{
+       struct fsl_dma_hw_addr *addr;
+
+       addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+       if (!addr)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&addr->entry);
+       addr->address = address;
+       addr->length = length;
+
+       list_add_tail(&addr->entry, &slave->addresses);
+       return 0;
+}
+
+/**
+ * fsl_dma_slave_free - free a struct fsl_dma_slave
+ * @slave: the struct fsl_dma_slave to free
+ *
+ * Free a struct fsl_dma_slave and all associated address/length pairs
+ */
+static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave)
+{
+       struct fsl_dma_hw_addr *addr, *tmp;
+
+       if (slave) {
+               list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) {
+                       list_del(&addr->entry);
+                       kfree(addr);
+               }
+
+               kfree(slave);
+       }
+}
+
+/**
+ * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave
+ * @gfp: the flags to pass to kzalloc() when allocating this structure
+ *
+ * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new
+ * struct fsl_dma_slave on success, or NULL on failure.
+ */
+static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp)
+{
+       struct fsl_dma_slave *slave;
+
+       slave = kzalloc(sizeof(*slave), gfp);
+       if (!slave)
+               return NULL;
+
+       INIT_LIST_HEAD(&slave->addresses);
+       return slave;
+}
+
+#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */
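A minimal usage sketch for the helpers defined above, assuming the caller supplies the hardware address/length pairs and later hands the structure to the driver together with device_prep_slave_sg(); the function name and the two pairs are illustrative only:

#include <asm/fsldma.h>

static struct fsl_dma_slave *example_build_slave(dma_addr_t addr0, size_t len0,
                                                 dma_addr_t addr1, size_t len1)
{
        struct fsl_dma_slave *slave = fsl_dma_slave_alloc(GFP_KERNEL);

        if (!slave)
                return NULL;

        /* queue two hardware address/length pairs for the transfer */
        if (fsl_dma_slave_append(slave, addr0, len0) ||
            fsl_dma_slave_append(slave, addr1, len1)) {
                fsl_dma_slave_free(slave);
                return NULL;
        }

        return slave;
}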
index 264528e4f58d5ea3fa0ff12f534c925df21001a6..b55fd7ed1c31011c10e7393575473dffefeab596 100644 (file)
@@ -50,10 +50,9 @@ static struct platform_device *appldata_pdev;
  * /proc entries (sysctl)
  */
 static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata";
-static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+static int appldata_timer_handler(ctl_table *ctl, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos);
 static int appldata_interval_handler(ctl_table *ctl, int write,
-                                        struct file *filp,
                                         void __user *buffer,
                                         size_t *lenp, loff_t *ppos);
 
@@ -247,7 +246,7 @@ __appldata_vtimer_setup(int cmd)
  * Start/Stop timer, show status of timer (0 = not active, 1 = active)
  */
 static int
-appldata_timer_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_timer_handler(ctl_table *ctl, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int len;
@@ -289,7 +288,7 @@ out:
  * current timer interval.
  */
 static int
-appldata_interval_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_interval_handler(ctl_table *ctl, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int len, interval;
@@ -335,7 +334,7 @@ out:
  * monitoring (0 = not in process, 1 = in process)
  */
 static int
-appldata_generic_handler(ctl_table *ctl, int write, struct file *filp,
+appldata_generic_handler(ctl_table *ctl, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct appldata_ops *ops = NULL, *tmp_ops;
index 4c512561687df1e84d557564ef90ca9c65497605..20f282c911c28b6ae44c64bebcb7fee85c110062 100644 (file)
@@ -881,11 +881,11 @@ static int debug_active=1;
  * if debug_active is already off
  */
 static int
-s390dbf_procactive(ctl_table *table, int write, struct file *filp,
+s390dbf_procactive(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        if (!write || debug_stoppable || !debug_active)
-               return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+               return proc_dointvec(table, write, buffer, lenp, ppos);
        else
                return 0;
 }
index 413c240cbca773bb964b01cfc75506d0345180e1..b201135cc18c25d38b5b486ca3ffa2729cffbc4d 100644 (file)
@@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp)
 static struct ctl_table cmm_table[];
 
 static int
-cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_pages_handler(ctl_table *ctl, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        char buf[16], *p;
@@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
 }
 
 static int
-cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp,
+cmm_timeout_handler(ctl_table *ctl, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        char buf[64], *p;
index b91fa8dbf047b374cf31e6d3878067d3941df635..4d58eb0973d4f5ea9d048d4e53e949afb02bdfec 100644 (file)
@@ -1,12 +1,9 @@
 menu "DMA support"
 
-config SH_DMA_API
-       bool
 
 config SH_DMA
        bool "SuperH on-chip DMA controller (DMAC) support"
        depends on CPU_SH3 || CPU_SH4
-       select SH_DMA_API
        default n
 
 config SH_DMA_IRQ_MULTI
@@ -19,6 +16,15 @@ config SH_DMA_IRQ_MULTI
                     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785  || \
                     CPU_SUBTYPE_SH7760
 
+config SH_DMA_API
+       depends on SH_DMA
+       bool "SuperH DMA API support"
+       default n
+       help
+         This enables the traditional SuperH DMA API.
+         If you want to use the DMA engine framework instead, do not
+         enable this option; enable DMA_ENGINE and SH_DMAE.
+
 config NR_ONCHIP_DMA_CHANNELS
        int
        depends on SH_DMA
index c6068137b46f42aa13f8491a3a8a298a4514d01b..d88c9484762c941b54ce5166c544f0b7e767819b 100644 (file)
@@ -2,8 +2,7 @@
 # Makefile for the SuperH DMA specific kernel interface routines under Linux.
 #
 
-obj-$(CONFIG_SH_DMA_API)       += dma-api.o dma-sysfs.o
-obj-$(CONFIG_SH_DMA)           += dma-sh.o
+obj-$(CONFIG_SH_DMA_API)       += dma-sh.o dma-api.o dma-sysfs.o
 obj-$(CONFIG_PVR2_DMA)         += dma-pvr2.o
 obj-$(CONFIG_G2_DMA)           += dma-g2.o
 obj-$(CONFIG_SH_DMABRG)                += dmabrg.o
index 68a5f4cb0343eeffa2750543a87f7ea3bbd11d0d..78eed3e0bdf548dc6c5f1761b51eefeb17cdb0a0 100644 (file)
@@ -116,4 +116,17 @@ static u32 dma_base_addr[] __maybe_unused = {
 #define CHCR    0x0C
 #define DMAOR  0x40
 
+/*
+ * For the DMA engine driver
+ *
+ * SuperH DMA mode flags
+ */
+#define SHDMA_MIX_IRQ  (1 << 1)
+#define SHDMA_DMAOR1   (1 << 2)
+#define SHDMA_DMAE1    (1 << 3)
+
+struct sh_dmae_pdata {
+       unsigned int mode;
+};
+
 #endif /* __DMA_SH_H */
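A hedged sketch of board code filling in the new sh_dmae_pdata; which mode bits a given platform needs depends on its DMAC wiring, so the combination below is purely illustrative:

#include <asm/dma-sh.h>

/* hypothetical board: mode flags chosen only for illustration */
static struct sh_dmae_pdata example_dmae_pdata = {
        .mode = SHDMA_DMAOR1 | SHDMA_DMAE1,
};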
index e63cf7d441e1351071997f4524315171c677e952..139d4c1a33a7c85a2989a8ab02ee1f6cee52d601 100644 (file)
@@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog;
 #define NMI_INVALID    3
 
 struct ctl_table;
-struct file;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+extern int proc_nmi_enabled(struct ctl_table *, int ,
                        void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
index cb66a22d98ad72a3ad9eae5fc5b606beb15bcb94..7ff61d6a188ab2d1779270a6ff937c0a1a40b82a 100644 (file)
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 /*
  * proc handler for /proc/sys/kernel/nmi
  */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write,
                        void __user *buffer, size_t *length, loff_t *ppos)
 {
        int old_state;
 
        nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
        old_state = nmi_watchdog_enabled;
-       proc_dointvec(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, buffer, length, ppos);
        if (!!old_state == !!nmi_watchdog_enabled)
                return 0;
 
index cf53a78e2dcf1b6639dd569b3810da809e437e1b..8cb4974ff5990c19267077d473caeeb504fe5bae 100644 (file)
@@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-                      void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
 static ctl_table kernel_table2[] = {
        { .procname = "vsyscall64",
          .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
          .mode = 0644,
-         .proc_handler = vsyscall_sysctl_change },
+         .proc_handler = proc_dointvec },
        {}
 };
 
index 82728f2c6d5599ccda0c4cb1dee132ce3305f5ca..f4cee9028cf0b01e11951662b625f63371f627e6 100644 (file)
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
+       info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
 
        force_sig_info(si_signo, &info, tsk);
 }
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
 }
 
 static void
-do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+         unsigned int fault)
 {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
+       int code = BUS_ADRERR;
 
        up_read(&mm->mmap_sem);
 
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
        tsk->thread.error_code  = error_code;
        tsk->thread.trap_no     = 14;
 
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+#ifdef CONFIG_MEMORY_FAILURE
+       if (fault & VM_FAULT_HWPOISON) {
+               printk(KERN_ERR
+       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+                       tsk->comm, tsk->pid, address);
+               code = BUS_MCEERR_AR;
+       }
+#endif
+       force_sig_info_fault(SIGBUS, code, address, tsk);
 }
 
 static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
-               if (fault & VM_FAULT_SIGBUS)
-                       do_sigbus(regs, error_code, address);
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+                       do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
        }
index d8fb39145986569d6ac0fc327881aa95edf52e62..e5aeb2b79e6f956998b8bd0b2bc3bc882ab5067c 100644 (file)
@@ -14,3 +14,12 @@ config ASYNC_MEMSET
        tristate
        select ASYNC_CORE
 
+config ASYNC_PQ
+       tristate
+       select ASYNC_CORE
+
+config ASYNC_RAID6_RECOV
+       tristate
+       select ASYNC_CORE
+       select ASYNC_PQ
+
index 27baa7d52fbcf863873d9fd55507115d77e46c28..d1e0e6f72bc14e651c2a9c0a1c1f7936ead53970 100644 (file)
@@ -2,3 +2,6 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o
 obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
 obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
+obj-$(CONFIG_ASYNC_PQ) += async_pq.o
+obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o
+obj-$(CONFIG_ASYNC_RAID6_TEST) += raid6test.o
index ddccfb01c416b9a636324ee273244b147c644a16..0ec1fb69d4eacc2a310504caa9109186f9e89370 100644 (file)
  * async_memcpy - attempt to copy memory with a dma engine.
  * @dest: destination page
  * @src: src page
- * @offset: offset in pages to start transaction
+ * @dest_offset: offset into 'dest' to start transaction
+ * @src_offset: offset into 'src' to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
- * @depend_tx: memcpy depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-       unsigned int src_offset, size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+            unsigned int src_offset, size_t len,
+            struct async_submit_ctl *submit)
 {
-       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY,
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY,
                                                      &dest, 1, &src, 1, len);
        struct dma_device *device = chan ? chan->device : NULL;
        struct dma_async_tx_descriptor *tx = NULL;
 
-       if (device) {
+       if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
                dma_addr_t dma_dest, dma_src;
-               unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+               unsigned long dma_prep_flags = 0;
 
+               if (submit->cb_fn)
+                       dma_prep_flags |= DMA_PREP_INTERRUPT;
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_prep_flags |= DMA_PREP_FENCE;
                dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
                                        DMA_FROM_DEVICE);
 
@@ -67,13 +70,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 
        if (tx) {
                pr_debug("%s: (async) len: %zu\n", __func__, len);
-               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+               async_tx_submit(chan, tx, submit);
        } else {
                void *dest_buf, *src_buf;
                pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
                /* wait for any prerequisite operations */
-               async_tx_quiesce(&depend_tx);
+               async_tx_quiesce(&submit->depend_tx);
 
                dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
                src_buf = kmap_atomic(src, KM_USER1) + src_offset;
@@ -83,26 +86,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
                kunmap_atomic(dest_buf, KM_USER0);
                kunmap_atomic(src_buf, KM_USER1);
 
-               async_tx_sync_epilog(cb_fn, cb_param);
+               async_tx_sync_epilog(submit);
        }
 
        return tx;
 }
 EXPORT_SYMBOL_GPL(async_memcpy);
 
-static int __init async_memcpy_init(void)
-{
-       return 0;
-}
-
-static void __exit async_memcpy_exit(void)
-{
-       do { } while (0);
-}
-
-module_init(async_memcpy_init);
-module_exit(async_memcpy_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous memcpy api");
 MODULE_LICENSE("GPL");
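A minimal sketch of the new async_memcpy() calling convention, assuming caller-allocated pages and a hypothetical completion callback; only the ASYNC_TX_ACK flag documented above is set:

#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *
example_copy_page(struct page *dst, struct page *src,
                  dma_async_tx_callback done, void *ctx)
{
        struct async_submit_ctl submit;

        /* no dependency and no scribble buffer are needed for a plain copy */
        init_async_submit(&submit, ASYNC_TX_ACK, NULL, done, ctx, NULL);
        return async_memcpy(dst, src, 0, 0, PAGE_SIZE, &submit);
}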
index 5b5eb99bb244311bb1456c62080249d812967be4..58e4a8752aee52c06681bac43055f53586e3b3e1 100644 (file)
  * @val: fill value
  * @offset: offset into 'dest' to start transaction
  * @len: length in bytes
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: memset depends on the result of this transaction
- * @cb_fn: function to call when the memcpy completes
- * @cb_param: parameter to pass to the callback routine
+ *
+ * honored flags: ASYNC_TX_ACK
  */
 struct dma_async_tx_descriptor *
-async_memset(struct page *dest, int val, unsigned int offset,
-       size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+async_memset(struct page *dest, int val, unsigned int offset, size_t len,
+            struct async_submit_ctl *submit)
 {
-       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET,
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET,
                                                      &dest, 1, NULL, 0, len);
        struct dma_device *device = chan ? chan->device : NULL;
        struct dma_async_tx_descriptor *tx = NULL;
 
-       if (device) {
+       if (device && is_dma_fill_aligned(device, offset, 0, len)) {
                dma_addr_t dma_dest;
-               unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+               unsigned long dma_prep_flags = 0;
 
+               if (submit->cb_fn)
+                       dma_prep_flags |= DMA_PREP_INTERRUPT;
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_prep_flags |= DMA_PREP_FENCE;
                dma_dest = dma_map_page(device->dev, dest, offset, len,
                                        DMA_FROM_DEVICE);
 
@@ -64,38 +64,25 @@ async_memset(struct page *dest, int val, unsigned int offset,
 
        if (tx) {
                pr_debug("%s: (async) len: %zu\n", __func__, len);
-               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+               async_tx_submit(chan, tx, submit);
        } else { /* run the memset synchronously */
                void *dest_buf;
                pr_debug("%s: (sync) len: %zu\n", __func__, len);
 
-               dest_buf = (void *) (((char *) page_address(dest)) + offset);
+               dest_buf = page_address(dest) + offset;
 
                /* wait for any prerequisite operations */
-               async_tx_quiesce(&depend_tx);
+               async_tx_quiesce(&submit->depend_tx);
 
                memset(dest_buf, val, len);
 
-               async_tx_sync_epilog(cb_fn, cb_param);
+               async_tx_sync_epilog(submit);
        }
 
        return tx;
 }
 EXPORT_SYMBOL_GPL(async_memset);
 
-static int __init async_memset_init(void)
-{
-       return 0;
-}
-
-static void __exit async_memset_exit(void)
-{
-       do { } while (0);
-}
-
-module_init(async_memset_init);
-module_exit(async_memset_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous memset api");
 MODULE_LICENSE("GPL");
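The memset path follows the same pattern; a sketch that zeroes a caller-provided page, with no callback so DMA_PREP_INTERRUPT is not requested:

#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *example_zero_page(struct page *page)
{
        struct async_submit_ctl submit;

        init_async_submit(&submit, ASYNC_TX_ACK, NULL, NULL, NULL, NULL);
        return async_memset(page, 0, 0, PAGE_SIZE, &submit);
}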
diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
new file mode 100644 (file)
index 0000000..b88db6d
--- /dev/null
@@ -0,0 +1,395 @@
+/*
+ * Copyright(c) 2007 Yuri Tikhonov <yur@emcraft.com>
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+/**
+ * scribble - space to hold throwaway P buffer for synchronous gen_syndrome
+ */
+static struct page *scribble;
+
+static bool is_raid6_zero_block(struct page *p)
+{
+       return p == (void *) raid6_empty_zero_page;
+}
+
+/* the struct page *blocks[] parameter passed to async_gen_syndrome()
+ * and async_syndrome_val() contains the 'P' destination address at
+ * blocks[disks-2] and the 'Q' destination address at blocks[disks-1]
+ *
+ * note: these are macros as they are used as lvalues
+ */
+#define P(b, d) (b[d-2])
+#define Q(b, d) (b[d-1])
+
+/**
+ * do_async_gen_syndrome - asynchronously calculate P and/or Q
+ */
+static __async_inline struct dma_async_tx_descriptor *
+do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
+                     const unsigned char *scfs, unsigned int offset, int disks,
+                     size_t len, dma_addr_t *dma_src,
+                     struct async_submit_ctl *submit)
+{
+       struct dma_async_tx_descriptor *tx = NULL;
+       struct dma_device *dma = chan->device;
+       enum dma_ctrl_flags dma_flags = 0;
+       enum async_tx_flags flags_orig = submit->flags;
+       dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+       dma_async_tx_callback cb_param_orig = submit->cb_param;
+       int src_cnt = disks - 2;
+       unsigned char coefs[src_cnt];
+       unsigned short pq_src_cnt;
+       dma_addr_t dma_dest[2];
+       int src_off = 0;
+       int idx;
+       int i;
+
+       /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
+       if (P(blocks, disks))
+               dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset,
+                                          len, DMA_BIDIRECTIONAL);
+       else
+               dma_flags |= DMA_PREP_PQ_DISABLE_P;
+       if (Q(blocks, disks))
+               dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset,
+                                          len, DMA_BIDIRECTIONAL);
+       else
+               dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+
+       /* convert source addresses being careful to collapse 'empty'
+        * sources and update the coefficients accordingly
+        */
+       for (i = 0, idx = 0; i < src_cnt; i++) {
+               if (is_raid6_zero_block(blocks[i]))
+                       continue;
+               dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len,
+                                           DMA_TO_DEVICE);
+               coefs[idx] = scfs[i];
+               idx++;
+       }
+       src_cnt = idx;
+
+       while (src_cnt > 0) {
+               submit->flags = flags_orig;
+               pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags));
+               /* if we are submitting additional pqs, leave the chain open,
+                * clear the callback parameters, and leave the destination
+                * buffers mapped
+                */
+               if (src_cnt > pq_src_cnt) {
+                       submit->flags &= ~ASYNC_TX_ACK;
+                       submit->flags |= ASYNC_TX_FENCE;
+                       dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
+                       submit->cb_fn = NULL;
+                       submit->cb_param = NULL;
+               } else {
+                       dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP;
+                       submit->cb_fn = cb_fn_orig;
+                       submit->cb_param = cb_param_orig;
+                       if (cb_fn_orig)
+                               dma_flags |= DMA_PREP_INTERRUPT;
+               }
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_flags |= DMA_PREP_FENCE;
+
+               /* Since we have clobbered the src_list we are committed
+                * to doing this asynchronously.  Drivers force forward
+                * progress in case they can not provide a descriptor
+                */
+               for (;;) {
+                       tx = dma->device_prep_dma_pq(chan, dma_dest,
+                                                    &dma_src[src_off],
+                                                    pq_src_cnt,
+                                                    &coefs[src_off], len,
+                                                    dma_flags);
+                       if (likely(tx))
+                               break;
+                       async_tx_quiesce(&submit->depend_tx);
+                       dma_async_issue_pending(chan);
+               }
+
+               async_tx_submit(chan, tx, submit);
+               submit->depend_tx = tx;
+
+               /* drop completed sources */
+               src_cnt -= pq_src_cnt;
+               src_off += pq_src_cnt;
+
+               dma_flags |= DMA_PREP_CONTINUE;
+       }
+
+       return tx;
+}
+
+/**
+ * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome
+ */
+static void
+do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+                    size_t len, struct async_submit_ctl *submit)
+{
+       void **srcs;
+       int i;
+
+       if (submit->scribble)
+               srcs = submit->scribble;
+       else
+               srcs = (void **) blocks;
+
+       for (i = 0; i < disks; i++) {
+               if (is_raid6_zero_block(blocks[i])) {
+                       BUG_ON(i > disks - 3); /* P or Q can't be zero */
+                       srcs[i] = blocks[i];
+               } else
+                       srcs[i] = page_address(blocks[i]) + offset;
+       }
+       raid6_call.gen_syndrome(disks, len, srcs);
+       async_tx_sync_epilog(submit);
+}
+
+/**
+ * async_gen_syndrome - asynchronously calculate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @submit: submission/completion modifiers
+ *
+ * General note: This routine assumes a field of GF(2^8) with a
+ * primitive polynomial of 0x11d and a generator of {02}.
+ *
+ * 'disks' note: callers can optionally omit either P or Q (but not
+ * both) from the calculation by setting blocks[disks-2] or
+ * blocks[disks-1] to NULL.  When P or Q is omitted 'len' must be <=
+ * PAGE_SIZE as a temporary buffer of this size is used in the
+ * synchronous path.  'disks' always accounts for both destination
+ * buffers.
+ *
+ * 'blocks' note: if submit->scribble is NULL then the contents of
+ * 'blocks' may be overwritten
+ */
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
+                  size_t len, struct async_submit_ctl *submit)
+{
+       int src_cnt = disks - 2;
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+                                                     &P(blocks, disks), 2,
+                                                     blocks, src_cnt, len);
+       struct dma_device *device = chan ? chan->device : NULL;
+       dma_addr_t *dma_src = NULL;
+
+       BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
+
+       if (submit->scribble)
+               dma_src = submit->scribble;
+       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+               dma_src = (dma_addr_t *) blocks;
+
+       if (dma_src && device &&
+           (src_cnt <= dma_maxpq(device, 0) ||
+            dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
+           is_dma_pq_aligned(device, offset, 0, len)) {
+               /* run the p+q asynchronously */
+               pr_debug("%s: (async) disks: %d len: %zu\n",
+                        __func__, disks, len);
+               return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset,
+                                            disks, len, dma_src, submit);
+       }
+
+       /* run the pq synchronously */
+       pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len);
+
+       /* wait for any prerequisite operations */
+       async_tx_quiesce(&submit->depend_tx);
+
+       if (!P(blocks, disks)) {
+               P(blocks, disks) = scribble;
+               BUG_ON(len + offset > PAGE_SIZE);
+       }
+       if (!Q(blocks, disks)) {
+               Q(blocks, disks) = scribble;
+               BUG_ON(len + offset > PAGE_SIZE);
+       }
+       do_sync_gen_syndrome(blocks, offset, disks, len, submit);
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(async_gen_syndrome);
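A hedged sketch of generating P/Q for a stripe with async_gen_syndrome(); the caller is assumed to have laid out 'blocks' as described above (sources at 0..disks-3, P at disks-2, Q at disks-1) and to pass a scribble buffer so 'blocks' is preserved:

#include <linux/async_tx.h>

static struct dma_async_tx_descriptor *
example_gen_pq(struct page **blocks, int disks, size_t len,
               addr_conv_t *scribble)
{
        struct async_submit_ctl submit;

        /* blocks[0..disks-3]: data, blocks[disks-2]: P, blocks[disks-1]: Q */
        init_async_submit(&submit, ASYNC_TX_ACK, NULL, NULL, NULL, scribble);
        return async_gen_syndrome(blocks, 0, disks, len, &submit);
}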
+
+/**
+ * async_syndrome_val - asynchronously validate a raid6 syndrome
+ * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1
+ * @offset: common offset into each block (src and dest) to start transaction
+ * @disks: number of blocks (including missing P or Q, see below)
+ * @len: length of operation in bytes
+ * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set
+ * @spare: temporary result buffer for the synchronous case
+ * @submit: submission / completion modifiers
+ *
+ * The same notes from async_gen_syndrome apply to the 'blocks'
+ * and 'disks' parameters of this routine.  The synchronous path
+ * requires a temporary result buffer and submit->scribble to be
+ * specified.
+ */
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
+                  size_t len, enum sum_check_flags *pqres, struct page *spare,
+                  struct async_submit_ctl *submit)
+{
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL,
+                                                     NULL, 0,  blocks, disks,
+                                                     len);
+       struct dma_device *device = chan ? chan->device : NULL;
+       struct dma_async_tx_descriptor *tx;
+       enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
+       dma_addr_t *dma_src = NULL;
+
+       BUG_ON(disks < 4);
+
+       if (submit->scribble)
+               dma_src = submit->scribble;
+       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+               dma_src = (dma_addr_t *) blocks;
+
+       if (dma_src && device && disks <= dma_maxpq(device, 0) &&
+           is_dma_pq_aligned(device, offset, 0, len)) {
+               struct device *dev = device->dev;
+               dma_addr_t *pq = &dma_src[disks-2];
+               int i;
+
+               pr_debug("%s: (async) disks: %d len: %zu\n",
+                        __func__, disks, len);
+               if (!P(blocks, disks))
+                       dma_flags |= DMA_PREP_PQ_DISABLE_P;
+               if (!Q(blocks, disks))
+                       dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_flags |= DMA_PREP_FENCE;
+               for (i = 0; i < disks; i++)
+                       if (likely(blocks[i])) {
+                               BUG_ON(is_raid6_zero_block(blocks[i]));
+                               dma_src[i] = dma_map_page(dev, blocks[i],
+                                                         offset, len,
+                                                         DMA_TO_DEVICE);
+                       }
+
+               for (;;) {
+                       tx = device->device_prep_dma_pq_val(chan, pq, dma_src,
+                                                           disks - 2,
+                                                           raid6_gfexp,
+                                                           len, pqres,
+                                                           dma_flags);
+                       if (likely(tx))
+                               break;
+                       async_tx_quiesce(&submit->depend_tx);
+                       dma_async_issue_pending(chan);
+               }
+               async_tx_submit(chan, tx, submit);
+
+               return tx;
+       } else {
+               struct page *p_src = P(blocks, disks);
+               struct page *q_src = Q(blocks, disks);
+               enum async_tx_flags flags_orig = submit->flags;
+               dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+               void *scribble = submit->scribble;
+               void *cb_param_orig = submit->cb_param;
+               void *p, *q, *s;
+
+               pr_debug("%s: (sync) disks: %d len: %zu\n",
+                        __func__, disks, len);
+
+               /* caller must provide a temporary result buffer and
+                * allow the input parameters to be preserved
+                */
+               BUG_ON(!spare || !scribble);
+
+               /* wait for any prerequisite operations */
+               async_tx_quiesce(&submit->depend_tx);
+
+               /* recompute p and/or q into the temporary buffer and then
+                * check to see the result matches the current value
+                */
+               tx = NULL;
+               *pqres = 0;
+               if (p_src) {
+                       init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+                                         NULL, NULL, scribble);
+                       tx = async_xor(spare, blocks, offset, disks-2, len, submit);
+                       async_tx_quiesce(&tx);
+                       p = page_address(p_src) + offset;
+                       s = page_address(spare) + offset;
+                       *pqres |= !!memcmp(p, s, len) << SUM_CHECK_P;
+               }
+
+               if (q_src) {
+                       P(blocks, disks) = NULL;
+                       Q(blocks, disks) = spare;
+                       init_async_submit(submit, 0, NULL, NULL, NULL, scribble);
+                       tx = async_gen_syndrome(blocks, offset, disks, len, submit);
+                       async_tx_quiesce(&tx);
+                       q = page_address(q_src) + offset;
+                       s = page_address(spare) + offset;
+                       *pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q;
+               }
+
+               /* restore P, Q and submit */
+               P(blocks, disks) = p_src;
+               Q(blocks, disks) = q_src;
+
+               submit->cb_fn = cb_fn_orig;
+               submit->cb_param = cb_param_orig;
+               submit->flags = flags_orig;
+               async_tx_sync_epilog(submit);
+
+               return NULL;
+       }
+}
+EXPORT_SYMBOL_GPL(async_syndrome_val);
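And a sketch of validating an existing stripe with async_syndrome_val(); 'spare' and 'scribble' are caller-provided as the synchronous path requires, and waiting with async_tx_quiesce() here is only for brevity:

#include <linux/async_tx.h>

static int example_check_pq(struct page **blocks, int disks, size_t len,
                            struct page *spare, addr_conv_t *scribble)
{
        struct async_submit_ctl submit;
        struct dma_async_tx_descriptor *tx;
        enum sum_check_flags pqres = 0;

        init_async_submit(&submit, ASYNC_TX_ACK, NULL, NULL, NULL, scribble);
        tx = async_syndrome_val(blocks, 0, disks, len, &pqres, spare, &submit);
        async_tx_quiesce(&tx);

        /* SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT flag which half mismatched */
        return pqres ? -EILSEQ : 0;
}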
+
+static int __init async_pq_init(void)
+{
+       scribble = alloc_page(GFP_KERNEL);
+
+       if (scribble)
+               return 0;
+
+       pr_err("%s: failed to allocate required spare page\n", __func__);
+
+       return -ENOMEM;
+}
+
+static void __exit async_pq_exit(void)
+{
+       put_page(scribble);
+}
+
+module_init(async_pq_init);
+module_exit(async_pq_exit);
+
+MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
new file mode 100644 (file)
index 0000000..6d73dde
--- /dev/null
@@ -0,0 +1,468 @@
+/*
+ * Asynchronous RAID-6 recovery calculations ASYNC_TX API.
+ * Copyright(c) 2009 Intel Corporation
+ *
+ * based on raid6recov.c:
+ *   Copyright 2002 H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/pq.h>
+#include <linux/async_tx.h>
+
+static struct dma_async_tx_descriptor *
+async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
+                 size_t len, struct async_submit_ctl *submit)
+{
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+                                                     &dest, 1, srcs, 2, len);
+       struct dma_device *dma = chan ? chan->device : NULL;
+       const u8 *amul, *bmul;
+       u8 ax, bx;
+       u8 *a, *b, *c;
+
+       if (dma) {
+               dma_addr_t dma_dest[2];
+               dma_addr_t dma_src[2];
+               struct device *dev = dma->dev;
+               struct dma_async_tx_descriptor *tx;
+               enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_flags |= DMA_PREP_FENCE;
+               dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+               dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
+               dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
+               tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef,
+                                            len, dma_flags);
+               if (tx) {
+                       async_tx_submit(chan, tx, submit);
+                       return tx;
+               }
+
+               /* could not get a descriptor, unmap and fall through to
+                * the synchronous path
+                */
+               dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
+               dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
+               dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE);
+       }
+
+       /* run the operation synchronously */
+       async_tx_quiesce(&submit->depend_tx);
+       amul = raid6_gfmul[coef[0]];
+       bmul = raid6_gfmul[coef[1]];
+       a = page_address(srcs[0]);
+       b = page_address(srcs[1]);
+       c = page_address(dest);
+
+       while (len--) {
+               ax    = amul[*a++];
+               bx    = bmul[*b++];
+               *c++ = ax ^ bx;
+       }
+
+       return NULL;
+}
+
+static struct dma_async_tx_descriptor *
+async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
+          struct async_submit_ctl *submit)
+{
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
+                                                     &dest, 1, &src, 1, len);
+       struct dma_device *dma = chan ? chan->device : NULL;
+       const u8 *qmul; /* Q multiplier table */
+       u8 *d, *s;
+
+       if (dma) {
+               dma_addr_t dma_dest[2];
+               dma_addr_t dma_src[1];
+               struct device *dev = dma->dev;
+               struct dma_async_tx_descriptor *tx;
+               enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
+
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_flags |= DMA_PREP_FENCE;
+               dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+               dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
+               tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
+                                            len, dma_flags);
+               if (tx) {
+                       async_tx_submit(chan, tx, submit);
+                       return tx;
+               }
+
+               /* could not get a descriptor, unmap and fall through to
+                * the synchronous path
+                */
+               dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
+               dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
+       }
+
+       /* no channel available, or failed to allocate a descriptor, so
+        * perform the operation synchronously
+        */
+       async_tx_quiesce(&submit->depend_tx);
+       qmul  = raid6_gfmul[coef];
+       d = page_address(dest);
+       s = page_address(src);
+
+       while (len--)
+               *d++ = qmul[*s++];
+
+       return NULL;
+}
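
Both helpers fall back to the raid6_gfmul lookup tables from <linux/raid/pq.h> when no channel or descriptor is available. As a worked example of that table arithmetic (assuming the usual RAID-6 field polynomial 0x11d used by lib/raid6), multiplying {80} by the generator {02} overflows the byte and is folded back over the polynomial:

    0x80 << 1 = 0x100;   0x100 ^ 0x11d = 0x1d

so raid6_gfmul[0x02][0x80] == raid6_gfmul[0x80][0x02] == 0x1d.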
+
+static struct dma_async_tx_descriptor *
+__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks,
+             struct async_submit_ctl *submit)
+{
+       struct dma_async_tx_descriptor *tx = NULL;
+       struct page *p, *q, *a, *b;
+       struct page *srcs[2];
+       unsigned char coef[2];
+       enum async_tx_flags flags = submit->flags;
+       dma_async_tx_callback cb_fn = submit->cb_fn;
+       void *cb_param = submit->cb_param;
+       void *scribble = submit->scribble;
+
+       p = blocks[4-2];
+       q = blocks[4-1];
+
+       a = blocks[faila];
+       b = blocks[failb];
+
+       /* in the 4 disk case P + Pxy == P and Q + Qxy == Q */
+       /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+       srcs[0] = p;
+       srcs[1] = q;
+       coef[0] = raid6_gfexi[failb-faila];
+       coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_sum_product(b, srcs, coef, bytes, submit);
+
+       /* Dy = P+Pxy+Dx */
+       srcs[0] = p;
+       srcs[1] = b;
+       init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn,
+                         cb_param, scribble);
+       tx = async_xor(a, srcs, 0, 2, bytes, submit);
+
+       return tx;
+
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks,
+             struct async_submit_ctl *submit)
+{
+       struct dma_async_tx_descriptor *tx = NULL;
+       struct page *p, *q, *g, *dp, *dq;
+       struct page *srcs[2];
+       unsigned char coef[2];
+       enum async_tx_flags flags = submit->flags;
+       dma_async_tx_callback cb_fn = submit->cb_fn;
+       void *cb_param = submit->cb_param;
+       void *scribble = submit->scribble;
+       int uninitialized_var(good);
+       int i;
+
+       for (i = 0; i < 3; i++) {
+               if (i == faila || i == failb)
+                       continue;
+               else {
+                       good = i;
+                       break;
+               }
+       }
+       BUG_ON(i >= 3);
+
+       p = blocks[5-2];
+       q = blocks[5-1];
+       g = blocks[good];
+
+       /* Compute syndrome with zero for the missing data pages
+        * Use the dead data pages as temporary storage for delta p and
+        * delta q
+        */
+       dp = blocks[faila];
+       dq = blocks[failb];
+
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_memcpy(dp, g, 0, 0, bytes, submit);
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+
+       /* compute P + Pxy */
+       srcs[0] = dp;
+       srcs[1] = p;
+       init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         NULL, NULL, scribble);
+       tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+       /* compute Q + Qxy */
+       srcs[0] = dq;
+       srcs[1] = q;
+       init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         NULL, NULL, scribble);
+       tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+       /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+       srcs[0] = dp;
+       srcs[1] = dq;
+       coef[0] = raid6_gfexi[failb-faila];
+       coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+       /* Dy = P+Pxy+Dx */
+       srcs[0] = dp;
+       srcs[1] = dq;
+       init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+                         cb_param, scribble);
+       tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+       return tx;
+}
+
+static struct dma_async_tx_descriptor *
+__2data_recov_n(int disks, size_t bytes, int faila, int failb,
+             struct page **blocks, struct async_submit_ctl *submit)
+{
+       struct dma_async_tx_descriptor *tx = NULL;
+       struct page *p, *q, *dp, *dq;
+       struct page *srcs[2];
+       unsigned char coef[2];
+       enum async_tx_flags flags = submit->flags;
+       dma_async_tx_callback cb_fn = submit->cb_fn;
+       void *cb_param = submit->cb_param;
+       void *scribble = submit->scribble;
+
+       p = blocks[disks-2];
+       q = blocks[disks-1];
+
+       /* Compute syndrome with zero for the missing data pages
+        * Use the dead data pages as temporary storage for
+        * delta p and delta q
+        */
+       dp = blocks[faila];
+       blocks[faila] = (void *)raid6_empty_zero_page;
+       blocks[disks-2] = dp;
+       dq = blocks[failb];
+       blocks[failb] = (void *)raid6_empty_zero_page;
+       blocks[disks-1] = dq;
+
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+
+       /* Restore pointer table */
+       blocks[faila]   = dp;
+       blocks[failb]   = dq;
+       blocks[disks-2] = p;
+       blocks[disks-1] = q;
+
+       /* compute P + Pxy */
+       srcs[0] = dp;
+       srcs[1] = p;
+       init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         NULL, NULL, scribble);
+       tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+       /* compute Q + Qxy */
+       srcs[0] = dq;
+       srcs[1] = q;
+       init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         NULL, NULL, scribble);
+       tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+       /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
+       srcs[0] = dp;
+       srcs[1] = dq;
+       coef[0] = raid6_gfexi[failb-faila];
+       coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_sum_product(dq, srcs, coef, bytes, submit);
+
+       /* Dy = P+Pxy+Dx */
+       srcs[0] = dp;
+       srcs[1] = dq;
+       init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+                         cb_param, scribble);
+       tx = async_xor(dp, srcs, 0, 2, bytes, submit);
+
+       return tx;
+}
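
For reference, the descriptor chain built above performs the same per-byte arithmetic as the synchronous fallback raid6_2data_recov() used further down in this file. A minimal sketch, assuming faila < failb, p and q point at the stored parity bytes, and dp/dq already hold the partial parity (Pxy/Qxy) computed over the surviving data:

    /* sketch only: one-pass software equivalent of the chain above */
    static void two_data_recov_sketch(size_t bytes, int faila, int failb,
                                      u8 *p, u8 *q, u8 *dp, u8 *dq)
    {
            const u8 *pbmul = raid6_gfmul[raid6_gfexi[failb - faila]];
            const u8 *qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila] ^
                                                      raid6_gfexp[failb]]];

            while (bytes--) {
                    u8 px = *p++ ^ *dp;       /* P + Pxy */
                    u8 qx = qmul[*q++ ^ *dq]; /* B * (Q + Qxy) */

                    *dq = pbmul[px] ^ qx;     /* recovered data at failb */
                    *dp = *dq ^ px;           /* recovered data at faila */
                    dp++;
                    dq++;
            }
    }

The two table lookups correspond exactly to the coef[0]/coef[1] values handed to async_sum_product() above.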
+
+/**
+ * async_raid6_2data_recov - asynchronously calculate two missing data blocks
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: first failed drive index
+ * @failb: second failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+                       struct page **blocks, struct async_submit_ctl *submit)
+{
+       BUG_ON(faila == failb);
+       if (failb < faila)
+               swap(faila, failb);
+
+       pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+       /* we need to preserve the contents of 'blocks' for the async
+        * case, so punt to synchronous if a scribble buffer is not available
+        */
+       if (!submit->scribble) {
+               void **ptrs = (void **) blocks;
+               int i;
+
+               async_tx_quiesce(&submit->depend_tx);
+               for (i = 0; i < disks; i++)
+                       ptrs[i] = page_address(blocks[i]);
+
+               raid6_2data_recov(disks, bytes, faila, failb, ptrs);
+
+               async_tx_sync_epilog(submit);
+
+               return NULL;
+       }
+
+       switch (disks) {
+       case 4:
+               /* dma devices do not uniformly understand a zero source pq
+                * operation (in contrast to the synchronous case), so
+                * explicitly handle the 4 disk special case
+                */
+               return __2data_recov_4(bytes, faila, failb, blocks, submit);
+       case 5:
+               /* dma devices do not uniformly understand a single
+                * source pq operation (in contrast to the synchronous
+                * case), so explicitly handle the 5 disk special case
+                */
+               return __2data_recov_5(bytes, faila, failb, blocks, submit);
+       default:
+               return __2data_recov_n(disks, bytes, faila, failb, blocks, submit);
+       }
+}
+EXPORT_SYMBOL_GPL(async_raid6_2data_recov);
+
+/**
+ * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block
+ * @disks: number of disks in the RAID-6 array
+ * @bytes: block size
+ * @faila: failed drive index
+ * @blocks: array of source pointers where the last two entries are p and q
+ * @submit: submission/completion modifiers
+ */
+struct dma_async_tx_descriptor *
+async_raid6_datap_recov(int disks, size_t bytes, int faila,
+                       struct page **blocks, struct async_submit_ctl *submit)
+{
+       struct dma_async_tx_descriptor *tx = NULL;
+       struct page *p, *q, *dq;
+       u8 coef;
+       enum async_tx_flags flags = submit->flags;
+       dma_async_tx_callback cb_fn = submit->cb_fn;
+       void *cb_param = submit->cb_param;
+       void *scribble = submit->scribble;
+       struct page *srcs[2];
+
+       pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes);
+
+       /* we need to preserve the contents of 'blocks' for the async
+        * case, so punt to synchronous if a scribble buffer is not available
+        */
+       if (!scribble) {
+               void **ptrs = (void **) blocks;
+               int i;
+
+               async_tx_quiesce(&submit->depend_tx);
+               for (i = 0; i < disks; i++)
+                       ptrs[i] = page_address(blocks[i]);
+
+               raid6_datap_recov(disks, bytes, faila, ptrs);
+
+               async_tx_sync_epilog(submit);
+
+               return NULL;
+       }
+
+       p = blocks[disks-2];
+       q = blocks[disks-1];
+
+       /* Compute syndrome with zero for the missing data page
+        * Use the dead data page as temporary storage for delta q
+        */
+       dq = blocks[faila];
+       blocks[faila] = (void *)raid6_empty_zero_page;
+       blocks[disks-1] = dq;
+
+       /* in the 4 disk case we only need to perform a single source
+        * multiplication
+        */
+       if (disks == 4) {
+               int good = faila == 0 ? 1 : 0;
+               struct page *g = blocks[good];
+
+               init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+                                 scribble);
+               tx = async_memcpy(p, g, 0, 0, bytes, submit);
+
+               init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+                                 scribble);
+               tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
+       } else {
+               init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
+                                 scribble);
+               tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
+       }
+
+       /* Restore pointer table */
+       blocks[faila]   = dq;
+       blocks[disks-1] = q;
+
+       /* calculate g^{-faila} */
+       coef = raid6_gfinv[raid6_gfexp[faila]];
+
+       srcs[0] = dq;
+       srcs[1] = q;
+       init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         NULL, NULL, scribble);
+       tx = async_xor(dq, srcs, 0, 2, bytes, submit);
+
+       init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
+       tx = async_mult(dq, dq, coef, bytes, submit);
+
+       srcs[0] = p;
+       srcs[1] = dq;
+       init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn,
+                         cb_param, scribble);
+       tx = async_xor(p, srcs, 0, 2, bytes, submit);
+
+       return tx;
+}
+EXPORT_SYMBOL_GPL(async_raid6_datap_recov);
+
+MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
+MODULE_DESCRIPTION("asynchronous RAID-6 recovery api");
+MODULE_LICENSE("GPL");
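
A minimal sketch of driving these entry points from a caller, following the same pattern as the raid6test module added later in this merge; my_recover_two(), done and ctx are illustrative names, blocks is the usual page array with P and Q in the last two slots, and addr_conv is a caller-provided addr_conv_t scribble region:

    static void my_recover_two(struct page **blocks, int disks, int faila,
                               int failb, addr_conv_t *addr_conv,
                               dma_async_tx_callback done, void *ctx)
    {
            struct async_submit_ctl submit;
            struct dma_async_tx_descriptor *tx;

            init_async_submit(&submit, ASYNC_TX_ACK, NULL, done, ctx, addr_conv);
            tx = async_raid6_2data_recov(disks, PAGE_SIZE, faila, failb,
                                         blocks, &submit);
            async_tx_issue_pending(tx);
    }

A data+P failure would go through async_raid6_datap_recov() in the same way.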
index 06eb6cc09fef97714d12ebf8859e5e712d798df5..f9cdf04fe7c0a370908c8f6f5f3c04c9fb3d35f7 100644 (file)
@@ -42,16 +42,21 @@ static void __exit async_tx_exit(void)
        async_dmaengine_put();
 }
 
+module_init(async_tx_init);
+module_exit(async_tx_exit);
+
 /**
  * __async_tx_find_channel - find a channel to carry out the operation or let
  *     the transaction execute synchronously
- * @depend_tx: transaction dependency
+ * @submit: transaction dependency and submission modifiers
  * @tx_type: transaction type
  */
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-       enum dma_transaction_type tx_type)
+__async_tx_find_channel(struct async_submit_ctl *submit,
+                       enum dma_transaction_type tx_type)
 {
+       struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
        /* see if we can keep the chain on one channel */
        if (depend_tx &&
            dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
@@ -59,17 +64,6 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
        return async_dma_find_channel(tx_type);
 }
 EXPORT_SYMBOL_GPL(__async_tx_find_channel);
-#else
-static int __init async_tx_init(void)
-{
-       printk(KERN_INFO "async_tx: api initialized (sync-only)\n");
-       return 0;
-}
-
-static void __exit async_tx_exit(void)
-{
-       do { } while (0);
-}
 #endif
 
 
@@ -83,10 +77,14 @@ static void
 async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
                        struct dma_async_tx_descriptor *tx)
 {
-       struct dma_chan *chan;
-       struct dma_device *device;
+       struct dma_chan *chan = depend_tx->chan;
+       struct dma_device *device = chan->device;
        struct dma_async_tx_descriptor *intr_tx = (void *) ~0;
 
+       #ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+       BUG();
+       #endif
+
        /* first check to see if we can still append to depend_tx */
        spin_lock_bh(&depend_tx->lock);
        if (depend_tx->parent && depend_tx->chan == tx->chan) {
@@ -96,11 +94,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
        }
        spin_unlock_bh(&depend_tx->lock);
 
-       if (!intr_tx)
+       /* attached dependency, flush the parent channel */
+       if (!intr_tx) {
+               device->device_issue_pending(chan);
                return;
-
-       chan = depend_tx->chan;
-       device = chan->device;
+       }
 
        /* see if we can schedule an interrupt
         * otherwise poll for completion
@@ -134,6 +132,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
                        intr_tx->tx_submit(intr_tx);
                        async_tx_ack(intr_tx);
                }
+               device->device_issue_pending(chan);
        } else {
                if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
                        panic("%s: DMA_ERROR waiting for depend_tx\n",
@@ -144,13 +143,14 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
 
 
 /**
- * submit_disposition - while holding depend_tx->lock we must avoid submitting
- *     new operations to prevent a circular locking dependency with
- *     drivers that already hold a channel lock when calling
- *     async_tx_run_dependencies.
+ * submit_disposition - flags for routing an incoming operation
  * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock
  * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch
  * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly
+ *
+ * while holding depend_tx->lock we must avoid submitting new operations
+ * to prevent a circular locking dependency with drivers that already
+ * hold a channel lock when calling async_tx_run_dependencies.
  */
 enum submit_disposition {
        ASYNC_TX_SUBMITTED,
@@ -160,11 +160,12 @@ enum submit_disposition {
 
 void
 async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-       enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+               struct async_submit_ctl *submit)
 {
-       tx->callback = cb_fn;
-       tx->callback_param = cb_param;
+       struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
+
+       tx->callback = submit->cb_fn;
+       tx->callback_param = submit->cb_param;
 
        if (depend_tx) {
                enum submit_disposition s;
@@ -220,30 +221,29 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
                tx->tx_submit(tx);
        }
 
-       if (flags & ASYNC_TX_ACK)
+       if (submit->flags & ASYNC_TX_ACK)
                async_tx_ack(tx);
 
-       if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+       if (depend_tx)
                async_tx_ack(depend_tx);
 }
 EXPORT_SYMBOL_GPL(async_tx_submit);
 
 /**
- * async_trigger_callback - schedules the callback function to be run after
- * any dependent operations have been completed.
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: 'callback' requires the completion of this transaction
- * @cb_fn: function to call after depend_tx completes
- * @cb_param: parameter to pass to the callback routine
+ * async_trigger_callback - schedules the callback function to be run
+ * @submit: submission and completion parameters
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * The callback is run after any dependent operations have completed.
  */
 struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+async_trigger_callback(struct async_submit_ctl *submit)
 {
        struct dma_chan *chan;
        struct dma_device *device;
        struct dma_async_tx_descriptor *tx;
+       struct dma_async_tx_descriptor *depend_tx = submit->depend_tx;
 
        if (depend_tx) {
                chan = depend_tx->chan;
@@ -262,14 +262,14 @@ async_trigger_callback(enum async_tx_flags flags,
        if (tx) {
                pr_debug("%s: (async)\n", __func__);
 
-               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+               async_tx_submit(chan, tx, submit);
        } else {
                pr_debug("%s: (sync)\n", __func__);
 
                /* wait for any prerequisite operations */
-               async_tx_quiesce(&depend_tx);
+               async_tx_quiesce(&submit->depend_tx);
 
-               async_tx_sync_epilog(cb_fn, cb_param);
+               async_tx_sync_epilog(submit);
        }
 
        return tx;
@@ -295,9 +295,6 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx)
 }
 EXPORT_SYMBOL_GPL(async_tx_quiesce);
 
-module_init(async_tx_init);
-module_exit(async_tx_exit);
-
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API");
 MODULE_LICENSE("GPL");
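
With the old flags/depend_tx/cb_fn/cb_param argument list folded into async_submit_ctl, a caller that simply wants a callback once its previous operation completes now looks roughly like this (a sketch; prev_tx, my_done and ctx are illustrative):

    struct async_submit_ctl submit;
    struct dma_async_tx_descriptor *tx;

    init_async_submit(&submit, ASYNC_TX_ACK, prev_tx, my_done, ctx, NULL);
    tx = async_trigger_callback(&submit);
    async_tx_issue_pending(tx);

There is no longer an ASYNC_TX_DEP_ACK to pass: as the hunk above shows, async_tx_submit() now acks the dependency unconditionally.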
index 90dd3f8bd283171ca725d75f7daae52fd0089ab1..b459a9034aace5270b4e2d7f73387ca4bac4eb05 100644 (file)
 /* do_async_xor - dma map the pages and perform the xor with an engine */
 static __async_inline struct dma_async_tx_descriptor *
 do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
-            unsigned int offset, int src_cnt, size_t len,
-            enum async_tx_flags flags,
-            struct dma_async_tx_descriptor *depend_tx,
-            dma_async_tx_callback cb_fn, void *cb_param)
+            unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src,
+            struct async_submit_ctl *submit)
 {
        struct dma_device *dma = chan->device;
-       dma_addr_t *dma_src = (dma_addr_t *) src_list;
        struct dma_async_tx_descriptor *tx = NULL;
        int src_off = 0;
        int i;
-       dma_async_tx_callback _cb_fn;
-       void *_cb_param;
-       enum async_tx_flags async_flags;
+       dma_async_tx_callback cb_fn_orig = submit->cb_fn;
+       void *cb_param_orig = submit->cb_param;
+       enum async_tx_flags flags_orig = submit->flags;
        enum dma_ctrl_flags dma_flags;
        int xor_src_cnt;
        dma_addr_t dma_dest;
@@ -63,25 +60,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
        }
 
        while (src_cnt) {
-               async_flags = flags;
+               submit->flags = flags_orig;
                dma_flags = 0;
-               xor_src_cnt = min(src_cnt, dma->max_xor);
+               xor_src_cnt = min(src_cnt, (int)dma->max_xor);
                /* if we are submitting additional xors, leave the chain open,
                 * clear the callback parameters, and leave the destination
                 * buffer mapped
                 */
                if (src_cnt > xor_src_cnt) {
-                       async_flags &= ~ASYNC_TX_ACK;
+                       submit->flags &= ~ASYNC_TX_ACK;
+                       submit->flags |= ASYNC_TX_FENCE;
                        dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
-                       _cb_fn = NULL;
-                       _cb_param = NULL;
+                       submit->cb_fn = NULL;
+                       submit->cb_param = NULL;
                } else {
-                       _cb_fn = cb_fn;
-                       _cb_param = cb_param;
+                       submit->cb_fn = cb_fn_orig;
+                       submit->cb_param = cb_param_orig;
                }
-               if (_cb_fn)
+               if (submit->cb_fn)
                        dma_flags |= DMA_PREP_INTERRUPT;
-
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_flags |= DMA_PREP_FENCE;
                /* Since we have clobbered the src_list we are committed
                 * to doing this asynchronously.  Drivers force forward progress
                 * in case they can not provide a descriptor
@@ -90,7 +89,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
                                              xor_src_cnt, len, dma_flags);
 
                if (unlikely(!tx))
-                       async_tx_quiesce(&depend_tx);
+                       async_tx_quiesce(&submit->depend_tx);
 
                /* spin wait for the preceding transactions to complete */
                while (unlikely(!tx)) {
@@ -101,11 +100,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
                                                      dma_flags);
                }
 
-               async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn,
-                               _cb_param);
-
-               depend_tx = tx;
-               flags |= ASYNC_TX_DEP_ACK;
+               async_tx_submit(chan, tx, submit);
+               submit->depend_tx = tx;
 
                if (src_cnt > xor_src_cnt) {
                        /* drop completed sources */
@@ -124,23 +120,27 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
 
 static void
 do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
-           int src_cnt, size_t len, enum async_tx_flags flags,
-           dma_async_tx_callback cb_fn, void *cb_param)
+           int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
        int i;
        int xor_src_cnt;
        int src_off = 0;
        void *dest_buf;
-       void **srcs = (void **) src_list;
+       void **srcs;
+
+       if (submit->scribble)
+               srcs = submit->scribble;
+       else
+               srcs = (void **) src_list;
 
-       /* reuse the 'src_list' array to convert to buffer pointers */
+       /* convert to buffer pointers */
        for (i = 0; i < src_cnt; i++)
                srcs[i] = page_address(src_list[i]) + offset;
 
        /* set destination address */
        dest_buf = page_address(dest) + offset;
 
-       if (flags & ASYNC_TX_XOR_ZERO_DST)
+       if (submit->flags & ASYNC_TX_XOR_ZERO_DST)
                memset(dest_buf, 0, len);
 
        while (src_cnt > 0) {
@@ -153,61 +153,70 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
                src_off += xor_src_cnt;
        }
 
-       async_tx_sync_epilog(cb_fn, cb_param);
+       async_tx_sync_epilog(submit);
 }
 
 /**
  * async_xor - attempt to xor a set of blocks with a dma engine.
- *     xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
- *     flag must be set to not include dest data in the calculation.  The
- *     assumption with dma eninges is that they only use the destination
- *     buffer as a source when it is explicity specified in the source list.
  * @dest: destination page
- * @src_list: array of source pages (if the dest is also a source it must be
- *     at index zero).  The contents of this array may be overwritten.
- * @offset: offset in pages to start transaction
+ * @src_list: array of source pages
+ * @offset: common src/dst offset to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
- * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST,
- *     ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
+ *
+ * xor_blocks always uses the dest as a source so the
+ * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
+ * the calculation.  The assumption with dma engines is that they only
+ * use the destination buffer as a source when it is explicitly specified
+ * in the source list.
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-       int src_cnt, size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+         int src_cnt, size_t len, struct async_submit_ctl *submit)
 {
-       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR,
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
                                                      &dest, 1, src_list,
                                                      src_cnt, len);
+       dma_addr_t *dma_src = NULL;
+
        BUG_ON(src_cnt <= 1);
 
-       if (chan) {
+       if (submit->scribble)
+               dma_src = submit->scribble;
+       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+               dma_src = (dma_addr_t *) src_list;
+
+       if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) {
                /* run the xor asynchronously */
                pr_debug("%s (async): len: %zu\n", __func__, len);
 
                return do_async_xor(chan, dest, src_list, offset, src_cnt, len,
-                                   flags, depend_tx, cb_fn, cb_param);
+                                   dma_src, submit);
        } else {
                /* run the xor synchronously */
                pr_debug("%s (sync): len: %zu\n", __func__, len);
+               WARN_ONCE(chan, "%s: no space for dma address conversion\n",
+                         __func__);
 
                /* in the sync case the dest is an implied source
                 * (assumes the dest is the first source)
                 */
-               if (flags & ASYNC_TX_XOR_DROP_DST) {
+               if (submit->flags & ASYNC_TX_XOR_DROP_DST) {
                        src_cnt--;
                        src_list++;
                }
 
                /* wait for any prerequisite operations */
-               async_tx_quiesce(&depend_tx);
+               async_tx_quiesce(&submit->depend_tx);
 
-               do_sync_xor(dest, src_list, offset, src_cnt, len,
-                           flags, cb_fn, cb_param);
+               do_sync_xor(dest, src_list, offset, src_cnt, len, submit);
 
                return NULL;
        }
@@ -222,104 +231,94 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len)
 }
 
 /**
- * async_xor_zero_sum - attempt a xor parity check with a dma engine.
+ * async_xor_val - attempt a xor parity check with a dma engine.
  * @dest: destination page used if the xor is performed synchronously
- * @src_list: array of source pages.  The dest page must be listed as a source
- *     at index zero.  The contents of this array may be overwritten.
+ * @src_list: array of source pages
  * @offset: offset in pages to start transaction
  * @src_cnt: number of source pages
  * @len: length in bytes
  * @result: 0 if sum == 0 else non-zero
- * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
- * @depend_tx: xor depends on the result of this transaction.
- * @cb_fn: function to call when the xor completes
- * @cb_param: parameter to pass to the callback routine
+ * @submit: submission / completion modifiers
+ *
+ * honored flags: ASYNC_TX_ACK
+ *
+ * src_list note: if the dest is also a source it must be at index zero.
+ * The contents of this array will be overwritten if a scribble region
+ * is not specified.
  */
 struct dma_async_tx_descriptor *
-async_xor_zero_sum(struct page *dest, struct page **src_list,
-       unsigned int offset, int src_cnt, size_t len,
-       u32 *result, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_param)
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+             int src_cnt, size_t len, enum sum_check_flags *result,
+             struct async_submit_ctl *submit)
 {
-       struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM,
+       struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL,
                                                      &dest, 1, src_list,
                                                      src_cnt, len);
        struct dma_device *device = chan ? chan->device : NULL;
        struct dma_async_tx_descriptor *tx = NULL;
+       dma_addr_t *dma_src = NULL;
 
        BUG_ON(src_cnt <= 1);
 
-       if (device && src_cnt <= device->max_xor) {
-               dma_addr_t *dma_src = (dma_addr_t *) src_list;
-               unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0;
+       if (submit->scribble)
+               dma_src = submit->scribble;
+       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
+               dma_src = (dma_addr_t *) src_list;
+
+       if (dma_src && device && src_cnt <= device->max_xor &&
+           is_dma_xor_aligned(device, offset, 0, len)) {
+               unsigned long dma_prep_flags = 0;
                int i;
 
                pr_debug("%s: (async) len: %zu\n", __func__, len);
 
+               if (submit->cb_fn)
+                       dma_prep_flags |= DMA_PREP_INTERRUPT;
+               if (submit->flags & ASYNC_TX_FENCE)
+                       dma_prep_flags |= DMA_PREP_FENCE;
                for (i = 0; i < src_cnt; i++)
                        dma_src[i] = dma_map_page(device->dev, src_list[i],
                                                  offset, len, DMA_TO_DEVICE);
 
-               tx = device->device_prep_dma_zero_sum(chan, dma_src, src_cnt,
-                                                     len, result,
-                                                     dma_prep_flags);
+               tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt,
+                                                    len, result,
+                                                    dma_prep_flags);
                if (unlikely(!tx)) {
-                       async_tx_quiesce(&depend_tx);
+                       async_tx_quiesce(&submit->depend_tx);
 
                        while (!tx) {
                                dma_async_issue_pending(chan);
-                               tx = device->device_prep_dma_zero_sum(chan,
+                               tx = device->device_prep_dma_xor_val(chan,
                                        dma_src, src_cnt, len, result,
                                        dma_prep_flags);
                        }
                }
 
-               async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+               async_tx_submit(chan, tx, submit);
        } else {
-               unsigned long xor_flags = flags;
+               enum async_tx_flags flags_orig = submit->flags;
 
                pr_debug("%s: (sync) len: %zu\n", __func__, len);
+               WARN_ONCE(device && src_cnt <= device->max_xor,
+                         "%s: no space for dma address conversion\n",
+                         __func__);
 
-               xor_flags |= ASYNC_TX_XOR_DROP_DST;
-               xor_flags &= ~ASYNC_TX_ACK;
+               submit->flags |= ASYNC_TX_XOR_DROP_DST;
+               submit->flags &= ~ASYNC_TX_ACK;
 
-               tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags,
-                       depend_tx, NULL, NULL);
+               tx = async_xor(dest, src_list, offset, src_cnt, len, submit);
 
                async_tx_quiesce(&tx);
 
-               *result = page_is_zero(dest, offset, len) ? 0 : 1;
+               *result = !page_is_zero(dest, offset, len) << SUM_CHECK_P;
 
-               async_tx_sync_epilog(cb_fn, cb_param);
+               async_tx_sync_epilog(submit);
+               submit->flags = flags_orig;
        }
 
        return tx;
 }
-EXPORT_SYMBOL_GPL(async_xor_zero_sum);
-
-static int __init async_xor_init(void)
-{
-       #ifdef CONFIG_ASYNC_TX_DMA
-       /* To conserve stack space the input src_list (array of page pointers)
-        * is reused to hold the array of dma addresses passed to the driver.
-        * This conversion is only possible when dma_addr_t is less than the
-        * the size of a pointer.  HIGHMEM64G is known to violate this
-        * assumption.
-        */
-       BUILD_BUG_ON(sizeof(dma_addr_t) > sizeof(struct page *));
-       #endif
-
-       return 0;
-}
-
-static void __exit async_xor_exit(void)
-{
-       do { } while (0);
-}
-
-module_init(async_xor_init);
-module_exit(async_xor_exit);
+EXPORT_SYMBOL_GPL(async_xor_val);
 
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api");
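
A sketch of the updated async_xor() calling convention described in the kernel-doc above, passing a scribble region so the source page array survives the dma-address conversion; dest, srcs, src_cnt, xor_done, ctx and my_addr_conv (presumably an addr_conv_t array with at least src_cnt entries) are illustrative:

    struct async_submit_ctl submit;
    struct dma_async_tx_descriptor *tx;

    init_async_submit(&submit, ASYNC_TX_ACK | ASYNC_TX_XOR_ZERO_DST, NULL,
                      xor_done, ctx, my_addr_conv);
    tx = async_xor(dest, srcs, 0, src_cnt, PAGE_SIZE, &submit);
    async_tx_issue_pending(tx);

Without a scribble region the call still works, but src_list may be clobbered and, as the WARN_ONCE above notes, an available channel can be skipped when dma_addr_t does not fit in a page pointer.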
diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c
new file mode 100644 (file)
index 0000000..3ec27c7
--- /dev/null
@@ -0,0 +1,240 @@
+/*
+ * asynchronous raid6 recovery self test
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * based on drivers/md/raid6test/test.c:
+ *     Copyright 2002-2007 H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/async_tx.h>
+#include <linux/random.h>
+
+#undef pr
+#define pr(fmt, args...) pr_info("raid6test: " fmt, ##args)
+
+#define NDISKS 16 /* Including P and Q */
+
+static struct page *dataptrs[NDISKS];
+static addr_conv_t addr_conv[NDISKS];
+static struct page *data[NDISKS+3];
+static struct page *spare;
+static struct page *recovi;
+static struct page *recovj;
+
+static void callback(void *param)
+{
+       struct completion *cmp = param;
+
+       complete(cmp);
+}
+
+static void makedata(int disks)
+{
+       int i, j;
+
+       for (i = 0; i < disks; i++) {
+               for (j = 0; j < PAGE_SIZE; j += sizeof(u32)) {
+                       u32 *p = page_address(data[i]) + j;
+
+                       *p = random32();
+               }
+
+               dataptrs[i] = data[i];
+       }
+}
+
+static char disk_type(int d, int disks)
+{
+       if (d == disks - 2)
+               return 'P';
+       else if (d == disks - 1)
+               return 'Q';
+       else
+               return 'D';
+}
+
+/* Recover two failed blocks. */
+static void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, struct page **ptrs)
+{
+       struct async_submit_ctl submit;
+       struct completion cmp;
+       struct dma_async_tx_descriptor *tx = NULL;
+       enum sum_check_flags result = ~0;
+
+       if (faila > failb)
+               swap(faila, failb);
+
+       if (failb == disks-1) {
+               if (faila == disks-2) {
+                       /* P+Q failure.  Just rebuild the syndrome. */
+                       init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+                       tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+               } else {
+                       struct page *blocks[disks];
+                       struct page *dest;
+                       int count = 0;
+                       int i;
+
+                       /* data+Q failure.  Reconstruct data from P,
+                        * then rebuild syndrome
+                        */
+                       for (i = disks; i-- ; ) {
+                               if (i == faila || i == failb)
+                                       continue;
+                               blocks[count++] = ptrs[i];
+                       }
+                       dest = ptrs[faila];
+                       init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
+                                         NULL, NULL, addr_conv);
+                       tx = async_xor(dest, blocks, 0, count, bytes, &submit);
+
+                       init_async_submit(&submit, 0, tx, NULL, NULL, addr_conv);
+                       tx = async_gen_syndrome(ptrs, 0, disks, bytes, &submit);
+               }
+       } else {
+               if (failb == disks-2) {
+                       /* data+P failure. */
+                       init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+                       tx = async_raid6_datap_recov(disks, bytes, faila, ptrs, &submit);
+               } else {
+                       /* data+data failure. */
+                       init_async_submit(&submit, 0, NULL, NULL, NULL, addr_conv);
+                       tx = async_raid6_2data_recov(disks, bytes, faila, failb, ptrs, &submit);
+               }
+       }
+       init_completion(&cmp);
+       init_async_submit(&submit, ASYNC_TX_ACK, tx, callback, &cmp, addr_conv);
+       tx = async_syndrome_val(ptrs, 0, disks, bytes, &result, spare, &submit);
+       async_tx_issue_pending(tx);
+
+       if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0)
+               pr("%s: timeout! (faila: %d failb: %d disks: %d)\n",
+                  __func__, faila, failb, disks);
+
+       if (result != 0)
+               pr("%s: validation failure! faila: %d failb: %d sum_check_flags: %x\n",
+                  __func__, faila, failb, result);
+}
+
+static int test_disks(int i, int j, int disks)
+{
+       int erra, errb;
+
+       memset(page_address(recovi), 0xf0, PAGE_SIZE);
+       memset(page_address(recovj), 0xba, PAGE_SIZE);
+
+       dataptrs[i] = recovi;
+       dataptrs[j] = recovj;
+
+       raid6_dual_recov(disks, PAGE_SIZE, i, j, dataptrs);
+
+       erra = memcmp(page_address(data[i]), page_address(recovi), PAGE_SIZE);
+       errb = memcmp(page_address(data[j]), page_address(recovj), PAGE_SIZE);
+
+       pr("%s(%d, %d): faila=%3d(%c)  failb=%3d(%c)  %s\n",
+          __func__, i, j, i, disk_type(i, disks), j, disk_type(j, disks),
+          (!erra && !errb) ? "OK" : !erra ? "ERRB" : !errb ? "ERRA" : "ERRAB");
+
+       dataptrs[i] = data[i];
+       dataptrs[j] = data[j];
+
+       return erra || errb;
+}
+
+static int test(int disks, int *tests)
+{
+       struct dma_async_tx_descriptor *tx;
+       struct async_submit_ctl submit;
+       struct completion cmp;
+       int err = 0;
+       int i, j;
+
+       recovi = data[disks];
+       recovj = data[disks+1];
+       spare  = data[disks+2];
+
+       makedata(disks);
+
+       /* Nuke syndromes */
+       memset(page_address(data[disks-2]), 0xee, PAGE_SIZE);
+       memset(page_address(data[disks-1]), 0xee, PAGE_SIZE);
+
+       /* Generate assumed good syndrome */
+       init_completion(&cmp);
+       init_async_submit(&submit, ASYNC_TX_ACK, NULL, callback, &cmp, addr_conv);
+       tx = async_gen_syndrome(dataptrs, 0, disks, PAGE_SIZE, &submit);
+       async_tx_issue_pending(tx);
+
+       if (wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)) == 0) {
+               pr("error: initial gen_syndrome(%d) timed out\n", disks);
+               return 1;
+       }
+
+       pr("testing the %d-disk case...\n", disks);
+       for (i = 0; i < disks-1; i++)
+               for (j = i+1; j < disks; j++) {
+                       (*tests)++;
+                       err += test_disks(i, j, disks);
+               }
+
+       return err;
+}
+
+
+static int raid6_test(void)
+{
+       int err = 0;
+       int tests = 0;
+       int i;
+
+       for (i = 0; i < NDISKS+3; i++) {
+               data[i] = alloc_page(GFP_KERNEL);
+               if (!data[i]) {
+                       while (i--)
+                               put_page(data[i]);
+                       return -ENOMEM;
+               }
+       }
+
+       /* the 4-disk and 5-disk cases are special for the recovery code */
+       if (NDISKS > 4)
+               err += test(4, &tests);
+       if (NDISKS > 5)
+               err += test(5, &tests);
+       err += test(NDISKS, &tests);
+
+       pr("\n");
+       pr("complete (%d tests, %d failure%s)\n",
+          tests, err, err == 1 ? "" : "s");
+
+       for (i = 0; i < NDISKS+3; i++)
+               put_page(data[i]);
+
+       return 0;
+}
+
+static void raid6_test_exit(void)
+{
+}
+
+/* when compiled-in, wait for drivers to load first (assumes dma drivers
+ * are also compiled-in)
+ */
+late_initcall(raid6_test);
+module_exit(raid6_test_exit);
+MODULE_AUTHOR("Dan Williams <dan.j.williams@intel.com>");
+MODULE_DESCRIPTION("asynchronous RAID-6 recovery self tests");
+MODULE_LICENSE("GPL");
index 71d1b9bab70b515afbe682cca1de5bc4ca3613cb..614da5b8613ac7d11271865bd7766a49b3ffdf7b 100644 (file)
@@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info,
        return 0;
 }
 
-static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_info(ctl_table *ctl, int write,
                            void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int pos;
@@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp,
                goto done;
 doit:
        mutex_unlock(&cdrom_mutex);
-       return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       return proc_dostring(ctl, write, buffer, lenp, ppos);
 done:
        printk(KERN_INFO "cdrom: info buffer too small\n");
        goto doit;
@@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void)
        mutex_unlock(&cdrom_mutex);
 }
 
-static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp,
+static int cdrom_sysctl_handler(ctl_table *ctl, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
        
-       ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write) {
        
index 6a06913b01d30151065471350ec492e7df79a6a3..08a6f50ae791952508ba62fa896a99032788af38 100644 (file)
@@ -1087,6 +1087,14 @@ config MMTIMER
          The mmtimer device allows direct userspace access to the
          Altix system timer.
 
+config UV_MMTIMER
+       tristate "UV_MMTIMER Memory mapped RTC for SGI UV"
+       depends on X86_UV
+       default m
+       help
+         The uv_mmtimer device allows direct userspace access to the
+         UV system timer.
+
 source "drivers/char/tpm/Kconfig"
 
 config TELCLOCK
index 66f779ad4f4c05297409c5eefadcf92d0a3903f1..19a79dd79eee0008503552ac41a14881205d87cd 100644 (file)
@@ -58,6 +58,7 @@ obj-$(CONFIG_RAW_DRIVER)      += raw.o
 obj-$(CONFIG_SGI_SNSC)         += snsc.o snsc_event.o
 obj-$(CONFIG_MSPEC)            += mspec.o
 obj-$(CONFIG_MMTIMER)          += mmtimer.o
+obj-$(CONFIG_UV_MMTIMER)       += uv_mmtimer.o
 obj-$(CONFIG_VIOTAPE)          += viotape.o
 obj-$(CONFIG_HVCS)             += hvcs.o
 obj-$(CONFIG_IBM_BSR)          += bsr.o
index 0a01329451e4af6d0f1b83624de37099c8c29382..e3dd24bff5143206df8c4345fc1fce56fe492cda 100644 (file)
@@ -1,8 +1,7 @@
 /*
  * Blackfin On-Chip OTP Memory Interface
- *  Supports BF52x/BF54x
  *
- * Copyright 2007-2008 Analog Devices Inc.
+ * Copyright 2007-2009 Analog Devices Inc.
  *
  * Enter bugs at http://blackfin.uclinux.org/
  *
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
+#include <mtd/mtd-abi.h>
 
 #include <asm/blackfin.h>
+#include <asm/bfrom.h>
 #include <asm/uaccess.h>
 
 #define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args)
 
 static DEFINE_MUTEX(bfin_otp_lock);
 
-/* OTP Boot ROM functions */
-#define _BOOTROM_OTP_COMMAND           0xEF000018
-#define _BOOTROM_OTP_READ              0xEF00001A
-#define _BOOTROM_OTP_WRITE             0xEF00001C
-
-static u32 (* const otp_command)(u32 command, u32 value) = (void *)_BOOTROM_OTP_COMMAND;
-static u32 (* const otp_read)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_READ;
-static u32 (* const otp_write)(u32 page, u32 flags, u64 *page_content) = (void *)_BOOTROM_OTP_WRITE;
-
-/* otp_command(): defines for "command" */
-#define OTP_INIT             0x00000001
-#define OTP_CLOSE            0x00000002
-
-/* otp_{read,write}(): defines for "flags" */
-#define OTP_LOWER_HALF       0x00000000 /* select upper/lower 64-bit half (bit 0) */
-#define OTP_UPPER_HALF       0x00000001
-#define OTP_NO_ECC           0x00000010 /* do not use ECC */
-#define OTP_LOCK             0x00000020 /* sets page protection bit for page */
-#define OTP_ACCESS_READ      0x00001000
-#define OTP_ACCESS_READWRITE 0x00002000
-
-/* Return values for all functions */
-#define OTP_SUCCESS          0x00000000
-#define OTP_MASTER_ERROR     0x001
-#define OTP_WRITE_ERROR      0x003
-#define OTP_READ_ERROR       0x005
-#define OTP_ACC_VIO_ERROR    0x009
-#define OTP_DATA_MULT_ERROR  0x011
-#define OTP_ECC_MULT_ERROR   0x021
-#define OTP_PREV_WR_ERROR    0x041
-#define OTP_DATA_SB_WARN     0x100
-#define OTP_ECC_SB_WARN      0x200
-
 /**
  *     bfin_otp_read - Read OTP pages
  *
@@ -86,9 +54,11 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
        page = *pos / (sizeof(u64) * 2);
        while (bytes_done < count) {
                flags = (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
-               stamp("processing page %i (%s)", page, (flags == OTP_UPPER_HALF ? "upper" : "lower"));
-               ret = otp_read(page, flags, &content);
+               stamp("processing page %i (0x%x:%s)", page, flags,
+                       (flags & OTP_UPPER_HALF ? "upper" : "lower"));
+               ret = bfrom_OtpRead(page, flags, &content);
                if (ret & OTP_MASTER_ERROR) {
+                       stamp("error from otp: 0x%x", ret);
                        bytes_done = -EIO;
                        break;
                }
@@ -96,7 +66,7 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
                        bytes_done = -EFAULT;
                        break;
                }
-               if (flags == OTP_UPPER_HALF)
+               if (flags & OTP_UPPER_HALF)
                        ++page;
                bytes_done += sizeof(content);
                *pos += sizeof(content);
@@ -108,14 +78,53 @@ static ssize_t bfin_otp_read(struct file *file, char __user *buff, size_t count,
 }
 
 #ifdef CONFIG_BFIN_OTP_WRITE_ENABLE
+static bool allow_writes;
+
+/**
+ *     bfin_otp_init_timing - setup OTP timing parameters
+ *
+ *     Required before doing any write operation.  Algorithms from HRM.
+ */
+static u32 bfin_otp_init_timing(void)
+{
+       u32 tp1, tp2, tp3, timing;
+
+       tp1 = get_sclk() / 1000000;
+       tp2 = (2 * get_sclk() / 10000000) << 8;
+       tp3 = (0x1401) << 15;
+       timing = tp1 | tp2 | tp3;
+       if (bfrom_OtpCommand(OTP_INIT, timing))
+               return 0;
+
+       return timing;
+}
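
As a worked example of the timing word built above (a sketch only; the real values follow the part's HRM), assuming get_sclk() returns 100000000 (100 MHz SCLK):

    tp1 = 100000000 / 1000000             = 0x64
    tp2 = (2 * 100000000 / 10000000) << 8 = 0x1400
    tp3 = 0x1401 << 15                    = 0x0a008000
    timing = tp1 | tp2 | tp3              = 0x0a009464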
+
+/**
+ *     bfin_otp_deinit_timing - set timings to only allow reads
+ *
+ *     Should be called after all writes are done.
+ */
+static void bfin_otp_deinit_timing(u32 timing)
+{
+       /* mask bits [31:15] so that any attempts to write fail */
+       bfrom_OtpCommand(OTP_CLOSE, 0);
+       bfrom_OtpCommand(OTP_INIT, timing & ~(-1 << 15));
+       bfrom_OtpCommand(OTP_CLOSE, 0);
+}
+
 /**
- *     bfin_otp_write - Write OTP pages
+ *     bfin_otp_write - write OTP pages
  *
  *     All writes must be in half page chunks (half page == 64 bits).
  */
 static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t count, loff_t *pos)
 {
-       stampit();
+       ssize_t bytes_done;
+       u32 timing, page, base_flags, flags, ret;
+       u64 content;
+
+       if (!allow_writes)
+               return -EACCES;
 
        if (count % sizeof(u64))
                return -EMSGSIZE;
@@ -123,20 +132,96 @@ static ssize_t bfin_otp_write(struct file *filp, const char __user *buff, size_t
        if (mutex_lock_interruptible(&bfin_otp_lock))
                return -ERESTARTSYS;
 
-       /* need otp_init() documentation before this can be implemented */
+       stampit();
+
+       timing = bfin_otp_init_timing();
+       if (timing == 0) {
+               mutex_unlock(&bfin_otp_lock);
+               return -EIO;
+       }
+
+       base_flags = OTP_CHECK_FOR_PREV_WRITE;
+
+       bytes_done = 0;
+       page = *pos / (sizeof(u64) * 2);
+       while (bytes_done < count) {
+               flags = base_flags | (*pos % (sizeof(u64) * 2) ? OTP_UPPER_HALF : OTP_LOWER_HALF);
+               stamp("processing page %i (0x%x:%s) from %p", page, flags,
+                       (flags & OTP_UPPER_HALF ? "upper" : "lower"), buff + bytes_done);
+               if (copy_from_user(&content, buff + bytes_done, sizeof(content))) {
+                       bytes_done = -EFAULT;
+                       break;
+               }
+               ret = bfrom_OtpWrite(page, flags, &content);
+               if (ret & OTP_MASTER_ERROR) {
+                       stamp("error from otp: 0x%x", ret);
+                       bytes_done = -EIO;
+                       break;
+               }
+               if (flags & OTP_UPPER_HALF)
+                       ++page;
+               bytes_done += sizeof(content);
+               *pos += sizeof(content);
+       }
+
+       bfin_otp_deinit_timing(timing);
 
        mutex_unlock(&bfin_otp_lock);
 
+       return bytes_done;
+}
+
+static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+       stampit();
+
+       switch (cmd) {
+       case OTPLOCK: {
+               u32 timing;
+               int ret = -EIO;
+
+               if (!allow_writes)
+                       return -EACCES;
+
+               if (mutex_lock_interruptible(&bfin_otp_lock))
+                       return -ERESTARTSYS;
+
+               timing = bfin_otp_init_timing();
+               if (timing) {
+                       u32 otp_result = bfrom_OtpWrite(arg, OTP_LOCK, NULL);
+                       stamp("locking page %lu resulted in 0x%x", arg, otp_result);
+                       if (!(otp_result & OTP_MASTER_ERROR))
+                               ret = 0;
+
+                       bfin_otp_deinit_timing(timing);
+               }
+
+               mutex_unlock(&bfin_otp_lock);
+
+               return ret;
+       }
+
+       case MEMLOCK:
+               allow_writes = false;
+               return 0;
+
+       case MEMUNLOCK:
+               allow_writes = true;
+               return 0;
+       }
+
        return -EINVAL;
 }
 #else
 # define bfin_otp_write NULL
+# define bfin_otp_ioctl NULL
 #endif
 
 static struct file_operations bfin_otp_fops = {
-       .owner    = THIS_MODULE,
-       .read     = bfin_otp_read,
-       .write    = bfin_otp_write,
+       .owner          = THIS_MODULE,
+       .unlocked_ioctl = bfin_otp_ioctl,
+       .read           = bfin_otp_read,
+       .write          = bfin_otp_write,
 };
 
 static struct miscdevice bfin_otp_misc_device = {
index 4a9f3492b9216142333f3fa2013542b4638b9c1e..70a770ac013875a3e4b5200b954a21197623e2e7 100644 (file)
@@ -166,9 +166,8 @@ static irqreturn_t hpet_interrupt(int irq, void *data)
                unsigned long m, t;
 
                t = devp->hd_ireqfreq;
-               m = read_counter(&devp->hd_hpet->hpet_mc);
-               write_counter(t + m + devp->hd_hpets->hp_delta,
-                             &devp->hd_timer->hpet_compare);
+               m = read_counter(&devp->hd_timer->hpet_compare);
+               write_counter(t + m, &devp->hd_timer->hpet_compare);
        }
 
        if (devp->hd_flags & HPET_SHARED_IRQ)
@@ -504,21 +503,25 @@ static int hpet_ioctl_ieon(struct hpet_dev *devp)
        g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK;
 
        if (devp->hd_flags & HPET_PERIODIC) {
-               write_counter(t, &timer->hpet_compare);
                g |= Tn_TYPE_CNF_MASK;
-               v |= Tn_TYPE_CNF_MASK;
-               writeq(v, &timer->hpet_config);
-               v |= Tn_VAL_SET_CNF_MASK;
+               v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK;
                writeq(v, &timer->hpet_config);
                local_irq_save(flags);
 
-               /* NOTE:  what we modify here is a hidden accumulator
+               /*
+                * NOTE: First we modify the hidden accumulator
                 * register supported by periodic-capable comparators.
                 * We never want to modify the (single) counter; that
-                * would affect all the comparators.
+                * would affect all the comparators. The value written
+                * is the counter value when the first interrupt is due.
                 */
                m = read_counter(&hpet->hpet_mc);
                write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare);
+               /*
+                * Then we modify the comparator, indicating the period
+                * for subsequent interrupts.
+                */
+               write_counter(t, &timer->hpet_compare);
        } else {
                local_irq_save(flags);
                m = read_counter(&hpet->hpet_mc);
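
The interrupt path above now re-arms the comparator from its previous compare value (compare += period) instead of re-reading the main counter, so interrupt service latency no longer leaks into the emulated period. With illustrative numbers, assuming a period t of 100000 ticks, a comparator that last fired at 1000000, and a handler that runs when the main counter already reads 1000250:

    old: next compare = 1000250 + 100000 + hp_delta   (the 250-tick latency accumulates every period)
    new: next compare = 1000000 + 100000              (the period stays exact)
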
index 0aede1d6a9eaa28a843274793ddc0af54e492962..6c8b65d069e514badcab1b70c6032fea129b2bc7 100644 (file)
@@ -690,7 +690,7 @@ static ssize_t read_zero(struct file * file, char __user * buf,
 
                if (chunk > PAGE_SIZE)
                        chunk = PAGE_SIZE;      /* Just for latency reasons */
-               unwritten = clear_user(buf, chunk);
+               unwritten = __clear_user(buf, chunk);
                written += chunk - unwritten;
                if (unwritten)
                        break;
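
Dropping from clear_user() to __clear_user() removes a redundant access_ok() check from every PAGE_SIZE chunk of the loop; this is only safe because the whole destination range is validated once before the loop, as read_zero() does ahead of this code. A minimal sketch of the pattern, under that assumption (zero_fill is an illustrative name, not the driver's function):

	#include <linux/kernel.h>
	#include <linux/mm.h>
	#include <linux/sched.h>
	#include <linux/uaccess.h>

	/* Sketch only: one access_ok() for the range, unchecked clears per chunk. */
	static ssize_t zero_fill(char __user *buf, size_t count)
	{
		size_t written = 0;

		if (!access_ok(VERIFY_WRITE, buf, count))
			return -EFAULT;

		while (count) {
			size_t chunk = min_t(size_t, count, PAGE_SIZE);
			size_t unwritten = __clear_user(buf + written, chunk);

			written += chunk - unwritten;
			if (unwritten)
				break;
			count -= chunk;
			cond_resched();
		}

		return written ? written : -EFAULT;
	}
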
index 94ad2c3bfc4a290d025bffbbf14481fa1b8dd39d..a4ec50c950722410e9cd6dc99b72b76edf11fb2b 100644 (file)
@@ -281,12 +281,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
                case IOCTL_MW_REGISTER_IPC: {
                        unsigned int ipcnum = (unsigned int) ioarg;
        
-                       PRINTK_3(TRACE_MWAVE,
-                               "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
-                               " ipcnum %x entry usIntCount %x\n",
-                               ipcnum,
-                               pDrvData->IPCs[ipcnum].usIntCount);
-       
                        if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
                                PRINTK_ERROR(KERN_ERR_MWAVE
                                                "mwavedd::mwave_ioctl:"
@@ -295,6 +289,12 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
                                                ipcnum);
                                return -EINVAL;
                        }
+                       PRINTK_3(TRACE_MWAVE,
+                               "mwavedd::mwave_ioctl IOCTL_MW_REGISTER_IPC"
+                               " ipcnum %x entry usIntCount %x\n",
+                               ipcnum,
+                               pDrvData->IPCs[ipcnum].usIntCount);
+
                        lock_kernel();
                        pDrvData->IPCs[ipcnum].bIsHere = FALSE;
                        pDrvData->IPCs[ipcnum].bIsEnabled = TRUE;
@@ -310,11 +310,6 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
                case IOCTL_MW_GET_IPC: {
                        unsigned int ipcnum = (unsigned int) ioarg;
        
-                       PRINTK_3(TRACE_MWAVE,
-                               "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
-                               " ipcnum %x, usIntCount %x\n",
-                               ipcnum,
-                               pDrvData->IPCs[ipcnum].usIntCount);
                        if (ipcnum >= ARRAY_SIZE(pDrvData->IPCs)) {
                                PRINTK_ERROR(KERN_ERR_MWAVE
                                                "mwavedd::mwave_ioctl:"
@@ -322,6 +317,11 @@ static long mwave_ioctl(struct file *file, unsigned int iocmd,
                                                " Invalid ipcnum %x\n", ipcnum);
                                return -EINVAL;
                        }
+                       PRINTK_3(TRACE_MWAVE,
+                               "mwavedd::mwave_ioctl IOCTL_MW_GET_IPC"
+                               " ipcnum %x, usIntCount %x\n",
+                               ipcnum,
+                               pDrvData->IPCs[ipcnum].usIntCount);
        
                        lock_kernel();
                        if (pDrvData->IPCs[ipcnum].bIsEnabled == TRUE) {
index d8a9255e1a3f1ed757d2066a528108cf033e1760..04b505e5a5e25da1b477218b787f6bd9f2470709 100644 (file)
@@ -1231,7 +1231,7 @@ static char sysctl_bootid[16];
  * as an ASCII string in the standard UUID format.  If accessed via the
  * sysctl system call, it is returned as 16 bytes of binary data.
  */
-static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
+static int proc_do_uuid(ctl_table *table, int write,
                        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        ctl_table fake_table;
@@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp,
        fake_table.data = buf;
        fake_table.maxlen = sizeof(buf);
 
-       return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos);
+       return proc_dostring(&fake_table, write, buffer, lenp, ppos);
 }
 
 static int uuid_strategy(ctl_table *table,
index eecee0f576d2d7eea772b574ebac97ec33db04f4..74339559f0b9a3be4a768c5ef1f65d4cc8b3ca58 100644 (file)
@@ -873,7 +873,7 @@ int riocontrol(struct rio_info *p, dev_t dev, int cmd, unsigned long arg, int su
                /*
                 ** It is important that the product code is an unsigned object!
                 */
-               if (DownLoad.ProductCode > MAX_PRODUCT) {
+               if (DownLoad.ProductCode >= MAX_PRODUCT) {
                        rio_dprintk(RIO_DEBUG_CTRL, "RIO_DOWNLOAD: Bad product code %d passed\n", DownLoad.ProductCode);
                        p->RIOError.Error = NO_SUCH_PRODUCT;
                        return -ENXIO;
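
The tightened comparison closes an off-by-one: ProductCode is presumably used below as an index into a table of MAX_PRODUCT entries, so the valid range is 0 .. MAX_PRODUCT-1.

    old check: rejected only ProductCode >  MAX_PRODUCT, so ProductCode == MAX_PRODUCT slipped through
    new check: rejects       ProductCode >= MAX_PRODUCT, the first out-of-range value
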
diff --git a/drivers/char/uv_mmtimer.c b/drivers/char/uv_mmtimer.c
new file mode 100644 (file)
index 0000000..867b67b
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * Timer device implementation for SGI UV platform.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2009 Silicon Graphics, Inc.  All rights reserved.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mmtimer.h>
+#include <linux/miscdevice.h>
+#include <linux/posix-timers.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/smp_lock.h>
+
+#include <asm/genapic.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+
+MODULE_AUTHOR("Dimitri Sivanich <sivanich@sgi.com>");
+MODULE_DESCRIPTION("SGI UV Memory Mapped RTC Timer");
+MODULE_LICENSE("GPL");
+
+/* name of the device, usually in /dev */
+#define UV_MMTIMER_NAME "mmtimer"
+#define UV_MMTIMER_DESC "SGI UV Memory Mapped RTC Timer"
+#define UV_MMTIMER_VERSION "1.0"
+
+static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
+                                               unsigned long arg);
+static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma);
+
+/*
+ * Period in femtoseconds (10^-15 s)
+ */
+static unsigned long uv_mmtimer_femtoperiod;
+
+static const struct file_operations uv_mmtimer_fops = {
+       .owner = THIS_MODULE,
+       .mmap = uv_mmtimer_mmap,
+       .unlocked_ioctl = uv_mmtimer_ioctl,
+};
+
+/**
+ * uv_mmtimer_ioctl - ioctl interface for /dev/uv_mmtimer
+ * @file: file structure for the device
+ * @cmd: command to execute
+ * @arg: optional argument to command
+ *
+ * Executes the command specified by @cmd.  Returns 0 for success, < 0 for
+ * failure.
+ *
+ * Valid commands:
+ *
+ * %MMTIMER_GETOFFSET - Should return the offset (relative to the start
+ * of the page where the registers are mapped) for the counter in question.
+ *
+ * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15)
+ * seconds
+ *
+ * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address
+ * specified by @arg
+ *
+ * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter
+ *
+ * %MMTIMER_MMAPAVAIL - Returns 1 if registers can be mmap'd into userspace
+ *
+ * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it
+ * in the address specified by @arg.
+ */
+static long uv_mmtimer_ioctl(struct file *file, unsigned int cmd,
+                                               unsigned long arg)
+{
+       int ret = 0;
+
+       switch (cmd) {
+       case MMTIMER_GETOFFSET: /* offset of the counter */
+               /*
+                * UV RTC register is on its own page
+                */
+               if (PAGE_SIZE <= (1 << 16))
+                       ret = ((UV_LOCAL_MMR_BASE | UVH_RTC) & (PAGE_SIZE-1))
+                               / 8;
+               else
+                       ret = -ENOSYS;
+               break;
+
+       case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */
+               if (copy_to_user((unsigned long __user *)arg,
+                               &uv_mmtimer_femtoperiod, sizeof(unsigned long)))
+                       ret = -EFAULT;
+               break;
+
+       case MMTIMER_GETFREQ: /* frequency in Hz */
+               if (copy_to_user((unsigned long __user *)arg,
+                               &sn_rtc_cycles_per_second,
+                               sizeof(unsigned long)))
+                       ret = -EFAULT;
+               break;
+
+       case MMTIMER_GETBITS: /* number of bits in the clock */
+               ret = hweight64(UVH_RTC_REAL_TIME_CLOCK_MASK);
+               break;
+
+       case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */
+               ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0;
+               break;
+
+       case MMTIMER_GETCOUNTER:
+               if (copy_to_user((unsigned long __user *)arg,
+                               (unsigned long *)uv_local_mmr_address(UVH_RTC),
+                               sizeof(unsigned long)))
+                       ret = -EFAULT;
+               break;
+       default:
+               ret = -ENOTTY;
+               break;
+       }
+       return ret;
+}
+
+/**
+ * uv_mmtimer_mmap - maps the clock's registers into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ *
+ * Calls remap_pfn_range() to map the clock's registers into
+ * the calling process' address space.
+ */
+static int uv_mmtimer_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       unsigned long uv_mmtimer_addr;
+
+       if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+               return -EINVAL;
+
+       if (vma->vm_flags & VM_WRITE)
+               return -EPERM;
+
+       if (PAGE_SIZE > (1 << 16))
+               return -ENOSYS;
+
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+       uv_mmtimer_addr = UV_LOCAL_MMR_BASE | UVH_RTC;
+       uv_mmtimer_addr &= ~(PAGE_SIZE - 1);
+       uv_mmtimer_addr &= 0xfffffffffffffffUL;
+
+       if (remap_pfn_range(vma, vma->vm_start, uv_mmtimer_addr >> PAGE_SHIFT,
+                                       PAGE_SIZE, vma->vm_page_prot)) {
+               printk(KERN_ERR "remap_pfn_range failed in uv_mmtimer_mmap\n");
+               return -EAGAIN;
+       }
+
+       return 0;
+}
+
+static struct miscdevice uv_mmtimer_miscdev = {
+       MISC_DYNAMIC_MINOR,
+       UV_MMTIMER_NAME,
+       &uv_mmtimer_fops
+};
+
+
+/**
+ * uv_mmtimer_init - device initialization routine
+ *
+ * Does initial setup for the uv_mmtimer device.
+ */
+static int __init uv_mmtimer_init(void)
+{
+       if (!is_uv_system()) {
+               printk(KERN_ERR "%s: Hardware unsupported\n", UV_MMTIMER_NAME);
+               return -1;
+       }
+
+       /*
+        * Sanity check the cycles/sec variable
+        */
+       if (sn_rtc_cycles_per_second < 100000) {
+               printk(KERN_ERR "%s: unable to determine clock frequency\n",
+                      UV_MMTIMER_NAME);
+               return -1;
+       }
+
+       uv_mmtimer_femtoperiod = ((unsigned long)1E15 +
+                               sn_rtc_cycles_per_second / 2) /
+                               sn_rtc_cycles_per_second;
+
+       if (misc_register(&uv_mmtimer_miscdev)) {
+               printk(KERN_ERR "%s: failed to register device\n",
+                      UV_MMTIMER_NAME);
+               return -1;
+       }
+
+       printk(KERN_INFO "%s: v%s, %ld MHz\n", UV_MMTIMER_DESC,
+               UV_MMTIMER_VERSION,
+               sn_rtc_cycles_per_second/(unsigned long)1E6);
+
+       return 0;
+}
+
+module_init(uv_mmtimer_init);
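
The new driver exposes the same ioctl/mmap interface as the existing SGI mmtimer driver. A userspace sketch of reading the UV RTC through it; it assumes the misc device shows up as /dev/mmtimer and that <linux/mmtimer.h> provides the MMTIMER_* ioctl numbers:

	/* Userspace sketch only; device path and header are assumptions. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/mmtimer.h>

	int main(void)
	{
		unsigned long freq = 0;
		int fd = open("/dev/mmtimer", O_RDONLY);

		if (fd < 0 || ioctl(fd, MMTIMER_MMAPAVAIL, 0) != 1)
			return 1;

		/* Offset of the RTC within the mapped page, in 8-byte words. */
		int off = ioctl(fd, MMTIMER_GETOFFSET, 0);
		int bits = ioctl(fd, MMTIMER_GETBITS, 0);
		ioctl(fd, MMTIMER_GETFREQ, &freq);

		volatile unsigned long *page = mmap(NULL, sysconf(_SC_PAGESIZE),
						    PROT_READ, MAP_SHARED, fd, 0);
		if (page == MAP_FAILED)
			return 1;

		printf("%d-bit RTC, %lu Hz, value %lu\n", bits, freq, page[off]);
		return 0;
	}
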
index 25b743abfb59442b52ccec0057573716249bdbbf..52e6bb70a490d8bebf26745191bcf80ce5338ca2 100644 (file)
@@ -28,7 +28,7 @@
 #include <linux/device.h>
 #include <linux/dca.h>
 
-#define DCA_VERSION "1.8"
+#define DCA_VERSION "1.12.1"
 
 MODULE_VERSION(DCA_VERSION);
 MODULE_LICENSE("GPL");
@@ -36,20 +36,92 @@ MODULE_AUTHOR("Intel Corporation");
 
 static DEFINE_SPINLOCK(dca_lock);
 
-static LIST_HEAD(dca_providers);
+static LIST_HEAD(dca_domains);
 
-static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+static struct pci_bus *dca_pci_rc_from_dev(struct device *dev)
 {
-       struct dca_provider *dca, *ret = NULL;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct pci_bus *bus = pdev->bus;
 
-       list_for_each_entry(dca, &dca_providers, node) {
-               if ((!dev) || (dca->ops->dev_managed(dca, dev))) {
-                       ret = dca;
-                       break;
-               }
+       while (bus->parent)
+               bus = bus->parent;
+
+       return bus;
+}
+
+static struct dca_domain *dca_allocate_domain(struct pci_bus *rc)
+{
+       struct dca_domain *domain;
+
+       domain = kzalloc(sizeof(*domain), GFP_NOWAIT);
+       if (!domain)
+               return NULL;
+
+       INIT_LIST_HEAD(&domain->dca_providers);
+       domain->pci_rc = rc;
+
+       return domain;
+}
+
+static void dca_free_domain(struct dca_domain *domain)
+{
+       list_del(&domain->node);
+       kfree(domain);
+}
+
+static struct dca_domain *dca_find_domain(struct pci_bus *rc)
+{
+       struct dca_domain *domain;
+
+       list_for_each_entry(domain, &dca_domains, node)
+               if (domain->pci_rc == rc)
+                       return domain;
+
+       return NULL;
+}
+
+static struct dca_domain *dca_get_domain(struct device *dev)
+{
+       struct pci_bus *rc;
+       struct dca_domain *domain;
+
+       rc = dca_pci_rc_from_dev(dev);
+       domain = dca_find_domain(rc);
+
+       if (!domain) {
+               domain = dca_allocate_domain(rc);
+               if (domain)
+                       list_add(&domain->node, &dca_domains);
+       }
+
+       return domain;
+}
+
+static struct dca_provider *dca_find_provider_by_dev(struct device *dev)
+{
+       struct dca_provider *dca;
+       struct pci_bus *rc;
+       struct dca_domain *domain;
+
+       if (dev) {
+               rc = dca_pci_rc_from_dev(dev);
+               domain = dca_find_domain(rc);
+               if (!domain)
+                       return NULL;
+       } else {
+               if (!list_empty(&dca_domains))
+                       domain = list_first_entry(&dca_domains,
+                                                 struct dca_domain,
+                                                 node);
+               else
+                       return NULL;
        }
 
-       return ret;
+       list_for_each_entry(dca, &domain->dca_providers, node)
+               if ((!dev) || (dca->ops->dev_managed(dca, dev)))
+                       return dca;
+
+       return NULL;
 }
 
 /**
@@ -61,6 +133,8 @@ int dca_add_requester(struct device *dev)
        struct dca_provider *dca;
        int err, slot = -ENODEV;
        unsigned long flags;
+       struct pci_bus *pci_rc;
+       struct dca_domain *domain;
 
        if (!dev)
                return -EFAULT;
@@ -74,7 +148,14 @@ int dca_add_requester(struct device *dev)
                return -EEXIST;
        }
 
-       list_for_each_entry(dca, &dca_providers, node) {
+       pci_rc = dca_pci_rc_from_dev(dev);
+       domain = dca_find_domain(pci_rc);
+       if (!domain) {
+               spin_unlock_irqrestore(&dca_lock, flags);
+               return -ENODEV;
+       }
+
+       list_for_each_entry(dca, &domain->dca_providers, node) {
                slot = dca->ops->add_requester(dca, dev);
                if (slot >= 0)
                        break;
@@ -222,13 +303,19 @@ int register_dca_provider(struct dca_provider *dca, struct device *dev)
 {
        int err;
        unsigned long flags;
+       struct dca_domain *domain;
 
        err = dca_sysfs_add_provider(dca, dev);
        if (err)
                return err;
 
        spin_lock_irqsave(&dca_lock, flags);
-       list_add(&dca->node, &dca_providers);
+       domain = dca_get_domain(dev);
+       if (!domain) {
+               spin_unlock_irqrestore(&dca_lock, flags);
+               return -ENODEV;
+       }
+       list_add(&dca->node, &domain->dca_providers);
        spin_unlock_irqrestore(&dca_lock, flags);
 
        blocking_notifier_call_chain(&dca_provider_chain,
@@ -241,15 +328,24 @@ EXPORT_SYMBOL_GPL(register_dca_provider);
  * unregister_dca_provider - remove a dca provider
  * @dca - struct created by alloc_dca_provider()
  */
-void unregister_dca_provider(struct dca_provider *dca)
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev)
 {
        unsigned long flags;
+       struct pci_bus *pci_rc;
+       struct dca_domain *domain;
 
        blocking_notifier_call_chain(&dca_provider_chain,
                                     DCA_PROVIDER_REMOVE, NULL);
 
        spin_lock_irqsave(&dca_lock, flags);
+
        list_del(&dca->node);
+
+       pci_rc = dca_pci_rc_from_dev(dev);
+       domain = dca_find_domain(pci_rc);
+       if (list_empty(&domain->dca_providers))
+               dca_free_domain(domain);
+
        spin_unlock_irqrestore(&dca_lock, flags);
 
        dca_sysfs_remove_provider(dca);
@@ -276,7 +372,7 @@ EXPORT_SYMBOL_GPL(dca_unregister_notify);
 
 static int __init dca_init(void)
 {
-       printk(KERN_ERR "dca service started, version %s\n", DCA_VERSION);
+       pr_info("dca service started, version %s\n", DCA_VERSION);
        return dca_sysfs_init();
 }
 
index 81e1020fb5148a75677aec191e974c554031d03f..5903a88351bfdf5b844a45c616aa68419318a3fb 100644 (file)
@@ -17,11 +17,15 @@ if DMADEVICES
 
 comment "DMA Devices"
 
+config ASYNC_TX_DISABLE_CHANNEL_SWITCH
+       bool
+
 config INTEL_IOATDMA
        tristate "Intel I/OAT DMA support"
        depends on PCI && X86
        select DMA_ENGINE
        select DCA
+       select ASYNC_TX_DISABLE_CHANNEL_SWITCH
        help
          Enable support for the Intel(R) I/OAT DMA engine present
          in recent Intel Xeon chipsets.
@@ -97,6 +101,14 @@ config TXX9_DMAC
          Support the TXx9 SoC internal DMA controller.  This can be
          integrated in chips such as the Toshiba TX4927/38/39.
 
+config SH_DMAE
+       tristate "Renesas SuperH DMAC support"
+       depends on SUPERH && SH_DMA
+       depends on !SH_DMA_API
+       select DMA_ENGINE
+       help
+         Enable support for the Renesas SuperH DMA controllers.
+
 config DMA_ENGINE
        bool
 
@@ -116,7 +128,7 @@ config NET_DMA
 
 config ASYNC_TX_DMA
        bool "Async_tx: Offload support for the async_tx api"
-       depends on DMA_ENGINE && !HIGHMEM64G
+       depends on DMA_ENGINE
        help
          This allows the async_tx api to take advantage of offload engines for
          memcpy, memset, xor, and raid6 p+q operations.  If your platform has
index 40e1e008357192b1aced8b1abab5a86460c78352..eca71ba78ae9716234b46c30788f350ded01a692 100644 (file)
@@ -1,8 +1,7 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_DMATEST) += dmatest.o
-obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
-ioatdma-objs := ioat.o ioat_dma.o ioat_dca.o
+obj-$(CONFIG_INTEL_IOATDMA) += ioat/
 obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_MV_XOR) += mv_xor.o
@@ -10,3 +9,4 @@ obj-$(CONFIG_DW_DMAC) += dw_dmac.o
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_MX3_IPU) += ipu/
 obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o
+obj-$(CONFIG_SH_DMAE) += shdma.o
index c8522e6f1ad2d0aef5722e740b3bbce22ca3986b..7585c4164bd5f1d28d300e923969b797c5d9bd99 100644 (file)
@@ -87,6 +87,7 @@ static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan,
        desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys);
        if (desc) {
                memset(desc, 0, sizeof(struct at_desc));
+               INIT_LIST_HEAD(&desc->tx_list);
                dma_async_tx_descriptor_init(&desc->txd, chan);
                /* txd.flags will be overwritten in prep functions */
                desc->txd.flags = DMA_CTRL_ACK;
@@ -150,11 +151,11 @@ static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc)
                struct at_desc *child;
 
                spin_lock_bh(&atchan->lock);
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        dev_vdbg(chan2dev(&atchan->chan_common),
                                        "moving child desc %p to freelist\n",
                                        child);
-               list_splice_init(&desc->txd.tx_list, &atchan->free_list);
+               list_splice_init(&desc->tx_list, &atchan->free_list);
                dev_vdbg(chan2dev(&atchan->chan_common),
                         "moving desc %p to freelist\n", desc);
                list_add(&desc->desc_node, &atchan->free_list);
@@ -247,30 +248,33 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
        param = txd->callback_param;
 
        /* move children to free_list */
-       list_splice_init(&txd->tx_list, &atchan->free_list);
+       list_splice_init(&desc->tx_list, &atchan->free_list);
        /* move myself to free_list */
        list_move(&desc->desc_node, &atchan->free_list);
 
        /* unmap dma addresses */
-       if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                       dma_unmap_single(chan2parent(&atchan->chan_common),
-                                       desc->lli.daddr,
-                                       desc->len, DMA_FROM_DEVICE);
-               else
-                       dma_unmap_page(chan2parent(&atchan->chan_common),
-                                       desc->lli.daddr,
-                                       desc->len, DMA_FROM_DEVICE);
-       }
-       if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                       dma_unmap_single(chan2parent(&atchan->chan_common),
-                                       desc->lli.saddr,
-                                       desc->len, DMA_TO_DEVICE);
-               else
-                       dma_unmap_page(chan2parent(&atchan->chan_common),
-                                       desc->lli.saddr,
-                                       desc->len, DMA_TO_DEVICE);
+       if (!atchan->chan_common.private) {
+               struct device *parent = chan2parent(&atchan->chan_common);
+               if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+                       if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+                               dma_unmap_single(parent,
+                                               desc->lli.daddr,
+                                               desc->len, DMA_FROM_DEVICE);
+                       else
+                               dma_unmap_page(parent,
+                                               desc->lli.daddr,
+                                               desc->len, DMA_FROM_DEVICE);
+               }
+               if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+                       if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+                               dma_unmap_single(parent,
+                                               desc->lli.saddr,
+                                               desc->len, DMA_TO_DEVICE);
+                       else
+                               dma_unmap_page(parent,
+                                               desc->lli.saddr,
+                                               desc->len, DMA_TO_DEVICE);
+               }
        }
 
        /*
@@ -334,7 +338,7 @@ static void atc_cleanup_descriptors(struct at_dma_chan *atchan)
                        /* This one is currently in progress */
                        return;
 
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        if (!(child->lli.ctrla & ATC_DONE))
                                /* Currently in progress */
                                return;
@@ -407,7 +411,7 @@ static void atc_handle_error(struct at_dma_chan *atchan)
        dev_crit(chan2dev(&atchan->chan_common),
                        "  cookie: %d\n", bad_desc->txd.cookie);
        atc_dump_lli(atchan, &bad_desc->lli);
-       list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+       list_for_each_entry(child, &bad_desc->tx_list, desc_node)
                atc_dump_lli(atchan, &child->lli);
 
        /* Pretend the descriptor completed successfully */
@@ -587,7 +591,7 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                        prev->lli.dscr = desc->txd.phys;
                        /* insert the link descriptor to the LD ring */
                        list_add_tail(&desc->desc_node,
-                                       &first->txd.tx_list);
+                                       &first->tx_list);
                }
                prev = desc;
        }
@@ -646,8 +650,6 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
        reg_width = atslave->reg_width;
 
-       sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
-
        ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla;
        ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN;
 
@@ -687,7 +689,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                                prev->lli.dscr = desc->txd.phys;
                                /* insert the link descriptor to the LD ring */
                                list_add_tail(&desc->desc_node,
-                                               &first->txd.tx_list);
+                                               &first->tx_list);
                        }
                        prev = desc;
                        total_len += len;
@@ -729,7 +731,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                                prev->lli.dscr = desc->txd.phys;
                                /* insert the link descriptor to the LD ring */
                                list_add_tail(&desc->desc_node,
-                                               &first->txd.tx_list);
+                                               &first->tx_list);
                        }
                        prev = desc;
                        total_len += len;
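
Two behavioural points fall out of this hunk (dw_dmac gets the same treatment further down): descriptors now track their children on a driver-private tx_list instead of txd.tx_list, and prep_slave_sg() no longer calls dma_map_sg() itself, so slave clients are expected to map the scatterlist before asking for a descriptor and to unmap it themselves, since automatic unmapping is skipped whenever chan->private is set. A rough client-side sketch under those assumptions; the my_* names are placeholders:

	#include <linux/dmaengine.h>
	#include <linux/dma-mapping.h>
	#include <linux/scatterlist.h>

	static int my_submit(struct device *my_dev, struct dma_chan *my_chan,
			     struct scatterlist *sgl, unsigned int sg_len)
	{
		struct dma_async_tx_descriptor *tx;
		int mapped;

		/* the client owns the mapping now */
		mapped = dma_map_sg(my_dev, sgl, sg_len, DMA_TO_DEVICE);
		if (!mapped)
			return -ENOMEM;

		tx = my_chan->device->device_prep_slave_sg(my_chan, sgl, mapped,
							   DMA_TO_DEVICE,
							   DMA_PREP_INTERRUPT);
		if (!tx) {
			dma_unmap_sg(my_dev, sgl, sg_len, DMA_TO_DEVICE);
			return -ENOMEM;
		}

		tx->tx_submit(tx);
		dma_async_issue_pending(my_chan);
		return 0;
	}
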
index 4c972afc49ec32db97ddd52059c9e27961f20f98..495457e3dc4b6133dc6c846392961db3a461aaf9 100644 (file)
@@ -165,6 +165,7 @@ struct at_desc {
        struct at_lli                   lli;
 
        /* THEN values for driver housekeeping */
+       struct list_head                tx_list;
        struct dma_async_tx_descriptor  txd;
        struct list_head                desc_node;
        size_t                          len;
index 5a87384ea4ff5cf58c77b64200a621b8fe616b76..bd0b248de2cfabc28f1fecd63e5d64ed61236844 100644 (file)
@@ -608,6 +608,40 @@ void dmaengine_put(void)
 }
 EXPORT_SYMBOL(dmaengine_put);
 
+static bool device_has_all_tx_types(struct dma_device *device)
+{
+       /* A device that satisfies this test has channels that will never cause
+        * an async_tx channel switch event as all possible operation types can
+        * be handled.
+        */
+       #ifdef CONFIG_ASYNC_TX_DMA
+       if (!dma_has_cap(DMA_INTERRUPT, device->cap_mask))
+               return false;
+       #endif
+
+       #if defined(CONFIG_ASYNC_MEMCPY) || defined(CONFIG_ASYNC_MEMCPY_MODULE)
+       if (!dma_has_cap(DMA_MEMCPY, device->cap_mask))
+               return false;
+       #endif
+
+       #if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE)
+       if (!dma_has_cap(DMA_MEMSET, device->cap_mask))
+               return false;
+       #endif
+
+       #if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE)
+       if (!dma_has_cap(DMA_XOR, device->cap_mask))
+               return false;
+       #endif
+
+       #if defined(CONFIG_ASYNC_PQ) || defined(CONFIG_ASYNC_PQ_MODULE)
+       if (!dma_has_cap(DMA_PQ, device->cap_mask))
+               return false;
+       #endif
+
+       return true;
+}
+
 static int get_dma_id(struct dma_device *device)
 {
        int rc;
@@ -644,8 +678,12 @@ int dma_async_device_register(struct dma_device *device)
                !device->device_prep_dma_memcpy);
        BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) &&
                !device->device_prep_dma_xor);
-       BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
-               !device->device_prep_dma_zero_sum);
+       BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) &&
+               !device->device_prep_dma_xor_val);
+       BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) &&
+               !device->device_prep_dma_pq);
+       BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
+               !device->device_prep_dma_pq_val);
        BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
                !device->device_prep_dma_memset);
        BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
@@ -661,6 +699,12 @@ int dma_async_device_register(struct dma_device *device)
        BUG_ON(!device->device_issue_pending);
        BUG_ON(!device->dev);
 
+       /* note: this only matters in the
+        * CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH=y case
+        */
+       if (device_has_all_tx_types(device))
+               dma_cap_set(DMA_ASYNC_TX, device->cap_mask);
+
        idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
        if (!idr_ref)
                return -ENOMEM;
@@ -933,55 +977,29 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 {
        tx->chan = chan;
        spin_lock_init(&tx->lock);
-       INIT_LIST_HEAD(&tx->tx_list);
 }
 EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
 /* dma_wait_for_async_tx - spin wait for a transaction to complete
  * @tx: in-flight transaction to wait on
- *
- * This routine assumes that tx was obtained from a call to async_memcpy,
- * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped
- * and submitted).  Walking the parent chain is only meant to cover for DMA
- * drivers that do not implement the DMA_INTERRUPT capability and may race with
- * the driver's descriptor cleanup routine.
  */
 enum dma_status
 dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
 {
-       enum dma_status status;
-       struct dma_async_tx_descriptor *iter;
-       struct dma_async_tx_descriptor *parent;
+       unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
 
        if (!tx)
                return DMA_SUCCESS;
 
-       WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for"
-                 " %s\n", __func__, dma_chan_name(tx->chan));
-
-       /* poll through the dependency chain, return when tx is complete */
-       do {
-               iter = tx;
-
-               /* find the root of the unsubmitted dependency chain */
-               do {
-                       parent = iter->parent;
-                       if (!parent)
-                               break;
-                       else
-                               iter = parent;
-               } while (parent);
-
-               /* there is a small window for ->parent == NULL and
-                * ->cookie == -EBUSY
-                */
-               while (iter->cookie == -EBUSY)
-                       cpu_relax();
-
-               status = dma_sync_wait(iter->chan, iter->cookie);
-       } while (status == DMA_IN_PROGRESS || (iter != tx));
-
-       return status;
+       while (tx->cookie == -EBUSY) {
+               if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
+                       pr_err("%s timeout waiting for descriptor submission\n",
+                               __func__);
+                       return DMA_ERROR;
+               }
+               cpu_relax();
+       }
+       return dma_sync_wait(tx->chan, tx->cookie);
 }
 EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
 
index d93017fc7872d1f0fe3904ff85870ff8aee1156a..a32a4cf7b1e049ab537db5c065f4f6a474dca2d2 100644 (file)
@@ -48,6 +48,11 @@ module_param(xor_sources, uint, S_IRUGO);
 MODULE_PARM_DESC(xor_sources,
                "Number of xor source buffers (default: 3)");
 
+static unsigned int pq_sources = 3;
+module_param(pq_sources, uint, S_IRUGO);
+MODULE_PARM_DESC(pq_sources,
+               "Number of p+q source buffers (default: 3)");
+
 /*
  * Initialization patterns. All bytes in the source buffer have bit 7
  * set, all bytes in the destination buffer have bit 7 cleared.
@@ -232,6 +237,7 @@ static int dmatest_func(void *data)
        dma_cookie_t            cookie;
        enum dma_status         status;
        enum dma_ctrl_flags     flags;
+       u8                      pq_coefs[pq_sources];
        int                     ret;
        int                     src_cnt;
        int                     dst_cnt;
@@ -248,6 +254,11 @@ static int dmatest_func(void *data)
        else if (thread->type == DMA_XOR) {
                src_cnt = xor_sources | 1; /* force odd to ensure dst = src */
                dst_cnt = 1;
+       } else if (thread->type == DMA_PQ) {
+               src_cnt = pq_sources | 1; /* force odd to ensure dst = src */
+               dst_cnt = 2;
+               for (i = 0; i < pq_sources; i++)
+                       pq_coefs[i] = 1;
        } else
                goto err_srcs;
 
@@ -283,6 +294,7 @@ static int dmatest_func(void *data)
                dma_addr_t dma_dsts[dst_cnt];
                struct completion cmp;
                unsigned long tmo = msecs_to_jiffies(3000);
+               u8 align = 0;
 
                total_tests++;
 
@@ -290,6 +302,18 @@ static int dmatest_func(void *data)
                src_off = dmatest_random() % (test_buf_size - len + 1);
                dst_off = dmatest_random() % (test_buf_size - len + 1);
 
+               /* honor alignment restrictions */
+               if (thread->type == DMA_MEMCPY)
+                       align = dev->copy_align;
+               else if (thread->type == DMA_XOR)
+                       align = dev->xor_align;
+               else if (thread->type == DMA_PQ)
+                       align = dev->pq_align;
+
+               len = (len >> align) << align;
+               src_off = (src_off >> align) << align;
+               dst_off = (dst_off >> align) << align;
+
                dmatest_init_srcs(thread->srcs, src_off, len);
                dmatest_init_dsts(thread->dsts, dst_off, len);
 
@@ -306,6 +330,7 @@ static int dmatest_func(void *data)
                                                     DMA_BIDIRECTIONAL);
                }
 
+
                if (thread->type == DMA_MEMCPY)
                        tx = dev->device_prep_dma_memcpy(chan,
                                                         dma_dsts[0] + dst_off,
@@ -316,6 +341,15 @@ static int dmatest_func(void *data)
                                                      dma_dsts[0] + dst_off,
                                                      dma_srcs, xor_sources,
                                                      len, flags);
+               else if (thread->type == DMA_PQ) {
+                       dma_addr_t dma_pq[dst_cnt];
+
+                       for (i = 0; i < dst_cnt; i++)
+                               dma_pq[i] = dma_dsts[i] + dst_off;
+                       tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs,
+                                                    pq_sources, pq_coefs,
+                                                    len, flags);
+               }
 
                if (!tx) {
                        for (i = 0; i < src_cnt; i++)
@@ -459,6 +493,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
                op = "copy";
        else if (type == DMA_XOR)
                op = "xor";
+       else if (type == DMA_PQ)
+               op = "pq";
        else
                return -EINVAL;
 
@@ -514,6 +550,10 @@ static int dmatest_add_channel(struct dma_chan *chan)
                cnt = dmatest_add_threads(dtc, DMA_XOR);
                thread_count += cnt > 0 ? cnt : 0;
        }
+       if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+               cnt = dmatest_add_threads(dtc, DMA_PQ);
+               thread_count += cnt > 0 ? cnt : 0;
+       }
 
        pr_info("dmatest: Started %u threads using %s\n",
                thread_count, dma_chan_name(chan));
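
The alignment handling rounds the random length and offsets down to the device's advertised granularity; copy_align/xor_align/pq_align hold a log2 byte count, as the shift-down/shift-up in the test shows. With illustrative values, a pq_align of 3 means an 8-byte granularity:

    len     = 1021  ->  (1021 >> 3) << 3 = 1016
    src_off =  517  ->  ( 517 >> 3) << 3 =  512
    dst_off =  130  ->  ( 130 >> 3) << 3 =  128
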
index 933c143b6a740d8030d0d5318de4232e3d71d1ee..2eea823516a7aa9797f7908074c4ec68bd564b81 100644 (file)
@@ -116,7 +116,7 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc)
 {
        struct dw_desc  *child;
 
-       list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+       list_for_each_entry(child, &desc->tx_list, desc_node)
                dma_sync_single_for_cpu(chan2parent(&dwc->chan),
                                child->txd.phys, sizeof(child->lli),
                                DMA_TO_DEVICE);
@@ -137,11 +137,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc)
                dwc_sync_desc_for_cpu(dwc, desc);
 
                spin_lock_bh(&dwc->lock);
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        dev_vdbg(chan2dev(&dwc->chan),
                                        "moving child desc %p to freelist\n",
                                        child);
-               list_splice_init(&desc->txd.tx_list, &dwc->free_list);
+               list_splice_init(&desc->tx_list, &dwc->free_list);
                dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc);
                list_add(&desc->desc_node, &dwc->free_list);
                spin_unlock_bh(&dwc->lock);
@@ -209,19 +209,28 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc)
        param = txd->callback_param;
 
        dwc_sync_desc_for_cpu(dwc, desc);
-       list_splice_init(&txd->tx_list, &dwc->free_list);
+       list_splice_init(&desc->tx_list, &dwc->free_list);
        list_move(&desc->desc_node, &dwc->free_list);
 
-       /*
-        * We use dma_unmap_page() regardless of how the buffers were
-        * mapped before they were submitted...
-        */
-       if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP))
-               dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar,
-                              desc->len, DMA_FROM_DEVICE);
-       if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP))
-               dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar,
-                              desc->len, DMA_TO_DEVICE);
+       if (!dwc->chan.private) {
+               struct device *parent = chan2parent(&dwc->chan);
+               if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+                       if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+                               dma_unmap_single(parent, desc->lli.dar,
+                                               desc->len, DMA_FROM_DEVICE);
+                       else
+                               dma_unmap_page(parent, desc->lli.dar,
+                                               desc->len, DMA_FROM_DEVICE);
+               }
+               if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+                       if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+                               dma_unmap_single(parent, desc->lli.sar,
+                                               desc->len, DMA_TO_DEVICE);
+                       else
+                               dma_unmap_page(parent, desc->lli.sar,
+                                               desc->len, DMA_TO_DEVICE);
+               }
+       }
 
        /*
         * The API requires that no submissions are done from a
@@ -289,7 +298,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
                        /* This one is currently in progress */
                        return;
 
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        if (child->lli.llp == llp)
                                /* Currently in progress */
                                return;
@@ -356,7 +365,7 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
        dev_printk(KERN_CRIT, chan2dev(&dwc->chan),
                        "  cookie: %d\n", bad_desc->txd.cookie);
        dwc_dump_lli(dwc, &bad_desc->lli);
-       list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+       list_for_each_entry(child, &bad_desc->tx_list, desc_node)
                dwc_dump_lli(dwc, &child->lli);
 
        /* Pretend the descriptor completed successfully */
@@ -608,7 +617,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                                        prev->txd.phys, sizeof(prev->lli),
                                        DMA_TO_DEVICE);
                        list_add_tail(&desc->desc_node,
-                                       &first->txd.tx_list);
+                                       &first->tx_list);
                }
                prev = desc;
        }
@@ -658,8 +667,6 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
        reg_width = dws->reg_width;
        prev = first = NULL;
 
-       sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
-
        switch (direction) {
        case DMA_TO_DEVICE:
                ctllo = (DWC_DEFAULT_CTLLO
@@ -700,7 +707,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                                                sizeof(prev->lli),
                                                DMA_TO_DEVICE);
                                list_add_tail(&desc->desc_node,
-                                               &first->txd.tx_list);
+                                               &first->tx_list);
                        }
                        prev = desc;
                        total_len += len;
@@ -746,7 +753,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                                                sizeof(prev->lli),
                                                DMA_TO_DEVICE);
                                list_add_tail(&desc->desc_node,
-                                               &first->txd.tx_list);
+                                               &first->tx_list);
                        }
                        prev = desc;
                        total_len += len;
@@ -902,6 +909,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
                        break;
                }
 
+               INIT_LIST_HEAD(&desc->tx_list);
                dma_async_tx_descriptor_init(&desc->txd, chan);
                desc->txd.tx_submit = dwc_tx_submit;
                desc->txd.flags = DMA_CTRL_ACK;
index 13a580767031a3aac04046ffd563ef1d83f053b0..d9a939f67f461ffa7b5deffd0f21d35bd7678864 100644 (file)
@@ -217,6 +217,7 @@ struct dw_desc {
 
        /* THEN values for driver housekeeping */
        struct list_head                desc_node;
+       struct list_head                tx_list;
        struct dma_async_tx_descriptor  txd;
        size_t                          len;
 };
index ef87a89841450e0f35f1a35f4753b0207abc0300..296f9e747fac3b920cc344e6862f6d20d7c4b6b8 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/dmapool.h>
 #include <linux/of_platform.h>
 
+#include <asm/fsldma.h>
 #include "fsldma.h"
 
 static void dma_init(struct fsl_dma_chan *fsl_chan)
@@ -280,28 +281,40 @@ static void fsl_chan_set_dest_loop_size(struct fsl_dma_chan *fsl_chan, int size)
 }
 
 /**
- * fsl_chan_toggle_ext_pause - Toggle channel external pause status
+ * fsl_chan_set_request_count - Set DMA Request Count for external control
  * @fsl_chan : Freescale DMA channel
- * @size     : Pause control size, 0 for disable external pause control.
- *             The maximum is 1024.
+ * @size     : Number of bytes to transfer in a single request
+ *
+ * The Freescale DMA channel can be controlled by the external signal DREQ#.
+ * The DMA request count is how many bytes are allowed to transfer before
+ * pausing the channel, after which a new assertion of DREQ# resumes channel
+ * operation.
  *
- * The Freescale DMA channel can be controlled by the external
- * signal DREQ#. The pause control size is how many bytes are allowed
- * to transfer before pausing the channel, after which a new assertion
- * of DREQ# resumes channel operation.
+ * A size of 0 disables external pause control. The maximum size is 1024.
  */
-static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int size)
+static void fsl_chan_set_request_count(struct fsl_dma_chan *fsl_chan, int size)
 {
-       if (size > 1024)
-               return;
+       BUG_ON(size > 1024);
+       DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
+               DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
+                       | ((__ilog2(size) << 24) & 0x0f000000),
+               32);
+}
 
-       if (size) {
-               DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
-                       DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
-                               | ((__ilog2(size) << 24) & 0x0f000000),
-                       32);
+/**
+ * fsl_chan_toggle_ext_pause - Toggle channel external pause status
+ * @fsl_chan : Freescale DMA channel
+ * @enable   : 0 is disabled, 1 is enabled.
+ *
+ * The Freescale DMA channel can be controlled by the external signal DREQ#.
+ * The DMA Request Count feature should be used in addition to this feature
+ * to set the number of bytes to transfer before pausing the channel.
+ */
+static void fsl_chan_toggle_ext_pause(struct fsl_dma_chan *fsl_chan, int enable)
+{
+       if (enable)
                fsl_chan->feature |= FSL_DMA_CHAN_PAUSE_EXT;
-       } else
+       else
                fsl_chan->feature &= ~FSL_DMA_CHAN_PAUSE_EXT;
 }
 
@@ -326,7 +339,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable)
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
        struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan);
-       struct fsl_desc_sw *desc;
+       struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
+       struct fsl_desc_sw *child;
        unsigned long flags;
        dma_cookie_t cookie;
 
@@ -334,7 +348,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
        spin_lock_irqsave(&fsl_chan->desc_lock, flags);
 
        cookie = fsl_chan->common.cookie;
-       list_for_each_entry(desc, &tx->tx_list, node) {
+       list_for_each_entry(child, &desc->tx_list, node) {
                cookie++;
                if (cookie < 0)
                        cookie = 1;
@@ -343,8 +357,8 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
        }
 
        fsl_chan->common.cookie = cookie;
-       append_ld_queue(fsl_chan, tx_to_fsl_desc(tx));
-       list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev);
+       append_ld_queue(fsl_chan, desc);
+       list_splice_init(&desc->tx_list, fsl_chan->ld_queue.prev);
 
        spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
 
@@ -366,6 +380,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
        desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc);
        if (desc_sw) {
                memset(desc_sw, 0, sizeof(struct fsl_desc_sw));
+               INIT_LIST_HEAD(&desc_sw->tx_list);
                dma_async_tx_descriptor_init(&desc_sw->async_tx,
                                                &fsl_chan->common);
                desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
@@ -455,7 +470,7 @@ fsl_dma_prep_interrupt(struct dma_chan *chan, unsigned long flags)
        new->async_tx.flags = flags;
 
        /* Insert the link descriptor to the LD ring */
-       list_add_tail(&new->node, &new->async_tx.tx_list);
+       list_add_tail(&new->node, &new->tx_list);
 
        /* Set End-of-link to the last link descriptor of new list*/
        set_ld_eol(fsl_chan, new);
@@ -513,7 +528,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
                dma_dest += copy;
 
                /* Insert the link descriptor to the LD ring */
-               list_add_tail(&new->node, &first->async_tx.tx_list);
+               list_add_tail(&new->node, &first->tx_list);
        } while (len);
 
        new->async_tx.flags = flags; /* client is in control of this ack */
@@ -528,7 +543,7 @@ fail:
        if (!first)
                return NULL;
 
-       list = &first->async_tx.tx_list;
+       list = &first->tx_list;
        list_for_each_entry_safe_reverse(new, prev, list, node) {
                list_del(&new->node);
                dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
@@ -537,6 +552,229 @@ fail:
        return NULL;
 }
 
+/**
+ * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
+ * @chan: DMA channel
+ * @sgl: scatterlist to transfer to/from
+ * @sg_len: number of entries in @sgl
+ * @direction: DMA direction
+ * @flags: DMAEngine flags
+ *
+ * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the
+ * DMA_SLAVE API, this gets the device-specific information from the
+ * chan->private variable.
+ */
+static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
+       struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len,
+       enum dma_data_direction direction, unsigned long flags)
+{
+       struct fsl_dma_chan *fsl_chan;
+       struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL;
+       struct fsl_dma_slave *slave;
+       struct list_head *tx_list;
+       size_t copy;
+
+       int i;
+       struct scatterlist *sg;
+       size_t sg_used;
+       size_t hw_used;
+       struct fsl_dma_hw_addr *hw;
+       dma_addr_t dma_dst, dma_src;
+
+       if (!chan)
+               return NULL;
+
+       if (!chan->private)
+               return NULL;
+
+       fsl_chan = to_fsl_chan(chan);
+       slave = chan->private;
+
+       if (list_empty(&slave->addresses))
+               return NULL;
+
+       hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry);
+       hw_used = 0;
+
+       /*
+        * Build the hardware transaction to copy from the scatterlist to
+        * the hardware, or from the hardware to the scatterlist
+        *
+        * If you are copying from the hardware to the scatterlist and it
+        * takes two hardware entries to fill an entire page, then both
+        * hardware entries will be coalesced into the same page
+        *
+        * If you are copying from the scatterlist to the hardware and a
+        * single page can fill two hardware entries, then the data will
+        * be read out of the page into the first hardware entry, and so on
+        */
+       for_each_sg(sgl, sg, sg_len, i) {
+               sg_used = 0;
+
+               /* Loop until the entire scatterlist entry is used */
+               while (sg_used < sg_dma_len(sg)) {
+
+                       /*
+                        * If we've used up the current hardware address/length
+                        * pair, we need to load a new one
+                        *
+                        * This is done in a while loop so that descriptors with
+                        * length == 0 will be skipped
+                        */
+                       while (hw_used >= hw->length) {
+
+                               /*
+                                * If the current hardware entry is the last
+                                * entry in the list, we're finished
+                                */
+                               if (list_is_last(&hw->entry, &slave->addresses))
+                                       goto finished;
+
+                               /* Get the next hardware address/length pair */
+                               hw = list_entry(hw->entry.next,
+                                               struct fsl_dma_hw_addr, entry);
+                               hw_used = 0;
+                       }
+
+                       /* Allocate the link descriptor from DMA pool */
+                       new = fsl_dma_alloc_descriptor(fsl_chan);
+                       if (!new) {
+                               dev_err(fsl_chan->dev, "No free memory for "
+                                                      "link descriptor\n");
+                               goto fail;
+                       }
+#ifdef FSL_DMA_LD_DEBUG
+                       dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new);
+#endif
+
+                       /*
+                        * Calculate the maximum number of bytes to transfer,
+                        * making sure it is less than the DMA controller limit
+                        */
+                       copy = min_t(size_t, sg_dma_len(sg) - sg_used,
+                                            hw->length - hw_used);
+                       copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT);
+
+                       /*
+                        * DMA_FROM_DEVICE
+                        * from the hardware to the scatterlist
+                        *
+                        * DMA_TO_DEVICE
+                        * from the scatterlist to the hardware
+                        */
+                       if (direction == DMA_FROM_DEVICE) {
+                               dma_src = hw->address + hw_used;
+                               dma_dst = sg_dma_address(sg) + sg_used;
+                       } else {
+                               dma_src = sg_dma_address(sg) + sg_used;
+                               dma_dst = hw->address + hw_used;
+                       }
+
+                       /* Fill in the descriptor */
+                       set_desc_cnt(fsl_chan, &new->hw, copy);
+                       set_desc_src(fsl_chan, &new->hw, dma_src);
+                       set_desc_dest(fsl_chan, &new->hw, dma_dst);
+
+                       /*
+                        * If this is not the first descriptor, chain the
+                        * current descriptor after the previous descriptor
+                        */
+                       if (!first) {
+                               first = new;
+                       } else {
+                               set_desc_next(fsl_chan, &prev->hw,
+                                             new->async_tx.phys);
+                       }
+
+                       new->async_tx.cookie = 0;
+                       async_tx_ack(&new->async_tx);
+
+                       prev = new;
+                       sg_used += copy;
+                       hw_used += copy;
+
+                       /* Insert the link descriptor into the LD ring */
+                       list_add_tail(&new->node, &first->tx_list);
+               }
+       }
+
+finished:
+
+       /* All of the hardware address/length pairs had length == 0 */
+       if (!first || !new)
+               return NULL;
+
+       new->async_tx.flags = flags;
+       new->async_tx.cookie = -EBUSY;
+
+       /* Set End-of-link to the last link descriptor of new list */
+       set_ld_eol(fsl_chan, new);
+
+       /* Enable extra controller features */
+       if (fsl_chan->set_src_loop_size)
+               fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size);
+
+       if (fsl_chan->set_dest_loop_size)
+               fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size);
+
+       if (fsl_chan->toggle_ext_start)
+               fsl_chan->toggle_ext_start(fsl_chan, slave->external_start);
+
+       if (fsl_chan->toggle_ext_pause)
+               fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause);
+
+       if (fsl_chan->set_request_count)
+               fsl_chan->set_request_count(fsl_chan, slave->request_count);
+
+       return &first->async_tx;
+
+fail:
+       /* If first was not set, then we failed to allocate the very first
+        * descriptor, and we're done */
+       if (!first)
+               return NULL;
+
+       /*
+        * First is set, so every descriptor we allocated, including "first"
+        * itself, has been added to first->tx_list.  Walk the list in
+        * reverse, freeing each descriptor in turn.
+        *
+        * The "new" and "prev" variables are reused as loop cursors here.
+        */
+       tx_list = &first->tx_list;
+       list_for_each_entry_safe_reverse(new, prev, tx_list, node) {
+               list_del_init(&new->node);
+               dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
+       }
+
+       return NULL;
+}
+
+static void fsl_dma_device_terminate_all(struct dma_chan *chan)
+{
+       struct fsl_dma_chan *fsl_chan;
+       struct fsl_desc_sw *desc, *tmp;
+       unsigned long flags;
+
+       if (!chan)
+               return;
+
+       fsl_chan = to_fsl_chan(chan);
+
+       /* Halt the DMA engine */
+       dma_halt(fsl_chan);
+
+       spin_lock_irqsave(&fsl_chan->desc_lock, flags);
+
+       /* Remove and free all of the descriptors in the LD queue */
+       list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) {
+               list_del(&desc->node);
+               dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys);
+       }
+
+       spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
+}
+
 /**
  * fsl_dma_update_completed_cookie - Update the completed cookie.
  * @fsl_chan : Freescale DMA channel
@@ -883,6 +1121,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
                new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start;
                new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size;
                new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size;
+               new_fsl_chan->set_request_count = fsl_chan_set_request_count;
        }
 
        spin_lock_init(&new_fsl_chan->desc_lock);
@@ -962,12 +1201,15 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev,
 
        dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
        dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
+       dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
        fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
        fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
        fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
        fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
        fdev->common.device_is_tx_complete = fsl_dma_is_complete;
        fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending;
+       fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg;
+       fdev->common.device_terminate_all = fsl_dma_device_terminate_all;
        fdev->common.dev = &dev->dev;
 
        fdev->irq = irq_of_parse_and_map(dev->node, 0);
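The fsldma additions above expose the channel through the generic dmaengine slave API: DMA_SLAVE is advertised, device_prep_slave_sg() builds a link-descriptor chain from a scatterlist plus the slave's hardware address/length pairs, and device_terminate_all() halts the channel and frees whatever is still queued. The sketch below is a minimal, illustrative client of that path; it assumes a populated struct fsl_dma_slave from the new arch/powerpc/include/asm/fsldma.h (its address/length list setup is not shown) and uses a pass-through filter function, both of which are placeholders rather than part of this patch.

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <asm/fsldma.h>

/* Illustrative filter: accept the first DMA_SLAVE-capable channel offered. */
static bool fsl_slave_filter(struct dma_chan *chan, void *param)
{
	return true;
}

static int fsl_slave_xfer(struct device *dev, struct fsl_dma_slave *slave,
			  struct scatterlist *sgl, unsigned int nents)
{
	struct dma_async_tx_descriptor *tx;
	struct dma_chan *chan;
	dma_cap_mask_t mask;
	int mapped, ret = 0;

	dma_cap_zero(mask);
	dma_cap_set(DMA_SLAVE, mask);
	chan = dma_request_channel(mask, fsl_slave_filter, NULL);
	if (!chan)
		return -ENODEV;

	/* hand the per-transfer configuration to the driver (see note below) */
	chan->private = slave;

	mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
	if (!mapped) {
		ret = -ENOMEM;
		goto release;
	}

	tx = chan->device->device_prep_slave_sg(chan, sgl, mapped,
						DMA_TO_DEVICE,
						DMA_PREP_INTERRUPT);
	if (!tx) {
		ret = -EIO;
		goto unmap;
	}

	tx->tx_submit(tx);
	dma_async_issue_pending(chan);

	/* ... wait for the transfer to complete, then tear everything down ... */
	chan->device->device_terminate_all(chan);
unmap:
	dma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
release:
	dma_release_channel(chan);
	return ret;
}

Handing the configuration over through chan->private follows the DMA_SLAVE convention of this dmaengine generation; it presumably reaches fsl_dma_prep_slave_sg() that way (the hand-off happens earlier in the hunk than is shown here), after which the prep routine applies the loop sizes, external start/pause and request count to the channel before returning the first descriptor.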
index dc7f26865797cc12ef6610296a83919107c5ad84..0df14cbb8ca335d4ab5b2e444e78a3f42c22709b 100644 (file)
@@ -90,6 +90,7 @@ struct fsl_dma_ld_hw {
 struct fsl_desc_sw {
        struct fsl_dma_ld_hw hw;
        struct list_head node;
+       struct list_head tx_list;
        struct dma_async_tx_descriptor async_tx;
        struct list_head *ld;
        void *priv;
@@ -143,10 +144,11 @@ struct fsl_dma_chan {
        struct tasklet_struct tasklet;
        u32 feature;
 
-       void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int size);
+       void (*toggle_ext_pause)(struct fsl_dma_chan *fsl_chan, int enable);
        void (*toggle_ext_start)(struct fsl_dma_chan *fsl_chan, int enable);
        void (*set_src_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
        void (*set_dest_loop_size)(struct fsl_dma_chan *fsl_chan, int size);
+       void (*set_request_count)(struct fsl_dma_chan *fsl_chan, int size);
 };
 
 #define to_fsl_chan(chan) container_of(chan, struct fsl_dma_chan, common)
diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c
deleted file mode 100644 (file)
index 2225bb6..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Intel I/OAT DMA Linux driver
- * Copyright(c) 2007 - 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- */
-
-/*
- * This driver supports an Intel I/OAT DMA engine, which does asynchronous
- * copy operations.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/dca.h>
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
-#include "ioatdma_hw.h"
-
-MODULE_VERSION(IOAT_DMA_VERSION);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Intel Corporation");
-
-static struct pci_device_id ioat_pci_tbl[] = {
-       /* I/OAT v1 platforms */
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB)  },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
-       { PCI_DEVICE(PCI_VENDOR_ID_UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
-
-       /* I/OAT v2 platforms */
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
-
-       /* I/OAT v3 platforms */
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
-       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
-       { 0, }
-};
-
-struct ioat_device {
-       struct pci_dev          *pdev;
-       void __iomem            *iobase;
-       struct ioatdma_device   *dma;
-       struct dca_provider     *dca;
-};
-
-static int __devinit ioat_probe(struct pci_dev *pdev,
-                               const struct pci_device_id *id);
-static void __devexit ioat_remove(struct pci_dev *pdev);
-
-static int ioat_dca_enabled = 1;
-module_param(ioat_dca_enabled, int, 0644);
-MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
-
-static struct pci_driver ioat_pci_driver = {
-       .name           = "ioatdma",
-       .id_table       = ioat_pci_tbl,
-       .probe          = ioat_probe,
-       .remove         = __devexit_p(ioat_remove),
-};
-
-static int __devinit ioat_probe(struct pci_dev *pdev,
-                               const struct pci_device_id *id)
-{
-       void __iomem *iobase;
-       struct ioat_device *device;
-       unsigned long mmio_start, mmio_len;
-       int err;
-
-       err = pci_enable_device(pdev);
-       if (err)
-               goto err_enable_device;
-
-       err = pci_request_regions(pdev, ioat_pci_driver.name);
-       if (err)
-               goto err_request_regions;
-
-       err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (err)
-               err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-       if (err)
-               goto err_set_dma_mask;
-
-       err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (err)
-               err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-       if (err)
-               goto err_set_dma_mask;
-
-       mmio_start = pci_resource_start(pdev, 0);
-       mmio_len = pci_resource_len(pdev, 0);
-       iobase = ioremap(mmio_start, mmio_len);
-       if (!iobase) {
-               err = -ENOMEM;
-               goto err_ioremap;
-       }
-
-       device = kzalloc(sizeof(*device), GFP_KERNEL);
-       if (!device) {
-               err = -ENOMEM;
-               goto err_kzalloc;
-       }
-       device->pdev = pdev;
-       pci_set_drvdata(pdev, device);
-       device->iobase = iobase;
-
-       pci_set_master(pdev);
-
-       switch (readb(iobase + IOAT_VER_OFFSET)) {
-       case IOAT_VER_1_2:
-               device->dma = ioat_dma_probe(pdev, iobase);
-               if (device->dma && ioat_dca_enabled)
-                       device->dca = ioat_dca_init(pdev, iobase);
-               break;
-       case IOAT_VER_2_0:
-               device->dma = ioat_dma_probe(pdev, iobase);
-               if (device->dma && ioat_dca_enabled)
-                       device->dca = ioat2_dca_init(pdev, iobase);
-               break;
-       case IOAT_VER_3_0:
-               device->dma = ioat_dma_probe(pdev, iobase);
-               if (device->dma && ioat_dca_enabled)
-                       device->dca = ioat3_dca_init(pdev, iobase);
-               break;
-       default:
-               err = -ENODEV;
-               break;
-       }
-       if (!device->dma)
-               err = -ENODEV;
-
-       if (err)
-               goto err_version;
-
-       return 0;
-
-err_version:
-       kfree(device);
-err_kzalloc:
-       iounmap(iobase);
-err_ioremap:
-err_set_dma_mask:
-       pci_release_regions(pdev);
-       pci_disable_device(pdev);
-err_request_regions:
-err_enable_device:
-       return err;
-}
-
-static void __devexit ioat_remove(struct pci_dev *pdev)
-{
-       struct ioat_device *device = pci_get_drvdata(pdev);
-
-       dev_err(&pdev->dev, "Removing dma and dca services\n");
-       if (device->dca) {
-               unregister_dca_provider(device->dca);
-               free_dca_provider(device->dca);
-               device->dca = NULL;
-       }
-
-       if (device->dma) {
-               ioat_dma_remove(device->dma);
-               device->dma = NULL;
-       }
-
-       kfree(device);
-}
-
-static int __init ioat_init_module(void)
-{
-       return pci_register_driver(&ioat_pci_driver);
-}
-module_init(ioat_init_module);
-
-static void __exit ioat_exit_module(void)
-{
-       pci_unregister_driver(&ioat_pci_driver);
-}
-module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioat/Makefile b/drivers/dma/ioat/Makefile
new file mode 100644 (file)
index 0000000..8997d3f
--- /dev/null
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+ioatdma-objs := pci.o dma.o dma_v2.o dma_v3.o dca.o
diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c
new file mode 100644 (file)
index 0000000..69d0261
--- /dev/null
@@ -0,0 +1,684 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2007 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/dca.h>
+
+/* either a kernel change is needed, or we need something like this in the kernel */
+#ifndef CONFIG_SMP
+#include <asm/smp.h>
+#undef cpu_physical_id
+#define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24)
+#endif
+
+#include "dma.h"
+#include "registers.h"
+
+/*
+ * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6
+ * contain the bit number of the APIC ID to map into the DCA tag.  If the valid
+ * bit is not set, then the value must be 0 or 1 and defines the bit in the tag.
+ */
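+/*
+ * For example (illustrative values): an entry of 0x82 (DCA_TAG_MAP_VALID | 2)
+ * copies bit 2 of the CPU's APIC ID into that position of the tag, while
+ * entries of 0x01 and 0x00 contribute a literal 1 or 0 respectively.
+ */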
+#define DCA_TAG_MAP_VALID 0x80
+
+#define DCA3_TAG_MAP_BIT_TO_INV 0x80
+#define DCA3_TAG_MAP_BIT_TO_SEL 0x40
+#define DCA3_TAG_MAP_LITERAL_VAL 0x1
+
+#define DCA_TAG_MAP_MASK 0xDF
+
+/* expected tag map bytes for I/OAT ver.2 */
+#define DCA2_TAG_MAP_BYTE0 0x80
+#define DCA2_TAG_MAP_BYTE1 0x0
+#define DCA2_TAG_MAP_BYTE2 0x81
+#define DCA2_TAG_MAP_BYTE3 0x82
+#define DCA2_TAG_MAP_BYTE4 0x82
+
+/* verify if tag map matches expected values */
+static inline int dca2_tag_map_valid(u8 *tag_map)
+{
+       return ((tag_map[0] == DCA2_TAG_MAP_BYTE0) &&
+               (tag_map[1] == DCA2_TAG_MAP_BYTE1) &&
+               (tag_map[2] == DCA2_TAG_MAP_BYTE2) &&
+               (tag_map[3] == DCA2_TAG_MAP_BYTE3) &&
+               (tag_map[4] == DCA2_TAG_MAP_BYTE4));
+}
+
+/*
+ * "Legacy" DCA systems do not implement the DCA register set in the
+ * I/OAT device.  Software needs direct support for their tag mappings.
+ */
+
+#define APICID_BIT(x)          (DCA_TAG_MAP_VALID | (x))
+#define IOAT_TAG_MAP_LEN       8
+
+static u8 ioat_tag_map_BNB[IOAT_TAG_MAP_LEN] = {
+       1, APICID_BIT(1), APICID_BIT(2), APICID_BIT(2), };
+static u8 ioat_tag_map_SCNB[IOAT_TAG_MAP_LEN] = {
+       1, APICID_BIT(1), APICID_BIT(2), APICID_BIT(2), };
+static u8 ioat_tag_map_CNB[IOAT_TAG_MAP_LEN] = {
+       1, APICID_BIT(1), APICID_BIT(3), APICID_BIT(4), APICID_BIT(2), };
+static u8 ioat_tag_map_UNISYS[IOAT_TAG_MAP_LEN] = { 0 };
+
+/* pack PCI B/D/F into a u16 */
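+/* e.g. (illustrative) bus 0x05, device 4, function 1 (devfn 0x21) -> 0x0521 */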
+static inline u16 dcaid_from_pcidev(struct pci_dev *pci)
+{
+       return (pci->bus->number << 8) | pci->devfn;
+}
+
+static int dca_enabled_in_bios(struct pci_dev *pdev)
+{
+       /* CPUID level 9 returns DCA configuration */
+       /* Bit 0 indicates DCA enabled by the BIOS */
+       unsigned long cpuid_level_9;
+       int res;
+
+       cpuid_level_9 = cpuid_eax(9);
+       res = test_bit(0, &cpuid_level_9);
+       if (!res)
+               dev_err(&pdev->dev, "DCA is disabled in BIOS\n");
+
+       return res;
+}
+
+static int system_has_dca_enabled(struct pci_dev *pdev)
+{
+       if (boot_cpu_has(X86_FEATURE_DCA))
+               return dca_enabled_in_bios(pdev);
+
+       dev_err(&pdev->dev, "boot cpu doesn't have X86_FEATURE_DCA\n");
+       return 0;
+}
+
+struct ioat_dca_slot {
+       struct pci_dev *pdev;   /* requester device */
+       u16 rid;                /* requester id, as used by IOAT */
+};
+
+#define IOAT_DCA_MAX_REQ 6
+#define IOAT3_DCA_MAX_REQ 2
+
+struct ioat_dca_priv {
+       void __iomem            *iobase;
+       void __iomem            *dca_base;
+       int                      max_requesters;
+       int                      requester_count;
+       u8                       tag_map[IOAT_TAG_MAP_LEN];
+       struct ioat_dca_slot     req_slots[0];
+};
+
+/* 5000 series chipset DCA Port Requester ID Table Entry Format
+ * [15:8]      PCI-Express Bus Number
+ * [7:3]       PCI-Express Device Number
+ * [2:0]       PCI-Express Function Number
+ *
+ * 5000 series chipset DCA control register format
+ * [7:1]       Reserved (0)
+ * [0]         Ignore Function Number
+ */
+
+static int ioat_dca_add_requester(struct dca_provider *dca, struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+       u16 id;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+       id = dcaid_from_pcidev(pdev);
+
+       if (ioatdca->requester_count == ioatdca->max_requesters)
+               return -ENODEV;
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == NULL) {
+                       /* found an empty slot */
+                       ioatdca->requester_count++;
+                       ioatdca->req_slots[i].pdev = pdev;
+                       ioatdca->req_slots[i].rid = id;
+                       writew(id, ioatdca->dca_base + (i * 4));
+                       /* make sure the ignore function bit is off */
+                       writeb(0, ioatdca->dca_base + (i * 4) + 2);
+                       return i;
+               }
+       }
+       /* Error, ioatdca->requester_count is out of whack */
+       return -EFAULT;
+}
+
+static int ioat_dca_remove_requester(struct dca_provider *dca,
+                                    struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == pdev) {
+                       writew(0, ioatdca->dca_base + (i * 4));
+                       ioatdca->req_slots[i].pdev = NULL;
+                       ioatdca->req_slots[i].rid = 0;
+                       ioatdca->requester_count--;
+                       return i;
+               }
+       }
+       return -ENODEV;
+}
+
+static u8 ioat_dca_get_tag(struct dca_provider *dca,
+                          struct device *dev,
+                          int cpu)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       int i, apic_id, bit, value;
+       u8 entry, tag;
+
+       tag = 0;
+       apic_id = cpu_physical_id(cpu);
+
+       for (i = 0; i < IOAT_TAG_MAP_LEN; i++) {
+               entry = ioatdca->tag_map[i];
+               if (entry & DCA_TAG_MAP_VALID) {
+                       bit = entry & ~DCA_TAG_MAP_VALID;
+                       value = (apic_id & (1 << bit)) ? 1 : 0;
+               } else {
+                       value = entry ? 1 : 0;
+               }
+               tag |= (value << i);
+       }
+       return tag;
+}
+
+static int ioat_dca_dev_managed(struct dca_provider *dca,
+                               struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+
+       pdev = to_pci_dev(dev);
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == pdev)
+                       return 1;
+       }
+       return 0;
+}
+
+static struct dca_ops ioat_dca_ops = {
+       .add_requester          = ioat_dca_add_requester,
+       .remove_requester       = ioat_dca_remove_requester,
+       .get_tag                = ioat_dca_get_tag,
+       .dev_managed            = ioat_dca_dev_managed,
+};
+
+struct dca_provider * __devinit
+ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+{
+       struct dca_provider *dca;
+       struct ioat_dca_priv *ioatdca;
+       u8 *tag_map = NULL;
+       int i;
+       int err;
+       u8 version;
+       u8 max_requesters;
+
+       if (!system_has_dca_enabled(pdev))
+               return NULL;
+
+       /* I/OAT v1 systems must have a known tag_map to support DCA */
+       switch (pdev->vendor) {
+       case PCI_VENDOR_ID_INTEL:
+               switch (pdev->device) {
+               case PCI_DEVICE_ID_INTEL_IOAT:
+                       tag_map = ioat_tag_map_BNB;
+                       break;
+               case PCI_DEVICE_ID_INTEL_IOAT_CNB:
+                       tag_map = ioat_tag_map_CNB;
+                       break;
+               case PCI_DEVICE_ID_INTEL_IOAT_SCNB:
+                       tag_map = ioat_tag_map_SCNB;
+                       break;
+               }
+               break;
+       case PCI_VENDOR_ID_UNISYS:
+               switch (pdev->device) {
+               case PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR:
+                       tag_map = ioat_tag_map_UNISYS;
+                       break;
+               }
+               break;
+       }
+       if (tag_map == NULL)
+               return NULL;
+
+       version = readb(iobase + IOAT_VER_OFFSET);
+       if (version == IOAT_VER_3_0)
+               max_requesters = IOAT3_DCA_MAX_REQ;
+       else
+               max_requesters = IOAT_DCA_MAX_REQ;
+
+       dca = alloc_dca_provider(&ioat_dca_ops,
+                       sizeof(*ioatdca) +
+                       (sizeof(struct ioat_dca_slot) * max_requesters));
+       if (!dca)
+               return NULL;
+
+       ioatdca = dca_priv(dca);
+       ioatdca->max_requesters = max_requesters;
+       ioatdca->dca_base = iobase + 0x54;
+
+       /* copy over the APIC ID to DCA tag mapping */
+       for (i = 0; i < IOAT_TAG_MAP_LEN; i++)
+               ioatdca->tag_map[i] = tag_map[i];
+
+       err = register_dca_provider(dca, &pdev->dev);
+       if (err) {
+               free_dca_provider(dca);
+               return NULL;
+       }
+
+       return dca;
+}
+
+static int ioat2_dca_add_requester(struct dca_provider *dca, struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+       u16 id;
+       u16 global_req_table;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+       id = dcaid_from_pcidev(pdev);
+
+       if (ioatdca->requester_count == ioatdca->max_requesters)
+               return -ENODEV;
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == NULL) {
+                       /* found an empty slot */
+                       ioatdca->requester_count++;
+                       ioatdca->req_slots[i].pdev = pdev;
+                       ioatdca->req_slots[i].rid = id;
+                       global_req_table =
+                             readw(ioatdca->dca_base + IOAT_DCA_GREQID_OFFSET);
+                       writel(id | IOAT_DCA_GREQID_VALID,
+                              ioatdca->iobase + global_req_table + (i * 4));
+                       return i;
+               }
+       }
+       /* Error, ioatdca->requester_count is out of whack */
+       return -EFAULT;
+}
+
+static int ioat2_dca_remove_requester(struct dca_provider *dca,
+                                     struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+       u16 global_req_table;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == pdev) {
+                       global_req_table =
+                             readw(ioatdca->dca_base + IOAT_DCA_GREQID_OFFSET);
+                       writel(0, ioatdca->iobase + global_req_table + (i * 4));
+                       ioatdca->req_slots[i].pdev = NULL;
+                       ioatdca->req_slots[i].rid = 0;
+                       ioatdca->requester_count--;
+                       return i;
+               }
+       }
+       return -ENODEV;
+}
+
+static u8 ioat2_dca_get_tag(struct dca_provider *dca,
+                           struct device *dev,
+                           int cpu)
+{
+       u8 tag;
+
+       tag = ioat_dca_get_tag(dca, dev, cpu);
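+       /* v2 consumes the ones' complement of the v1 tag, masked to 5 bits */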
+       tag = (~tag) & 0x1F;
+       return tag;
+}
+
+static struct dca_ops ioat2_dca_ops = {
+       .add_requester          = ioat2_dca_add_requester,
+       .remove_requester       = ioat2_dca_remove_requester,
+       .get_tag                = ioat2_dca_get_tag,
+       .dev_managed            = ioat_dca_dev_managed,
+};
+
+static int ioat2_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset)
+{
+       int slots = 0;
+       u32 req;
+       u16 global_req_table;
+
+       global_req_table = readw(iobase + dca_offset + IOAT_DCA_GREQID_OFFSET);
+       if (global_req_table == 0)
+               return 0;
+       do {
+               req = readl(iobase + global_req_table + (slots * sizeof(u32)));
+               slots++;
+       } while ((req & IOAT_DCA_GREQID_LASTID) == 0);
+
+       return slots;
+}
+
+struct dca_provider * __devinit
+ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+{
+       struct dca_provider *dca;
+       struct ioat_dca_priv *ioatdca;
+       int slots;
+       int i;
+       int err;
+       u32 tag_map;
+       u16 dca_offset;
+       u16 csi_fsb_control;
+       u16 pcie_control;
+       u8 bit;
+
+       if (!system_has_dca_enabled(pdev))
+               return NULL;
+
+       dca_offset = readw(iobase + IOAT_DCAOFFSET_OFFSET);
+       if (dca_offset == 0)
+               return NULL;
+
+       slots = ioat2_dca_count_dca_slots(iobase, dca_offset);
+       if (slots == 0)
+               return NULL;
+
+       dca = alloc_dca_provider(&ioat2_dca_ops,
+                                sizeof(*ioatdca)
+                                     + (sizeof(struct ioat_dca_slot) * slots));
+       if (!dca)
+               return NULL;
+
+       ioatdca = dca_priv(dca);
+       ioatdca->iobase = iobase;
+       ioatdca->dca_base = iobase + dca_offset;
+       ioatdca->max_requesters = slots;
+
+       /* some bios might not know to turn these on */
+       csi_fsb_control = readw(ioatdca->dca_base + IOAT_FSB_CAP_ENABLE_OFFSET);
+       if ((csi_fsb_control & IOAT_FSB_CAP_ENABLE_PREFETCH) == 0) {
+               csi_fsb_control |= IOAT_FSB_CAP_ENABLE_PREFETCH;
+               writew(csi_fsb_control,
+                      ioatdca->dca_base + IOAT_FSB_CAP_ENABLE_OFFSET);
+       }
+       pcie_control = readw(ioatdca->dca_base + IOAT_PCI_CAP_ENABLE_OFFSET);
+       if ((pcie_control & IOAT_PCI_CAP_ENABLE_MEMWR) == 0) {
+               pcie_control |= IOAT_PCI_CAP_ENABLE_MEMWR;
+               writew(pcie_control,
+                      ioatdca->dca_base + IOAT_PCI_CAP_ENABLE_OFFSET);
+       }
+
+       /* TODO version, compatibility and configuration checks */
+
+       /* copy out the APIC to DCA tag map */
+       tag_map = readl(ioatdca->dca_base + IOAT_APICID_TAG_MAP_OFFSET);
+       for (i = 0; i < 5; i++) {
+               bit = (tag_map >> (4 * i)) & 0x0f;
+               if (bit < 8)
+                       ioatdca->tag_map[i] = bit | DCA_TAG_MAP_VALID;
+               else
+                       ioatdca->tag_map[i] = 0;
+       }
+
+       if (!dca2_tag_map_valid(ioatdca->tag_map)) {
+               dev_err(&pdev->dev, "APICID_TAG_MAP set incorrectly by BIOS, "
+                       "disabling DCA\n");
+               free_dca_provider(dca);
+               return NULL;
+       }
+
+       err = register_dca_provider(dca, &pdev->dev);
+       if (err) {
+               free_dca_provider(dca);
+               return NULL;
+       }
+
+       return dca;
+}
+
+static int ioat3_dca_add_requester(struct dca_provider *dca, struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+       u16 id;
+       u16 global_req_table;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+       id = dcaid_from_pcidev(pdev);
+
+       if (ioatdca->requester_count == ioatdca->max_requesters)
+               return -ENODEV;
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == NULL) {
+                       /* found an empty slot */
+                       ioatdca->requester_count++;
+                       ioatdca->req_slots[i].pdev = pdev;
+                       ioatdca->req_slots[i].rid = id;
+                       global_req_table =
+                             readw(ioatdca->dca_base + IOAT3_DCA_GREQID_OFFSET);
+                       writel(id | IOAT_DCA_GREQID_VALID,
+                              ioatdca->iobase + global_req_table + (i * 4));
+                       return i;
+               }
+       }
+       /* Error, ioatdca->requester_count is out of whack */
+       return -EFAULT;
+}
+
+static int ioat3_dca_remove_requester(struct dca_provider *dca,
+                                     struct device *dev)
+{
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       struct pci_dev *pdev;
+       int i;
+       u16 global_req_table;
+
+       /* This implementation only supports PCI-Express */
+       if (dev->bus != &pci_bus_type)
+               return -ENODEV;
+       pdev = to_pci_dev(dev);
+
+       for (i = 0; i < ioatdca->max_requesters; i++) {
+               if (ioatdca->req_slots[i].pdev == pdev) {
+                       global_req_table =
+                             readw(ioatdca->dca_base + IOAT3_DCA_GREQID_OFFSET);
+                       writel(0, ioatdca->iobase + global_req_table + (i * 4));
+                       ioatdca->req_slots[i].pdev = NULL;
+                       ioatdca->req_slots[i].rid = 0;
+                       ioatdca->requester_count--;
+                       return i;
+               }
+       }
+       return -ENODEV;
+}
+
+static u8 ioat3_dca_get_tag(struct dca_provider *dca,
+                           struct device *dev,
+                           int cpu)
+{
+       u8 tag;
+
+       struct ioat_dca_priv *ioatdca = dca_priv(dca);
+       int i, apic_id, bit, value;
+       u8 entry;
+
+       tag = 0;
+       apic_id = cpu_physical_id(cpu);
+
+       for (i = 0; i < IOAT_TAG_MAP_LEN; i++) {
+               entry = ioatdca->tag_map[i];
+               if (entry & DCA3_TAG_MAP_BIT_TO_SEL) {
+                       bit = entry &
+                               ~(DCA3_TAG_MAP_BIT_TO_SEL | DCA3_TAG_MAP_BIT_TO_INV);
+                       value = (apic_id & (1 << bit)) ? 1 : 0;
+               } else if (entry & DCA3_TAG_MAP_BIT_TO_INV) {
+                       bit = entry & ~DCA3_TAG_MAP_BIT_TO_INV;
+                       value = (apic_id & (1 << bit)) ? 0 : 1;
+               } else {
+                       value = (entry & DCA3_TAG_MAP_LITERAL_VAL) ? 1 : 0;
+               }
+               tag |= (value << i);
+       }
+
+       return tag;
+}
+
+static struct dca_ops ioat3_dca_ops = {
+       .add_requester          = ioat3_dca_add_requester,
+       .remove_requester       = ioat3_dca_remove_requester,
+       .get_tag                = ioat3_dca_get_tag,
+       .dev_managed            = ioat_dca_dev_managed,
+};
+
+static int ioat3_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset)
+{
+       int slots = 0;
+       u32 req;
+       u16 global_req_table;
+
+       global_req_table = readw(iobase + dca_offset + IOAT3_DCA_GREQID_OFFSET);
+       if (global_req_table == 0)
+               return 0;
+
+       do {
+               req = readl(iobase + global_req_table + (slots * sizeof(u32)));
+               slots++;
+       } while ((req & IOAT_DCA_GREQID_LASTID) == 0);
+
+       return slots;
+}
+
+struct dca_provider * __devinit
+ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+{
+       struct dca_provider *dca;
+       struct ioat_dca_priv *ioatdca;
+       int slots;
+       int i;
+       int err;
+       u16 dca_offset;
+       u16 csi_fsb_control;
+       u16 pcie_control;
+       u8 bit;
+
+       union {
+               u64 full;
+               struct {
+                       u32 low;
+                       u32 high;
+               };
+       } tag_map;
+
+       if (!system_has_dca_enabled(pdev))
+               return NULL;
+
+       dca_offset = readw(iobase + IOAT_DCAOFFSET_OFFSET);
+       if (dca_offset == 0)
+               return NULL;
+
+       slots = ioat3_dca_count_dca_slots(iobase, dca_offset);
+       if (slots == 0)
+               return NULL;
+
+       dca = alloc_dca_provider(&ioat3_dca_ops,
+                                sizeof(*ioatdca)
+                                     + (sizeof(struct ioat_dca_slot) * slots));
+       if (!dca)
+               return NULL;
+
+       ioatdca = dca_priv(dca);
+       ioatdca->iobase = iobase;
+       ioatdca->dca_base = iobase + dca_offset;
+       ioatdca->max_requesters = slots;
+
+       /* some bios might not know to turn these on */
+       csi_fsb_control = readw(ioatdca->dca_base + IOAT3_CSI_CONTROL_OFFSET);
+       if ((csi_fsb_control & IOAT3_CSI_CONTROL_PREFETCH) == 0) {
+               csi_fsb_control |= IOAT3_CSI_CONTROL_PREFETCH;
+               writew(csi_fsb_control,
+                      ioatdca->dca_base + IOAT3_CSI_CONTROL_OFFSET);
+       }
+       pcie_control = readw(ioatdca->dca_base + IOAT3_PCI_CONTROL_OFFSET);
+       if ((pcie_control & IOAT3_PCI_CONTROL_MEMWR) == 0) {
+               pcie_control |= IOAT3_PCI_CONTROL_MEMWR;
+               writew(pcie_control,
+                      ioatdca->dca_base + IOAT3_PCI_CONTROL_OFFSET);
+       }
+
+       /* TODO version, compatibility and configuration checks */
+
+       /* copy out the APIC to DCA tag map */
+       tag_map.low =
+               readl(ioatdca->dca_base + IOAT3_APICID_TAG_MAP_OFFSET_LOW);
+       tag_map.high =
+               readl(ioatdca->dca_base + IOAT3_APICID_TAG_MAP_OFFSET_HIGH);
+       for (i = 0; i < 8; i++) {
+               bit = tag_map.full >> (8 * i);
+               ioatdca->tag_map[i] = bit & DCA_TAG_MAP_MASK;
+       }
+
+       err = register_dca_provider(dca, &pdev->dev);
+       if (err) {
+               free_dca_provider(dca);
+               return NULL;
+       }
+
+       return dca;
+}
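The tag-map handling in this file is compact but bit-heavy. As a stand-alone, illustrative restatement (plain C, not part of the patch), the v1 lookup and the v2 inversion reduce to the following, which can be handy when checking a platform's tag map by hand:

#include <stdint.h>

#define DCA_TAG_MAP_VALID 0x80
#define IOAT_TAG_MAP_LEN  8

/* Mirror of ioat_dca_get_tag(): map APIC ID bits through the tag map. */
static uint8_t ioat1_tag(const uint8_t *tag_map, int apic_id)
{
	uint8_t tag = 0;
	int i;

	for (i = 0; i < IOAT_TAG_MAP_LEN; i++) {
		uint8_t entry = tag_map[i];
		int value;

		if (entry & DCA_TAG_MAP_VALID)
			/* the entry selects one APIC ID bit */
			value = (apic_id >> (entry & ~DCA_TAG_MAP_VALID)) & 1;
		else
			/* the entry is a literal 0 or 1 */
			value = entry ? 1 : 0;
		tag |= value << i;
	}
	return tag;
}

/* Mirror of ioat2_dca_get_tag(): v2 wants the complement, 5 bits wide. */
static uint8_t ioat2_tag(const uint8_t *tag_map, int apic_id)
{
	return (~ioat1_tag(tag_map, apic_id)) & 0x1F;
}

ioat3_dca_get_tag() above differs only in that an entry may also request the selected APIC ID bit inverted (DCA3_TAG_MAP_BIT_TO_INV) or supply a literal value via DCA3_TAG_MAP_LITERAL_VAL.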
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c
new file mode 100644 (file)
index 0000000..c524d36
--- /dev/null
@@ -0,0 +1,1238 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2004 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/i7300_idle.h>
+#include "dma.h"
+#include "registers.h"
+#include "hw.h"
+
+int ioat_pending_level = 4;
+module_param(ioat_pending_level, int, 0644);
+MODULE_PARM_DESC(ioat_pending_level,
+                "high-water mark for pushing ioat descriptors (default: 4)");
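+
+/*
+ * Submissions are batched: ioat1_tx_submit() only writes the APPEND command
+ * once at least ioat_pending_level descriptors are pending; an explicit
+ * ->issue_pending() call flushes any smaller batch immediately.
+ */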
+
+/* internal functions */
+static void ioat1_cleanup(struct ioat_dma_chan *ioat);
+static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat);
+
+/**
+ * ioat_dma_do_interrupt - handler used for single vector interrupt mode
+ * @irq: interrupt id
+ * @data: interrupt data
+ */
+static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
+{
+       struct ioatdma_device *instance = data;
+       struct ioat_chan_common *chan;
+       unsigned long attnstatus;
+       int bit;
+       u8 intrctrl;
+
+       intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
+
+       if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
+               return IRQ_NONE;
+
+       if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
+               writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
+               return IRQ_NONE;
+       }
+
+       attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
+       for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
+               chan = ioat_chan_by_index(instance, bit);
+               tasklet_schedule(&chan->cleanup_task);
+       }
+
+       writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
+       return IRQ_HANDLED;
+}
+
+/**
+ * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
+ * @irq: interrupt id
+ * @data: interrupt data
+ */
+static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
+{
+       struct ioat_chan_common *chan = data;
+
+       tasklet_schedule(&chan->cleanup_task);
+
+       return IRQ_HANDLED;
+}
+
+static void ioat1_cleanup_tasklet(unsigned long data);
+
+/* common channel initialization */
+void ioat_init_channel(struct ioatdma_device *device,
+                      struct ioat_chan_common *chan, int idx,
+                      void (*timer_fn)(unsigned long),
+                      void (*tasklet)(unsigned long),
+                      unsigned long ioat)
+{
+       struct dma_device *dma = &device->common;
+
+       chan->device = device;
+       chan->reg_base = device->reg_base + (0x80 * (idx + 1));
+       spin_lock_init(&chan->cleanup_lock);
+       chan->common.device = dma;
+       list_add_tail(&chan->common.device_node, &dma->channels);
+       device->idx[idx] = chan;
+       init_timer(&chan->timer);
+       chan->timer.function = timer_fn;
+       chan->timer.data = ioat;
+       tasklet_init(&chan->cleanup_task, tasklet, ioat);
+       tasklet_disable(&chan->cleanup_task);
+}
+
+static void ioat1_timer_event(unsigned long data);
+
+/**
+ * ioat1_enumerate_channels - find and initialize the device's channels
+ * @device: the device to be enumerated
+ */
+static int ioat1_enumerate_channels(struct ioatdma_device *device)
+{
+       u8 xfercap_scale;
+       u32 xfercap;
+       int i;
+       struct ioat_dma_chan *ioat;
+       struct device *dev = &device->pdev->dev;
+       struct dma_device *dma = &device->common;
+
+       INIT_LIST_HEAD(&dma->channels);
+       dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
+       dma->chancnt &= 0x1f; /* bits [4:0] valid */
+       if (dma->chancnt > ARRAY_SIZE(device->idx)) {
+               dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
+                        dma->chancnt, ARRAY_SIZE(device->idx));
+               dma->chancnt = ARRAY_SIZE(device->idx);
+       }
+       xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
+       xfercap_scale &= 0x1f; /* bits [4:0] valid */
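+       /* a zero scale is treated as an effectively unlimited cap (all ones) */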
+       xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
+       dev_dbg(dev, "%s: xfercap = %d\n", __func__, xfercap);
+
+#ifdef  CONFIG_I7300_IDLE_IOAT_CHANNEL
+       if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
+               dma->chancnt--;
+#endif
+       for (i = 0; i < dma->chancnt; i++) {
+               ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
+               if (!ioat)
+                       break;
+
+               ioat_init_channel(device, &ioat->base, i,
+                                 ioat1_timer_event,
+                                 ioat1_cleanup_tasklet,
+                                 (unsigned long) ioat);
+               ioat->xfercap = xfercap;
+               spin_lock_init(&ioat->desc_lock);
+               INIT_LIST_HEAD(&ioat->free_desc);
+               INIT_LIST_HEAD(&ioat->used_desc);
+       }
+       dma->chancnt = i;
+       return i;
+}
+
+/**
+ * __ioat1_dma_memcpy_issue_pending - notify the hardware of descriptors
+ *                                    appended since the last APPEND
+ * @ioat: IOAT DMA channel handle
+ */
+static inline void
+__ioat1_dma_memcpy_issue_pending(struct ioat_dma_chan *ioat)
+{
+       void __iomem *reg_base = ioat->base.reg_base;
+
+       dev_dbg(to_dev(&ioat->base), "%s: pending: %d\n",
+               __func__, ioat->pending);
+       ioat->pending = 0;
+       writeb(IOAT_CHANCMD_APPEND, reg_base + IOAT1_CHANCMD_OFFSET);
+}
+
+static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(chan);
+
+       if (ioat->pending > 0) {
+               spin_lock_bh(&ioat->desc_lock);
+               __ioat1_dma_memcpy_issue_pending(ioat);
+               spin_unlock_bh(&ioat->desc_lock);
+       }
+}
+
+/**
+ * ioat1_reset_channel - restart a channel
+ * @ioat: IOAT DMA channel handle
+ */
+static void ioat1_reset_channel(struct ioat_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       void __iomem *reg_base = chan->reg_base;
+       u32 chansts, chanerr;
+
+       dev_warn(to_dev(chan), "reset\n");
+       chanerr = readl(reg_base + IOAT_CHANERR_OFFSET);
+       chansts = *chan->completion & IOAT_CHANSTS_STATUS;
+       if (chanerr) {
+               dev_err(to_dev(chan),
+                       "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
+                       chan_num(chan), chansts, chanerr);
+               writel(chanerr, reg_base + IOAT_CHANERR_OFFSET);
+       }
+
+       /*
+        * whack it upside the head with a reset
+        * and wait for things to settle out.
+        * force the pending count to a really big negative
+        * to make sure no one forces an issue_pending
+        * while we're waiting.
+        */
+
+       ioat->pending = INT_MIN;
+       writeb(IOAT_CHANCMD_RESET,
+              reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+       set_bit(IOAT_RESET_PENDING, &chan->state);
+       mod_timer(&chan->timer, jiffies + RESET_DELAY);
+}
+
+static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+       struct dma_chan *c = tx->chan;
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+       struct ioat_desc_sw *desc = tx_to_ioat_desc(tx);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_desc_sw *first;
+       struct ioat_desc_sw *chain_tail;
+       dma_cookie_t cookie;
+
+       spin_lock_bh(&ioat->desc_lock);
+       /* cookie incr and addition to used_list must be atomic */
+       cookie = c->cookie;
+       cookie++;
+       if (cookie < 0)
+               cookie = 1;
+       c->cookie = cookie;
+       tx->cookie = cookie;
+       dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
+
+       /* write address into NextDescriptor field of last desc in chain */
+       first = to_ioat_desc(desc->tx_list.next);
+       chain_tail = to_ioat_desc(ioat->used_desc.prev);
+       /* make descriptor updates globally visible before chaining */
+       wmb();
+       chain_tail->hw->next = first->txd.phys;
+       list_splice_tail_init(&desc->tx_list, &ioat->used_desc);
+       dump_desc_dbg(ioat, chain_tail);
+       dump_desc_dbg(ioat, first);
+
+       if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
+               mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+       ioat->active += desc->hw->tx_cnt;
+       ioat->pending += desc->hw->tx_cnt;
+       if (ioat->pending >= ioat_pending_level)
+               __ioat1_dma_memcpy_issue_pending(ioat);
+       spin_unlock_bh(&ioat->desc_lock);
+
+       return cookie;
+}
+
+/**
+ * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
+ * @ioat: the channel supplying the memory pool for the descriptors
+ * @flags: allocation flags
+ */
+static struct ioat_desc_sw *
+ioat_dma_alloc_descriptor(struct ioat_dma_chan *ioat, gfp_t flags)
+{
+       struct ioat_dma_descriptor *desc;
+       struct ioat_desc_sw *desc_sw;
+       struct ioatdma_device *ioatdma_device;
+       dma_addr_t phys;
+
+       ioatdma_device = ioat->base.device;
+       desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
+       if (unlikely(!desc))
+               return NULL;
+
+       desc_sw = kzalloc(sizeof(*desc_sw), flags);
+       if (unlikely(!desc_sw)) {
+               pci_pool_free(ioatdma_device->dma_pool, desc, phys);
+               return NULL;
+       }
+
+       memset(desc, 0, sizeof(*desc));
+
+       INIT_LIST_HEAD(&desc_sw->tx_list);
+       dma_async_tx_descriptor_init(&desc_sw->txd, &ioat->base.common);
+       desc_sw->txd.tx_submit = ioat1_tx_submit;
+       desc_sw->hw = desc;
+       desc_sw->txd.phys = phys;
+       set_desc_id(desc_sw, -1);
+
+       return desc_sw;
+}
+
+static int ioat_initial_desc_count = 256;
+module_param(ioat_initial_desc_count, int, 0644);
+MODULE_PARM_DESC(ioat_initial_desc_count,
+                "ioat1: initial descriptors per channel (default: 256)");
+
+/**
+ * ioat1_dma_alloc_chan_resources - returns the number of allocated descriptors
+ * @chan: the channel to be filled out
+ */
+static int ioat1_dma_alloc_chan_resources(struct dma_chan *c)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_desc_sw *desc;
+       u32 chanerr;
+       int i;
+       LIST_HEAD(tmp_list);
+
+       /* have we already been set up? */
+       if (!list_empty(&ioat->free_desc))
+               return ioat->desccount;
+
+       /* Setup register to interrupt and write completion status on error */
+       writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
+
+       chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+       if (chanerr) {
+               dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
+               writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
+       }
+
+       /* Allocate descriptors */
+       for (i = 0; i < ioat_initial_desc_count; i++) {
+               desc = ioat_dma_alloc_descriptor(ioat, GFP_KERNEL);
+               if (!desc) {
+                       dev_err(to_dev(chan), "Only %d initial descriptors\n", i);
+                       break;
+               }
+               set_desc_id(desc, i);
+               list_add_tail(&desc->node, &tmp_list);
+       }
+       spin_lock_bh(&ioat->desc_lock);
+       ioat->desccount = i;
+       list_splice(&tmp_list, &ioat->free_desc);
+       spin_unlock_bh(&ioat->desc_lock);
+
+       /* allocate a completion writeback area */
+       /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
+       chan->completion = pci_pool_alloc(chan->device->completion_pool,
+                                         GFP_KERNEL, &chan->completion_dma);
+       memset(chan->completion, 0, sizeof(*chan->completion));
+       writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
+              chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
+       writel(((u64) chan->completion_dma) >> 32,
+              chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
+
+       tasklet_enable(&chan->cleanup_task);
+       ioat1_dma_start_null_desc(ioat);  /* give chain to dma device */
+       dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
+               __func__, ioat->desccount);
+       return ioat->desccount;
+}
+
+/**
+ * ioat1_dma_free_chan_resources - release all the descriptors
+ * @chan: the channel to be cleaned
+ */
+static void ioat1_dma_free_chan_resources(struct dma_chan *c)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioatdma_device *ioatdma_device = chan->device;
+       struct ioat_desc_sw *desc, *_desc;
+       int in_use_descs = 0;
+
+       /* Before freeing channel resources first check
+        * if they have been previously allocated for this channel.
+        */
+       if (ioat->desccount == 0)
+               return;
+
+       tasklet_disable(&chan->cleanup_task);
+       del_timer_sync(&chan->timer);
+       ioat1_cleanup(ioat);
+
+       /* Delay 100ms after reset to allow internal DMA logic to quiesce
+        * before removing DMA descriptor resources.
+        */
+       writeb(IOAT_CHANCMD_RESET,
+              chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+       mdelay(100);
+
+       spin_lock_bh(&ioat->desc_lock);
+       list_for_each_entry_safe(desc, _desc, &ioat->used_desc, node) {
+               dev_dbg(to_dev(chan), "%s: freeing %d from used list\n",
+                       __func__, desc_id(desc));
+               dump_desc_dbg(ioat, desc);
+               in_use_descs++;
+               list_del(&desc->node);
+               pci_pool_free(ioatdma_device->dma_pool, desc->hw,
+                             desc->txd.phys);
+               kfree(desc);
+       }
+       list_for_each_entry_safe(desc, _desc,
+                                &ioat->free_desc, node) {
+               list_del(&desc->node);
+               pci_pool_free(ioatdma_device->dma_pool, desc->hw,
+                             desc->txd.phys);
+               kfree(desc);
+       }
+       spin_unlock_bh(&ioat->desc_lock);
+
+       pci_pool_free(ioatdma_device->completion_pool,
+                     chan->completion,
+                     chan->completion_dma);
+
+       /*
+        * one in-use descriptor is expected: the last used descriptor is
+        * deliberately left on the chain so new submissions can append to it
+        */
+       if (in_use_descs > 1)
+               dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
+                       in_use_descs - 1);
+
+       chan->last_completion = 0;
+       chan->completion_dma = 0;
+       ioat->pending = 0;
+       ioat->desccount = 0;
+}
+
+/**
+ * ioat1_dma_get_next_descriptor - return the next available descriptor
+ * @ioat: IOAT DMA channel handle
+ *
+ * Gets the next descriptor from the chain, and must be called with the
+ * channel's desc_lock held.  Allocates more descriptors if the channel
+ * has run out.
+ */
+static struct ioat_desc_sw *
+ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat)
+{
+       struct ioat_desc_sw *new;
+
+       if (!list_empty(&ioat->free_desc)) {
+               new = to_ioat_desc(ioat->free_desc.next);
+               list_del(&new->node);
+       } else {
+               /* try to get another desc */
+               new = ioat_dma_alloc_descriptor(ioat, GFP_ATOMIC);
+               if (!new) {
+                       dev_err(to_dev(&ioat->base), "alloc failed\n");
+                       return NULL;
+               }
+       }
+       dev_dbg(to_dev(&ioat->base), "%s: allocated: %d\n",
+               __func__, desc_id(new));
+       prefetch(new->hw);
+       return new;
+}
+
+static struct dma_async_tx_descriptor *
+ioat1_dma_prep_memcpy(struct dma_chan *c, dma_addr_t dma_dest,
+                     dma_addr_t dma_src, size_t len, unsigned long flags)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+       struct ioat_desc_sw *desc;
+       size_t copy;
+       LIST_HEAD(chain);
+       dma_addr_t src = dma_src;
+       dma_addr_t dest = dma_dest;
+       size_t total_len = len;
+       struct ioat_dma_descriptor *hw = NULL;
+       int tx_cnt = 0;
+
+       spin_lock_bh(&ioat->desc_lock);
+       desc = ioat1_dma_get_next_descriptor(ioat);
+       do {
+               if (!desc)
+                       break;
+
+               tx_cnt++;
+               copy = min_t(size_t, len, ioat->xfercap);
+
+               hw = desc->hw;
+               hw->size = copy;
+               hw->ctl = 0;
+               hw->src_addr = src;
+               hw->dst_addr = dest;
+
+               list_add_tail(&desc->node, &chain);
+
+               len -= copy;
+               dest += copy;
+               src += copy;
+               if (len) {
+                       struct ioat_desc_sw *next;
+
+                       async_tx_ack(&desc->txd);
+                       next = ioat1_dma_get_next_descriptor(ioat);
+                       hw->next = next ? next->txd.phys : 0;
+                       dump_desc_dbg(ioat, desc);
+                       desc = next;
+               } else
+                       hw->next = 0;
+       } while (len);
+
+       if (!desc) {
+               struct ioat_chan_common *chan = &ioat->base;
+
+               dev_err(to_dev(chan),
+                       "chan%d - get_next_desc failed\n", chan_num(chan));
+               list_splice(&chain, &ioat->free_desc);
+               spin_unlock_bh(&ioat->desc_lock);
+               return NULL;
+       }
+       spin_unlock_bh(&ioat->desc_lock);
+
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       list_splice(&chain, &desc->tx_list);
+       hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       hw->ctl_f.compl_write = 1;
+       hw->tx_cnt = tx_cnt;
+       dump_desc_dbg(ioat, desc);
+
+       return &desc->txd;
+}
+
+static void ioat1_cleanup_tasklet(unsigned long data)
+{
+       struct ioat_dma_chan *chan = (void *)data;
+
+       ioat1_cleanup(chan);
+       writew(IOAT_CHANCTRL_RUN, chan->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
+                   size_t len, struct ioat_dma_descriptor *hw)
+{
+       struct pci_dev *pdev = chan->device->pdev;
+       size_t offset = len - hw->size;
+
+       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+               ioat_unmap(pdev, hw->dst_addr - offset, len,
+                          PCI_DMA_FROMDEVICE, flags, 1);
+
+       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP))
+               ioat_unmap(pdev, hw->src_addr - offset, len,
+                          PCI_DMA_TODEVICE, flags, 0);
+}
+
+unsigned long ioat_get_current_completion(struct ioat_chan_common *chan)
+{
+       unsigned long phys_complete;
+       u64 completion;
+
+       completion = *chan->completion;
+       phys_complete = ioat_chansts_to_addr(completion);
+
+       dev_dbg(to_dev(chan), "%s: phys_complete: %#llx\n", __func__,
+               (unsigned long long) phys_complete);
+
+       if (is_ioat_halted(completion)) {
+               u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+               dev_err(to_dev(chan), "Channel halted, chanerr = %x\n",
+                       chanerr);
+
+               /* TODO do something to salvage the situation */
+       }
+
+       return phys_complete;
+}
+
+bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
+                          unsigned long *phys_complete)
+{
+       *phys_complete = ioat_get_current_completion(chan);
+       if (*phys_complete == chan->last_completion)
+               return false;
+       clear_bit(IOAT_COMPLETION_ACK, &chan->state);
+       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+       return true;
+}
+
+static void __cleanup(struct ioat_dma_chan *ioat, unsigned long phys_complete)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct list_head *_desc, *n;
+       struct dma_async_tx_descriptor *tx;
+
+       dev_dbg(to_dev(chan), "%s: phys_complete: %lx\n",
+                __func__, phys_complete);
+       list_for_each_safe(_desc, n, &ioat->used_desc) {
+               struct ioat_desc_sw *desc;
+
+               prefetch(n);
+               desc = list_entry(_desc, typeof(*desc), node);
+               tx = &desc->txd;
+               /*
+                * Incoming DMA requests may use multiple descriptors,
+                * due to exceeding xfercap, perhaps. If so, only the
+                * last one will have a cookie, and require unmapping.
+                */
+               dump_desc_dbg(ioat, desc);
+               if (tx->cookie) {
+                       chan->completed_cookie = tx->cookie;
+                       tx->cookie = 0;
+                       ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+                       ioat->active -= desc->hw->tx_cnt;
+                       if (tx->callback) {
+                               tx->callback(tx->callback_param);
+                               tx->callback = NULL;
+                       }
+               }
+
+               if (tx->phys != phys_complete) {
+                       /*
+                        * a completed entry, but not the last, so clean
+                        * up if the client is done with the descriptor
+                        */
+                       if (async_tx_test_ack(tx))
+                               list_move_tail(&desc->node, &ioat->free_desc);
+               } else {
+                       /*
+                        * last used desc. Do not remove, so we can
+                        * append from it.
+                        */
+
+                       /* if nothing else is pending, cancel the
+                        * completion timeout
+                        */
+                       if (n == &ioat->used_desc) {
+                               dev_dbg(to_dev(chan),
+                                       "%s cancel completion timeout\n",
+                                       __func__);
+                               clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+                       }
+
+                       /* TODO check status bits? */
+                       break;
+               }
+       }
+
+       chan->last_completion = phys_complete;
+}
+
+/**
+ * ioat1_cleanup - clean up finished descriptors
+ * @chan: ioat channel to be cleaned up
+ *
+ * To prevent lock contention we defer cleanup when the locks are
+ * contended with a terminal timeout that forces cleanup and catches
+ * completion notification errors.
+ */
+static void ioat1_cleanup(struct ioat_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       unsigned long phys_complete;
+
+       prefetch(chan->completion);
+
+       if (!spin_trylock_bh(&chan->cleanup_lock))
+               return;
+
+       if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       if (!spin_trylock_bh(&ioat->desc_lock)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       __cleanup(ioat, phys_complete);
+
+       spin_unlock_bh(&ioat->desc_lock);
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static void ioat1_timer_event(unsigned long data)
+{
+       struct ioat_dma_chan *ioat = (void *) data;
+       struct ioat_chan_common *chan = &ioat->base;
+
+       dev_dbg(to_dev(chan), "%s: state: %lx\n", __func__, chan->state);
+
+       spin_lock_bh(&chan->cleanup_lock);
+       if (test_and_clear_bit(IOAT_RESET_PENDING, &chan->state)) {
+               struct ioat_desc_sw *desc;
+
+               spin_lock_bh(&ioat->desc_lock);
+
+               /* restart active descriptors */
+               desc = to_ioat_desc(ioat->used_desc.prev);
+               ioat_set_chainaddr(ioat, desc->txd.phys);
+               ioat_start(chan);
+
+               ioat->pending = 0;
+               set_bit(IOAT_COMPLETION_PENDING, &chan->state);
+               mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+               spin_unlock_bh(&ioat->desc_lock);
+       } else if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+               unsigned long phys_complete;
+
+               spin_lock_bh(&ioat->desc_lock);
+               /* if we haven't made progress and we have already
+                * acknowledged a pending completion once, then be more
+                * forceful with a restart
+                */
+               if (ioat_cleanup_preamble(chan, &phys_complete))
+                       __cleanup(ioat, phys_complete);
+               else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+                       ioat1_reset_channel(ioat);
+               else {
+                       u64 status = ioat_chansts(chan);
+
+                       /* manually update the last completion address */
+                       if (ioat_chansts_to_addr(status) != 0)
+                               *chan->completion = status;
+
+                       set_bit(IOAT_COMPLETION_ACK, &chan->state);
+                       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+               }
+               spin_unlock_bh(&ioat->desc_lock);
+       }
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static enum dma_status
+ioat1_dma_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+                     dma_cookie_t *done, dma_cookie_t *used)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+       if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+               return DMA_SUCCESS;
+
+       ioat1_cleanup(ioat);
+
+       return ioat_is_complete(c, cookie, done, used);
+}
+
+static void ioat1_dma_start_null_desc(struct ioat_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_desc_sw *desc;
+       struct ioat_dma_descriptor *hw;
+
+       spin_lock_bh(&ioat->desc_lock);
+
+       desc = ioat1_dma_get_next_descriptor(ioat);
+
+       if (!desc) {
+               dev_err(to_dev(chan),
+                       "Unable to start null desc - get next desc failed\n");
+               spin_unlock_bh(&ioat->desc_lock);
+               return;
+       }
+
+       hw = desc->hw;
+       hw->ctl = 0;
+       hw->ctl_f.null = 1;
+       hw->ctl_f.int_en = 1;
+       hw->ctl_f.compl_write = 1;
+       /* set size to non-zero value (channel returns error when size is 0) */
+       hw->size = NULL_DESC_BUFFER_SIZE;
+       hw->src_addr = 0;
+       hw->dst_addr = 0;
+       async_tx_ack(&desc->txd);
+       hw->next = 0;
+       list_add_tail(&desc->node, &ioat->used_desc);
+       dump_desc_dbg(ioat, desc);
+
+       ioat_set_chainaddr(ioat, desc->txd.phys);
+       ioat_start(chan);
+       spin_unlock_bh(&ioat->desc_lock);
+}
+
+/*
+ * Perform an IOAT transaction to verify the HW works.
+ */
+#define IOAT_TEST_SIZE 2000
+
+static void __devinit ioat_dma_test_callback(void *dma_async_param)
+{
+       struct completion *cmp = dma_async_param;
+
+       complete(cmp);
+}
+
+/**
+ * ioat_dma_self_test - Perform an IOAT transaction to verify the HW works.
+ * @device: device to be tested
+ */
+int __devinit ioat_dma_self_test(struct ioatdma_device *device)
+{
+       int i;
+       u8 *src;
+       u8 *dest;
+       struct dma_device *dma = &device->common;
+       struct device *dev = &device->pdev->dev;
+       struct dma_chan *dma_chan;
+       struct dma_async_tx_descriptor *tx;
+       dma_addr_t dma_dest, dma_src;
+       dma_cookie_t cookie;
+       int err = 0;
+       struct completion cmp;
+       unsigned long tmo;
+       unsigned long flags;
+
+       src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
+       if (!src)
+               return -ENOMEM;
+       dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
+       if (!dest) {
+               kfree(src);
+               return -ENOMEM;
+       }
+
+       /* Fill in src buffer */
+       for (i = 0; i < IOAT_TEST_SIZE; i++)
+               src[i] = (u8)i;
+
+       /* Start copy, using first DMA channel */
+       dma_chan = container_of(dma->channels.next, struct dma_chan,
+                               device_node);
+       if (dma->device_alloc_chan_resources(dma_chan) < 1) {
+               dev_err(dev, "selftest cannot allocate chan resource\n");
+               err = -ENODEV;
+               goto out;
+       }
+
+       dma_src = dma_map_single(dev, src, IOAT_TEST_SIZE, DMA_TO_DEVICE);
+       dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE);
+       flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE |
+               DMA_PREP_INTERRUPT;
+       tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
+                                                  IOAT_TEST_SIZE, flags);
+       if (!tx) {
+               dev_err(dev, "Self-test prep failed, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       async_tx_ack(tx);
+       init_completion(&cmp);
+       tx->callback = ioat_dma_test_callback;
+       tx->callback_param = &cmp;
+       cookie = tx->tx_submit(tx);
+       if (cookie < 0) {
+               dev_err(dev, "Self-test setup failed, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       dma->device_issue_pending(dma_chan);
+
+       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+       if (tmo == 0 ||
+           dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL)
+                                       != DMA_SUCCESS) {
+               dev_err(dev, "Self-test copy timed out, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       if (memcmp(src, dest, IOAT_TEST_SIZE)) {
+               dev_err(dev, "Self-test copy failed compare, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+free_resources:
+       dma->device_free_chan_resources(dma_chan);
+out:
+       kfree(src);
+       kfree(dest);
+       return err;
+}
+
+static char ioat_interrupt_style[32] = "msix";
+module_param_string(ioat_interrupt_style, ioat_interrupt_style,
+                   sizeof(ioat_interrupt_style), 0644);
+MODULE_PARM_DESC(ioat_interrupt_style,
+                "set ioat interrupt style: msix (default), "
+                "msix-single-vector, msi, intx");
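+
+/*
+ * Usage sketch (assuming the driver is built as the "ioatdma" module):
+ * the interrupt style can be chosen at load time, e.g.
+ *
+ *     modprobe ioatdma ioat_interrupt_style=msi
+ */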
+
+/**
+ * ioat_dma_setup_interrupts - setup interrupt handler
+ * @device: ioat device
+ */
+static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
+{
+       struct ioat_chan_common *chan;
+       struct pci_dev *pdev = device->pdev;
+       struct device *dev = &pdev->dev;
+       struct msix_entry *msix;
+       int i, j, msixcnt;
+       int err = -EINVAL;
+       u8 intrctrl = 0;
+
+       if (!strcmp(ioat_interrupt_style, "msix"))
+               goto msix;
+       if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
+               goto msix_single_vector;
+       if (!strcmp(ioat_interrupt_style, "msi"))
+               goto msi;
+       if (!strcmp(ioat_interrupt_style, "intx"))
+               goto intx;
+       dev_err(dev, "invalid ioat_interrupt_style %s\n", ioat_interrupt_style);
+       goto err_no_irq;
+
+msix:
+       /* The number of MSI-X vectors should equal the number of channels */
+       msixcnt = device->common.chancnt;
+       for (i = 0; i < msixcnt; i++)
+               device->msix_entries[i].entry = i;
+
+       err = pci_enable_msix(pdev, device->msix_entries, msixcnt);
+       if (err < 0)
+               goto msi;
+       if (err > 0)
+               goto msix_single_vector;
+
+       for (i = 0; i < msixcnt; i++) {
+               msix = &device->msix_entries[i];
+               chan = ioat_chan_by_index(device, i);
+               err = devm_request_irq(dev, msix->vector,
+                                      ioat_dma_do_interrupt_msix, 0,
+                                      "ioat-msix", chan);
+               if (err) {
+                       for (j = 0; j < i; j++) {
+                               msix = &device->msix_entries[j];
+                               chan = ioat_chan_by_index(device, j);
+                               devm_free_irq(dev, msix->vector, chan);
+                       }
+                       goto msix_single_vector;
+               }
+       }
+       intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
+       goto done;
+
+msix_single_vector:
+       msix = &device->msix_entries[0];
+       msix->entry = 0;
+       err = pci_enable_msix(pdev, device->msix_entries, 1);
+       if (err)
+               goto msi;
+
+       err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0,
+                              "ioat-msix", device);
+       if (err) {
+               pci_disable_msix(pdev);
+               goto msi;
+       }
+       goto done;
+
+msi:
+       err = pci_enable_msi(pdev);
+       if (err)
+               goto intx;
+
+       err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt, 0,
+                              "ioat-msi", device);
+       if (err) {
+               pci_disable_msi(pdev);
+               goto intx;
+       }
+       goto done;
+
+intx:
+       err = devm_request_irq(dev, pdev->irq, ioat_dma_do_interrupt,
+                              IRQF_SHARED, "ioat-intx", device);
+       if (err)
+               goto err_no_irq;
+
+done:
+       if (device->intr_quirk)
+               device->intr_quirk(device);
+       intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
+       writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
+       return 0;
+
+err_no_irq:
+       /* Disable all interrupt generation */
+       writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
+       dev_err(dev, "no usable interrupts\n");
+       return err;
+}
+
+static void ioat_disable_interrupts(struct ioatdma_device *device)
+{
+       /* Disable all interrupt generation */
+       writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
+}
+
+int __devinit ioat_probe(struct ioatdma_device *device)
+{
+       int err = -ENODEV;
+       struct dma_device *dma = &device->common;
+       struct pci_dev *pdev = device->pdev;
+       struct device *dev = &pdev->dev;
+
+       /* DMA coherent memory pool for DMA descriptor allocations */
+       device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
+                                          sizeof(struct ioat_dma_descriptor),
+                                          64, 0);
+       if (!device->dma_pool) {
+               err = -ENOMEM;
+               goto err_dma_pool;
+       }
+
+       device->completion_pool = pci_pool_create("completion_pool", pdev,
+                                                 sizeof(u64), SMP_CACHE_BYTES,
+                                                 SMP_CACHE_BYTES);
+
+       if (!device->completion_pool) {
+               err = -ENOMEM;
+               goto err_completion_pool;
+       }
+
+       device->enumerate_channels(device);
+
+       dma_cap_set(DMA_MEMCPY, dma->cap_mask);
+       dma->dev = &pdev->dev;
+
+       if (!dma->chancnt) {
+               dev_err(dev, "zero channels detected\n");
+               goto err_setup_interrupts;
+       }
+
+       err = ioat_dma_setup_interrupts(device);
+       if (err)
+               goto err_setup_interrupts;
+
+       err = device->self_test(device);
+       if (err)
+               goto err_self_test;
+
+       return 0;
+
+err_self_test:
+       ioat_disable_interrupts(device);
+err_setup_interrupts:
+       pci_pool_destroy(device->completion_pool);
+err_completion_pool:
+       pci_pool_destroy(device->dma_pool);
+err_dma_pool:
+       return err;
+}
+
+int __devinit ioat_register(struct ioatdma_device *device)
+{
+       int err = dma_async_device_register(&device->common);
+
+       if (err) {
+               ioat_disable_interrupts(device);
+               pci_pool_destroy(device->completion_pool);
+               pci_pool_destroy(device->dma_pool);
+       }
+
+       return err;
+}
+
+/* ioat1_intr_quirk - fix up the DMA control register to enable/disable MSI */
+static void ioat1_intr_quirk(struct ioatdma_device *device)
+{
+       struct pci_dev *pdev = device->pdev;
+       u32 dmactrl;
+
+       pci_read_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
+       if (pdev->msi_enabled)
+               dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
+       else
+               dmactrl &= ~IOAT_PCI_DMACTRL_MSI_EN;
+       pci_write_config_dword(pdev, IOAT_PCI_DMACTRL_OFFSET, dmactrl);
+}
+
+static ssize_t ring_size_show(struct dma_chan *c, char *page)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+       return sprintf(page, "%d\n", ioat->desccount);
+}
+static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
+
+static ssize_t ring_active_show(struct dma_chan *c, char *page)
+{
+       struct ioat_dma_chan *ioat = to_ioat_chan(c);
+
+       return sprintf(page, "%d\n", ioat->active);
+}
+static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
+
+static ssize_t cap_show(struct dma_chan *c, char *page)
+{
+       struct dma_device *dma = c->device;
+
+       return sprintf(page, "copy%s%s%s%s%s%s\n",
+                      dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "",
+                      dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "",
+                      dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "",
+                      dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "",
+                      dma_has_cap(DMA_MEMSET, dma->cap_mask)  ? " fill" : "",
+                      dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : "");
+
+}
+struct ioat_sysfs_entry ioat_cap_attr = __ATTR_RO(cap);
+
+static ssize_t version_show(struct dma_chan *c, char *page)
+{
+       struct dma_device *dma = c->device;
+       struct ioatdma_device *device = to_ioatdma_device(dma);
+
+       return sprintf(page, "%d.%d\n",
+                      device->version >> 4, device->version & 0xf);
+}
+struct ioat_sysfs_entry ioat_version_attr = __ATTR_RO(version);
+
+static struct attribute *ioat1_attrs[] = {
+       &ring_size_attr.attr,
+       &ring_active_attr.attr,
+       &ioat_cap_attr.attr,
+       &ioat_version_attr.attr,
+       NULL,
+};
+
+static ssize_t
+ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+       struct ioat_sysfs_entry *entry;
+       struct ioat_chan_common *chan;
+
+       entry = container_of(attr, struct ioat_sysfs_entry, attr);
+       chan = container_of(kobj, struct ioat_chan_common, kobj);
+
+       if (!entry->show)
+               return -EIO;
+       return entry->show(&chan->common, page);
+}
+
+struct sysfs_ops ioat_sysfs_ops = {
+       .show   = ioat_attr_show,
+};
+
+static struct kobj_type ioat1_ktype = {
+       .sysfs_ops = &ioat_sysfs_ops,
+       .default_attrs = ioat1_attrs,
+};
+
+void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type)
+{
+       struct dma_device *dma = &device->common;
+       struct dma_chan *c;
+
+       list_for_each_entry(c, &dma->channels, device_node) {
+               struct ioat_chan_common *chan = to_chan_common(c);
+               struct kobject *parent = &c->dev->device.kobj;
+               int err;
+
+               err = kobject_init_and_add(&chan->kobj, type, parent, "quickdata");
+               if (err) {
+                       dev_warn(to_dev(chan),
+                                "sysfs init error (%d), continuing...\n", err);
+                       kobject_put(&chan->kobj);
+                       set_bit(IOAT_KOBJ_INIT_FAIL, &chan->state);
+               }
+       }
+}
+
+void ioat_kobject_del(struct ioatdma_device *device)
+{
+       struct dma_device *dma = &device->common;
+       struct dma_chan *c;
+
+       list_for_each_entry(c, &dma->channels, device_node) {
+               struct ioat_chan_common *chan = to_chan_common(c);
+
+               if (!test_bit(IOAT_KOBJ_INIT_FAIL, &chan->state)) {
+                       kobject_del(&chan->kobj);
+                       kobject_put(&chan->kobj);
+               }
+       }
+}
+
+int __devinit ioat1_dma_probe(struct ioatdma_device *device, int dca)
+{
+       struct pci_dev *pdev = device->pdev;
+       struct dma_device *dma;
+       int err;
+
+       device->intr_quirk = ioat1_intr_quirk;
+       device->enumerate_channels = ioat1_enumerate_channels;
+       device->self_test = ioat_dma_self_test;
+       dma = &device->common;
+       dma->device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
+       dma->device_issue_pending = ioat1_dma_memcpy_issue_pending;
+       dma->device_alloc_chan_resources = ioat1_dma_alloc_chan_resources;
+       dma->device_free_chan_resources = ioat1_dma_free_chan_resources;
+       dma->device_is_tx_complete = ioat1_dma_is_complete;
+
+       err = ioat_probe(device);
+       if (err)
+               return err;
+       ioat_set_tcp_copy_break(4096);
+       err = ioat_register(device);
+       if (err)
+               return err;
+       ioat_kobject_add(device, &ioat1_ktype);
+
+       if (dca)
+               device->dca = ioat_dca_init(pdev, device->reg_base);
+
+       return err;
+}
+
+void __devexit ioat_dma_remove(struct ioatdma_device *device)
+{
+       struct dma_device *dma = &device->common;
+
+       ioat_disable_interrupts(device);
+
+       ioat_kobject_del(device);
+
+       dma_async_device_unregister(dma);
+
+       pci_pool_destroy(device->dma_pool);
+       pci_pool_destroy(device->completion_pool);
+
+       INIT_LIST_HEAD(&dma->channels);
+}
diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h
new file mode 100644 (file)
index 0000000..c14fdfe
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_H
+#define IOATDMA_H
+
+#include <linux/dmaengine.h>
+#include "hw.h"
+#include "registers.h"
+#include <linux/init.h>
+#include <linux/dmapool.h>
+#include <linux/cache.h>
+#include <linux/pci_ids.h>
+#include <net/tcp.h>
+
+#define IOAT_DMA_VERSION  "4.00"
+
+#define IOAT_LOW_COMPLETION_MASK       0xffffffc0
+#define IOAT_DMA_DCA_ANY_CPU           ~0
+
+#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
+#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, txd)
+#define to_dev(ioat_chan) (&(ioat_chan)->device->pdev->dev)
+
+#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
+
+/*
+ * workaround for IOAT ver.3.0 null descriptor issue
+ * (channel returns error when size is 0)
+ */
+#define NULL_DESC_BUFFER_SIZE 1
+
+/**
+ * struct ioatdma_device - internal representation of an IOAT device
+ * @pdev: PCI-Express device
+ * @reg_base: MMIO register space base address
+ * @dma_pool: for allocating DMA descriptors
+ * @completion_pool: for allocating completion writeback areas
+ * @common: embedded struct dma_device
+ * @version: version of ioatdma device
+ * @msix_entries: irq handlers
+ * @idx: per channel data
+ * @dca: direct cache access context
+ * @intr_quirk: interrupt setup quirk (for ioat_v1 devices)
+ * @enumerate_channels: hw version specific channel enumeration
+ * @cleanup_tasklet: select between the v2 and v3 cleanup routines
+ * @timer_fn: select between the v2 and v3 timer watchdog routines
+ * @self_test: hardware version specific self test for each supported op type
+ *
+ * Note: the v3 cleanup routine supports raid operations
+ */
+struct ioatdma_device {
+       struct pci_dev *pdev;
+       void __iomem *reg_base;
+       struct pci_pool *dma_pool;
+       struct pci_pool *completion_pool;
+       struct dma_device common;
+       u8 version;
+       struct msix_entry msix_entries[4];
+       struct ioat_chan_common *idx[4];
+       struct dca_provider *dca;
+       void (*intr_quirk)(struct ioatdma_device *device);
+       int (*enumerate_channels)(struct ioatdma_device *device);
+       void (*cleanup_tasklet)(unsigned long data);
+       void (*timer_fn)(unsigned long data);
+       int (*self_test)(struct ioatdma_device *device);
+};
+
+struct ioat_chan_common {
+       struct dma_chan common;
+       void __iomem *reg_base;
+       unsigned long last_completion;
+       spinlock_t cleanup_lock;
+       dma_cookie_t completed_cookie;
+       unsigned long state;
+       #define IOAT_COMPLETION_PENDING 0
+       #define IOAT_COMPLETION_ACK 1
+       #define IOAT_RESET_PENDING 2
+       #define IOAT_KOBJ_INIT_FAIL 3
+       struct timer_list timer;
+       #define COMPLETION_TIMEOUT msecs_to_jiffies(100)
+       #define IDLE_TIMEOUT msecs_to_jiffies(2000)
+       #define RESET_DELAY msecs_to_jiffies(100)
+       struct ioatdma_device *device;
+       dma_addr_t completion_dma;
+       u64 *completion;
+       struct tasklet_struct cleanup_task;
+       struct kobject kobj;
+};
+
+struct ioat_sysfs_entry {
+       struct attribute attr;
+       ssize_t (*show)(struct dma_chan *, char *);
+};
+
+/**
+ * struct ioat_dma_chan - internal representation of a DMA channel
+ */
+struct ioat_dma_chan {
+       struct ioat_chan_common base;
+
+       size_t xfercap; /* XFERCAP register value expanded out */
+
+       spinlock_t desc_lock;
+       struct list_head free_desc;
+       struct list_head used_desc;
+
+       int pending;
+       u16 desccount;
+       u16 active;
+};
+
+static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c)
+{
+       return container_of(c, struct ioat_chan_common, common);
+}
+
+static inline struct ioat_dma_chan *to_ioat_chan(struct dma_chan *c)
+{
+       struct ioat_chan_common *chan = to_chan_common(c);
+
+       return container_of(chan, struct ioat_dma_chan, base);
+}
+
+/**
+ * ioat_is_complete - poll the status of an ioat transaction
+ * @c: channel handle
+ * @cookie: transaction identifier
+ * @done: if set, updated with last completed transaction
+ * @used: if set, updated with last used transaction
+ */
+static inline enum dma_status
+ioat_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+                dma_cookie_t *done, dma_cookie_t *used)
+{
+       struct ioat_chan_common *chan = to_chan_common(c);
+       dma_cookie_t last_used;
+       dma_cookie_t last_complete;
+
+       last_used = c->cookie;
+       last_complete = chan->completed_cookie;
+
+       if (done)
+               *done = last_complete;
+       if (used)
+               *used = last_used;
+
+       return dma_async_is_complete(cookie, last_complete, last_used);
+}
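+
+/*
+ * A minimal polling sketch (hypothetical helper, assuming the generic
+ * dmaengine client API): ioat_is_complete() above is what backs
+ * ->device_is_tx_complete(), which clients reach through
+ * dma_async_is_tx_complete().  The channel and cookie are assumed to
+ * come from an earlier prep/submit sequence.
+ */
+static inline void ioat_example_poll(struct dma_chan *c, dma_cookie_t cookie)
+{
+       /* busy-wait until the descriptor is reported complete */
+       while (dma_async_is_tx_complete(c, cookie, NULL, NULL) == DMA_IN_PROGRESS)
+               cpu_relax();
+}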
+
+/* wrapper around hardware descriptor format + additional software fields */
+
+/**
+ * struct ioat_desc_sw - wrapper around hardware descriptor
+ * @hw: hardware DMA descriptor (for memcpy)
+ * @node: this descriptor will either be on the free list,
+ *     or attached to a transaction list (tx_list)
+ * @len: total transaction length, used for unmapping
+ * @tx_list: hardware descriptors chained together for this transaction
+ * @txd: the generic software descriptor for all engines
+ * @id: identifier for debug
+ */
+struct ioat_desc_sw {
+       struct ioat_dma_descriptor *hw;
+       struct list_head node;
+       size_t len;
+       struct list_head tx_list;
+       struct dma_async_tx_descriptor txd;
+       #ifdef DEBUG
+       int id;
+       #endif
+};
+
+#ifdef DEBUG
+#define set_desc_id(desc, i) ((desc)->id = (i))
+#define desc_id(desc) ((desc)->id)
+#else
+#define set_desc_id(desc, i)
+#define desc_id(desc) (0)
+#endif
+
+static inline void
+__dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw,
+               struct dma_async_tx_descriptor *tx, int id)
+{
+       struct device *dev = to_dev(chan);
+
+       dev_dbg(dev, "desc[%d]: (%#llx->%#llx) cookie: %d flags: %#x"
+               " ctl: %#x (op: %d int_en: %d compl: %d)\n", id,
+               (unsigned long long) tx->phys,
+               (unsigned long long) hw->next, tx->cookie, tx->flags,
+               hw->ctl, hw->ctl_f.op, hw->ctl_f.int_en, hw->ctl_f.compl_write);
+}
+
+#define dump_desc_dbg(c, d) \
+       ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; })
+
+static inline void ioat_set_tcp_copy_break(unsigned long copybreak)
+{
+       #ifdef CONFIG_NET_DMA
+       sysctl_tcp_dma_copybreak = copybreak;
+       #endif
+}
+
+static inline struct ioat_chan_common *
+ioat_chan_by_index(struct ioatdma_device *device, int index)
+{
+       return device->idx[index];
+}
+
+static inline u64 ioat_chansts(struct ioat_chan_common *chan)
+{
+       u8 ver = chan->device->version;
+       u64 status;
+       u32 status_lo;
+
+       /* We need to read the low address first as this causes the
+        * chipset to latch the upper bits for the subsequent read
+        */
+       status_lo = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_LOW(ver));
+       status = readl(chan->reg_base + IOAT_CHANSTS_OFFSET_HIGH(ver));
+       status <<= 32;
+       status |= status_lo;
+
+       return status;
+}
+
+static inline void ioat_start(struct ioat_chan_common *chan)
+{
+       u8 ver = chan->device->version;
+
+       writeb(IOAT_CHANCMD_START, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
+}
+
+static inline u64 ioat_chansts_to_addr(u64 status)
+{
+       return status & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
+}
+
+static inline u32 ioat_chanerr(struct ioat_chan_common *chan)
+{
+       return readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+}
+
+static inline void ioat_suspend(struct ioat_chan_common *chan)
+{
+       u8 ver = chan->device->version;
+
+       writeb(IOAT_CHANCMD_SUSPEND, chan->reg_base + IOAT_CHANCMD_OFFSET(ver));
+}
+
+static inline void ioat_set_chainaddr(struct ioat_dma_chan *ioat, u64 addr)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+
+       writel(addr & 0x00000000FFFFFFFF,
+              chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
+       writel(addr >> 32,
+              chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
+}
+
+static inline bool is_ioat_active(unsigned long status)
+{
+       return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE);
+}
+
+static inline bool is_ioat_idle(unsigned long status)
+{
+       return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_DONE);
+}
+
+static inline bool is_ioat_halted(unsigned long status)
+{
+       return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED);
+}
+
+static inline bool is_ioat_suspended(unsigned long status)
+{
+       return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED);
+}
+
+/* the channel was programmed with a fatal descriptor error */
+static inline bool is_ioat_bug(unsigned long err)
+{
+       return !!(err & (IOAT_CHANERR_SRC_ADDR_ERR|IOAT_CHANERR_DEST_ADDR_ERR|
+                        IOAT_CHANERR_NEXT_ADDR_ERR|IOAT_CHANERR_CONTROL_ERR|
+                        IOAT_CHANERR_LENGTH_ERR));
+}
+
+static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len,
+                             int direction, enum dma_ctrl_flags flags, bool dst)
+{
+       if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) ||
+           (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE)))
+               pci_unmap_single(pdev, addr, len, direction);
+       else
+               pci_unmap_page(pdev, addr, len, direction);
+}
+
+int __devinit ioat_probe(struct ioatdma_device *device);
+int __devinit ioat_register(struct ioatdma_device *device);
+int __devinit ioat1_dma_probe(struct ioatdma_device *dev, int dca);
+int __devinit ioat_dma_self_test(struct ioatdma_device *device);
+void __devexit ioat_dma_remove(struct ioatdma_device *device);
+struct dca_provider * __devinit ioat_dca_init(struct pci_dev *pdev,
+                                             void __iomem *iobase);
+unsigned long ioat_get_current_completion(struct ioat_chan_common *chan);
+void ioat_init_channel(struct ioatdma_device *device,
+                      struct ioat_chan_common *chan, int idx,
+                      void (*timer_fn)(unsigned long),
+                      void (*tasklet)(unsigned long),
+                      unsigned long ioat);
+void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
+                   size_t len, struct ioat_dma_descriptor *hw);
+bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
+                          unsigned long *phys_complete);
+void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type);
+void ioat_kobject_del(struct ioatdma_device *device);
+extern struct sysfs_ops ioat_sysfs_ops;
+extern struct ioat_sysfs_entry ioat_version_attr;
+extern struct ioat_sysfs_entry ioat_cap_attr;
+#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
new file mode 100644 (file)
index 0000000..96ffab7
--- /dev/null
@@ -0,0 +1,871 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2004 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine (versions >= 2), which
+ * does asynchronous data movement and checksumming operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/i7300_idle.h>
+#include "dma.h"
+#include "dma_v2.h"
+#include "registers.h"
+#include "hw.h"
+
+int ioat_ring_alloc_order = 8;
+module_param(ioat_ring_alloc_order, int, 0644);
+MODULE_PARM_DESC(ioat_ring_alloc_order,
+                "ioat2+: allocate 2^n descriptors per channel"
+                " (default: 8 max: 16)");
+static int ioat_ring_max_alloc_order = IOAT_MAX_ORDER;
+module_param(ioat_ring_max_alloc_order, int, 0644);
+MODULE_PARM_DESC(ioat_ring_max_alloc_order,
+                "ioat2+: upper limit for ring size (default: 16)");
+
+void __ioat2_issue_pending(struct ioat2_dma_chan *ioat)
+{
+       void __iomem *reg_base = ioat->base.reg_base;
+
+       ioat->pending = 0;
+       ioat->dmacount += ioat2_ring_pending(ioat);
+       ioat->issued = ioat->head;
+       /* make descriptor updates globally visible before notifying channel */
+       wmb();
+       writew(ioat->dmacount, reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
+       dev_dbg(to_dev(&ioat->base),
+               "%s: head: %#x tail: %#x issued: %#x count: %#x\n",
+               __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
+}
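+
+/*
+ * Note on the ring accounting used above: the helpers come from dma_v2.h
+ * (not shown in this hunk).  Assuming the usual power-of-two ring
+ * convention, they reduce to modular differences of the head/tail/issued
+ * counters, roughly:
+ *
+ *     mask    = (1 << alloc_order) - 1;
+ *     pending = (head - issued) & mask;  descriptors appended but not yet
+ *                                        written to the DMACOUNT doorbell
+ *     active  = (head - tail) & mask;    descriptors not yet cleaned up
+ */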
+
+void ioat2_issue_pending(struct dma_chan *chan)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(chan);
+
+       spin_lock_bh(&ioat->ring_lock);
+       if (ioat->pending == 1)
+               __ioat2_issue_pending(ioat);
+       spin_unlock_bh(&ioat->ring_lock);
+}
+
+/**
+ * ioat2_update_pending - log pending descriptors
+ * @ioat: ioat2+ channel
+ *
+ * Set pending to '1' unless pending is already set to '2'; pending == 2
+ * indicates that submission is temporarily blocked due to an in-flight
+ * reset.  If we are already above the ioat_pending_level threshold, then
+ * just issue the pending descriptors.
+ *
+ * called with ring_lock held
+ */
+static void ioat2_update_pending(struct ioat2_dma_chan *ioat)
+{
+       if (unlikely(ioat->pending == 2))
+               return;
+       else if (ioat2_ring_pending(ioat) > ioat_pending_level)
+               __ioat2_issue_pending(ioat);
+       else
+               ioat->pending = 1;
+}
+
+static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_ring_ent *desc;
+       struct ioat_dma_descriptor *hw;
+       int idx;
+
+       if (ioat2_ring_space(ioat) < 1) {
+               dev_err(to_dev(&ioat->base),
+                       "Unable to start null desc - ring full\n");
+               return;
+       }
+
+       dev_dbg(to_dev(&ioat->base), "%s: head: %#x tail: %#x issued: %#x\n",
+               __func__, ioat->head, ioat->tail, ioat->issued);
+       idx = ioat2_desc_alloc(ioat, 1);
+       desc = ioat2_get_ring_ent(ioat, idx);
+
+       hw = desc->hw;
+       hw->ctl = 0;
+       hw->ctl_f.null = 1;
+       hw->ctl_f.int_en = 1;
+       hw->ctl_f.compl_write = 1;
+       /* set size to non-zero value (channel returns error when size is 0) */
+       hw->size = NULL_DESC_BUFFER_SIZE;
+       hw->src_addr = 0;
+       hw->dst_addr = 0;
+       async_tx_ack(&desc->txd);
+       ioat2_set_chainaddr(ioat, desc->txd.phys);
+       dump_desc_dbg(ioat, desc);
+       __ioat2_issue_pending(ioat);
+}
+
+static void ioat2_start_null_desc(struct ioat2_dma_chan *ioat)
+{
+       spin_lock_bh(&ioat->ring_lock);
+       __ioat2_start_null_desc(ioat);
+       spin_unlock_bh(&ioat->ring_lock);
+}
+
+static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct dma_async_tx_descriptor *tx;
+       struct ioat_ring_ent *desc;
+       bool seen_current = false;
+       u16 active;
+       int i;
+
+       dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
+               __func__, ioat->head, ioat->tail, ioat->issued);
+
+       active = ioat2_ring_active(ioat);
+       for (i = 0; i < active && !seen_current; i++) {
+               prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
+               desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+               tx = &desc->txd;
+               dump_desc_dbg(ioat, desc);
+               if (tx->cookie) {
+                       ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+                       chan->completed_cookie = tx->cookie;
+                       tx->cookie = 0;
+                       if (tx->callback) {
+                               tx->callback(tx->callback_param);
+                               tx->callback = NULL;
+                       }
+               }
+
+               if (tx->phys == phys_complete)
+                       seen_current = true;
+       }
+       ioat->tail += i;
+       BUG_ON(!seen_current); /* no active descs have written a completion? */
+
+       chan->last_completion = phys_complete;
+       if (ioat->head == ioat->tail) {
+               dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
+                       __func__);
+               clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+               mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+       }
+}
+
+/**
+ * ioat2_cleanup - clean finished descriptors (advance tail pointer)
+ * @chan: ioat channel to be cleaned up
+ */
+static void ioat2_cleanup(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       unsigned long phys_complete;
+
+       prefetch(chan->completion);
+
+       if (!spin_trylock_bh(&chan->cleanup_lock))
+               return;
+
+       if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       if (!spin_trylock_bh(&ioat->ring_lock)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       __cleanup(ioat, phys_complete);
+
+       spin_unlock_bh(&ioat->ring_lock);
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+void ioat2_cleanup_tasklet(unsigned long data)
+{
+       struct ioat2_dma_chan *ioat = (void *) data;
+
+       ioat2_cleanup(ioat);
+       writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+void __ioat2_restart_chan(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+
+       /* set the tail to be re-issued */
+       ioat->issued = ioat->tail;
+       ioat->dmacount = 0;
+       set_bit(IOAT_COMPLETION_PENDING, &chan->state);
+       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+       dev_dbg(to_dev(chan),
+               "%s: head: %#x tail: %#x issued: %#x count: %#x\n",
+               __func__, ioat->head, ioat->tail, ioat->issued, ioat->dmacount);
+
+       if (ioat2_ring_pending(ioat)) {
+               struct ioat_ring_ent *desc;
+
+               desc = ioat2_get_ring_ent(ioat, ioat->tail);
+               ioat2_set_chainaddr(ioat, desc->txd.phys);
+               __ioat2_issue_pending(ioat);
+       } else
+               __ioat2_start_null_desc(ioat);
+}
+
+static void ioat2_restart_channel(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       unsigned long phys_complete;
+       u32 status;
+
+       status = ioat_chansts(chan);
+       if (is_ioat_active(status) || is_ioat_idle(status))
+               ioat_suspend(chan);
+       while (is_ioat_active(status) || is_ioat_idle(status)) {
+               status = ioat_chansts(chan);
+               cpu_relax();
+       }
+
+       if (ioat_cleanup_preamble(chan, &phys_complete))
+               __cleanup(ioat, phys_complete);
+
+       __ioat2_restart_chan(ioat);
+}
+
+void ioat2_timer_event(unsigned long data)
+{
+       struct ioat2_dma_chan *ioat = (void *) data;
+       struct ioat_chan_common *chan = &ioat->base;
+
+       spin_lock_bh(&chan->cleanup_lock);
+       if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+               unsigned long phys_complete;
+               u64 status;
+
+               spin_lock_bh(&ioat->ring_lock);
+               status = ioat_chansts(chan);
+
+               /* when halted due to errors check for channel
+                * programming errors before advancing the completion state
+                */
+               if (is_ioat_halted(status)) {
+                       u32 chanerr;
+
+                       chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+                       BUG_ON(is_ioat_bug(chanerr));
+               }
+
+               /* if we haven't made progress and we have already
+                * acknowledged a pending completion once, then be more
+                * forceful with a restart
+                */
+               if (ioat_cleanup_preamble(chan, &phys_complete))
+                       __cleanup(ioat, phys_complete);
+               else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+                       ioat2_restart_channel(ioat);
+               else {
+                       set_bit(IOAT_COMPLETION_ACK, &chan->state);
+                       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+               }
+               spin_unlock_bh(&ioat->ring_lock);
+       } else {
+               u16 active;
+
+               /* if the ring is idle, empty, and oversized, try to step
+                * down the size
+                */
+               spin_lock_bh(&ioat->ring_lock);
+               active = ioat2_ring_active(ioat);
+               if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
+                       reshape_ring(ioat, ioat->alloc_order-1);
+               spin_unlock_bh(&ioat->ring_lock);
+
+               /* keep shrinking until we get back to our minimum
+                * default size
+                */
+               if (ioat->alloc_order > ioat_get_alloc_order())
+                       mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+       }
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+/**
+ * ioat2_enumerate_channels - find and initialize the device's channels
+ * @device: the device to be enumerated
+ */
+int ioat2_enumerate_channels(struct ioatdma_device *device)
+{
+       struct ioat2_dma_chan *ioat;
+       struct device *dev = &device->pdev->dev;
+       struct dma_device *dma = &device->common;
+       u8 xfercap_log;
+       int i;
+
+       INIT_LIST_HEAD(&dma->channels);
+       dma->chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
+       dma->chancnt &= 0x1f; /* bits [4:0] valid */
+       if (dma->chancnt > ARRAY_SIZE(device->idx)) {
+               dev_warn(dev, "(%d) exceeds max supported channels (%zu)\n",
+                        dma->chancnt, ARRAY_SIZE(device->idx));
+               dma->chancnt = ARRAY_SIZE(device->idx);
+       }
+       xfercap_log = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
+       xfercap_log &= 0x1f; /* bits [4:0] valid */
+       if (xfercap_log == 0)
+               return 0;
+       dev_dbg(dev, "%s: xfercap = %d\n", __func__, 1 << xfercap_log);
+
+       /* FIXME which i/oat version is i7300? */
+#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL
+       if (i7300_idle_platform_probe(NULL, NULL, 1) == 0)
+               dma->chancnt--;
+#endif
+       for (i = 0; i < dma->chancnt; i++) {
+               ioat = devm_kzalloc(dev, sizeof(*ioat), GFP_KERNEL);
+               if (!ioat)
+                       break;
+
+               ioat_init_channel(device, &ioat->base, i,
+                                 device->timer_fn,
+                                 device->cleanup_tasklet,
+                                 (unsigned long) ioat);
+               ioat->xfercap_log = xfercap_log;
+               spin_lock_init(&ioat->ring_lock);
+       }
+       dma->chancnt = i;
+       return i;
+}
+
+static dma_cookie_t ioat2_tx_submit_unlock(struct dma_async_tx_descriptor *tx)
+{
+       struct dma_chan *c = tx->chan;
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       dma_cookie_t cookie = c->cookie;
+
+       cookie++;
+       if (cookie < 0)
+               cookie = 1;
+       tx->cookie = cookie;
+       c->cookie = cookie;
+       dev_dbg(to_dev(&ioat->base), "%s: cookie: %d\n", __func__, cookie);
+
+       if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state))
+               mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+       ioat2_update_pending(ioat);
+       spin_unlock_bh(&ioat->ring_lock);
+
+       return cookie;
+}
+
+static struct ioat_ring_ent *ioat2_alloc_ring_ent(struct dma_chan *chan, gfp_t flags)
+{
+       struct ioat_dma_descriptor *hw;
+       struct ioat_ring_ent *desc;
+       struct ioatdma_device *dma;
+       dma_addr_t phys;
+
+       dma = to_ioatdma_device(chan->device);
+       hw = pci_pool_alloc(dma->dma_pool, flags, &phys);
+       if (!hw)
+               return NULL;
+       memset(hw, 0, sizeof(*hw));
+
+       desc = kmem_cache_alloc(ioat2_cache, flags);
+       if (!desc) {
+               pci_pool_free(dma->dma_pool, hw, phys);
+               return NULL;
+       }
+       memset(desc, 0, sizeof(*desc));
+
+       dma_async_tx_descriptor_init(&desc->txd, chan);
+       desc->txd.tx_submit = ioat2_tx_submit_unlock;
+       desc->hw = hw;
+       desc->txd.phys = phys;
+       return desc;
+}
+
+static void ioat2_free_ring_ent(struct ioat_ring_ent *desc, struct dma_chan *chan)
+{
+       struct ioatdma_device *dma;
+
+       dma = to_ioatdma_device(chan->device);
+       pci_pool_free(dma->dma_pool, desc->hw, desc->txd.phys);
+       kmem_cache_free(ioat2_cache, desc);
+}
+
+static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gfp_t flags)
+{
+       struct ioat_ring_ent **ring;
+       int descs = 1 << order;
+       int i;
+
+       if (order > ioat_get_max_alloc_order())
+               return NULL;
+
+       /* allocate the array to hold the software ring */
+       ring = kcalloc(descs, sizeof(*ring), flags);
+       if (!ring)
+               return NULL;
+       for (i = 0; i < descs; i++) {
+               ring[i] = ioat2_alloc_ring_ent(c, flags);
+               if (!ring[i]) {
+                       while (i--)
+                               ioat2_free_ring_ent(ring[i], c);
+                       kfree(ring);
+                       return NULL;
+               }
+               set_desc_id(ring[i], i);
+       }
+
+       /* link descs */
+       for (i = 0; i < descs-1; i++) {
+               struct ioat_ring_ent *next = ring[i+1];
+               struct ioat_dma_descriptor *hw = ring[i]->hw;
+
+               hw->next = next->txd.phys;
+       }
+       ring[i]->hw->next = ring[0]->txd.phys;
+
+       return ring;
+}
+
+/**
+ * ioat2_alloc_chan_resources - allocate/initialize ioat2 descriptor ring
+ * @chan: channel to be initialized
+ */
+int ioat2_alloc_chan_resources(struct dma_chan *c)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_ring_ent **ring;
+       u32 chanerr;
+       int order;
+
+       /* have we already been set up? */
+       if (ioat->ring)
+               return 1 << ioat->alloc_order;
+
+       /* Setup register to interrupt and write completion status on error */
+       writew(IOAT_CHANCTRL_RUN, chan->reg_base + IOAT_CHANCTRL_OFFSET);
+
+       chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+       if (chanerr) {
+               dev_err(to_dev(chan), "CHANERR = %x, clearing\n", chanerr);
+               writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
+       }
+
+       /* allocate a completion writeback area */
+       /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
+       chan->completion = pci_pool_alloc(chan->device->completion_pool,
+                                         GFP_KERNEL, &chan->completion_dma);
+       if (!chan->completion)
+               return -ENOMEM;
+
+       memset(chan->completion, 0, sizeof(*chan->completion));
+       writel(((u64) chan->completion_dma) & 0x00000000FFFFFFFF,
+              chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
+       writel(((u64) chan->completion_dma) >> 32,
+              chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
+
+       order = ioat_get_alloc_order();
+       ring = ioat2_alloc_ring(c, order, GFP_KERNEL);
+       if (!ring)
+               return -ENOMEM;
+
+       spin_lock_bh(&ioat->ring_lock);
+       ioat->ring = ring;
+       ioat->head = 0;
+       ioat->issued = 0;
+       ioat->tail = 0;
+       ioat->pending = 0;
+       ioat->alloc_order = order;
+       spin_unlock_bh(&ioat->ring_lock);
+
+       tasklet_enable(&chan->cleanup_task);
+       ioat2_start_null_desc(ioat);
+
+       return 1 << ioat->alloc_order;
+}
+
+bool reshape_ring(struct ioat2_dma_chan *ioat, int order)
+{
+       /* reshape differs from normal ring allocation in that we want
+        * to allocate a new software ring while only
+        * extending/truncating the hardware ring
+        */
+       struct ioat_chan_common *chan = &ioat->base;
+       struct dma_chan *c = &chan->common;
+       const u16 curr_size = ioat2_ring_mask(ioat) + 1;
+       const u16 active = ioat2_ring_active(ioat);
+       const u16 new_size = 1 << order;
+       struct ioat_ring_ent **ring;
+       u16 i;
+
+       if (order > ioat_get_max_alloc_order())
+               return false;
+
+       /* double check that we have at least 1 free descriptor */
+       if (active == curr_size)
+               return false;
+
+       /* when shrinking, verify that we can hold the current active
+        * set in the new ring
+        */
+       if (active >= new_size)
+               return false;
+
+       /* allocate the array to hold the software ring */
+       ring = kcalloc(new_size, sizeof(*ring), GFP_NOWAIT);
+       if (!ring)
+               return false;
+
+       /* allocate/trim descriptors as needed */
+       if (new_size > curr_size) {
+               /* copy current descriptors to the new ring */
+               for (i = 0; i < curr_size; i++) {
+                       u16 curr_idx = (ioat->tail+i) & (curr_size-1);
+                       u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+                       ring[new_idx] = ioat->ring[curr_idx];
+                       set_desc_id(ring[new_idx], new_idx);
+               }
+
+               /* add new descriptors to the ring */
+               for (i = curr_size; i < new_size; i++) {
+                       u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+                       ring[new_idx] = ioat2_alloc_ring_ent(c, GFP_NOWAIT);
+                       if (!ring[new_idx]) {
+                               while (i--) {
+                                       u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+                                       ioat2_free_ring_ent(ring[new_idx], c);
+                               }
+                               kfree(ring);
+                               return false;
+                       }
+                       set_desc_id(ring[new_idx], new_idx);
+               }
+
+               /* hw link new descriptors */
+               for (i = curr_size-1; i < new_size; i++) {
+                       u16 new_idx = (ioat->tail+i) & (new_size-1);
+                       struct ioat_ring_ent *next = ring[(new_idx+1) & (new_size-1)];
+                       struct ioat_dma_descriptor *hw = ring[new_idx]->hw;
+
+                       hw->next = next->txd.phys;
+               }
+       } else {
+               struct ioat_dma_descriptor *hw;
+               struct ioat_ring_ent *next;
+
+               /* copy current descriptors to the new ring, dropping the
+                * removed descriptors
+                */
+               for (i = 0; i < new_size; i++) {
+                       u16 curr_idx = (ioat->tail+i) & (curr_size-1);
+                       u16 new_idx = (ioat->tail+i) & (new_size-1);
+
+                       ring[new_idx] = ioat->ring[curr_idx];
+                       set_desc_id(ring[new_idx], new_idx);
+               }
+
+               /* free deleted descriptors */
+               for (i = new_size; i < curr_size; i++) {
+                       struct ioat_ring_ent *ent;
+
+                       ent = ioat2_get_ring_ent(ioat, ioat->tail+i);
+                       ioat2_free_ring_ent(ent, c);
+               }
+
+               /* fix up hardware ring */
+               hw = ring[(ioat->tail+new_size-1) & (new_size-1)]->hw;
+               next = ring[(ioat->tail+new_size) & (new_size-1)];
+               hw->next = next->txd.phys;
+       }
+
+       dev_dbg(to_dev(chan), "%s: allocated %d descriptors\n",
+               __func__, new_size);
+
+       kfree(ioat->ring);
+       ioat->ring = ring;
+       ioat->alloc_order = order;
+
+       return true;
+}
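+
+/*
+ * Worked example of the grow path above (numbers assumed): growing from
+ * order 2 to order 3 with tail == 6, the four live descriptors at old
+ * slots {2,3,0,1} are copied to new slots {6,7,0,1}, fresh descriptors
+ * fill slots {2,3,4,5}, and the relink loop starts at the old last entry
+ * so the hardware chain stays one circle: 6->7->0->1->2->3->4->5->6.
+ */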
+
+/**
+ * ioat2_alloc_and_lock - common descriptor alloc boilerplate for ioat2,3 ops
+ * @idx: gets starting descriptor index on successful allocation
+ * @ioat: ioat2,3 channel (ring) to operate on
+ * @num_descs: allocation length
+ */
+int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+
+       spin_lock_bh(&ioat->ring_lock);
+       /* never allow the last descriptor to be consumed; we need at
+        * least one free at all times to allow for on-the-fly ring
+        * resizing.
+        */
+       while (unlikely(ioat2_ring_space(ioat) <= num_descs)) {
+               if (reshape_ring(ioat, ioat->alloc_order + 1) &&
+                   ioat2_ring_space(ioat) > num_descs)
+                               break;
+
+               if (printk_ratelimit())
+                       dev_dbg(to_dev(chan),
+                               "%s: ring full! num_descs: %d (%x:%x:%x)\n",
+                               __func__, num_descs, ioat->head, ioat->tail,
+                               ioat->issued);
+               spin_unlock_bh(&ioat->ring_lock);
+
+               /* make reclaim progress in the allocation failure case;
+                * we may be called with bottom halves disabled, so trigger
+                * the timer event directly
+                */
+               spin_lock_bh(&chan->cleanup_lock);
+               if (jiffies > chan->timer.expires &&
+                   timer_pending(&chan->timer)) {
+                       struct ioatdma_device *device = chan->device;
+
+                       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+                       spin_unlock_bh(&chan->cleanup_lock);
+                       device->timer_fn((unsigned long) ioat);
+               } else
+                       spin_unlock_bh(&chan->cleanup_lock);
+               return -ENOMEM;
+       }
+
+       dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n",
+               __func__, num_descs, ioat->head, ioat->tail, ioat->issued);
+
+       *idx = ioat2_desc_alloc(ioat, num_descs);
+       return 0;  /* with ioat->ring_lock held */
+}
+
+struct dma_async_tx_descriptor *
+ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
+                          dma_addr_t dma_src, size_t len, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_dma_descriptor *hw;
+       struct ioat_ring_ent *desc;
+       dma_addr_t dst = dma_dest;
+       dma_addr_t src = dma_src;
+       size_t total_len = len;
+       int num_descs;
+       u16 idx;
+       int i;
+
+       num_descs = ioat2_xferlen_to_descs(ioat, len);
+       if (likely(num_descs) &&
+           ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
+               /* pass */;
+       else
+               return NULL;
+       i = 0;
+       do {
+               size_t copy = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+               desc = ioat2_get_ring_ent(ioat, idx + i);
+               hw = desc->hw;
+
+               hw->size = copy;
+               hw->ctl = 0;
+               hw->src_addr = src;
+               hw->dst_addr = dst;
+
+               len -= copy;
+               dst += copy;
+               src += copy;
+               dump_desc_dbg(ioat, desc);
+       } while (++i < num_descs);
+
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+       hw->ctl_f.compl_write = 1;
+       dump_desc_dbg(ioat, desc);
+       /* we leave the channel locked to ensure in order submission */
+
+       return &desc->txd;
+}
+
+/**
+ * ioat2_free_chan_resources - release all the descriptors
+ * @chan: the channel to be cleaned
+ */
+void ioat2_free_chan_resources(struct dma_chan *c)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioatdma_device *device = chan->device;
+       struct ioat_ring_ent *desc;
+       const u16 total_descs = 1 << ioat->alloc_order;
+       int descs;
+       int i;
+
+       /* Before freeing channel resources, first check
+        * if they have been previously allocated for this channel.
+        */
+       if (!ioat->ring)
+               return;
+
+       tasklet_disable(&chan->cleanup_task);
+       del_timer_sync(&chan->timer);
+       device->cleanup_tasklet((unsigned long) ioat);
+
+       /* Delay 100ms after reset to allow internal DMA logic to quiesce
+        * before removing DMA descriptor resources.
+        */
+       writeb(IOAT_CHANCMD_RESET,
+              chan->reg_base + IOAT_CHANCMD_OFFSET(chan->device->version));
+       mdelay(100);
+
+       spin_lock_bh(&ioat->ring_lock);
+       descs = ioat2_ring_space(ioat);
+       dev_dbg(to_dev(chan), "freeing %d idle descriptors\n", descs);
+       for (i = 0; i < descs; i++) {
+               desc = ioat2_get_ring_ent(ioat, ioat->head + i);
+               ioat2_free_ring_ent(desc, c);
+       }
+
+       if (descs < total_descs)
+               dev_err(to_dev(chan), "Freeing %d in use descriptors!\n",
+                       total_descs - descs);
+
+       for (i = 0; i < total_descs - descs; i++) {
+               desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+               dump_desc_dbg(ioat, desc);
+               ioat2_free_ring_ent(desc, c);
+       }
+
+       kfree(ioat->ring);
+       ioat->ring = NULL;
+       ioat->alloc_order = 0;
+       pci_pool_free(device->completion_pool, chan->completion,
+                     chan->completion_dma);
+       spin_unlock_bh(&ioat->ring_lock);
+
+       chan->last_completion = 0;
+       chan->completion_dma = 0;
+       ioat->pending = 0;
+       ioat->dmacount = 0;
+}
+
+enum dma_status
+ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+                    dma_cookie_t *done, dma_cookie_t *used)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioatdma_device *device = ioat->base.device;
+
+       if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+               return DMA_SUCCESS;
+
+       device->cleanup_tasklet((unsigned long) ioat);
+
+       return ioat_is_complete(c, cookie, done, used);
+}
+
+static ssize_t ring_size_show(struct dma_chan *c, char *page)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+       return sprintf(page, "%d\n", (1 << ioat->alloc_order) & ~1);
+}
+static struct ioat_sysfs_entry ring_size_attr = __ATTR_RO(ring_size);
+
+static ssize_t ring_active_show(struct dma_chan *c, char *page)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+       /* ...taken outside the lock, no need to be precise */
+       return sprintf(page, "%d\n", ioat2_ring_active(ioat));
+}
+static struct ioat_sysfs_entry ring_active_attr = __ATTR_RO(ring_active);
+
+static struct attribute *ioat2_attrs[] = {
+       &ring_size_attr.attr,
+       &ring_active_attr.attr,
+       &ioat_cap_attr.attr,
+       &ioat_version_attr.attr,
+       NULL,
+};
+
+struct kobj_type ioat2_ktype = {
+       .sysfs_ops = &ioat_sysfs_ops,
+       .default_attrs = ioat2_attrs,
+};
+
+int __devinit ioat2_dma_probe(struct ioatdma_device *device, int dca)
+{
+       struct pci_dev *pdev = device->pdev;
+       struct dma_device *dma;
+       struct dma_chan *c;
+       struct ioat_chan_common *chan;
+       int err;
+
+       device->enumerate_channels = ioat2_enumerate_channels;
+       device->cleanup_tasklet = ioat2_cleanup_tasklet;
+       device->timer_fn = ioat2_timer_event;
+       device->self_test = ioat_dma_self_test;
+       dma = &device->common;
+       dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
+       dma->device_issue_pending = ioat2_issue_pending;
+       dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
+       dma->device_free_chan_resources = ioat2_free_chan_resources;
+       dma->device_is_tx_complete = ioat2_is_complete;
+
+       err = ioat_probe(device);
+       if (err)
+               return err;
+       ioat_set_tcp_copy_break(2048);
+
+       list_for_each_entry(c, &dma->channels, device_node) {
+               chan = to_chan_common(c);
+               writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | IOAT_DMA_DCA_ANY_CPU,
+                      chan->reg_base + IOAT_DCACTRL_OFFSET);
+       }
+
+       err = ioat_register(device);
+       if (err)
+               return err;
+
+       ioat_kobject_add(device, &ioat2_ktype);
+
+       if (dca)
+               device->dca = ioat2_dca_init(pdev, device->reg_base);
+
+       return err;
+}
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h
new file mode 100644 (file)
index 0000000..1d849ef
--- /dev/null
@@ -0,0 +1,190 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_V2_H
+#define IOATDMA_V2_H
+
+#include <linux/dmaengine.h>
+#include "dma.h"
+#include "hw.h"
+
+
+extern int ioat_pending_level;
+extern int ioat_ring_alloc_order;
+
+/*
+ * workaround for IOAT ver.3.0 null descriptor issue
+ * (channel returns error when size is 0)
+ */
+#define NULL_DESC_BUFFER_SIZE 1
+
+#define IOAT_MAX_ORDER 16
+#define ioat_get_alloc_order() \
+       (min(ioat_ring_alloc_order, IOAT_MAX_ORDER))
+#define ioat_get_max_alloc_order() \
+       (min(ioat_ring_max_alloc_order, IOAT_MAX_ORDER))
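+
+/* Ring sizes are powers of two: e.g. (illustrative) an alloc_order of 4 gives
+ * a 16-entry ring, and a channel's ring may be grown at runtime up to
+ * 1 << IOAT_MAX_ORDER (65536) entries, as bounded by the two macros above.
+ */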
+
+/**
+ * struct ioat2_dma_chan - ioat v2 / v3 channel attributes
+ * @base: common ioat channel parameters
+ * @xfercap_log: log2 of channel max transfer length (for fast division)
+ * @head: allocated index
+ * @issued: hardware notification point
+ * @tail: cleanup index
+ * @pending: lock free indicator for issued != head
+ * @dmacount: identical to 'head' except for occasionally resetting to zero
+ * @alloc_order: log2 of the number of allocated descriptors
+ * @ring: software ring buffer implementation of hardware ring
+ * @ring_lock: protects ring attributes
+ */
+struct ioat2_dma_chan {
+       struct ioat_chan_common base;
+       size_t xfercap_log;
+       u16 head;
+       u16 issued;
+       u16 tail;
+       u16 dmacount;
+       u16 alloc_order;
+       int pending;
+       struct ioat_ring_ent **ring;
+       spinlock_t ring_lock;
+};
+
+static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c)
+{
+       struct ioat_chan_common *chan = to_chan_common(c);
+
+       return container_of(chan, struct ioat2_dma_chan, base);
+}
+
+static inline u16 ioat2_ring_mask(struct ioat2_dma_chan *ioat)
+{
+       return (1 << ioat->alloc_order) - 1;
+}
+
+/* count of descriptors in flight with the engine */
+static inline u16 ioat2_ring_active(struct ioat2_dma_chan *ioat)
+{
+       return (ioat->head - ioat->tail) & ioat2_ring_mask(ioat);
+}
+
+/* count of descriptors pending submission to hardware */
+static inline u16 ioat2_ring_pending(struct ioat2_dma_chan *ioat)
+{
+       return (ioat->head - ioat->issued) & ioat2_ring_mask(ioat);
+}
+
+static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat)
+{
+       u16 num_descs = ioat2_ring_mask(ioat) + 1;
+       u16 active = ioat2_ring_active(ioat);
+
+       BUG_ON(active > num_descs);
+
+       return num_descs - active;
+}
+
+/* assumes caller already checked space */
+static inline u16 ioat2_desc_alloc(struct ioat2_dma_chan *ioat, u16 len)
+{
+       ioat->head += len;
+       return ioat->head - len;
+}
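+
+/* worked example for the ring helpers above (illustrative values): with
+ * alloc_order = 4 the mask is 0xf; if head = 0x0012 and tail = 0x0007 then
+ * ioat2_ring_active() returns (0x12 - 0x07) & 0xf = 11 and ioat2_ring_space()
+ * returns 16 - 11 = 5.  The u16 indices are free to wrap; only the masked
+ * difference matters.
+ */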
+
+static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len)
+{
+       u16 num_descs = len >> ioat->xfercap_log;
+
+       num_descs += !!(len & ((1 << ioat->xfercap_log) - 1));
+       return num_descs;
+}
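+
+/* e.g. (illustrative) with xfercap_log = 20 (1MB per descriptor) a 2MB + 4KB
+ * transfer needs (len >> 20) = 2 full descriptors plus one for the 4KB
+ * remainder, so ioat2_xferlen_to_descs() returns 3
+ */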
+
+/**
+ * struct ioat_ring_ent - wrapper around hardware descriptor
+ * @hw: hardware DMA descriptor (for memcpy)
+ * @fill: hardware fill descriptor
+ * @xor: hardware xor descriptor
+ * @xor_ex: hardware xor extension descriptor
+ * @pq: hardware pq descriptor
+ * @pq_ex: hardware pq extension descriptor
+ * @pqu: hardware pq update descriptor
+ * @raw: hardware raw (un-typed) descriptor
+ * @txd: the generic software descriptor for all engines
+ * @len: total transaction length for unmap
+ * @result: asynchronous result of validate operations
+ * @id: identifier for debug
+ */
+
+struct ioat_ring_ent {
+       union {
+               struct ioat_dma_descriptor *hw;
+               struct ioat_fill_descriptor *fill;
+               struct ioat_xor_descriptor *xor;
+               struct ioat_xor_ext_descriptor *xor_ex;
+               struct ioat_pq_descriptor *pq;
+               struct ioat_pq_ext_descriptor *pq_ex;
+               struct ioat_pq_update_descriptor *pqu;
+               struct ioat_raw_descriptor *raw;
+       };
+       size_t len;
+       struct dma_async_tx_descriptor txd;
+       enum sum_check_flags *result;
+       #ifdef DEBUG
+       int id;
+       #endif
+};
+
+static inline struct ioat_ring_ent *
+ioat2_get_ring_ent(struct ioat2_dma_chan *ioat, u16 idx)
+{
+       return ioat->ring[idx & ioat2_ring_mask(ioat)];
+}
+
+static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+
+       writel(addr & 0x00000000FFFFFFFF,
+              chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
+       writel(addr >> 32,
+              chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
+}
+
+int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca);
+int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca);
+struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
+struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
+int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs);
+int ioat2_enumerate_channels(struct ioatdma_device *device);
+struct dma_async_tx_descriptor *
+ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest,
+                          dma_addr_t dma_src, size_t len, unsigned long flags);
+void ioat2_issue_pending(struct dma_chan *chan);
+int ioat2_alloc_chan_resources(struct dma_chan *c);
+void ioat2_free_chan_resources(struct dma_chan *c);
+enum dma_status ioat2_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+                                 dma_cookie_t *done, dma_cookie_t *used);
+void __ioat2_restart_chan(struct ioat2_dma_chan *ioat);
+bool reshape_ring(struct ioat2_dma_chan *ioat, int order);
+void __ioat2_issue_pending(struct ioat2_dma_chan *ioat);
+void ioat2_cleanup_tasklet(unsigned long data);
+void ioat2_timer_event(unsigned long data);
+extern struct kobj_type ioat2_ktype;
+extern struct kmem_cache *ioat2_cache;
+#endif /* IOATDMA_V2_H */
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
new file mode 100644 (file)
index 0000000..35d1e33
--- /dev/null
@@ -0,0 +1,1223 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2004-2009 Intel Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Support routines for v3+ hardware
+ */
+
+#include <linux/pci.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include "registers.h"
+#include "hw.h"
+#include "dma.h"
+#include "dma_v2.h"
+
+/* ioat hardware assumes at least two sources for raid operations */
+#define src_cnt_to_sw(x) ((x) + 2)
+#define src_cnt_to_hw(x) ((x) - 2)
+
+/* provide a lookup table for setting the source address in the base or
+ * extended descriptor of an xor or pq descriptor
+ */
+static const u8 xor_idx_to_desc __read_mostly = 0xe0;
+static const u8 xor_idx_to_field[] __read_mostly = { 1, 4, 5, 6, 7, 0, 1, 2 };
+static const u8 pq_idx_to_desc __read_mostly = 0xf8;
+static const u8 pq_idx_to_field[] __read_mostly = { 1, 4, 5, 0, 1, 2, 4, 5 };
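+
+/* the *_idx_to_desc bytes are bitmaps selecting the base (bit clear) or
+ * extended (bit set) descriptor for a given source index, and *_idx_to_field
+ * names the raw 64-bit field within that descriptor.  For example
+ * (illustrative) xor source index 6 lands in the extended descriptor at
+ * field xor_idx_to_field[6] = 1, while pq source index 2 stays in the base
+ * descriptor at field pq_idx_to_field[2] = 5
+ */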
+
+static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx)
+{
+       struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
+
+       return raw->field[xor_idx_to_field[idx]];
+}
+
+static void xor_set_src(struct ioat_raw_descriptor *descs[2],
+                       dma_addr_t addr, u32 offset, int idx)
+{
+       struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
+
+       raw->field[xor_idx_to_field[idx]] = addr + offset;
+}
+
+static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx)
+{
+       struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
+
+       return raw->field[pq_idx_to_field[idx]];
+}
+
+static void pq_set_src(struct ioat_raw_descriptor *descs[2],
+                      dma_addr_t addr, u32 offset, u8 coef, int idx)
+{
+       struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *) descs[0];
+       struct ioat_raw_descriptor *raw = descs[pq_idx_to_desc >> idx & 1];
+
+       raw->field[pq_idx_to_field[idx]] = addr + offset;
+       pq->coef[idx] = coef;
+}
+
+static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
+                           struct ioat_ring_ent *desc, int idx)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct pci_dev *pdev = chan->device->pdev;
+       size_t len = desc->len;
+       size_t offset = len - desc->hw->size;
+       struct dma_async_tx_descriptor *tx = &desc->txd;
+       enum dma_ctrl_flags flags = tx->flags;
+
+       switch (desc->hw->ctl_f.op) {
+       case IOAT_OP_COPY:
+               if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */
+                       ioat_dma_unmap(chan, flags, len, desc->hw);
+               break;
+       case IOAT_OP_FILL: {
+               struct ioat_fill_descriptor *hw = desc->fill;
+
+               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+                       ioat_unmap(pdev, hw->dst_addr - offset, len,
+                                  PCI_DMA_FROMDEVICE, flags, 1);
+               break;
+       }
+       case IOAT_OP_XOR_VAL:
+       case IOAT_OP_XOR: {
+               struct ioat_xor_descriptor *xor = desc->xor;
+               struct ioat_ring_ent *ext;
+               struct ioat_xor_ext_descriptor *xor_ex = NULL;
+               int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt);
+               struct ioat_raw_descriptor *descs[2];
+               int i;
+
+               if (src_cnt > 5) {
+                       ext = ioat2_get_ring_ent(ioat, idx + 1);
+                       xor_ex = ext->xor_ex;
+               }
+
+               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+                       descs[0] = (struct ioat_raw_descriptor *) xor;
+                       descs[1] = (struct ioat_raw_descriptor *) xor_ex;
+                       for (i = 0; i < src_cnt; i++) {
+                               dma_addr_t src = xor_get_src(descs, i);
+
+                               ioat_unmap(pdev, src - offset, len,
+                                          PCI_DMA_TODEVICE, flags, 0);
+                       }
+
+                       /* dest is a source in xor validate operations */
+                       if (xor->ctl_f.op == IOAT_OP_XOR_VAL) {
+                               ioat_unmap(pdev, xor->dst_addr - offset, len,
+                                          PCI_DMA_TODEVICE, flags, 1);
+                               break;
+                       }
+               }
+
+               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
+                       ioat_unmap(pdev, xor->dst_addr - offset, len,
+                                  PCI_DMA_FROMDEVICE, flags, 1);
+               break;
+       }
+       case IOAT_OP_PQ_VAL:
+       case IOAT_OP_PQ: {
+               struct ioat_pq_descriptor *pq = desc->pq;
+               struct ioat_ring_ent *ext;
+               struct ioat_pq_ext_descriptor *pq_ex = NULL;
+               int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
+               struct ioat_raw_descriptor *descs[2];
+               int i;
+
+               if (src_cnt > 3) {
+                       ext = ioat2_get_ring_ent(ioat, idx + 1);
+                       pq_ex = ext->pq_ex;
+               }
+
+               /* in the 'continue' case don't unmap the dests as sources */
+               if (dmaf_p_disabled_continue(flags))
+                       src_cnt--;
+               else if (dmaf_continue(flags))
+                       src_cnt -= 3;
+
+               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+                       descs[0] = (struct ioat_raw_descriptor *) pq;
+                       descs[1] = (struct ioat_raw_descriptor *) pq_ex;
+                       for (i = 0; i < src_cnt; i++) {
+                               dma_addr_t src = pq_get_src(descs, i);
+
+                               ioat_unmap(pdev, src - offset, len,
+                                          PCI_DMA_TODEVICE, flags, 0);
+                       }
+
+                       /* the dests are sources in pq validate operations */
+                       if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
+                               if (!(flags & DMA_PREP_PQ_DISABLE_P))
+                                       ioat_unmap(pdev, pq->p_addr - offset,
+                                                  len, PCI_DMA_TODEVICE, flags, 0);
+                               if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+                                       ioat_unmap(pdev, pq->q_addr - offset,
+                                                  len, PCI_DMA_TODEVICE, flags, 0);
+                               break;
+                       }
+               }
+
+               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+                       if (!(flags & DMA_PREP_PQ_DISABLE_P))
+                               ioat_unmap(pdev, pq->p_addr - offset, len,
+                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
+                       if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+                               ioat_unmap(pdev, pq->q_addr - offset, len,
+                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
+               }
+               break;
+       }
+       default:
+               dev_err(&pdev->dev, "%s: unknown op type: %#x\n",
+                       __func__, desc->hw->ctl_f.op);
+       }
+}
+
+static bool desc_has_ext(struct ioat_ring_ent *desc)
+{
+       struct ioat_dma_descriptor *hw = desc->hw;
+
+       if (hw->ctl_f.op == IOAT_OP_XOR ||
+           hw->ctl_f.op == IOAT_OP_XOR_VAL) {
+               struct ioat_xor_descriptor *xor = desc->xor;
+
+               if (src_cnt_to_sw(xor->ctl_f.src_cnt) > 5)
+                       return true;
+       } else if (hw->ctl_f.op == IOAT_OP_PQ ||
+                  hw->ctl_f.op == IOAT_OP_PQ_VAL) {
+               struct ioat_pq_descriptor *pq = desc->pq;
+
+               if (src_cnt_to_sw(pq->ctl_f.src_cnt) > 3)
+                       return true;
+       }
+
+       return false;
+}
+
+/**
+ * __cleanup - reclaim used descriptors
+ * @ioat: channel (ring) to clean
+ *
+ * The difference from the dma_v2.c __cleanup() is that this routine
+ * handles extended descriptors and dma-unmapping raid operations.
+ */
+static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_ring_ent *desc;
+       bool seen_current = false;
+       u16 active;
+       int i;
+
+       dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
+               __func__, ioat->head, ioat->tail, ioat->issued);
+
+       active = ioat2_ring_active(ioat);
+       for (i = 0; i < active && !seen_current; i++) {
+               struct dma_async_tx_descriptor *tx;
+
+               prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1));
+               desc = ioat2_get_ring_ent(ioat, ioat->tail + i);
+               dump_desc_dbg(ioat, desc);
+               tx = &desc->txd;
+               if (tx->cookie) {
+                       chan->completed_cookie = tx->cookie;
+                       ioat3_dma_unmap(ioat, desc, ioat->tail + i);
+                       tx->cookie = 0;
+                       if (tx->callback) {
+                               tx->callback(tx->callback_param);
+                               tx->callback = NULL;
+                       }
+               }
+
+               if (tx->phys == phys_complete)
+                       seen_current = true;
+
+               /* skip extended descriptors */
+               if (desc_has_ext(desc)) {
+                       BUG_ON(i + 1 >= active);
+                       i++;
+               }
+       }
+       ioat->tail += i;
+       BUG_ON(!seen_current); /* no active descs have written a completion? */
+       chan->last_completion = phys_complete;
+       if (ioat->head == ioat->tail) {
+               dev_dbg(to_dev(chan), "%s: cancel completion timeout\n",
+                       __func__);
+               clear_bit(IOAT_COMPLETION_PENDING, &chan->state);
+               mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+       }
+}
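+
+/* e.g. (illustrative) with tail = 2 and 5 active descriptors the loop above
+ * walks ring entries 2..6, stopping once the completion address reported by
+ * the hardware is seen; a raid descriptor that required an extension
+ * (desc_has_ext()) occupies two ring slots, so the walk skips the extension
+ * entry rather than treating it as a separate transaction
+ */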
+
+static void ioat3_cleanup(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       unsigned long phys_complete;
+
+       prefetch(chan->completion);
+
+       if (!spin_trylock_bh(&chan->cleanup_lock))
+               return;
+
+       if (!ioat_cleanup_preamble(chan, &phys_complete)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       if (!spin_trylock_bh(&ioat->ring_lock)) {
+               spin_unlock_bh(&chan->cleanup_lock);
+               return;
+       }
+
+       __cleanup(ioat, phys_complete);
+
+       spin_unlock_bh(&ioat->ring_lock);
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static void ioat3_cleanup_tasklet(unsigned long data)
+{
+       struct ioat2_dma_chan *ioat = (void *) data;
+
+       ioat3_cleanup(ioat);
+       writew(IOAT_CHANCTRL_RUN | IOAT3_CHANCTRL_COMPL_DCA_EN,
+              ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
+}
+
+static void ioat3_restart_channel(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       unsigned long phys_complete;
+       u32 status;
+
+       status = ioat_chansts(chan);
+       if (is_ioat_active(status) || is_ioat_idle(status))
+               ioat_suspend(chan);
+       while (is_ioat_active(status) || is_ioat_idle(status)) {
+               status = ioat_chansts(chan);
+               cpu_relax();
+       }
+
+       if (ioat_cleanup_preamble(chan, &phys_complete))
+               __cleanup(ioat, phys_complete);
+
+       __ioat2_restart_chan(ioat);
+}
+
+static void ioat3_timer_event(unsigned long data)
+{
+       struct ioat2_dma_chan *ioat = (void *) data;
+       struct ioat_chan_common *chan = &ioat->base;
+
+       spin_lock_bh(&chan->cleanup_lock);
+       if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) {
+               unsigned long phys_complete;
+               u64 status;
+
+               spin_lock_bh(&ioat->ring_lock);
+               status = ioat_chansts(chan);
+
+               /* when halted due to errors, check for channel
+                * programming errors before advancing the completion state
+                */
+               if (is_ioat_halted(status)) {
+                       u32 chanerr;
+
+                       chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+                       BUG_ON(is_ioat_bug(chanerr));
+               }
+
+               /* if we haven't made progress and we have already
+                * acknowledged a pending completion once, then be more
+                * forceful with a restart
+                */
+               if (ioat_cleanup_preamble(chan, &phys_complete))
+                       __cleanup(ioat, phys_complete);
+               else if (test_bit(IOAT_COMPLETION_ACK, &chan->state))
+                       ioat3_restart_channel(ioat);
+               else {
+                       set_bit(IOAT_COMPLETION_ACK, &chan->state);
+                       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+               }
+               spin_unlock_bh(&ioat->ring_lock);
+       } else {
+               u16 active;
+
+               /* if the ring is idle, empty, and oversized, try to step
+                * down the size
+                */
+               spin_lock_bh(&ioat->ring_lock);
+               active = ioat2_ring_active(ioat);
+               if (active == 0 && ioat->alloc_order > ioat_get_alloc_order())
+                       reshape_ring(ioat, ioat->alloc_order-1);
+               spin_unlock_bh(&ioat->ring_lock);
+
+               /* keep shrinking until we get back to our minimum
+                * default size
+                */
+               if (ioat->alloc_order > ioat_get_alloc_order())
+                       mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+       }
+       spin_unlock_bh(&chan->cleanup_lock);
+}
+
+static enum dma_status
+ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie,
+                 dma_cookie_t *done, dma_cookie_t *used)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+
+       if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS)
+               return DMA_SUCCESS;
+
+       ioat3_cleanup(ioat);
+
+       return ioat_is_complete(c, cookie, done, used);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value,
+                      size_t len, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_ring_ent *desc;
+       size_t total_len = len;
+       struct ioat_fill_descriptor *fill;
+       int num_descs;
+       u64 src_data = (0x0101010101010101ULL) * (value & 0xff);
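+       /* illustrative: the multiply replicates the low byte of 'value' into
+        * all 8 bytes of the pattern, e.g. value = 0x5a yields
+        * 0x5a5a5a5a5a5a5a5a to match the 64-bit src_data field of the
+        * hardware fill descriptor
+        */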
+       u16 idx;
+       int i;
+
+       num_descs = ioat2_xferlen_to_descs(ioat, len);
+       if (likely(num_descs) &&
+           ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0)
+               /* pass */;
+       else
+               return NULL;
+       i = 0;
+       do {
+               size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+               desc = ioat2_get_ring_ent(ioat, idx + i);
+               fill = desc->fill;
+
+               fill->size = xfer_size;
+               fill->src_data = src_data;
+               fill->dst_addr = dest;
+               fill->ctl = 0;
+               fill->ctl_f.op = IOAT_OP_FILL;
+
+               len -= xfer_size;
+               dest += xfer_size;
+               dump_desc_dbg(ioat, desc);
+       } while (++i < num_descs);
+
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+       fill->ctl_f.compl_write = 1;
+       dump_desc_dbg(ioat, desc);
+
+       /* we leave the channel locked to ensure in order submission */
+       return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+__ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
+                     dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt,
+                     size_t len, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_ring_ent *compl_desc;
+       struct ioat_ring_ent *desc;
+       struct ioat_ring_ent *ext;
+       size_t total_len = len;
+       struct ioat_xor_descriptor *xor;
+       struct ioat_xor_ext_descriptor *xor_ex = NULL;
+       struct ioat_dma_descriptor *hw;
+       u32 offset = 0;
+       int num_descs;
+       int with_ext;
+       int i;
+       u16 idx;
+       u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR;
+
+       BUG_ON(src_cnt < 2);
+
+       num_descs = ioat2_xferlen_to_descs(ioat, len);
+       /* we need 2x the number of descriptors to cover greater than 5
+        * sources
+        */
+       if (src_cnt > 5) {
+               with_ext = 1;
+               num_descs *= 2;
+       } else
+               with_ext = 0;
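+
+       /* ring accounting example (illustrative): src_cnt = 8 with a transfer
+        * spanning 3 descriptors doubles num_descs to 6, and the allocation
+        * below reserves one more slot for the trailing null completion
+        * descriptor, 7 ring entries in total
+        */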
+
+       /* completion writes from the raid engine may pass completion
+        * writes from the legacy engine, so we need one extra null
+        * (legacy) descriptor to ensure all completion writes arrive in
+        * order.
+        */
+       if (likely(num_descs) &&
+           ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
+               /* pass */;
+       else
+               return NULL;
+       i = 0;
+       do {
+               struct ioat_raw_descriptor *descs[2];
+               size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+               int s;
+
+               desc = ioat2_get_ring_ent(ioat, idx + i);
+               xor = desc->xor;
+
+               /* save a branch by unconditionally retrieving the
+                * extended descriptor; xor_set_src() knows not to write
+                * to it in the single descriptor case
+                */
+               ext = ioat2_get_ring_ent(ioat, idx + i + 1);
+               xor_ex = ext->xor_ex;
+
+               descs[0] = (struct ioat_raw_descriptor *) xor;
+               descs[1] = (struct ioat_raw_descriptor *) xor_ex;
+               for (s = 0; s < src_cnt; s++)
+                       xor_set_src(descs, src[s], offset, s);
+               xor->size = xfer_size;
+               xor->dst_addr = dest + offset;
+               xor->ctl = 0;
+               xor->ctl_f.op = op;
+               xor->ctl_f.src_cnt = src_cnt_to_hw(src_cnt);
+
+               len -= xfer_size;
+               offset += xfer_size;
+               dump_desc_dbg(ioat, desc);
+       } while ((i += 1 + with_ext) < num_descs);
+
+       /* last xor descriptor carries the unmap parameters and fence bit */
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       if (result)
+               desc->result = result;
+       xor->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+
+       /* completion descriptor carries interrupt bit */
+       compl_desc = ioat2_get_ring_ent(ioat, idx + i);
+       compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
+       hw = compl_desc->hw;
+       hw->ctl = 0;
+       hw->ctl_f.null = 1;
+       hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       hw->ctl_f.compl_write = 1;
+       hw->size = NULL_DESC_BUFFER_SIZE;
+       dump_desc_dbg(ioat, compl_desc);
+
+       /* we leave the channel locked to ensure in order submission */
+       return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+              unsigned int src_cnt, size_t len, unsigned long flags)
+{
+       return __ioat3_prep_xor_lock(chan, NULL, dest, src, src_cnt, len, flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_xor_val(struct dma_chan *chan, dma_addr_t *src,
+                   unsigned int src_cnt, size_t len,
+                   enum sum_check_flags *result, unsigned long flags)
+{
+       /* the cleanup routine only sets bits on validate failure; it
+        * does not clear bits on validate success... so clear it here
+        */
+       *result = 0;
+
+       return __ioat3_prep_xor_lock(chan, result, src[0], &src[1],
+                                    src_cnt - 1, len, flags);
+}
+
+static void
+dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct ioat_ring_ent *ext)
+{
+       struct device *dev = to_dev(&ioat->base);
+       struct ioat_pq_descriptor *pq = desc->pq;
+       struct ioat_pq_ext_descriptor *pq_ex = ext ? ext->pq_ex : NULL;
+       struct ioat_raw_descriptor *descs[] = { (void *) pq, (void *) pq_ex };
+       int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
+       int i;
+
+       dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x"
+               " sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n",
+               desc_id(desc), (unsigned long long) desc->txd.phys,
+               (unsigned long long) (pq_ex ? pq_ex->next : pq->next),
+               desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en,
+               pq->ctl_f.compl_write,
+               pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? "" : "q",
+               pq->ctl_f.src_cnt);
+       for (i = 0; i < src_cnt; i++)
+               dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i,
+                       (unsigned long long) pq_get_src(descs, i), pq->coef[i]);
+       dev_dbg(dev, "\tP: %#llx\n", pq->p_addr);
+       dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr);
+}
+
+static struct dma_async_tx_descriptor *
+__ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
+                    const dma_addr_t *dst, const dma_addr_t *src,
+                    unsigned int src_cnt, const unsigned char *scf,
+                    size_t len, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioat_ring_ent *compl_desc;
+       struct ioat_ring_ent *desc;
+       struct ioat_ring_ent *ext;
+       size_t total_len = len;
+       struct ioat_pq_descriptor *pq;
+       struct ioat_pq_ext_descriptor *pq_ex = NULL;
+       struct ioat_dma_descriptor *hw;
+       u32 offset = 0;
+       int num_descs;
+       int with_ext;
+       int i, s;
+       u16 idx;
+       u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ;
+
+       dev_dbg(to_dev(chan), "%s\n", __func__);
+       /* the engine requires at least two sources (we provide
+        * at least 1 implied source in the DMA_PREP_CONTINUE case)
+        */
+       BUG_ON(src_cnt + dmaf_continue(flags) < 2);
+
+       num_descs = ioat2_xferlen_to_descs(ioat, len);
+       /* we need 2x the number of descriptors to cover greater than 3
+        * sources
+        */
+       if (src_cnt > 3 || flags & DMA_PREP_CONTINUE) {
+               with_ext = 1;
+               num_descs *= 2;
+       } else
+               with_ext = 0;
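+
+       /* as in the xor path above, e.g. (illustrative) src_cnt = 8 (more than
+        * 3 sources) doubles num_descs, and the allocation below reserves one
+        * extra slot for the trailing null completion descriptor
+        */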
+
+       /* completion writes from the raid engine may pass completion
+        * writes from the legacy engine, so we need one extra null
+        * (legacy) descriptor to ensure all completion writes arrive in
+        * order.
+        */
+       if (likely(num_descs) &&
+           ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0)
+               /* pass */;
+       else
+               return NULL;
+       i = 0;
+       do {
+               struct ioat_raw_descriptor *descs[2];
+               size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+               desc = ioat2_get_ring_ent(ioat, idx + i);
+               pq = desc->pq;
+
+               /* save a branch by unconditionally retrieving the
+                * extended descriptor; pq_set_src() knows not to write
+                * to it in the single descriptor case
+                */
+               ext = ioat2_get_ring_ent(ioat, idx + i + with_ext);
+               pq_ex = ext->pq_ex;
+
+               descs[0] = (struct ioat_raw_descriptor *) pq;
+               descs[1] = (struct ioat_raw_descriptor *) pq_ex;
+
+               for (s = 0; s < src_cnt; s++)
+                       pq_set_src(descs, src[s], offset, scf[s], s);
+
+               /* see the comment for dma_maxpq in include/linux/dmaengine.h */
+               if (dmaf_p_disabled_continue(flags))
+                       pq_set_src(descs, dst[1], offset, 1, s++);
+               else if (dmaf_continue(flags)) {
+                       pq_set_src(descs, dst[0], offset, 0, s++);
+                       pq_set_src(descs, dst[1], offset, 1, s++);
+                       pq_set_src(descs, dst[1], offset, 0, s++);
+               }
+               pq->size = xfer_size;
+               pq->p_addr = dst[0] + offset;
+               pq->q_addr = dst[1] + offset;
+               pq->ctl = 0;
+               pq->ctl_f.op = op;
+               pq->ctl_f.src_cnt = src_cnt_to_hw(s);
+               pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P);
+               pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q);
+
+               len -= xfer_size;
+               offset += xfer_size;
+       } while ((i += 1 + with_ext) < num_descs);
+
+       /* last pq descriptor carries the unmap parameters and fence bit */
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       if (result)
+               desc->result = result;
+       pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+       dump_pq_desc_dbg(ioat, desc, ext);
+
+       /* completion descriptor carries interrupt bit */
+       compl_desc = ioat2_get_ring_ent(ioat, idx + i);
+       compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
+       hw = compl_desc->hw;
+       hw->ctl = 0;
+       hw->ctl_f.null = 1;
+       hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       hw->ctl_f.compl_write = 1;
+       hw->size = NULL_DESC_BUFFER_SIZE;
+       dump_desc_dbg(ioat, compl_desc);
+
+       /* we leave the channel locked to ensure in order submission */
+       return &desc->txd;
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+             unsigned int src_cnt, const unsigned char *scf, size_t len,
+             unsigned long flags)
+{
+       /* handle the single source multiply case from the raid6
+        * recovery path
+        */
+       if (unlikely((flags & DMA_PREP_PQ_DISABLE_P) && src_cnt == 1)) {
+               dma_addr_t single_source[2];
+               unsigned char single_source_coef[2];
+
+               BUG_ON(flags & DMA_PREP_PQ_DISABLE_Q);
+               single_source[0] = src[0];
+               single_source[1] = src[0];
+               single_source_coef[0] = scf[0];
+               single_source_coef[1] = 0;
+
+               return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2,
+                                           single_source_coef, len, flags);
+       } else
+               return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf,
+                                           len, flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+                 unsigned int src_cnt, const unsigned char *scf, size_t len,
+                 enum sum_check_flags *pqres, unsigned long flags)
+{
+       /* the cleanup routine only sets bits on validate failure; it
+        * does not clear bits on validate success... so clear it here
+        */
+       *pqres = 0;
+
+       return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
+                                   flags);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
+                unsigned int src_cnt, size_t len, unsigned long flags)
+{
+       unsigned char scf[src_cnt];
+       dma_addr_t pq[2];
+
+       memset(scf, 0, src_cnt);
+       flags |= DMA_PREP_PQ_DISABLE_Q;
+       pq[0] = dst;
+       pq[1] = ~0;
+
+       return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
+                                   flags);
+}
+
+struct dma_async_tx_descriptor *
+ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
+                    unsigned int src_cnt, size_t len,
+                    enum sum_check_flags *result, unsigned long flags)
+{
+       unsigned char scf[src_cnt];
+       dma_addr_t pq[2];
+
+       /* the cleanup routine only sets bits on validate failure; it
+        * does not clear bits on validate success... so clear it here
+        */
+       *result = 0;
+
+       memset(scf, 0, src_cnt);
+       flags |= DMA_PREP_PQ_DISABLE_Q;
+       pq[0] = src[0];
+       pq[1] = ~0;
+
+       return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf,
+                                   len, flags);
+}
+
+static struct dma_async_tx_descriptor *
+ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_ring_ent *desc;
+       struct ioat_dma_descriptor *hw;
+       u16 idx;
+
+       if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0)
+               desc = ioat2_get_ring_ent(ioat, idx);
+       else
+               return NULL;
+
+       hw = desc->hw;
+       hw->ctl = 0;
+       hw->ctl_f.null = 1;
+       hw->ctl_f.int_en = 1;
+       hw->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+       hw->ctl_f.compl_write = 1;
+       hw->size = NULL_DESC_BUFFER_SIZE;
+       hw->src_addr = 0;
+       hw->dst_addr = 0;
+
+       desc->txd.flags = flags;
+       desc->len = 1;
+
+       dump_desc_dbg(ioat, desc);
+
+       /* we leave the channel locked to ensure in order submission */
+       return &desc->txd;
+}
+
+static void __devinit ioat3_dma_test_callback(void *dma_async_param)
+{
+       struct completion *cmp = dma_async_param;
+
+       complete(cmp);
+}
+
+#define IOAT_NUM_SRC_TEST 6 /* must be <= 8 */
+static int __devinit ioat_xor_val_self_test(struct ioatdma_device *device)
+{
+       int i, src_idx;
+       struct page *dest;
+       struct page *xor_srcs[IOAT_NUM_SRC_TEST];
+       struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1];
+       dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1];
+       dma_addr_t dma_addr, dest_dma;
+       struct dma_async_tx_descriptor *tx;
+       struct dma_chan *dma_chan;
+       dma_cookie_t cookie;
+       u8 cmp_byte = 0;
+       u32 cmp_word;
+       u32 xor_val_result;
+       int err = 0;
+       struct completion cmp;
+       unsigned long tmo;
+       struct device *dev = &device->pdev->dev;
+       struct dma_device *dma = &device->common;
+
+       dev_dbg(dev, "%s\n", __func__);
+
+       if (!dma_has_cap(DMA_XOR, dma->cap_mask))
+               return 0;
+
+       for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
+               xor_srcs[src_idx] = alloc_page(GFP_KERNEL);
+               if (!xor_srcs[src_idx]) {
+                       while (src_idx--)
+                               __free_page(xor_srcs[src_idx]);
+                       return -ENOMEM;
+               }
+       }
+
+       dest = alloc_page(GFP_KERNEL);
+       if (!dest) {
+               while (src_idx--)
+                       __free_page(xor_srcs[src_idx]);
+               return -ENOMEM;
+       }
+
+       /* Fill in src buffers */
+       for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++) {
+               u8 *ptr = page_address(xor_srcs[src_idx]);
+               for (i = 0; i < PAGE_SIZE; i++)
+                       ptr[i] = (1 << src_idx);
+       }
+
+       for (src_idx = 0; src_idx < IOAT_NUM_SRC_TEST; src_idx++)
+               cmp_byte ^= (u8) (1 << src_idx);
+
+       cmp_word = (cmp_byte << 24) | (cmp_byte << 16) |
+                       (cmp_byte << 8) | cmp_byte;
+
+       memset(page_address(dest), 0, PAGE_SIZE);
+
+       dma_chan = container_of(dma->channels.next, struct dma_chan,
+                               device_node);
+       if (dma->device_alloc_chan_resources(dma_chan) < 1) {
+               err = -ENODEV;
+               goto out;
+       }
+
+       /* test xor */
+       dest_dma = dma_map_page(dev, dest, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+       for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
+               dma_srcs[i] = dma_map_page(dev, xor_srcs[i], 0, PAGE_SIZE,
+                                          DMA_TO_DEVICE);
+       tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs,
+                                     IOAT_NUM_SRC_TEST, PAGE_SIZE,
+                                     DMA_PREP_INTERRUPT);
+
+       if (!tx) {
+               dev_err(dev, "Self-test xor prep failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       async_tx_ack(tx);
+       init_completion(&cmp);
+       tx->callback = ioat3_dma_test_callback;
+       tx->callback_param = &cmp;
+       cookie = tx->tx_submit(tx);
+       if (cookie < 0) {
+               dev_err(dev, "Self-test xor setup failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       dma->device_issue_pending(dma_chan);
+
+       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+       if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+               dev_err(dev, "Self-test xor timed out\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       dma_sync_single_for_cpu(dev, dest_dma, PAGE_SIZE, DMA_FROM_DEVICE);
+       for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) {
+               u32 *ptr = page_address(dest);
+               if (ptr[i] != cmp_word) {
+                       dev_err(dev, "Self-test xor failed compare\n");
+                       err = -ENODEV;
+                       goto free_resources;
+               }
+       }
+       dma_sync_single_for_device(dev, dest_dma, PAGE_SIZE, DMA_TO_DEVICE);
+
+       /* skip validate if the capability is not present */
+       if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
+               goto free_resources;
+
+       /* validate the sources with the destination page */
+       for (i = 0; i < IOAT_NUM_SRC_TEST; i++)
+               xor_val_srcs[i] = xor_srcs[i];
+       xor_val_srcs[i] = dest;
+
+       xor_val_result = 1;
+
+       for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
+               dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
+                                          DMA_TO_DEVICE);
+       tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
+                                         IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
+                                         &xor_val_result, DMA_PREP_INTERRUPT);
+       if (!tx) {
+               dev_err(dev, "Self-test zero prep failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       async_tx_ack(tx);
+       init_completion(&cmp);
+       tx->callback = ioat3_dma_test_callback;
+       tx->callback_param = &cmp;
+       cookie = tx->tx_submit(tx);
+       if (cookie < 0) {
+               dev_err(dev, "Self-test zero setup failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       dma->device_issue_pending(dma_chan);
+
+       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+       if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+               dev_err(dev, "Self-test validate timed out\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       if (xor_val_result != 0) {
+               dev_err(dev, "Self-test validate failed compare\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       /* skip memset if the capability is not present */
+       if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask))
+               goto free_resources;
+
+       /* test memset */
+       dma_addr = dma_map_page(dev, dest, 0,
+                       PAGE_SIZE, DMA_FROM_DEVICE);
+       tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE,
+                                        DMA_PREP_INTERRUPT);
+       if (!tx) {
+               dev_err(dev, "Self-test memset prep failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       async_tx_ack(tx);
+       init_completion(&cmp);
+       tx->callback = ioat3_dma_test_callback;
+       tx->callback_param = &cmp;
+       cookie = tx->tx_submit(tx);
+       if (cookie < 0) {
+               dev_err(dev, "Self-test memset setup failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       dma->device_issue_pending(dma_chan);
+
+       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+       if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+               dev_err(dev, "Self-test memset timed out\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
+               u32 *ptr = page_address(dest);
+               if (ptr[i]) {
+                       dev_err(dev, "Self-test memset failed compare\n");
+                       err = -ENODEV;
+                       goto free_resources;
+               }
+       }
+
+       /* test for non-zero parity sum */
+       xor_val_result = 0;
+       for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
+               dma_srcs[i] = dma_map_page(dev, xor_val_srcs[i], 0, PAGE_SIZE,
+                                          DMA_TO_DEVICE);
+       tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
+                                         IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
+                                         &xor_val_result, DMA_PREP_INTERRUPT);
+       if (!tx) {
+               dev_err(dev, "Self-test 2nd zero prep failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       async_tx_ack(tx);
+       init_completion(&cmp);
+       tx->callback = ioat3_dma_test_callback;
+       tx->callback_param = &cmp;
+       cookie = tx->tx_submit(tx);
+       if (cookie < 0) {
+               dev_err(dev, "Self-test 2nd zero setup failed\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       dma->device_issue_pending(dma_chan);
+
+       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
+
+       if (dma->device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+               dev_err(dev, "Self-test 2nd validate timed out\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       if (xor_val_result != SUM_CHECK_P_RESULT) {
+               dev_err(dev, "Self-test validate failed compare\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+free_resources:
+       dma->device_free_chan_resources(dma_chan);
+out:
+       src_idx = IOAT_NUM_SRC_TEST;
+       while (src_idx--)
+               __free_page(xor_srcs[src_idx]);
+       __free_page(dest);
+       return err;
+}
+
+static int __devinit ioat3_dma_self_test(struct ioatdma_device *device)
+{
+       int rc = ioat_dma_self_test(device);
+
+       if (rc)
+               return rc;
+
+       rc = ioat_xor_val_self_test(device);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca)
+{
+       struct pci_dev *pdev = device->pdev;
+       struct dma_device *dma;
+       struct dma_chan *c;
+       struct ioat_chan_common *chan;
+       bool is_raid_device = false;
+       int err;
+       u16 dev_id;
+       u32 cap;
+
+       device->enumerate_channels = ioat2_enumerate_channels;
+       device->self_test = ioat3_dma_self_test;
+       dma = &device->common;
+       dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
+       dma->device_issue_pending = ioat2_issue_pending;
+       dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
+       dma->device_free_chan_resources = ioat2_free_chan_resources;
+
+       dma_cap_set(DMA_INTERRUPT, dma->cap_mask);
+       dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock;
+
+       cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET);
+       if (cap & IOAT_CAP_XOR) {
+               is_raid_device = true;
+               dma->max_xor = 8;
+               dma->xor_align = 2;
+
+               dma_cap_set(DMA_XOR, dma->cap_mask);
+               dma->device_prep_dma_xor = ioat3_prep_xor;
+
+               dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
+               dma->device_prep_dma_xor_val = ioat3_prep_xor_val;
+       }
+       if (cap & IOAT_CAP_PQ) {
+               is_raid_device = true;
+               dma_set_maxpq(dma, 8, 0);
+               dma->pq_align = 2;
+
+               dma_cap_set(DMA_PQ, dma->cap_mask);
+               dma->device_prep_dma_pq = ioat3_prep_pq;
+
+               dma_cap_set(DMA_PQ_VAL, dma->cap_mask);
+               dma->device_prep_dma_pq_val = ioat3_prep_pq_val;
+
+               if (!(cap & IOAT_CAP_XOR)) {
+                       dma->max_xor = 8;
+                       dma->xor_align = 2;
+
+                       dma_cap_set(DMA_XOR, dma->cap_mask);
+                       dma->device_prep_dma_xor = ioat3_prep_pqxor;
+
+                       dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
+                       dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val;
+               }
+       }
+       if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) {
+               dma_cap_set(DMA_MEMSET, dma->cap_mask);
+               dma->device_prep_dma_memset = ioat3_prep_memset_lock;
+       }
+
+
+       if (is_raid_device) {
+               dma->device_is_tx_complete = ioat3_is_complete;
+               device->cleanup_tasklet = ioat3_cleanup_tasklet;
+               device->timer_fn = ioat3_timer_event;
+       } else {
+               dma->device_is_tx_complete = ioat2_is_complete;
+               device->cleanup_tasklet = ioat2_cleanup_tasklet;
+               device->timer_fn = ioat2_timer_event;
+       }
+
+       /* -= IOAT ver.3 workarounds =- */
+       /* Write CHANERRMSK_INT with 3E07h to mask out the errors
+        * that can cause stability issues for IOAT ver.3
+        */
+       pci_write_config_dword(pdev, IOAT_PCI_CHANERRMASK_INT_OFFSET, 0x3e07);
+
+       /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
+        * (workaround for spurious config parity error after restart)
+        */
+       pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id);
+       if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0)
+               pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10);
+
+       err = ioat_probe(device);
+       if (err)
+               return err;
+       ioat_set_tcp_copy_break(262144);
+
+       list_for_each_entry(c, &dma->channels, device_node) {
+               chan = to_chan_common(c);
+               writel(IOAT_DMA_DCA_ANY_CPU,
+                      chan->reg_base + IOAT_DCACTRL_OFFSET);
+       }
+
+       err = ioat_register(device);
+       if (err)
+               return err;
+
+       ioat_kobject_add(device, &ioat2_ktype);
+
+       if (dca)
+               device->dca = ioat3_dca_init(pdev, device->reg_base);
+
+       return 0;
+}
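The probe routine above only advertises the engine's capabilities (memcpy, interrupt, and conditionally XOR, PQ and memset) on the dma_device; clients obtain a channel through the generic dmaengine API rather than through this driver directly. A minimal, hypothetical sketch of such a client request, assuming the standard dma_request_channel() interface; the helper name is illustrative and not part of this patch:

/*
 * Illustrative only: grab any channel that advertises XOR offload.
 * A channel configured by ioat3_dma_probe() with IOAT_CAP_XOR would match.
 */
#include <linux/dmaengine.h>

static struct dma_chan *example_request_xor_chan(void)
{
	dma_cap_mask_t mask;

	dma_cap_zero(mask);
	dma_cap_set(DMA_XOR, mask);

	/* NULL filter: accept the first channel exposing DMA_XOR */
	return dma_request_channel(mask, NULL, NULL);
}

Such a channel would later be handed back with dma_release_channel() once the client is done with it.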
diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h
new file mode 100644
index 0000000..99afb12
--- /dev/null
@@ -0,0 +1,215 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_HW_H_
+#define _IOAT_HW_H_
+
+/* PCI Configuration Space Values */
+#define IOAT_PCI_VID            0x8086
+#define IOAT_MMIO_BAR          0
+
+/* CB device ID's */
+#define IOAT_PCI_DID_5000       0x1A38
+#define IOAT_PCI_DID_CNB        0x360B
+#define IOAT_PCI_DID_SCNB       0x65FF
+#define IOAT_PCI_DID_SNB        0x402F
+
+#define IOAT_PCI_RID            0x00
+#define IOAT_PCI_SVID           0x8086
+#define IOAT_PCI_SID            0x8086
+#define IOAT_VER_1_2            0x12    /* Version 1.2 */
+#define IOAT_VER_2_0            0x20    /* Version 2.0 */
+#define IOAT_VER_3_0            0x30    /* Version 3.0 */
+#define IOAT_VER_3_2            0x32    /* Version 3.2 */
+
+struct ioat_dma_descriptor {
+       uint32_t        size;
+       union {
+               uint32_t ctl;
+               struct {
+                       unsigned int int_en:1;
+                       unsigned int src_snoop_dis:1;
+                       unsigned int dest_snoop_dis:1;
+                       unsigned int compl_write:1;
+                       unsigned int fence:1;
+                       unsigned int null:1;
+                       unsigned int src_brk:1;
+                       unsigned int dest_brk:1;
+                       unsigned int bundle:1;
+                       unsigned int dest_dca:1;
+                       unsigned int hint:1;
+                       unsigned int rsvd2:13;
+                       #define IOAT_OP_COPY 0x00
+                       unsigned int op:8;
+               } ctl_f;
+       };
+       uint64_t        src_addr;
+       uint64_t        dst_addr;
+       uint64_t        next;
+       uint64_t        rsv1;
+       uint64_t        rsv2;
+       /* store some driver data in an unused portion of the descriptor */
+       union {
+               uint64_t        user1;
+               uint64_t        tx_cnt;
+       };
+       uint64_t        user2;
+};
+
+struct ioat_fill_descriptor {
+       uint32_t        size;
+       union {
+               uint32_t ctl;
+               struct {
+                       unsigned int int_en:1;
+                       unsigned int rsvd:1;
+                       unsigned int dest_snoop_dis:1;
+                       unsigned int compl_write:1;
+                       unsigned int fence:1;
+                       unsigned int rsvd2:2;
+                       unsigned int dest_brk:1;
+                       unsigned int bundle:1;
+                       unsigned int rsvd4:15;
+                       #define IOAT_OP_FILL 0x01
+                       unsigned int op:8;
+               } ctl_f;
+       };
+       uint64_t        src_data;
+       uint64_t        dst_addr;
+       uint64_t        next;
+       uint64_t        rsv1;
+       uint64_t        next_dst_addr;
+       uint64_t        user1;
+       uint64_t        user2;
+};
+
+struct ioat_xor_descriptor {
+       uint32_t        size;
+       union {
+               uint32_t ctl;
+               struct {
+                       unsigned int int_en:1;
+                       unsigned int src_snoop_dis:1;
+                       unsigned int dest_snoop_dis:1;
+                       unsigned int compl_write:1;
+                       unsigned int fence:1;
+                       unsigned int src_cnt:3;
+                       unsigned int bundle:1;
+                       unsigned int dest_dca:1;
+                       unsigned int hint:1;
+                       unsigned int rsvd:13;
+                       #define IOAT_OP_XOR 0x87
+                       #define IOAT_OP_XOR_VAL 0x88
+                       unsigned int op:8;
+               } ctl_f;
+       };
+       uint64_t        src_addr;
+       uint64_t        dst_addr;
+       uint64_t        next;
+       uint64_t        src_addr2;
+       uint64_t        src_addr3;
+       uint64_t        src_addr4;
+       uint64_t        src_addr5;
+};
+
+struct ioat_xor_ext_descriptor {
+       uint64_t        src_addr6;
+       uint64_t        src_addr7;
+       uint64_t        src_addr8;
+       uint64_t        next;
+       uint64_t        rsvd[4];
+};
+
+struct ioat_pq_descriptor {
+       uint32_t        size;
+       union {
+               uint32_t ctl;
+               struct {
+                       unsigned int int_en:1;
+                       unsigned int src_snoop_dis:1;
+                       unsigned int dest_snoop_dis:1;
+                       unsigned int compl_write:1;
+                       unsigned int fence:1;
+                       unsigned int src_cnt:3;
+                       unsigned int bundle:1;
+                       unsigned int dest_dca:1;
+                       unsigned int hint:1;
+                       unsigned int p_disable:1;
+                       unsigned int q_disable:1;
+                       unsigned int rsvd:11;
+                       #define IOAT_OP_PQ 0x89
+                       #define IOAT_OP_PQ_VAL 0x8a
+                       unsigned int op:8;
+               } ctl_f;
+       };
+       uint64_t        src_addr;
+       uint64_t        p_addr;
+       uint64_t        next;
+       uint64_t        src_addr2;
+       uint64_t        src_addr3;
+       uint8_t         coef[8];
+       uint64_t        q_addr;
+};
+
+struct ioat_pq_ext_descriptor {
+       uint64_t        src_addr4;
+       uint64_t        src_addr5;
+       uint64_t        src_addr6;
+       uint64_t        next;
+       uint64_t        src_addr7;
+       uint64_t        src_addr8;
+       uint64_t        rsvd[2];
+};
+
+struct ioat_pq_update_descriptor {
+       uint32_t        size;
+       union {
+               uint32_t ctl;
+               struct {
+                       unsigned int int_en:1;
+                       unsigned int src_snoop_dis:1;
+                       unsigned int dest_snoop_dis:1;
+                       unsigned int compl_write:1;
+                       unsigned int fence:1;
+                       unsigned int src_cnt:3;
+                       unsigned int bundle:1;
+                       unsigned int dest_dca:1;
+                       unsigned int hint:1;
+                       unsigned int p_disable:1;
+                       unsigned int q_disable:1;
+                       unsigned int rsvd:3;
+                       unsigned int coef:8;
+                       #define IOAT_OP_PQ_UP 0x8b
+                       unsigned int op:8;
+               } ctl_f;
+       };
+       uint64_t        src_addr;
+       uint64_t        p_addr;
+       uint64_t        next;
+       uint64_t        src_addr2;
+       uint64_t        p_src;
+       uint64_t        q_src;
+       uint64_t        q_addr;
+};
+
+struct ioat_raw_descriptor {
+       uint64_t        field[8];
+};
+#endif
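Because the control word is declared as an anonymous union, software can either write the raw 32-bit ctl value or fill the individual ctl_f bit-fields. A hedged sketch of populating a plain copy descriptor with the definitions above (the function name and parameters are placeholders, not part of this patch):

/* Illustrative only: fill struct ioat_dma_descriptor for a memcpy operation. */
#include <linux/types.h>
#include "hw.h"

static void example_fill_copy_desc(struct ioat_dma_descriptor *hw,
				   dma_addr_t dst, dma_addr_t src, u32 len)
{
	hw->ctl = 0;			/* start from a clean control word */
	hw->ctl_f.op = IOAT_OP_COPY;	/* plain memory-to-memory copy */
	hw->ctl_f.int_en = 1;		/* interrupt on completion */
	hw->ctl_f.compl_write = 1;	/* update the completion address */
	hw->size = len;
	hw->src_addr = src;
	hw->dst_addr = dst;
	hw->next = 0;			/* terminates the chain until linked */
}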
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c
new file mode 100644
index 0000000..d545fae
--- /dev/null
@@ -0,0 +1,206 @@
+/*
+ * Intel I/OAT DMA Linux driver
+ * Copyright(c) 2007 - 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dca.h>
+#include "dma.h"
+#include "dma_v2.h"
+#include "registers.h"
+#include "hw.h"
+
+MODULE_VERSION(IOAT_DMA_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static struct pci_device_id ioat_pci_tbl[] = {
+       /* I/OAT v1 platforms */
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB)  },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SCNB) },
+       { PCI_VDEVICE(UNISYS, PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
+
+       /* I/OAT v2 platforms */
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB) },
+
+       /* I/OAT v3 platforms */
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG0) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG1) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG2) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG3) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG4) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG5) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG6) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_TBG7) },
+
+       /* I/OAT v3.2 platforms */
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF0) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF1) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF2) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF3) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF4) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF5) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF6) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF7) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF8) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_JSF9) },
+
+       { 0, }
+};
+MODULE_DEVICE_TABLE(pci, ioat_pci_tbl);
+
+static int __devinit ioat_pci_probe(struct pci_dev *pdev,
+                                   const struct pci_device_id *id);
+static void __devexit ioat_remove(struct pci_dev *pdev);
+
+static int ioat_dca_enabled = 1;
+module_param(ioat_dca_enabled, int, 0644);
+MODULE_PARM_DESC(ioat_dca_enabled, "control support of the DCA service (default: 1)");
+
+struct kmem_cache *ioat2_cache;
+
+#define DRV_NAME "ioatdma"
+
+static struct pci_driver ioat_pci_driver = {
+       .name           = DRV_NAME,
+       .id_table       = ioat_pci_tbl,
+       .probe          = ioat_pci_probe,
+       .remove         = __devexit_p(ioat_remove),
+};
+
+static struct ioatdma_device *
+alloc_ioatdma(struct pci_dev *pdev, void __iomem *iobase)
+{
+       struct device *dev = &pdev->dev;
+       struct ioatdma_device *d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+
+       if (!d)
+               return NULL;
+       d->pdev = pdev;
+       d->reg_base = iobase;
+       return d;
+}
+
+static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       void __iomem * const *iomap;
+       struct device *dev = &pdev->dev;
+       struct ioatdma_device *device;
+       int err;
+
+       err = pcim_enable_device(pdev);
+       if (err)
+               return err;
+
+       err = pcim_iomap_regions(pdev, 1 << IOAT_MMIO_BAR, DRV_NAME);
+       if (err)
+               return err;
+       iomap = pcim_iomap_table(pdev);
+       if (!iomap)
+               return -ENOMEM;
+
+       err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (err)
+               err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+       if (err)
+               return err;
+
+       err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (err)
+               err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+       if (err)
+               return err;
+
+       pci_set_master(pdev);
+
+       device = alloc_ioatdma(pdev, iomap[IOAT_MMIO_BAR]);
+       if (!device)
+               return -ENOMEM;
+       pci_set_drvdata(pdev, device);
+
+       device->version = readb(device->reg_base + IOAT_VER_OFFSET);
+       if (device->version == IOAT_VER_1_2)
+               err = ioat1_dma_probe(device, ioat_dca_enabled);
+       else if (device->version == IOAT_VER_2_0)
+               err = ioat2_dma_probe(device, ioat_dca_enabled);
+       else if (device->version >= IOAT_VER_3_0)
+               err = ioat3_dma_probe(device, ioat_dca_enabled);
+       else
+               return -ENODEV;
+
+       if (err) {
+               dev_err(dev, "Intel(R) I/OAT DMA Engine init failed\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+static void __devexit ioat_remove(struct pci_dev *pdev)
+{
+       struct ioatdma_device *device = pci_get_drvdata(pdev);
+
+       if (!device)
+               return;
+
+       dev_dbg(&pdev->dev, "Removing dma and dca services\n");
+       if (device->dca) {
+               unregister_dca_provider(device->dca, &pdev->dev);
+               free_dca_provider(device->dca);
+               device->dca = NULL;
+       }
+       ioat_dma_remove(device);
+}
+
+static int __init ioat_init_module(void)
+{
+       int err;
+
+       pr_info("%s: Intel(R) QuickData Technology Driver %s\n",
+               DRV_NAME, IOAT_DMA_VERSION);
+
+       ioat2_cache = kmem_cache_create("ioat2", sizeof(struct ioat_ring_ent),
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
+       if (!ioat2_cache)
+               return -ENOMEM;
+
+       err = pci_register_driver(&ioat_pci_driver);
+       if (err)
+               kmem_cache_destroy(ioat2_cache);
+
+       return err;
+}
+module_init(ioat_init_module);
+
+static void __exit ioat_exit_module(void)
+{
+       pci_unregister_driver(&ioat_pci_driver);
+       kmem_cache_destroy(ioat2_cache);
+}
+module_exit(ioat_exit_module);
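ioat_pci_probe() above selects a probe routine from the raw version byte, whose encoding (per hw.h) packs the major revision in the high nibble and the minor in the low nibble, so 0x32 reads as version 3.2. A small, hypothetical helper that mirrors that dispatch, for illustration only:

/* Illustrative only: name the family behind the version byte read in probe. */
#include <linux/types.h>
#include "hw.h"

static const char *example_ioat_family(u8 version)
{
	switch (version) {
	case IOAT_VER_1_2:	/* 0x12 -> ioat1_dma_probe() */
		return "I/OAT v1.2";
	case IOAT_VER_2_0:	/* 0x20 -> ioat2_dma_probe() */
		return "I/OAT v2.0";
	default:		/* 0x30 and up -> ioat3_dma_probe() */
		return version >= IOAT_VER_3_0 ? "I/OAT v3.x" : "unknown";
	}
}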
diff --git a/drivers/dma/ioat/registers.h b/drivers/dma/ioat/registers.h
new file mode 100644
index 0000000..63038e1
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_REGISTERS_H_
+#define _IOAT_REGISTERS_H_
+
+#define IOAT_PCI_DMACTRL_OFFSET                        0x48
+#define IOAT_PCI_DMACTRL_DMA_EN                        0x00000001
+#define IOAT_PCI_DMACTRL_MSI_EN                        0x00000002
+
+#define IOAT_PCI_DEVICE_ID_OFFSET              0x02
+#define IOAT_PCI_DMAUNCERRSTS_OFFSET           0x148
+#define IOAT_PCI_CHANERRMASK_INT_OFFSET                0x184
+
+/* MMIO Device Registers */
+#define IOAT_CHANCNT_OFFSET                    0x00    /*  8-bit */
+
+#define IOAT_XFERCAP_OFFSET                    0x01    /*  8-bit */
+#define IOAT_XFERCAP_4KB                       12
+#define IOAT_XFERCAP_8KB                       13
+#define IOAT_XFERCAP_16KB                      14
+#define IOAT_XFERCAP_32KB                      15
+#define IOAT_XFERCAP_32GB                      0
+
+#define IOAT_GENCTRL_OFFSET                    0x02    /*  8-bit */
+#define IOAT_GENCTRL_DEBUG_EN                  0x01
+
+#define IOAT_INTRCTRL_OFFSET                   0x03    /*  8-bit */
+#define IOAT_INTRCTRL_MASTER_INT_EN            0x01    /* Master Interrupt Enable */
+#define IOAT_INTRCTRL_INT_STATUS               0x02    /* ATTNSTATUS -or- Channel Int */
+#define IOAT_INTRCTRL_INT                      0x04    /* INT_STATUS -and- MASTER_INT_EN */
+#define IOAT_INTRCTRL_MSIX_VECTOR_CONTROL      0x08    /* Enable all MSI-X vectors */
+
+#define IOAT_ATTNSTATUS_OFFSET                 0x04    /* Each bit is a channel */
+
+#define IOAT_VER_OFFSET                                0x08    /*  8-bit */
+#define IOAT_VER_MAJOR_MASK                    0xF0
+#define IOAT_VER_MINOR_MASK                    0x0F
+#define GET_IOAT_VER_MAJOR(x)                  (((x) & IOAT_VER_MAJOR_MASK) >> 4)
+#define GET_IOAT_VER_MINOR(x)                  ((x) & IOAT_VER_MINOR_MASK)
+
+#define IOAT_PERPORTOFFSET_OFFSET              0x0A    /* 16-bit */
+
+#define IOAT_INTRDELAY_OFFSET                  0x0C    /* 16-bit */
+#define IOAT_INTRDELAY_INT_DELAY_MASK          0x3FFF  /* Interrupt Delay Time */
+#define IOAT_INTRDELAY_COALESE_SUPPORT         0x8000  /* Interrupt Coalescing Supported */
+
+#define IOAT_DEVICE_STATUS_OFFSET              0x0E    /* 16-bit */
+#define IOAT_DEVICE_STATUS_DEGRADED_MODE       0x0001
+#define IOAT_DEVICE_MMIO_RESTRICTED            0x0002
+#define IOAT_DEVICE_MEMORY_BYPASS              0x0004
+#define IOAT_DEVICE_ADDRESS_REMAPPING          0x0008
+
+#define IOAT_DMA_CAP_OFFSET                    0x10    /* 32-bit */
+#define IOAT_CAP_PAGE_BREAK                    0x00000001
+#define IOAT_CAP_CRC                           0x00000002
+#define IOAT_CAP_SKIP_MARKER                   0x00000004
+#define IOAT_CAP_DCA                           0x00000010
+#define IOAT_CAP_CRC_MOVE                      0x00000020
+#define IOAT_CAP_FILL_BLOCK                    0x00000040
+#define IOAT_CAP_APIC                          0x00000080
+#define IOAT_CAP_XOR                           0x00000100
+#define IOAT_CAP_PQ                            0x00000200
+
+#define IOAT_CHANNEL_MMIO_SIZE                 0x80    /* Each Channel MMIO space is this size */
+
+/* DMA Channel Registers */
+#define IOAT_CHANCTRL_OFFSET                   0x00    /* 16-bit Channel Control Register */
+#define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK    0xF000
+#define IOAT3_CHANCTRL_COMPL_DCA_EN            0x0200
+#define IOAT_CHANCTRL_CHANNEL_IN_USE           0x0100
+#define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL    0x0020
+#define IOAT_CHANCTRL_ERR_INT_EN               0x0010
+#define IOAT_CHANCTRL_ANY_ERR_ABORT_EN         0x0008
+#define IOAT_CHANCTRL_ERR_COMPLETION_EN                0x0004
+#define IOAT_CHANCTRL_INT_REARM                        0x0001
+#define IOAT_CHANCTRL_RUN                      (IOAT_CHANCTRL_INT_REARM |\
+                                                IOAT_CHANCTRL_ERR_COMPLETION_EN |\
+                                                IOAT_CHANCTRL_ANY_ERR_ABORT_EN |\
+                                                IOAT_CHANCTRL_ERR_INT_EN)
+
+#define IOAT_DMA_COMP_OFFSET                   0x02    /* 16-bit DMA channel compatibility */
+#define IOAT_DMA_COMP_V1                       0x0001  /* Compatibility with DMA version 1 */
+#define IOAT_DMA_COMP_V2                       0x0002  /* Compatibility with DMA version 2 */
+
+
+#define IOAT1_CHANSTS_OFFSET           0x04    /* 64-bit Channel Status Register */
+#define IOAT2_CHANSTS_OFFSET           0x08    /* 64-bit Channel Status Register */
+#define IOAT_CHANSTS_OFFSET(ver)               ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHANSTS_OFFSET : IOAT2_CHANSTS_OFFSET)
+#define IOAT1_CHANSTS_OFFSET_LOW       0x04
+#define IOAT2_CHANSTS_OFFSET_LOW       0x08
+#define IOAT_CHANSTS_OFFSET_LOW(ver)           ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHANSTS_OFFSET_LOW : IOAT2_CHANSTS_OFFSET_LOW)
+#define IOAT1_CHANSTS_OFFSET_HIGH      0x08
+#define IOAT2_CHANSTS_OFFSET_HIGH      0x0C
+#define IOAT_CHANSTS_OFFSET_HIGH(ver)          ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHANSTS_OFFSET_HIGH : IOAT2_CHANSTS_OFFSET_HIGH)
+#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR (~0x3fULL)
+#define IOAT_CHANSTS_SOFT_ERR                  0x10ULL
+#define IOAT_CHANSTS_UNAFFILIATED_ERR          0x8ULL
+#define IOAT_CHANSTS_STATUS    0x7ULL
+#define IOAT_CHANSTS_ACTIVE    0x0
+#define IOAT_CHANSTS_DONE      0x1
+#define IOAT_CHANSTS_SUSPENDED 0x2
+#define IOAT_CHANSTS_HALTED    0x3
+
+
+
+#define IOAT_CHAN_DMACOUNT_OFFSET      0x06    /* 16-bit DMA Count register */
+
+#define IOAT_DCACTRL_OFFSET         0x30   /* 32 bit Direct Cache Access Control Register */
+#define IOAT_DCACTRL_CMPL_WRITE_ENABLE 0x10000
+#define IOAT_DCACTRL_TARGET_CPU_MASK   0xFFFF /* APIC ID */
+
+/* CB DCA Memory Space Registers */
+#define IOAT_DCAOFFSET_OFFSET       0x14
+/* CB_BAR + IOAT_DCAOFFSET value */
+#define IOAT_DCA_VER_OFFSET         0x00
+#define IOAT_DCA_VER_MAJOR_MASK     0xF0
+#define IOAT_DCA_VER_MINOR_MASK     0x0F
+
+#define IOAT_DCA_COMP_OFFSET        0x02
+#define IOAT_DCA_COMP_V1            0x1
+
+#define IOAT_FSB_CAPABILITY_OFFSET  0x04
+#define IOAT_FSB_CAPABILITY_PREFETCH    0x1
+
+#define IOAT_PCI_CAPABILITY_OFFSET  0x06
+#define IOAT_PCI_CAPABILITY_MEMWR   0x1
+
+#define IOAT_FSB_CAP_ENABLE_OFFSET  0x08
+#define IOAT_FSB_CAP_ENABLE_PREFETCH    0x1
+
+#define IOAT_PCI_CAP_ENABLE_OFFSET  0x0A
+#define IOAT_PCI_CAP_ENABLE_MEMWR   0x1
+
+#define IOAT_APICID_TAG_MAP_OFFSET  0x0C
+#define IOAT_APICID_TAG_MAP_TAG0    0x0000000F
+#define IOAT_APICID_TAG_MAP_TAG0_SHIFT 0
+#define IOAT_APICID_TAG_MAP_TAG1    0x000000F0
+#define IOAT_APICID_TAG_MAP_TAG1_SHIFT 4
+#define IOAT_APICID_TAG_MAP_TAG2    0x00000F00
+#define IOAT_APICID_TAG_MAP_TAG2_SHIFT 8
+#define IOAT_APICID_TAG_MAP_TAG3    0x0000F000
+#define IOAT_APICID_TAG_MAP_TAG3_SHIFT 12
+#define IOAT_APICID_TAG_MAP_TAG4    0x000F0000
+#define IOAT_APICID_TAG_MAP_TAG4_SHIFT 16
+#define IOAT_APICID_TAG_CB2_VALID   0x8080808080
+
+#define IOAT_DCA_GREQID_OFFSET      0x10
+#define IOAT_DCA_GREQID_SIZE        0x04
+#define IOAT_DCA_GREQID_MASK        0xFFFF
+#define IOAT_DCA_GREQID_IGNOREFUN   0x10000000
+#define IOAT_DCA_GREQID_VALID       0x20000000
+#define IOAT_DCA_GREQID_LASTID      0x80000000
+
+#define IOAT3_CSI_CAPABILITY_OFFSET 0x08
+#define IOAT3_CSI_CAPABILITY_PREFETCH    0x1
+
+#define IOAT3_PCI_CAPABILITY_OFFSET 0x0A
+#define IOAT3_PCI_CAPABILITY_MEMWR  0x1
+
+#define IOAT3_CSI_CONTROL_OFFSET    0x0C
+#define IOAT3_CSI_CONTROL_PREFETCH  0x1
+
+#define IOAT3_PCI_CONTROL_OFFSET    0x0E
+#define IOAT3_PCI_CONTROL_MEMWR     0x1
+
+#define IOAT3_APICID_TAG_MAP_OFFSET 0x10
+#define IOAT3_APICID_TAG_MAP_OFFSET_LOW  0x10
+#define IOAT3_APICID_TAG_MAP_OFFSET_HIGH 0x14
+
+#define IOAT3_DCA_GREQID_OFFSET     0x02
+
+#define IOAT1_CHAINADDR_OFFSET         0x0C    /* 64-bit Descriptor Chain Address Register */
+#define IOAT2_CHAINADDR_OFFSET         0x10    /* 64-bit Descriptor Chain Address Register */
+#define IOAT_CHAINADDR_OFFSET(ver)             ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHAINADDR_OFFSET : IOAT2_CHAINADDR_OFFSET)
+#define IOAT1_CHAINADDR_OFFSET_LOW     0x0C
+#define IOAT2_CHAINADDR_OFFSET_LOW     0x10
+#define IOAT_CHAINADDR_OFFSET_LOW(ver)         ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHAINADDR_OFFSET_LOW : IOAT2_CHAINADDR_OFFSET_LOW)
+#define IOAT1_CHAINADDR_OFFSET_HIGH    0x10
+#define IOAT2_CHAINADDR_OFFSET_HIGH    0x14
+#define IOAT_CHAINADDR_OFFSET_HIGH(ver)                ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHAINADDR_OFFSET_HIGH : IOAT2_CHAINADDR_OFFSET_HIGH)
+
+#define IOAT1_CHANCMD_OFFSET           0x14    /*  8-bit DMA Channel Command Register */
+#define IOAT2_CHANCMD_OFFSET           0x04    /*  8-bit DMA Channel Command Register */
+#define IOAT_CHANCMD_OFFSET(ver)               ((ver) < IOAT_VER_2_0 \
+                                               ? IOAT1_CHANCMD_OFFSET : IOAT2_CHANCMD_OFFSET)
+#define IOAT_CHANCMD_RESET                     0x20
+#define IOAT_CHANCMD_RESUME                    0x10
+#define IOAT_CHANCMD_ABORT                     0x08
+#define IOAT_CHANCMD_SUSPEND                   0x04
+#define IOAT_CHANCMD_APPEND                    0x02
+#define IOAT_CHANCMD_START                     0x01
+
+#define IOAT_CHANCMP_OFFSET                    0x18    /* 64-bit Channel Completion Address Register */
+#define IOAT_CHANCMP_OFFSET_LOW                        0x18
+#define IOAT_CHANCMP_OFFSET_HIGH               0x1C
+
+#define IOAT_CDAR_OFFSET                       0x20    /* 64-bit Current Descriptor Address Register */
+#define IOAT_CDAR_OFFSET_LOW                   0x20
+#define IOAT_CDAR_OFFSET_HIGH                  0x24
+
+#define IOAT_CHANERR_OFFSET                    0x28    /* 32-bit Channel Error Register */
+#define IOAT_CHANERR_SRC_ADDR_ERR      0x0001
+#define IOAT_CHANERR_DEST_ADDR_ERR     0x0002
+#define IOAT_CHANERR_NEXT_ADDR_ERR     0x0004
+#define IOAT_CHANERR_NEXT_DESC_ALIGN_ERR       0x0008
+#define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR      0x0010
+#define IOAT_CHANERR_CHANCMD_ERR               0x0020
+#define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR  0x0040
+#define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR      0x0080
+#define IOAT_CHANERR_READ_DATA_ERR             0x0100
+#define IOAT_CHANERR_WRITE_DATA_ERR            0x0200
+#define IOAT_CHANERR_CONTROL_ERR       0x0400
+#define IOAT_CHANERR_LENGTH_ERR        0x0800
+#define IOAT_CHANERR_COMPLETION_ADDR_ERR       0x1000
+#define IOAT_CHANERR_INT_CONFIGURATION_ERR     0x2000
+#define IOAT_CHANERR_SOFT_ERR                  0x4000
+#define IOAT_CHANERR_UNAFFILIATED_ERR          0x8000
+#define IOAT_CHANERR_XOR_P_OR_CRC_ERR          0x10000
+#define IOAT_CHANERR_XOR_Q_ERR                 0x20000
+#define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR      0x40000
+
+#define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR)
+
+#define IOAT_CHANERR_MASK_OFFSET               0x2C    /* 32-bit Channel Error Register */
+
+#endif /* _IOAT_REGISTERS_H_ */
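The CHANSTS masks above split one 64-bit status value into the 64-byte-aligned address of the last completed descriptor and a 3-bit channel state. A hedged sketch of decoding a raw readout with those masks (how the value is read from the channel is outside this header and left as an assumption):

/* Illustrative only: break a raw 64-bit CHANSTS value into its fields. */
#include <linux/types.h>
#include "registers.h"

static u64 example_decode_chansts(u64 status)
{
	u64 completed = status & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;

	switch (status & IOAT_CHANSTS_STATUS) {
	case IOAT_CHANSTS_ACTIVE:	/* descriptors still in flight */
	case IOAT_CHANSTS_DONE:		/* idle, chain fully processed */
		break;
	case IOAT_CHANSTS_SUSPENDED:
	case IOAT_CHANSTS_HALTED:	/* consult IOAT_CHANERR_OFFSET for the cause */
		break;
	}
	return completed;		/* address of the last completed descriptor */
}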
diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat_dca.c
deleted file mode 100644
index c012a1e..0000000
+++ /dev/null
@@ -1,681 +0,0 @@
-/*
- * Intel I/OAT DMA Linux driver
- * Copyright(c) 2007 - 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/dca.h>
-
-/* either a kernel change is needed, or we need something like this in kernel */
-#ifndef CONFIG_SMP
-#include <asm/smp.h>
-#undef cpu_physical_id
-#define cpu_physical_id(cpu) (cpuid_ebx(1) >> 24)
-#endif
-
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
-
-/*
- * Bit 7 of a tag map entry is the "valid" bit, if it is set then bits 0:6
- * contain the bit number of the APIC ID to map into the DCA tag.  If the valid
- * bit is not set, then the value must be 0 or 1 and defines the bit in the tag.
- */
-#define DCA_TAG_MAP_VALID 0x80
-
-#define DCA3_TAG_MAP_BIT_TO_INV 0x80
-#define DCA3_TAG_MAP_BIT_TO_SEL 0x40
-#define DCA3_TAG_MAP_LITERAL_VAL 0x1
-
-#define DCA_TAG_MAP_MASK 0xDF
-
-/* expected tag map bytes for I/OAT ver.2 */
-#define DCA2_TAG_MAP_BYTE0 0x80
-#define DCA2_TAG_MAP_BYTE1 0x0
-#define DCA2_TAG_MAP_BYTE2 0x81
-#define DCA2_TAG_MAP_BYTE3 0x82
-#define DCA2_TAG_MAP_BYTE4 0x82
-
-/* verify if tag map matches expected values */
-static inline int dca2_tag_map_valid(u8 *tag_map)
-{
-       return ((tag_map[0] == DCA2_TAG_MAP_BYTE0) &&
-               (tag_map[1] == DCA2_TAG_MAP_BYTE1) &&
-               (tag_map[2] == DCA2_TAG_MAP_BYTE2) &&
-               (tag_map[3] == DCA2_TAG_MAP_BYTE3) &&
-               (tag_map[4] == DCA2_TAG_MAP_BYTE4));
-}
-
-/*
- * "Legacy" DCA systems do not implement the DCA register set in the
- * I/OAT device.  Software needs direct support for their tag mappings.
- */
-
-#define APICID_BIT(x)          (DCA_TAG_MAP_VALID | (x))
-#define IOAT_TAG_MAP_LEN       8
-
-static u8 ioat_tag_map_BNB[IOAT_TAG_MAP_LEN] = {
-       1, APICID_BIT(1), APICID_BIT(2), APICID_BIT(2), };
-static u8 ioat_tag_map_SCNB[IOAT_TAG_MAP_LEN] = {
-       1, APICID_BIT(1), APICID_BIT(2), APICID_BIT(2), };
-static u8 ioat_tag_map_CNB[IOAT_TAG_MAP_LEN] = {
-       1, APICID_BIT(1), APICID_BIT(3), APICID_BIT(4), APICID_BIT(2), };
-static u8 ioat_tag_map_UNISYS[IOAT_TAG_MAP_LEN] = { 0 };
-
-/* pack PCI B/D/F into a u16 */
-static inline u16 dcaid_from_pcidev(struct pci_dev *pci)
-{
-       return (pci->bus->number << 8) | pci->devfn;
-}
-
-static int dca_enabled_in_bios(struct pci_dev *pdev)
-{
-       /* CPUID level 9 returns DCA configuration */
-       /* Bit 0 indicates DCA enabled by the BIOS */
-       unsigned long cpuid_level_9;
-       int res;
-
-       cpuid_level_9 = cpuid_eax(9);
-       res = test_bit(0, &cpuid_level_9);
-       if (!res)
-               dev_err(&pdev->dev, "DCA is disabled in BIOS\n");
-
-       return res;
-}
-
-static int system_has_dca_enabled(struct pci_dev *pdev)
-{
-       if (boot_cpu_has(X86_FEATURE_DCA))
-               return dca_enabled_in_bios(pdev);
-
-       dev_err(&pdev->dev, "boot cpu doesn't have X86_FEATURE_DCA\n");
-       return 0;
-}
-
-struct ioat_dca_slot {
-       struct pci_dev *pdev;   /* requester device */
-       u16 rid;                /* requester id, as used by IOAT */
-};
-
-#define IOAT_DCA_MAX_REQ 6
-#define IOAT3_DCA_MAX_REQ 2
-
-struct ioat_dca_priv {
-       void __iomem            *iobase;
-       void __iomem            *dca_base;
-       int                      max_requesters;
-       int                      requester_count;
-       u8                       tag_map[IOAT_TAG_MAP_LEN];
-       struct ioat_dca_slot     req_slots[0];
-};
-
-/* 5000 series chipset DCA Port Requester ID Table Entry Format
- * [15:8]      PCI-Express Bus Number
- * [7:3]       PCI-Express Device Number
- * [2:0]       PCI-Express Function Number
- *
- * 5000 series chipset DCA control register format
- * [7:1]       Reserved (0)
- * [0]         Ignore Function Number
- */
-
-static int ioat_dca_add_requester(struct dca_provider *dca, struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-       u16 id;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-       id = dcaid_from_pcidev(pdev);
-
-       if (ioatdca->requester_count == ioatdca->max_requesters)
-               return -ENODEV;
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == NULL) {
-                       /* found an empty slot */
-                       ioatdca->requester_count++;
-                       ioatdca->req_slots[i].pdev = pdev;
-                       ioatdca->req_slots[i].rid = id;
-                       writew(id, ioatdca->dca_base + (i * 4));
-                       /* make sure the ignore function bit is off */
-                       writeb(0, ioatdca->dca_base + (i * 4) + 2);
-                       return i;
-               }
-       }
-       /* Error, ioatdma->requester_count is out of whack */
-       return -EFAULT;
-}
-
-static int ioat_dca_remove_requester(struct dca_provider *dca,
-                                    struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == pdev) {
-                       writew(0, ioatdca->dca_base + (i * 4));
-                       ioatdca->req_slots[i].pdev = NULL;
-                       ioatdca->req_slots[i].rid = 0;
-                       ioatdca->requester_count--;
-                       return i;
-               }
-       }
-       return -ENODEV;
-}
-
-static u8 ioat_dca_get_tag(struct dca_provider *dca,
-                          struct device *dev,
-                          int cpu)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       int i, apic_id, bit, value;
-       u8 entry, tag;
-
-       tag = 0;
-       apic_id = cpu_physical_id(cpu);
-
-       for (i = 0; i < IOAT_TAG_MAP_LEN; i++) {
-               entry = ioatdca->tag_map[i];
-               if (entry & DCA_TAG_MAP_VALID) {
-                       bit = entry & ~DCA_TAG_MAP_VALID;
-                       value = (apic_id & (1 << bit)) ? 1 : 0;
-               } else {
-                       value = entry ? 1 : 0;
-               }
-               tag |= (value << i);
-       }
-       return tag;
-}
-
-static int ioat_dca_dev_managed(struct dca_provider *dca,
-                               struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-
-       pdev = to_pci_dev(dev);
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == pdev)
-                       return 1;
-       }
-       return 0;
-}
-
-static struct dca_ops ioat_dca_ops = {
-       .add_requester          = ioat_dca_add_requester,
-       .remove_requester       = ioat_dca_remove_requester,
-       .get_tag                = ioat_dca_get_tag,
-       .dev_managed            = ioat_dca_dev_managed,
-};
-
-
-struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
-{
-       struct dca_provider *dca;
-       struct ioat_dca_priv *ioatdca;
-       u8 *tag_map = NULL;
-       int i;
-       int err;
-       u8 version;
-       u8 max_requesters;
-
-       if (!system_has_dca_enabled(pdev))
-               return NULL;
-
-       /* I/OAT v1 systems must have a known tag_map to support DCA */
-       switch (pdev->vendor) {
-       case PCI_VENDOR_ID_INTEL:
-               switch (pdev->device) {
-               case PCI_DEVICE_ID_INTEL_IOAT:
-                       tag_map = ioat_tag_map_BNB;
-                       break;
-               case PCI_DEVICE_ID_INTEL_IOAT_CNB:
-                       tag_map = ioat_tag_map_CNB;
-                       break;
-               case PCI_DEVICE_ID_INTEL_IOAT_SCNB:
-                       tag_map = ioat_tag_map_SCNB;
-                       break;
-               }
-               break;
-       case PCI_VENDOR_ID_UNISYS:
-               switch (pdev->device) {
-               case PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR:
-                       tag_map = ioat_tag_map_UNISYS;
-                       break;
-               }
-               break;
-       }
-       if (tag_map == NULL)
-               return NULL;
-
-       version = readb(iobase + IOAT_VER_OFFSET);
-       if (version == IOAT_VER_3_0)
-               max_requesters = IOAT3_DCA_MAX_REQ;
-       else
-               max_requesters = IOAT_DCA_MAX_REQ;
-
-       dca = alloc_dca_provider(&ioat_dca_ops,
-                       sizeof(*ioatdca) +
-                       (sizeof(struct ioat_dca_slot) * max_requesters));
-       if (!dca)
-               return NULL;
-
-       ioatdca = dca_priv(dca);
-       ioatdca->max_requesters = max_requesters;
-       ioatdca->dca_base = iobase + 0x54;
-
-       /* copy over the APIC ID to DCA tag mapping */
-       for (i = 0; i < IOAT_TAG_MAP_LEN; i++)
-               ioatdca->tag_map[i] = tag_map[i];
-
-       err = register_dca_provider(dca, &pdev->dev);
-       if (err) {
-               free_dca_provider(dca);
-               return NULL;
-       }
-
-       return dca;
-}
-
-
-static int ioat2_dca_add_requester(struct dca_provider *dca, struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-       u16 id;
-       u16 global_req_table;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-       id = dcaid_from_pcidev(pdev);
-
-       if (ioatdca->requester_count == ioatdca->max_requesters)
-               return -ENODEV;
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == NULL) {
-                       /* found an empty slot */
-                       ioatdca->requester_count++;
-                       ioatdca->req_slots[i].pdev = pdev;
-                       ioatdca->req_slots[i].rid = id;
-                       global_req_table =
-                             readw(ioatdca->dca_base + IOAT_DCA_GREQID_OFFSET);
-                       writel(id | IOAT_DCA_GREQID_VALID,
-                              ioatdca->iobase + global_req_table + (i * 4));
-                       return i;
-               }
-       }
-       /* Error, ioatdma->requester_count is out of whack */
-       return -EFAULT;
-}
-
-static int ioat2_dca_remove_requester(struct dca_provider *dca,
-                                     struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-       u16 global_req_table;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == pdev) {
-                       global_req_table =
-                             readw(ioatdca->dca_base + IOAT_DCA_GREQID_OFFSET);
-                       writel(0, ioatdca->iobase + global_req_table + (i * 4));
-                       ioatdca->req_slots[i].pdev = NULL;
-                       ioatdca->req_slots[i].rid = 0;
-                       ioatdca->requester_count--;
-                       return i;
-               }
-       }
-       return -ENODEV;
-}
-
-static u8 ioat2_dca_get_tag(struct dca_provider *dca,
-                           struct device *dev,
-                           int cpu)
-{
-       u8 tag;
-
-       tag = ioat_dca_get_tag(dca, dev, cpu);
-       tag = (~tag) & 0x1F;
-       return tag;
-}
-
-static struct dca_ops ioat2_dca_ops = {
-       .add_requester          = ioat2_dca_add_requester,
-       .remove_requester       = ioat2_dca_remove_requester,
-       .get_tag                = ioat2_dca_get_tag,
-       .dev_managed            = ioat_dca_dev_managed,
-};
-
-static int ioat2_dca_count_dca_slots(void __iomem *iobase, u16 dca_offset)
-{
-       int slots = 0;
-       u32 req;
-       u16 global_req_table;
-
-       global_req_table = readw(iobase + dca_offset + IOAT_DCA_GREQID_OFFSET);
-       if (global_req_table == 0)
-               return 0;
-       do {
-               req = readl(iobase + global_req_table + (slots * sizeof(u32)));
-               slots++;
-       } while ((req & IOAT_DCA_GREQID_LASTID) == 0);
-
-       return slots;
-}
-
-struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase)
-{
-       struct dca_provider *dca;
-       struct ioat_dca_priv *ioatdca;
-       int slots;
-       int i;
-       int err;
-       u32 tag_map;
-       u16 dca_offset;
-       u16 csi_fsb_control;
-       u16 pcie_control;
-       u8 bit;
-
-       if (!system_has_dca_enabled(pdev))
-               return NULL;
-
-       dca_offset = readw(iobase + IOAT_DCAOFFSET_OFFSET);
-       if (dca_offset == 0)
-               return NULL;
-
-       slots = ioat2_dca_count_dca_slots(iobase, dca_offset);
-       if (slots == 0)
-               return NULL;
-
-       dca = alloc_dca_provider(&ioat2_dca_ops,
-                                sizeof(*ioatdca)
-                                     + (sizeof(struct ioat_dca_slot) * slots));
-       if (!dca)
-               return NULL;
-
-       ioatdca = dca_priv(dca);
-       ioatdca->iobase = iobase;
-       ioatdca->dca_base = iobase + dca_offset;
-       ioatdca->max_requesters = slots;
-
-       /* some bios might not know to turn these on */
-       csi_fsb_control = readw(ioatdca->dca_base + IOAT_FSB_CAP_ENABLE_OFFSET);
-       if ((csi_fsb_control & IOAT_FSB_CAP_ENABLE_PREFETCH) == 0) {
-               csi_fsb_control |= IOAT_FSB_CAP_ENABLE_PREFETCH;
-               writew(csi_fsb_control,
-                      ioatdca->dca_base + IOAT_FSB_CAP_ENABLE_OFFSET);
-       }
-       pcie_control = readw(ioatdca->dca_base + IOAT_PCI_CAP_ENABLE_OFFSET);
-       if ((pcie_control & IOAT_PCI_CAP_ENABLE_MEMWR) == 0) {
-               pcie_control |= IOAT_PCI_CAP_ENABLE_MEMWR;
-               writew(pcie_control,
-                      ioatdca->dca_base + IOAT_PCI_CAP_ENABLE_OFFSET);
-       }
-
-
-       /* TODO version, compatibility and configuration checks */
-
-       /* copy out the APIC to DCA tag map */
-       tag_map = readl(ioatdca->dca_base + IOAT_APICID_TAG_MAP_OFFSET);
-       for (i = 0; i < 5; i++) {
-               bit = (tag_map >> (4 * i)) & 0x0f;
-               if (bit < 8)
-                       ioatdca->tag_map[i] = bit | DCA_TAG_MAP_VALID;
-               else
-                       ioatdca->tag_map[i] = 0;
-       }
-
-       if (!dca2_tag_map_valid(ioatdca->tag_map)) {
-               dev_err(&pdev->dev, "APICID_TAG_MAP set incorrectly by BIOS, "
-                       "disabling DCA\n");
-               free_dca_provider(dca);
-               return NULL;
-       }
-
-       err = register_dca_provider(dca, &pdev->dev);
-       if (err) {
-               free_dca_provider(dca);
-               return NULL;
-       }
-
-       return dca;
-}
-
-static int ioat3_dca_add_requester(struct dca_provider *dca, struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-       u16 id;
-       u16 global_req_table;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-       id = dcaid_from_pcidev(pdev);
-
-       if (ioatdca->requester_count == ioatdca->max_requesters)
-               return -ENODEV;
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == NULL) {
-                       /* found an empty slot */
-                       ioatdca->requester_count++;
-                       ioatdca->req_slots[i].pdev = pdev;
-                       ioatdca->req_slots[i].rid = id;
-                       global_req_table =
-                             readw(ioatdca->dca_base + IOAT3_DCA_GREQID_OFFSET);
-                       writel(id | IOAT_DCA_GREQID_VALID,
-                              ioatdca->iobase + global_req_table + (i * 4));
-                       return i;
-               }
-       }
-       /* Error, ioatdma->requester_count is out of whack */
-       return -EFAULT;
-}
-
-static int ioat3_dca_remove_requester(struct dca_provider *dca,
-                                     struct device *dev)
-{
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       struct pci_dev *pdev;
-       int i;
-       u16 global_req_table;
-
-       /* This implementation only supports PCI-Express */
-       if (dev->bus != &pci_bus_type)
-               return -ENODEV;
-       pdev = to_pci_dev(dev);
-
-       for (i = 0; i < ioatdca->max_requesters; i++) {
-               if (ioatdca->req_slots[i].pdev == pdev) {
-                       global_req_table =
-                             readw(ioatdca->dca_base + IOAT3_DCA_GREQID_OFFSET);
-                       writel(0, ioatdca->iobase + global_req_table + (i * 4));
-                       ioatdca->req_slots[i].pdev = NULL;
-                       ioatdca->req_slots[i].rid = 0;
-                       ioatdca->requester_count--;
-                       return i;
-               }
-       }
-       return -ENODEV;
-}
-
-static u8 ioat3_dca_get_tag(struct dca_provider *dca,
-                           struct device *dev,
-                           int cpu)
-{
-       u8 tag;
-
-       struct ioat_dca_priv *ioatdca = dca_priv(dca);
-       int i, apic_id, bit, value;
-       u8 entry;
-
-       tag = 0;
-       apic_id = cpu_physical_id(cpu);
-
-       for (i = 0; i < IOAT_TAG_MAP_LEN; i++) {
-               entry = ioatdca->tag_map[i];
-               if (entry & DCA3_TAG_MAP_BIT_TO_SEL) {
-                       bit = entry &
-                               ~(DCA3_TAG_MAP_BIT_TO_SEL | DCA3_TAG_MAP_BIT_TO_INV);
-                       value = (apic_id & (1 << bit)) ? 1 : 0;
-               } else if (entry & DCA3_TAG_MAP_BIT_TO_INV) {
-                       bit = entry & ~DCA3_TAG_MAP_BIT_TO_INV;
-                       value = (apic_id & (1 << bit)) ? 0 : 1;
-               } else {
-                       value = (entry & DCA3_TAG_MAP_LITERAL_VAL) ? 1 : 0;
-               }
-               tag |= (value << i);
-       }
-
-       return tag;
-}
-
-static struct dca_ops ioat3_dca_ops = {
-       .add_requester          = ioat3_dca_add_requester,
-       .remove_requester       = ioat3_dca_remove_requester,
-       .get_tag                = ioat3_dca_get_tag,
-       .dev_managed            = ioat_dca_dev_managed,
-};
-
-static int ioat3_dca_count_dca_slots(void *iobase, u16 dca_offset)
-{
-       int slots = 0;
-       u32 req;
-       u16 global_req_table;
-
-       global_req_table = readw(iobase + dca_offset + IOAT3_DCA_GREQID_OFFSET);
-       if (global_req_table == 0)
-               return 0;
-
-       do {
-               req = readl(iobase + global_req_table + (slots * sizeof(u32)));
-               slots++;
-       } while ((req & IOAT_DCA_GREQID_LASTID) == 0);
-
-       return slots;
-}
-
-struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
-{
-       struct dca_provider *dca;
-       struct ioat_dca_priv *ioatdca;
-       int slots;
-       int i;
-       int err;
-       u16 dca_offset;
-       u16 csi_fsb_control;
-       u16 pcie_control;
-       u8 bit;
-
-       union {
-               u64 full;
-               struct {
-                       u32 low;
-                       u32 high;
-               };
-       } tag_map;
-
-       if (!system_has_dca_enabled(pdev))
-               return NULL;
-
-       dca_offset = readw(iobase + IOAT_DCAOFFSET_OFFSET);
-       if (dca_offset == 0)
-               return NULL;
-
-       slots = ioat3_dca_count_dca_slots(iobase, dca_offset);
-       if (slots == 0)
-               return NULL;
-
-       dca = alloc_dca_provider(&ioat3_dca_ops,
-                                sizeof(*ioatdca)
-                                     + (sizeof(struct ioat_dca_slot) * slots));
-       if (!dca)
-               return NULL;
-
-       ioatdca = dca_priv(dca);
-       ioatdca->iobase = iobase;
-       ioatdca->dca_base = iobase + dca_offset;
-       ioatdca->max_requesters = slots;
-
-       /* some bios might not know to turn these on */
-       csi_fsb_control = readw(ioatdca->dca_base + IOAT3_CSI_CONTROL_OFFSET);
-       if ((csi_fsb_control & IOAT3_CSI_CONTROL_PREFETCH) == 0) {
-               csi_fsb_control |= IOAT3_CSI_CONTROL_PREFETCH;
-               writew(csi_fsb_control,
-                      ioatdca->dca_base + IOAT3_CSI_CONTROL_OFFSET);
-       }
-       pcie_control = readw(ioatdca->dca_base + IOAT3_PCI_CONTROL_OFFSET);
-       if ((pcie_control & IOAT3_PCI_CONTROL_MEMWR) == 0) {
-               pcie_control |= IOAT3_PCI_CONTROL_MEMWR;
-               writew(pcie_control,
-                      ioatdca->dca_base + IOAT3_PCI_CONTROL_OFFSET);
-       }
-
-
-       /* TODO version, compatibility and configuration checks */
-
-       /* copy out the APIC to DCA tag map */
-       tag_map.low =
-               readl(ioatdca->dca_base + IOAT3_APICID_TAG_MAP_OFFSET_LOW);
-       tag_map.high =
-               readl(ioatdca->dca_base + IOAT3_APICID_TAG_MAP_OFFSET_HIGH);
-       for (i = 0; i < 8; i++) {
-               bit = tag_map.full >> (8 * i);
-               ioatdca->tag_map[i] = bit & DCA_TAG_MAP_MASK;
-       }
-
-       err = register_dca_provider(dca, &pdev->dev);
-       if (err) {
-               free_dca_provider(dca);
-               return NULL;
-       }
-
-       return dca;
-}
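For reference, the tag-map scheme implemented by the deleted ioat_dca_get_tag() above maps each bit of the DCA tag either to a literal value or, when the valid bit is set, to a selected bit of the requesting CPU's APIC ID. A standalone, illustrative rendition of that lookup; the constants are open-coded so the sketch does not depend on the removed file:

/* Illustrative only: derive a DCA tag from a tag map and an APIC ID. */
#include <linux/types.h>

static u8 example_dca_tag(const u8 *tag_map, int map_len, int apic_id)
{
	u8 tag = 0;
	int i;

	for (i = 0; i < map_len; i++) {
		u8 entry = tag_map[i];
		int value;

		if (entry & 0x80)	/* valid: bits 0:6 select an APIC-ID bit */
			value = (apic_id >> (entry & 0x7f)) & 1;
		else			/* otherwise the entry is a literal 0 or 1 */
			value = entry ? 1 : 0;
		tag |= value << i;
	}
	return tag;
}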
diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c
deleted file mode 100644
index a600fc0..0000000
+++ /dev/null
@@ -1,1741 +0,0 @@
-/*
- * Intel I/OAT DMA Linux driver
- * Copyright(c) 2004 - 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- */
-
-/*
- * This driver supports an Intel I/OAT DMA engine, which does asynchronous
- * copy operations.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/dmaengine.h>
-#include <linux/delay.h>
-#include <linux/dma-mapping.h>
-#include <linux/workqueue.h>
-#include <linux/i7300_idle.h>
-#include "ioatdma.h"
-#include "ioatdma_registers.h"
-#include "ioatdma_hw.h"
-
-#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
-#define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, common)
-#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
-#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx)
-
-#define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
-static int ioat_pending_level = 4;
-module_param(ioat_pending_level, int, 0644);
-MODULE_PARM_DESC(ioat_pending_level,
-                "high-water mark for pushing ioat descriptors (default: 4)");
-
-#define RESET_DELAY  msecs_to_jiffies(100)
-#define WATCHDOG_DELAY  round_jiffies(msecs_to_jiffies(2000))
-static void ioat_dma_chan_reset_part2(struct work_struct *work);
-static void ioat_dma_chan_watchdog(struct work_struct *work);
-
-/*
- * workaround for IOAT ver.3.0 null descriptor issue
- * (channel returns error when size is 0)
- */
-#define NULL_DESC_BUFFER_SIZE 1
-
-/* internal functions */
-static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan);
-static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan);
-
-static struct ioat_desc_sw *
-ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
-static struct ioat_desc_sw *
-ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan);
-
-static inline struct ioat_dma_chan *ioat_lookup_chan_by_index(
-                                               struct ioatdma_device *device,
-                                               int index)
-{
-       return device->idx[index];
-}
-
-/**
- * ioat_dma_do_interrupt - handler used for single vector interrupt mode
- * @irq: interrupt id
- * @data: interrupt data
- */
-static irqreturn_t ioat_dma_do_interrupt(int irq, void *data)
-{
-       struct ioatdma_device *instance = data;
-       struct ioat_dma_chan *ioat_chan;
-       unsigned long attnstatus;
-       int bit;
-       u8 intrctrl;
-
-       intrctrl = readb(instance->reg_base + IOAT_INTRCTRL_OFFSET);
-
-       if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
-               return IRQ_NONE;
-
-       if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
-               writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
-               return IRQ_NONE;
-       }
-
-       attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET);
-       for_each_bit(bit, &attnstatus, BITS_PER_LONG) {
-               ioat_chan = ioat_lookup_chan_by_index(instance, bit);
-               tasklet_schedule(&ioat_chan->cleanup_task);
-       }
-
-       writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET);
-       return IRQ_HANDLED;
-}
-
-/**
- * ioat_dma_do_interrupt_msix - handler used for vector-per-channel interrupt mode
- * @irq: interrupt id
- * @data: interrupt data
- */
-static irqreturn_t ioat_dma_do_interrupt_msix(int irq, void *data)
-{
-       struct ioat_dma_chan *ioat_chan = data;
-
-       tasklet_schedule(&ioat_chan->cleanup_task);
-
-       return IRQ_HANDLED;
-}
-
-static void ioat_dma_cleanup_tasklet(unsigned long data);
-
-/**
- * ioat_dma_enumerate_channels - find and initialize the device's channels
- * @device: the device to be enumerated
- */
-static int ioat_dma_enumerate_channels(struct ioatdma_device *device)
-{
-       u8 xfercap_scale;
-       u32 xfercap;
-       int i;
-       struct ioat_dma_chan *ioat_chan;
-
-       /*
-        * IOAT ver.3 workarounds
-        */
-       if (device->version == IOAT_VER_3_0) {
-               u32 chan_err_mask;
-               u16 dev_id;
-               u32 dmauncerrsts;
-
-               /*
-                * Write CHANERRMSK_INT with 3E07h to mask out the errors
-                * that can cause stability issues for IOAT ver.3
-                */
-               chan_err_mask = 0x3E07;
-               pci_write_config_dword(device->pdev,
-                       IOAT_PCI_CHANERRMASK_INT_OFFSET,
-                       chan_err_mask);
-
-               /*
-                * Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
-                * (workaround for spurious config parity error after restart)
-                */
-               pci_read_config_word(device->pdev,
-                       IOAT_PCI_DEVICE_ID_OFFSET,
-                       &dev_id);
-               if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) {
-                       dmauncerrsts = 0x10;
-                       pci_write_config_dword(device->pdev,
-                               IOAT_PCI_DMAUNCERRSTS_OFFSET,
-                               dmauncerrsts);
-               }
-       }
-
-       device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET);
-       xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET);
-       xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
-
-#ifdef  CONFIG_I7300_IDLE_IOAT_CHANNEL
-       if (i7300_idle_platform_probe(NULL, NULL, 1) == 0) {
-               device->common.chancnt--;
-       }
-#endif
-       for (i = 0; i < device->common.chancnt; i++) {
-               ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
-               if (!ioat_chan) {
-                       device->common.chancnt = i;
-                       break;
-               }
-
-               ioat_chan->device = device;
-               ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1));
-               ioat_chan->xfercap = xfercap;
-               ioat_chan->desccount = 0;
-               INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2);
-               if (ioat_chan->device->version == IOAT_VER_2_0)
-                       writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE |
-                              IOAT_DMA_DCA_ANY_CPU,
-                              ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
-               else if (ioat_chan->device->version == IOAT_VER_3_0)
-                       writel(IOAT_DMA_DCA_ANY_CPU,
-                              ioat_chan->reg_base + IOAT_DCACTRL_OFFSET);
-               spin_lock_init(&ioat_chan->cleanup_lock);
-               spin_lock_init(&ioat_chan->desc_lock);
-               INIT_LIST_HEAD(&ioat_chan->free_desc);
-               INIT_LIST_HEAD(&ioat_chan->used_desc);
-               /* This should be made common somewhere in dmaengine.c */
-               ioat_chan->common.device = &device->common;
-               list_add_tail(&ioat_chan->common.device_node,
-                             &device->common.channels);
-               device->idx[i] = ioat_chan;
-               tasklet_init(&ioat_chan->cleanup_task,
-                            ioat_dma_cleanup_tasklet,
-                            (unsigned long) ioat_chan);
-               tasklet_disable(&ioat_chan->cleanup_task);
-       }
-       return device->common.chancnt;
-}
-
-/**
- * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended
- *                                 descriptors to hw
- * @chan: DMA channel handle
- */
-static inline void __ioat1_dma_memcpy_issue_pending(
-                                               struct ioat_dma_chan *ioat_chan)
-{
-       ioat_chan->pending = 0;
-       writeb(IOAT_CHANCMD_APPEND, ioat_chan->reg_base + IOAT1_CHANCMD_OFFSET);
-}
-
-static void ioat1_dma_memcpy_issue_pending(struct dma_chan *chan)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-       if (ioat_chan->pending > 0) {
-               spin_lock_bh(&ioat_chan->desc_lock);
-               __ioat1_dma_memcpy_issue_pending(ioat_chan);
-               spin_unlock_bh(&ioat_chan->desc_lock);
-       }
-}
-
-static inline void __ioat2_dma_memcpy_issue_pending(
-                                               struct ioat_dma_chan *ioat_chan)
-{
-       ioat_chan->pending = 0;
-       writew(ioat_chan->dmacount,
-              ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
-}
-
-static void ioat2_dma_memcpy_issue_pending(struct dma_chan *chan)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-       if (ioat_chan->pending > 0) {
-               spin_lock_bh(&ioat_chan->desc_lock);
-               __ioat2_dma_memcpy_issue_pending(ioat_chan);
-               spin_unlock_bh(&ioat_chan->desc_lock);
-       }
-}
-
-
-/**
- * ioat_dma_chan_reset_part2 - reinit the channel after a reset
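- * @work: delayed work item embedded in the ioat_dma_chan being reinitialized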
- */
-static void ioat_dma_chan_reset_part2(struct work_struct *work)
-{
-       struct ioat_dma_chan *ioat_chan =
-               container_of(work, struct ioat_dma_chan, work.work);
-       struct ioat_desc_sw *desc;
-
-       spin_lock_bh(&ioat_chan->cleanup_lock);
-       spin_lock_bh(&ioat_chan->desc_lock);
-
-       ioat_chan->completion_virt->low = 0;
-       ioat_chan->completion_virt->high = 0;
-       ioat_chan->pending = 0;
-
-       /*
-        * count the descriptors waiting, and be sure to do it
-        * right for both the CB1 line and the CB2 ring
-        */
-       ioat_chan->dmacount = 0;
-       if (ioat_chan->used_desc.prev) {
-               desc = to_ioat_desc(ioat_chan->used_desc.prev);
-               do {
-                       ioat_chan->dmacount++;
-                       desc = to_ioat_desc(desc->node.next);
-               } while (&desc->node != ioat_chan->used_desc.next);
-       }
-
-       /*
-        * write the new starting descriptor address
-        * this puts channel engine into ARMED state
-        */
-       desc = to_ioat_desc(ioat_chan->used_desc.prev);
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-                      ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
-               writel(((u64) desc->async_tx.phys) >> 32,
-                      ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
-
-               writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
-                       + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-               break;
-       case IOAT_VER_2_0:
-               writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-                      ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
-               writel(((u64) desc->async_tx.phys) >> 32,
-                      ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
-
-               /* tell the engine to go with what's left to be done */
-               writew(ioat_chan->dmacount,
-                      ioat_chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET);
-
-               break;
-       }
-       dev_err(&ioat_chan->device->pdev->dev,
-               "chan%d reset - %d descs waiting, %d total desc\n",
-               chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-
-       spin_unlock_bh(&ioat_chan->desc_lock);
-       spin_unlock_bh(&ioat_chan->cleanup_lock);
-}
-
-/**
- * ioat_dma_reset_channel - restart a channel
- * @ioat_chan: IOAT DMA channel handle
- */
-static void ioat_dma_reset_channel(struct ioat_dma_chan *ioat_chan)
-{
-       u32 chansts, chanerr;
-
-       if (!ioat_chan->used_desc.prev)
-               return;
-
-       chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-       chansts = (ioat_chan->completion_virt->low
-                                       & IOAT_CHANSTS_DMA_TRANSFER_STATUS);
-       if (chanerr) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "chan%d, CHANSTS = 0x%08x CHANERR = 0x%04x, clearing\n",
-                       chan_num(ioat_chan), chansts, chanerr);
-               writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-       }
-
-       /*
-        * whack it upside the head with a reset
-        * and wait for things to settle out.
-        * force the pending count to a really big negative
-        * to make sure no one forces an issue_pending
-        * while we're waiting.
-        */
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-       ioat_chan->pending = INT_MIN;
-       writeb(IOAT_CHANCMD_RESET,
-              ioat_chan->reg_base
-              + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       /* schedule the 2nd half instead of sleeping a long time */
-       schedule_delayed_work(&ioat_chan->work, RESET_DELAY);
-}
-
-/**
- * ioat_dma_chan_watchdog - watch for stuck channels
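- * @work: delayed work item embedded in the ioatdma_device being watched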
- */
-static void ioat_dma_chan_watchdog(struct work_struct *work)
-{
-       struct ioatdma_device *device =
-               container_of(work, struct ioatdma_device, work.work);
-       struct ioat_dma_chan *ioat_chan;
-       int i;
-
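-       /* snapshot of the CHANSTS register, read as two 32-bit halves */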
-       union {
-               u64 full;
-               struct {
-                       u32 low;
-                       u32 high;
-               };
-       } completion_hw;
-       unsigned long compl_desc_addr_hw;
-
-       for (i = 0; i < device->common.chancnt; i++) {
-               ioat_chan = ioat_lookup_chan_by_index(device, i);
-
-               if (ioat_chan->device->version == IOAT_VER_1_2
-                       /* have we started processing anything yet */
-                   && ioat_chan->last_completion
-                       /* have we completed any since last watchdog cycle? */
-                   && (ioat_chan->last_completion ==
-                               ioat_chan->watchdog_completion)
-                       /* has TCP stuck on one cookie since last watchdog? */
-                   && (ioat_chan->watchdog_tcp_cookie ==
-                               ioat_chan->watchdog_last_tcp_cookie)
-                   && (ioat_chan->watchdog_tcp_cookie !=
-                               ioat_chan->completed_cookie)
-                       /* is there something in the chain to be processed? */
-                       /* CB1 chain always has at least the last one processed */
-                   && (ioat_chan->used_desc.prev != ioat_chan->used_desc.next)
-                   && ioat_chan->pending == 0) {
-
-                       /*
-                        * check the CHANSTS register for the completed
-                        * descriptor address.
-                        * if it is different from the completion writeback,
-                        *     is not zero,
-                        *     and has changed since the last watchdog,
-                        * then the channel is still working correctly
-                        *     and the problem is in the completion writeback;
-                        *     update the completion writeback
-                        *     with the actual CHANSTS value
-                        * else
-                        *     try resetting the channel
-                        */
-
-                       completion_hw.low = readl(ioat_chan->reg_base +
-                               IOAT_CHANSTS_OFFSET_LOW(ioat_chan->device->version));
-                       completion_hw.high = readl(ioat_chan->reg_base +
-                               IOAT_CHANSTS_OFFSET_HIGH(ioat_chan->device->version));
-#if (BITS_PER_LONG == 64)
-                       compl_desc_addr_hw =
-                               completion_hw.full
-                               & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-#else
-                       compl_desc_addr_hw =
-                               completion_hw.low & IOAT_LOW_COMPLETION_MASK;
-#endif
-
-                       if ((compl_desc_addr_hw != 0)
-                          && (compl_desc_addr_hw != ioat_chan->watchdog_completion)
-                          && (compl_desc_addr_hw != ioat_chan->last_compl_desc_addr_hw)) {
-                               ioat_chan->last_compl_desc_addr_hw = compl_desc_addr_hw;
-                               ioat_chan->completion_virt->low = completion_hw.low;
-                               ioat_chan->completion_virt->high = completion_hw.high;
-                       } else {
-                               ioat_dma_reset_channel(ioat_chan);
-                               ioat_chan->watchdog_completion = 0;
-                               ioat_chan->last_compl_desc_addr_hw = 0;
-                       }
-
-               /*
-                * for version 2.0 if there are descriptors yet to be processed
-                * and the last completed hasn't changed since the last watchdog
-                *      if they haven't hit the pending level
-                *          issue the pending to push them through
-                *      else
-                *          try resetting the channel
-                */
-               } else if (ioat_chan->device->version == IOAT_VER_2_0
-                   && ioat_chan->used_desc.prev
-                   && ioat_chan->last_completion
-                   && ioat_chan->last_completion == ioat_chan->watchdog_completion) {
-
-                       if (ioat_chan->pending < ioat_pending_level)
-                               ioat2_dma_memcpy_issue_pending(&ioat_chan->common);
-                       else {
-                               ioat_dma_reset_channel(ioat_chan);
-                               ioat_chan->watchdog_completion = 0;
-                       }
-               } else {
-                       ioat_chan->last_compl_desc_addr_hw = 0;
-                       ioat_chan->watchdog_completion
-                                       = ioat_chan->last_completion;
-               }
-
-               ioat_chan->watchdog_last_tcp_cookie =
-                       ioat_chan->watchdog_tcp_cookie;
-       }
-
-       schedule_delayed_work(&device->work, WATCHDOG_DELAY);
-}
-
-static dma_cookie_t ioat1_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
-       struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
-       struct ioat_desc_sw *prev, *new;
-       struct ioat_dma_descriptor *hw;
-       dma_cookie_t cookie;
-       LIST_HEAD(new_chain);
-       u32 copy;
-       size_t len;
-       dma_addr_t src, dst;
-       unsigned long orig_flags;
-       unsigned int desc_count = 0;
-
-       /* src and dest and len are stored in the initial descriptor */
-       len = first->len;
-       src = first->src;
-       dst = first->dst;
-       orig_flags = first->async_tx.flags;
-       new = first;
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-       prev = to_ioat_desc(ioat_chan->used_desc.prev);
-       prefetch(prev->hw);
-       do {
-               copy = min_t(size_t, len, ioat_chan->xfercap);
-
-               async_tx_ack(&new->async_tx);
-
-               hw = new->hw;
-               hw->size = copy;
-               hw->ctl = 0;
-               hw->src_addr = src;
-               hw->dst_addr = dst;
-               hw->next = 0;
-
-               /* chain together the physical address list for the HW */
-               wmb();
-               prev->hw->next = (u64) new->async_tx.phys;
-
-               len -= copy;
-               dst += copy;
-               src += copy;
-
-               list_add_tail(&new->node, &new_chain);
-               desc_count++;
-               prev = new;
-       } while (len && (new = ioat1_dma_get_next_descriptor(ioat_chan)));
-
-       if (!new) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "tx submit failed\n");
-               spin_unlock_bh(&ioat_chan->desc_lock);
-               return -ENOMEM;
-       }
-
-       hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-       if (first->async_tx.callback) {
-               hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
-               if (first != new) {
-                       /* move callback into the last desc */
-                       new->async_tx.callback = first->async_tx.callback;
-                       new->async_tx.callback_param
-                                       = first->async_tx.callback_param;
-                       first->async_tx.callback = NULL;
-                       first->async_tx.callback_param = NULL;
-               }
-       }
-
-       new->tx_cnt = desc_count;
-       new->async_tx.flags = orig_flags; /* client is in control of this ack */
-
-       /* store the original values for use in later cleanup */
-       if (new != first) {
-               new->src = first->src;
-               new->dst = first->dst;
-               new->len = first->len;
-       }
-
-       /* cookie incr and addition to used_list must be atomic */
-       cookie = ioat_chan->common.cookie;
-       cookie++;
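-       /* negative cookies are reserved for error codes, so wrap back to 1 */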
-       if (cookie < 0)
-               cookie = 1;
-       ioat_chan->common.cookie = new->async_tx.cookie = cookie;
-
-       /* write address into NextDescriptor field of last desc in chain */
-       to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
-                                                       first->async_tx.phys;
-       list_splice_tail(&new_chain, &ioat_chan->used_desc);
-
-       ioat_chan->dmacount += desc_count;
-       ioat_chan->pending += desc_count;
-       if (ioat_chan->pending >= ioat_pending_level)
-               __ioat1_dma_memcpy_issue_pending(ioat_chan);
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       return cookie;
-}
-
-static dma_cookie_t ioat2_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
-       struct ioat_desc_sw *first = tx_to_ioat_desc(tx);
-       struct ioat_desc_sw *new;
-       struct ioat_dma_descriptor *hw;
-       dma_cookie_t cookie;
-       u32 copy;
-       size_t len;
-       dma_addr_t src, dst;
-       unsigned long orig_flags;
-       unsigned int desc_count = 0;
-
-       /* src and dest and len are stored in the initial descriptor */
-       len = first->len;
-       src = first->src;
-       dst = first->dst;
-       orig_flags = first->async_tx.flags;
-       new = first;
-
-       /*
-        * ioat_chan->desc_lock is still held in the version 2 path;
-        * it gets unlocked at the end of this function
-        */
-       do {
-               copy = min_t(size_t, len, ioat_chan->xfercap);
-
-               async_tx_ack(&new->async_tx);
-
-               hw = new->hw;
-               hw->size = copy;
-               hw->ctl = 0;
-               hw->src_addr = src;
-               hw->dst_addr = dst;
-
-               len -= copy;
-               dst += copy;
-               src += copy;
-               desc_count++;
-       } while (len && (new = ioat2_dma_get_next_descriptor(ioat_chan)));
-
-       if (!new) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "tx submit failed\n");
-               spin_unlock_bh(&ioat_chan->desc_lock);
-               return -ENOMEM;
-       }
-
-       hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-       if (first->async_tx.callback) {
-               hw->ctl |= IOAT_DMA_DESCRIPTOR_CTL_INT_GN;
-               if (first != new) {
-                       /* move callback into the last desc */
-                       new->async_tx.callback = first->async_tx.callback;
-                       new->async_tx.callback_param
-                                       = first->async_tx.callback_param;
-                       first->async_tx.callback = NULL;
-                       first->async_tx.callback_param = NULL;
-               }
-       }
-
-       new->tx_cnt = desc_count;
-       new->async_tx.flags = orig_flags; /* client is in control of this ack */
-
-       /* store the original values for use in later cleanup */
-       if (new != first) {
-               new->src = first->src;
-               new->dst = first->dst;
-               new->len = first->len;
-       }
-
-       /* cookie incr and addition to used_list must be atomic */
-       cookie = ioat_chan->common.cookie;
-       cookie++;
-       if (cookie < 0)
-               cookie = 1;
-       ioat_chan->common.cookie = new->async_tx.cookie = cookie;
-
-       ioat_chan->dmacount += desc_count;
-       ioat_chan->pending += desc_count;
-       if (ioat_chan->pending >= ioat_pending_level)
-               __ioat2_dma_memcpy_issue_pending(ioat_chan);
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       return cookie;
-}
-
-/**
- * ioat_dma_alloc_descriptor - allocate and return a sw and hw descriptor pair
- * @ioat_chan: the channel supplying the memory pool for the descriptors
- * @flags: allocation flags
- */
-static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
-                                       struct ioat_dma_chan *ioat_chan,
-                                       gfp_t flags)
-{
-       struct ioat_dma_descriptor *desc;
-       struct ioat_desc_sw *desc_sw;
-       struct ioatdma_device *ioatdma_device;
-       dma_addr_t phys;
-
-       ioatdma_device = to_ioatdma_device(ioat_chan->common.device);
-       desc = pci_pool_alloc(ioatdma_device->dma_pool, flags, &phys);
-       if (unlikely(!desc))
-               return NULL;
-
-       desc_sw = kzalloc(sizeof(*desc_sw), flags);
-       if (unlikely(!desc_sw)) {
-               pci_pool_free(ioatdma_device->dma_pool, desc, phys);
-               return NULL;
-       }
-
-       memset(desc, 0, sizeof(*desc));
-       dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common);
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               desc_sw->async_tx.tx_submit = ioat1_tx_submit;
-               break;
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               desc_sw->async_tx.tx_submit = ioat2_tx_submit;
-               break;
-       }
-
-       desc_sw->hw = desc;
-       desc_sw->async_tx.phys = phys;
-
-       return desc_sw;
-}
-
-static int ioat_initial_desc_count = 256;
-module_param(ioat_initial_desc_count, int, 0644);
-MODULE_PARM_DESC(ioat_initial_desc_count,
-                "initial descriptors per channel (default: 256)");
-
-/**
- * ioat2_dma_massage_chan_desc - link the descriptors into a circle
- * @ioat_chan: the channel to be massaged
- */
-static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan)
-{
-       struct ioat_desc_sw *desc, *_desc;
-
-       /* setup used_desc */
-       ioat_chan->used_desc.next = ioat_chan->free_desc.next;
-       ioat_chan->used_desc.prev = NULL;
-
-       /* pull free_desc out of the circle so that every node is a hw
-        * descriptor, but leave it pointing to the list
-        */
-       ioat_chan->free_desc.prev->next = ioat_chan->free_desc.next;
-       ioat_chan->free_desc.next->prev = ioat_chan->free_desc.prev;
-
-       /* circle link the hw descriptors */
-       desc = to_ioat_desc(ioat_chan->free_desc.next);
-       desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
-       list_for_each_entry_safe(desc, _desc, ioat_chan->free_desc.next, node) {
-               desc->hw->next = to_ioat_desc(desc->node.next)->async_tx.phys;
-       }
-}
-
-/**
- * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors
- * @chan: the channel to be filled out
- */
-static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-       struct ioat_desc_sw *desc;
-       u16 chanctrl;
-       u32 chanerr;
-       int i;
-       LIST_HEAD(tmp_list);
-
-       /* have we already been set up? */
-       if (!list_empty(&ioat_chan->free_desc))
-               return ioat_chan->desccount;
-
-       /* Setup register to interrupt and write completion status on error */
-       chanctrl = IOAT_CHANCTRL_ERR_INT_EN |
-               IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
-               IOAT_CHANCTRL_ERR_COMPLETION_EN;
-       writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
-
-       chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-       if (chanerr) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "CHANERR = %x, clearing\n", chanerr);
-               writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
-       }
-
-       /* Allocate descriptors */
-       for (i = 0; i < ioat_initial_desc_count; i++) {
-               desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
-               if (!desc) {
-                       dev_err(&ioat_chan->device->pdev->dev,
-                               "Only %d initial descriptors\n", i);
-                       break;
-               }
-               list_add_tail(&desc->node, &tmp_list);
-       }
-       spin_lock_bh(&ioat_chan->desc_lock);
-       ioat_chan->desccount = i;
-       list_splice(&tmp_list, &ioat_chan->free_desc);
-       if (ioat_chan->device->version != IOAT_VER_1_2)
-               ioat2_dma_massage_chan_desc(ioat_chan);
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       /* allocate a completion writeback area */
-       /* doing 2 32bit writes to mmio since 1 64b write doesn't work */
-       ioat_chan->completion_virt =
-               pci_pool_alloc(ioat_chan->device->completion_pool,
-                              GFP_KERNEL,
-                              &ioat_chan->completion_addr);
-       memset(ioat_chan->completion_virt, 0,
-              sizeof(*ioat_chan->completion_virt));
-       writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF,
-              ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW);
-       writel(((u64) ioat_chan->completion_addr) >> 32,
-              ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH);
-
-       tasklet_enable(&ioat_chan->cleanup_task);
-       ioat_dma_start_null_desc(ioat_chan);  /* give chain to dma device */
-       return ioat_chan->desccount;
-}
-
-/**
- * ioat_dma_free_chan_resources - release all the descriptors
- * @chan: the channel to be cleaned
- */
-static void ioat_dma_free_chan_resources(struct dma_chan *chan)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-       struct ioatdma_device *ioatdma_device = to_ioatdma_device(chan->device);
-       struct ioat_desc_sw *desc, *_desc;
-       int in_use_descs = 0;
-
-       /* Before freeing channel resources first check
-        * if they have been previously allocated for this channel.
-        */
-       if (ioat_chan->desccount == 0)
-               return;
-
-       tasklet_disable(&ioat_chan->cleanup_task);
-       ioat_dma_memcpy_cleanup(ioat_chan);
-
-       /* Delay 100ms after reset to allow internal DMA logic to quiesce
-        * before removing DMA descriptor resources.
-        */
-       writeb(IOAT_CHANCMD_RESET,
-              ioat_chan->reg_base
-                       + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-       mdelay(100);
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               list_for_each_entry_safe(desc, _desc,
-                                        &ioat_chan->used_desc, node) {
-                       in_use_descs++;
-                       list_del(&desc->node);
-                       pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-                                     desc->async_tx.phys);
-                       kfree(desc);
-               }
-               list_for_each_entry_safe(desc, _desc,
-                                        &ioat_chan->free_desc, node) {
-                       list_del(&desc->node);
-                       pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-                                     desc->async_tx.phys);
-                       kfree(desc);
-               }
-               break;
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               list_for_each_entry_safe(desc, _desc,
-                                        ioat_chan->free_desc.next, node) {
-                       list_del(&desc->node);
-                       pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-                                     desc->async_tx.phys);
-                       kfree(desc);
-               }
-               desc = to_ioat_desc(ioat_chan->free_desc.next);
-               pci_pool_free(ioatdma_device->dma_pool, desc->hw,
-                             desc->async_tx.phys);
-               kfree(desc);
-               INIT_LIST_HEAD(&ioat_chan->free_desc);
-               INIT_LIST_HEAD(&ioat_chan->used_desc);
-               break;
-       }
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       pci_pool_free(ioatdma_device->completion_pool,
-                     ioat_chan->completion_virt,
-                     ioat_chan->completion_addr);
-
-       /* one is ok since we left it on there on purpose */
-       if (in_use_descs > 1)
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "Freeing %d in use descriptors!\n",
-                       in_use_descs - 1);
-
-       ioat_chan->last_completion = ioat_chan->completion_addr = 0;
-       ioat_chan->pending = 0;
-       ioat_chan->dmacount = 0;
-       ioat_chan->desccount = 0;
-       ioat_chan->watchdog_completion = 0;
-       ioat_chan->last_compl_desc_addr_hw = 0;
-       ioat_chan->watchdog_tcp_cookie =
-               ioat_chan->watchdog_last_tcp_cookie = 0;
-}
-
-/**
- * ioat1_dma_get_next_descriptor - return the next available descriptor
- * @ioat_chan: IOAT DMA channel handle
- *
- * Gets the next descriptor from the chain, and must be called with the
- * channel's desc_lock held.  Allocates more descriptors if the channel
- * has run out.
- */
-static struct ioat_desc_sw *
-ioat1_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
-{
-       struct ioat_desc_sw *new;
-
-       if (!list_empty(&ioat_chan->free_desc)) {
-               new = to_ioat_desc(ioat_chan->free_desc.next);
-               list_del(&new->node);
-       } else {
-               /* try to get another desc */
-               new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
-               if (!new) {
-                       dev_err(&ioat_chan->device->pdev->dev,
-                               "alloc failed\n");
-                       return NULL;
-               }
-       }
-
-       prefetch(new->hw);
-       return new;
-}
-
-static struct ioat_desc_sw *
-ioat2_dma_get_next_descriptor(struct ioat_dma_chan *ioat_chan)
-{
-       struct ioat_desc_sw *new;
-
-       /*
-        * used.prev points to where to start processing
-        * used.next points to next free descriptor
-        * if used.prev == NULL, there are none waiting to be processed
-        * if used.next == used.prev.prev, there is only one free descriptor,
-        *      and we need to use it as a noop descriptor before
-        *      linking in a new set of descriptors, since the device
-        *      has probably already read the pointer to it
-        */
-       if (ioat_chan->used_desc.prev &&
-           ioat_chan->used_desc.next == ioat_chan->used_desc.prev->prev) {
-
-               struct ioat_desc_sw *desc;
-               struct ioat_desc_sw *noop_desc;
-               int i;
-
-               /* set up the noop descriptor */
-               noop_desc = to_ioat_desc(ioat_chan->used_desc.next);
-               /* set size to non-zero value (channel returns error when size is 0) */
-               noop_desc->hw->size = NULL_DESC_BUFFER_SIZE;
-               noop_desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
-               noop_desc->hw->src_addr = 0;
-               noop_desc->hw->dst_addr = 0;
-
-               ioat_chan->used_desc.next = ioat_chan->used_desc.next->next;
-               ioat_chan->pending++;
-               ioat_chan->dmacount++;
-
-               /* try to get a few more descriptors */
-               for (i = 16; i; i--) {
-                       desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
-                       if (!desc) {
-                               dev_err(&ioat_chan->device->pdev->dev,
-                                       "alloc failed\n");
-                               break;
-                       }
-                       list_add_tail(&desc->node, ioat_chan->used_desc.next);
-
-                       desc->hw->next
-                               = to_ioat_desc(desc->node.next)->async_tx.phys;
-                       to_ioat_desc(desc->node.prev)->hw->next
-                               = desc->async_tx.phys;
-                       ioat_chan->desccount++;
-               }
-
-               ioat_chan->used_desc.next = noop_desc->node.next;
-       }
-       new = to_ioat_desc(ioat_chan->used_desc.next);
-       prefetch(new);
-       ioat_chan->used_desc.next = new->node.next;
-
-       if (ioat_chan->used_desc.prev == NULL)
-               ioat_chan->used_desc.prev = &new->node;
-
-       prefetch(new->hw);
-       return new;
-}
-
-static struct ioat_desc_sw *ioat_dma_get_next_descriptor(
-                                               struct ioat_dma_chan *ioat_chan)
-{
-       if (!ioat_chan)
-               return NULL;
-
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               return ioat1_dma_get_next_descriptor(ioat_chan);
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               return ioat2_dma_get_next_descriptor(ioat_chan);
-       }
-       return NULL;
-}
-
-static struct dma_async_tx_descriptor *ioat1_dma_prep_memcpy(
-                                               struct dma_chan *chan,
-                                               dma_addr_t dma_dest,
-                                               dma_addr_t dma_src,
-                                               size_t len,
-                                               unsigned long flags)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-       struct ioat_desc_sw *new;
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-       new = ioat_dma_get_next_descriptor(ioat_chan);
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       if (new) {
-               new->len = len;
-               new->dst = dma_dest;
-               new->src = dma_src;
-               new->async_tx.flags = flags;
-               return &new->async_tx;
-       } else {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
-                       chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-               return NULL;
-       }
-}
-
-static struct dma_async_tx_descriptor *ioat2_dma_prep_memcpy(
-                                               struct dma_chan *chan,
-                                               dma_addr_t dma_dest,
-                                               dma_addr_t dma_src,
-                                               size_t len,
-                                               unsigned long flags)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-       struct ioat_desc_sw *new;
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-       new = ioat2_dma_get_next_descriptor(ioat_chan);
-
-       /*
-        * leave ioat_chan->desc_lock held in the ioat 2 path;
-        * it will get unlocked at the end of tx_submit
-        */
-
-       if (new) {
-               new->len = len;
-               new->dst = dma_dest;
-               new->src = dma_src;
-               new->async_tx.flags = flags;
-               return &new->async_tx;
-       } else {
-               spin_unlock_bh(&ioat_chan->desc_lock);
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "chan%d - get_next_desc failed: %d descs waiting, %d total desc\n",
-                       chan_num(ioat_chan), ioat_chan->dmacount, ioat_chan->desccount);
-               return NULL;
-       }
-}
-
-static void ioat_dma_cleanup_tasklet(unsigned long data)
-{
-       struct ioat_dma_chan *chan = (void *)data;
-       ioat_dma_memcpy_cleanup(chan);
-       writew(IOAT_CHANCTRL_INT_DISABLE,
-              chan->reg_base + IOAT_CHANCTRL_OFFSET);
-}
-
-static void
-ioat_dma_unmap(struct ioat_dma_chan *ioat_chan, struct ioat_desc_sw *desc)
-{
-       if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               if (desc->async_tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                       pci_unmap_single(ioat_chan->device->pdev,
-                                        pci_unmap_addr(desc, dst),
-                                        pci_unmap_len(desc, len),
-                                        PCI_DMA_FROMDEVICE);
-               else
-                       pci_unmap_page(ioat_chan->device->pdev,
-                                      pci_unmap_addr(desc, dst),
-                                      pci_unmap_len(desc, len),
-                                      PCI_DMA_FROMDEVICE);
-       }
-
-       if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               if (desc->async_tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                       pci_unmap_single(ioat_chan->device->pdev,
-                                        pci_unmap_addr(desc, src),
-                                        pci_unmap_len(desc, len),
-                                        PCI_DMA_TODEVICE);
-               else
-                       pci_unmap_page(ioat_chan->device->pdev,
-                                      pci_unmap_addr(desc, src),
-                                      pci_unmap_len(desc, len),
-                                      PCI_DMA_TODEVICE);
-       }
-}
-
-/**
- * ioat_dma_memcpy_cleanup - clean up finished descriptors
- * @chan: ioat channel to be cleaned up
- */
-static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan)
-{
-       unsigned long phys_complete;
-       struct ioat_desc_sw *desc, *_desc;
-       dma_cookie_t cookie = 0;
-       unsigned long desc_phys;
-       struct ioat_desc_sw *latest_desc;
-
-       prefetch(ioat_chan->completion_virt);
-
-       if (!spin_trylock_bh(&ioat_chan->cleanup_lock))
-               return;
-
-       /* The completion writeback can happen at any time,
-          so reads by the driver need to be atomic operations.
-          The descriptor physical addresses are limited to 32 bits
-          when the CPU can only do a 32-bit mov */
-
-#if (BITS_PER_LONG == 64)
-       phys_complete =
-               ioat_chan->completion_virt->full
-               & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-#else
-       phys_complete =
-               ioat_chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK;
-#endif
-
-       if ((ioat_chan->completion_virt->full
-               & IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
-                               IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "Channel halted, chanerr = %x\n",
-                       readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET));
-
-               /* TODO do something to salvage the situation */
-       }
-
-       if (phys_complete == ioat_chan->last_completion) {
-               spin_unlock_bh(&ioat_chan->cleanup_lock);
-               /*
-                * perhaps we're stuck so hard that the watchdog can't go off?
-                * try to catch it after 2 seconds
-                */
-               if (ioat_chan->device->version != IOAT_VER_3_0) {
-                       if (time_after(jiffies,
-                                      ioat_chan->last_completion_time + HZ*WATCHDOG_DELAY)) {
-                               ioat_dma_chan_watchdog(&(ioat_chan->device->work.work));
-                               ioat_chan->last_completion_time = jiffies;
-                       }
-               }
-               return;
-       }
-       ioat_chan->last_completion_time = jiffies;
-
-       cookie = 0;
-       if (!spin_trylock_bh(&ioat_chan->desc_lock)) {
-               spin_unlock_bh(&ioat_chan->cleanup_lock);
-               return;
-       }
-
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               list_for_each_entry_safe(desc, _desc,
-                                        &ioat_chan->used_desc, node) {
-
-                       /*
-                        * Incoming DMA requests may use multiple descriptors,
-                        * due to exceeding xfercap, perhaps. If so, only the
-                        * last one will have a cookie, and require unmapping.
-                        */
-                       if (desc->async_tx.cookie) {
-                               cookie = desc->async_tx.cookie;
-                               ioat_dma_unmap(ioat_chan, desc);
-                               if (desc->async_tx.callback) {
-                                       desc->async_tx.callback(desc->async_tx.callback_param);
-                                       desc->async_tx.callback = NULL;
-                               }
-                       }
-
-                       if (desc->async_tx.phys != phys_complete) {
-                               /*
-                                * a completed entry, but not the last, so clean
-                                * up if the client is done with the descriptor
-                                */
-                               if (async_tx_test_ack(&desc->async_tx)) {
-                                       list_move_tail(&desc->node,
-                                                      &ioat_chan->free_desc);
-                               } else
-                                       desc->async_tx.cookie = 0;
-                       } else {
-                               /*
-                                * last used desc. Do not remove, so we can
-                                * append from it, but don't look at it next
-                                * time, either
-                                */
-                               desc->async_tx.cookie = 0;
-
-                               /* TODO check status bits? */
-                               break;
-                       }
-               }
-               break;
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               /* has some other thread already cleaned up? */
-               if (ioat_chan->used_desc.prev == NULL)
-                       break;
-
-               /* work backwards to find latest finished desc */
-               desc = to_ioat_desc(ioat_chan->used_desc.next);
-               latest_desc = NULL;
-               do {
-                       desc = to_ioat_desc(desc->node.prev);
-                       desc_phys = (unsigned long)desc->async_tx.phys
-                                      & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
-                       if (desc_phys == phys_complete) {
-                               latest_desc = desc;
-                               break;
-                       }
-               } while (&desc->node != ioat_chan->used_desc.prev);
-
-               if (latest_desc != NULL) {
-
-                       /* work forwards to clear finished descriptors */
-                       for (desc = to_ioat_desc(ioat_chan->used_desc.prev);
-                            &desc->node != latest_desc->node.next &&
-                            &desc->node != ioat_chan->used_desc.next;
-                            desc = to_ioat_desc(desc->node.next)) {
-                               if (desc->async_tx.cookie) {
-                                       cookie = desc->async_tx.cookie;
-                                       desc->async_tx.cookie = 0;
-                                       ioat_dma_unmap(ioat_chan, desc);
-                                       if (desc->async_tx.callback) {
-                                               desc->async_tx.callback(desc->async_tx.callback_param);
-                                               desc->async_tx.callback = NULL;
-                                       }
-                               }
-                       }
-
-                       /* move used.prev up beyond those that are finished */
-                       if (&desc->node == ioat_chan->used_desc.next)
-                               ioat_chan->used_desc.prev = NULL;
-                       else
-                               ioat_chan->used_desc.prev = &desc->node;
-               }
-               break;
-       }
-
-       spin_unlock_bh(&ioat_chan->desc_lock);
-
-       ioat_chan->last_completion = phys_complete;
-       if (cookie != 0)
-               ioat_chan->completed_cookie = cookie;
-
-       spin_unlock_bh(&ioat_chan->cleanup_lock);
-}
-
-/**
- * ioat_dma_is_complete - poll the status of an IOAT DMA transaction
- * @chan: IOAT DMA channel handle
- * @cookie: DMA transaction identifier
- * @done: if not %NULL, updated with last completed transaction
- * @used: if not %NULL, updated with last used transaction
- */
-static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
-                                           dma_cookie_t cookie,
-                                           dma_cookie_t *done,
-                                           dma_cookie_t *used)
-{
-       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-       dma_cookie_t last_used;
-       dma_cookie_t last_complete;
-       enum dma_status ret;
-
-       last_used = chan->cookie;
-       last_complete = ioat_chan->completed_cookie;
-       ioat_chan->watchdog_tcp_cookie = cookie;
-
-       if (done)
-               *done = last_complete;
-       if (used)
-               *used = last_used;
-
-       ret = dma_async_is_complete(cookie, last_complete, last_used);
-       if (ret == DMA_SUCCESS)
-               return ret;
-
-       ioat_dma_memcpy_cleanup(ioat_chan);
-
-       last_used = chan->cookie;
-       last_complete = ioat_chan->completed_cookie;
-
-       if (done)
-               *done = last_complete;
-       if (used)
-               *used = last_used;
-
-       return dma_async_is_complete(cookie, last_complete, last_used);
-}
-
-static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan)
-{
-       struct ioat_desc_sw *desc;
-
-       spin_lock_bh(&ioat_chan->desc_lock);
-
-       desc = ioat_dma_get_next_descriptor(ioat_chan);
-
-       if (!desc) {
-               dev_err(&ioat_chan->device->pdev->dev,
-                       "Unable to start null desc - get next desc failed\n");
-               spin_unlock_bh(&ioat_chan->desc_lock);
-               return;
-       }
-
-       desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL
-                               | IOAT_DMA_DESCRIPTOR_CTL_INT_GN
-                               | IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-       /* set size to non-zero value (channel returns error when size is 0) */
-       desc->hw->size = NULL_DESC_BUFFER_SIZE;
-       desc->hw->src_addr = 0;
-       desc->hw->dst_addr = 0;
-       async_tx_ack(&desc->async_tx);
-       switch (ioat_chan->device->version) {
-       case IOAT_VER_1_2:
-               desc->hw->next = 0;
-               list_add_tail(&desc->node, &ioat_chan->used_desc);
-
-               writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-                      ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_LOW);
-               writel(((u64) desc->async_tx.phys) >> 32,
-                      ioat_chan->reg_base + IOAT1_CHAINADDR_OFFSET_HIGH);
-
-               writeb(IOAT_CHANCMD_START, ioat_chan->reg_base
-                       + IOAT_CHANCMD_OFFSET(ioat_chan->device->version));
-               break;
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
-                      ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_LOW);
-               writel(((u64) desc->async_tx.phys) >> 32,
-                      ioat_chan->reg_base + IOAT2_CHAINADDR_OFFSET_HIGH);
-
-               ioat_chan->dmacount++;
-               __ioat2_dma_memcpy_issue_pending(ioat_chan);
-               break;
-       }
-       spin_unlock_bh(&ioat_chan->desc_lock);
-}
-
-/*
- * Perform an IOAT transaction to verify the HW works.
- */
-#define IOAT_TEST_SIZE 2000
-
-static void ioat_dma_test_callback(void *dma_async_param)
-{
-       struct completion *cmp = dma_async_param;
-
-       complete(cmp);
-}
-
-/**
- * ioat_dma_self_test - Perform an IOAT transaction to verify the HW works.
- * @device: device to be tested
- */
-static int ioat_dma_self_test(struct ioatdma_device *device)
-{
-       int i;
-       u8 *src;
-       u8 *dest;
-       struct dma_chan *dma_chan;
-       struct dma_async_tx_descriptor *tx;
-       dma_addr_t dma_dest, dma_src;
-       dma_cookie_t cookie;
-       int err = 0;
-       struct completion cmp;
-       unsigned long tmo;
-       unsigned long flags;
-
-       src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
-       if (!src)
-               return -ENOMEM;
-       dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL);
-       if (!dest) {
-               kfree(src);
-               return -ENOMEM;
-       }
-
-       /* Fill in src buffer */
-       for (i = 0; i < IOAT_TEST_SIZE; i++)
-               src[i] = (u8)i;
-
-       /* Start copy, using first DMA channel */
-       dma_chan = container_of(device->common.channels.next,
-                               struct dma_chan,
-                               device_node);
-       if (device->common.device_alloc_chan_resources(dma_chan) < 1) {
-               dev_err(&device->pdev->dev,
-                       "selftest cannot allocate chan resource\n");
-               err = -ENODEV;
-               goto out;
-       }
-
-       dma_src = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE,
-                                DMA_TO_DEVICE);
-       dma_dest = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE,
-                                 DMA_FROM_DEVICE);
-       flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_COMPL_DEST_UNMAP_SINGLE;
-       tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
-                                                  IOAT_TEST_SIZE, flags);
-       if (!tx) {
-               dev_err(&device->pdev->dev,
-                       "Self-test prep failed, disabling\n");
-               err = -ENODEV;
-               goto free_resources;
-       }
-
-       async_tx_ack(tx);
-       init_completion(&cmp);
-       tx->callback = ioat_dma_test_callback;
-       tx->callback_param = &cmp;
-       cookie = tx->tx_submit(tx);
-       if (cookie < 0) {
-               dev_err(&device->pdev->dev,
-                       "Self-test setup failed, disabling\n");
-               err = -ENODEV;
-               goto free_resources;
-       }
-       device->common.device_issue_pending(dma_chan);
-
-       tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
-
-       if (tmo == 0 ||
-           device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL)
-                                       != DMA_SUCCESS) {
-               dev_err(&device->pdev->dev,
-                       "Self-test copy timed out, disabling\n");
-               err = -ENODEV;
-               goto free_resources;
-       }
-       if (memcmp(src, dest, IOAT_TEST_SIZE)) {
-               dev_err(&device->pdev->dev,
-                       "Self-test copy failed compare, disabling\n");
-               err = -ENODEV;
-               goto free_resources;
-       }
-
-free_resources:
-       device->common.device_free_chan_resources(dma_chan);
-out:
-       kfree(src);
-       kfree(dest);
-       return err;
-}
-
-static char ioat_interrupt_style[32] = "msix";
-module_param_string(ioat_interrupt_style, ioat_interrupt_style,
-                   sizeof(ioat_interrupt_style), 0644);
-MODULE_PARM_DESC(ioat_interrupt_style,
-                "set ioat interrupt style: msix (default), "
-                "msix-single-vector, msi, intx");
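-
-/*
- * Example (assumes the driver is built as the ioatdma module):
- *
- *     modprobe ioatdma ioat_interrupt_style=msi
- */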
-
-/**
- * ioat_dma_setup_interrupts - setup interrupt handler
- * @device: ioat device
- */
-static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
-{
-       struct ioat_dma_chan *ioat_chan;
-       int err, i, j, msixcnt;
-       u8 intrctrl = 0;
-
-       if (!strcmp(ioat_interrupt_style, "msix"))
-               goto msix;
-       if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
-               goto msix_single_vector;
-       if (!strcmp(ioat_interrupt_style, "msi"))
-               goto msi;
-       if (!strcmp(ioat_interrupt_style, "intx"))
-               goto intx;
-       dev_err(&device->pdev->dev, "invalid ioat_interrupt_style %s\n",
-               ioat_interrupt_style);
-       goto err_no_irq;
-
-msix:
-       /* The number of MSI-X vectors should equal the number of channels */
-       msixcnt = device->common.chancnt;
-       for (i = 0; i < msixcnt; i++)
-               device->msix_entries[i].entry = i;
-
-       err = pci_enable_msix(device->pdev, device->msix_entries, msixcnt);
-       if (err < 0)
-               goto msi;
-       if (err > 0)
-               goto msix_single_vector;
-
-       for (i = 0; i < msixcnt; i++) {
-               ioat_chan = ioat_lookup_chan_by_index(device, i);
-               err = request_irq(device->msix_entries[i].vector,
-                                 ioat_dma_do_interrupt_msix,
-                                 0, "ioat-msix", ioat_chan);
-               if (err) {
-                       for (j = 0; j < i; j++) {
-                               ioat_chan =
-                                       ioat_lookup_chan_by_index(device, j);
-                               free_irq(device->msix_entries[j].vector,
-                                        ioat_chan);
-                       }
-                       goto msix_single_vector;
-               }
-       }
-       intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
-       device->irq_mode = msix_multi_vector;
-       goto done;
-
-msix_single_vector:
-       device->msix_entries[0].entry = 0;
-       err = pci_enable_msix(device->pdev, device->msix_entries, 1);
-       if (err)
-               goto msi;
-
-       err = request_irq(device->msix_entries[0].vector, ioat_dma_do_interrupt,
-                         0, "ioat-msix", device);
-       if (err) {
-               pci_disable_msix(device->pdev);
-               goto msi;
-       }
-       device->irq_mode = msix_single_vector;
-       goto done;
-
-msi:
-       err = pci_enable_msi(device->pdev);
-       if (err)
-               goto intx;
-
-       err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
-                         0, "ioat-msi", device);
-       if (err) {
-               pci_disable_msi(device->pdev);
-               goto intx;
-       }
-       /*
-        * CB 1.2 devices need a bit set in configuration space to enable MSI
-        */
-       if (device->version == IOAT_VER_1_2) {
-               u32 dmactrl;
-               pci_read_config_dword(device->pdev,
-                                     IOAT_PCI_DMACTRL_OFFSET, &dmactrl);
-               dmactrl |= IOAT_PCI_DMACTRL_MSI_EN;
-               pci_write_config_dword(device->pdev,
-                                      IOAT_PCI_DMACTRL_OFFSET, dmactrl);
-       }
-       device->irq_mode = msi;
-       goto done;
-
-intx:
-       err = request_irq(device->pdev->irq, ioat_dma_do_interrupt,
-                         IRQF_SHARED, "ioat-intx", device);
-       if (err)
-               goto err_no_irq;
-       device->irq_mode = intx;
-
-done:
-       intrctrl |= IOAT_INTRCTRL_MASTER_INT_EN;
-       writeb(intrctrl, device->reg_base + IOAT_INTRCTRL_OFFSET);
-       return 0;
-
-err_no_irq:
-       /* Disable all interrupt generation */
-       writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
-       dev_err(&device->pdev->dev, "no usable interrupts\n");
-       device->irq_mode = none;
-       return -1;
-}
-
-/**
- * ioat_dma_remove_interrupts - remove whatever interrupts were set
- * @device: ioat device
- */
-static void ioat_dma_remove_interrupts(struct ioatdma_device *device)
-{
-       struct ioat_dma_chan *ioat_chan;
-       int i;
-
-       /* Disable all interrupt generation */
-       writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
-
-       switch (device->irq_mode) {
-       case msix_multi_vector:
-               for (i = 0; i < device->common.chancnt; i++) {
-                       ioat_chan = ioat_lookup_chan_by_index(device, i);
-                       free_irq(device->msix_entries[i].vector, ioat_chan);
-               }
-               pci_disable_msix(device->pdev);
-               break;
-       case msix_single_vector:
-               free_irq(device->msix_entries[0].vector, device);
-               pci_disable_msix(device->pdev);
-               break;
-       case msi:
-               free_irq(device->pdev->irq, device);
-               pci_disable_msi(device->pdev);
-               break;
-       case intx:
-               free_irq(device->pdev->irq, device);
-               break;
-       case none:
-               dev_warn(&device->pdev->dev,
-                        "call to %s without interrupts setup\n", __func__);
-       }
-       device->irq_mode = none;
-}
-
-struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
-                                     void __iomem *iobase)
-{
-       int err;
-       struct ioatdma_device *device;
-
-       device = kzalloc(sizeof(*device), GFP_KERNEL);
-       if (!device) {
-               err = -ENOMEM;
-               goto err_kzalloc;
-       }
-       device->pdev = pdev;
-       device->reg_base = iobase;
-       device->version = readb(device->reg_base + IOAT_VER_OFFSET);
-
-       /* DMA coherent memory pool for DMA descriptor allocations */
-       device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
-                                          sizeof(struct ioat_dma_descriptor),
-                                          64, 0);
-       if (!device->dma_pool) {
-               err = -ENOMEM;
-               goto err_dma_pool;
-       }
-
-       device->completion_pool = pci_pool_create("completion_pool", pdev,
-                                                 sizeof(u64), SMP_CACHE_BYTES,
-                                                 SMP_CACHE_BYTES);
-       if (!device->completion_pool) {
-               err = -ENOMEM;
-               goto err_completion_pool;
-       }
-
-       INIT_LIST_HEAD(&device->common.channels);
-       ioat_dma_enumerate_channels(device);
-
-       device->common.device_alloc_chan_resources =
-                                               ioat_dma_alloc_chan_resources;
-       device->common.device_free_chan_resources =
-                                               ioat_dma_free_chan_resources;
-       device->common.dev = &pdev->dev;
-
-       dma_cap_set(DMA_MEMCPY, device->common.cap_mask);
-       device->common.device_is_tx_complete = ioat_dma_is_complete;
-       switch (device->version) {
-       case IOAT_VER_1_2:
-               device->common.device_prep_dma_memcpy = ioat1_dma_prep_memcpy;
-               device->common.device_issue_pending =
-                                               ioat1_dma_memcpy_issue_pending;
-               break;
-       case IOAT_VER_2_0:
-       case IOAT_VER_3_0:
-               device->common.device_prep_dma_memcpy = ioat2_dma_prep_memcpy;
-               device->common.device_issue_pending =
-                                               ioat2_dma_memcpy_issue_pending;
-               break;
-       }
-
-       dev_err(&device->pdev->dev,
-               "Intel(R) I/OAT DMA Engine found,"
-               " %d channels, device version 0x%02x, driver version %s\n",
-               device->common.chancnt, device->version, IOAT_DMA_VERSION);
-
-       if (!device->common.chancnt) {
-               dev_err(&device->pdev->dev,
-                       "Intel(R) I/OAT DMA Engine problem found: "
-                       "zero channels detected\n");
-               goto err_setup_interrupts;
-       }
-
-       err = ioat_dma_setup_interrupts(device);
-       if (err)
-               goto err_setup_interrupts;
-
-       err = ioat_dma_self_test(device);
-       if (err)
-               goto err_self_test;
-
-       ioat_set_tcp_copy_break(device);
-
-       dma_async_device_register(&device->common);
-
-       if (device->version != IOAT_VER_3_0) {
-               INIT_DELAYED_WORK(&device->work, ioat_dma_chan_watchdog);
-               schedule_delayed_work(&device->work,
-                                     WATCHDOG_DELAY);
-       }
-
-       return device;
-
-err_self_test:
-       ioat_dma_remove_interrupts(device);
-err_setup_interrupts:
-       pci_pool_destroy(device->completion_pool);
-err_completion_pool:
-       pci_pool_destroy(device->dma_pool);
-err_dma_pool:
-       kfree(device);
-err_kzalloc:
-       dev_err(&pdev->dev,
-               "Intel(R) I/OAT DMA Engine initialization failed\n");
-       return NULL;
-}
-
-void ioat_dma_remove(struct ioatdma_device *device)
-{
-       struct dma_chan *chan, *_chan;
-       struct ioat_dma_chan *ioat_chan;
-
-       if (device->version != IOAT_VER_3_0)
-               cancel_delayed_work(&device->work);
-
-       ioat_dma_remove_interrupts(device);
-
-       dma_async_device_unregister(&device->common);
-
-       pci_pool_destroy(device->dma_pool);
-       pci_pool_destroy(device->completion_pool);
-
-       iounmap(device->reg_base);
-       pci_release_regions(device->pdev);
-       pci_disable_device(device->pdev);
-
-       list_for_each_entry_safe(chan, _chan,
-                                &device->common.channels, device_node) {
-               ioat_chan = to_ioat_chan(chan);
-               list_del(&chan->device_node);
-               kfree(ioat_chan);
-       }
-       kfree(device);
-}
-
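
The removed ioat_dma_setup_interrupts() above walks the interrupt modes in a
fixed order -- per-channel MSI-X, single-vector MSI-X, MSI, then legacy INTx --
and records the chosen mode in device->irq_mode so ioat_dma_remove_interrupts()
can tear down exactly what was set up. A condensed stand-alone sketch of that
fallback order (the two MSI-X variants are collapsed into one step, and the
try_*() helpers are illustrative stand-ins, not kernel APIs):

    #include <stdio.h>

    enum irq_mode { none, msix, msi, intx };

    static int try_msix(void) { return -1; } /* pretend MSI-X is unavailable */
    static int try_msi(void)  { return 0; }  /* pretend MSI works */
    static int try_intx(void) { return 0; }

    static enum irq_mode setup_interrupts(void)
    {
            if (try_msix() == 0)
                    return msix;
            if (try_msi() == 0)     /* fall back in the same order as the driver */
                    return msi;
            if (try_intx() == 0)
                    return intx;
            return none;            /* no usable interrupts */
    }

    int main(void)
    {
            static const char *names[] = { "none", "msix", "msi", "intx" };

            printf("selected irq mode: %s\n", names[setup_interrupts()]);
            return 0;
    }
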
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
deleted file mode 100644 (file)
index a52ff4b..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-#ifndef IOATDMA_H
-#define IOATDMA_H
-
-#include <linux/dmaengine.h>
-#include "ioatdma_hw.h"
-#include <linux/init.h>
-#include <linux/dmapool.h>
-#include <linux/cache.h>
-#include <linux/pci_ids.h>
-#include <net/tcp.h>
-
-#define IOAT_DMA_VERSION  "3.64"
-
-enum ioat_interrupt {
-       none = 0,
-       msix_multi_vector = 1,
-       msix_single_vector = 2,
-       msi = 3,
-       intx = 4,
-};
-
-#define IOAT_LOW_COMPLETION_MASK       0xffffffc0
-#define IOAT_DMA_DCA_ANY_CPU           ~0
-#define IOAT_WATCHDOG_PERIOD           (2 * HZ)
-
-
-/**
- * struct ioatdma_device - internal representation of a IOAT device
- * @pdev: PCI-Express device
- * @reg_base: MMIO register space base address
- * @dma_pool: for allocating DMA descriptors
- * @common: embedded struct dma_device
- * @version: version of ioatdma device
- * @irq_mode: which style irq to use
- * @msix_entries: irq handlers
- * @idx: per channel data
- */
-
-struct ioatdma_device {
-       struct pci_dev *pdev;
-       void __iomem *reg_base;
-       struct pci_pool *dma_pool;
-       struct pci_pool *completion_pool;
-       struct dma_device common;
-       u8 version;
-       enum ioat_interrupt irq_mode;
-       struct delayed_work work;
-       struct msix_entry msix_entries[4];
-       struct ioat_dma_chan *idx[4];
-};
-
-/**
- * struct ioat_dma_chan - internal representation of a DMA channel
- */
-struct ioat_dma_chan {
-
-       void __iomem *reg_base;
-
-       dma_cookie_t completed_cookie;
-       unsigned long last_completion;
-       unsigned long last_completion_time;
-
-       size_t xfercap; /* XFERCAP register value expanded out */
-
-       spinlock_t cleanup_lock;
-       spinlock_t desc_lock;
-       struct list_head free_desc;
-       struct list_head used_desc;
-       unsigned long watchdog_completion;
-       int watchdog_tcp_cookie;
-       u32 watchdog_last_tcp_cookie;
-       struct delayed_work work;
-
-       int pending;
-       int dmacount;
-       int desccount;
-
-       struct ioatdma_device *device;
-       struct dma_chan common;
-
-       dma_addr_t completion_addr;
-       union {
-               u64 full; /* HW completion writeback */
-               struct {
-                       u32 low;
-                       u32 high;
-               };
-       } *completion_virt;
-       unsigned long last_compl_desc_addr_hw;
-       struct tasklet_struct cleanup_task;
-};
-
-/* wrapper around hardware descriptor format + additional software fields */
-
-/**
- * struct ioat_desc_sw - wrapper around hardware descriptor
- * @hw: hardware DMA descriptor
- * @node: this descriptor will either be on the free list,
- *     or attached to a transaction list (async_tx.tx_list)
- * @tx_cnt: number of descriptors required to complete the transaction
- * @async_tx: the generic software descriptor for all engines
- */
-struct ioat_desc_sw {
-       struct ioat_dma_descriptor *hw;
-       struct list_head node;
-       int tx_cnt;
-       size_t len;
-       dma_addr_t src;
-       dma_addr_t dst;
-       struct dma_async_tx_descriptor async_tx;
-};
-
-static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev)
-{
-       #ifdef CONFIG_NET_DMA
-       switch (dev->version) {
-       case IOAT_VER_1_2:
-               sysctl_tcp_dma_copybreak = 4096;
-               break;
-       case IOAT_VER_2_0:
-               sysctl_tcp_dma_copybreak = 2048;
-               break;
-       case IOAT_VER_3_0:
-               sysctl_tcp_dma_copybreak = 262144;
-               break;
-       }
-       #endif
-}
-
-#if defined(CONFIG_INTEL_IOATDMA) || defined(CONFIG_INTEL_IOATDMA_MODULE)
-struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev,
-                                     void __iomem *iobase);
-void ioat_dma_remove(struct ioatdma_device *device);
-struct dca_provider *ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
-#else
-#define ioat_dma_probe(pdev, iobase)    NULL
-#define ioat_dma_remove(device)         do { } while (0)
-#define ioat_dca_init(pdev, iobase)    NULL
-#define ioat2_dca_init(pdev, iobase)   NULL
-#define ioat3_dca_init(pdev, iobase)   NULL
-#endif
-
-#endif /* IOATDMA_H */
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
deleted file mode 100644 (file)
index afa57ee..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-#ifndef _IOAT_HW_H_
-#define _IOAT_HW_H_
-
-/* PCI Configuration Space Values */
-#define IOAT_PCI_VID            0x8086
-
-/* CB device ID's */
-#define IOAT_PCI_DID_5000       0x1A38
-#define IOAT_PCI_DID_CNB        0x360B
-#define IOAT_PCI_DID_SCNB       0x65FF
-#define IOAT_PCI_DID_SNB        0x402F
-
-#define IOAT_PCI_RID            0x00
-#define IOAT_PCI_SVID           0x8086
-#define IOAT_PCI_SID            0x8086
-#define IOAT_VER_1_2            0x12    /* Version 1.2 */
-#define IOAT_VER_2_0            0x20    /* Version 2.0 */
-#define IOAT_VER_3_0            0x30    /* Version 3.0 */
-
-struct ioat_dma_descriptor {
-       uint32_t        size;
-       uint32_t        ctl;
-       uint64_t        src_addr;
-       uint64_t        dst_addr;
-       uint64_t        next;
-       uint64_t        rsv1;
-       uint64_t        rsv2;
-       uint64_t        user1;
-       uint64_t        user2;
-};
-
-#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001
-#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002
-#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004
-#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008
-#define IOAT_DMA_DESCRIPTOR_CTL_FRAME  0x00000010
-#define IOAT_DMA_DESCRIPTOR_NUL                0x00000020
-#define IOAT_DMA_DESCRIPTOR_CTL_SP_BRK 0x00000040
-#define IOAT_DMA_DESCRIPTOR_CTL_DP_BRK 0x00000080
-#define IOAT_DMA_DESCRIPTOR_CTL_BNDL   0x00000100
-#define IOAT_DMA_DESCRIPTOR_CTL_DCA    0x00000200
-#define IOAT_DMA_DESCRIPTOR_CTL_BUFHINT        0x00000400
-
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_CONTEXT 0xFF000000
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_DMA     0x00000000
-
-#define IOAT_DMA_DESCRIPTOR_CTL_CONTEXT_DCA    0x00000001
-#define IOAT_DMA_DESCRIPTOR_CTL_OPCODE_MASK    0xFF000000
-
-#endif
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioatdma_registers.h
deleted file mode 100644 (file)
index 49bc277..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called COPYING.
- */
-#ifndef _IOAT_REGISTERS_H_
-#define _IOAT_REGISTERS_H_
-
-#define IOAT_PCI_DMACTRL_OFFSET                        0x48
-#define IOAT_PCI_DMACTRL_DMA_EN                        0x00000001
-#define IOAT_PCI_DMACTRL_MSI_EN                        0x00000002
-
-#define IOAT_PCI_DEVICE_ID_OFFSET              0x02
-#define IOAT_PCI_DMAUNCERRSTS_OFFSET           0x148
-#define IOAT_PCI_CHANERRMASK_INT_OFFSET                0x184
-
-/* MMIO Device Registers */
-#define IOAT_CHANCNT_OFFSET                    0x00    /*  8-bit */
-
-#define IOAT_XFERCAP_OFFSET                    0x01    /*  8-bit */
-#define IOAT_XFERCAP_4KB                       12
-#define IOAT_XFERCAP_8KB                       13
-#define IOAT_XFERCAP_16KB                      14
-#define IOAT_XFERCAP_32KB                      15
-#define IOAT_XFERCAP_32GB                      0
-
-#define IOAT_GENCTRL_OFFSET                    0x02    /*  8-bit */
-#define IOAT_GENCTRL_DEBUG_EN                  0x01
-
-#define IOAT_INTRCTRL_OFFSET                   0x03    /*  8-bit */
-#define IOAT_INTRCTRL_MASTER_INT_EN            0x01    /* Master Interrupt Enable */
-#define IOAT_INTRCTRL_INT_STATUS               0x02    /* ATTNSTATUS -or- Channel Int */
-#define IOAT_INTRCTRL_INT                      0x04    /* INT_STATUS -and- MASTER_INT_EN */
-#define IOAT_INTRCTRL_MSIX_VECTOR_CONTROL      0x08    /* Enable all MSI-X vectors */
-
-#define IOAT_ATTNSTATUS_OFFSET                 0x04    /* Each bit is a channel */
-
-#define IOAT_VER_OFFSET                                0x08    /*  8-bit */
-#define IOAT_VER_MAJOR_MASK                    0xF0
-#define IOAT_VER_MINOR_MASK                    0x0F
-#define GET_IOAT_VER_MAJOR(x)                  (((x) & IOAT_VER_MAJOR_MASK) >> 4)
-#define GET_IOAT_VER_MINOR(x)                  ((x) & IOAT_VER_MINOR_MASK)
-
-#define IOAT_PERPORTOFFSET_OFFSET              0x0A    /* 16-bit */
-
-#define IOAT_INTRDELAY_OFFSET                  0x0C    /* 16-bit */
-#define IOAT_INTRDELAY_INT_DELAY_MASK          0x3FFF  /* Interrupt Delay Time */
-#define IOAT_INTRDELAY_COALESE_SUPPORT         0x8000  /* Interrupt Coalescing Supported */
-
-#define IOAT_DEVICE_STATUS_OFFSET              0x0E    /* 16-bit */
-#define IOAT_DEVICE_STATUS_DEGRADED_MODE       0x0001
-
-#define IOAT_CHANNEL_MMIO_SIZE                 0x80    /* Each Channel MMIO space is this size */
-
-/* DMA Channel Registers */
-#define IOAT_CHANCTRL_OFFSET                   0x00    /* 16-bit Channel Control Register */
-#define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK    0xF000
-#define IOAT_CHANCTRL_CHANNEL_IN_USE           0x0100
-#define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL    0x0020
-#define IOAT_CHANCTRL_ERR_INT_EN               0x0010
-#define IOAT_CHANCTRL_ANY_ERR_ABORT_EN         0x0008
-#define IOAT_CHANCTRL_ERR_COMPLETION_EN                0x0004
-#define IOAT_CHANCTRL_INT_DISABLE              0x0001
-
-#define IOAT_DMA_COMP_OFFSET                   0x02    /* 16-bit DMA channel compatibility */
-#define IOAT_DMA_COMP_V1                       0x0001  /* Compatibility with DMA version 1 */
-#define IOAT_DMA_COMP_V2                       0x0002  /* Compatibility with DMA version 2 */
-
-
-#define IOAT1_CHANSTS_OFFSET           0x04    /* 64-bit Channel Status Register */
-#define IOAT2_CHANSTS_OFFSET           0x08    /* 64-bit Channel Status Register */
-#define IOAT_CHANSTS_OFFSET(ver)               ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHANSTS_OFFSET : IOAT2_CHANSTS_OFFSET)
-#define IOAT1_CHANSTS_OFFSET_LOW       0x04
-#define IOAT2_CHANSTS_OFFSET_LOW       0x08
-#define IOAT_CHANSTS_OFFSET_LOW(ver)           ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHANSTS_OFFSET_LOW : IOAT2_CHANSTS_OFFSET_LOW)
-#define IOAT1_CHANSTS_OFFSET_HIGH      0x08
-#define IOAT2_CHANSTS_OFFSET_HIGH      0x0C
-#define IOAT_CHANSTS_OFFSET_HIGH(ver)          ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHANSTS_OFFSET_HIGH : IOAT2_CHANSTS_OFFSET_HIGH)
-#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR ~0x3F
-#define IOAT_CHANSTS_SOFT_ERR                  0x0000000000000010
-#define IOAT_CHANSTS_UNAFFILIATED_ERR          0x0000000000000008
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS       0x0000000000000007
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE        0x0
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE  0x1
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED     0x2
-#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED        0x3
-
-
-
-#define IOAT_CHAN_DMACOUNT_OFFSET      0x06    /* 16-bit DMA Count register */
-
-#define IOAT_DCACTRL_OFFSET         0x30   /* 32 bit Direct Cache Access Control Register */
-#define IOAT_DCACTRL_CMPL_WRITE_ENABLE 0x10000
-#define IOAT_DCACTRL_TARGET_CPU_MASK   0xFFFF /* APIC ID */
-
-/* CB DCA Memory Space Registers */
-#define IOAT_DCAOFFSET_OFFSET       0x14
-/* CB_BAR + IOAT_DCAOFFSET value */
-#define IOAT_DCA_VER_OFFSET         0x00
-#define IOAT_DCA_VER_MAJOR_MASK     0xF0
-#define IOAT_DCA_VER_MINOR_MASK     0x0F
-
-#define IOAT_DCA_COMP_OFFSET        0x02
-#define IOAT_DCA_COMP_V1            0x1
-
-#define IOAT_FSB_CAPABILITY_OFFSET  0x04
-#define IOAT_FSB_CAPABILITY_PREFETCH    0x1
-
-#define IOAT_PCI_CAPABILITY_OFFSET  0x06
-#define IOAT_PCI_CAPABILITY_MEMWR   0x1
-
-#define IOAT_FSB_CAP_ENABLE_OFFSET  0x08
-#define IOAT_FSB_CAP_ENABLE_PREFETCH    0x1
-
-#define IOAT_PCI_CAP_ENABLE_OFFSET  0x0A
-#define IOAT_PCI_CAP_ENABLE_MEMWR   0x1
-
-#define IOAT_APICID_TAG_MAP_OFFSET  0x0C
-#define IOAT_APICID_TAG_MAP_TAG0    0x0000000F
-#define IOAT_APICID_TAG_MAP_TAG0_SHIFT 0
-#define IOAT_APICID_TAG_MAP_TAG1    0x000000F0
-#define IOAT_APICID_TAG_MAP_TAG1_SHIFT 4
-#define IOAT_APICID_TAG_MAP_TAG2    0x00000F00
-#define IOAT_APICID_TAG_MAP_TAG2_SHIFT 8
-#define IOAT_APICID_TAG_MAP_TAG3    0x0000F000
-#define IOAT_APICID_TAG_MAP_TAG3_SHIFT 12
-#define IOAT_APICID_TAG_MAP_TAG4    0x000F0000
-#define IOAT_APICID_TAG_MAP_TAG4_SHIFT 16
-#define IOAT_APICID_TAG_CB2_VALID   0x8080808080
-
-#define IOAT_DCA_GREQID_OFFSET      0x10
-#define IOAT_DCA_GREQID_SIZE        0x04
-#define IOAT_DCA_GREQID_MASK        0xFFFF
-#define IOAT_DCA_GREQID_IGNOREFUN   0x10000000
-#define IOAT_DCA_GREQID_VALID       0x20000000
-#define IOAT_DCA_GREQID_LASTID      0x80000000
-
-#define IOAT3_CSI_CAPABILITY_OFFSET 0x08
-#define IOAT3_CSI_CAPABILITY_PREFETCH    0x1
-
-#define IOAT3_PCI_CAPABILITY_OFFSET 0x0A
-#define IOAT3_PCI_CAPABILITY_MEMWR  0x1
-
-#define IOAT3_CSI_CONTROL_OFFSET    0x0C
-#define IOAT3_CSI_CONTROL_PREFETCH  0x1
-
-#define IOAT3_PCI_CONTROL_OFFSET    0x0E
-#define IOAT3_PCI_CONTROL_MEMWR     0x1
-
-#define IOAT3_APICID_TAG_MAP_OFFSET 0x10
-#define IOAT3_APICID_TAG_MAP_OFFSET_LOW  0x10
-#define IOAT3_APICID_TAG_MAP_OFFSET_HIGH 0x14
-
-#define IOAT3_DCA_GREQID_OFFSET     0x02
-
-#define IOAT1_CHAINADDR_OFFSET         0x0C    /* 64-bit Descriptor Chain Address Register */
-#define IOAT2_CHAINADDR_OFFSET         0x10    /* 64-bit Descriptor Chain Address Register */
-#define IOAT_CHAINADDR_OFFSET(ver)             ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHAINADDR_OFFSET : IOAT2_CHAINADDR_OFFSET)
-#define IOAT1_CHAINADDR_OFFSET_LOW     0x0C
-#define IOAT2_CHAINADDR_OFFSET_LOW     0x10
-#define IOAT_CHAINADDR_OFFSET_LOW(ver)         ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHAINADDR_OFFSET_LOW : IOAT2_CHAINADDR_OFFSET_LOW)
-#define IOAT1_CHAINADDR_OFFSET_HIGH    0x10
-#define IOAT2_CHAINADDR_OFFSET_HIGH    0x14
-#define IOAT_CHAINADDR_OFFSET_HIGH(ver)                ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHAINADDR_OFFSET_HIGH : IOAT2_CHAINADDR_OFFSET_HIGH)
-
-#define IOAT1_CHANCMD_OFFSET           0x14    /*  8-bit DMA Channel Command Register */
-#define IOAT2_CHANCMD_OFFSET           0x04    /*  8-bit DMA Channel Command Register */
-#define IOAT_CHANCMD_OFFSET(ver)               ((ver) < IOAT_VER_2_0 \
-                                               ? IOAT1_CHANCMD_OFFSET : IOAT2_CHANCMD_OFFSET)
-#define IOAT_CHANCMD_RESET                     0x20
-#define IOAT_CHANCMD_RESUME                    0x10
-#define IOAT_CHANCMD_ABORT                     0x08
-#define IOAT_CHANCMD_SUSPEND                   0x04
-#define IOAT_CHANCMD_APPEND                    0x02
-#define IOAT_CHANCMD_START                     0x01
-
-#define IOAT_CHANCMP_OFFSET                    0x18    /* 64-bit Channel Completion Address Register */
-#define IOAT_CHANCMP_OFFSET_LOW                        0x18
-#define IOAT_CHANCMP_OFFSET_HIGH               0x1C
-
-#define IOAT_CDAR_OFFSET                       0x20    /* 64-bit Current Descriptor Address Register */
-#define IOAT_CDAR_OFFSET_LOW                   0x20
-#define IOAT_CDAR_OFFSET_HIGH                  0x24
-
-#define IOAT_CHANERR_OFFSET                    0x28    /* 32-bit Channel Error Register */
-#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR 0x0001
-#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR        0x0002
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR  0x0004
-#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR     0x0008
-#define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR      0x0010
-#define IOAT_CHANERR_CHANCMD_ERR               0x0020
-#define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR  0x0040
-#define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR      0x0080
-#define IOAT_CHANERR_READ_DATA_ERR             0x0100
-#define IOAT_CHANERR_WRITE_DATA_ERR            0x0200
-#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR    0x0400
-#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR     0x0800
-#define IOAT_CHANERR_COMPLETION_ADDR_ERR       0x1000
-#define IOAT_CHANERR_INT_CONFIGURATION_ERR     0x2000
-#define IOAT_CHANERR_SOFT_ERR                  0x4000
-#define IOAT_CHANERR_UNAFFILIATED_ERR          0x8000
-
-#define IOAT_CHANERR_MASK_OFFSET               0x2C    /* 32-bit Channel Error Register */
-
-#endif /* _IOAT_REGISTERS_H_ */
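
The deleted register header packs the device revision into a single byte: the
major version in the high nibble and the minor version in the low nibble, as
decoded by GET_IOAT_VER_MAJOR()/GET_IOAT_VER_MINOR() above. A minimal
stand-alone sketch of the same decode, reusing the mask values from the header:

    #include <stdio.h>
    #include <stdint.h>

    /* mask values as defined in the deleted ioatdma_registers.h */
    #define IOAT_VER_MAJOR_MASK    0xF0
    #define IOAT_VER_MINOR_MASK    0x0F
    #define GET_IOAT_VER_MAJOR(x)  (((x) & IOAT_VER_MAJOR_MASK) >> 4)
    #define GET_IOAT_VER_MINOR(x)  ((x) & IOAT_VER_MINOR_MASK)

    int main(void)
    {
            uint8_t vers[] = { 0x12, 0x20, 0x30 }; /* IOAT_VER_1_2, _2_0, _3_0 */
            unsigned int i;

            for (i = 0; i < sizeof(vers); i++)
                    printf("raw 0x%02x -> version %d.%d\n", vers[i],
                           GET_IOAT_VER_MAJOR(vers[i]),
                           GET_IOAT_VER_MINOR(vers[i]));
            return 0;
    }
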
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 2f052265122f62e2681bbdb0dfbbd558cef24713..645ca8d54ec43350059bd8d7c8802f017b4a4abd 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/platform_device.h>
 #include <linux/memory.h>
 #include <linux/ioport.h>
+#include <linux/raid/pq.h>
 
 #include <mach/adma.h>
 
@@ -57,65 +58,110 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
        }
 }
 
+static void
+iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
+{
+       struct dma_async_tx_descriptor *tx = &desc->async_tx;
+       struct iop_adma_desc_slot *unmap = desc->group_head;
+       struct device *dev = &iop_chan->device->pdev->dev;
+       u32 len = unmap->unmap_len;
+       enum dma_ctrl_flags flags = tx->flags;
+       u32 src_cnt;
+       dma_addr_t addr;
+       dma_addr_t dest;
+
+       src_cnt = unmap->unmap_src_cnt;
+       dest = iop_desc_get_dest_addr(unmap, iop_chan);
+       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+               enum dma_data_direction dir;
+
+               if (src_cnt > 1) /* is xor? */
+                       dir = DMA_BIDIRECTIONAL;
+               else
+                       dir = DMA_FROM_DEVICE;
+
+               dma_unmap_page(dev, dest, len, dir);
+       }
+
+       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+               while (src_cnt--) {
+                       addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt);
+                       if (addr == dest)
+                               continue;
+                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+               }
+       }
+       desc->group_head = NULL;
+}
+
+static void
+iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
+{
+       struct dma_async_tx_descriptor *tx = &desc->async_tx;
+       struct iop_adma_desc_slot *unmap = desc->group_head;
+       struct device *dev = &iop_chan->device->pdev->dev;
+       u32 len = unmap->unmap_len;
+       enum dma_ctrl_flags flags = tx->flags;
+       u32 src_cnt = unmap->unmap_src_cnt;
+       dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan);
+       dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan);
+       int i;
+
+       if (tx->flags & DMA_PREP_CONTINUE)
+               src_cnt -= 3;
+
+       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) {
+               dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL);
+               dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL);
+       }
+
+       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+               dma_addr_t addr;
+
+               for (i = 0; i < src_cnt; i++) {
+                       addr = iop_desc_get_src_addr(unmap, iop_chan, i);
+                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+               }
+               if (desc->pq_check_result) {
+                       dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE);
+                       dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE);
+               }
+       }
+
+       desc->group_head = NULL;
+}
+
+
 static dma_cookie_t
 iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
        struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
 {
-       BUG_ON(desc->async_tx.cookie < 0);
-       if (desc->async_tx.cookie > 0) {
-               cookie = desc->async_tx.cookie;
-               desc->async_tx.cookie = 0;
+       struct dma_async_tx_descriptor *tx = &desc->async_tx;
+
+       BUG_ON(tx->cookie < 0);
+       if (tx->cookie > 0) {
+               cookie = tx->cookie;
+               tx->cookie = 0;
 
                /* call the callback (must not sleep or submit new
                 * operations to this channel)
                 */
-               if (desc->async_tx.callback)
-                       desc->async_tx.callback(
-                               desc->async_tx.callback_param);
+               if (tx->callback)
+                       tx->callback(tx->callback_param);
 
                /* unmap dma addresses
                 * (unmap_single vs unmap_page?)
                 */
                if (desc->group_head && desc->unmap_len) {
-                       struct iop_adma_desc_slot *unmap = desc->group_head;
-                       struct device *dev =
-                               &iop_chan->device->pdev->dev;
-                       u32 len = unmap->unmap_len;
-                       enum dma_ctrl_flags flags = desc->async_tx.flags;
-                       u32 src_cnt;
-                       dma_addr_t addr;
-                       dma_addr_t dest;
-
-                       src_cnt = unmap->unmap_src_cnt;
-                       dest = iop_desc_get_dest_addr(unmap, iop_chan);
-                       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                               enum dma_data_direction dir;
-
-                               if (src_cnt > 1) /* is xor? */
-                                       dir = DMA_BIDIRECTIONAL;
-                               else
-                                       dir = DMA_FROM_DEVICE;
-
-                               dma_unmap_page(dev, dest, len, dir);
-                       }
-
-                       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                               while (src_cnt--) {
-                                       addr = iop_desc_get_src_addr(unmap,
-                                                                    iop_chan,
-                                                                    src_cnt);
-                                       if (addr == dest)
-                                               continue;
-                                       dma_unmap_page(dev, addr, len,
-                                                      DMA_TO_DEVICE);
-                               }
-                       }
-                       desc->group_head = NULL;
+                       if (iop_desc_is_pq(desc))
+                               iop_desc_unmap_pq(iop_chan, desc);
+                       else
+                               iop_desc_unmap(iop_chan, desc);
                }
        }
 
        /* run dependent operations */
-       dma_run_dependencies(&desc->async_tx);
+       dma_run_dependencies(tx);
 
        return cookie;
 }
@@ -287,7 +333,12 @@ static void iop_adma_tasklet(unsigned long data)
 {
        struct iop_adma_chan *iop_chan = (struct iop_adma_chan *) data;
 
-       spin_lock(&iop_chan->lock);
+       /* lockdep will flag dependency submissions as potentially
+        * recursive locking; this is not the case, as a dependency
+        * submission will never recurse into a channel's submit routine.
+        * There are checks in async_tx.c to prevent this.
+        */
+       spin_lock_nested(&iop_chan->lock, SINGLE_DEPTH_NESTING);
        __iop_adma_slot_cleanup(iop_chan);
        spin_unlock(&iop_chan->lock);
 }
@@ -370,7 +421,7 @@ retry:
                        }
                        alloc_tail->group_head = alloc_start;
                        alloc_tail->async_tx.cookie = -EBUSY;
-                       list_splice(&chain, &alloc_tail->async_tx.tx_list);
+                       list_splice(&chain, &alloc_tail->tx_list);
                        iop_chan->last_used = last_used;
                        iop_desc_clear_next_desc(alloc_start);
                        iop_desc_clear_next_desc(alloc_tail);
@@ -429,7 +480,7 @@ iop_adma_tx_submit(struct dma_async_tx_descriptor *tx)
 
        old_chain_tail = list_entry(iop_chan->chain.prev,
                struct iop_adma_desc_slot, chain_node);
-       list_splice_init(&sw_desc->async_tx.tx_list,
+       list_splice_init(&sw_desc->tx_list,
                         &old_chain_tail->chain_node);
 
        /* fix up the hardware chain */
@@ -496,6 +547,7 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
 
                dma_async_tx_descriptor_init(&slot->async_tx, chan);
                slot->async_tx.tx_submit = iop_adma_tx_submit;
+               INIT_LIST_HEAD(&slot->tx_list);
                INIT_LIST_HEAD(&slot->chain_node);
                INIT_LIST_HEAD(&slot->slot_node);
                hw_desc = (char *) iop_chan->device->dma_desc_pool;
@@ -660,9 +712,9 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
 }
 
 static struct dma_async_tx_descriptor *
-iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src,
-                          unsigned int src_cnt, size_t len, u32 *result,
-                          unsigned long flags)
+iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src,
+                         unsigned int src_cnt, size_t len, u32 *result,
+                         unsigned long flags)
 {
        struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
        struct iop_adma_desc_slot *sw_desc, *grp_start;
@@ -696,6 +748,118 @@ iop_adma_prep_dma_zero_sum(struct dma_chan *chan, dma_addr_t *dma_src,
        return sw_desc ? &sw_desc->async_tx : NULL;
 }
 
+static struct dma_async_tx_descriptor *
+iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+                    unsigned int src_cnt, const unsigned char *scf, size_t len,
+                    unsigned long flags)
+{
+       struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
+       struct iop_adma_desc_slot *sw_desc, *g;
+       int slot_cnt, slots_per_op;
+       int continue_srcs;
+
+       if (unlikely(!len))
+               return NULL;
+       BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
+
+       dev_dbg(iop_chan->device->common.dev,
+               "%s src_cnt: %d len: %u flags: %lx\n",
+               __func__, src_cnt, len, flags);
+
+       if (dmaf_p_disabled_continue(flags))
+               continue_srcs = 1+src_cnt;
+       else if (dmaf_continue(flags))
+               continue_srcs = 3+src_cnt;
+       else
+               continue_srcs = 0+src_cnt;
+
+       spin_lock_bh(&iop_chan->lock);
+       slot_cnt = iop_chan_pq_slot_count(len, continue_srcs, &slots_per_op);
+       sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
+       if (sw_desc) {
+               int i;
+
+               g = sw_desc->group_head;
+               iop_desc_set_byte_count(g, iop_chan, len);
+
+               /* Even if P is disabled, its destination address (bits
+                * [3:0]) must match Q.  It is ok if P points to an
+                * invalid address; it won't be written.
+                */
+               if (flags & DMA_PREP_PQ_DISABLE_P)
+                       dst[0] = dst[1] & 0x7;
+
+               iop_desc_set_pq_addr(g, dst);
+               sw_desc->unmap_src_cnt = src_cnt;
+               sw_desc->unmap_len = len;
+               sw_desc->async_tx.flags = flags;
+               for (i = 0; i < src_cnt; i++)
+                       iop_desc_set_pq_src_addr(g, i, src[i], scf[i]);
+
+               /* if we are continuing a previous operation factor in
+                * the old p and q values, see the comment for dma_maxpq
+                * in include/linux/dmaengine.h
+                */
+               if (dmaf_p_disabled_continue(flags))
+                       iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
+               else if (dmaf_continue(flags)) {
+                       iop_desc_set_pq_src_addr(g, i++, dst[0], 0);
+                       iop_desc_set_pq_src_addr(g, i++, dst[1], 1);
+                       iop_desc_set_pq_src_addr(g, i++, dst[1], 0);
+               }
+               iop_desc_init_pq(g, i, flags);
+       }
+       spin_unlock_bh(&iop_chan->lock);
+
+       return sw_desc ? &sw_desc->async_tx : NULL;
+}
+
+static struct dma_async_tx_descriptor *
+iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+                        unsigned int src_cnt, const unsigned char *scf,
+                        size_t len, enum sum_check_flags *pqres,
+                        unsigned long flags)
+{
+       struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
+       struct iop_adma_desc_slot *sw_desc, *g;
+       int slot_cnt, slots_per_op;
+
+       if (unlikely(!len))
+               return NULL;
+       BUG_ON(len > IOP_ADMA_XOR_MAX_BYTE_COUNT);
+
+       dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n",
+               __func__, src_cnt, len);
+
+       spin_lock_bh(&iop_chan->lock);
+       slot_cnt = iop_chan_pq_zero_sum_slot_count(len, src_cnt + 2, &slots_per_op);
+       sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
+       if (sw_desc) {
+               /* for validate operations p and q are tagged onto the
+                * end of the source list
+                */
+               int pq_idx = src_cnt;
+
+               g = sw_desc->group_head;
+               iop_desc_init_pq_zero_sum(g, src_cnt+2, flags);
+               iop_desc_set_pq_zero_sum_byte_count(g, len);
+               g->pq_check_result = pqres;
+               pr_debug("\t%s: g->pq_check_result: %p\n",
+                       __func__, g->pq_check_result);
+               sw_desc->unmap_src_cnt = src_cnt+2;
+               sw_desc->unmap_len = len;
+               sw_desc->async_tx.flags = flags;
+               while (src_cnt--)
+                       iop_desc_set_pq_zero_sum_src_addr(g, src_cnt,
+                                                         src[src_cnt],
+                                                         scf[src_cnt]);
+               iop_desc_set_pq_zero_sum_addr(g, pq_idx, src);
+       }
+       spin_unlock_bh(&iop_chan->lock);
+
+       return sw_desc ? &sw_desc->async_tx : NULL;
+}
+
 static void iop_adma_free_chan_resources(struct dma_chan *chan)
 {
        struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
@@ -906,7 +1070,7 @@ out:
 
 #define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */
 static int __devinit
-iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
+iop_adma_xor_val_self_test(struct iop_adma_device *device)
 {
        int i, src_idx;
        struct page *dest;
@@ -1002,7 +1166,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
                PAGE_SIZE, DMA_TO_DEVICE);
 
        /* skip zero sum if the capability is not present */
-       if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask))
+       if (!dma_has_cap(DMA_XOR_VAL, dma_chan->device->cap_mask))
                goto free_resources;
 
        /* zero sum the sources with the destination page */
@@ -1016,10 +1180,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
                dma_srcs[i] = dma_map_page(dma_chan->device->dev,
                                           zero_sum_srcs[i], 0, PAGE_SIZE,
                                           DMA_TO_DEVICE);
-       tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs,
-                                       IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
-                                       &zero_sum_result,
-                                       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
+                                      IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
+                                      &zero_sum_result,
+                                      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
        cookie = iop_adma_tx_submit(tx);
        iop_adma_issue_pending(dma_chan);
@@ -1072,10 +1236,10 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device)
                dma_srcs[i] = dma_map_page(dma_chan->device->dev,
                                           zero_sum_srcs[i], 0, PAGE_SIZE,
                                           DMA_TO_DEVICE);
-       tx = iop_adma_prep_dma_zero_sum(dma_chan, dma_srcs,
-                                       IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
-                                       &zero_sum_result,
-                                       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       tx = iop_adma_prep_dma_xor_val(dma_chan, dma_srcs,
+                                      IOP_ADMA_NUM_SRC_TEST + 1, PAGE_SIZE,
+                                      &zero_sum_result,
+                                      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
        cookie = iop_adma_tx_submit(tx);
        iop_adma_issue_pending(dma_chan);
@@ -1105,6 +1269,170 @@ out:
        return err;
 }
 
+#ifdef CONFIG_MD_RAID6_PQ
+static int __devinit
+iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
+{
+       /* combined sources, software pq results, and extra hw pq results */
+       struct page *pq[IOP_ADMA_NUM_SRC_TEST+2+2];
+       /* ptr to the extra hw pq buffers defined above */
+       struct page **pq_hw = &pq[IOP_ADMA_NUM_SRC_TEST+2];
+       /* address conversion buffers (dma_map / page_address) */
+       void *pq_sw[IOP_ADMA_NUM_SRC_TEST+2];
+       dma_addr_t pq_src[IOP_ADMA_NUM_SRC_TEST];
+       dma_addr_t pq_dest[2];
+
+       int i;
+       struct dma_async_tx_descriptor *tx;
+       struct dma_chan *dma_chan;
+       dma_cookie_t cookie;
+       u32 zero_sum_result;
+       int err = 0;
+       struct device *dev;
+
+       dev_dbg(device->common.dev, "%s\n", __func__);
+
+       for (i = 0; i < ARRAY_SIZE(pq); i++) {
+               pq[i] = alloc_page(GFP_KERNEL);
+               if (!pq[i]) {
+                       while (i--)
+                               __free_page(pq[i]);
+                       return -ENOMEM;
+               }
+       }
+
+       /* Fill in src buffers */
+       for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) {
+               pq_sw[i] = page_address(pq[i]);
+               memset(pq_sw[i], 0x11111111 * (1<<i), PAGE_SIZE);
+       }
+       pq_sw[i] = page_address(pq[i]);
+       pq_sw[i+1] = page_address(pq[i+1]);
+
+       dma_chan = container_of(device->common.channels.next,
+                               struct dma_chan,
+                               device_node);
+       if (iop_adma_alloc_chan_resources(dma_chan) < 1) {
+               err = -ENODEV;
+               goto out;
+       }
+
+       dev = dma_chan->device->dev;
+
+       /* initialize the dests */
+       memset(page_address(pq_hw[0]), 0 , PAGE_SIZE);
+       memset(page_address(pq_hw[1]), 0 , PAGE_SIZE);
+
+       /* test pq */
+       pq_dest[0] = dma_map_page(dev, pq_hw[0], 0, PAGE_SIZE, DMA_FROM_DEVICE);
+       pq_dest[1] = dma_map_page(dev, pq_hw[1], 0, PAGE_SIZE, DMA_FROM_DEVICE);
+       for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++)
+               pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+                                        DMA_TO_DEVICE);
+
+       tx = iop_adma_prep_dma_pq(dma_chan, pq_dest, pq_src,
+                                 IOP_ADMA_NUM_SRC_TEST, (u8 *)raid6_gfexp,
+                                 PAGE_SIZE,
+                                 DMA_PREP_INTERRUPT |
+                                 DMA_CTRL_ACK);
+
+       cookie = iop_adma_tx_submit(tx);
+       iop_adma_issue_pending(dma_chan);
+       msleep(8);
+
+       if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+               DMA_SUCCESS) {
+               dev_err(dev, "Self-test pq timed out, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       raid6_call.gen_syndrome(IOP_ADMA_NUM_SRC_TEST+2, PAGE_SIZE, pq_sw);
+
+       if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST],
+                  page_address(pq_hw[0]), PAGE_SIZE) != 0) {
+               dev_err(dev, "Self-test p failed compare, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       if (memcmp(pq_sw[IOP_ADMA_NUM_SRC_TEST+1],
+                  page_address(pq_hw[1]), PAGE_SIZE) != 0) {
+               dev_err(dev, "Self-test q failed compare, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       /* test correct zero sum using the software generated pq values */
+       for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
+               pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+                                        DMA_TO_DEVICE);
+
+       zero_sum_result = ~0;
+       tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
+                                     pq_src, IOP_ADMA_NUM_SRC_TEST,
+                                     raid6_gfexp, PAGE_SIZE, &zero_sum_result,
+                                     DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
+
+       cookie = iop_adma_tx_submit(tx);
+       iop_adma_issue_pending(dma_chan);
+       msleep(8);
+
+       if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+               DMA_SUCCESS) {
+               dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       if (zero_sum_result != 0) {
+               dev_err(dev, "Self-test pq-zero-sum failed to validate: %x\n",
+                       zero_sum_result);
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       /* test incorrect zero sum */
+       i = IOP_ADMA_NUM_SRC_TEST;
+       memset(pq_sw[i] + 100, 0, 100);
+       memset(pq_sw[i+1] + 200, 0, 200);
+       for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 2; i++)
+               pq_src[i] = dma_map_page(dev, pq[i], 0, PAGE_SIZE,
+                                        DMA_TO_DEVICE);
+
+       zero_sum_result = 0;
+       tx = iop_adma_prep_dma_pq_val(dma_chan, &pq_src[IOP_ADMA_NUM_SRC_TEST],
+                                     pq_src, IOP_ADMA_NUM_SRC_TEST,
+                                     raid6_gfexp, PAGE_SIZE, &zero_sum_result,
+                                     DMA_PREP_INTERRUPT|DMA_CTRL_ACK);
+
+       cookie = iop_adma_tx_submit(tx);
+       iop_adma_issue_pending(dma_chan);
+       msleep(8);
+
+       if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) !=
+               DMA_SUCCESS) {
+               dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+       if (zero_sum_result != (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT)) {
+               dev_err(dev, "Self-test !pq-zero-sum failed to validate: %x\n",
+                       zero_sum_result);
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+free_resources:
+       iop_adma_free_chan_resources(dma_chan);
+out:
+       i = ARRAY_SIZE(pq);
+       while (i--)
+               __free_page(pq[i]);
+       return err;
+}
+#endif
+
 static int __devexit iop_adma_remove(struct platform_device *dev)
 {
        struct iop_adma_device *device = platform_get_drvdata(dev);
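
The pq self-test above generates P and Q in hardware and then cross-checks them
against the software RAID-6 implementation (raid6_call.gen_syndrome() with the
raid6_gfexp coefficient table). For reference, the same P/Q arithmetic in a
minimal scalar form -- an illustrative sketch over GF(2^8), not the kernel's
optimized gen_syndrome():

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* multiply in GF(2^8) with the RAID-6 polynomial x^8+x^4+x^3+x^2+1 */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
            uint8_t p = 0;

            while (b) {
                    if (b & 1)
                            p ^= a;
                    a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
                    b >>= 1;
            }
            return p;
    }

    /* ptrs[0..disks-3] are data, ptrs[disks-2] is P, ptrs[disks-1] is Q */
    static void gen_syndrome_sw(int disks, size_t len, uint8_t **ptrs)
    {
            uint8_t *p = ptrs[disks - 2], *q = ptrs[disks - 1];
            int i, j;
            size_t b;

            memset(p, 0, len);
            memset(q, 0, len);
            for (i = 0; i < disks - 2; i++) {
                    uint8_t coef = 1;       /* {02}^i, the generator power */

                    for (j = 0; j < i; j++)
                            coef = gf_mul(coef, 2);
                    for (b = 0; b < len; b++) {
                            p[b] ^= ptrs[i][b];               /* P: plain XOR */
                            q[b] ^= gf_mul(coef, ptrs[i][b]); /* Q: weighted  */
                    }
            }
    }

    int main(void)
    {
            enum { NDATA = 4, LEN = 16 };
            uint8_t buf[NDATA + 2][LEN], *ptrs[NDATA + 2];
            int i;

            for (i = 0; i < NDATA + 2; i++) {
                    ptrs[i] = buf[i];
                    memset(buf[i], 0x11 * (i + 1), LEN);
            }
            gen_syndrome_sw(NDATA + 2, LEN, ptrs);
            printf("P[0]=0x%02x Q[0]=0x%02x\n",
                   ptrs[NDATA][0], ptrs[NDATA + 1][0]);
            return 0;
    }
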
@@ -1192,9 +1520,16 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
                dma_dev->max_xor = iop_adma_get_max_xor();
                dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor;
        }
-       if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask))
-               dma_dev->device_prep_dma_zero_sum =
-                       iop_adma_prep_dma_zero_sum;
+       if (dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask))
+               dma_dev->device_prep_dma_xor_val =
+                       iop_adma_prep_dma_xor_val;
+       if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
+               dma_set_maxpq(dma_dev, iop_adma_get_max_pq(), 0);
+               dma_dev->device_prep_dma_pq = iop_adma_prep_dma_pq;
+       }
+       if (dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask))
+               dma_dev->device_prep_dma_pq_val =
+                       iop_adma_prep_dma_pq_val;
        if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask))
                dma_dev->device_prep_dma_interrupt =
                        iop_adma_prep_dma_interrupt;
@@ -1248,23 +1583,35 @@ static int __devinit iop_adma_probe(struct platform_device *pdev)
        }
 
        if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) ||
-               dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
-               ret = iop_adma_xor_zero_sum_self_test(adev);
+           dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
+               ret = iop_adma_xor_val_self_test(adev);
                dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
                if (ret)
                        goto err_free_iop_chan;
        }
 
+       if (dma_has_cap(DMA_PQ, dma_dev->cap_mask) &&
+           dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask)) {
+               #ifdef CONFIG_MD_RAID6_PQ
+               ret = iop_adma_pq_zero_sum_self_test(adev);
+               dev_dbg(&pdev->dev, "pq self test returned %d\n", ret);
+               #else
+               /* can not test raid6, so do not publish capability */
+               dma_cap_clear(DMA_PQ, dma_dev->cap_mask);
+               dma_cap_clear(DMA_PQ_VAL, dma_dev->cap_mask);
+               ret = 0;
+               #endif
+               if (ret)
+                       goto err_free_iop_chan;
+       }
+
        dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: "
-         "( %s%s%s%s%s%s%s%s%s%s)\n",
-         dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "",
-         dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "",
-         dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "",
+         "( %s%s%s%s%s%s%s)\n",
+         dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "",
+         dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
          dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
-         dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "",
-         dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "",
+         dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "",
          dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)  ? "fill " : "",
-         dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "",
          dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
          dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
 
@@ -1296,7 +1643,7 @@ static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan)
        if (sw_desc) {
                grp_start = sw_desc->group_head;
 
-               list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
+               list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
                async_tx_ack(&sw_desc->async_tx);
                iop_desc_init_memcpy(grp_start, 0);
                iop_desc_set_byte_count(grp_start, iop_chan, 0);
@@ -1352,7 +1699,7 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan)
        sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
        if (sw_desc) {
                grp_start = sw_desc->group_head;
-               list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain);
+               list_splice_init(&sw_desc->tx_list, &iop_chan->chain);
                async_tx_ack(&sw_desc->async_tx);
                iop_desc_init_null_xor(grp_start, 2, 0);
                iop_desc_set_byte_count(grp_start, iop_chan, 0);
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
index 9f6fe46a9b87351d6b172d817a3532c05ec4c90d..c0a272c7368267ad25bd5185e14d5fa965cb2751 100644 (file)
@@ -183,6 +183,11 @@ dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
                                        iov_byte_offset,
                                        kdata,
                                        copy);
+                       /* poll for a descriptor slot */
+                       if (unlikely(dma_cookie < 0)) {
+                               dma_async_issue_pending(chan);
+                               continue;
+                       }
 
                        len -= copy;
                        iov[iovec_idx].iov_len -= copy;
@@ -248,6 +253,11 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
                                        page,
                                        offset,
                                        copy);
+                       /* poll for a descriptor slot */
+                       if (unlikely(dma_cookie < 0)) {
+                               dma_async_issue_pending(chan);
+                               continue;
+                       }
 
                        len -= copy;
                        iov[iovec_idx].iov_len -= copy;
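
Both hunks above add the same recovery path: when the per-chunk copy returns a
negative cookie because no DMA descriptor is currently free, the channel is
kicked with dma_async_issue_pending() and the same chunk is retried instead of
being dropped. A stand-alone illustration of that retry-on-busy idiom
(submit() and kick() are made-up stand-ins, not the dmaengine API):

    #include <stdio.h>
    #include <errno.h>

    static int budget = 3;          /* pretend the descriptor ring has 3 slots */

    static int submit(int chunk)
    {
            return budget-- > 0 ? chunk : -EBUSY;
    }

    static void kick(void)
    {
            budget = 3;             /* pretend completions freed the slots */
    }

    int main(void)
    {
            int chunk = 0;

            while (chunk < 8) {
                    int ret = submit(chunk);

                    if (ret < 0) {  /* no descriptor: flush pending and retry */
                            kick();
                            continue;
                    }
                    chunk++;
            }
            puts("all chunks queued");
            return 0;
    }
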
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 3f23eabe09f2076cc05ddc1805129615d1dde6bc..466ab10c1ff10de1d001178fde9fdce203c410e3 100644 (file)
@@ -517,7 +517,7 @@ retry:
                        }
                        alloc_tail->group_head = alloc_start;
                        alloc_tail->async_tx.cookie = -EBUSY;
-                       list_splice(&chain, &alloc_tail->async_tx.tx_list);
+                       list_splice(&chain, &alloc_tail->tx_list);
                        mv_chan->last_used = last_used;
                        mv_desc_clear_next_desc(alloc_start);
                        mv_desc_clear_next_desc(alloc_tail);
@@ -565,14 +565,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx)
        cookie = mv_desc_assign_cookie(mv_chan, sw_desc);
 
        if (list_empty(&mv_chan->chain))
-               list_splice_init(&sw_desc->async_tx.tx_list, &mv_chan->chain);
+               list_splice_init(&sw_desc->tx_list, &mv_chan->chain);
        else {
                new_hw_chain = 0;
 
                old_chain_tail = list_entry(mv_chan->chain.prev,
                                            struct mv_xor_desc_slot,
                                            chain_node);
-               list_splice_init(&grp_start->async_tx.tx_list,
+               list_splice_init(&grp_start->tx_list,
                                 &old_chain_tail->chain_node);
 
                if (!mv_can_chain(grp_start))
@@ -632,6 +632,7 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
                slot->async_tx.tx_submit = mv_xor_tx_submit;
                INIT_LIST_HEAD(&slot->chain_node);
                INIT_LIST_HEAD(&slot->slot_node);
+               INIT_LIST_HEAD(&slot->tx_list);
                hw_desc = (char *) mv_chan->device->dma_desc_pool;
                slot->async_tx.phys =
                        (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE];
diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h
index 06cafe1ef521b06acaa8a2ea90ee5c76afbf72fb..977b592e976bfa9eea31b64cb0ef5aa8c1ff6b49 100644 (file)
@@ -126,9 +126,8 @@ struct mv_xor_chan {
  * @idx: pool index
  * @unmap_src_cnt: number of xor sources
  * @unmap_len: transaction bytecount
+ * @tx_list: list of slots that make up a multi-descriptor transaction
  * @async_tx: support for the async_tx api
- * @group_list: list of slots that make up a multi-descriptor transaction
- *     for example transfer lengths larger than the supported hw max
  * @xor_check_result: result of zero sum
  * @crc32_result: result crc calculation
  */
@@ -145,6 +144,7 @@ struct mv_xor_desc_slot {
        u16                     unmap_src_cnt;
        u32                     value;
        size_t                  unmap_len;
+       struct list_head        tx_list;
        struct dma_async_tx_descriptor  async_tx;
        union {
                u32             *xor_check_result;
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
new file mode 100644 (file)
index 0000000..b3b065c
--- /dev/null
@@ -0,0 +1,786 @@
+/*
+ * Renesas SuperH DMA Engine support
+ *
+ * based on drivers/dma/fsldma.c
+ *
+ * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
+ * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
+ * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * - The SuperH DMAC does not have a hardware DMA chain mode.
+ * - The maximum DMA transfer size is 16MB.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/platform_device.h>
+#include <cpu/dma.h>
+#include <asm/dma-sh.h>
+#include "shdma.h"
+
+/* DMA descriptor control */
+#define DESC_LAST      (-1)
+#define DESC_COMP      (1)
+#define DESC_NCOMP     (0)
+
+#define NR_DESCS_PER_CHANNEL 32
+/*
+ * Define the default configuration for dual address memory-memory transfer.
+ * The 0x400 value represents auto-request, external->external.
+ *
+ * This driver also sets 4-byte burst mode by default.
+ * To change the mode, change the value of RS_DEFAULT
+ * (e.g. for 1-byte burst mode use (RS_DUAL & ~TS_32)).
+ */
+#define RS_DEFAULT  (RS_DUAL)
+
+#define SH_DMAC_CHAN_BASE(id) (dma_base_addr[id])
+static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg)
+{
+       ctrl_outl(data, (SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
+}
+
+static u32 sh_dmae_readl(struct sh_dmae_chan *sh_dc, u32 reg)
+{
+       return ctrl_inl((SH_DMAC_CHAN_BASE(sh_dc->id) + reg));
+}
+
+static void dmae_init(struct sh_dmae_chan *sh_chan)
+{
+       u32 chcr = RS_DEFAULT; /* default is DUAL mode */
+       sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+/*
+ * Reset DMA controller
+ *
+ * SH7780 has two DMAOR registers.
+ */
+static void sh_dmae_ctl_stop(int id)
+{
+       unsigned short dmaor = dmaor_read_reg(id);
+
+       dmaor &= ~(DMAOR_NMIF | DMAOR_AE);
+       dmaor_write_reg(id, dmaor);
+}
+
+static int sh_dmae_rst(int id)
+{
+       unsigned short dmaor;
+
+       sh_dmae_ctl_stop(id);
+       dmaor = (dmaor_read_reg(id)|DMAOR_INIT);
+
+       dmaor_write_reg(id, dmaor);
+       if ((dmaor_read_reg(id) & (DMAOR_AE | DMAOR_NMIF))) {
+               pr_err("dma-sh: Can't initialize DMAOR.\n");
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int dmae_is_idle(struct sh_dmae_chan *sh_chan)
+{
+       u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+       if (chcr & CHCR_DE) {
+               if (!(chcr & CHCR_TE))
+                       return -EBUSY; /* working */
+       }
+       return 0; /* waiting */
+}
+
+static inline unsigned int calc_xmit_shift(struct sh_dmae_chan *sh_chan)
+{
+       u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+       return ts_shift[(chcr & CHCR_TS_MASK) >> CHCR_TS_SHIFT];
+}
+
+static void dmae_set_reg(struct sh_dmae_chan *sh_chan, struct sh_dmae_regs hw)
+{
+       sh_dmae_writel(sh_chan, hw.sar, SAR);
+       sh_dmae_writel(sh_chan, hw.dar, DAR);
+       sh_dmae_writel(sh_chan,
+               (hw.tcr >> calc_xmit_shift(sh_chan)), TCR);
+}
+
+static void dmae_start(struct sh_dmae_chan *sh_chan)
+{
+       u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+       chcr |= (CHCR_DE|CHCR_IE);
+       sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+static void dmae_halt(struct sh_dmae_chan *sh_chan)
+{
+       u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+       chcr &= ~(CHCR_DE | CHCR_TE | CHCR_IE);
+       sh_dmae_writel(sh_chan, chcr, CHCR);
+}
+
+static int dmae_set_chcr(struct sh_dmae_chan *sh_chan, u32 val)
+{
+       int ret = dmae_is_idle(sh_chan);
+       /* CHCR cannot be written while the DMA is busy */
+       if (ret)
+               return ret;
+
+       sh_dmae_writel(sh_chan, val, CHCR);
+       return 0;
+}
+
+#define DMARS1_ADDR    0x04
+#define DMARS2_ADDR    0x08
+#define DMARS_SHIFT 8
+#define DMARS_CHAN_MSK 0x01
+static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val)
+{
+       u32 addr;
+       int shift = 0;
+       int ret = dmae_is_idle(sh_chan);
+       if (ret)
+               return ret;
+
+       if (sh_chan->id & DMARS_CHAN_MSK)
+               shift = DMARS_SHIFT;
+
+       switch (sh_chan->id) {
+       /* DMARS0 */
+       case 0:
+       case 1:
+               addr = SH_DMARS_BASE;
+               break;
+       /* DMARS1 */
+       case 2:
+       case 3:
+               addr = (SH_DMARS_BASE + DMARS1_ADDR);
+               break;
+       /* DMARS2 */
+       case 4:
+       case 5:
+               addr = (SH_DMARS_BASE + DMARS2_ADDR);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ctrl_outw((val << shift) |
+               (ctrl_inw(addr) & (shift ? 0xFF00 : 0x00FF)),
+               addr);
+
+       return 0;
+}
+
+static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+       struct sh_desc *desc = tx_to_sh_desc(tx);
+       struct sh_dmae_chan *sh_chan = to_sh_chan(tx->chan);
+       dma_cookie_t cookie;
+
+       spin_lock_bh(&sh_chan->desc_lock);
+
+       cookie = sh_chan->common.cookie;
+       cookie++;
+       if (cookie < 0)
+               cookie = 1;
+
+       /* Only a sole descriptor still has its cookie set to -EBUSY */
+       if (desc->async_tx.cookie != -EBUSY)
+               desc->async_tx.cookie = cookie;
+       sh_chan->common.cookie = desc->async_tx.cookie;
+
+       list_splice_init(&desc->tx_list, sh_chan->ld_queue.prev);
+
+       spin_unlock_bh(&sh_chan->desc_lock);
+
+       return cookie;
+}
+
+static struct sh_desc *sh_dmae_get_desc(struct sh_dmae_chan *sh_chan)
+{
+       struct sh_desc *desc, *_desc, *ret = NULL;
+
+       spin_lock_bh(&sh_chan->desc_lock);
+       list_for_each_entry_safe(desc, _desc, &sh_chan->ld_free, node) {
+               if (async_tx_test_ack(&desc->async_tx)) {
+                       list_del(&desc->node);
+                       ret = desc;
+                       break;
+               }
+       }
+       spin_unlock_bh(&sh_chan->desc_lock);
+
+       return ret;
+}
+
+static void sh_dmae_put_desc(struct sh_dmae_chan *sh_chan, struct sh_desc *desc)
+{
+       if (desc) {
+               spin_lock_bh(&sh_chan->desc_lock);
+
+               list_splice_init(&desc->tx_list, &sh_chan->ld_free);
+               list_add(&desc->node, &sh_chan->ld_free);
+
+               spin_unlock_bh(&sh_chan->desc_lock);
+       }
+}
+
+static int sh_dmae_alloc_chan_resources(struct dma_chan *chan)
+{
+       struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+       struct sh_desc *desc;
+
+       spin_lock_bh(&sh_chan->desc_lock);
+       while (sh_chan->descs_allocated < NR_DESCS_PER_CHANNEL) {
+               spin_unlock_bh(&sh_chan->desc_lock);
+               desc = kzalloc(sizeof(struct sh_desc), GFP_KERNEL);
+               if (!desc) {
+                       spin_lock_bh(&sh_chan->desc_lock);
+                       break;
+               }
+               dma_async_tx_descriptor_init(&desc->async_tx,
+                                       &sh_chan->common);
+               desc->async_tx.tx_submit = sh_dmae_tx_submit;
+               desc->async_tx.flags = DMA_CTRL_ACK;
+               INIT_LIST_HEAD(&desc->tx_list);
+               sh_dmae_put_desc(sh_chan, desc);
+
+               spin_lock_bh(&sh_chan->desc_lock);
+               sh_chan->descs_allocated++;
+       }
+       spin_unlock_bh(&sh_chan->desc_lock);
+
+       return sh_chan->descs_allocated;
+}
+
+/*
+ * sh_dmae_free_chan_resources - Free all resources of the channel.
+ */
+static void sh_dmae_free_chan_resources(struct dma_chan *chan)
+{
+       struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+       struct sh_desc *desc, *_desc;
+       LIST_HEAD(list);
+
+       BUG_ON(!list_empty(&sh_chan->ld_queue));
+       spin_lock_bh(&sh_chan->desc_lock);
+
+       list_splice_init(&sh_chan->ld_free, &list);
+       sh_chan->descs_allocated = 0;
+
+       spin_unlock_bh(&sh_chan->desc_lock);
+
+       list_for_each_entry_safe(desc, _desc, &list, node)
+               kfree(desc);
+}
+
+static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy(
+       struct dma_chan *chan, dma_addr_t dma_dest, dma_addr_t dma_src,
+       size_t len, unsigned long flags)
+{
+       struct sh_dmae_chan *sh_chan;
+       struct sh_desc *first = NULL, *prev = NULL, *new;
+       size_t copy_size;
+
+       if (!chan)
+               return NULL;
+
+       if (!len)
+               return NULL;
+
+       sh_chan = to_sh_chan(chan);
+
+       do {
+               /* Allocate the link descriptor from DMA pool */
+               new = sh_dmae_get_desc(sh_chan);
+               if (!new) {
+                       dev_err(sh_chan->dev,
+                                       "No free memory for link descriptor\n");
+                       goto err_get_desc;
+               }
+
+               copy_size = min(len, (size_t)SH_DMA_TCR_MAX);
+
+               new->hw.sar = dma_src;
+               new->hw.dar = dma_dest;
+               new->hw.tcr = copy_size;
+               if (!first)
+                       first = new;
+
+               new->mark = DESC_NCOMP;
+               async_tx_ack(&new->async_tx);
+
+               prev = new;
+               len -= copy_size;
+               dma_src += copy_size;
+               dma_dest += copy_size;
+               /* Insert the link descriptor to the LD ring */
+               list_add_tail(&new->node, &first->tx_list);
+       } while (len);
+
+       new->async_tx.flags = flags; /* client is in control of this ack */
+       new->async_tx.cookie = -EBUSY; /* Last desc */
+
+       return &first->async_tx;
+
+err_get_desc:
+       sh_dmae_put_desc(sh_chan, first);
+       return NULL;
+
+}
+
+/*
+ * sh_dmae_chan_ld_cleanup - Clean up link descriptors
+ *
+ * This function cleans up the ld_queue of the DMA channel.
+ */
+static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan)
+{
+       struct sh_desc *desc, *_desc;
+
+       spin_lock_bh(&sh_chan->desc_lock);
+       list_for_each_entry_safe(desc, _desc, &sh_chan->ld_queue, node) {
+               dma_async_tx_callback callback;
+               void *callback_param;
+
+               /* descriptor not yet transferred */
+               if (desc->mark == DESC_NCOMP)
+                       break;
+
+               /* completed descriptor */
+               callback = desc->async_tx.callback;
+               callback_param = desc->async_tx.callback_param;
+
+               /* Remove from ld_queue list */
+               list_splice_init(&desc->tx_list, &sh_chan->ld_free);
+
+               dev_dbg(sh_chan->dev, "link descriptor %p will be recycled.\n",
+                               desc);
+
+               list_move(&desc->node, &sh_chan->ld_free);
+               /* Run the link descriptor callback function */
+               if (callback) {
+                       spin_unlock_bh(&sh_chan->desc_lock);
+                       dev_dbg(sh_chan->dev, "link descriptor %p callback\n",
+                                       desc);
+                       callback(callback_param);
+                       spin_lock_bh(&sh_chan->desc_lock);
+               }
+       }
+       spin_unlock_bh(&sh_chan->desc_lock);
+}
+
+static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan)
+{
+       struct list_head *ld_node;
+       struct sh_dmae_regs hw;
+
+       /* DMA work check */
+       if (dmae_is_idle(sh_chan))
+               return;
+
+       /* Find the first untransferred descriptor */
+       for (ld_node = sh_chan->ld_queue.next;
+               (ld_node != &sh_chan->ld_queue)
+                       && (to_sh_desc(ld_node)->mark == DESC_COMP);
+               ld_node = ld_node->next)
+               cpu_relax();
+
+       if (ld_node != &sh_chan->ld_queue) {
+               /* Get the ld start address from ld_queue */
+               hw = to_sh_desc(ld_node)->hw;
+               dmae_set_reg(sh_chan, hw);
+               dmae_start(sh_chan);
+       }
+}
+
+static void sh_dmae_memcpy_issue_pending(struct dma_chan *chan)
+{
+       struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+       sh_chan_xfer_ld_queue(sh_chan);
+}
+
+static enum dma_status sh_dmae_is_complete(struct dma_chan *chan,
+                                       dma_cookie_t cookie,
+                                       dma_cookie_t *done,
+                                       dma_cookie_t *used)
+{
+       struct sh_dmae_chan *sh_chan = to_sh_chan(chan);
+       dma_cookie_t last_used;
+       dma_cookie_t last_complete;
+
+       sh_dmae_chan_ld_cleanup(sh_chan);
+
+       last_used = chan->cookie;
+       last_complete = sh_chan->completed_cookie;
+       if (last_complete == -EBUSY)
+               last_complete = last_used;
+
+       if (done)
+               *done = last_complete;
+
+       if (used)
+               *used = last_used;
+
+       return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static irqreturn_t sh_dmae_interrupt(int irq, void *data)
+{
+       irqreturn_t ret = IRQ_NONE;
+       struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
+       u32 chcr = sh_dmae_readl(sh_chan, CHCR);
+
+       if (chcr & CHCR_TE) {
+               /* DMA stop */
+               dmae_halt(sh_chan);
+
+               ret = IRQ_HANDLED;
+               tasklet_schedule(&sh_chan->tasklet);
+       }
+
+       return ret;
+}
+
+#if defined(CONFIG_CPU_SH4)
+static irqreturn_t sh_dmae_err(int irq, void *data)
+{
+       int err = 0;
+       struct sh_dmae_device *shdev = (struct sh_dmae_device *)data;
+
+       /* IRQ Multi */
+       if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+               int cnt = 0;
+               switch (irq) {
+#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
+               case DMTE6_IRQ:
+                       cnt++;
+#endif
+               case DMTE0_IRQ:
+                       if (dmaor_read_reg(cnt) & (DMAOR_NMIF | DMAOR_AE)) {
+                               disable_irq(irq);
+                               return IRQ_HANDLED;
+                       }
+               default:
+                       return IRQ_NONE;
+               }
+       } else {
+               /* reset dma controller */
+               err = sh_dmae_rst(0);
+               if (err)
+                       return err;
+               if (shdev->pdata.mode & SHDMA_DMAOR1) {
+                       err = sh_dmae_rst(1);
+                       if (err)
+                               return err;
+               }
+               disable_irq(irq);
+               return IRQ_HANDLED;
+       }
+}
+#endif
+
+static void dmae_do_tasklet(unsigned long data)
+{
+       struct sh_dmae_chan *sh_chan = (struct sh_dmae_chan *)data;
+       struct sh_desc *desc, *_desc, *cur_desc = NULL;
+       u32 sar_buf = sh_dmae_readl(sh_chan, SAR);
+       list_for_each_entry_safe(desc, _desc,
+                                       &sh_chan->ld_queue, node) {
+               if ((desc->hw.sar + desc->hw.tcr) == sar_buf) {
+                       cur_desc = desc;
+                       break;
+               }
+       }
+
+       if (cur_desc) {
+               switch (cur_desc->async_tx.cookie) {
+               case 0: /* other desc data */
+                       break;
+               case -EBUSY: /* last desc */
+                       sh_chan->completed_cookie =
+                               cur_desc->async_tx.cookie;
+                       break;
+               default: /* first desc (cookie > 0) */
+                       sh_chan->completed_cookie =
+                               cur_desc->async_tx.cookie - 1;
+                       break;
+               }
+               cur_desc->mark = DESC_COMP;
+       }
+       /* Next desc */
+       sh_chan_xfer_ld_queue(sh_chan);
+       sh_dmae_chan_ld_cleanup(sh_chan);
+}
+
+static unsigned int get_dmae_irq(unsigned int id)
+{
+       unsigned int irq = 0;
+       if (id < ARRAY_SIZE(dmte_irq_map))
+               irq = dmte_irq_map[id];
+       return irq;
+}
+
+static int __devinit sh_dmae_chan_probe(struct sh_dmae_device *shdev, int id)
+{
+       int err;
+       unsigned int irq = get_dmae_irq(id);
+       unsigned long irqflags = IRQF_DISABLED;
+       struct sh_dmae_chan *new_sh_chan;
+
+       /* alloc channel */
+       new_sh_chan = kzalloc(sizeof(struct sh_dmae_chan), GFP_KERNEL);
+       if (!new_sh_chan) {
+               dev_err(shdev->common.dev, "No free memory for allocating "
+                               "dma channels!\n");
+               return -ENOMEM;
+       }
+
+       new_sh_chan->dev = shdev->common.dev;
+       new_sh_chan->id = id;
+
+       /* Init DMA tasklet */
+       tasklet_init(&new_sh_chan->tasklet, dmae_do_tasklet,
+                       (unsigned long)new_sh_chan);
+
+       /* Init the channel */
+       dmae_init(new_sh_chan);
+
+       spin_lock_init(&new_sh_chan->desc_lock);
+
+       /* Init descriptor management lists */
+       INIT_LIST_HEAD(&new_sh_chan->ld_queue);
+       INIT_LIST_HEAD(&new_sh_chan->ld_free);
+
+       /* point back to the parent struct dma_device */
+       new_sh_chan->common.device = &shdev->common;
+
+       /* Add the channel to DMA device channel list */
+       list_add_tail(&new_sh_chan->common.device_node,
+                       &shdev->common.channels);
+       shdev->common.chancnt++;
+
+       if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+               irqflags = IRQF_SHARED;
+#if defined(DMTE6_IRQ)
+               if (irq >= DMTE6_IRQ)
+                       irq = DMTE6_IRQ;
+               else
+#endif
+                       irq = DMTE0_IRQ;
+       }
+
+       snprintf(new_sh_chan->dev_id, sizeof(new_sh_chan->dev_id),
+                       "sh-dmae%d", new_sh_chan->id);
+
+       /* set up channel irq */
+       err = request_irq(irq, &sh_dmae_interrupt,
+               irqflags, new_sh_chan->dev_id, new_sh_chan);
+       if (err) {
+               dev_err(shdev->common.dev, "DMA channel %d request_irq error "
+                       "with return %d\n", id, err);
+               goto err_no_irq;
+       }
+
+       /* CHCR register control function */
+       new_sh_chan->set_chcr = dmae_set_chcr;
+       /* DMARS register control function */
+       new_sh_chan->set_dmars = dmae_set_dmars;
+
+       shdev->chan[id] = new_sh_chan;
+       return 0;
+
+err_no_irq:
+       /* remove from dmaengine device node */
+       list_del(&new_sh_chan->common.device_node);
+       kfree(new_sh_chan);
+       return err;
+}
+
+static void sh_dmae_chan_remove(struct sh_dmae_device *shdev)
+{
+       int i;
+
+       for (i = shdev->common.chancnt - 1 ; i >= 0 ; i--) {
+               if (shdev->chan[i]) {
+                       struct sh_dmae_chan *shchan = shdev->chan[i];
+                       if (!(shdev->pdata.mode & SHDMA_MIX_IRQ))
+                               free_irq(dmte_irq_map[i], shchan);
+
+                       list_del(&shchan->common.device_node);
+                       kfree(shchan);
+                       shdev->chan[i] = NULL;
+               }
+       }
+       shdev->common.chancnt = 0;
+}
+
+static int __init sh_dmae_probe(struct platform_device *pdev)
+{
+       int err = 0, cnt, ecnt;
+       unsigned long irqflags = IRQF_DISABLED;
+#if defined(CONFIG_CPU_SH4)
+       int eirq[] = { DMAE0_IRQ,
+#if defined(DMAE1_IRQ)
+                       DMAE1_IRQ
+#endif
+               };
+#endif
+       struct sh_dmae_device *shdev;
+
+       shdev = kzalloc(sizeof(struct sh_dmae_device), GFP_KERNEL);
+       if (!shdev) {
+               dev_err(&pdev->dev, "Not enough memory\n");
+               err = -ENOMEM;
+               goto shdev_err;
+       }
+
+       /* get platform data */
+       if (!pdev->dev.platform_data) {
+               err = -ENODEV;
+               goto rst_err;
+       }
+
+       /* platform data */
+       memcpy(&shdev->pdata, pdev->dev.platform_data,
+                       sizeof(struct sh_dmae_pdata));
+
+       /* reset dma controller */
+       err = sh_dmae_rst(0);
+       if (err)
+               goto rst_err;
+
+       /* SH7780/85/23 has DMAOR1 */
+       if (shdev->pdata.mode & SHDMA_DMAOR1) {
+               err = sh_dmae_rst(1);
+               if (err)
+                       goto rst_err;
+       }
+
+       INIT_LIST_HEAD(&shdev->common.channels);
+
+       dma_cap_set(DMA_MEMCPY, shdev->common.cap_mask);
+       shdev->common.device_alloc_chan_resources
+               = sh_dmae_alloc_chan_resources;
+       shdev->common.device_free_chan_resources = sh_dmae_free_chan_resources;
+       shdev->common.device_prep_dma_memcpy = sh_dmae_prep_memcpy;
+       shdev->common.device_is_tx_complete = sh_dmae_is_complete;
+       shdev->common.device_issue_pending = sh_dmae_memcpy_issue_pending;
+       shdev->common.dev = &pdev->dev;
+
+#if defined(CONFIG_CPU_SH4)
+       /* Mixed IRQ mode (SH7722/SH7730 etc.): error handling shares the transfer-end IRQs */
+       if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+               irqflags = IRQF_SHARED;
+               eirq[0] = DMTE0_IRQ;
+#if defined(DMTE6_IRQ) && defined(DMAE1_IRQ)
+               eirq[1] = DMTE6_IRQ;
+#endif
+       }
+
+       for (ecnt = 0 ; ecnt < ARRAY_SIZE(eirq); ecnt++) {
+               err = request_irq(eirq[ecnt], sh_dmae_err,
+                       irqflags, "DMAC Address Error", shdev);
+               if (err) {
+                       dev_err(&pdev->dev, "DMA device request_irq "
+                               "error (irq %d) with return %d\n",
+                               eirq[ecnt], err);
+                       goto eirq_err;
+               }
+       }
+#endif /* CONFIG_CPU_SH4 */
+
+       /* Create DMA Channel */
+       for (cnt = 0 ; cnt < MAX_DMA_CHANNELS ; cnt++) {
+               err = sh_dmae_chan_probe(shdev, cnt);
+               if (err)
+                       goto chan_probe_err;
+       }
+
+       platform_set_drvdata(pdev, shdev);
+       dma_async_device_register(&shdev->common);
+
+       return err;
+
+chan_probe_err:
+       sh_dmae_chan_remove(shdev);
+
+eirq_err:
+       for (ecnt-- ; ecnt >= 0; ecnt--)
+               free_irq(eirq[ecnt], shdev);
+
+rst_err:
+       kfree(shdev);
+
+shdev_err:
+       return err;
+}
+
+static int __exit sh_dmae_remove(struct platform_device *pdev)
+{
+       struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
+
+       dma_async_device_unregister(&shdev->common);
+
+       if (shdev->pdata.mode & SHDMA_MIX_IRQ) {
+               free_irq(DMTE0_IRQ, shdev);
+#if defined(DMTE6_IRQ)
+               free_irq(DMTE6_IRQ, shdev);
+#endif
+       }
+
+       /* channel data remove */
+       sh_dmae_chan_remove(shdev);
+
+       if (!(shdev->pdata.mode & SHDMA_MIX_IRQ)) {
+               free_irq(DMAE0_IRQ, shdev);
+#if defined(DMAE1_IRQ)
+               free_irq(DMAE1_IRQ, shdev);
+#endif
+       }
+       kfree(shdev);
+
+       return 0;
+}
+
+static void sh_dmae_shutdown(struct platform_device *pdev)
+{
+       struct sh_dmae_device *shdev = platform_get_drvdata(pdev);
+       sh_dmae_ctl_stop(0);
+       if (shdev->pdata.mode & SHDMA_DMAOR1)
+               sh_dmae_ctl_stop(1);
+}
+
+static struct platform_driver sh_dmae_driver = {
+       .remove         = __exit_p(sh_dmae_remove),
+       .shutdown       = sh_dmae_shutdown,
+       .driver = {
+               .name   = "sh-dma-engine",
+       },
+};
+
+static int __init sh_dmae_init(void)
+{
+       return platform_driver_probe(&sh_dmae_driver, sh_dmae_probe);
+}
+module_init(sh_dmae_init);
+
+static void __exit sh_dmae_exit(void)
+{
+       platform_driver_unregister(&sh_dmae_driver);
+}
+module_exit(sh_dmae_exit);
+
+MODULE_AUTHOR("Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>");
+MODULE_DESCRIPTION("Renesas SH DMA Engine driver");
+MODULE_LICENSE("GPL");
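
For context, the channels registered above are consumed through the generic dmaengine API rather than any driver-private interface. A minimal, illustrative client sketch (not part of this patch; it assumes the stock 2.6.31 memcpy helpers and polls instead of using a completion callback):

#include <linux/dmaengine.h>
#include <linux/errno.h>

static int example_dma_copy(void *dst, void *src, size_t len)
{
	dma_cap_mask_t mask;
	struct dma_chan *chan;
	dma_cookie_t cookie;
	int ret = 0;

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY, mask);

	/* any channel advertising DMA_MEMCPY will do, e.g. one from sh-dma-engine */
	chan = dma_request_channel(mask, NULL, NULL);
	if (!chan)
		return -ENODEV;

	cookie = dma_async_memcpy_buf_to_buf(chan, dst, src, len);
	if (cookie < 0) {
		ret = cookie;
		goto out;
	}

	dma_async_issue_pending(chan);

	/* busy-wait for completion; a real client would use a callback */
	while (dma_async_is_tx_complete(chan, cookie, NULL, NULL) == DMA_IN_PROGRESS)
		cpu_relax();
out:
	dma_release_channel(chan);
	return ret;
}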
diff --git a/drivers/dma/shdma.h b/drivers/dma/shdma.h
new file mode 100644 (file)
index 0000000..2b4bc15
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Renesas SuperH DMA Engine support
+ *
+ * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
+ * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#ifndef __DMA_SHDMA_H
+#define __DMA_SHDMA_H
+
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/dmaengine.h>
+
+#define SH_DMA_TCR_MAX 0x00FFFFFF      /* 16MB */
+
+struct sh_dmae_regs {
+       u32 sar; /* SAR / source address */
+       u32 dar; /* DAR / destination address */
+       u32 tcr; /* TCR / transfer count */
+};
+
+struct sh_desc {
+       struct list_head tx_list;
+       struct sh_dmae_regs hw;
+       struct list_head node;
+       struct dma_async_tx_descriptor async_tx;
+       int mark;
+};
+
+struct sh_dmae_chan {
+       dma_cookie_t completed_cookie;  /* The maximum cookie completed */
+       spinlock_t desc_lock;                   /* Descriptor operation lock */
+       struct list_head ld_queue;              /* Link descriptors queue */
+       struct list_head ld_free;               /* Link descriptors free */
+       struct dma_chan common;                 /* DMA common channel */
+       struct device *dev;                             /* Channel device */
+       struct tasklet_struct tasklet;  /* Tasklet */
+       int descs_allocated;                    /* desc count */
+       int id;                         /* Raw id of this channel */
+       char dev_id[16];        /* unique name per DMAC of channel */
+
+       /* Set chcr */
+       int (*set_chcr)(struct sh_dmae_chan *sh_chan, u32 regs);
+       /* Set DMA resource */
+       int (*set_dmars)(struct sh_dmae_chan *sh_chan, u16 res);
+};
+
+struct sh_dmae_device {
+       struct dma_device common;
+       struct sh_dmae_chan *chan[MAX_DMA_CHANNELS];
+       struct sh_dmae_pdata pdata;
+};
+
+#define to_sh_chan(chan) container_of(chan, struct sh_dmae_chan, common)
+#define to_sh_desc(lh) container_of(lh, struct sh_desc, node)
+#define tx_to_sh_desc(tx) container_of(tx, struct sh_desc, async_tx)
+
+#endif /* __DMA_SHDMA_H */
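
For reference, the driver binds by name to an "sh-dma-engine" platform device whose platform_data is a struct sh_dmae_pdata (declared in <asm/dma-sh.h>). A hypothetical board-file sketch, with the mode flags chosen per SoC, might look like this:

#include <linux/platform_device.h>
#include <asm/dma-sh.h>

/* illustrative only: mode depends on the SoC (0, SHDMA_MIX_IRQ, SHDMA_DMAOR1, ...) */
static struct sh_dmae_pdata sh_dmae_platform_data = {
	.mode = 0,
};

static struct platform_device sh_dmae_device = {
	.name	= "sh-dma-engine",
	.id	= -1,
	.dev	= {
		.platform_data = &sh_dmae_platform_data,
	},
};

/* registered from board setup code, e.g. via platform_device_register() */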
index 7837930146a4fb2d229ce28dcf56675084a5783a..fb6bb64e88619a729ab9b2cd1fa4574fd29be16e 100644 (file)
@@ -180,9 +180,8 @@ static struct txx9dmac_desc *txx9dmac_first_queued(struct txx9dmac_chan *dc)
 
 static struct txx9dmac_desc *txx9dmac_last_child(struct txx9dmac_desc *desc)
 {
-       if (!list_empty(&desc->txd.tx_list))
-               desc = list_entry(desc->txd.tx_list.prev,
-                                 struct txx9dmac_desc, desc_node);
+       if (!list_empty(&desc->tx_list))
+               desc = list_entry(desc->tx_list.prev, typeof(*desc), desc_node);
        return desc;
 }
 
@@ -197,6 +196,7 @@ static struct txx9dmac_desc *txx9dmac_desc_alloc(struct txx9dmac_chan *dc,
        desc = kzalloc(sizeof(*desc), flags);
        if (!desc)
                return NULL;
+       INIT_LIST_HEAD(&desc->tx_list);
        dma_async_tx_descriptor_init(&desc->txd, &dc->chan);
        desc->txd.tx_submit = txx9dmac_tx_submit;
        /* txd.flags will be overwritten in prep funcs */
@@ -245,7 +245,7 @@ static void txx9dmac_sync_desc_for_cpu(struct txx9dmac_chan *dc,
        struct txx9dmac_dev *ddev = dc->ddev;
        struct txx9dmac_desc *child;
 
-       list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+       list_for_each_entry(child, &desc->tx_list, desc_node)
                dma_sync_single_for_cpu(chan2parent(&dc->chan),
                                child->txd.phys, ddev->descsize,
                                DMA_TO_DEVICE);
@@ -267,11 +267,11 @@ static void txx9dmac_desc_put(struct txx9dmac_chan *dc,
                txx9dmac_sync_desc_for_cpu(dc, desc);
 
                spin_lock_bh(&dc->lock);
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        dev_vdbg(chan2dev(&dc->chan),
                                 "moving child desc %p to freelist\n",
                                 child);
-               list_splice_init(&desc->txd.tx_list, &dc->free_list);
+               list_splice_init(&desc->tx_list, &dc->free_list);
                dev_vdbg(chan2dev(&dc->chan), "moving desc %p to freelist\n",
                         desc);
                list_add(&desc->desc_node, &dc->free_list);
@@ -429,7 +429,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc,
        param = txd->callback_param;
 
        txx9dmac_sync_desc_for_cpu(dc, desc);
-       list_splice_init(&txd->tx_list, &dc->free_list);
+       list_splice_init(&desc->tx_list, &dc->free_list);
        list_move(&desc->desc_node, &dc->free_list);
 
        if (!ds) {
@@ -571,7 +571,7 @@ static void txx9dmac_handle_error(struct txx9dmac_chan *dc, u32 csr)
                 "Bad descriptor submitted for DMA! (cookie: %d)\n",
                 bad_desc->txd.cookie);
        txx9dmac_dump_desc(dc, &bad_desc->hwdesc);
-       list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
+       list_for_each_entry(child, &bad_desc->tx_list, desc_node)
                txx9dmac_dump_desc(dc, &child->hwdesc);
        /* Pretend the descriptor completed successfully */
        txx9dmac_descriptor_complete(dc, bad_desc);
@@ -613,7 +613,7 @@ static void txx9dmac_scan_descriptors(struct txx9dmac_chan *dc)
                        return;
                }
 
-               list_for_each_entry(child, &desc->txd.tx_list, desc_node)
+               list_for_each_entry(child, &desc->tx_list, desc_node)
                        if (desc_read_CHAR(dc, child) == chain) {
                                /* Currently in progress */
                                if (csr & TXX9_DMA_CSR_ABCHC)
@@ -823,8 +823,7 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                        dma_sync_single_for_device(chan2parent(&dc->chan),
                                        prev->txd.phys, ddev->descsize,
                                        DMA_TO_DEVICE);
-                       list_add_tail(&desc->desc_node,
-                                       &first->txd.tx_list);
+                       list_add_tail(&desc->desc_node, &first->tx_list);
                }
                prev = desc;
        }
@@ -919,8 +918,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                                        prev->txd.phys,
                                        ddev->descsize,
                                        DMA_TO_DEVICE);
-                       list_add_tail(&desc->desc_node,
-                                       &first->txd.tx_list);
+                       list_add_tail(&desc->desc_node, &first->tx_list);
                }
                prev = desc;
        }
index c907ff01d27603a06539e8ac89ec71651f86de34..365d42366b9f15e237833ace8530e1f4e78e1a21 100644 (file)
@@ -231,6 +231,7 @@ struct txx9dmac_desc {
 
        /* THEN values for driver housekeeping */
        struct list_head                desc_node ____cacheline_aligned;
+       struct list_head                tx_list;
        struct dma_async_tx_descriptor  txd;
        size_t                          len;
 };
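
The txx9dmac changes above follow the same conversion applied to the other engines in this series: the child-descriptor list moves from txd.tx_list into a driver-private tx_list. Reduced to its essentials (foo_* names are illustrative, not taken from any driver here), the pattern is:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/dmaengine.h>

struct foo_desc {
	struct list_head		desc_node;	/* link on the channel's lists */
	struct list_head		tx_list;	/* children of a multi-desc transaction */
	struct dma_async_tx_descriptor	txd;
};

static struct foo_desc *foo_desc_alloc(gfp_t flags)
{
	struct foo_desc *desc = kzalloc(sizeof(*desc), flags);

	if (desc)
		INIT_LIST_HEAD(&desc->tx_list);	/* initialized once, at allocation */
	return desc;
}

/* chain a follow-on descriptor behind the first descriptor of a transaction */
static void foo_desc_chain(struct foo_desc *first, struct foo_desc *child)
{
	list_add_tail(&child->desc_node, &first->tx_list);
}

/* on completion, return the first descriptor and all its children to the free list */
static void foo_desc_free_chain(struct foo_desc *first, struct list_head *free_list)
{
	list_splice_init(&first->tx_list, free_list);
	list_add(&first->desc_node, free_list);
}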
index a3ca18e2d7cfe76d285cb5aa728ff9a709def563..02127e59fe8e68f8f3518453028d47b84988e598 100644 (file)
@@ -133,6 +133,13 @@ config EDAC_I3000
          Support for error detection and correction on the Intel
          3000 and 3010 server chipsets.
 
+config EDAC_I3200
+       tristate "Intel 3200"
+       depends on EDAC_MM_EDAC && PCI && X86 && EXPERIMENTAL
+       help
+         Support for error detection and correction on the Intel
+         3200 and 3210 server chipsets.
+
 config EDAC_X38
        tristate "Intel X38"
        depends on EDAC_MM_EDAC && PCI && X86
@@ -176,11 +183,11 @@ config EDAC_I5100
          San Clemente MCH.
 
 config EDAC_MPC85XX
-       tristate "Freescale MPC85xx"
-       depends on EDAC_MM_EDAC && FSL_SOC && MPC85xx
+       tristate "Freescale MPC83xx / MPC85xx"
+       depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || MPC85xx)
        help
          Support for error detection and correction on the Freescale
-         MPC8560, MPC8540, MPC8548
+         MPC8349, MPC8560, MPC8540, MPC8548
 
 config EDAC_MV64X60
        tristate "Marvell MV64x60"
index cfa033ce53a7d3b2cfbaeb7f77d76fd902a610a0..7a473bbe8abd77bfab7784e2b707ca627a8cb798 100644 (file)
@@ -32,6 +32,7 @@ obj-$(CONFIG_EDAC_I82443BXGX)         += i82443bxgx_edac.o
 obj-$(CONFIG_EDAC_I82875P)             += i82875p_edac.o
 obj-$(CONFIG_EDAC_I82975X)             += i82975x_edac.o
 obj-$(CONFIG_EDAC_I3000)               += i3000_edac.o
+obj-$(CONFIG_EDAC_I3200)               += i3200_edac.o
 obj-$(CONFIG_EDAC_X38)                 += x38_edac.o
 obj-$(CONFIG_EDAC_I82860)              += i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)              += r82600_edac.o
@@ -49,3 +50,4 @@ obj-$(CONFIG_EDAC_CELL)                       += cell_edac.o
 obj-$(CONFIG_EDAC_PPC4XX)              += ppc4xx_edac.o
 obj-$(CONFIG_EDAC_AMD8111)             += amd8111_edac.o
 obj-$(CONFIG_EDAC_AMD8131)             += amd8131_edac.o
+
index 8c54196b5aba90011c5558ab312c7a19a2c54486..3d50274f1348d97b6e3cc4bc9d9f071e73ebb2ec 100644 (file)
@@ -885,14 +885,14 @@ static int __devinit cpc925_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdev->name)) {
                cpc925_printk(KERN_ERR, "Unable to request mem region\n");
                res = -EBUSY;
                goto err1;
        }
 
-       vbase = devm_ioremap(&pdev->dev, r->start, r->end - r->start + 1);
+       vbase = devm_ioremap(&pdev->dev, r->start, resource_size(r));
        if (!vbase) {
                cpc925_printk(KERN_ERR, "Unable to ioremap device\n");
                res = -ENOMEM;
@@ -953,7 +953,7 @@ err3:
        cpc925_mc_exit(mci);
        edac_mc_free(mci);
 err2:
-       devm_release_mem_region(&pdev->dev, r->start, r->end-r->start+1);
+       devm_release_mem_region(&pdev->dev, r->start, resource_size(r));
 err1:
        devres_release_group(&pdev->dev, cpc925_probe);
 out:
index b02a6a69a8f0148eebbaa19526a75e89890973d9..d5e13c94714f13bca93ae94927b55b144be9f10b 100644 (file)
@@ -356,7 +356,6 @@ static void complete_edac_device_list_del(struct rcu_head *head)
 
        edac_dev = container_of(head, struct edac_device_ctl_info, rcu);
        INIT_LIST_HEAD(&edac_dev->link);
-       complete(&edac_dev->removal_complete);
 }
 
 /*
@@ -369,10 +368,8 @@ static void del_edac_device_from_global_list(struct edac_device_ctl_info
                                                *edac_device)
 {
        list_del_rcu(&edac_device->link);
-
-       init_completion(&edac_device->removal_complete);
        call_rcu(&edac_device->rcu, complete_edac_device_list_del);
-       wait_for_completion(&edac_device->removal_complete);
+       rcu_barrier();
 }
 
 /*
index 335b7ebdb11c535481f785e11add07aef509177b..b629c41756f0324b348827cb1e1975d1cdefe7c9 100644 (file)
@@ -418,16 +418,14 @@ static void complete_mc_list_del(struct rcu_head *head)
 
        mci = container_of(head, struct mem_ctl_info, rcu);
        INIT_LIST_HEAD(&mci->link);
-       complete(&mci->complete);
 }
 
 static void del_mc_from_global_list(struct mem_ctl_info *mci)
 {
        atomic_dec(&edac_handlers);
        list_del_rcu(&mci->link);
-       init_completion(&mci->complete);
        call_rcu(&mci->rcu, complete_mc_list_del);
-       wait_for_completion(&mci->complete);
+       rcu_barrier();
 }
 
 /**
index 30b585b1d60bdcf50b433a93d948cc6b30e88f0c..efb5d565078304c60ea096b84bf1ae2b9761e4a4 100644 (file)
@@ -174,7 +174,6 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
 
        pci = container_of(head, struct edac_pci_ctl_info, rcu);
        INIT_LIST_HEAD(&pci->link);
-       complete(&pci->complete);
 }
 
 /*
@@ -185,9 +184,8 @@ static void complete_edac_pci_list_del(struct rcu_head *head)
 static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci)
 {
        list_del_rcu(&pci->link);
-       init_completion(&pci->complete);
        call_rcu(&pci->rcu, complete_edac_pci_list_del);
-       wait_for_completion(&pci->complete);
+       rcu_barrier();
 }
 
 #if 0
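
The three edac hunks above all make the same substitution: instead of a per-object completion signalled from the RCU callback, teardown now relies on rcu_barrier(), which waits for every pending call_rcu() callback to run. A reduced sketch of the pattern (foo_* names are illustrative):

#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kernel.h>

struct foo {
	struct list_head link;
	struct rcu_head rcu;
};

static void foo_list_del_rcu_cb(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	/* runs after a grace period; no reader can still see the entry */
	INIT_LIST_HEAD(&f->link);
}

static void foo_del_from_global_list(struct foo *f)
{
	list_del_rcu(&f->link);
	call_rcu(&f->rcu, foo_list_del_rcu_cb);

	/* block until all outstanding RCU callbacks (including the one above) complete */
	rcu_barrier();
}

This lets the completion field be dropped from the control structures without weakening the guarantee that the callback has finished before the object is freed.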
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
new file mode 100644 (file)
index 0000000..fde4db9
--- /dev/null
@@ -0,0 +1,527 @@
+/*
+ * Intel 3200/3210 Memory Controller kernel module
+ * Copyright (C) 2008-2009 Akamai Technologies, Inc.
+ * Portions by Hitoshi Mitake <h.mitake@gmail.com>.
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/edac.h>
+#include <linux/io.h>
+#include "edac_core.h"
+
+#define I3200_REVISION        "1.1"
+
+#define EDAC_MOD_STR        "i3200_edac"
+
+#define PCI_DEVICE_ID_INTEL_3200_HB    0x29f0
+
+#define I3200_RANKS            8
+#define I3200_RANKS_PER_CHANNEL        4
+#define I3200_CHANNELS         2
+
+/* Intel 3200 register addresses - device 0 function 0 - DRAM Controller */
+
+#define I3200_MCHBAR_LOW       0x48    /* MCH Memory Mapped Register BAR */
+#define I3200_MCHBAR_HIGH      0x4c
+#define I3200_MCHBAR_MASK      0xfffffc000ULL  /* bits 35:14 */
+#define I3200_MMR_WINDOW_SIZE  16384
+
+#define I3200_TOM              0xa0    /* Top of Memory (16b)
+                *
+                * 15:10 reserved
+                *  9:0  total populated physical memory
+                */
+#define I3200_TOM_MASK         0x3ff   /* bits 9:0 */
+#define I3200_TOM_SHIFT                26      /* 64MiB grain */
+
+#define I3200_ERRSTS           0xc8    /* Error Status Register (16b)
+                *
+                * 15    reserved
+                * 14    Isochronous TBWRR Run Behind FIFO Full
+                *       (ITCV)
+                * 13    Isochronous TBWRR Run Behind FIFO Put
+                *       (ITSTV)
+                * 12    reserved
+                * 11    MCH Thermal Sensor Event
+                *       for SMI/SCI/SERR (GTSE)
+                * 10    reserved
+                *  9    LOCK to non-DRAM Memory Flag (LCKF)
+                *  8    reserved
+                *  7    DRAM Throttle Flag (DTF)
+                *  6:2  reserved
+                *  1    Multi-bit DRAM ECC Error Flag (DMERR)
+                *  0    Single-bit DRAM ECC Error Flag (DSERR)
+                */
+#define I3200_ERRSTS_UE                0x0002
+#define I3200_ERRSTS_CE                0x0001
+#define I3200_ERRSTS_BITS      (I3200_ERRSTS_UE | I3200_ERRSTS_CE)
+
+
+/* Intel  MMIO register space - device 0 function 0 - MMR space */
+
+#define I3200_C0DRB    0x200   /* Channel 0 DRAM Rank Boundary (16b x 4)
+                *
+                * 15:10 reserved
+                *  9:0  Channel 0 DRAM Rank Boundary Address
+                */
+#define I3200_C1DRB    0x600   /* Channel 1 DRAM Rank Boundary (16b x 4) */
+#define I3200_DRB_MASK 0x3ff   /* bits 9:0 */
+#define I3200_DRB_SHIFT        26      /* 64MiB grain */
+
+#define I3200_C0ECCERRLOG      0x280   /* Channel 0 ECC Error Log (64b)
+                *
+                * 63:48 Error Column Address (ERRCOL)
+                * 47:32 Error Row Address (ERRROW)
+                * 31:29 Error Bank Address (ERRBANK)
+                * 28:27 Error Rank Address (ERRRANK)
+                * 26:24 reserved
+                * 23:16 Error Syndrome (ERRSYND)
+                * 15: 2 reserved
+                *    1  Multiple Bit Error Status (MERRSTS)
+                *    0  Correctable Error Status (CERRSTS)
+                */
+#define I3200_C1ECCERRLOG              0x680   /* Chan 1 ECC Error Log (64b) */
+#define I3200_ECCERRLOG_CE             0x1
+#define I3200_ECCERRLOG_UE             0x2
+#define I3200_ECCERRLOG_RANK_BITS      0x18000000
+#define I3200_ECCERRLOG_RANK_SHIFT     27
+#define I3200_ECCERRLOG_SYNDROME_BITS  0xff0000
+#define I3200_ECCERRLOG_SYNDROME_SHIFT 16
+#define I3200_CAPID0                   0xe0    /* P.95 of spec for details */
+
+struct i3200_priv {
+       void __iomem *window;
+};
+
+static int nr_channels;
+
+static int how_many_channels(struct pci_dev *pdev)
+{
+       unsigned char capid0_8b; /* 8th byte of CAPID0 */
+
+       pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
+       if (capid0_8b & 0x20) { /* check DCD: Dual Channel Disable */
+               debugf0("In single channel mode.\n");
+               return 1;
+       } else {
+               debugf0("In dual channel mode.\n");
+               return 2;
+       }
+}
+
+static unsigned long eccerrlog_syndrome(u64 log)
+{
+       return (log & I3200_ECCERRLOG_SYNDROME_BITS) >>
+               I3200_ECCERRLOG_SYNDROME_SHIFT;
+}
+
+static int eccerrlog_row(int channel, u64 log)
+{
+       u64 rank = ((log & I3200_ECCERRLOG_RANK_BITS) >>
+               I3200_ECCERRLOG_RANK_SHIFT);
+       return rank | (channel * I3200_RANKS_PER_CHANNEL);
+}
+
+enum i3200_chips {
+       I3200 = 0,
+};
+
+struct i3200_dev_info {
+       const char *ctl_name;
+};
+
+struct i3200_error_info {
+       u16 errsts;
+       u16 errsts2;
+       u64 eccerrlog[I3200_CHANNELS];
+};
+
+static const struct i3200_dev_info i3200_devs[] = {
+       [I3200] = {
+               .ctl_name = "i3200"
+       },
+};
+
+static struct pci_dev *mci_pdev;
+static int i3200_registered = 1;
+
+
+static void i3200_clear_error_info(struct mem_ctl_info *mci)
+{
+       struct pci_dev *pdev;
+
+       pdev = to_pci_dev(mci->dev);
+
+       /*
+        * Clear any error bits.
+        * (Yes, we really clear bits by writing 1 to them.)
+        */
+       pci_write_bits16(pdev, I3200_ERRSTS, I3200_ERRSTS_BITS,
+               I3200_ERRSTS_BITS);
+}
+
+static void i3200_get_and_clear_error_info(struct mem_ctl_info *mci,
+               struct i3200_error_info *info)
+{
+       struct pci_dev *pdev;
+       struct i3200_priv *priv = mci->pvt_info;
+       void __iomem *window = priv->window;
+
+       pdev = to_pci_dev(mci->dev);
+
+       /*
+        * This is a mess because there is no atomic way to read all the
+        * registers at once, and a CE can be overwritten by a UE between
+        * reads.
+        */
+       pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts);
+       if (!(info->errsts & I3200_ERRSTS_BITS))
+               return;
+
+       info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
+       if (nr_channels == 2)
+               info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
+
+       pci_read_config_word(pdev, I3200_ERRSTS, &info->errsts2);
+
+       /*
+        * If the error is the same for both reads then the first set
+        * of reads is valid.  If there is a change then there is a CE
+        * with no info and the second set of reads is valid and
+        * should be UE info.
+        */
+       if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
+               info->eccerrlog[0] = readq(window + I3200_C0ECCERRLOG);
+               if (nr_channels == 2)
+                       info->eccerrlog[1] = readq(window + I3200_C1ECCERRLOG);
+       }
+
+       i3200_clear_error_info(mci);
+}
+
+static void i3200_process_error_info(struct mem_ctl_info *mci,
+               struct i3200_error_info *info)
+{
+       int channel;
+       u64 log;
+
+       if (!(info->errsts & I3200_ERRSTS_BITS))
+               return;
+
+       if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) {
+               edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+               info->errsts = info->errsts2;
+       }
+
+       for (channel = 0; channel < nr_channels; channel++) {
+               log = info->eccerrlog[channel];
+               if (log & I3200_ECCERRLOG_UE) {
+                       edac_mc_handle_ue(mci, 0, 0,
+                               eccerrlog_row(channel, log),
+                               "i3200 UE");
+               } else if (log & I3200_ECCERRLOG_CE) {
+                       edac_mc_handle_ce(mci, 0, 0,
+                               eccerrlog_syndrome(log),
+                               eccerrlog_row(channel, log), 0,
+                               "i3200 CE");
+               }
+       }
+}
+
+static void i3200_check(struct mem_ctl_info *mci)
+{
+       struct i3200_error_info info;
+
+       debugf1("MC%d: %s()\n", mci->mc_idx, __func__);
+       i3200_get_and_clear_error_info(mci, &info);
+       i3200_process_error_info(mci, &info);
+}
+
+
+static void __iomem *i3200_map_mchbar(struct pci_dev *pdev)
+{
+       union {
+               u64 mchbar;
+               struct {
+                       u32 mchbar_low;
+                       u32 mchbar_high;
+               };
+       } u;
+       void __iomem *window;
+
+       pci_read_config_dword(pdev, I3200_MCHBAR_LOW, &u.mchbar_low);
+       pci_read_config_dword(pdev, I3200_MCHBAR_HIGH, &u.mchbar_high);
+       u.mchbar &= I3200_MCHBAR_MASK;
+
+       if (u.mchbar != (resource_size_t)u.mchbar) {
+               printk(KERN_ERR
+                       "i3200: mmio space beyond accessible range (0x%llx)\n",
+                       (unsigned long long)u.mchbar);
+               return NULL;
+       }
+
+       window = ioremap_nocache(u.mchbar, I3200_MMR_WINDOW_SIZE);
+       if (!window)
+               printk(KERN_ERR "i3200: cannot map mmio space at 0x%llx\n",
+                       (unsigned long long)u.mchbar);
+
+       return window;
+}
+
+
+static void i3200_get_drbs(void __iomem *window,
+       u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
+{
+       int i;
+
+       for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
+               drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
+               drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
+       }
+}
+
+static bool i3200_is_stacked(struct pci_dev *pdev,
+       u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL])
+{
+       u16 tom;
+
+       pci_read_config_word(pdev, I3200_TOM, &tom);
+       tom &= I3200_TOM_MASK;
+
+       return drbs[I3200_CHANNELS - 1][I3200_RANKS_PER_CHANNEL - 1] == tom;
+}
+
+static unsigned long drb_to_nr_pages(
+       u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL], bool stacked,
+       int channel, int rank)
+{
+       int n;
+
+       n = drbs[channel][rank];
+       if (rank > 0)
+               n -= drbs[channel][rank - 1];
+       if (stacked && (channel == 1) &&
+           drbs[channel][rank] == drbs[channel][I3200_RANKS_PER_CHANNEL - 1])
+               n -= drbs[0][I3200_RANKS_PER_CHANNEL - 1];
+
+       n <<= (I3200_DRB_SHIFT - PAGE_SHIFT);
+       return n;
+}
+
+static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
+{
+       int rc;
+       int i;
+       struct mem_ctl_info *mci = NULL;
+       unsigned long last_page;
+       u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL];
+       bool stacked;
+       void __iomem *window;
+       struct i3200_priv *priv;
+
+       debugf0("MC: %s()\n", __func__);
+
+       window = i3200_map_mchbar(pdev);
+       if (!window)
+               return -ENODEV;
+
+       i3200_get_drbs(window, drbs);
+       nr_channels = how_many_channels(pdev);
+
+       mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS,
+               nr_channels, 0);
+       if (!mci)
+               return -ENOMEM;
+
+       debugf3("MC: %s(): init mci\n", __func__);
+
+       mci->dev = &pdev->dev;
+       mci->mtype_cap = MEM_FLAG_DDR2;
+
+       mci->edac_ctl_cap = EDAC_FLAG_SECDED;
+       mci->edac_cap = EDAC_FLAG_SECDED;
+
+       mci->mod_name = EDAC_MOD_STR;
+       mci->mod_ver = I3200_REVISION;
+       mci->ctl_name = i3200_devs[dev_idx].ctl_name;
+       mci->dev_name = pci_name(pdev);
+       mci->edac_check = i3200_check;
+       mci->ctl_page_to_phys = NULL;
+       priv = mci->pvt_info;
+       priv->window = window;
+
+       stacked = i3200_is_stacked(pdev, drbs);
+
+       /*
+        * The dram rank boundary (DRB) reg values are boundary addresses
+        * for each DRAM rank with a granularity of 64MB.  DRB regs are
+        * cumulative; the last one will contain the total memory
+        * contained in all ranks.
+        */
+       last_page = -1UL;
+       for (i = 0; i < mci->nr_csrows; i++) {
+               unsigned long nr_pages;
+               struct csrow_info *csrow = &mci->csrows[i];
+
+               nr_pages = drb_to_nr_pages(drbs, stacked,
+                       i / I3200_RANKS_PER_CHANNEL,
+                       i % I3200_RANKS_PER_CHANNEL);
+
+               if (nr_pages == 0) {
+                       csrow->mtype = MEM_EMPTY;
+                       continue;
+               }
+
+               csrow->first_page = last_page + 1;
+               last_page += nr_pages;
+               csrow->last_page = last_page;
+               csrow->nr_pages = nr_pages;
+
+               csrow->grain = nr_pages << PAGE_SHIFT;
+               csrow->mtype = MEM_DDR2;
+               csrow->dtype = DEV_UNKNOWN;
+               csrow->edac_mode = EDAC_UNKNOWN;
+       }
+
+       i3200_clear_error_info(mci);
+
+       rc = -ENODEV;
+       if (edac_mc_add_mc(mci)) {
+               debugf3("MC: %s(): failed edac_mc_add_mc()\n", __func__);
+               goto fail;
+       }
+
+       /* get this far and it's successful */
+       debugf3("MC: %s(): success\n", __func__);
+       return 0;
+
+fail:
+       iounmap(window);
+       if (mci)
+               edac_mc_free(mci);
+
+       return rc;
+}
+
+static int __devinit i3200_init_one(struct pci_dev *pdev,
+               const struct pci_device_id *ent)
+{
+       int rc;
+
+       debugf0("MC: %s()\n", __func__);
+
+       if (pci_enable_device(pdev) < 0)
+               return -EIO;
+
+       rc = i3200_probe1(pdev, ent->driver_data);
+       if (!mci_pdev)
+               mci_pdev = pci_dev_get(pdev);
+
+       return rc;
+}
+
+static void __devexit i3200_remove_one(struct pci_dev *pdev)
+{
+       struct mem_ctl_info *mci;
+       struct i3200_priv *priv;
+
+       debugf0("%s()\n", __func__);
+
+       mci = edac_mc_del_mc(&pdev->dev);
+       if (!mci)
+               return;
+
+       priv = mci->pvt_info;
+       iounmap(priv->window);
+
+       edac_mc_free(mci);
+}
+
+static const struct pci_device_id i3200_pci_tbl[] __devinitdata = {
+       {
+               PCI_VEND_DEV(INTEL, 3200_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+               I3200},
+       {
+               0,
+       }            /* 0 terminated list. */
+};
+
+MODULE_DEVICE_TABLE(pci, i3200_pci_tbl);
+
+static struct pci_driver i3200_driver = {
+       .name = EDAC_MOD_STR,
+       .probe = i3200_init_one,
+       .remove = __devexit_p(i3200_remove_one),
+       .id_table = i3200_pci_tbl,
+};
+
+static int __init i3200_init(void)
+{
+       int pci_rc;
+
+       debugf3("MC: %s()\n", __func__);
+
+       /* Ensure that the OPSTATE is set correctly for POLL or NMI */
+       opstate_init();
+
+       pci_rc = pci_register_driver(&i3200_driver);
+       if (pci_rc < 0)
+               goto fail0;
+
+       if (!mci_pdev) {
+               i3200_registered = 0;
+               mci_pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
+                               PCI_DEVICE_ID_INTEL_3200_HB, NULL);
+               if (!mci_pdev) {
+                       debugf0("i3200 pci_get_device fail\n");
+                       pci_rc = -ENODEV;
+                       goto fail1;
+               }
+
+               pci_rc = i3200_init_one(mci_pdev, i3200_pci_tbl);
+               if (pci_rc < 0) {
+                       debugf0("i3200 init fail\n");
+                       pci_rc = -ENODEV;
+                       goto fail1;
+               }
+       }
+
+       return 0;
+
+fail1:
+       pci_unregister_driver(&i3200_driver);
+
+fail0:
+       if (mci_pdev)
+               pci_dev_put(mci_pdev);
+
+       return pci_rc;
+}
+
+static void __exit i3200_exit(void)
+{
+       debugf3("MC: %s()\n", __func__);
+
+       pci_unregister_driver(&i3200_driver);
+       if (!i3200_registered) {
+               i3200_remove_one(mci_pdev);
+               pci_dev_put(mci_pdev);
+       }
+}
+
+module_init(i3200_init);
+module_exit(i3200_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Akamai Technologies, Inc.");
+MODULE_DESCRIPTION("MC support for Intel 3200 memory hub controllers");
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
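
The DRB arithmetic in drb_to_nr_pages() can be sanity-checked in isolation: DRB values count 64MiB units (I3200_DRB_SHIFT is 26) and are cumulative per channel, so a rank's page count is the difference of consecutive DRBs shifted by (26 - PAGE_SHIFT). A small stand-alone user-space check, assuming 4KiB pages and made-up DRB values:

#include <stdio.h>

int main(void)
{
	const int drb_shift = 26;	/* 64MiB granularity */
	const int page_shift = 12;	/* 4KiB pages assumed */
	unsigned int drb[2] = { 0x010, 0x018 };	/* cumulative: 1GiB after rank 0, 1.5GiB after rank 1 */

	unsigned long rank0_pages = (unsigned long)drb[0] << (drb_shift - page_shift);
	unsigned long rank1_pages = (unsigned long)(drb[1] - drb[0]) << (drb_shift - page_shift);

	/* expect 262144 pages (1024 MiB) and 131072 pages (512 MiB) */
	printf("rank0: %lu pages (%lu MiB)\n", rank0_pages, rank0_pages >> 8);
	printf("rank1: %lu pages (%lu MiB)\n", rank1_pages, rank1_pages >> 8);
	return 0;
}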
index 3f2ccfc6407c298be7be4260e5b09512590e0dfe..157f6504f25ea14bee970b909f2f04856ffe5395 100644 (file)
@@ -41,7 +41,9 @@ static u32 orig_pci_err_en;
 #endif
 
 static u32 orig_l2_err_disable;
+#ifdef CONFIG_MPC85xx
 static u32 orig_hid1[2];
+#endif
 
 /************************ MC SYSFS parts ***********************************/
 
@@ -646,6 +648,7 @@ static struct of_device_id mpc85xx_l2_err_of_match[] = {
        { .compatible = "fsl,mpc8560-l2-cache-controller", },
        { .compatible = "fsl,mpc8568-l2-cache-controller", },
        { .compatible = "fsl,mpc8572-l2-cache-controller", },
+       { .compatible = "fsl,p2020-l2-cache-controller", },
        {},
 };
 
@@ -788,19 +791,20 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci)
                csrow = &mci->csrows[index];
                cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 +
                                  (index * MPC85XX_MC_CS_BNDS_OFS));
-               start = (cs_bnds & 0xfff0000) << 4;
-               end = ((cs_bnds & 0xfff) << 20);
-               if (start)
-                       start |= 0xfffff;
-               if (end)
-                       end |= 0xfffff;
+
+               start = (cs_bnds & 0xffff0000) >> 16;
+               end   = (cs_bnds & 0x0000ffff);
 
                if (start == end)
                        continue;       /* not populated */
 
+               start <<= (24 - PAGE_SHIFT);
+               end   <<= (24 - PAGE_SHIFT);
+               end    |= (1 << (24 - PAGE_SHIFT)) - 1;
+
                csrow->first_page = start >> PAGE_SHIFT;
                csrow->last_page = end >> PAGE_SHIFT;
-               csrow->nr_pages = csrow->last_page + 1 - csrow->first_page;
+               csrow->nr_pages = end + 1 - start;
                csrow->grain = 8;
                csrow->mtype = mtype;
                csrow->dtype = DEV_UNKNOWN;
@@ -984,6 +988,8 @@ static struct of_device_id mpc85xx_mc_err_of_match[] = {
        { .compatible = "fsl,mpc8560-memory-controller", },
        { .compatible = "fsl,mpc8568-memory-controller", },
        { .compatible = "fsl,mpc8572-memory-controller", },
+       { .compatible = "fsl,mpc8349-memory-controller", },
+       { .compatible = "fsl,p2020-memory-controller", },
        {},
 };
 
@@ -999,13 +1005,13 @@ static struct of_platform_driver mpc85xx_mc_err_driver = {
                   },
 };
 
-
+#ifdef CONFIG_MPC85xx
 static void __init mpc85xx_mc_clear_rfxe(void *data)
 {
        orig_hid1[smp_processor_id()] = mfspr(SPRN_HID1);
        mtspr(SPRN_HID1, (orig_hid1[smp_processor_id()] & ~0x20000));
 }
-
+#endif
 
 static int __init mpc85xx_mc_init(void)
 {
@@ -1038,26 +1044,32 @@ static int __init mpc85xx_mc_init(void)
                printk(KERN_WARNING EDAC_MOD_STR "PCI fails to register\n");
 #endif
 
+#ifdef CONFIG_MPC85xx
        /*
         * need to clear HID1[RFXE] to disable machine check int
         * so we can catch it
         */
        if (edac_op_state == EDAC_OPSTATE_INT)
                on_each_cpu(mpc85xx_mc_clear_rfxe, NULL, 0);
+#endif
 
        return 0;
 }
 
 module_init(mpc85xx_mc_init);
 
+#ifdef CONFIG_MPC85xx
 static void __exit mpc85xx_mc_restore_hid1(void *data)
 {
        mtspr(SPRN_HID1, orig_hid1[smp_processor_id()]);
 }
+#endif
 
 static void __exit mpc85xx_mc_exit(void)
 {
+#ifdef CONFIG_MPC85xx
        on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0);
+#endif
 #ifdef CONFIG_PCI
        of_unregister_platform_driver(&mpc85xx_pci_err_driver);
 #endif
index 5131aaae8e03787d0c3e237f3cbac85b77453880..a6b9fec13a74cd4a8afb8a10a89c55e240a7fa86 100644 (file)
@@ -90,7 +90,7 @@ static int __init mv64x60_pci_fixup(struct platform_device *pdev)
                return -ENOENT;
        }
 
-       pci_serr = ioremap(r->start, r->end - r->start + 1);
+       pci_serr = ioremap(r->start, resource_size(r));
        if (!pci_serr)
                return -ENOMEM;
 
@@ -140,7 +140,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdata->name)) {
                printk(KERN_ERR "%s: Error while requesting mem region\n",
                       __func__);
@@ -150,7 +150,7 @@ static int __devinit mv64x60_pci_err_probe(struct platform_device *pdev)
 
        pdata->pci_vbase = devm_ioremap(&pdev->dev,
                                        r->start,
-                                       r->end - r->start + 1);
+                                       resource_size(r));
        if (!pdata->pci_vbase) {
                printk(KERN_ERR "%s: Unable to setup PCI err regs\n", __func__);
                res = -ENOMEM;
@@ -306,7 +306,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdata->name)) {
                printk(KERN_ERR "%s: Error while request mem region\n",
                       __func__);
@@ -316,7 +316,7 @@ static int __devinit mv64x60_sram_err_probe(struct platform_device *pdev)
 
        pdata->sram_vbase = devm_ioremap(&pdev->dev,
                                         r->start,
-                                        r->end - r->start + 1);
+                                        resource_size(r));
        if (!pdata->sram_vbase) {
                printk(KERN_ERR "%s: Unable to setup SRAM err regs\n",
                       __func__);
@@ -474,7 +474,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdata->name)) {
                printk(KERN_ERR "%s: Error while requesting mem region\n",
                       __func__);
@@ -484,7 +484,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
        pdata->cpu_vbase[0] = devm_ioremap(&pdev->dev,
                                           r->start,
-                                          r->end - r->start + 1);
+                                          resource_size(r));
        if (!pdata->cpu_vbase[0]) {
                printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
                res = -ENOMEM;
@@ -501,7 +501,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdata->name)) {
                printk(KERN_ERR "%s: Error while requesting mem region\n",
                       __func__);
@@ -511,7 +511,7 @@ static int __devinit mv64x60_cpu_err_probe(struct platform_device *pdev)
 
        pdata->cpu_vbase[1] = devm_ioremap(&pdev->dev,
                                           r->start,
-                                          r->end - r->start + 1);
+                                          resource_size(r));
        if (!pdata->cpu_vbase[1]) {
                printk(KERN_ERR "%s: Unable to setup CPU err regs\n", __func__);
                res = -ENOMEM;
@@ -726,7 +726,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 
        if (!devm_request_mem_region(&pdev->dev,
                                     r->start,
-                                    r->end - r->start + 1,
+                                    resource_size(r),
                                     pdata->name)) {
                printk(KERN_ERR "%s: Error while requesting mem region\n",
                       __func__);
@@ -736,7 +736,7 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev)
 
        pdata->mc_vbase = devm_ioremap(&pdev->dev,
                                       r->start,
-                                      r->end - r->start + 1);
+                                      resource_size(r));
        if (!pdata->mc_vbase) {
                printk(KERN_ERR "%s: Unable to setup MC err regs\n", __func__);
                res = -ENOMEM;
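The hunks above replace the open-coded "end - start + 1" length calculation with the
resource_size() helper from <linux/ioport.h>, which computes the same value. For
reference, a minimal equivalent of that helper:

        static inline resource_size_t resource_size(const struct resource *res)
        {
                return res->end - res->start + 1;       /* inclusive range size */
        }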
index 949c97ff57e35bec6917c4aab4aca2ebaa165c32..1f20a042a4f508197ff45ebec146a9ff2daeeccd 100644 (file)
@@ -29,8 +29,8 @@
 
 #include <asm/idle.h>
 
-#include "../dma/ioatdma_hw.h"
-#include "../dma/ioatdma_registers.h"
+#include "../dma/ioat/hw.h"
+#include "../dma/ioat/registers.h"
 
 #define I7300_IDLE_DRIVER_VERSION      "1.55"
 #define I7300_PRINT                    "i7300_idle:"
@@ -126,9 +126,9 @@ static void i7300_idle_ioat_stop(void)
                udelay(10);
 
                sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-                       IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+                       IOAT_CHANSTS_STATUS;
 
-               if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE)
+               if (sts != IOAT_CHANSTS_ACTIVE)
                        break;
 
        }
@@ -160,9 +160,9 @@ static int __init i7300_idle_ioat_selftest(u8 *ctl,
        udelay(1000);
 
        chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-                       IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+                       IOAT_CHANSTS_STATUS;
 
-       if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) {
+       if (chan_sts != IOAT_CHANSTS_DONE) {
                /* Not complete, reset the channel */
                writeb(IOAT_CHANCMD_RESET,
                       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
@@ -288,9 +288,9 @@ static void __exit i7300_idle_ioat_exit(void)
                       ioat_chanbase + IOAT1_CHANCMD_OFFSET);
 
                chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-                       IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+                       IOAT_CHANSTS_STATUS;
 
-               if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+               if (chan_sts != IOAT_CHANSTS_ACTIVE) {
                        writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET);
                        break;
                }
@@ -298,14 +298,14 @@ static void __exit i7300_idle_ioat_exit(void)
        }
 
        chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) &
-                       IOAT_CHANSTS_DMA_TRANSFER_STATUS;
+                       IOAT_CHANSTS_STATUS;
 
        /*
         * We tried to reset multiple times. If IO A/T channel is still active
         * flag an error and return without cleanup. Memory leak is better
         * than random corruption in that extreme error situation.
         */
-       if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) {
+       if (chan_sts == IOAT_CHANSTS_ACTIVE) {
                printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels."
                        " Not freeing resources\n");
                return;
index 76d6751f89a7607784a5293b566d4d3b2f87f36e..02f4f8f1db6f3af8a440167518a02a2ebd61e31a 100644 (file)
@@ -225,6 +225,7 @@ config INPUT_SGI_BTNS
 config INPUT_WINBOND_CIR
        tristate "Winbond IR remote control"
        depends on X86 && PNP
+       select NEW_LEDS
        select LEDS_CLASS
        select BITREVERSE
        help
index 020f9573fd82011babb4ad666a2966a44d088aba..2158377a13593a45938278ac860d5de3db8a06fa 100644 (file)
@@ -124,6 +124,8 @@ config MD_RAID456
        select MD_RAID6_PQ
        select ASYNC_MEMCPY
        select ASYNC_XOR
+       select ASYNC_PQ
+       select ASYNC_RAID6_RECOV
        ---help---
          A RAID-5 set of N drives with a capacity of C MB per drive provides
          the capacity of C * (N - 1) MB, and protects against a failure
@@ -152,9 +154,33 @@ config MD_RAID456
 
          If unsure, say Y.
 
+config MULTICORE_RAID456
+       bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
+       depends on MD_RAID456
+       depends on SMP
+       depends on EXPERIMENTAL
+       ---help---
+         Enable the raid456 module to dispatch per-stripe raid operations to a
+         thread pool.
+
+         If unsure, say N.
+
 config MD_RAID6_PQ
        tristate
 
+config ASYNC_RAID6_TEST
+       tristate "Self test for hardware accelerated raid6 recovery"
+       depends on MD_RAID6_PQ
+       select ASYNC_RAID6_RECOV
+       ---help---
+         This is a one-shot self test that permutes through the
+         recovery of all the possible two-disk failure scenarios for an
+         N-disk array.  Recovery is performed with the asynchronous
+         raid6 recovery routines, and will optionally use an offload
+         engine if one is available.
+
+         If unsure, say N.
+
 config MD_MULTIPATH
        tristate "Multipath I/O support"
        depends on BLK_DEV_MD
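The new ASYNC_RAID6_TEST entry describes a self test that walks every two-disk
failure combination of an N-disk array. A minimal sketch of that permutation loop
(ndisks and the helper name are hypothetical, not the actual raid6test code):

        int i, j;

        for (i = 0; i < ndisks - 1; i++)
                for (j = i + 1; j < ndisks; j++)
                        test_disk_pair_failure(ndisks, i, j);  /* hypothetical helper */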
index 3319c2fec28e40f7908d7e0688595c113f3a597f..6986b0059d23279fd83e67da452b225bdf946708 100644 (file)
@@ -108,6 +108,8 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
  * allocated while we're using it
  */
 static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
        unsigned char *mappage;
 
@@ -325,7 +327,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
        return 0;
 
  bad_alignment:
-       rcu_read_unlock();
        return -EINVAL;
 }
 
@@ -1207,6 +1208,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
                                            sector_t offset, int *blocks,
                                            int create)
+__releases(bitmap->lock)
+__acquires(bitmap->lock)
 {
        /* If 'create', we might release the lock and reclaim it.
         * The lock must have been taken with interrupts enabled.
index ea48429054441e35b4d3230ed876e729b1169806..1ceceb334d5ebe8f5ce637d29898fc984bfcc797 100644 (file)
@@ -108,6 +108,9 @@ static int linear_congested(void *data, int bits)
        linear_conf_t *conf;
        int i, ret = 0;
 
+       if (mddev_congested(mddev, bits))
+               return 1;
+
        rcu_read_lock();
        conf = rcu_dereference(mddev->private);
 
index 6aa497e4baf85d30526ad12ab8a96dbc179d9538..26ba42a79129bdb953fa5b98628a744689a6594c 100644 (file)
@@ -262,6 +262,12 @@ static void mddev_resume(mddev_t *mddev)
        mddev->pers->quiesce(mddev, 0);
 }
 
+int mddev_congested(mddev_t *mddev, int bits)
+{
+       return mddev->suspended;
+}
+EXPORT_SYMBOL(mddev_congested);
+
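mddev_congested() gives every personality a common way to report congestion while
the array is suspended; the later hunks add the same guard at the top of each
*_congested() callback. The shape of that guard, as a sketch:

        static int example_congested(void *data, int bits)
        {
                mddev_t *mddev = data;

                if (mddev_congested(mddev, bits))       /* array is suspended */
                        return 1;
                /* ... per-personality rdev/queue checks follow ... */
                return 0;
        }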
 
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
@@ -4218,7 +4224,7 @@ static int do_md_run(mddev_t * mddev)
                        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                        mddev->sync_thread = md_register_thread(md_do_sync,
                                                                mddev,
-                                                               "%s_resync");
+                                                               "resync");
                        if (!mddev->sync_thread) {
                                printk(KERN_ERR "%s: could not start resync"
                                       " thread...\n",
@@ -4575,10 +4581,10 @@ static int get_version(void __user * arg)
 static int get_array_info(mddev_t * mddev, void __user * arg)
 {
        mdu_array_info_t info;
-       int nr,working,active,failed,spare;
+       int nr,working,insync,failed,spare;
        mdk_rdev_t *rdev;
 
-       nr=working=active=failed=spare=0;
+       nr=working=insync=failed=spare=0;
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                nr++;
                if (test_bit(Faulty, &rdev->flags))
@@ -4586,7 +4592,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
                else {
                        working++;
                        if (test_bit(In_sync, &rdev->flags))
-                               active++;       
+                               insync++;       
                        else
                                spare++;
                }
@@ -4611,7 +4617,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
                info.state = (1<<MD_SB_CLEAN);
        if (mddev->bitmap && mddev->bitmap_offset)
                info.state = (1<<MD_SB_BITMAP_PRESENT);
-       info.active_disks  = active;
+       info.active_disks  = insync;
        info.working_disks = working;
        info.failed_disks  = failed;
        info.spare_disks   = spare;
@@ -4721,7 +4727,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                if (!list_empty(&mddev->disks)) {
                        mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
                                                        mdk_rdev_t, same_set);
-                       int err = super_types[mddev->major_version]
+                       err = super_types[mddev->major_version]
                                .load_super(rdev, rdev0, mddev->minor_version);
                        if (err < 0) {
                                printk(KERN_WARNING 
@@ -5631,7 +5637,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
        thread->run = run;
        thread->mddev = mddev;
        thread->timeout = MAX_SCHEDULE_TIMEOUT;
-       thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
+       thread->tsk = kthread_run(md_thread, thread,
+                                 "%s_%s",
+                                 mdname(thread->mddev),
+                                 name ?: mddev->pers->name);
        if (IS_ERR(thread->tsk)) {
                kfree(thread);
                return NULL;
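With this change the kthread name is built from the array name plus either the
caller-supplied suffix or the personality name, so callers may pass NULL (as the
raid1/raid10/multipath hunks below do). Illustrative results, assuming an array
named "md0":

        md_register_thread(md_do_sync, mddev, "resync");    /* -> "md0_resync" */
        md_register_thread(raid1d, mddev, NULL);             /* -> "md0_raid1"  */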
@@ -6745,7 +6754,7 @@ void md_check_recovery(mddev_t *mddev)
                        }
                        mddev->sync_thread = md_register_thread(md_do_sync,
                                                                mddev,
-                                                               "%s_resync");
+                                                               "resync");
                        if (!mddev->sync_thread) {
                                printk(KERN_ERR "%s: could not start resync"
                                        " thread...\n", 
index f55d2ff9513329b973fe0c6c96ec9434eb53de9c..f184b69ef337514d460d669ffc4309a7d5dcc515 100644 (file)
@@ -430,6 +430,7 @@ extern void md_write_end(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
+extern int mddev_congested(mddev_t *mddev, int bits);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                           sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
index d2d3fd54cc681184cab146bf90ddc000f3b706de..ee7646f974a07165bd368deb3a18f6390e5553dc 100644 (file)
@@ -150,7 +150,6 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
        }
 
        mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
-       memset(mp_bh, 0, sizeof(*mp_bh));
 
        mp_bh->master_bio = bio;
        mp_bh->mddev = mddev;
@@ -199,6 +198,9 @@ static int multipath_congested(void *data, int bits)
        multipath_conf_t *conf = mddev->private;
        int i, ret = 0;
 
+       if (mddev_congested(mddev, bits))
+               return 1;
+
        rcu_read_lock();
        for (i = 0; i < mddev->raid_disks ; i++) {
                mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
@@ -504,7 +506,7 @@ static int multipath_run (mddev_t *mddev)
        }
 
        {
-               mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
+               mddev->thread = md_register_thread(multipathd, mddev, NULL);
                if (!mddev->thread) {
                        printk(KERN_ERR "multipath: couldn't allocate thread"
                                " for %s\n", mdname(mddev));
index f845ed98fec9c5126862f85a3e298f7083b80d83..d3a4ce06015a300e9d4df5e95ee63669613db17e 100644 (file)
@@ -44,6 +44,9 @@ static int raid0_congested(void *data, int bits)
        mdk_rdev_t **devlist = conf->devlist;
        int i, ret = 0;
 
+       if (mddev_congested(mddev, bits))
+               return 1;
+
        for (i = 0; i < mddev->raid_disks && !ret ; i++) {
                struct request_queue *q = bdev_get_queue(devlist[i]->bdev);
 
@@ -86,7 +89,7 @@ static void dump_zones(mddev_t *mddev)
 
 static int create_strip_zones(mddev_t *mddev)
 {
-       int i, c, j, err;
+       int i, c, err;
        sector_t curr_zone_end, sectors;
        mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
        struct strip_zone *zone;
@@ -198,6 +201,8 @@ static int create_strip_zones(mddev_t *mddev)
        /* now do the other zones */
        for (i = 1; i < conf->nr_strip_zones; i++)
        {
+               int j;
+
                zone = conf->strip_zone + i;
                dev = conf->devlist + i * mddev->raid_disks;
 
@@ -207,7 +212,6 @@ static int create_strip_zones(mddev_t *mddev)
                c = 0;
 
                for (j=0; j<cnt; j++) {
-                       char b[BDEVNAME_SIZE];
                        rdev = conf->devlist[j];
                        printk(KERN_INFO "raid0: checking %s ...",
                                bdevname(rdev->bdev, b));
index ff7ed33359959e39747509e4a16a24a0c584b292..d1b9bd5fd4f6cad04f4037ece137f43904f50de6 100644 (file)
@@ -576,6 +576,9 @@ static int raid1_congested(void *data, int bits)
        conf_t *conf = mddev->private;
        int i, ret = 0;
 
+       if (mddev_congested(mddev, bits))
+               return 1;
+
        rcu_read_lock();
        for (i = 0; i < mddev->raid_disks; i++) {
                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -851,7 +854,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
                read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
                read_bio->bi_bdev = mirror->rdev->bdev;
                read_bio->bi_end_io = raid1_end_read_request;
-               read_bio->bi_rw = READ | do_sync;
+               read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
                read_bio->bi_private = r1_bio;
 
                generic_make_request(read_bio);
@@ -943,7 +946,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
                mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
-               mbio->bi_rw = WRITE | do_barriers | do_sync;
+               mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
+                       (do_sync << BIO_RW_SYNCIO);
                mbio->bi_private = r1_bio;
 
                if (behind_pages) {
@@ -1623,7 +1627,8 @@ static void raid1d(mddev_t *mddev)
                                                conf->mirrors[i].rdev->data_offset;
                                        bio->bi_bdev = conf->mirrors[i].rdev->bdev;
                                        bio->bi_end_io = raid1_end_write_request;
-                                       bio->bi_rw = WRITE | do_sync;
+                                       bio->bi_rw = WRITE |
+                                               (do_sync << BIO_RW_SYNCIO);
                                        bio->bi_private = r1_bio;
                                        r1_bio->bios[i] = bio;
                                        generic_make_request(bio);
@@ -1672,7 +1677,7 @@ static void raid1d(mddev_t *mddev)
                                bio->bi_sector = r1_bio->sector + rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                bio->bi_end_io = raid1_end_read_request;
-                               bio->bi_rw = READ | do_sync;
+                               bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
                                bio->bi_private = r1_bio;
                                unplug = 1;
                                generic_make_request(bio);
@@ -2047,7 +2052,7 @@ static int run(mddev_t *mddev)
        conf->last_used = j;
 
 
-       mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
+       mddev->thread = md_register_thread(raid1d, mddev, NULL);
        if (!mddev->thread) {
                printk(KERN_ERR
                       "raid1: couldn't allocate thread for %s\n",
index d0a2152e064f75430afa8117db184392c1a4ef3b..51c4c5c4d87add417a297714f7873df95979001d 100644 (file)
@@ -631,6 +631,8 @@ static int raid10_congested(void *data, int bits)
        conf_t *conf = mddev->private;
        int i, ret = 0;
 
+       if (mddev_congested(mddev, bits))
+               return 1;
        rcu_read_lock();
        for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -882,7 +884,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
                        mirror->rdev->data_offset;
                read_bio->bi_bdev = mirror->rdev->bdev;
                read_bio->bi_end_io = raid10_end_read_request;
-               read_bio->bi_rw = READ | do_sync;
+               read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
                read_bio->bi_private = r10_bio;
 
                generic_make_request(read_bio);
@@ -950,7 +952,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
                        conf->mirrors[d].rdev->data_offset;
                mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                mbio->bi_end_io = raid10_end_write_request;
-               mbio->bi_rw = WRITE | do_sync;
+               mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO);
                mbio->bi_private = r10_bio;
 
                atomic_inc(&r10_bio->remaining);
@@ -1623,7 +1625,7 @@ static void raid10d(mddev_t *mddev)
                                bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
                                        + rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
-                               bio->bi_rw = READ | do_sync;
+                               bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
                                bio->bi_private = r10_bio;
                                bio->bi_end_io = raid10_end_read_request;
                                unplug = 1;
@@ -1773,7 +1775,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
        if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                /* recovery... the complicated one */
-               int i, j, k;
+               int j, k;
                r10_bio = NULL;
 
                for (i=0 ; i<conf->raid_disks; i++)
@@ -2188,7 +2190,7 @@ static int run(mddev_t *mddev)
        }
 
 
-       mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
+       mddev->thread = md_register_thread(raid10d, mddev, NULL);
        if (!mddev->thread) {
                printk(KERN_ERR
                       "raid10: couldn't allocate thread for %s\n",
index 826eb3467357f72af5952599525bc93355f5279c..94829804ab7fd2f68c839e0f481444bfaeb3be7a 100644 (file)
@@ -47,7 +47,9 @@
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/async.h>
 #include <linux/seq_file.h>
+#include <linux/cpu.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -499,11 +501,18 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
        struct page *bio_page;
        int i;
        int page_offset;
+       struct async_submit_ctl submit;
+       enum async_tx_flags flags = 0;
 
        if (bio->bi_sector >= sector)
                page_offset = (signed)(bio->bi_sector - sector) * 512;
        else
                page_offset = (signed)(sector - bio->bi_sector) * -512;
+
+       if (frombio)
+               flags |= ASYNC_TX_FENCE;
+       init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
+
        bio_for_each_segment(bvl, bio, i) {
                int len = bio_iovec_idx(bio, i)->bv_len;
                int clen;
@@ -525,15 +534,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
                        bio_page = bio_iovec_idx(bio, i)->bv_page;
                        if (frombio)
                                tx = async_memcpy(page, bio_page, page_offset,
-                                       b_offset, clen,
-                                       ASYNC_TX_DEP_ACK,
-                                       tx, NULL, NULL);
+                                                 b_offset, clen, &submit);
                        else
                                tx = async_memcpy(bio_page, page, b_offset,
-                                       page_offset, clen,
-                                       ASYNC_TX_DEP_ACK,
-                                       tx, NULL, NULL);
+                                                 page_offset, clen, &submit);
                }
+               /* chain the operations */
+               submit.depend_tx = tx;
+
                if (clen < len) /* hit end of page */
                        break;
                page_offset +=  len;
@@ -592,6 +600,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 {
        struct dma_async_tx_descriptor *tx = NULL;
        raid5_conf_t *conf = sh->raid_conf;
+       struct async_submit_ctl submit;
        int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
@@ -615,22 +624,34 @@ static void ops_run_biofill(struct stripe_head *sh)
        }
 
        atomic_inc(&sh->count);
-       async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
-               ops_complete_biofill, sh);
+       init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
+       async_trigger_callback(&submit);
 }
 
-static void ops_complete_compute5(void *stripe_head_ref)
+static void mark_target_uptodate(struct stripe_head *sh, int target)
 {
-       struct stripe_head *sh = stripe_head_ref;
-       int target = sh->ops.target;
-       struct r5dev *tgt = &sh->dev[target];
+       struct r5dev *tgt;
 
-       pr_debug("%s: stripe %llu\n", __func__,
-               (unsigned long long)sh->sector);
+       if (target < 0)
+               return;
 
+       tgt = &sh->dev[target];
        set_bit(R5_UPTODATE, &tgt->flags);
        BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
        clear_bit(R5_Wantcompute, &tgt->flags);
+}
+
+static void ops_complete_compute(void *stripe_head_ref)
+{
+       struct stripe_head *sh = stripe_head_ref;
+
+       pr_debug("%s: stripe %llu\n", __func__,
+               (unsigned long long)sh->sector);
+
+       /* mark the computed target(s) as uptodate */
+       mark_target_uptodate(sh, sh->ops.target);
+       mark_target_uptodate(sh, sh->ops.target2);
+
        clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
        if (sh->check_state == check_state_compute_run)
                sh->check_state = check_state_compute_result;
@@ -638,16 +659,24 @@ static void ops_complete_compute5(void *stripe_head_ref)
        release_stripe(sh);
 }
 
-static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
+/* return a pointer to the address conversion region of the scribble buffer */
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+                                struct raid5_percpu *percpu)
+{
+       return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        int target = sh->ops.target;
        struct r5dev *tgt = &sh->dev[target];
        struct page *xor_dest = tgt->page;
        int count = 0;
        struct dma_async_tx_descriptor *tx;
+       struct async_submit_ctl submit;
        int i;
 
        pr_debug("%s: stripe %llu block: %d\n",
@@ -660,17 +689,215 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
 
        atomic_inc(&sh->count);
 
+       init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
+                         ops_complete_compute, sh, to_addr_conv(sh, percpu));
        if (unlikely(count == 1))
-               tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-                       0, NULL, ops_complete_compute5, sh);
+               tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
        else
-               tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-                       ASYNC_TX_XOR_ZERO_DST, NULL,
-                       ops_complete_compute5, sh);
+               tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
        return tx;
 }
 
+/* set_syndrome_sources - populate source buffers for gen_syndrome
+ * @srcs - (struct page *) array of size sh->disks
+ * @sh - stripe_head to parse
+ *
+ * Populates srcs in proper layout order for the stripe and returns the
+ * 'count' of sources to be used in a call to async_gen_syndrome.  The P
+ * destination buffer is recorded in srcs[count] and the Q destination
+ * is recorded in srcs[count+1].
+ */
+static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+{
+       int disks = sh->disks;
+       int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
+       int d0_idx = raid6_d0(sh);
+       int count;
+       int i;
+
+       for (i = 0; i < disks; i++)
+               srcs[i] = (void *)raid6_empty_zero_page;
+
+       count = 0;
+       i = d0_idx;
+       do {
+               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+               srcs[slot] = sh->dev[i].page;
+               i = raid6_next_disk(i, disks);
+       } while (i != d0_idx);
+       BUG_ON(count != syndrome_disks);
+
+       return count;
+}
+
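A usage sketch for set_syndrome_sources(): the stripe's pages are placed in
syndrome order, with the P page at srcs[count] and the Q page at srcs[count+1],
and the whole array is then handed to async_gen_syndrome(), as
ops_run_reconstruct6() further down does:

        count = set_syndrome_sources(blocks, sh);
        init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
                          sh, to_addr_conv(sh, percpu));
        async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);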
+static struct dma_async_tx_descriptor *
+ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+       int disks = sh->disks;
+       struct page **blocks = percpu->scribble;
+       int target;
+       int qd_idx = sh->qd_idx;
+       struct dma_async_tx_descriptor *tx;
+       struct async_submit_ctl submit;
+       struct r5dev *tgt;
+       struct page *dest;
+       int i;
+       int count;
+
+       if (sh->ops.target < 0)
+               target = sh->ops.target2;
+       else if (sh->ops.target2 < 0)
+               target = sh->ops.target;
+       else
+               /* we should only have one valid target */
+               BUG();
+       BUG_ON(target < 0);
+       pr_debug("%s: stripe %llu block: %d\n",
+               __func__, (unsigned long long)sh->sector, target);
+
+       tgt = &sh->dev[target];
+       BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+       dest = tgt->page;
+
+       atomic_inc(&sh->count);
+
+       if (target == qd_idx) {
+               count = set_syndrome_sources(blocks, sh);
+               blocks[count] = NULL; /* regenerating p is not necessary */
+               BUG_ON(blocks[count+1] != dest); /* q should already be set */
+               init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+                                 ops_complete_compute, sh,
+                                 to_addr_conv(sh, percpu));
+               tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+       } else {
+               /* Compute any data- or p-drive using XOR */
+               count = 0;
+               for (i = disks; i-- ; ) {
+                       if (i == target || i == qd_idx)
+                               continue;
+                       blocks[count++] = sh->dev[i].page;
+               }
+
+               init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+                                 NULL, ops_complete_compute, sh,
+                                 to_addr_conv(sh, percpu));
+               tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
+       }
+
+       return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+       int i, count, disks = sh->disks;
+       int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+       int d0_idx = raid6_d0(sh);
+       int faila = -1, failb = -1;
+       int target = sh->ops.target;
+       int target2 = sh->ops.target2;
+       struct r5dev *tgt = &sh->dev[target];
+       struct r5dev *tgt2 = &sh->dev[target2];
+       struct dma_async_tx_descriptor *tx;
+       struct page **blocks = percpu->scribble;
+       struct async_submit_ctl submit;
+
+       pr_debug("%s: stripe %llu block1: %d block2: %d\n",
+                __func__, (unsigned long long)sh->sector, target, target2);
+       BUG_ON(target < 0 || target2 < 0);
+       BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+       BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
+
+       /* we need to open-code set_syndrome_sources to handle the
+        * slot number conversion for 'faila' and 'failb'
+        */
+       for (i = 0; i < disks ; i++)
+               blocks[i] = (void *)raid6_empty_zero_page;
+       count = 0;
+       i = d0_idx;
+       do {
+               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+               blocks[slot] = sh->dev[i].page;
+
+               if (i == target)
+                       faila = slot;
+               if (i == target2)
+                       failb = slot;
+               i = raid6_next_disk(i, disks);
+       } while (i != d0_idx);
+       BUG_ON(count != syndrome_disks);
+
+       BUG_ON(faila == failb);
+       if (failb < faila)
+               swap(faila, failb);
+       pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
+                __func__, (unsigned long long)sh->sector, faila, failb);
+
+       atomic_inc(&sh->count);
+
+       if (failb == syndrome_disks+1) {
+               /* Q disk is one of the missing disks */
+               if (faila == syndrome_disks) {
+                       /* Missing P+Q, just recompute */
+                       init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+                                         ops_complete_compute, sh,
+                                         to_addr_conv(sh, percpu));
+                       return async_gen_syndrome(blocks, 0, count+2,
+                                                 STRIPE_SIZE, &submit);
+               } else {
+                       struct page *dest;
+                       int data_target;
+                       int qd_idx = sh->qd_idx;
+
+                       /* Missing D+Q: recompute D from P, then recompute Q */
+                       if (target == qd_idx)
+                               data_target = target2;
+                       else
+                               data_target = target;
+
+                       count = 0;
+                       for (i = disks; i-- ; ) {
+                               if (i == data_target || i == qd_idx)
+                                       continue;
+                               blocks[count++] = sh->dev[i].page;
+                       }
+                       dest = sh->dev[data_target].page;
+                       init_async_submit(&submit,
+                                         ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
+                                         NULL, NULL, NULL,
+                                         to_addr_conv(sh, percpu));
+                       tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
+                                      &submit);
+
+                       count = set_syndrome_sources(blocks, sh);
+                       init_async_submit(&submit, ASYNC_TX_FENCE, tx,
+                                         ops_complete_compute, sh,
+                                         to_addr_conv(sh, percpu));
+                       return async_gen_syndrome(blocks, 0, count+2,
+                                                 STRIPE_SIZE, &submit);
+               }
+       } else {
+               init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
+                                 ops_complete_compute, sh,
+                                 to_addr_conv(sh, percpu));
+               if (failb == syndrome_disks) {
+                       /* We're missing D+P. */
+                       return async_raid6_datap_recov(syndrome_disks+2,
+                                                      STRIPE_SIZE, faila,
+                                                      blocks, &submit);
+               } else {
+                       /* We're missing D+D. */
+                       return async_raid6_2data_recov(syndrome_disks+2,
+                                                      STRIPE_SIZE, faila, failb,
+                                                      blocks, &submit);
+               }
+       }
+}
+
+
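A summary of the recovery strategies implemented by ops_run_compute6_2() above:

        P+Q missing:  regenerate both with async_gen_syndrome()
        D+Q missing:  xor the surviving data blocks with P to rebuild D,
                      then regenerate Q with async_gen_syndrome()
        D+P missing:  async_raid6_datap_recov()
        D+D missing:  async_raid6_2data_recov()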
 static void ops_complete_prexor(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
@@ -680,12 +907,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
+              struct dma_async_tx_descriptor *tx)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       struct page **xor_srcs = percpu->scribble;
        int count = 0, pd_idx = sh->pd_idx, i;
+       struct async_submit_ctl submit;
 
        /* existing parity data subtracted */
        struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
@@ -700,9 +928,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                        xor_srcs[count++] = dev->page;
        }
 
-       tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-               ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
-               ops_complete_prexor, sh);
+       init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+       tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
        return tx;
 }
@@ -742,17 +970,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
        return tx;
 }
 
-static void ops_complete_postxor(void *stripe_head_ref)
+static void ops_complete_reconstruct(void *stripe_head_ref)
 {
        struct stripe_head *sh = stripe_head_ref;
-       int disks = sh->disks, i, pd_idx = sh->pd_idx;
+       int disks = sh->disks;
+       int pd_idx = sh->pd_idx;
+       int qd_idx = sh->qd_idx;
+       int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
-               if (dev->written || i == pd_idx)
+
+               if (dev->written || i == pd_idx || i == qd_idx)
                        set_bit(R5_UPTODATE, &dev->flags);
        }
 
@@ -770,12 +1002,12 @@ static void ops_complete_postxor(void *stripe_head_ref)
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
+                    struct dma_async_tx_descriptor *tx)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
-
+       struct page **xor_srcs = percpu->scribble;
+       struct async_submit_ctl submit;
        int count = 0, pd_idx = sh->pd_idx, i;
        struct page *xor_dest;
        int prexor = 0;
@@ -809,18 +1041,36 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
         * for the synchronous xor case
         */
-       flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
+       flags = ASYNC_TX_ACK |
                (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
 
        atomic_inc(&sh->count);
 
-       if (unlikely(count == 1)) {
-               flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
-               tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-                       flags, tx, ops_complete_postxor, sh);
-       } else
-               tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-                       flags, tx, ops_complete_postxor, sh);
+       init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
+                         to_addr_conv(sh, percpu));
+       if (unlikely(count == 1))
+               tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
+       else
+               tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+}
+
+static void
+ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
+                    struct dma_async_tx_descriptor *tx)
+{
+       struct async_submit_ctl submit;
+       struct page **blocks = percpu->scribble;
+       int count;
+
+       pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+       count = set_syndrome_sources(blocks, sh);
+
+       atomic_inc(&sh->count);
+
+       init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+                         sh, to_addr_conv(sh, percpu));
+       async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -835,63 +1085,115 @@ static void ops_complete_check(void *stripe_head_ref)
        release_stripe(sh);
 }
 
-static void ops_run_check(struct stripe_head *sh)
+static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
-       /* kernel stack size limits the total number of disks */
        int disks = sh->disks;
-       struct page *xor_srcs[disks];
+       int pd_idx = sh->pd_idx;
+       int qd_idx = sh->qd_idx;
+       struct page *xor_dest;
+       struct page **xor_srcs = percpu->scribble;
        struct dma_async_tx_descriptor *tx;
-
-       int count = 0, pd_idx = sh->pd_idx, i;
-       struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+       struct async_submit_ctl submit;
+       int count;
+       int i;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       count = 0;
+       xor_dest = sh->dev[pd_idx].page;
+       xor_srcs[count++] = xor_dest;
        for (i = disks; i--; ) {
-               struct r5dev *dev = &sh->dev[i];
-               if (i != pd_idx)
-                       xor_srcs[count++] = dev->page;
+               if (i == pd_idx || i == qd_idx)
+                       continue;
+               xor_srcs[count++] = sh->dev[i].page;
        }
 
-       tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-               &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
+       init_async_submit(&submit, 0, NULL, NULL, NULL,
+                         to_addr_conv(sh, percpu));
+       tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+                          &sh->ops.zero_sum_result, &submit);
+
+       atomic_inc(&sh->count);
+       init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
+       tx = async_trigger_callback(&submit);
+}
+
+static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
+{
+       struct page **srcs = percpu->scribble;
+       struct async_submit_ctl submit;
+       int count;
+
+       pr_debug("%s: stripe %llu checkp: %d\n", __func__,
+               (unsigned long long)sh->sector, checkp);
+
+       count = set_syndrome_sources(srcs, sh);
+       if (!checkp)
+               srcs[count] = NULL;
 
        atomic_inc(&sh->count);
-       tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
-               ops_complete_check, sh);
+       init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
+                         sh, to_addr_conv(sh, percpu));
+       async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
+                          &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
        int overlap_clear = 0, i, disks = sh->disks;
        struct dma_async_tx_descriptor *tx = NULL;
+       raid5_conf_t *conf = sh->raid_conf;
+       int level = conf->level;
+       struct raid5_percpu *percpu;
+       unsigned long cpu;
 
+       cpu = get_cpu();
+       percpu = per_cpu_ptr(conf->percpu, cpu);
        if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
                ops_run_biofill(sh);
                overlap_clear++;
        }
 
        if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
-               tx = ops_run_compute5(sh);
-               /* terminate the chain if postxor is not set to be run */
-               if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+               if (level < 6)
+                       tx = ops_run_compute5(sh, percpu);
+               else {
+                       if (sh->ops.target2 < 0 || sh->ops.target < 0)
+                               tx = ops_run_compute6_1(sh, percpu);
+                       else
+                               tx = ops_run_compute6_2(sh, percpu);
+               }
+               /* terminate the chain if reconstruct is not set to be run */
+               if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
                        async_tx_ack(tx);
        }
 
        if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-               tx = ops_run_prexor(sh, tx);
+               tx = ops_run_prexor(sh, percpu, tx);
 
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
                overlap_clear++;
        }
 
-       if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-               ops_run_postxor(sh, tx);
+       if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
+               if (level < 6)
+                       ops_run_reconstruct5(sh, percpu, tx);
+               else
+                       ops_run_reconstruct6(sh, percpu, tx);
+       }
 
-       if (test_bit(STRIPE_OP_CHECK, &ops_request))
-               ops_run_check(sh);
+       if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
+               if (sh->check_state == check_state_run)
+                       ops_run_check_p(sh, percpu);
+               else if (sh->check_state == check_state_run_q)
+                       ops_run_check_pq(sh, percpu, 0);
+               else if (sh->check_state == check_state_run_pq)
+                       ops_run_check_pq(sh, percpu, 1);
+               else
+                       BUG();
+       }
 
        if (overlap_clear)
                for (i = disks; i--; ) {
@@ -899,6 +1201,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        if (test_and_clear_bit(R5_Overlap, &dev->flags))
                                wake_up(&sh->raid_conf->wait_for_overlap);
                }
+       put_cpu();
 }
 
 static int grow_one_stripe(raid5_conf_t *conf)
@@ -948,6 +1251,28 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        return 0;
 }
 
+/**
+ * scribble_len - return the required size of the scribble region
+ * @num - total number of disks in the array
+ *
+ * The size must be enough to contain:
+ * 1/ a struct page pointer for each device in the array +2
+ * 2/ room to convert each entry in (1) to its corresponding dma
+ *    (dma_map_page()) or page (page_address()) address.
+ *
+ * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
+ * calculate over all devices (not just the data blocks), using zeros in place
+ * of the P and Q blocks.
+ */
+static size_t scribble_len(int num)
+{
+       size_t len;
+
+       len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
+
+       return len;
+}
+
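Concretely, the scribble region described above is a (num + 2)-entry page-pointer
array followed by a (num + 2)-entry addr_conv_t array; to_addr_conv() earlier in
this patch returns a pointer to the second part. A sketch of the equivalent layout
(the driver actually kmallocs scribble_len() bytes as one flat buffer; the array
size below is a hypothetical example):

        #define SCRIBBLE_DISKS 16       /* example array size, not from the patch */
        struct scribble_layout {
                struct page *srcs[SCRIBBLE_DISKS + 2];      /* pages, incl. P and Q slots */
                addr_conv_t addr_conv[SCRIBBLE_DISKS + 2];  /* region returned by to_addr_conv() */
        };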
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
        /* Make all the stripes able to hold 'newsize' devices.
@@ -976,6 +1301,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        struct stripe_head *osh, *nsh;
        LIST_HEAD(newstripes);
        struct disk_info *ndisks;
+       unsigned long cpu;
        int err;
        struct kmem_cache *sc;
        int i;
@@ -1041,7 +1367,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        /* Step 3.
         * At this point, we are holding all the stripes so the array
         * is completely stalled, so now is a good time to resize
-        * conf->disks.
+        * conf->disks and the scribble region
         */
        ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
        if (ndisks) {
@@ -1052,10 +1378,30 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        } else
                err = -ENOMEM;
 
+       get_online_cpus();
+       conf->scribble_len = scribble_len(newsize);
+       for_each_present_cpu(cpu) {
+               struct raid5_percpu *percpu;
+               void *scribble;
+
+               percpu = per_cpu_ptr(conf->percpu, cpu);
+               scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+
+               if (scribble) {
+                       kfree(percpu->scribble);
+                       percpu->scribble = scribble;
+               } else {
+                       err = -ENOMEM;
+                       break;
+               }
+       }
+       put_online_cpus();
+
        /* Step 4, return new stripes to service */
        while(!list_empty(&newstripes)) {
                nsh = list_entry(newstripes.next, struct stripe_head, lru);
                list_del_init(&nsh->lru);
+
                for (i=conf->raid_disks; i < newsize; i++)
                        if (nsh->dev[i].page == NULL) {
                                struct page *p = alloc_page(GFP_NOIO);
@@ -1594,258 +1940,13 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 }
 
 
-
-/*
- * Copy data between a page in the stripe cache, and one or more bion
- * The page could align with the middle of the bio, or there could be
- * several bion, each with several bio_vecs, which cover part of the page
- * Multiple bion are linked together on bi_next.  There may be extras
- * at the end of this list.  We ignore them.
- */
-static void copy_data(int frombio, struct bio *bio,
-                    struct page *page,
-                    sector_t sector)
-{
-       char *pa = page_address(page);
-       struct bio_vec *bvl;
-       int i;
-       int page_offset;
-
-       if (bio->bi_sector >= sector)
-               page_offset = (signed)(bio->bi_sector - sector) * 512;
-       else
-               page_offset = (signed)(sector - bio->bi_sector) * -512;
-       bio_for_each_segment(bvl, bio, i) {
-               int len = bio_iovec_idx(bio,i)->bv_len;
-               int clen;
-               int b_offset = 0;
-
-               if (page_offset < 0) {
-                       b_offset = -page_offset;
-                       page_offset += b_offset;
-                       len -= b_offset;
-               }
-
-               if (len > 0 && page_offset + len > STRIPE_SIZE)
-                       clen = STRIPE_SIZE - page_offset;
-               else clen = len;
-
-               if (clen > 0) {
-                       char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
-                       if (frombio)
-                               memcpy(pa+page_offset, ba+b_offset, clen);
-                       else
-                               memcpy(ba+b_offset, pa+page_offset, clen);
-                       __bio_kunmap_atomic(ba, KM_USER0);
-               }
-               if (clen < len) /* hit end of page */
-                       break;
-               page_offset +=  len;
-       }
-}
-
-#define check_xor()    do {                                              \
-                               if (count == MAX_XOR_BLOCKS) {            \
-                               xor_blocks(count, STRIPE_SIZE, dest, ptr);\
-                               count = 0;                                \
-                          }                                              \
-                       } while(0)
-
-static void compute_parity6(struct stripe_head *sh, int method)
-{
-       raid5_conf_t *conf = sh->raid_conf;
-       int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
-       int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
-       struct bio *chosen;
-       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
-       void *ptrs[syndrome_disks+2];
-
-       pd_idx = sh->pd_idx;
-       qd_idx = sh->qd_idx;
-       d0_idx = raid6_d0(sh);
-
-       pr_debug("compute_parity, stripe %llu, method %d\n",
-               (unsigned long long)sh->sector, method);
-
-       switch(method) {
-       case READ_MODIFY_WRITE:
-               BUG();          /* READ_MODIFY_WRITE N/A for RAID-6 */
-       case RECONSTRUCT_WRITE:
-               for (i= disks; i-- ;)
-                       if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
-                               chosen = sh->dev[i].towrite;
-                               sh->dev[i].towrite = NULL;
-
-                               if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-                                       wake_up(&conf->wait_for_overlap);
-
-                               BUG_ON(sh->dev[i].written);
-                               sh->dev[i].written = chosen;
-                       }
-               break;
-       case CHECK_PARITY:
-               BUG();          /* Not implemented yet */
-       }
-
-       for (i = disks; i--;)
-               if (sh->dev[i].written) {
-                       sector_t sector = sh->dev[i].sector;
-                       struct bio *wbi = sh->dev[i].written;
-                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-                               copy_data(1, wbi, sh->dev[i].page, sector);
-                               wbi = r5_next_bio(wbi, sector);
-                       }
-
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
-               }
-
-       /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
-
-       for (i = 0; i < disks; i++)
-               ptrs[i] = (void *)raid6_empty_zero_page;
-
-       count = 0;
-       i = d0_idx;
-       do {
-               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
-               ptrs[slot] = page_address(sh->dev[i].page);
-               if (slot < syndrome_disks &&
-                   !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-                       printk(KERN_ERR "block %d/%d not uptodate "
-                              "on parity calc\n", i, count);
-                       BUG();
-               }
-
-               i = raid6_next_disk(i, disks);
-       } while (i != d0_idx);
-       BUG_ON(count != syndrome_disks);
-
-       raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
-
-       switch(method) {
-       case RECONSTRUCT_WRITE:
-               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-               set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-               set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
-               set_bit(R5_LOCKED,   &sh->dev[qd_idx].flags);
-               break;
-       case UPDATE_PARITY:
-               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-               set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
-               break;
-       }
-}
-
-
-/* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
-{
-       int i, count, disks = sh->disks;
-       void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-       int qd_idx = sh->qd_idx;
-
-       pr_debug("compute_block_1, stripe %llu, idx %d\n",
-               (unsigned long long)sh->sector, dd_idx);
-
-       if ( dd_idx == qd_idx ) {
-               /* We're actually computing the Q drive */
-               compute_parity6(sh, UPDATE_PARITY);
-       } else {
-               dest = page_address(sh->dev[dd_idx].page);
-               if (!nozero) memset(dest, 0, STRIPE_SIZE);
-               count = 0;
-               for (i = disks ; i--; ) {
-                       if (i == dd_idx || i == qd_idx)
-                               continue;
-                       p = page_address(sh->dev[i].page);
-                       if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-                               ptr[count++] = p;
-                       else
-                               printk("compute_block() %d, stripe %llu, %d"
-                                      " not present\n", dd_idx,
-                                      (unsigned long long)sh->sector, i);
-
-                       check_xor();
-               }
-               if (count)
-                       xor_blocks(count, STRIPE_SIZE, dest, ptr);
-               if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-               else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-       }
-}
-
-/* Compute two missing blocks */
-static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
-{
-       int i, count, disks = sh->disks;
-       int syndrome_disks = sh->ddf_layout ? disks : disks-2;
-       int d0_idx = raid6_d0(sh);
-       int faila = -1, failb = -1;
-       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
-       void *ptrs[syndrome_disks+2];
-
-       for (i = 0; i < disks ; i++)
-               ptrs[i] = (void *)raid6_empty_zero_page;
-       count = 0;
-       i = d0_idx;
-       do {
-               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
-
-               ptrs[slot] = page_address(sh->dev[i].page);
-
-               if (i == dd_idx1)
-                       faila = slot;
-               if (i == dd_idx2)
-                       failb = slot;
-               i = raid6_next_disk(i, disks);
-       } while (i != d0_idx);
-       BUG_ON(count != syndrome_disks);
-
-       BUG_ON(faila == failb);
-       if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
-
-       pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-                (unsigned long long)sh->sector, dd_idx1, dd_idx2,
-                faila, failb);
-
-       if (failb == syndrome_disks+1) {
-               /* Q disk is one of the missing disks */
-               if (faila == syndrome_disks) {
-                       /* Missing P+Q, just recompute */
-                       compute_parity6(sh, UPDATE_PARITY);
-                       return;
-               } else {
-                       /* We're missing D+Q; recompute D from P */
-                       compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
-                                            dd_idx2 : dd_idx1),
-                                       0);
-                       compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
-                       return;
-               }
-       }
-
-       /* We're missing D+P or D+D; */
-       if (failb == syndrome_disks) {
-               /* We're missing D+P. */
-               raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
-       } else {
-               /* We're missing D+D. */
-               raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
-                                 ptrs);
-       }
-
-       /* Both the above update both missing blocks */
-       set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-       set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
-}
-
 static void
-schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
+schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         int rcw, int expand)
 {
        int i, pd_idx = sh->pd_idx, disks = sh->disks;
+       raid5_conf_t *conf = sh->raid_conf;
+       int level = conf->level;
 
        if (rcw) {
                /* if we are not expanding this is a proper write request, and
@@ -1858,7 +1959,7 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
                } else
                        sh->reconstruct_state = reconstruct_state_run;
 
-               set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
@@ -1871,17 +1972,18 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
                                s->locked++;
                        }
                }
-               if (s->locked + 1 == disks)
+               if (s->locked + conf->max_degraded == disks)
                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
-                               atomic_inc(&sh->raid_conf->pending_full_writes);
+                               atomic_inc(&conf->pending_full_writes);
        } else {
+               BUG_ON(level == 6);
                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
                sh->reconstruct_state = reconstruct_state_prexor_drain_run;
                set_bit(STRIPE_OP_PREXOR, &s->ops_request);
                set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
-               set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
+               set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
@@ -1899,13 +2001,22 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
                }
        }
 
-       /* keep the parity disk locked while asynchronous operations
+       /* keep the parity disk(s) locked while asynchronous operations
         * are in flight
         */
        set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
        clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
        s->locked++;
 
+       if (level == 6) {
+               int qd_idx = sh->qd_idx;
+               struct r5dev *dev = &sh->dev[qd_idx];
+
+               set_bit(R5_LOCKED, &dev->flags);
+               clear_bit(R5_UPTODATE, &dev->flags);
+               s->locked++;
+       }
+
        pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
                __func__, (unsigned long long)sh->sector,
                s->locked, s->ops_request);
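
The full-stripe-write test above changed from s->locked + 1 == disks to s->locked + conf->max_degraded == disks because RAID-6 carries two parity devices per stripe instead of one. A minimal standalone sketch of that arithmetic, with the six-device array size chosen purely for illustration:

#include <assert.h>

int main(void)
{
	int disks = 6;		/* hypothetical RAID-6 member count */
	int max_degraded = 2;	/* P and Q for level 6, 1 for level 5 */
	int locked = 0;
	int i;

	/* a full-stripe write locks every data block; P and Q are locked
	 * separately at the end of schedule_reconstruction()
	 */
	for (i = 0; i < disks - max_degraded; i++)
		locked++;

	assert(locked + max_degraded == disks);	/* STRIPE_FULL_WRITE case */
	return 0;
}
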
@@ -1986,13 +2097,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
 static void end_reshape(raid5_conf_t *conf);
 
-static int page_is_zero(struct page *p)
-{
-       char *a = page_address(p);
-       return ((*(u32*)a) == 0 &&
-               memcmp(a, a+4, STRIPE_SIZE-4)==0);
-}
-
 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
                            struct stripe_head *sh)
 {
@@ -2132,9 +2236,10 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
                        set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
                        set_bit(R5_Wantcompute, &dev->flags);
                        sh->ops.target = disk_idx;
+                       sh->ops.target2 = -1;
                        s->req_compute = 1;
                        /* Careful: from this point on 'uptodate' is in the eye
-                        * of raid5_run_ops which services 'compute' operations
+                        * of raid_run_ops which services 'compute' operations
                         * before writes. R5_Wantcompute flags a block that will
                         * be R5_UPTODATE by the time it is needed for a
                         * subsequent operation.
@@ -2173,61 +2278,104 @@ static void handle_stripe_fill5(struct stripe_head *sh,
        set_bit(STRIPE_HANDLE, &sh->state);
 }
 
-static void handle_stripe_fill6(struct stripe_head *sh,
-                       struct stripe_head_state *s, struct r6_state *r6s,
-                       int disks)
+/* fetch_block6 - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill6 to continue
+ */
+static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
+                        struct r6_state *r6s, int disk_idx, int disks)
 {
-       int i;
-       for (i = disks; i--; ) {
-               struct r5dev *dev = &sh->dev[i];
-               if (!test_bit(R5_LOCKED, &dev->flags) &&
-                   !test_bit(R5_UPTODATE, &dev->flags) &&
-                   (dev->toread || (dev->towrite &&
-                    !test_bit(R5_OVERWRITE, &dev->flags)) ||
-                    s->syncing || s->expanding ||
-                    (s->failed >= 1 &&
-                     (sh->dev[r6s->failed_num[0]].toread ||
-                      s->to_write)) ||
-                    (s->failed >= 2 &&
-                     (sh->dev[r6s->failed_num[1]].toread ||
-                      s->to_write)))) {
-                       /* we would like to get this block, possibly
-                        * by computing it, but we might not be able to
+       struct r5dev *dev = &sh->dev[disk_idx];
+       struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
+                                 &sh->dev[r6s->failed_num[1]] };
+
+       if (!test_bit(R5_LOCKED, &dev->flags) &&
+           !test_bit(R5_UPTODATE, &dev->flags) &&
+           (dev->toread ||
+            (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+            s->syncing || s->expanding ||
+            (s->failed >= 1 &&
+             (fdev[0]->toread || s->to_write)) ||
+            (s->failed >= 2 &&
+             (fdev[1]->toread || s->to_write)))) {
+               /* we would like to get this block, possibly by computing it,
+                * otherwise read it if the backing disk is insync
+                */
+               BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
+               BUG_ON(test_bit(R5_Wantread, &dev->flags));
+               if ((s->uptodate == disks - 1) &&
+                   (s->failed && (disk_idx == r6s->failed_num[0] ||
+                                  disk_idx == r6s->failed_num[1]))) {
+                       /* the disk has failed and this block was requested,
+                        * so compute it
+                        */
-                       if ((s->uptodate == disks - 1) &&
-                           (s->failed && (i == r6s->failed_num[0] ||
-                                          i == r6s->failed_num[1]))) {
-                               pr_debug("Computing stripe %llu block %d\n",
-                                      (unsigned long long)sh->sector, i);
-                               compute_block_1(sh, i, 0);
-                               s->uptodate++;
-                       } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
-                               /* Computing 2-failure is *very* expensive; only
-                                * do it if failed >= 2
-                                */
-                               int other;
-                               for (other = disks; other--; ) {
-                                       if (other == i)
-                                               continue;
-                                       if (!test_bit(R5_UPTODATE,
-                                             &sh->dev[other].flags))
-                                               break;
-                               }
-                               BUG_ON(other < 0);
-                               pr_debug("Computing stripe %llu blocks %d,%d\n",
-                                      (unsigned long long)sh->sector,
-                                      i, other);
-                               compute_block_2(sh, i, other);
-                               s->uptodate += 2;
-                       } else if (test_bit(R5_Insync, &dev->flags)) {
-                               set_bit(R5_LOCKED, &dev->flags);
-                               set_bit(R5_Wantread, &dev->flags);
-                               s->locked++;
-                               pr_debug("Reading block %d (sync=%d)\n",
-                                       i, s->syncing);
+                       pr_debug("Computing stripe %llu block %d\n",
+                              (unsigned long long)sh->sector, disk_idx);
+                       set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+                       set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+                       set_bit(R5_Wantcompute, &dev->flags);
+                       sh->ops.target = disk_idx;
+                       sh->ops.target2 = -1; /* no 2nd target */
+                       s->req_compute = 1;
+                       s->uptodate++;
+                       return 1;
+               } else if (s->uptodate == disks-2 && s->failed >= 2) {
+                       /* Computing 2-failure is *very* expensive; only
+                        * do it if failed >= 2
+                        */
+                       int other;
+                       for (other = disks; other--; ) {
+                               if (other == disk_idx)
+                                       continue;
+                               if (!test_bit(R5_UPTODATE,
+                                     &sh->dev[other].flags))
+                                       break;
                        }
+                       BUG_ON(other < 0);
+                       pr_debug("Computing stripe %llu blocks %d,%d\n",
+                              (unsigned long long)sh->sector,
+                              disk_idx, other);
+                       set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+                       set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+                       set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
+                       set_bit(R5_Wantcompute, &sh->dev[other].flags);
+                       sh->ops.target = disk_idx;
+                       sh->ops.target2 = other;
+                       s->uptodate += 2;
+                       s->req_compute = 1;
+                       return 1;
+               } else if (test_bit(R5_Insync, &dev->flags)) {
+                       set_bit(R5_LOCKED, &dev->flags);
+                       set_bit(R5_Wantread, &dev->flags);
+                       s->locked++;
+                       pr_debug("Reading block %d (sync=%d)\n",
+                               disk_idx, s->syncing);
                }
        }
+
+       return 0;
+}
+
+/**
+ * handle_stripe_fill6 - read or compute data to satisfy pending requests.
+ */
+static void handle_stripe_fill6(struct stripe_head *sh,
+                       struct stripe_head_state *s, struct r6_state *r6s,
+                       int disks)
+{
+       int i;
+
+       /* look for blocks to read/compute, skip this if a compute
+        * is already in flight, or if the stripe contents are in the
+        * midst of changing due to a write
+        */
+       if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
+           !sh->reconstruct_state)
+               for (i = disks; i--; )
+                       if (fetch_block6(sh, s, r6s, i, disks))
+                               break;
        set_bit(STRIPE_HANDLE, &sh->state);
 }
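
fetch_block5 and fetch_block6 above describe the pending compute operation through sh->ops.target and sh->ops.target2, with -1 marking an unused slot. The standalone sketch below models that convention; struct compute_req and compute_targets() are names invented here for illustration, not kernel symbols.

#include <stdio.h>

struct compute_req {
	int target;	/* first block to rebuild */
	int target2;	/* second block, or -1 when only one is needed */
};

static int compute_targets(const struct compute_req *req)
{
	return (req->target2 < 0) ? 1 : 2;
}

int main(void)
{
	struct compute_req one = { .target = 3, .target2 = -1 };
	struct compute_req two = { .target = 3, .target2 = 5 };

	printf("one-block compute: %d target(s)\n", compute_targets(&one));
	printf("two-block compute: %d target(s)\n", compute_targets(&two));
	return 0;
}
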
 
@@ -2361,114 +2509,61 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
         */
        /* since handle_stripe can be called at any time we need to handle the
         * case where a compute block operation has been submitted and then a
-        * subsequent call wants to start a write request.  raid5_run_ops only
-        * handles the case where compute block and postxor are requested
+        * subsequent call wants to start a write request.  raid_run_ops only
+        * handles the case where compute block and reconstruct are requested
         * simultaneously.  If this is not the case then new writes need to be
         * held off until the compute completes.
         */
        if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
            (s->locked == 0 && (rcw == 0 || rmw == 0) &&
            !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-               schedule_reconstruction5(sh, s, rcw == 0, 0);
+               schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
 static void handle_stripe_dirtying6(raid5_conf_t *conf,
                struct stripe_head *sh, struct stripe_head_state *s,
-               struct r6_state *r6s, int disks)
-{
-       int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
-       int qd_idx = sh->qd_idx;
-       for (i = disks; i--; ) {
-               struct r5dev *dev = &sh->dev[i];
-               /* Would I have to read this buffer for reconstruct_write */
-               if (!test_bit(R5_OVERWRITE, &dev->flags)
-                   && i != pd_idx && i != qd_idx
-                   && (!test_bit(R5_LOCKED, &dev->flags)
-                           ) &&
-                   !test_bit(R5_UPTODATE, &dev->flags)) {
-                       if (test_bit(R5_Insync, &dev->flags)) rcw++;
-                       else {
-                               pr_debug("raid6: must_compute: "
-                                       "disk %d flags=%#lx\n", i, dev->flags);
-                               must_compute++;
-                       }
-               }
-       }
-       pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
-              (unsigned long long)sh->sector, rcw, must_compute);
-       set_bit(STRIPE_HANDLE, &sh->state);
-
-       if (rcw > 0)
-               /* want reconstruct write, but need to get some data */
-               for (i = disks; i--; ) {
-                       struct r5dev *dev = &sh->dev[i];
-                       if (!test_bit(R5_OVERWRITE, &dev->flags)
-                           && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
-                           && !test_bit(R5_LOCKED, &dev->flags) &&
-                           !test_bit(R5_UPTODATE, &dev->flags) &&
-                           test_bit(R5_Insync, &dev->flags)) {
-                               if (
-                                 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                                       pr_debug("Read_old stripe %llu "
-                                               "block %d for Reconstruct\n",
-                                            (unsigned long long)sh->sector, i);
-                                       set_bit(R5_LOCKED, &dev->flags);
-                                       set_bit(R5_Wantread, &dev->flags);
-                                       s->locked++;
-                               } else {
-                                       pr_debug("Request delayed stripe %llu "
-                                               "block %d for Reconstruct\n",
-                                            (unsigned long long)sh->sector, i);
-                                       set_bit(STRIPE_DELAYED, &sh->state);
-                                       set_bit(STRIPE_HANDLE, &sh->state);
-                               }
+               struct r6_state *r6s, int disks)
+{
+       int rcw = 0, pd_idx = sh->pd_idx, i;
+       int qd_idx = sh->qd_idx;
+
+       set_bit(STRIPE_HANDLE, &sh->state);
+       for (i = disks; i--; ) {
+               struct r5dev *dev = &sh->dev[i];
+               /* would we need to read this buffer for reconstruct_write? */
+               if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+                   i != pd_idx && i != qd_idx &&
+                   !test_bit(R5_LOCKED, &dev->flags) &&
+                   !(test_bit(R5_UPTODATE, &dev->flags) ||
+                     test_bit(R5_Wantcompute, &dev->flags))) {
+                       rcw++;
+                       if (!test_bit(R5_Insync, &dev->flags))
+                               continue; /* it's a failed drive */
+
+                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                               pr_debug("Read_old stripe %llu "
+                                       "block %d for Reconstruct\n",
+                                    (unsigned long long)sh->sector, i);
+                               set_bit(R5_LOCKED, &dev->flags);
+                               set_bit(R5_Wantread, &dev->flags);
+                               s->locked++;
+                       } else {
+                               pr_debug("Request delayed stripe %llu "
+                                       "block %d for Reconstruct\n",
+                                    (unsigned long long)sh->sector, i);
+                               set_bit(STRIPE_DELAYED, &sh->state);
+                               set_bit(STRIPE_HANDLE, &sh->state);
                        }
                }
+       }
        /* now if nothing is locked, and if we have enough data, we can start a
         * write request
         */
-       if (s->locked == 0 && rcw == 0 &&
+       if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
+           s->locked == 0 && rcw == 0 &&
            !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-               if (must_compute > 0) {
-                       /* We have failed blocks and need to compute them */
-                       switch (s->failed) {
-                       case 0:
-                               BUG();
-                       case 1:
-                               compute_block_1(sh, r6s->failed_num[0], 0);
-                               break;
-                       case 2:
-                               compute_block_2(sh, r6s->failed_num[0],
-                                               r6s->failed_num[1]);
-                               break;
-                       default: /* This request should have been failed? */
-                               BUG();
-                       }
-               }
-
-               pr_debug("Computing parity for stripe %llu\n",
-                       (unsigned long long)sh->sector);
-               compute_parity6(sh, RECONSTRUCT_WRITE);
-               /* now every locked buffer is ready to be written */
-               for (i = disks; i--; )
-                       if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-                               pr_debug("Writing stripe %llu block %d\n",
-                                      (unsigned long long)sh->sector, i);
-                               s->locked++;
-                               set_bit(R5_Wantwrite, &sh->dev[i].flags);
-                       }
-               if (s->locked == disks)
-                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
-                               atomic_inc(&conf->pending_full_writes);
-               /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-               set_bit(STRIPE_INSYNC, &sh->state);
-
-               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                       atomic_dec(&conf->preread_active_stripes);
-                       if (atomic_read(&conf->preread_active_stripes) <
-                           IO_THRESHOLD)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               schedule_reconstruction(sh, s, 1, 0);
        }
 }
 
@@ -2527,7 +2622,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                 * we are done.  Otherwise update the mismatch count and repair
                 * parity if !MD_RECOVERY_CHECK
                 */
-               if (sh->ops.zero_sum_result == 0)
+               if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
                        /* parity is correct (on disc,
                         * not in buffer any more)
                         */
@@ -2544,6 +2639,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                set_bit(R5_Wantcompute,
                                        &sh->dev[sh->pd_idx].flags);
                                sh->ops.target = sh->pd_idx;
+                               sh->ops.target2 = -1;
                                s->uptodate++;
                        }
                }
@@ -2560,67 +2656,74 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
-                               struct stripe_head_state *s,
-                               struct r6_state *r6s, struct page *tmp_page,
-                               int disks)
+                                 struct stripe_head_state *s,
+                                 struct r6_state *r6s, int disks)
 {
-       int update_p = 0, update_q = 0;
-       struct r5dev *dev;
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
+       struct r5dev *dev;
 
        set_bit(STRIPE_HANDLE, &sh->state);
 
        BUG_ON(s->failed > 2);
-       BUG_ON(s->uptodate < disks);
+
        /* Want to check and possibly repair P and Q.
         * However there could be one 'failed' device, in which
         * case we can only check one of them, possibly using the
         * other to generate missing data
         */
 
-       /* If !tmp_page, we cannot do the calculations,
-        * but as we have set STRIPE_HANDLE, we will soon be called
-        * by stripe_handle with a tmp_page - just wait until then.
-        */
-       if (tmp_page) {
+       switch (sh->check_state) {
+       case check_state_idle:
+               /* start a new check operation if there are < 2 failures */
                if (s->failed == r6s->q_failed) {
-                       /* The only possible failed device holds 'Q', so it
+                       /* The only possible failed device holds Q, so it
                         * makes sense to check P (If anything else were failed,
                         * we would have used P to recreate it).
                         */
-                       compute_block_1(sh, pd_idx, 1);
-                       if (!page_is_zero(sh->dev[pd_idx].page)) {
-                               compute_block_1(sh, pd_idx, 0);
-                               update_p = 1;
-                       }
+                       sh->check_state = check_state_run;
                }
                if (!r6s->q_failed && s->failed < 2) {
-                       /* q is not failed, and we didn't use it to generate
+                       /* Q is not failed, and we didn't use it to generate
                         * anything, so it makes sense to check it
                         */
-                       memcpy(page_address(tmp_page),
-                              page_address(sh->dev[qd_idx].page),
-                              STRIPE_SIZE);
-                       compute_parity6(sh, UPDATE_PARITY);
-                       if (memcmp(page_address(tmp_page),
-                                  page_address(sh->dev[qd_idx].page),
-                                  STRIPE_SIZE) != 0) {
-                               clear_bit(STRIPE_INSYNC, &sh->state);
-                               update_q = 1;
-                       }
+                       if (sh->check_state == check_state_run)
+                               sh->check_state = check_state_run_pq;
+                       else
+                               sh->check_state = check_state_run_q;
                }
-               if (update_p || update_q) {
-                       conf->mddev->resync_mismatches += STRIPE_SECTORS;
-                       if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-                               /* don't try to repair!! */
-                               update_p = update_q = 0;
+
+               /* discard potentially stale zero_sum_result */
+               sh->ops.zero_sum_result = 0;
+
+               if (sh->check_state == check_state_run) {
+                       /* async_xor_zero_sum destroys the contents of P */
+                       clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+                       s->uptodate--;
+               }
+               if (sh->check_state >= check_state_run &&
+                   sh->check_state <= check_state_run_pq) {
+                       /* async_syndrome_zero_sum preserves P and Q, so
+                        * no need to mark them !uptodate here
+                        */
+                       set_bit(STRIPE_OP_CHECK, &s->ops_request);
+                       break;
                }
 
+               /* we have 2-disk failure */
+               /* we have a 2-disk failure */
+               /* fall through */
+       case check_state_compute_result:
+               sh->check_state = check_state_idle;
+
+               /* check that a write has not made the stripe insync */
+               if (test_bit(STRIPE_INSYNC, &sh->state))
+                       break;
+
                /* now write out any block on a failed drive,
-                * or P or Q if they need it
+                * or P or Q if they were recomputed
                 */
-
+               BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
                if (s->failed == 2) {
                        dev = &sh->dev[r6s->failed_num[1]];
                        s->locked++;
@@ -2633,14 +2736,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
                        set_bit(R5_LOCKED, &dev->flags);
                        set_bit(R5_Wantwrite, &dev->flags);
                }
-
-               if (update_p) {
+               if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
                        dev = &sh->dev[pd_idx];
                        s->locked++;
                        set_bit(R5_LOCKED, &dev->flags);
                        set_bit(R5_Wantwrite, &dev->flags);
                }
-               if (update_q) {
+               if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
                        dev = &sh->dev[qd_idx];
                        s->locked++;
                        set_bit(R5_LOCKED, &dev->flags);
@@ -2649,6 +2751,70 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
                clear_bit(STRIPE_DEGRADED, &sh->state);
 
                set_bit(STRIPE_INSYNC, &sh->state);
+               break;
+       case check_state_run:
+       case check_state_run_q:
+       case check_state_run_pq:
+               break; /* we will be called again upon completion */
+       case check_state_check_result:
+               sh->check_state = check_state_idle;
+
+               /* handle a successful check operation, if parity is correct
+                * we are done.  Otherwise update the mismatch count and repair
+                * parity if !MD_RECOVERY_CHECK
+                */
+               if (sh->ops.zero_sum_result == 0) {
+                       /* both parities are correct */
+                       if (!s->failed)
+                               set_bit(STRIPE_INSYNC, &sh->state);
+                       else {
+                               /* in contrast to the raid5 case we can validate
+                                * parity, but still have a failure to write
+                                * back
+                                */
+                               sh->check_state = check_state_compute_result;
+                               /* Returning at this point means that we may go
+                                * off and bring p and/or q uptodate again, so
+                                * we make sure to check zero_sum_result again
+                                * to verify whether p or q need writeback
+                                */
+                       }
+               } else {
+                       conf->mddev->resync_mismatches += STRIPE_SECTORS;
+                       if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+                               /* don't try to repair!! */
+                               set_bit(STRIPE_INSYNC, &sh->state);
+                       else {
+                               int *target = &sh->ops.target;
+
+                               sh->ops.target = -1;
+                               sh->ops.target2 = -1;
+                               sh->check_state = check_state_compute_run;
+                               set_bit(STRIPE_COMPUTE_RUN, &sh->state);
+                               set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+                               if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
+                                       set_bit(R5_Wantcompute,
+                                               &sh->dev[pd_idx].flags);
+                                       *target = pd_idx;
+                                       target = &sh->ops.target2;
+                                       s->uptodate++;
+                               }
+                               if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
+                                       set_bit(R5_Wantcompute,
+                                               &sh->dev[qd_idx].flags);
+                                       *target = qd_idx;
+                                       s->uptodate++;
+                               }
+                       }
+               }
+               break;
+       case check_state_compute_run:
+               break;
+       default:
+               printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+                      __func__, sh->check_state,
+                      (unsigned long long) sh->sector);
+               BUG();
        }
 }
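
handle_parity_checks6 now reads sh->ops.zero_sum_result as two independent bits instead of a single pass/fail word. The standalone sketch below mirrors that decoding; the flag values are assumed to match SUM_CHECK_P_RESULT and SUM_CHECK_Q_RESULT from the async_tx headers rather than being quoted from this patch.

#include <stdio.h>

#define SUM_CHECK_P_RESULT (1 << 0)	/* assumed: P (xor) parity mismatch */
#define SUM_CHECK_Q_RESULT (1 << 1)	/* assumed: Q (syndrome) mismatch */

static void report(unsigned long zero_sum_result)
{
	if (!zero_sum_result) {
		printf("both parities correct\n");
		return;
	}
	if (zero_sum_result & SUM_CHECK_P_RESULT)
		printf("schedule P for recompute/writeback\n");
	if (zero_sum_result & SUM_CHECK_Q_RESULT)
		printf("schedule Q for recompute/writeback\n");
}

int main(void)
{
	report(0);
	report(SUM_CHECK_P_RESULT);
	report(SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT);
	return 0;
}
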
 
@@ -2666,6 +2832,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
                if (i != sh->pd_idx && i != sh->qd_idx) {
                        int dd_idx, j;
                        struct stripe_head *sh2;
+                       struct async_submit_ctl submit;
 
                        sector_t bn = compute_blocknr(sh, i, 1);
                        sector_t s = raid5_compute_sector(conf, bn, 0,
@@ -2685,9 +2852,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
                        }
 
                        /* place all the copies on one channel */
+                       init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
                        tx = async_memcpy(sh2->dev[dd_idx].page,
-                               sh->dev[i].page, 0, 0, STRIPE_SIZE,
-                               ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+                                         sh->dev[i].page, 0, 0, STRIPE_SIZE,
+                                         &submit);
 
                        set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
                        set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
@@ -2756,7 +2924,8 @@ static bool handle_stripe5(struct stripe_head *sh)
        rcu_read_lock();
        for (i=disks; i--; ) {
                mdk_rdev_t *rdev;
-               struct r5dev *dev = &sh->dev[i];
+
+               dev = &sh->dev[i];
                clear_bit(R5_Insync, &dev->flags);
 
                pr_debug("check %d: state 0x%lx toread %p read %p write %p "
@@ -2973,7 +3142,7 @@ static bool handle_stripe5(struct stripe_head *sh)
                /* Need to write out all blocks after computing parity */
                sh->disks = conf->raid_disks;
                stripe_set_idx(sh->sector, conf, 0, sh);
-               schedule_reconstruction5(sh, &s, 1, 1);
+               schedule_reconstruction(sh, &s, 1, 1);
        } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
                atomic_dec(&conf->reshape_stripes);
@@ -2993,7 +3162,7 @@ static bool handle_stripe5(struct stripe_head *sh)
                md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
        if (s.ops_request)
-               raid5_run_ops(sh, s.ops_request);
+               raid_run_ops(sh, s.ops_request);
 
        ops_run_io(sh, &s);
 
@@ -3002,7 +3171,7 @@ static bool handle_stripe5(struct stripe_head *sh)
        return blocked_rdev == NULL;
 }
 
-static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh)
 {
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks;
@@ -3014,9 +3183,10 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
        mdk_rdev_t *blocked_rdev = NULL;
 
        pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
-               "pd_idx=%d, qd_idx=%d\n",
+               "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
               (unsigned long long)sh->sector, sh->state,
-              atomic_read(&sh->count), pd_idx, qd_idx);
+              atomic_read(&sh->count), pd_idx, qd_idx,
+              sh->check_state, sh->reconstruct_state);
        memset(&s, 0, sizeof(s));
 
        spin_lock(&sh->lock);
@@ -3036,35 +3206,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 
                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
                        i, dev->flags, dev->toread, dev->towrite, dev->written);
-               /* maybe we can reply to a read */
-               if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-                       struct bio *rbi, *rbi2;
-                       pr_debug("Return read for disc %d\n", i);
-                       spin_lock_irq(&conf->device_lock);
-                       rbi = dev->toread;
-                       dev->toread = NULL;
-                       if (test_and_clear_bit(R5_Overlap, &dev->flags))
-                               wake_up(&conf->wait_for_overlap);
-                       spin_unlock_irq(&conf->device_lock);
-                       while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-                               copy_data(0, rbi, dev->page, dev->sector);
-                               rbi2 = r5_next_bio(rbi, dev->sector);
-                               spin_lock_irq(&conf->device_lock);
-                               if (!raid5_dec_bi_phys_segments(rbi)) {
-                                       rbi->bi_next = return_bi;
-                                       return_bi = rbi;
-                               }
-                               spin_unlock_irq(&conf->device_lock);
-                               rbi = rbi2;
-                       }
-               }
+               /* maybe we can reply to a read
+                *
+                * new wantfill requests are only permitted while
+                * ops_complete_biofill is guaranteed to be inactive
+                */
+               if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+                   !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
+                       set_bit(R5_Wantfill, &dev->flags);
 
                /* now count some things */
                if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
                if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+               if (test_bit(R5_Wantcompute, &dev->flags)) {
+                       s.compute++;
+                       BUG_ON(s.compute > 2);
+               }
 
-
-               if (dev->toread)
+               if (test_bit(R5_Wantfill, &dev->flags)) {
+                       s.to_fill++;
+               } else if (dev->toread)
                        s.to_read++;
                if (dev->towrite) {
                        s.to_write++;
@@ -3105,6 +3266,11 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                blocked_rdev = NULL;
        }
 
+       if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
+               set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
+               set_bit(STRIPE_BIOFILL_RUN, &sh->state);
+       }
+
        pr_debug("locked=%d uptodate=%d to_read=%d"
               " to_write=%d failed=%d failed_num=%d,%d\n",
               s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3145,19 +3311,62 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         * or to load a block that is being partially written.
         */
        if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
-           (s.syncing && (s.uptodate < disks)) || s.expanding)
+           (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
                handle_stripe_fill6(sh, &s, &r6s, disks);
 
-       /* now to consider writing and what else, if anything should be read */
-       if (s.to_write)
+       /* Now we check to see if any write operations have recently
+        * completed
+        */
+       if (sh->reconstruct_state == reconstruct_state_drain_result) {
+               int qd_idx = sh->qd_idx;
+
+               sh->reconstruct_state = reconstruct_state_idle;
+               /* All the 'written' buffers and the parity blocks are ready to
+                * be written back to disk
+                */
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
+               for (i = disks; i--; ) {
+                       dev = &sh->dev[i];
+                       if (test_bit(R5_LOCKED, &dev->flags) &&
+                           (i == sh->pd_idx || i == qd_idx ||
+                            dev->written)) {
+                               pr_debug("Writing block %d\n", i);
+                               BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+                               set_bit(R5_Wantwrite, &dev->flags);
+                               if (!test_bit(R5_Insync, &dev->flags) ||
+                                   ((i == sh->pd_idx || i == qd_idx) &&
+                                     s.failed == 0))
+                                       set_bit(STRIPE_INSYNC, &sh->state);
+                       }
+               }
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                       atomic_dec(&conf->preread_active_stripes);
+                       if (atomic_read(&conf->preread_active_stripes) <
+                               IO_THRESHOLD)
+                               md_wakeup_thread(conf->mddev->thread);
+               }
+       }
+
+       /* Now to consider new write requests and what else, if anything
+        * should be read.  We do not handle new writes when:
+        * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
+        * 2/ A 'check' operation is in flight, as it may clobber the parity
+        *    block.
+        */
+       if (s.to_write && !sh->reconstruct_state && !sh->check_state)
                handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
 
        /* maybe we need to check and possibly fix the parity for this stripe
         * Any reads will already have been scheduled, so we just see if enough
-        * data is available
+        * data is available.  The parity check is held off while parity
+        * dependent operations are in flight.
         */
-       if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
-               handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
+       if (sh->check_state ||
+           (s.syncing && s.locked == 0 &&
+            !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
+            !test_bit(STRIPE_INSYNC, &sh->state)))
+               handle_parity_checks6(conf, sh, &s, &r6s, disks);
 
        if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
                md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3178,15 +3387,29 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                        set_bit(R5_Wantwrite, &dev->flags);
                                        set_bit(R5_ReWrite, &dev->flags);
                                        set_bit(R5_LOCKED, &dev->flags);
+                                       s.locked++;
                                } else {
                                        /* let's read it back */
                                        set_bit(R5_Wantread, &dev->flags);
                                        set_bit(R5_LOCKED, &dev->flags);
+                                       s.locked++;
                                }
                        }
                }
 
-       if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+       /* Finish reconstruct operations initiated by the expansion process */
+       if (sh->reconstruct_state == reconstruct_state_result) {
+               sh->reconstruct_state = reconstruct_state_idle;
+               clear_bit(STRIPE_EXPANDING, &sh->state);
+               for (i = conf->raid_disks; i--; ) {
+                       set_bit(R5_Wantwrite, &sh->dev[i].flags);
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       s.locked++;
+               }
+       }
+
+       if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+           !sh->reconstruct_state) {
                struct stripe_head *sh2
                        = get_active_stripe(conf, sh->sector, 1, 1, 1);
                if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
@@ -3207,14 +3430,8 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                /* Need to write out all blocks after computing P&Q */
                sh->disks = conf->raid_disks;
                stripe_set_idx(sh->sector, conf, 0, sh);
-               compute_parity6(sh, RECONSTRUCT_WRITE);
-               for (i = conf->raid_disks ; i-- ;  ) {
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       s.locked++;
-                       set_bit(R5_Wantwrite, &sh->dev[i].flags);
-               }
-               clear_bit(STRIPE_EXPANDING, &sh->state);
-       } else if (s.expanded) {
+               schedule_reconstruction(sh, &s, 1, 1);
+       } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
                atomic_dec(&conf->reshape_stripes);
                wake_up(&conf->wait_for_overlap);
@@ -3232,6 +3449,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
        if (unlikely(blocked_rdev))
                md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
+       if (s.ops_request)
+               raid_run_ops(sh, s.ops_request);
+
        ops_run_io(sh, &s);
 
        return_io(return_bi);
@@ -3240,16 +3460,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 }
 
 /* returns true if the stripe was handled */
-static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe(struct stripe_head *sh)
 {
        if (sh->raid_conf->level == 6)
-               return handle_stripe6(sh, tmp_page);
+               return handle_stripe6(sh);
        else
                return handle_stripe5(sh);
 }
 
-
-
 static void raid5_activate_delayed(raid5_conf_t *conf)
 {
        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -3331,6 +3549,9 @@ static int raid5_congested(void *data, int bits)
        /* No difference between reads and writes.  Just check
         * how busy the stripe_cache is
         */
+
+       if (mddev_congested(mddev, bits))
+               return 1;
        if (conf->inactive_blocked)
                return 1;
        if (conf->quiesce)
@@ -3880,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
        INIT_LIST_HEAD(&stripes);
        for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
                int j;
-               int skipped = 0;
+               int skipped_disk = 0;
                sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
                set_bit(STRIPE_EXPANDING, &sh->state);
                atomic_inc(&conf->reshape_stripes);
@@ -3896,14 +4117,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                                continue;
                        s = compute_blocknr(sh, j, 0);
                        if (s < raid5_size(mddev, 0, 0)) {
-                               skipped = 1;
+                               skipped_disk = 1;
                                continue;
                        }
                        memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
                        set_bit(R5_Expanded, &sh->dev[j].flags);
                        set_bit(R5_UPTODATE, &sh->dev[j].flags);
                }
-               if (!skipped) {
+               if (!skipped_disk) {
                        set_bit(STRIPE_EXPAND_READY, &sh->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
@@ -4057,7 +4278,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
        spin_unlock(&sh->lock);
 
        /* wait for any blocked device to be handled */
-       while(unlikely(!handle_stripe(sh, NULL)))
+       while (unlikely(!handle_stripe(sh)))
                ;
        release_stripe(sh);
 
@@ -4114,7 +4335,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                        return handled;
                }
 
-               handle_stripe(sh, NULL);
+               handle_stripe(sh);
                release_stripe(sh);
                handled++;
        }
@@ -4128,6 +4349,36 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
        return handled;
 }
 
+#ifdef CONFIG_MULTICORE_RAID456
+static void __process_stripe(void *param, async_cookie_t cookie)
+{
+       struct stripe_head *sh = param;
+
+       handle_stripe(sh);
+       release_stripe(sh);
+}
+
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+       async_schedule_domain(__process_stripe, sh, domain);
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+       async_synchronize_full_domain(domain);
+}
+#else
+static void process_stripe(struct stripe_head *sh, struct list_head *domain)
+{
+       handle_stripe(sh);
+       release_stripe(sh);
+       cond_resched();
+}
+
+static void synchronize_stripe_processing(struct list_head *domain)
+{
+}
+#endif
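
With CONFIG_MULTICORE_RAID456 the stripes queued by raid5d are farmed out to an async domain and the domain is flushed before I/O is issued. A user-space model of that fan-out/synchronize shape, using pthreads purely for illustration (none of these names are kernel interfaces):

#include <pthread.h>
#include <stdio.h>

#define NR_STRIPES 4

static void *process_one(void *arg)
{
	printf("handling stripe %ld\n", (long)arg);	/* handle_stripe() */
	return NULL;
}

int main(void)
{
	pthread_t domain[NR_STRIPES];
	long i;

	/* process_stripe(): queue each stripe into the domain */
	for (i = 0; i < NR_STRIPES; i++)
		pthread_create(&domain[i], NULL, process_one, (void *)i);

	/* synchronize_stripe_processing(): wait for the whole domain */
	for (i = 0; i < NR_STRIPES; i++)
		pthread_join(domain[i], NULL);

	return 0;
}
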
 
 
 /*
@@ -4142,6 +4393,7 @@ static void raid5d(mddev_t *mddev)
        struct stripe_head *sh;
        raid5_conf_t *conf = mddev->private;
        int handled;
+       LIST_HEAD(raid_domain);
 
        pr_debug("+++ raid5d active\n");
 
@@ -4178,8 +4430,7 @@ static void raid5d(mddev_t *mddev)
                spin_unlock_irq(&conf->device_lock);
                
                handled++;
-               handle_stripe(sh, conf->spare_page);
-               release_stripe(sh);
+               process_stripe(sh, &raid_domain);
 
                spin_lock_irq(&conf->device_lock);
        }
@@ -4187,6 +4438,7 @@ static void raid5d(mddev_t *mddev)
 
        spin_unlock_irq(&conf->device_lock);
 
+       synchronize_stripe_processing(&raid_domain);
        async_tx_issue_pending_all();
        unplug_slaves(mddev);
 
@@ -4319,15 +4571,118 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
        return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void raid5_free_percpu(raid5_conf_t *conf)
+{
+       struct raid5_percpu *percpu;
+       unsigned long cpu;
+
+       if (!conf->percpu)
+               return;
+
+       get_online_cpus();
+       for_each_possible_cpu(cpu) {
+               percpu = per_cpu_ptr(conf->percpu, cpu);
+               safe_put_page(percpu->spare_page);
+               kfree(percpu->scribble);
+       }
+#ifdef CONFIG_HOTPLUG_CPU
+       unregister_cpu_notifier(&conf->cpu_notify);
+#endif
+       put_online_cpus();
+
+       free_percpu(conf->percpu);
+}
+
 static void free_conf(raid5_conf_t *conf)
 {
        shrink_stripes(conf);
-       safe_put_page(conf->spare_page);
+       raid5_free_percpu(conf);
        kfree(conf->disks);
        kfree(conf->stripe_hashtbl);
        kfree(conf);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
+                             void *hcpu)
+{
+       raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
+       long cpu = (long)hcpu;
+       struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               if (conf->level == 6 && !percpu->spare_page)
+                       percpu->spare_page = alloc_page(GFP_KERNEL);
+               if (!percpu->scribble)
+                       percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+               if (!percpu->scribble ||
+                   (conf->level == 6 && !percpu->spare_page)) {
+                       safe_put_page(percpu->spare_page);
+                       kfree(percpu->scribble);
+                       pr_err("%s: failed memory allocation for cpu%ld\n",
+                              __func__, cpu);
+                       return NOTIFY_BAD;
+               }
+               break;
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               safe_put_page(percpu->spare_page);
+               kfree(percpu->scribble);
+               percpu->spare_page = NULL;
+               percpu->scribble = NULL;
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+#endif
+
+static int raid5_alloc_percpu(raid5_conf_t *conf)
+{
+       unsigned long cpu;
+       struct page *spare_page;
+       struct raid5_percpu *allcpus;
+       void *scribble;
+       int err;
+
+       allcpus = alloc_percpu(struct raid5_percpu);
+       if (!allcpus)
+               return -ENOMEM;
+       conf->percpu = allcpus;
+
+       get_online_cpus();
+       err = 0;
+       for_each_present_cpu(cpu) {
+               if (conf->level == 6) {
+                       spare_page = alloc_page(GFP_KERNEL);
+                       if (!spare_page) {
+                               err = -ENOMEM;
+                               break;
+                       }
+                       per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+               }
+               scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
+               if (!scribble) {
+                       err = -ENOMEM;
+                       break;
+               }
+               per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+       }
+#ifdef CONFIG_HOTPLUG_CPU
+       conf->cpu_notify.notifier_call = raid456_cpu_notify;
+       conf->cpu_notify.priority = 0;
+       if (err == 0)
+               err = register_cpu_notifier(&conf->cpu_notify);
+#endif
+       put_online_cpus();
+
+       return err;
+}
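
raid5_alloc_percpu() sizes each CPU's scribble buffer with scribble_len(conf->raid_disks), which is defined earlier in the patch and not shown here. The standalone model below captures the intent as this editor understands it, one block-pointer slot and one address-conversion slot for raid_disks + 2 blocks; both the formula and the stand-in address-conversion type are assumptions, not quotations from the patch.

#include <stdio.h>

typedef void *fake_addr_conv_t;	/* stand-in for dmaengine's addr_conv_t */

static size_t model_scribble_len(int raid_disks)
{
	/* assumed layout: a source/destination pointer list and an
	 * address-conversion list, each with raid_disks + 2 entries
	 */
	return (size_t)(raid_disks + 2) *
	       (sizeof(void *) + sizeof(fake_addr_conv_t));
}

int main(void)
{
	printf("6-device array: %zu scribble bytes per CPU\n",
	       model_scribble_len(6));
	return 0;
}
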
+
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
        raid5_conf_t *conf;
@@ -4369,6 +4724,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                goto abort;
 
        conf->raid_disks = mddev->raid_disks;
+       conf->scribble_len = scribble_len(conf->raid_disks);
        if (mddev->reshape_position == MaxSector)
                conf->previous_raid_disks = mddev->raid_disks;
        else
@@ -4384,11 +4740,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                goto abort;
 
-       if (mddev->new_level == 6) {
-               conf->spare_page = alloc_page(GFP_KERNEL);
-               if (!conf->spare_page)
-                       goto abort;
-       }
+       conf->level = mddev->new_level;
+       if (raid5_alloc_percpu(conf) != 0)
+               goto abort;
+
        spin_lock_init(&conf->device_lock);
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
@@ -4447,7 +4802,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                printk(KERN_INFO "raid5: allocated %dkB for %s\n",
                        memory, mdname(mddev));
 
-       conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
+       conf->thread = md_register_thread(raid5d, mddev, NULL);
        if (!conf->thread) {
                printk(KERN_ERR
                       "raid5: couldn't allocate thread for %s\n",
@@ -4613,7 +4968,7 @@ static int run(mddev_t *mddev)
                set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-                                                       "%s_reshape");
+                                                       "reshape");
        }
 
        /* read-ahead size must cover two whole stripes, which is
@@ -5031,7 +5386,7 @@ static int raid5_start_reshape(mddev_t *mddev)
        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
        mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-                                               "%s_reshape");
+                                               "reshape");
        if (!mddev->sync_thread) {
                mddev->recovery = 0;
                spin_lock_irq(&conf->device_lock);
index 9459689c4ea00a50073cdcfca38ee038bda10364..2390e0e83daf7c939344f8062a9d4a6ef737c476 100644 (file)
@@ -2,6 +2,7 @@
 #define _RAID5_H
 
 #include <linux/raid/xor.h>
+#include <linux/dmaengine.h>
 
 /*
  *
  */
 enum check_states {
        check_state_idle = 0,
-       check_state_run, /* parity check */
+       check_state_run, /* xor parity check */
+       check_state_run_q, /* q-parity check */
+       check_state_run_pq, /* pq dual parity check */
        check_state_check_result,
        check_state_compute_run, /* parity repair */
        check_state_compute_result,
@@ -215,8 +218,8 @@ struct stripe_head {
         * @target - STRIPE_OP_COMPUTE_BLK target
         */
        struct stripe_operations {
-               int                target;
-               u32                zero_sum_result;
+               int                  target, target2;
+               enum sum_check_flags zero_sum_result;
        } ops;
        struct r5dev {
                struct bio      req;
@@ -298,7 +301,7 @@ struct r6_state {
 #define STRIPE_OP_COMPUTE_BLK  1
 #define STRIPE_OP_PREXOR       2
 #define STRIPE_OP_BIODRAIN     3
-#define STRIPE_OP_POSTXOR      4
+#define STRIPE_OP_RECONSTRUCT  4
 #define STRIPE_OP_CHECK        5
 
 /*
@@ -385,8 +388,21 @@ struct raid5_private_data {
                                            * (fresh device added).
                                            * Cleared when a sync completes.
                                            */
-
-       struct page             *spare_page; /* Used when checking P/Q in raid6 */
+       /* per cpu variables */
+       struct raid5_percpu {
+               struct page     *spare_page; /* Used when checking P/Q in raid6 */
+               void            *scribble;   /* space for constructing buffer
+                                             * lists and performing address
+                                             * conversions
+                                             */
+       } *percpu;
+       size_t                  scribble_len; /* size of scribble region must be
+                                              * associated with conf to handle
+                                              * cpu hotplug while reshaping
+                                              */
+#ifdef CONFIG_HOTPLUG_CPU
+       struct notifier_block   cpu_notify;
+#endif
 
        /*
         * Free stripes pool
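For reference, a minimal sketch (not the md driver's actual raid5_alloc_percpu() from this series) of how a per-CPU structure like the raid5_percpu declared above is typically populated with the generic percpu API; the helper name and the error-unwind shape here are assumptions.

    #include <linux/percpu.h>
    #include <linux/gfp.h>
    #include <linux/slab.h>

    /* Illustrative only: one spare_page/scribble pair per possible CPU,
     * using the struct raid5_percpu from the hunk above. */
    static struct raid5_percpu *example_alloc_percpu_scribble(size_t scribble_len)
    {
            struct raid5_percpu *percpu;
            int cpu;

            percpu = alloc_percpu(struct raid5_percpu); /* zeroed per-CPU memory */
            if (!percpu)
                    return NULL;

            for_each_possible_cpu(cpu) {
                    struct raid5_percpu *p = per_cpu_ptr(percpu, cpu);

                    p->spare_page = alloc_page(GFP_KERNEL);
                    p->scribble   = kmalloc(scribble_len, GFP_KERNEL);
                    if (!p->spare_page || !p->scribble)
                            goto err;
            }
            return percpu;

    err:
            for_each_possible_cpu(cpu) {
                    struct raid5_percpu *p = per_cpu_ptr(percpu, cpu);

                    if (p->spare_page)
                            __free_page(p->spare_page);
                    kfree(p->scribble);     /* kfree(NULL) is a no-op */
            }
            free_percpu(percpu);
            return NULL;
    }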
index 895e2efca8a974a3c2996b51b5b86ef8a644b76a..01fc704847434fb1faa08e61c09053e053ecaff3 100644 (file)
 #define DVB_MAJOR 212
 
 #if defined(CONFIG_DVB_MAX_ADAPTERS) && CONFIG_DVB_MAX_ADAPTERS > 0
-#define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS
+  #define DVB_MAX_ADAPTERS CONFIG_DVB_MAX_ADAPTERS
 #else
-#warning invalid CONFIG_DVB_MAX_ADAPTERS value
-#define DVB_MAX_ADAPTERS 8
+  #define DVB_MAX_ADAPTERS 8
 #endif
 
 #define DVB_UNSET (-1)
index 0e4b97fba384bb837bd1946d1fc502ce03f46dc2..9744b0692417597814f55f3d0a2d8057198e27d9 100644 (file)
@@ -75,7 +75,7 @@ config DVB_USB_DIB0700
        select DVB_DIB3000MC if !DVB_FE_CUSTOMISE
        select DVB_S5H1411 if !DVB_FE_CUSTOMISE
        select DVB_LGDT3305 if !DVB_FE_CUSTOMISE
-       select DVB_TUNER_DIB0070 if !DVB_FE_CUSTOMISE
+       select DVB_TUNER_DIB0070
        select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMISE
        select MEDIA_TUNER_MT2266 if !MEDIA_TUNER_CUSTOMISE
        select MEDIA_TUNER_XC2028 if !MEDIA_TUNER_CUSTOMISE
index bb6df1b276bee161eeca7b0a6f3a0fb3627c994c..6f094a96ac810afbfbad67667b50bcd70cdca7d9 100644 (file)
@@ -415,7 +415,7 @@ int saa7164_api_enum_subdevs(struct saa7164_dev *dev)
                goto out;
        }
 
-       if (debug & DBGLVL_API)
+       if (saa_debug & DBGLVL_API)
                saa7164_dumphex16(dev, buf, (buflen/16)*16);
 
        saa7164_api_dump_subdevs(dev, buf, buflen);
@@ -480,7 +480,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
 
        dprintk(DBGLVL_API, "%s() len = %d bytes\n", __func__, len);
 
-       if (debug & DBGLVL_I2C)
+       if (saa_debug & DBGLVL_I2C)
                saa7164_dumphex16(dev, buf, 2 * 16);
 
        ret = saa7164_cmd_send(bus->dev, unitid, GET_CUR,
@@ -488,7 +488,7 @@ int saa7164_api_i2c_read(struct saa7164_i2c *bus, u8 addr, u32 reglen, u8 *reg,
        if (ret != SAA_OK)
                printk(KERN_ERR "%s() error, ret(2) = 0x%x\n", __func__, ret);
        else {
-               if (debug & DBGLVL_I2C)
+               if (saa_debug & DBGLVL_I2C)
                        saa7164_dumphex16(dev, buf, sizeof(buf));
                memcpy(data, (buf + 2 * sizeof(u32) + reglen), datalen);
        }
@@ -548,7 +548,7 @@ int saa7164_api_i2c_write(struct saa7164_i2c *bus, u8 addr, u32 datalen,
        *((u32 *)(buf + 1 * sizeof(u32))) = datalen - reglen;
        memcpy((buf + 2 * sizeof(u32)), data, datalen);
 
-       if (debug & DBGLVL_I2C)
+       if (saa_debug & DBGLVL_I2C)
                saa7164_dumphex16(dev, buf, sizeof(buf));
 
        ret = saa7164_cmd_send(bus->dev, unitid, SET_CUR,
index e097f1a0969a26441a0aa1a02df5136aaf1d4460..c45966edc0cf2f202ea4a38e158506390f145f46 100644 (file)
@@ -250,7 +250,7 @@ int saa7164_cmd_wait(struct saa7164_dev *dev, u8 seqno)
        unsigned long stamp;
        int r;
 
-       if (debug >= 4)
+       if (saa_debug >= 4)
                saa7164_bus_dump(dev);
 
        dprintk(DBGLVL_CMD, "%s(seqno=%d)\n", __func__, seqno);
index f0dbead188c893741c2bbdb6dc55035ee36436e1..709affc31042571fdc66efd0d99b72673451869b 100644 (file)
@@ -45,8 +45,8 @@ MODULE_LICENSE("GPL");
  32 bus
  */
 
-unsigned int debug;
-module_param(debug, int, 0644);
+unsigned int saa_debug;
+module_param_named(debug, saa_debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
 
 unsigned int waitsecs = 10;
@@ -653,7 +653,7 @@ static int __devinit saa7164_initdev(struct pci_dev *pci_dev,
                printk(KERN_ERR "%s() Unsupported board detected, "
                        "registering without firmware\n", __func__);
 
-       dprintk(1, "%s() parameter debug = %d\n", __func__, debug);
+       dprintk(1, "%s() parameter debug = %d\n", __func__, saa_debug);
        dprintk(1, "%s() parameter waitsecs = %d\n", __func__, waitsecs);
 
 fail_fw:
index 6753008a9c9be4a47e35abf04a20e558157cc1f0..42660b546f0e811f528b7c3af1c8959c2e8cb0e4 100644 (file)
@@ -375,9 +375,9 @@ extern int saa7164_buffer_dealloc(struct saa7164_tsport *port,
 
 /* ----------------------------------------------------------- */
 
-extern unsigned int debug;
+extern unsigned int saa_debug;
 #define dprintk(level, fmt, arg...)\
-       do { if (debug & level)\
+       do { if (saa_debug & level)\
                printk(KERN_DEBUG "%s: " fmt, dev->name, ## arg);\
        } while (0)
 
index a5b448ea4eab54372a2263688cf04d2243775356..b3bf1c44d74d92c67452b96e89cf4d9741d68d4f 100644 (file)
@@ -339,9 +339,9 @@ static int h_memstick_read_dev_id(struct memstick_dev *card,
                        card->id.type = id_reg.type;
                        card->id.category = id_reg.category;
                        card->id.class = id_reg.class;
+                       dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
                }
                complete(&card->mrq_complete);
-               dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
                return -EAGAIN;
        }
 }
index 79689b10f937e0b8405a6a3568d5503c532d64a9..766e21e15574c75ceba18828a4d492ac6a90a189 100644 (file)
@@ -937,6 +937,8 @@ static int quicktest1(unsigned long arg)
 
        /* Need  1K cacheline aligned that does not cross page boundary */
        p = kmalloc(4096, 0);
+       if (p == NULL)
+               return -ENOMEM;
        mq = ALIGNUP(p, 1024);
        memset(mes, 0xee, sizeof(mes));
        dw = mq;
index 9cbf95bedce6ba6b35a6078bf792ea403a18ae55..ccd4408a26c73efe3efa11753ac64dbed0a55936 100644 (file)
@@ -340,10 +340,9 @@ static struct proc_dir_entry *proc_gru __read_mostly;
 
 static int create_proc_file(struct proc_entry *p)
 {
-       p->entry = create_proc_entry(p->name, p->mode, proc_gru);
+       p->entry = proc_create(p->name, p->mode, proc_gru, p->fops);
        if (!p->entry)
                return -1;
-       p->entry->proc_fops = p->fops;
        return 0;
 }
 
index 065fa818be5750a8b59dc7d12a53cb217933b5d0..fc25586b7ee1c94575e5674f89c3382c2a7ffd88 100644 (file)
@@ -599,6 +599,7 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
        struct scatterlist              *sg;
        unsigned int                    i;
        enum dma_data_direction         direction;
+       unsigned int                    sglen;
 
        /*
         * We don't do DMA on "complex" transfers, i.e. with
@@ -628,11 +629,14 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
        else
                direction = DMA_TO_DEVICE;
 
+       sglen = dma_map_sg(&host->pdev->dev, data->sg, data->sg_len, direction);
+       if (sglen != data->sg_len)
+               goto unmap_exit;
        desc = chan->device->device_prep_slave_sg(chan,
                        data->sg, data->sg_len, direction,
                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc)
-               return -ENOMEM;
+               goto unmap_exit;
 
        host->dma.data_desc = desc;
        desc->callback = atmci_dma_complete;
@@ -643,6 +647,9 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data)
        chan->device->device_issue_pending(chan);
 
        return 0;
+unmap_exit:
+       dma_unmap_sg(&host->pdev->dev, data->sg, sglen, direction);
+       return -ENOMEM;
 }
 
 #else /* CONFIG_MMC_ATMELMCI_DMA */
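As an aside, a hedged sketch of the dma_map_sg()/dma_unmap_sg() pairing the hunk above introduces; the device, scatterlist and helper names are placeholders rather than atmel-mci code, and per the DMA-API documentation the unmap is issued with the same nents originally handed to dma_map_sg().

    #include <linux/dma-mapping.h>
    #include <linux/scatterlist.h>

    /* Map a scatterlist for DMA; returns the number of mapped entries or -ENOMEM. */
    static int example_map_sg(struct device *dev, struct scatterlist *sg,
                              int nents, enum dma_data_direction dir)
    {
            int mapped = dma_map_sg(dev, sg, nents, dir);

            if (mapped == 0)
                    return -ENOMEM;   /* nothing was mapped, nothing to unmap */

            /* ... hand the 'mapped' entries to the DMA engine here ... */
            return mapped;
    }

    /* Undo the mapping once the transfer has completed or been abandoned. */
    static void example_unmap_sg(struct device *dev, struct scatterlist *sg,
                                 int nents, enum dma_data_direction dir)
    {
            dma_unmap_sg(dev, sg, nents, dir);  /* same nents as dma_map_sg() */
    }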
index 2ab1d59870f4e1114daca7a1212e43a61026bf68..a8b689635a3ba1a45862bc99ccf9813e09ecf41c 100644 (file)
@@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev)
 
 static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0";
 
-static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info(ctl_table * ctl, int write,
                      void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int i;
@@ -629,7 +629,7 @@ final:
        *lenp = pos;
 
        if (!write)
-               retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+               retv = proc_dostring(ctl, write, buffer, lenp, ppos);
        else
        {
                *lenp = 0;
@@ -639,7 +639,7 @@ final:
 }
 
 
-static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info161719(ctl_table * ctl, int write,
                            void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int i;
@@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp
 
 final:
        *lenp = pos;
-       retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       retv = proc_dostring(ctl, write, buffer, lenp, ppos);
        return retv;
 }
 
-static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_infotxRing(ctl_table * ctl, int write,
                            void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int i;
@@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp
        SARLBNpln(u_char, txBuffer, 0x800);
 final:
        *lenp = pos;
-       retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       retv = proc_dostring(ctl, write, buffer, lenp, ppos);
        return retv;
 }
 
-static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_inforxRing(ctl_table * ctl, int write,
                            void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int i;
@@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp
        SARLBNpln(u_char, rxBuffer, 0x800);
 final:
        *lenp = pos;
-       retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       retv = proc_dostring(ctl, write, buffer, lenp, ppos);
        return retv;
 }
 
-static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_info18(ctl_table * ctl, int write,
                        void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int i;
@@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp,
 
 final:
        *lenp = pos;
-       retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       retv = proc_dostring(ctl, write, buffer, lenp, ppos);
        return retv;
 }
 
@@ -766,7 +766,7 @@ final:
 
 static char conf_reset_result[200];
 
-static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
+static int arlan_configure(ctl_table * ctl, int write,
                    void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int pos = 0;
@@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, struct file *filp,
                return -1;
 
        *lenp = pos;
-       return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
-static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
+static int arlan_sysctl_reset(ctl_table * ctl, int write,
                       void __user *buffer, size_t * lenp, loff_t *ppos)
 {
        int pos = 0;
@@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp,
        } else
                return -1;
        *lenp = pos + 3;
-       return proc_dostring(ctl, write, filp, buffer, lenp, ppos);
+       return proc_dostring(ctl, write, buffer, lenp, ppos);
 }
 
 
index 554e11f9e1cea9705ab4b54fa43ea9c60f33d558..8eefe56f1cbe8d470b580f2f5ca1dfe2f97027e2 100644 (file)
@@ -31,7 +31,7 @@
 #define PARPORT_MIN_SPINTIME_VALUE 1
 #define PARPORT_MAX_SPINTIME_VALUE 1000
 
-static int do_active_device(ctl_table *table, int write, struct file *filp,
+static int do_active_device(ctl_table *table, int write,
                      void __user *result, size_t *lenp, loff_t *ppos)
 {
        struct parport *port = (struct parport *)table->extra1;
@@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, struct file *filp,
 }
 
 #ifdef CONFIG_PARPORT_1284
-static int do_autoprobe(ctl_table *table, int write, struct file *filp,
+static int do_autoprobe(ctl_table *table, int write,
                        void __user *result, size_t *lenp, loff_t *ppos)
 {
        struct parport_device_info *info = table->extra2;
@@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, struct file *filp,
 #endif /* IEEE1284.3 support. */
 
 static int do_hardware_base_addr (ctl_table *table, int write,
-                                 struct file *filp, void __user *result,
+                                 void __user *result,
                                  size_t *lenp, loff_t *ppos)
 {
        struct parport *port = (struct parport *)table->extra1;
@@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write,
 }
 
 static int do_hardware_irq (ctl_table *table, int write,
-                           struct file *filp, void __user *result,
+                           void __user *result,
                            size_t *lenp, loff_t *ppos)
 {
        struct parport *port = (struct parport *)table->extra1;
@@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write,
 }
 
 static int do_hardware_dma (ctl_table *table, int write,
-                           struct file *filp, void __user *result,
+                           void __user *result,
                            size_t *lenp, loff_t *ppos)
 {
        struct parport *port = (struct parport *)table->extra1;
@@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write,
 }
 
 static int do_hardware_modes (ctl_table *table, int write,
-                             struct file *filp, void __user *result,
+                             void __user *result,
                              size_t *lenp, loff_t *ppos)
 {
        struct parport *port = (struct parport *)table->extra1;
index d14ea84a01f62f4ce7b82f51b22ec72381af2052..1301caa7495ddcdeb9515b2988d96be3f04e4d5f 100644 (file)
@@ -32,8 +32,3 @@ endif
 
 EXTRA_CFLAGS += -Idrivers/media/dvb/frontends
 EXTRA_CFLAGS += -Idrivers/media/dvb/dvb-core
-
-# Ubuntu 8.04 has CONFIG_SND undefined, so include lum sound/config.h too
-ifeq ($(CONFIG_SND),)
-EXTRA_CFLAGS += -include sound/config.h
-endif
index 68fa0e43b78107fd7b02bac3d6ede9049edcbf01..8c075b2416bb6237402326dcd92d9d00dc454b7c 100644 (file)
@@ -912,6 +912,7 @@ static void sierra_release(struct usb_serial *serial)
        }
 }
 
+#ifdef CONFIG_PM
 static void stop_read_write_urbs(struct usb_serial *serial)
 {
        int i, j;
@@ -988,6 +989,10 @@ static int sierra_resume(struct usb_serial *serial)
 
        return ec ? -EIO : 0;
 }
+#else
+#define sierra_suspend NULL
+#define sierra_resume NULL
+#endif
 
 static struct usb_serial_driver sierra_device = {
        .driver = {
index ba3d71f5c7d09a1464a3bfdaf09dc6bf646e80b0..9554ad5f9af799641e6eda438f136886704a061a 100644 (file)
@@ -702,7 +702,7 @@ static int vlynq_probe(struct platform_device *pdev)
        dev->mem_start = mem_res->start;
        dev->mem_end = mem_res->end;
 
-       len = regs_res->end - regs_res->start;
+       len = resource_size(regs_res);
        if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) {
                printk(KERN_ERR "%s: Can't request vlynq registers\n",
                       dev_name(&dev->dev));
index 798cb071d1329a45cb3fe95d1a84161167f4084e..3f57ce4bee5d4371649c2a67354dcc16f4eacb48 100644 (file)
@@ -19,9 +19,6 @@ static int
 adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
               int create)
 {
-       if (block < 0)
-               goto abort_negative;
-
        if (!create) {
                if (block >= inode->i_blocks)
                        goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
        /* don't support allocation of blocks yet */
        return -EIO;
 
-abort_negative:
-       adfs_error(inode->i_sb, "block %d < 0", block);
-       return -EIO;
-
 abort_toobig:
        return 0;
 }
index 9fe1b1bd30a808c82b1ebfbd8f644732e4f36f3c..96d394bdaddfa29e52b74d815dd5842e87cb6820 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,7 +18,7 @@
 /* Taken over from the old code... */
 
 /* POSIX UID/GID verification for setting inode attributes. */
-int inode_change_ok(struct inode *inode, struct iattr *attr)
+int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
        int retval = -EPERM;
        unsigned int ia_valid = attr->ia_valid;
@@ -60,9 +60,51 @@ fine:
 error:
        return retval;
 }
-
 EXPORT_SYMBOL(inode_change_ok);
 
+/**
+ * inode_newsize_ok - may this inode be truncated to a given size
+ * @inode:     the inode to be truncated
+ * @offset:    the new size to assign to the inode
+ * @Returns:   0 on success, -ve errno on failure
+ *
+ * inode_newsize_ok will check filesystem limits and ulimits to ensure that the
+ * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
+ * when necessary. Caller must not proceed with inode size change if failure is
+ * returned. @inode must be a file (not directory), with appropriate
+ * permissions to allow truncate (inode_newsize_ok does NOT check these
+ * conditions).
+ *
+ * inode_newsize_ok must be called with i_mutex held.
+ */
+int inode_newsize_ok(const struct inode *inode, loff_t offset)
+{
+       if (inode->i_size < offset) {
+               unsigned long limit;
+
+               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+               if (limit != RLIM_INFINITY && offset > limit)
+                       goto out_sig;
+               if (offset > inode->i_sb->s_maxbytes)
+                       goto out_big;
+       } else {
+               /*
+                * truncation of in-use swapfiles is disallowed - it would
+                * cause subsequent swapout to scribble on the now-freed
+                * blocks.
+                */
+               if (IS_SWAPFILE(inode))
+                       return -ETXTBSY;
+       }
+
+       return 0;
+out_sig:
+       send_sig(SIGXFSZ, current, 0);
+out_big:
+       return -EFBIG;
+}
+EXPORT_SYMBOL(inode_newsize_ok);
+
 int inode_setattr(struct inode * inode, struct iattr * attr)
 {
        unsigned int ia_valid = attr->ia_valid;
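A minimal usage sketch of the new helper, mirroring how callers later in this merge (e.g. generic_cont_expand_simple()) validate limits before committing to a size change; the surrounding function is hypothetical, and the caller is assumed to hold i_mutex as the kernel-doc above requires.

    #include <linux/fs.h>

    /* Hypothetical setsize path: validate the new size, then publish it. */
    static int example_setsize(struct inode *inode, loff_t newsize)
    {
            int err;

            err = inode_newsize_ok(inode, newsize); /* rlimit, s_maxbytes, swapfile */
            if (err)
                    return err;   /* SIGXFSZ sent by the helper on the rlimit path */

            i_size_write(inode, newsize);
            return 0;
    }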
index dd376c124e71561193f0a824f407314d6edda4aa..33baf27fac78e5c4fbc36e12e3a4a16b2322afe4 100644 (file)
@@ -737,12 +737,7 @@ befs_put_super(struct super_block *sb)
 {
        kfree(BEFS_SB(sb)->mount_opts.iocharset);
        BEFS_SB(sb)->mount_opts.iocharset = NULL;
-
-       if (BEFS_SB(sb)->nls) {
-               unload_nls(BEFS_SB(sb)->nls);
-               BEFS_SB(sb)->nls = NULL;
-       }
-
+       unload_nls(BEFS_SB(sb)->nls);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
 }
index 442d94fe255cfab4c81e26bdf0aa8cc46c3283ac..b9b3bb51b1e485ff0caa0b7555d468804b552b9a 100644 (file)
@@ -1711,42 +1711,52 @@ struct elf_note_info {
        int numnote;
 };
 
-static int fill_note_info(struct elfhdr *elf, int phdrs,
-                         struct elf_note_info *info,
-                         long signr, struct pt_regs *regs)
+static int elf_note_info_init(struct elf_note_info *info)
 {
-#define        NUM_NOTES       6
-       struct list_head *t;
-
-       info->notes = NULL;
-       info->prstatus = NULL;
-       info->psinfo = NULL;
-       info->fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-       info->xfpu = NULL;
-#endif
+       memset(info, 0, sizeof(*info));
        INIT_LIST_HEAD(&info->thread_list);
 
-       info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
-                             GFP_KERNEL);
+       /* Allocate space for six ELF notes */
+       info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
        if (!info->notes)
                return 0;
        info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
        if (!info->psinfo)
-               return 0;
+               goto notes_free;
        info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
        if (!info->prstatus)
-               return 0;
+               goto psinfo_free;
        info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
        if (!info->fpu)
-               return 0;
+               goto prstatus_free;
 #ifdef ELF_CORE_COPY_XFPREGS
        info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
        if (!info->xfpu)
-               return 0;
+               goto fpu_free;
+#endif
+       return 1;
+#ifdef ELF_CORE_COPY_XFPREGS
+ fpu_free:
+       kfree(info->fpu);
 #endif
+ prstatus_free:
+       kfree(info->prstatus);
+ psinfo_free:
+       kfree(info->psinfo);
+ notes_free:
+       kfree(info->notes);
+       return 0;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+                         struct elf_note_info *info,
+                         long signr, struct pt_regs *regs)
+{
+       struct list_head *t;
+
+       if (!elf_note_info_init(info))
+               return 0;
 
-       info->thread_status_size = 0;
        if (signr) {
                struct core_thread *ct;
                struct elf_thread_status *ets;
@@ -1806,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 #endif
 
        return 1;
-
-#undef NUM_NOTES
 }
 
 static size_t get_note_info_size(struct elf_note_info *info)
index 76285471073ec18193edaf0304cb4e385affae3f..38502c67987c573541325aaf0f8a5db1ada1f756 100644 (file)
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
        }
 
        stack_size = exec_params.stack_size;
-       if (stack_size < interp_params.stack_size)
-               stack_size = interp_params.stack_size;
-
        if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
                executable_stack = EXSTACK_ENABLE_X;
        else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
                executable_stack = EXSTACK_DISABLE_X;
-       else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
-               executable_stack = EXSTACK_ENABLE_X;
-       else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
-               executable_stack = EXSTACK_DISABLE_X;
        else
                executable_stack = EXSTACK_DEFAULT;
 
+       if (stack_size == 0) {
+               stack_size = interp_params.stack_size;
+               if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
+                       executable_stack = EXSTACK_ENABLE_X;
+               else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
+                       executable_stack = EXSTACK_DISABLE_X;
+               else
+                       executable_stack = EXSTACK_DEFAULT;
+       }
+
        retval = -ENOEXEC;
        if (stack_size == 0)
                goto error;
index e92f229e3c6e9d994d61f5581625c35c2bde6bd7..a2796651e75690eb8e8a875146be852bfb1d637b 100644 (file)
@@ -278,8 +278,6 @@ static int decompress_exec(
                ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
                if (ret <= 0)
                        break;
-               if (ret >= (unsigned long) -4096)
-                       break;
                len -= ret;
 
                strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
                                        "(%d != %d)", (unsigned) r, curid, id);
                        goto failed;
                } else if ( ! p->lib_list[id].loaded &&
-                               load_flat_shared_library(id, p) > (unsigned long) -4096) {
+                               IS_ERR_VALUE(load_flat_shared_library(id, p))) {
                        printk("BINFMT_FLAT: failed to load library %d", id);
                        goto failed;
                }
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
                                  MAP_PRIVATE|MAP_EXECUTABLE, 0);
                up_write(&current->mm->mmap_sem);
-               if (!textpos  || textpos >= (unsigned long) -4096) {
+               if (!textpos || IS_ERR_VALUE(textpos)) {
                        if (!textpos)
                                textpos = (unsigned long) -ENOMEM;
                        printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                        PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
                up_write(&current->mm->mmap_sem);
 
-               if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
+               if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
                        if (!realdatastart)
                                realdatastart = (unsigned long) -ENOMEM;
                        printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                        result = bprm->file->f_op->read(bprm->file, (char *) datapos,
                                        data_len + (relocs * sizeof(unsigned long)), &fpos);
                }
-               if (result >= (unsigned long)-4096) {
+               if (IS_ERR_VALUE(result)) {
                        printk("Unable to read data+bss, errno %d\n", (int)-result);
                        do_munmap(current->mm, textpos, text_len);
                        do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                        PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
                up_write(&current->mm->mmap_sem);
 
-               if (!textpos  || textpos >= (unsigned long) -4096) {
+               if (!textpos || IS_ERR_VALUE(textpos)) {
                        if (!textpos)
                                textpos = (unsigned long) -ENOMEM;
                        printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                        fpos = 0;
                        result = bprm->file->f_op->read(bprm->file,
                                        (char *) textpos, text_len, &fpos);
-                       if (result < (unsigned long) -4096)
+                       if (!IS_ERR_VALUE(result))
                                result = decompress_exec(bprm, text_len, (char *) datapos,
                                                 data_len + (relocs * sizeof(unsigned long)), 0);
                }
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
                        fpos = 0;
                        result = bprm->file->f_op->read(bprm->file,
                                        (char *) textpos, text_len, &fpos);
-                       if (result < (unsigned long) -4096) {
+                       if (!IS_ERR_VALUE(result)) {
                                fpos = ntohl(hdr->data_start);
                                result = bprm->file->f_op->read(bprm->file, (char *) datapos,
                                        data_len + (relocs * sizeof(unsigned long)), &fpos);
                        }
                }
-               if (result >= (unsigned long)-4096) {
+               if (IS_ERR_VALUE(result)) {
                        printk("Unable to read code+data+bss, errno %d\n",(int)-result);
                        do_munmap(current->mm, textpos, text_len + data_len + extra +
                                MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
        res = prepare_binprm(&bprm);
 
-       if (res <= (unsigned long)-4096)
+       if (!IS_ERR_VALUE(res))
                res = load_flat_file(&bprm, libs, id, NULL);
 
        abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
        
        res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-       if (res > (unsigned long)-4096)
+       if (IS_ERR_VALUE(res))
                return res;
        
        /* Update data segment pointers for all libraries */
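A small self-contained sketch of the error-value idiom these hunks switch to, paraphrasing the IS_ERR_VALUE() definition from include/linux/err.h instead of comparing against the open-coded (unsigned long)-4096; the names below are illustrative.

    /* The kernel encodes -errno in the top MAX_ERRNO values of an unsigned
     * long; IS_ERR_VALUE() is essentially this test. */
    #define EXAMPLE_MAX_ERRNO       4095
    #define example_is_err_value(x) \
            ((unsigned long)(x) >= (unsigned long)-EXAMPLE_MAX_ERRNO)

    static int example_check(unsigned long result)
    {
            if (example_is_err_value(result))
                    return (int)result;   /* already a negative errno */
            return 0;                     /* success: result is a count or address */
    }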
index 5d1ed50bd46c591c1ef63b312c4844d013f5239b..9cf4b926f8e47a509f0994ad0e696d79205335bb 100644 (file)
@@ -216,8 +216,6 @@ EXPORT_SYMBOL(fsync_bdev);
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
  * @bdev:      blockdevice to lock
  *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
  * The reference counter (bd_fsfreeze_count) guarantees that only the last
@@ -232,46 +230,55 @@ struct super_block *freeze_bdev(struct block_device *bdev)
        int error = 0;
 
        mutex_lock(&bdev->bd_fsfreeze_mutex);
-       if (bdev->bd_fsfreeze_count > 0) {
-               bdev->bd_fsfreeze_count++;
+       if (++bdev->bd_fsfreeze_count > 1) {
+               /*
+                * We don't even need to grab a reference - the first call
+                * to freeze_bdev grabs an active reference and only the last
+                * thaw_bdev drops it.
+                */
                sb = get_super(bdev);
+               drop_super(sb);
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return sb;
        }
-       bdev->bd_fsfreeze_count++;
-
-       down(&bdev->bd_mount_sem);
-       sb = get_super(bdev);
-       if (sb && !(sb->s_flags & MS_RDONLY)) {
-               sb->s_frozen = SB_FREEZE_WRITE;
-               smp_wmb();
-
-               sync_filesystem(sb);
-
-               sb->s_frozen = SB_FREEZE_TRANS;
-               smp_wmb();
-
-               sync_blockdev(sb->s_bdev);
-
-               if (sb->s_op->freeze_fs) {
-                       error = sb->s_op->freeze_fs(sb);
-                       if (error) {
-                               printk(KERN_ERR
-                                       "VFS:Filesystem freeze failed\n");
-                               sb->s_frozen = SB_UNFROZEN;
-                               drop_super(sb);
-                               up(&bdev->bd_mount_sem);
-                               bdev->bd_fsfreeze_count--;
-                               mutex_unlock(&bdev->bd_fsfreeze_mutex);
-                               return ERR_PTR(error);
-                       }
+
+       sb = get_active_super(bdev);
+       if (!sb)
+               goto out;
+       if (sb->s_flags & MS_RDONLY) {
+               deactivate_locked_super(sb);
+               mutex_unlock(&bdev->bd_fsfreeze_mutex);
+               return sb;
+       }
+
+       sb->s_frozen = SB_FREEZE_WRITE;
+       smp_wmb();
+
+       sync_filesystem(sb);
+
+       sb->s_frozen = SB_FREEZE_TRANS;
+       smp_wmb();
+
+       sync_blockdev(sb->s_bdev);
+
+       if (sb->s_op->freeze_fs) {
+               error = sb->s_op->freeze_fs(sb);
+               if (error) {
+                       printk(KERN_ERR
+                               "VFS:Filesystem freeze failed\n");
+                       sb->s_frozen = SB_UNFROZEN;
+                       deactivate_locked_super(sb);
+                       bdev->bd_fsfreeze_count--;
+                       mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                       return ERR_PTR(error);
                }
        }
+       up_write(&sb->s_umount);
 
+ out:
        sync_blockdev(bdev);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-       return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
+       return sb;      /* thaw_bdev releases s->s_umount */
 }
 EXPORT_SYMBOL(freeze_bdev);
 
@@ -284,44 +291,44 @@ EXPORT_SYMBOL(freeze_bdev);
  */
 int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
-       int error = 0;
+       int error = -EINVAL;
 
        mutex_lock(&bdev->bd_fsfreeze_mutex);
-       if (!bdev->bd_fsfreeze_count) {
-               mutex_unlock(&bdev->bd_fsfreeze_mutex);
-               return -EINVAL;
-       }
-
-       bdev->bd_fsfreeze_count--;
-       if (bdev->bd_fsfreeze_count > 0) {
-               if (sb)
-                       drop_super(sb);
-               mutex_unlock(&bdev->bd_fsfreeze_mutex);
-               return 0;
-       }
-
-       if (sb) {
-               BUG_ON(sb->s_bdev != bdev);
-               if (!(sb->s_flags & MS_RDONLY)) {
-                       if (sb->s_op->unfreeze_fs) {
-                               error = sb->s_op->unfreeze_fs(sb);
-                               if (error) {
-                                       printk(KERN_ERR
-                                               "VFS:Filesystem thaw failed\n");
-                                       sb->s_frozen = SB_FREEZE_TRANS;
-                                       bdev->bd_fsfreeze_count++;
-                                       mutex_unlock(&bdev->bd_fsfreeze_mutex);
-                                       return error;
-                               }
-                       }
-                       sb->s_frozen = SB_UNFROZEN;
-                       smp_wmb();
-                       wake_up(&sb->s_wait_unfrozen);
+       if (!bdev->bd_fsfreeze_count)
+               goto out_unlock;
+
+       error = 0;
+       if (--bdev->bd_fsfreeze_count > 0)
+               goto out_unlock;
+
+       if (!sb)
+               goto out_unlock;
+
+       BUG_ON(sb->s_bdev != bdev);
+       down_write(&sb->s_umount);
+       if (sb->s_flags & MS_RDONLY)
+               goto out_deactivate;
+
+       if (sb->s_op->unfreeze_fs) {
+               error = sb->s_op->unfreeze_fs(sb);
+               if (error) {
+                       printk(KERN_ERR
+                               "VFS:Filesystem thaw failed\n");
+                       sb->s_frozen = SB_FREEZE_TRANS;
+                       bdev->bd_fsfreeze_count++;
+                       mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                       return error;
                }
-               drop_super(sb);
        }
 
-       up(&bdev->bd_mount_sem);
+       sb->s_frozen = SB_UNFROZEN;
+       smp_wmb();
+       wake_up(&sb->s_wait_unfrozen);
+
+out_deactivate:
+       if (sb)
+               deactivate_locked_super(sb);
+out_unlock:
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return 0;
 }
@@ -430,7 +437,6 @@ static void init_once(void *foo)
 
        memset(bdev, 0, sizeof(*bdev));
        mutex_init(&bdev->bd_mutex);
-       sema_init(&bdev->bd_mount_sem, 1);
        INIT_LIST_HEAD(&bdev->bd_inodes);
        INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
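A hedged sketch of the nesting behaviour implemented above, assuming a mounted, writable filesystem on the device; the function itself is illustrative, and only the bd_fsfreeze_count transitions in the comments come from the code in this hunk.

    #include <linux/fs.h>
    #include <linux/err.h>

    /* Nested freezes: only the first call freezes, only the last thaw unfreezes. */
    static int example_freeze_twice(struct block_device *bdev)
    {
            struct super_block *sb;

            sb = freeze_bdev(bdev);         /* count 0 -> 1, filesystem frozen */
            if (IS_ERR(sb))
                    return PTR_ERR(sb);

            freeze_bdev(bdev);              /* count 1 -> 2, already frozen */

            thaw_bdev(bdev, sb);            /* count 2 -> 1, still frozen */
            return thaw_bdev(bdev, sb);     /* count 1 -> 0, filesystem thawed */
    }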
index 976bfda032e062cc8958e79ea36a60c9df6a8ba1..e9b76bcd1c129e0d1f4664f54fefd33caba72e08 100644 (file)
@@ -5590,6 +5590,7 @@ static const struct address_space_operations btrfs_aops = {
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
        .set_page_dirty = btrfs_set_page_dirty,
+       .error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations btrfs_symlink_aops = {
index 209f7f15f5f801b4023f6e121643b3744ce1b11e..24afd7422ae866851ecfbd12d7e1c231c5bffda5 100644 (file)
@@ -2239,16 +2239,10 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        void *fsdata;
-       unsigned long limit;
        int err;
 
-       err = -EFBIG;
-        limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-       if (limit != RLIM_INFINITY && size > (loff_t)limit) {
-               send_sig(SIGXFSZ, current, 0);
-               goto out;
-       }
-       if (size > inode->i_sb->s_maxbytes)
+       err = inode_newsize_ok(inode, size);
+       if (err)
                goto out;
 
        err = pagecache_write_begin(NULL, mapping, size, 0,
index 3cbc57f932d26a32d8fc8761e3bfe8465ddab9ae..d6db933df2b27ce43c0fe31d6e35bb0968598891 100644 (file)
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
 {
        struct char_device_struct *cd;
        struct cdev *cdev;
-       char *s;
        int err = -ENOMEM;
 
        cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
        cdev->owner = fops->owner;
        cdev->ops = fops;
        kobject_set_name(&cdev->kobj, "%s", name);
-       for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
-               *s = '!';
                
        err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
        if (err)
index d79ce2e95c2357a92cf8e252ed27cf7d59557dc5..90c5b39f03135cf0763bdba37701154cc9b40a61 100644 (file)
@@ -185,8 +185,7 @@ out_mount_failed:
                        cifs_sb->mountdata = NULL;
                }
 #endif
-               if (cifs_sb->local_nls)
-                       unload_nls(cifs_sb->local_nls);
+               unload_nls(cifs_sb->local_nls);
                kfree(cifs_sb);
        }
        return rc;
index 1f09c7619319d794dd10a3d58a3543aa34e36ca9..5e2492535daa4ca626ec1e45e812ebd194fa7a2b 100644 (file)
@@ -1557,57 +1557,24 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static int cifs_vmtruncate(struct inode *inode, loff_t offset)
 {
-       struct address_space *mapping = inode->i_mapping;
-       unsigned long limit;
+       loff_t oldsize;
+       int err;
 
        spin_lock(&inode->i_lock);
-       if (inode->i_size < offset)
-               goto do_expand;
-       /*
-        * truncation of in-use swapfiles is disallowed - it would cause
-        * subsequent swapout to scribble on the now-freed blocks.
-        */
-       if (IS_SWAPFILE(inode)) {
-               spin_unlock(&inode->i_lock);
-               goto out_busy;
-       }
-       i_size_write(inode, offset);
-       spin_unlock(&inode->i_lock);
-       /*
-        * unmap_mapping_range is called twice, first simply for efficiency
-        * so that truncate_inode_pages does fewer single-page unmaps. However
-        * after this first call, and before truncate_inode_pages finishes,
-        * it is possible for private pages to be COWed, which remain after
-        * truncate_inode_pages finishes, hence the second unmap_mapping_range
-        * call must be made for correctness.
-        */
-       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-       truncate_inode_pages(mapping, offset);
-       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-       goto out_truncate;
-
-do_expand:
-       limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-       if (limit != RLIM_INFINITY && offset > limit) {
+       err = inode_newsize_ok(inode, offset);
+       if (err) {
                spin_unlock(&inode->i_lock);
-               goto out_sig;
-       }
-       if (offset > inode->i_sb->s_maxbytes) {
-               spin_unlock(&inode->i_lock);
-               goto out_big;
+               goto out;
        }
+
+       oldsize = inode->i_size;
        i_size_write(inode, offset);
        spin_unlock(&inode->i_lock);
-out_truncate:
+       truncate_pagecache(inode, oldsize, offset);
        if (inode->i_op->truncate)
                inode->i_op->truncate(inode);
-       return 0;
-out_sig:
-       send_sig(SIGXFSZ, current, 0);
-out_big:
-       return -EFBIG;
-out_busy:
-       return -ETXTBSY;
+out:
+       return err;
 }
 
 static int
index 8ccd5ed81d9cae6713e8fda58c25723a61c49c9d..d99860a33890d7ae9a0734b17c5fc4e0263eebdf 100644 (file)
@@ -2,6 +2,7 @@
 #define _CODA_INT_
 
 struct dentry;
+struct file;
 
 extern struct file_system_type coda_fs_type;
 extern unsigned long coda_timeout;
index 3aa48834a222b87819eb5302a63a507b1b910071..d576b552e8e2eb90a98324e79b7d8917fb96c04c 100644 (file)
@@ -768,13 +768,13 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
                                 char __user * type, unsigned long flags,
                                 void __user * data)
 {
-       unsigned long type_page;
+       char *kernel_type;
        unsigned long data_page;
-       unsigned long dev_page;
+       char *kernel_dev;
        char *dir_page;
        int retval;
 
-       retval = copy_mount_options (type, &type_page);
+       retval = copy_mount_string(type, &kernel_type);
        if (retval < 0)
                goto out;
 
@@ -783,38 +783,38 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
        if (IS_ERR(dir_page))
                goto out1;
 
-       retval = copy_mount_options (dev_name, &dev_page);
+       retval = copy_mount_string(dev_name, &kernel_dev);
        if (retval < 0)
                goto out2;
 
-       retval = copy_mount_options (data, &data_page);
+       retval = copy_mount_options(data, &data_page);
        if (retval < 0)
                goto out3;
 
        retval = -EINVAL;
 
-       if (type_page && data_page) {
-               if (!strcmp((char *)type_page, SMBFS_NAME)) {
+       if (kernel_type && data_page) {
+               if (!strcmp(kernel_type, SMBFS_NAME)) {
                        do_smb_super_data_conv((void *)data_page);
-               } else if (!strcmp((char *)type_page, NCPFS_NAME)) {
+               } else if (!strcmp(kernel_type, NCPFS_NAME)) {
                        do_ncp_super_data_conv((void *)data_page);
-               } else if (!strcmp((char *)type_page, NFS4_NAME)) {
+               } else if (!strcmp(kernel_type, NFS4_NAME)) {
                        if (do_nfs4_super_data_conv((void *) data_page))
                                goto out4;
                }
        }
 
-       retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
+       retval = do_mount(kernel_dev, dir_page, kernel_type,
                        flags, (void*)data_page);
 
  out4:
        free_page(data_page);
  out3:
-       free_page(dev_page);
+       kfree(kernel_dev);
  out2:
        putname(dir_page);
  out1:
-       free_page(type_page);
+       kfree(kernel_type);
  out:
        return retval;
 }
index a2edb79134472170e95d5fe835e7ad6a783e00db..31f4b0e6d72c333bf1633ef51118ad6ccfc31069 100644 (file)
@@ -63,9 +63,9 @@ static void drop_slab(void)
 }
 
 int drop_caches_sysctl_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (write) {
                if (sysctl_drop_caches & 1)
                        drop_pagecache();
index 5c833c18d0d47bef760c9a4053f5188e34781105..d49be6bc1793b57fdbfb5efc958311dd8f94801e 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
 
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
 /* The maximal length of core_pattern is also specified in sysctl.c */
@@ -1393,18 +1395,16 @@ out_ret:
        return retval;
 }
 
-int set_binfmt(struct linux_binfmt *new)
+void set_binfmt(struct linux_binfmt *new)
 {
-       struct linux_binfmt *old = current->binfmt;
+       struct mm_struct *mm = current->mm;
 
-       if (new) {
-               if (!try_module_get(new->module))
-                       return -1;
-       }
-       current->binfmt = new;
-       if (old)
-               module_put(old->module);
-       return 0;
+       if (mm->binfmt)
+               module_put(mm->binfmt->module);
+
+       mm->binfmt = new;
+       if (new)
+               __module_get(new->module);
 }
 
 EXPORT_SYMBOL(set_binfmt);
@@ -1728,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
        return (ret >= 2) ? 2 : ret;
 }
 
+static void wait_for_dump_helpers(struct file *file)
+{
+       struct pipe_inode_info *pipe;
+
+       pipe = file->f_path.dentry->d_inode->i_pipe;
+
+       pipe_lock(pipe);
+       pipe->readers++;
+       pipe->writers--;
+
+       while ((pipe->readers > 1) && (!signal_pending(current))) {
+               wake_up_interruptible_sync(&pipe->wait);
+               kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+               pipe_wait(pipe);
+       }
+
+       pipe->readers--;
+       pipe->writers++;
+       pipe_unlock(pipe);
+
+}
+
+
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
        struct core_state core_state;
@@ -1744,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
        char **helper_argv = NULL;
        int helper_argc = 0;
-       char *delimit;
+       int dump_count = 0;
+       static atomic_t core_dump_count = ATOMIC_INIT(0);
 
        audit_core_dumps(signr);
 
-       binfmt = current->binfmt;
+       binfmt = mm->binfmt;
        if (!binfmt || !binfmt->core_dump)
                goto fail;
 
@@ -1799,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        lock_kernel();
        ispipe = format_corename(corename, signr);
        unlock_kernel();
-       /*
-        * Don't bother to check the RLIMIT_CORE value if core_pattern points
-        * to a pipe.  Since we're not writing directly to the filesystem
-        * RLIMIT_CORE doesn't really apply, as no actual core file will be
-        * created unless the pipe reader choses to write out the core file
-        * at which point file size limits and permissions will be imposed
-        * as it does with any other process
-        */
+
        if ((!ispipe) && (core_limit < binfmt->min_coredump))
                goto fail_unlock;
 
        if (ispipe) {
+               if (core_limit == 0) {
+                       /*
+                        * Normally core limits are irrelevant to pipes, since
+                        * we're not writing to the file system, but we use
+                        * core_limit of 0 here as a special value. Any
+                        * non-zero limit gets set to RLIM_INFINITY below, but
+                        * a limit of 0 skips the dump.  This is a consistent
+                        * way to catch recursive crashes.  We can still crash
+                        * if the core_pattern binary sets RLIMIT_CORE to a non-zero
+                        * value and runs as root, where it can do lots of stupid things.
+                        * Note that we use task_tgid_vnr here to grab the pid
+                        * of the thread group leader.  That way we get the
+                        * right pid if a thread in a multi-threaded
+                        * core_pattern process dies.
+                        */
+                       printk(KERN_WARNING
+                               "Process %d(%s) has RLIMIT_CORE set to 0\n",
+                               task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_unlock;
+               }
+
+               dump_count = atomic_inc_return(&core_dump_count);
+               if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+                       printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+                              task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Skipping core dump\n");
+                       goto fail_dropcount;
+               }
+
                helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
                if (!helper_argv) {
                        printk(KERN_WARNING "%s failed to allocate memory\n",
                               __func__);
-                       goto fail_unlock;
-               }
-               /* Terminate the string before the first option */
-               delimit = strchr(corename, ' ');
-               if (delimit)
-                       *delimit = '\0';
-               delimit = strrchr(helper_argv[0], '/');
-               if (delimit)
-                       delimit++;
-               else
-                       delimit = helper_argv[0];
-               if (!strcmp(delimit, current->comm)) {
-                       printk(KERN_NOTICE "Recursive core dump detected, "
-                                       "aborting\n");
-                       goto fail_unlock;
+                       goto fail_dropcount;
                }
 
                core_limit = RLIM_INFINITY;
 
                /* SIGPIPE can happen, but it's just never processed */
-               if (call_usermodehelper_pipe(corename+1, helper_argv, NULL,
+               if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
                                &file)) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
                               corename);
-                       goto fail_unlock;
+                       goto fail_dropcount;
                }
        } else
                file = filp_open(corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
                                 0600);
        if (IS_ERR(file))
-               goto fail_unlock;
+               goto fail_dropcount;
        inode = file->f_path.dentry->d_inode;
        if (inode->i_nlink > 1)
                goto close_fail;        /* multiple links - don't dump */
@@ -1875,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        if (retval)
                current->signal->group_exit_code |= 0x80;
 close_fail:
+       if (ispipe && core_pipe_limit)
+               wait_for_dump_helpers(file);
        filp_close(file, NULL);
+fail_dropcount:
+       if (dump_count)
+               atomic_dec(&core_dump_count);
 fail_unlock:
        if (helper_argv)
                argv_free(helper_argv);
index 5ab10c3bbebec0dd1ba2ab81c340b6be415dcafd..9f500dec3b5901982adcc2618670f72ca4b3c01a 100644 (file)
@@ -214,7 +214,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
        }
 
        lock_super(sb);
-       lock_kernel();
        sbi = sb->s_fs_info;
        fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
        fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
@@ -245,7 +244,6 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 out:
        if (or)
                osd_end_request(or);
-       unlock_kernel();
        unlock_super(sb);
        kfree(fscb);
        return ret;
@@ -268,8 +266,6 @@ static void exofs_put_super(struct super_block *sb)
        int num_pend;
        struct exofs_sb_info *sbi = sb->s_fs_info;
 
-       lock_kernel();
-
        if (sb->s_dirt)
                exofs_write_super(sb);
 
@@ -286,8 +282,6 @@ static void exofs_put_super(struct super_block *sb)
        osduld_put_device(sbi->s_dev);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
-
-       unlock_kernel();
 }
 
 /*
index 1c1638f873a4ac6d0d1e92179c76721ac2c55c4a..ade634076d0ab75371a852f0ec3c50d198a9012e 100644 (file)
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
        .writepages             = ext2_writepages,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
        .direct_IO              = ext2_direct_IO,
        .writepages             = ext2_writepages,
        .migratepage            = buffer_migrate_page,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 /*
index cd098a7b77fc04b7255fe5586248faa67dbbfb07..acf1b14233275e891fd5e1d55560fed331add18c 100644 (file)
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
        .invalidatepage         = ext3_invalidatepage,
        .releasepage            = ext3_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 void ext3_set_aops(struct inode *inode)
index 3a798737e305756a493e6ad13f865b302f9174a8..064746fad5812e693ef6d3ef2578822a3007cadb 100644 (file)
@@ -3386,6 +3386,7 @@ static const struct address_space_operations ext4_ordered_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3402,7 @@ static const struct address_space_operations ext4_writeback_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3417,7 @@ static const struct address_space_operations ext4_journalled_aops = {
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3434,7 @@ static const struct address_space_operations ext4_da_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
 
 void ext4_set_aops(struct inode *inode)
index 8970d8c49bb00eaa791a390f5dfd87b56a438294..04629d1302fc45e42aef40b180cfd1fcc4b0f73b 100644 (file)
@@ -470,19 +470,11 @@ static void fat_put_super(struct super_block *sb)
 
        iput(sbi->fat_inode);
 
-       if (sbi->nls_disk) {
-               unload_nls(sbi->nls_disk);
-               sbi->nls_disk = NULL;
-               sbi->options.codepage = fat_default_codepage;
-       }
-       if (sbi->nls_io) {
-               unload_nls(sbi->nls_io);
-               sbi->nls_io = NULL;
-       }
-       if (sbi->options.iocharset != fat_default_iocharset) {
+       unload_nls(sbi->nls_disk);
+       unload_nls(sbi->nls_io);
+
+       if (sbi->options.iocharset != fat_default_iocharset)
                kfree(sbi->options.iocharset);
-               sbi->options.iocharset = fat_default_iocharset;
-       }
 
        sb->s_fs_info = NULL;
        kfree(sbi);
index ae413086db978093123b2a59947d78e23a02a693..fc089f2f7f56ccbbd5662eb63e2ec9832d768004 100644 (file)
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
        return pid;
 }
 
+static int f_setown_ex(struct file *filp, unsigned long arg)
+{
+       struct f_owner_ex * __user owner_p = (void * __user)arg;
+       struct f_owner_ex owner;
+       struct pid *pid;
+       int type;
+       int ret;
+
+       ret = copy_from_user(&owner, owner_p, sizeof(owner));
+       if (ret)
+               return ret;
+
+       switch (owner.type) {
+       case F_OWNER_TID:
+               type = PIDTYPE_MAX;
+               break;
+
+       case F_OWNER_PID:
+               type = PIDTYPE_PID;
+               break;
+
+       case F_OWNER_GID:
+               type = PIDTYPE_PGID;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       rcu_read_lock();
+       pid = find_vpid(owner.pid);
+       if (owner.pid && !pid)
+               ret = -ESRCH;
+       else
+               ret = __f_setown(filp, pid, type, 1);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static int f_getown_ex(struct file *filp, unsigned long arg)
+{
+       struct f_owner_ex * __user owner_p = (void * __user)arg;
+       struct f_owner_ex owner;
+       int ret = 0;
+
+       read_lock(&filp->f_owner.lock);
+       owner.pid = pid_vnr(filp->f_owner.pid);
+       switch (filp->f_owner.pid_type) {
+       case PIDTYPE_MAX:
+               owner.type = F_OWNER_TID;
+               break;
+
+       case PIDTYPE_PID:
+               owner.type = F_OWNER_PID;
+               break;
+
+       case PIDTYPE_PGID:
+               owner.type = F_OWNER_GID;
+               break;
+
+       default:
+               WARN_ON(1);
+               ret = -EINVAL;
+               break;
+       }
+       read_unlock(&filp->f_owner.lock);
+
+       if (!ret)
+               ret = copy_to_user(owner_p, &owner, sizeof(owner));
+       return ret;
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                struct file *filp)
 {
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
        case F_SETOWN:
                err = f_setown(filp, arg, 1);
                break;
+       case F_GETOWN_EX:
+               err = f_getown_ex(filp, arg);
+               break;
+       case F_SETOWN_EX:
+               err = f_setown_ex(filp, arg);
+               break;
        case F_GETSIG:
                err = filp->f_owner.signum;
                break;
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
 
 static void send_sigio_to_task(struct task_struct *p,
                               struct fown_struct *fown,
-                              int fd,
-                              int reason)
+                              int fd, int reason, int group)
 {
        /*
         * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
                        else
                                si.si_band = band_table[reason - POLL_IN];
                        si.si_fd    = fd;
-                       if (!group_send_sig_info(signum, &si, p))
+                       if (!do_send_sig_info(signum, &si, p, group))
                                break;
                /* fall-through: fall back on the old plain SIGIO signal */
                case 0:
-                       group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
+                       do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
        }
 }
 
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
+       int group = 1;
        
        read_lock(&fown->lock);
+
        type = fown->pid_type;
+       if (type == PIDTYPE_MAX) {
+               group = 0;
+               type = PIDTYPE_PID;
+       }
+
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;
        
        read_lock(&tasklist_lock);
        do_each_pid_task(pid, type, p) {
-               send_sigio_to_task(p, fown, fd, band);
+               send_sigio_to_task(p, fown, fd, band, group);
        } while_each_pid_task(pid, type, p);
        read_unlock(&tasklist_lock);
  out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
 }
 
 static void send_sigurg_to_task(struct task_struct *p,
-                                struct fown_struct *fown)
+                               struct fown_struct *fown, int group)
 {
        if (sigio_perm(p, fown, SIGURG))
-               group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
+               do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
 }
 
 int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
+       int group = 1;
        int ret = 0;
        
        read_lock(&fown->lock);
+
        type = fown->pid_type;
+       if (type == PIDTYPE_MAX) {
+               group = 0;
+               type = PIDTYPE_PID;
+       }
+
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
        
        read_lock(&tasklist_lock);
        do_each_pid_task(pid, type, p) {
-               send_sigurg_to_task(p, fown);
+               send_sigurg_to_task(p, fown, group);
        } while_each_pid_task(pid, type, p);
        read_unlock(&tasklist_lock);
  out_unlock_fown:
index 334ce39881f8fea36897196a262f4164cccde1f8..8eb44042e00934dbe1fe13c2af1fd659ddd8c296 100644 (file)
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
  * Handle nr_files sysctl
  */
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        files_stat.nr_files = get_nr_files();
-       return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #else
-int proc_nr_files(ctl_table *table, int write, struct file *filp,
+int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
index e703654e7f40d901975d3a0953ce353a755248d7..992f6c9410bb0f27c8e6b02f21505c1fbda26aa5 100644 (file)
@@ -1276,14 +1276,9 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
                return 0;
 
        if (attr->ia_valid & ATTR_SIZE) {
-               unsigned long limit;
-               if (IS_SWAPFILE(inode))
-                       return -ETXTBSY;
-               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-               if (limit != RLIM_INFINITY && attr->ia_size > (loff_t) limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
+               err = inode_newsize_ok(inode, attr->ia_size);
+               if (err)
+                       return err;
                is_truncate = true;
        }
 
@@ -1350,8 +1345,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
         * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
         */
        if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
-               if (outarg.attr.size < oldsize)
-                       fuse_truncate(inode->i_mapping, outarg.attr.size);
+               truncate_pagecache(inode, oldsize, outarg.attr.size);
                invalidate_inode_pages2(inode->i_mapping);
        }
 
index fc9c79feb5f7c2150edae61b4af2b56d2291f7b9..01cc462ff45d5ccdf0bc0bc0526e493ed60fc729 100644 (file)
@@ -606,8 +606,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
                                   u64 attr_valid);
 
-void fuse_truncate(struct address_space *mapping, loff_t offset);
-
 /**
  * Initialize the client device
  */
index 6da947daabda1894f55a5079226ade24b30d72e0..1a822ce2b24b7a83fd8c579921d07d769e4c2645 100644 (file)
@@ -140,14 +140,6 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
        return 0;
 }
 
-void fuse_truncate(struct address_space *mapping, loff_t offset)
-{
-       /* See vmtruncate() */
-       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-       truncate_inode_pages(mapping, offset);
-       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-}
-
 void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
                                   u64 attr_valid)
 {
@@ -205,8 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
        spin_unlock(&fc->lock);
 
        if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
-               if (attr->size < oldsize)
-                       fuse_truncate(inode->i_mapping, attr->size);
+               truncate_pagecache(inode, oldsize, attr->size);
                invalidate_inode_pages2(inode->i_mapping);
        }
 }
index 7ebae9a4ecc01b50533cf0316dc936626a3350b9..694b5d48f0366e1b535abb102cbf456f3ae2e2c6 100644 (file)
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
        .direct_IO = gfs2_direct_IO,
        .migratepage = buffer_migrate_page,
        .is_partially_uptodate = block_is_partially_uptodate,
+       .error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
        .direct_IO = gfs2_direct_IO,
        .migratepage = buffer_migrate_page,
        .is_partially_uptodate = block_is_partially_uptodate,
+       .error_remove_page = generic_error_remove_page,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
        .invalidatepage = gfs2_invalidatepage,
        .releasepage = gfs2_releasepage,
        .is_partially_uptodate = block_is_partially_uptodate,
+       .error_remove_page = generic_error_remove_page,
 };
 
 void gfs2_set_aops(struct inode *inode)
index 7b6165f25fbefe14cacb5a8be3464d79f20e84a1..8bbe03c3f6d54e5286ff3c0c0cce4e6c5f653134 100644 (file)
@@ -344,10 +344,8 @@ void hfs_mdb_put(struct super_block *sb)
        brelse(HFS_SB(sb)->mdb_bh);
        brelse(HFS_SB(sb)->alt_mdb_bh);
 
-       if (HFS_SB(sb)->nls_io)
-               unload_nls(HFS_SB(sb)->nls_io);
-       if (HFS_SB(sb)->nls_disk)
-               unload_nls(HFS_SB(sb)->nls_disk);
+       unload_nls(HFS_SB(sb)->nls_io);
+       unload_nls(HFS_SB(sb)->nls_disk);
 
        free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
        kfree(HFS_SB(sb));
index c0759fe0855b623863eac1345c8b8e4bf35608a9..43022f3d514871d9f2405ff32eacbd1319f8e26a 100644 (file)
@@ -229,8 +229,7 @@ static void hfsplus_put_super(struct super_block *sb)
        iput(HFSPLUS_SB(sb).alloc_file);
        iput(HFSPLUS_SB(sb).hidden_dir);
        brelse(HFSPLUS_SB(sb).s_vhbh);
-       if (HFSPLUS_SB(sb).nls)
-               unload_nls(HFSPLUS_SB(sb).nls);
+       unload_nls(HFSPLUS_SB(sb).nls);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
 
@@ -464,8 +463,7 @@ out:
 
 cleanup:
        hfsplus_put_super(sb);
-       if (nls)
-               unload_nls(nls);
+       unload_nls(nls);
        return err;
 }
 
index eba6d552d9c901668d7803c5f3a3569bbaab081b..87a1258953b8e387fe49367acfd6d2ff5c164ca1 100644 (file)
@@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 
 static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
 {
-       struct super_block *sb = inode->i_sb;
-
-       if (!hlist_unhashed(&inode->i_hash)) {
-               if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-                       list_move(&inode->i_list, &inode_unused);
-               inodes_stat.nr_unused++;
-               if (!sb || (sb->s_flags & MS_ACTIVE)) {
-                       spin_unlock(&inode_lock);
-                       return;
-               }
-               inode->i_state |= I_WILL_FREE;
-               spin_unlock(&inode_lock);
-               /*
-                * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
-                * in our backing_dev_info.
-                */
-               write_inode_now(inode, 1);
-               spin_lock(&inode_lock);
-               inode->i_state &= ~I_WILL_FREE;
-               inodes_stat.nr_unused--;
-               hlist_del_init(&inode->i_hash);
+       if (generic_detach_inode(inode)) {
+               truncate_hugepages(inode, 0);
+               clear_inode(inode);
+               destroy_inode(inode);
        }
-       list_del_init(&inode->i_list);
-       list_del_init(&inode->i_sb_list);
-       inode->i_state |= I_FREEING;
-       inodes_stat.nr_inodes--;
-       spin_unlock(&inode_lock);
-       truncate_hugepages(inode, 0);
-       clear_inode(inode);
-       destroy_inode(inode);
 }
 
 static void hugetlbfs_drop_inode(struct inode *inode)
@@ -936,15 +911,9 @@ static struct file_system_type hugetlbfs_fs_type = {
 
 static struct vfsmount *hugetlbfs_vfsmount;
 
-static int can_do_hugetlb_shm(int creat_flags)
+static int can_do_hugetlb_shm(void)
 {
-       if (creat_flags != HUGETLB_SHMFS_INODE)
-               return 0;
-       if (capable(CAP_IPC_LOCK))
-               return 1;
-       if (in_group_p(sysctl_hugetlb_shm_group))
-               return 1;
-       return 0;
+       return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
@@ -960,7 +929,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
        if (!hugetlbfs_vfsmount)
                return ERR_PTR(-ENOENT);
 
-       if (!can_do_hugetlb_shm(creat_flags)) {
+       if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
                        WARN_ONCE(1,
index 76582b06ab975d76afcba7f5800d6c8e3873ebc6..4d8e3be55976272732f6f42610ab7813f66b8133 100644 (file)
@@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(generic_delete_inode);
 
-static void generic_forget_inode(struct inode *inode)
+/**
+ *     generic_detach_inode - remove inode from inode lists
+ *     @inode: inode to remove
+ *
+ *     Remove inode from inode lists, write it if it's dirty. This is just an
+ *     internal VFS helper exported for hugetlbfs. Do not use!
+ *
+ *     Returns 1 if inode should be completely destroyed.
+ */
+int generic_detach_inode(struct inode *inode)
 {
        struct super_block *sb = inode->i_sb;
 
@@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode)
                inodes_stat.nr_unused++;
                if (sb->s_flags & MS_ACTIVE) {
                        spin_unlock(&inode_lock);
-                       return;
+                       return 0;
                }
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state |= I_WILL_FREE;
@@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode)
        inode->i_state |= I_FREEING;
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);
+       return 1;
+}
+EXPORT_SYMBOL_GPL(generic_detach_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+       if (!generic_detach_inode(inode))
+               return;
        if (inode->i_data.nrpages)
                truncate_inode_pages(&inode->i_data, 0);
        clear_inode(inode);
@@ -1399,31 +1416,31 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        struct timespec now;
 
-       if (mnt_want_write(mnt))
-               return;
        if (inode->i_flags & S_NOATIME)
-               goto out;
+               return;
        if (IS_NOATIME(inode))
-               goto out;
+               return;
        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-               goto out;
+               return;
 
        if (mnt->mnt_flags & MNT_NOATIME)
-               goto out;
+               return;
        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-               goto out;
+               return;
 
        now = current_fs_time(inode->i_sb);
 
        if (!relatime_need_update(mnt, inode, now))
-               goto out;
+               return;
 
        if (timespec_equal(&inode->i_atime, &now))
-               goto out;
+               return;
+
+       if (mnt_want_write(mnt))
+               return;
 
        inode->i_atime = now;
        mark_inode_dirty_sync(inode);
-out:
        mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
@@ -1444,34 +1461,37 @@ void file_update_time(struct file *file)
 {
        struct inode *inode = file->f_path.dentry->d_inode;
        struct timespec now;
-       int sync_it = 0;
-       int err;
+       enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
 
+       /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return;
 
-       err = mnt_want_write_file(file);
-       if (err)
-               return;
-
        now = current_fs_time(inode->i_sb);
-       if (!timespec_equal(&inode->i_mtime, &now)) {
-               inode->i_mtime = now;
-               sync_it = 1;
-       }
+       if (!timespec_equal(&inode->i_mtime, &now))
+               sync_it = S_MTIME;
 
-       if (!timespec_equal(&inode->i_ctime, &now)) {
-               inode->i_ctime = now;
-               sync_it = 1;
-       }
+       if (!timespec_equal(&inode->i_ctime, &now))
+               sync_it |= S_CTIME;
 
-       if (IS_I_VERSION(inode)) {
-               inode_inc_iversion(inode);
-               sync_it = 1;
-       }
+       if (IS_I_VERSION(inode))
+               sync_it |= S_VERSION;
+
+       if (!sync_it)
+               return;
 
-       if (sync_it)
-               mark_inode_dirty_sync(inode);
+       /* Finally allowed to write? Takes lock. */
+       if (mnt_want_write_file(file))
+               return;
+
+       /* Only change inode inside the lock region */
+       if (sync_it & S_VERSION)
+               inode_inc_iversion(inode);
+       if (sync_it & S_CTIME)
+               inode->i_ctime = now;
+       if (sync_it & S_MTIME)
+               inode->i_mtime = now;
+       mark_inode_dirty_sync(inode);
        mnt_drop_write(file->f_path.mnt);
 }
 EXPORT_SYMBOL(file_update_time);
@@ -1599,7 +1619,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
        else if (S_ISSOCK(mode))
                inode->i_fop = &bad_sock_fops;
        else
-               printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
-                      mode);
+               printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
+                                 " inode %s:%lu\n", mode, inode->i_sb->s_id,
+                                 inode->i_ino);
 }
 EXPORT_SYMBOL(init_special_inode);
index d55ef562f0bb588939d3b870d675a94e163a5a1e..515175b8b72e95f47893d3b1f32017da481e0aec 100644 (file)
@@ -57,6 +57,7 @@ extern int check_unsafe_exec(struct linux_binprm *);
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
+extern int copy_mount_string(const void __user *, char **);
 
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
index 5612880fcbe7d7436f3579c7c1add7360170e407..7b17a14396ff792152ae6da49d178355d5a5e7ea 100644 (file)
@@ -162,20 +162,21 @@ EXPORT_SYMBOL(fiemap_check_flags);
 static int fiemap_check_ranges(struct super_block *sb,
                               u64 start, u64 len, u64 *new_len)
 {
+       u64 maxbytes = (u64) sb->s_maxbytes;
+
        *new_len = len;
 
        if (len == 0)
                return -EINVAL;
 
-       if (start > sb->s_maxbytes)
+       if (start > maxbytes)
                return -EFBIG;
 
        /*
         * Shrink request scope to what the fs can actually handle.
         */
-       if ((len > sb->s_maxbytes) ||
-           (sb->s_maxbytes - len) < start)
-               *new_len = sb->s_maxbytes - start;
+       if (len > maxbytes || (maxbytes - len) < start)
+               *new_len = maxbytes - start;
 
        return 0;
 }
index 85f96bc651c727ba04915138c50b45e9b63a3acd..6b4dcd4f2943e632c9d89115a2395084be95adc5 100644 (file)
@@ -46,10 +46,7 @@ static void isofs_put_super(struct super_block *sb)
 #ifdef CONFIG_JOLIET
        lock_kernel();
 
-       if (sbi->s_nls_iocharset) {
-               unload_nls(sbi->s_nls_iocharset);
-               sbi->s_nls_iocharset = NULL;
-       }
+       unload_nls(sbi->s_nls_iocharset);
 
        unlock_kernel();
 #endif
@@ -912,8 +909,7 @@ out_no_root:
                printk(KERN_WARNING "%s: get root inode failed\n", __func__);
 out_no_inode:
 #ifdef CONFIG_JOLIET
-       if (sbi->s_nls_iocharset)
-               unload_nls(sbi->s_nls_iocharset);
+       unload_nls(sbi->s_nls_iocharset);
 #endif
        goto out_freesbi;
 out_no_read:
index 37e6dcda8fc84f587508f8db58abdecee69524e1..2234c73fc5773531bd59ee81b3bbdf45d05a83ff 100644 (file)
@@ -178,13 +178,11 @@ static void jfs_put_super(struct super_block *sb)
        rc = jfs_umount(sb);
        if (rc)
                jfs_err("jfs_umount failed with return code %d", rc);
-       if (sbi->nls_tab)
-               unload_nls(sbi->nls_tab);
-       sbi->nls_tab = NULL;
+
+       unload_nls(sbi->nls_tab);
 
        truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
        iput(sbi->direct_inode);
-       sbi->direct_inode = NULL;
 
        kfree(sbi);
 
@@ -347,8 +345,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 
        if (nls_map != (void *) -1) {
                /* Discard old (if remount) */
-               if (sbi->nls_tab)
-                       unload_nls(sbi->nls_tab);
+               unload_nls(sbi->nls_tab);
                sbi->nls_tab = nls_map;
        }
        return 1;
index dcec3d3ea64f944cd51d36f35b4150279c5659f2..219576c52d807e15b779d53be3a42dfa1baae9ae 100644 (file)
@@ -527,14 +527,18 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
 {
        loff_t pos = *ppos;
+       size_t ret;
+
        if (pos < 0)
                return -EINVAL;
-       if (pos >= available)
+       if (pos >= available || !count)
                return 0;
        if (count > available - pos)
                count = available - pos;
-       if (copy_to_user(to, from + pos, count))
+       ret = copy_to_user(to, from + pos, count);
+       if (ret == count)
                return -EFAULT;
+       count -= ret;
        *ppos = pos + count;
        return count;
 }
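
With the change above, simple_read_from_buffer() now degrades gracefully on a partially faulting user buffer: it returns the bytes actually copied and only reports -EFAULT when nothing could be copied. A hedged sketch of a typical caller, assuming a hypothetical driver exposing a fixed in-kernel string:

static const char foo_status[] = "ok\n";

static ssize_t foo_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        /* handles offset, short copies and EOF for us */
        return simple_read_from_buffer(buf, count, ppos,
                                       foo_status, sizeof(foo_status) - 1);
}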
@@ -735,10 +739,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
        if (copy_from_user(attr->set_buf, buf, size))
                goto out;
 
-       ret = len; /* claim we got the whole input */
        attr->set_buf[size] = '\0';
        val = simple_strtol(attr->set_buf, NULL, 0);
-       attr->set(attr->data, val);
+       ret = attr->set(attr->data, val);
+       if (ret == 0)
+               ret = len; /* on success, claim we got the whole input */
 out:
        mutex_unlock(&attr->mutex);
        return ret;
index 7230787d18b02979122218429b4bf8af59b24cf7..bdc3cb4fd2220c6fde0f35159a902768a75d60f6 100644 (file)
@@ -1640,7 +1640,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
 {
        struct vfsmount *mnt;
 
-       if (!type || !memchr(type, 0, PAGE_SIZE))
+       if (!type)
                return -EINVAL;
 
        /* we need capabilities... */
@@ -1871,6 +1871,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
        return 0;
 }
 
+int copy_mount_string(const void __user *data, char **where)
+{
+       char *tmp;
+
+       if (!data) {
+               *where = NULL;
+               return 0;
+       }
+
+       tmp = strndup_user(data, PAGE_SIZE);
+       if (IS_ERR(tmp))
+               return PTR_ERR(tmp);
+
+       *where = tmp;
+       return 0;
+}
+
 /*
  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1900,8 +1917,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 
        if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
                return -EINVAL;
-       if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
-               return -EINVAL;
 
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;
@@ -2070,40 +2085,42 @@ EXPORT_SYMBOL(create_mnt_ns);
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
 {
-       int retval;
+       int ret;
+       char *kernel_type;
+       char *kernel_dir;
+       char *kernel_dev;
        unsigned long data_page;
-       unsigned long type_page;
-       unsigned long dev_page;
-       char *dir_page;
 
-       retval = copy_mount_options(type, &type_page);
-       if (retval < 0)
-               return retval;
+       ret = copy_mount_string(type, &kernel_type);
+       if (ret < 0)
+               goto out_type;
 
-       dir_page = getname(dir_name);
-       retval = PTR_ERR(dir_page);
-       if (IS_ERR(dir_page))
-               goto out1;
+       kernel_dir = getname(dir_name);
+       if (IS_ERR(kernel_dir)) {
+               ret = PTR_ERR(kernel_dir);
+               goto out_dir;
+       }
 
-       retval = copy_mount_options(dev_name, &dev_page);
-       if (retval < 0)
-               goto out2;
+       ret = copy_mount_string(dev_name, &kernel_dev);
+       if (ret < 0)
+               goto out_dev;
 
-       retval = copy_mount_options(data, &data_page);
-       if (retval < 0)
-               goto out3;
+       ret = copy_mount_options(data, &data_page);
+       if (ret < 0)
+               goto out_data;
 
-       retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
-                         flags, (void *)data_page);
-       free_page(data_page);
+       ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+               (void *) data_page);
 
-out3:
-       free_page(dev_page);
-out2:
-       putname(dir_page);
-out1:
-       free_page(type_page);
-       return retval;
+       free_page(data_page);
+out_data:
+       kfree(kernel_dev);
+out_dev:
+       putname(kernel_dir);
+out_dir:
+       kfree(kernel_type);
+out_type:
+       return ret;
 }
 
 /*
index b99ce205b1bd7d6715b81ca7d87452d0011ecb2d..cf98da1be23e861dbbeedc76849969f0fa454aff 100644 (file)
@@ -746,16 +746,8 @@ static void ncp_put_super(struct super_block *sb)
 
 #ifdef CONFIG_NCPFS_NLS
        /* unload the NLS charsets */
-       if (server->nls_vol)
-       {
-               unload_nls(server->nls_vol);
-               server->nls_vol = NULL;
-       }
-       if (server->nls_io)
-       {
-               unload_nls(server->nls_io);
-               server->nls_io = NULL;
-       }
+       unload_nls(server->nls_vol);
+       unload_nls(server->nls_io);
 #endif /* CONFIG_NCPFS_NLS */
 
        if (server->info_filp)
index 53a7ed7eb9c66da4b69090789afae1e3c5772e7b..0d58caf4a6e1414e51ecd55b09dc11ebbb11beec 100644 (file)
@@ -223,10 +223,8 @@ ncp_set_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
        oldset_io = server->nls_io;
        server->nls_io = iocharset;
 
-       if (oldset_cp)
-               unload_nls(oldset_cp);
-       if (oldset_io)
-               unload_nls(oldset_io);
+       unload_nls(oldset_cp);
+       unload_nls(oldset_io);
 
        return 0;
 }
index 5021b75d2d1e65910177128b719efb20ec098077..86d6b4db1096ea47529c472b819ec7bdb483672c 100644 (file)
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
        .direct_IO = nfs_direct_IO,
        .migratepage = nfs_migrate_page,
        .launder_page = nfs_launder_page,
+       .error_remove_page = generic_error_remove_page,
 };
 
 /*
index 060022b4651c38744c533413ed355f161bd05aad..faa091865ad05c956114ab7c7642994845717382 100644 (file)
@@ -458,49 +458,21 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
-       if (i_size_read(inode) < offset) {
-               unsigned long limit;
-
-               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-               if (limit != RLIM_INFINITY && offset > limit)
-                       goto out_sig;
-               if (offset > inode->i_sb->s_maxbytes)
-                       goto out_big;
-               spin_lock(&inode->i_lock);
-               i_size_write(inode, offset);
-               spin_unlock(&inode->i_lock);
-       } else {
-               struct address_space *mapping = inode->i_mapping;
+       loff_t oldsize;
+       int err;
 
-               /*
-                * truncation of in-use swapfiles is disallowed - it would
-                * cause subsequent swapout to scribble on the now-freed
-                * blocks.
-                */
-               if (IS_SWAPFILE(inode))
-                       return -ETXTBSY;
-               spin_lock(&inode->i_lock);
-               i_size_write(inode, offset);
-               spin_unlock(&inode->i_lock);
+       err = inode_newsize_ok(inode, offset);
+       if (err)
+               goto out;
 
-               /*
-                * unmap_mapping_range is called twice, first simply for
-                * efficiency so that truncate_inode_pages does fewer
-                * single-page unmaps.  However after this first call, and
-                * before truncate_inode_pages finishes, it is possible for
-                * private pages to be COWed, which remain after
-                * truncate_inode_pages finishes, hence the second
-                * unmap_mapping_range call must be made for correctness.
-                */
-               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-               truncate_inode_pages(mapping, offset);
-               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-       }
-       return 0;
-out_sig:
-       send_sig(SIGXFSZ, current, 0);
-out_big:
-       return -EFBIG;
+       spin_lock(&inode->i_lock);
+       oldsize = inode->i_size;
+       i_size_write(inode, offset);
+       spin_unlock(&inode->i_lock);
+
+       truncate_pagecache(inode, oldsize, offset);
+out:
+       return err;
 }
 
 /**
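
fuse, ramfs and the nfs_vmtruncate() rewrite above all converge on the same helper pair. A hedged sketch of the pattern for a hypothetical filesystem: validate the new size with inode_newsize_ok() (rlimit, s_maxbytes and swapfile checks), update i_size, then let truncate_pagecache() unmap and drop pages beyond the new end of file:

static int foofs_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize;
        int err;

        err = inode_newsize_ok(inode, newsize);
        if (err)
                return err;

        /* take whatever lock the filesystem requires around i_size_write() */
        oldsize = i_size_read(inode);
        i_size_write(inode, newsize);
        /* unmap COWable private mappings and drop cached pages past EOF */
        truncate_pagecache(inode, oldsize, newsize);
        return 0;
}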
index 477d37d83b316367e1ac04fb31ba98e375a37b1a..2224b4d07bf0116b478f9d8cf565c424f9b64b78 100644 (file)
@@ -270,7 +270,8 @@ struct nls_table *load_nls(char *charset)
 
 void unload_nls(struct nls_table *nls)
 {
-       module_put(nls->owner);
+       if (nls)
+               module_put(nls->owner);
 }
 
 static const wchar_t charset2uni[256] = {
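
The hunk above makes unload_nls() tolerate a NULL table, which is what lets the per-filesystem "if (tab) unload_nls(tab)" checks elsewhere in this series collapse to plain calls. A hedged sketch of a teardown path for a hypothetical filesystem:

struct foofs_sb_info {                  /* hypothetical per-sb state */
        struct nls_table *nls_io;
        struct nls_table *nls_disk;
};

static void foofs_put_nls(struct foofs_sb_info *sbi)
{
        unload_nls(sbi->nls_io);        /* safe even if never loaded */
        unload_nls(sbi->nls_disk);
        sbi->nls_io = NULL;
        sbi->nls_disk = NULL;
}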
index b38f944f0667351b69d947acf0e15660d995834b..cfce53cb65d76e0177601b129d972b8365d7d892 100644 (file)
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
        .migratepage    = buffer_migrate_page,  /* Move a page cache page from
                                                   one physical page to an
                                                   other. */
+       .error_remove_page = generic_error_remove_page,
 };
 
 /**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
        .migratepage    = buffer_migrate_page,  /* Move a page cache page from
                                                   one physical page to an
                                                   other. */
+       .error_remove_page = generic_error_remove_page,
 };
 
 #ifdef NTFS_RW
index abaaa1cbf8de4dac7ac9e8bfb6e143e82b06bff7..80b04770e8e9c22a64a4becf0686260627e0d7e6 100644 (file)
@@ -201,8 +201,7 @@ use_utf8:
                                                v, old_nls->charset);
                                nls_map = old_nls;
                        } else /* nls_map */ {
-                               if (old_nls)
-                                       unload_nls(old_nls);
+                               unload_nls(old_nls);
                        }
                } else if (!strcmp(p, "utf8")) {
                        bool val = false;
@@ -2427,10 +2426,9 @@ static void ntfs_put_super(struct super_block *sb)
                ntfs_free(vol->upcase);
                vol->upcase = NULL;
        }
-       if (vol->nls_map) {
-               unload_nls(vol->nls_map);
-               vol->nls_map = NULL;
-       }
+
+       unload_nls(vol->nls_map);
+
        sb->s_fs_info = NULL;
        kfree(vol);
 
index 72e76062a900d2555fac2dae44ef325a5d8aa897..deb2b132ae5ed42b68fd11f58413f2ffa4779b83 100644 (file)
@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
        .releasepage            = ocfs2_releasepage,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
index 171e052c07b3684f4a4264ebb44e802affc20f6e..c7bff4f603ff1557f7663fa22b9fb7ae8ed8a2e8 100644 (file)
@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                "Committed_AS:   %8lu kB\n"
                "VmallocTotal:   %8lu kB\n"
                "VmallocUsed:    %8lu kB\n"
-               "VmallocChunk:   %8lu kB\n",
+               "VmallocChunk:   %8lu kB\n"
+#ifdef CONFIG_MEMORY_FAILURE
+               "HardwareCorrupted: %8lu kB\n"
+#endif
+               ,
                K(i.totalram),
                K(i.freeram),
                K(i.bufferram),
@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                (unsigned long)VMALLOC_TOTAL >> 10,
                vmi.used >> 10,
                vmi.largest_chunk >> 10
+#ifdef CONFIG_MEMORY_FAILURE
+               ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+#endif
                );
 
        hugetlb_report_meminfo(m);
index 9b1e4e9a16bfd0f1ee07f109b1964f5700c1a577..f667e8aeabdf1145b42f4cecdf082246eed1e030 100644 (file)
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 
        /* careful: calling conventions are nasty here */
        res = count;
-       error = table->proc_handler(table, write, filp, buf, &res, ppos);
+       error = table->proc_handler(table, write, buf, &res, ppos);
        if (!error)
                error = res;
 out:
index 11f0c06316ded778283af0bdbe53d06101225df0..32fae4040ebf46a94bc84cf1260c2edebce0c330 100644 (file)
@@ -69,14 +69,11 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
        /* make various checks */
        order = get_order(newsize);
        if (unlikely(order >= MAX_ORDER))
-               goto too_big;
+               return -EFBIG;
 
-       limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-       if (limit != RLIM_INFINITY && newsize > limit)
-               goto fsize_exceeded;
-
-       if (newsize > inode->i_sb->s_maxbytes)
-               goto too_big;
+       ret = inode_newsize_ok(inode, newsize);
+       if (ret)
+               return ret;
 
        i_size_write(inode, newsize);
 
@@ -118,12 +115,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
        return 0;
 
- fsize_exceeded:
-       send_sig(SIGXFSZ, current, 0);
- too_big:
-       return -EFBIG;
-
- add_error:
+add_error:
        while (loop < npages)
                __free_page(pages + loop++);
        return ret;
index 6c8c55dec2bcd6b2759f9698abab5b663258cc4b..3ac28987f22a38b3c2005a74e8ee3dac01621679 100644 (file)
@@ -839,9 +839,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 
        pos = *ppos;
-       retval = -EINVAL;
-       if (unlikely(pos < 0))
-               goto fput_out;
        if (unlikely(pos + count > max)) {
                retval = -EOVERFLOW;
                if (pos >= max)
index 47f132df0c3f0a630673831c2b862ce96ac72968..c117fa80d1e9b9ddc9be29a7d412022b9d158010 100644 (file)
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
        pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
 
        root = romfs_iget(sb, pos);
-       if (!root)
+       if (IS_ERR(root))
                goto error;
 
        sb->s_root = d_alloc_root(root);
index 6c959275f2d0ef41577d9cdb09086f3b9c670883..eae7d9dbf3ffed297d62b8321137de25ba965c83 100644 (file)
@@ -429,20 +429,21 @@ EXPORT_SYMBOL(mangle_path);
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
-       if (m->count < m->size) {
-               char *s = m->buf + m->count;
-               char *p = d_path(path, s, m->size - m->count);
+       char *buf;
+       size_t size = seq_get_buf(m, &buf);
+       int res = -1;
+
+       if (size) {
+               char *p = d_path(path, buf, size);
                if (!IS_ERR(p)) {
-                       s = mangle_path(s, p, esc);
-                       if (s) {
-                               p = m->buf + m->count;
-                               m->count = s - m->buf;
-                               return s - p;
-                       }
+                       char *end = mangle_path(buf, p, esc);
+                       if (end)
+                               res = end - buf;
                }
        }
-       m->count = m->size;
-       return -1;
+       seq_commit(m, res);
+
+       return res;
 }
 EXPORT_SYMBOL(seq_path);
 
@@ -454,26 +455,28 @@ EXPORT_SYMBOL(seq_path);
 int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
                  char *esc)
 {
-       int err = -ENAMETOOLONG;
-       if (m->count < m->size) {
-               char *s = m->buf + m->count;
+       char *buf;
+       size_t size = seq_get_buf(m, &buf);
+       int res = -ENAMETOOLONG;
+
+       if (size) {
                char *p;
 
                spin_lock(&dcache_lock);
-               p = __d_path(path, root, s, m->size - m->count);
+               p = __d_path(path, root, buf, size);
                spin_unlock(&dcache_lock);
-               err = PTR_ERR(p);
+               res = PTR_ERR(p);
                if (!IS_ERR(p)) {
-                       s = mangle_path(s, p, esc);
-                       if (s) {
-                               p = m->buf + m->count;
-                               m->count = s - m->buf;
-                               return 0;
-                       }
+                       char *end = mangle_path(buf, p, esc);
+                       if (end)
+                               res = end - buf;
+                       else
+                               res = -ENAMETOOLONG;
                }
        }
-       m->count = m->size;
-       return err;
+       seq_commit(m, res);
+
+       return res < 0 ? res : 0;
 }
 
 /*
@@ -481,20 +484,21 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
  */
 int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 {
-       if (m->count < m->size) {
-               char *s = m->buf + m->count;
-               char *p = dentry_path(dentry, s, m->size - m->count);
+       char *buf;
+       size_t size = seq_get_buf(m, &buf);
+       int res = -1;
+
+       if (size) {
+               char *p = dentry_path(dentry, buf, size);
                if (!IS_ERR(p)) {
-                       s = mangle_path(s, p, esc);
-                       if (s) {
-                               p = m->buf + m->count;
-                               m->count = s - m->buf;
-                               return s - p;
-                       }
+                       char *end = mangle_path(buf, p, esc);
+                       if (end)
+                               res = end - buf;
                }
        }
-       m->count = m->size;
-       return -1;
+       seq_commit(m, res);
+
+       return res;
 }
 
 int seq_bitmap(struct seq_file *m, const unsigned long *bits,
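
seq_path(), seq_path_root() and seq_dentry() now share the seq_get_buf()/seq_commit() idiom instead of poking m->buf and m->count directly. A hedged sketch of that idiom for a hypothetical emitter: take the free tail of the buffer, format into it, then commit either the bytes used or -1 to signal overflow so the core retries with a larger buffer:

static int foo_seq_emit(struct seq_file *m, const char *msg)
{
        char *buf;
        size_t size = seq_get_buf(m, &buf);
        int res = -1;

        if (size) {
                size_t len = strlcpy(buf, msg, size);
                if (len < size)         /* fit, including the NUL */
                        res = len;
        }
        seq_commit(m, res);

        return res;
}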
index 1402d2d54f5239d43663e01c792506946bebdbe9..1c4c8f089970041c49de182704bf537dbfce9259 100644 (file)
@@ -459,14 +459,8 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
 static void
 smb_unload_nls(struct smb_sb_info *server)
 {
-       if (server->remote_nls) {
-               unload_nls(server->remote_nls);
-               server->remote_nls = NULL;
-       }
-       if (server->local_nls) {
-               unload_nls(server->local_nls);
-               server->local_nls = NULL;
-       }
+       unload_nls(server->remote_nls);
+       unload_nls(server->local_nls);
 }
 
 static void
index 0e7207b9815c78a33f237e678da91f0adb969c45..19eb70b374bcb3239cd7a75ee901336f942b02e4 100644 (file)
@@ -465,6 +465,48 @@ rescan:
 }
 
 EXPORT_SYMBOL(get_super);
+
+/**
+ * get_active_super - get an active reference to the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given.  Returns the superblock with an active
+ * reference and s_umount held exclusively or %NULL if none was found.
+ */
+struct super_block *get_active_super(struct block_device *bdev)
+{
+       struct super_block *sb;
+
+       if (!bdev)
+               return NULL;
+
+       spin_lock(&sb_lock);
+       list_for_each_entry(sb, &super_blocks, s_list) {
+               if (sb->s_bdev != bdev)
+                       continue;
+
+               sb->s_count++;
+               spin_unlock(&sb_lock);
+               down_write(&sb->s_umount);
+               if (sb->s_root) {
+                       spin_lock(&sb_lock);
+                       if (sb->s_count > S_BIAS) {
+                               atomic_inc(&sb->s_active);
+                               sb->s_count--;
+                               spin_unlock(&sb_lock);
+                               return sb;
+                       }
+                       spin_unlock(&sb_lock);
+               }
+               up_write(&sb->s_umount);
+               put_super(sb);
+               yield();
+               spin_lock(&sb_lock);
+       }
+       spin_unlock(&sb_lock);
+       return NULL;
+}
  
 struct super_block * user_get_super(dev_t dev)
 {
@@ -527,11 +569,15 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
        int retval;
        int remount_rw;
-       
+
+       if (sb->s_frozen != SB_UNFROZEN)
+               return -EBUSY;
+
 #ifdef CONFIG_BLOCK
        if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
                return -EACCES;
 #endif
+
        if (flags & MS_RDONLY)
                acct_auto_close(sb);
        shrink_dcache_sb(sb);
@@ -743,9 +789,14 @@ int get_sb_bdev(struct file_system_type *fs_type,
         * will protect the lockfs code from trying to start a snapshot
         * while we are mounting
         */
-       down(&bdev->bd_mount_sem);
+       mutex_lock(&bdev->bd_fsfreeze_mutex);
+       if (bdev->bd_fsfreeze_count > 0) {
+               mutex_unlock(&bdev->bd_fsfreeze_mutex);
+               error = -EBUSY;
+               goto error_bdev;
+       }
        s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-       up(&bdev->bd_mount_sem);
+       mutex_unlock(&bdev->bd_fsfreeze_mutex);
        if (IS_ERR(s))
                goto error_s;
 
@@ -892,6 +943,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
        if (error)
                goto out_sb;
 
+       /*
+        * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
+        * but s_maxbytes was an unsigned long long for many releases. Throw
+        * this warning for a little while to try and catch filesystems that
+        * violate this rule. This warning should be either removed or
+        * converted to a BUG() in 2.6.34.
+        */
+       WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+               "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+
        mnt->mnt_mountpoint = mnt->mnt_root;
        mnt->mnt_parent = mnt;
        up_write(&mnt->mnt_sb->s_umount);
index d5e5559e31db3774ba6f76e350302009ccf831eb..381854461b282fe928c93cf61efb41edb1ad1acf 100644 (file)
@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
 };
index 916c0ffb6083de2be56fdf56656ee199e248df73..c5bc67c4e3bb32527acca1e560797bec87e217cb 100644 (file)
@@ -26,7 +26,6 @@ STATIC int
 xfs_stats_clear_proc_handler(
        ctl_table       *ctl,
        int             write,
-       struct file     *filp,
        void            __user *buffer,
        size_t          *lenp,
        loff_t          *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
        int             c, ret, *valp = ctl->data;
        __uint32_t      vn_active;
 
-       ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
        if (!ret && write && *valp) {
                printk("XFS Clearing xfsstats\n");
index 4d3e48373e74eee4ce2881bc56442c7ef59409c3..0c3dd8603927ac2d20394a8e531d82eb963dfc34 100644 (file)
 #define F_SETSIG       10      /* for sockets. */
 #define F_GETSIG       11      /* for sockets. */
 #endif
+#ifndef F_SETOWN_EX
+#define F_SETOWN_EX    12
+#define F_GETOWN_EX    13
+#endif
+
+#define F_OWNER_TID    0
+#define F_OWNER_PID    1
+#define F_OWNER_GID    2
+
+struct f_owner_ex {
+       int     type;
+       pid_t   pid;
+};
 
 /* for F_[GET|SET]FL */
 #define FD_CLOEXEC     1       /* actually anything with low bit set goes */
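
The new F_SETOWN_EX/F_GETOWN_EX commands let SIGIO/SIGURG ownership be pinned to a single thread (F_OWNER_TID) instead of a whole process or process group. A hedged user-space sketch; the fallback definitions mirror the values in the hunk above and are only needed while libc headers predate this interface:

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef F_SETOWN_EX
#define F_SETOWN_EX     12
#define F_GETOWN_EX     13
#define F_OWNER_TID     0
struct f_owner_ex {
        int     type;
        pid_t   pid;
};
#endif

/* direct SIGIO for @fd at the calling thread only */
static int own_fd_by_this_thread(int fd)
{
        struct f_owner_ex owner = {
                .type = F_OWNER_TID,
                .pid  = (pid_t) syscall(SYS_gettid),
        };

        return fcntl(fd, F_SETOWN_EX, &owner);
}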
index dd63bd38864b23627f36e57b53c8413af63cd238..5ee13b2fd223599422446b509227eadb24b7f7e8 100644 (file)
@@ -34,6 +34,7 @@
 #define MADV_REMOVE    9               /* remove these pages & resources */
 #define MADV_DONTFORK  10              /* don't inherit across fork */
 #define MADV_DOFORK    11              /* do inherit across fork */
+#define MADV_HWPOISON  100             /* poison a page for testing */
 
 #define MADV_MERGEABLE   12            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13            /* KSM may not merge identical pages */
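
MADV_HWPOISON is the user-visible trigger for the memory-failure code added in this merge: a privileged test program can poison one of its own pages and exercise the recovery path without real hardware faults. A hedged sketch; the fallback define mirrors the hunk above, and the call is expected to require CAP_SYS_ADMIN and a CONFIG_MEMORY_FAILURE kernel:

#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100               /* value from the hunk above */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                       /* make sure the page is populated */
        if (madvise(p, psz, MADV_HWPOISON))
                perror("madvise(MADV_HWPOISON)");
        return 0;
}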
index c840719a8c595cea463df2113d4934e6ff2a0ff8..942d30b5aab15186cef7ca2709f362b4ea393176 100644 (file)
@@ -82,6 +82,7 @@ typedef struct siginfo {
 #ifdef __ARCH_SI_TRAPNO
                        int _trapno;    /* TRAP # which caused the signal */
 #endif
+                       short _addr_lsb; /* LSB of the reported address */
                } _sigfault;
 
                /* SIGPOLL */
@@ -112,6 +113,7 @@ typedef struct siginfo {
 #ifdef __ARCH_SI_TRAPNO
 #define si_trapno      _sifields._sigfault._trapno
 #endif
+#define si_addr_lsb    _sifields._sigfault._addr_lsb
 #define si_band                _sifields._sigpoll._band
 #define si_fd          _sifields._sigpoll._fd
 
@@ -192,7 +194,11 @@ typedef struct siginfo {
 #define BUS_ADRALN     (__SI_FAULT|1)  /* invalid address alignment */
 #define BUS_ADRERR     (__SI_FAULT|2)  /* non-existant physical address */
 #define BUS_OBJERR     (__SI_FAULT|3)  /* object specific hardware error */
-#define NSIGBUS                3
+/* hardware memory error consumed on a machine check: action required */
+#define BUS_MCEERR_AR  (__SI_FAULT|4)
+/* hardware memory error detected in process but not consumed: action optional*/
+#define BUS_MCEERR_AO  (__SI_FAULT|5)
+#define NSIGBUS                5
 
 /*
  * SIGTRAP si_codes
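
BUS_MCEERR_AR/_AO are delivered via SIGBUS by the new memory-failure code: _AR when the process synchronously consumed the bad data, _AO as an advance notification. si_addr points into the affected mapping and the new si_addr_lsb field encodes the size of the poisoned region as a power of two. A hedged user-space sketch of a handler; the fallback defines assume __SI_FAULT expands to 0 in the exported header, and si_addr_lsb is not referenced because libc's siginfo_t may not expose it yet:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR   4               /* (__SI_FAULT|4), __SI_FAULT == 0 */
#define BUS_MCEERR_AO   5               /* (__SI_FAULT|5) */
#endif

static void sigbus_handler(int sig, siginfo_t *si, void *uctx)
{
        if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO) {
                /* fprintf() is not async-signal-safe; test code only */
                fprintf(stderr, "hwpoison SIGBUS at %p (%s)\n", si->si_addr,
                        si->si_code == BUS_MCEERR_AR ? "consumed" : "advisory");
        }
        _exit(1);
}

static void install_sigbus_handler(void)
{
        struct sigaction sa = {
                .sa_sigaction   = sigbus_handler,
                .sa_flags       = SA_SIGINFO,
        };

        sigaction(SIGBUS, &sa, NULL);
}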
index 5fc2ef8d97fac5851eb2d40f7b7e2d67a3be48e8..a1c486a88e8856bc572f1c481764833d0987fa16 100644 (file)
@@ -58,25 +58,60 @@ struct dma_chan_ref {
  * array.
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
- * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
+ * @ASYNC_TX_FENCE: specify that the next operation in the dependency
+ * chain uses this operation's result as an input
  */
 enum async_tx_flags {
        ASYNC_TX_XOR_ZERO_DST    = (1 << 0),
        ASYNC_TX_XOR_DROP_DST    = (1 << 1),
-       ASYNC_TX_ACK             = (1 << 3),
-       ASYNC_TX_DEP_ACK         = (1 << 4),
+       ASYNC_TX_ACK             = (1 << 2),
+       ASYNC_TX_FENCE           = (1 << 3),
+};
+
+/**
+ * struct async_submit_ctl - async_tx submission/completion modifiers
+ * @flags: submission modifiers
+ * @depend_tx: parent dependency of the current operation being submitted
+ * @cb_fn: callback routine to run at operation completion
+ * @cb_param: parameter for the callback routine
+ * @scribble: caller provided space for dma/page address conversions
+ */
+struct async_submit_ctl {
+       enum async_tx_flags flags;
+       struct dma_async_tx_descriptor *depend_tx;
+       dma_async_tx_callback cb_fn;
+       void *cb_param;
+       void *scribble;
 };
 
 #ifdef CONFIG_DMA_ENGINE
 #define async_tx_issue_pending_all dma_issue_pending_all
+
+/**
+ * async_tx_issue_pending - send pending descriptor to the hardware channel
+ * @tx: descriptor handle to retrieve hardware context
+ *
+ * Note: any dependent operations will have already been issued by
+ * async_tx_channel_switch, or (in the case of no channel switch) will
+ * be already pending on this channel.
+ */
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+       if (likely(tx)) {
+               struct dma_chan *chan = tx->chan;
+               struct dma_device *dma = chan->device;
+
+               dma->device_issue_pending(chan);
+       }
+}
 #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL
 #include <asm/async_tx.h>
 #else
 #define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \
         __async_tx_find_channel(dep, type)
 struct dma_chan *
-__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-       enum dma_transaction_type tx_type);
+__async_tx_find_channel(struct async_submit_ctl *submit,
+                       enum dma_transaction_type tx_type);
 #endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */
 #else
 static inline void async_tx_issue_pending_all(void)
@@ -84,10 +119,16 @@ static inline void async_tx_issue_pending_all(void)
        do { } while (0);
 }
 
+static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx)
+{
+       do { } while (0);
+}
+
 static inline struct dma_chan *
-async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
-       enum dma_transaction_type tx_type, struct page **dst, int dst_count,
-       struct page **src, int src_count, size_t len)
+async_tx_find_channel(struct async_submit_ctl *submit,
+                     enum dma_transaction_type tx_type, struct page **dst,
+                     int dst_count, struct page **src, int src_count,
+                     size_t len)
 {
        return NULL;
 }
@@ -99,46 +140,70 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
  * @cb_fn_param: parameter to pass to the callback routine
  */
 static inline void
-async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param)
+async_tx_sync_epilog(struct async_submit_ctl *submit)
 {
-       if (cb_fn)
-               cb_fn(cb_fn_param);
+       if (submit->cb_fn)
+               submit->cb_fn(submit->cb_param);
 }
 
-void
-async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
-       enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+typedef union {
+       unsigned long addr;
+       struct page *page;
+       dma_addr_t dma;
+} addr_conv_t;
+
+static inline void
+init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags,
+                 struct dma_async_tx_descriptor *tx,
+                 dma_async_tx_callback cb_fn, void *cb_param,
+                 addr_conv_t *scribble)
+{
+       args->flags = flags;
+       args->depend_tx = tx;
+       args->cb_fn = cb_fn;
+       args->cb_param = cb_param;
+       args->scribble = scribble;
+}
+
+void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+                    struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_xor(struct page *dest, struct page **src_list, unsigned int offset,
-       int src_cnt, size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+         int src_cnt, size_t len, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_xor_zero_sum(struct page *dest, struct page **src_list,
-       unsigned int offset, int src_cnt, size_t len,
-       u32 *result, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
+             int src_cnt, size_t len, enum sum_check_flags *result,
+             struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
-       unsigned int src_offset, size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+            unsigned int src_offset, size_t len,
+            struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
 async_memset(struct page *dest, int val, unsigned int offset,
-       size_t len, enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+            size_t len, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt,
+                  size_t len, struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt,
+                  size_t len, enum sum_check_flags *pqres, struct page *spare,
+                  struct async_submit_ctl *submit);
+
+struct dma_async_tx_descriptor *
+async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb,
+                       struct page **ptrs, struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
-async_trigger_callback(enum async_tx_flags flags,
-       struct dma_async_tx_descriptor *depend_tx,
-       dma_async_tx_callback cb_fn, void *cb_fn_param);
+async_raid6_datap_recov(int src_num, size_t bytes, int faila,
+                       struct page **ptrs, struct async_submit_ctl *submit);
 
 void async_tx_quiesce(struct dma_async_tx_descriptor **tx);
 #endif /* _ASYNC_TX_H_ */
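
The header rework above replaces the long flags/depend_tx/callback argument tails with a struct async_submit_ctl filled in by init_async_submit(). A hedged sketch of a caller under the new API; the foo_* names and the completion-based callback are illustrative, while the signatures are the ones declared above:

static void foo_xor_done(void *param)
{
        complete(param);                /* param is assumed to be a completion */
}

static struct dma_async_tx_descriptor *
foo_start_xor(struct page *dest, struct page **srcs, int src_cnt, size_t len,
              struct dma_async_tx_descriptor *depend_tx,
              struct completion *done, addr_conv_t *scribble)
{
        struct async_submit_ctl submit;

        /* zero the destination first, ack the descriptor once it completes */
        init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
                          depend_tx, foo_xor_done, done, scribble);
        return async_xor(dest, srcs, 0, src_cnt, len, &submit);
}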
index 2046b5b8af48ae094d0182b143450b5459b67b30..aece486ac7349b8463ace95585b2fa9379a04ab3 100644 (file)
@@ -120,7 +120,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
-extern int set_binfmt(struct linux_binfmt *new);
+extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
 
 #endif /* __KERNEL__ */
index 90bba9e622864b88156638d231476a7889e4b125..b62bb9294d0c594618515c5f0214e963f7e3a904 100644 (file)
@@ -141,6 +141,38 @@ enum {
        CGRP_WAIT_ON_RMDIR,
 };
 
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+       CGROUP_FILE_PROCS,
+       CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+       /*
+        * used to find which pidlist is wanted. doesn't change as long as
+        * this particular list stays in the list.
+        */
+       struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+       /* array of xids */
+       pid_t *list;
+       /* how many elements the above list has */
+       int length;
+       /* how many files are using the current array */
+       int use_count;
+       /* each of these stored in a list by its cgroup */
+       struct list_head links;
+       /* pointer to the cgroup we belong to, for list removal purposes */
+       struct cgroup *owner;
+       /* protects the other fields */
+       struct rw_semaphore mutex;
+};
+
 struct cgroup {
        unsigned long flags;            /* "unsigned long" so bitops work */
 
@@ -179,11 +211,12 @@ struct cgroup {
         */
        struct list_head release_list;
 
-       /* pids_mutex protects pids_list and cached pid arrays. */
-       struct rw_semaphore pids_mutex;
-
-       /* Linked list of struct cgroup_pids */
-       struct list_head pids_list;
+       /*
+        * list of pidlists, up to two for each namespace (one for procs, one
+        * for tasks); created on demand.
+        */
+       struct list_head pidlists;
+       struct mutex pidlist_mutex;
 
        /* For RCU-protected deletion */
        struct rcu_head rcu_head;
@@ -227,6 +260,9 @@ struct css_set {
         * during subsystem registration (at boot time).
         */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+       /* For RCU-protected deletion */
+       struct rcu_head rcu_head;
 };
 
 /*
@@ -389,10 +425,11 @@ struct cgroup_subsys {
                                                  struct cgroup *cgrp);
        int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
-       int (*can_attach)(struct cgroup_subsys *ss,
-                         struct cgroup *cgrp, struct task_struct *tsk);
+       int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                         struct task_struct *tsk, bool threadgroup);
        void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                       struct cgroup *old_cgrp, struct task_struct *tsk);
+                       struct cgroup *old_cgrp, struct task_struct *tsk,
+                       bool threadgroup);
        void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
        void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
        int (*populate)(struct cgroup_subsys *ss,
index 7f627775c947a4a89701092422eae9025cbd7731..ddb7a97c78c25c1cf93a8302d79aa7f00d5e67a2 100644 (file)
@@ -27,8 +27,8 @@
  *
  * configfs Copyright (C) 2005 Oracle.  All rights reserved.
  *
- * Please read Documentation/filesystems/configfs.txt before using the
- * configfs interface, ESPECIALLY the parts about reference counts and
+ * Please read Documentation/filesystems/configfs/configfs.txt before using
+ * the configfs interface, ESPECIALLY the parts about reference counts and
  * item destructors.
  */
 
index 9c20c7e87d0aa830ac3009a676328e691f926903..d27a7a05718d3d0b299228f7c217d7924f242253 100644 (file)
@@ -20,6 +20,9 @@
  */
 #ifndef DCA_H
 #define DCA_H
+
+#include <linux/pci.h>
+
 /* DCA Provider API */
 
 /* DCA Notifier Interface */
@@ -36,6 +39,12 @@ struct dca_provider {
        int                      id;
 };
 
+struct dca_domain {
+       struct list_head        node;
+       struct list_head        dca_providers;
+       struct pci_bus          *pci_rc;
+};
+
 struct dca_ops {
        int     (*add_requester)    (struct dca_provider *, struct device *);
        int     (*remove_requester) (struct dca_provider *, struct device *);
@@ -47,7 +56,7 @@ struct dca_ops {
 struct dca_provider *alloc_dca_provider(struct dca_ops *ops, int priv_size);
 void free_dca_provider(struct dca_provider *dca);
 int register_dca_provider(struct dca_provider *dca, struct device *dev);
-void unregister_dca_provider(struct dca_provider *dca);
+void unregister_dca_provider(struct dca_provider *dca, struct device *dev);
 
 static inline void *dca_priv(struct dca_provider *dca)
 {
index eb5c2ba2f81ab25e7b6e1a6d0a2fe93cc61b1894..fc1b930f246cddd7c196ff698d38a2419e842058 100644 (file)
@@ -9,7 +9,7 @@
  *     2 as published by the Free Software Foundation.
  *
  *  debugfs is for people to use instead of /proc or /sys.
- *  See Documentation/DocBook/kernel-api for more details.
+ *  See Documentation/DocBook/filesystems for more details.
  */
 
 #ifndef _DEBUGFS_H_
index ffefba81c818ccc7424b54fade9f8af2a1eea10d..2b9f2ac7ed60f0e8c61e79f2f7791545f877e1ef 100644 (file)
@@ -48,19 +48,20 @@ enum dma_status {
 
 /**
  * enum dma_transaction_type - DMA transaction types/indexes
+ *
+ * Note: The DMA_ASYNC_TX capability is not to be set by drivers.  It is
+ * automatically set as dma devices are registered.
  */
 enum dma_transaction_type {
        DMA_MEMCPY,
        DMA_XOR,
-       DMA_PQ_XOR,
-       DMA_DUAL_XOR,
-       DMA_PQ_UPDATE,
-       DMA_ZERO_SUM,
-       DMA_PQ_ZERO_SUM,
+       DMA_PQ,
+       DMA_XOR_VAL,
+       DMA_PQ_VAL,
        DMA_MEMSET,
-       DMA_MEMCPY_CRC32C,
        DMA_INTERRUPT,
        DMA_PRIVATE,
+       DMA_ASYNC_TX,
        DMA_SLAVE,
 };
 
@@ -70,18 +71,25 @@ enum dma_transaction_type {
 
 /**
  * enum dma_ctrl_flags - DMA flags to augment operation preparation,
- *     control completion, and communicate status.
+ *  control completion, and communicate status.
  * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of
- *     this transaction
+ *  this transaction
  * @DMA_CTRL_ACK - the descriptor cannot be reused until the client
- *     acknowledges receipt, i.e. has has a chance to establish any
- *     dependency chains
+ *  acknowledges receipt, i.e. has a chance to establish any dependency
+ *  chains
  * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s)
  * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s)
  * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single
  *     (if not set, do the source dma-unmapping as page)
  * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single
  *     (if not set, do the destination dma-unmapping as page)
+ * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q
+ * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P
+ * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
+ *  sources that were the result of a previous operation; in the case of a PQ
+ *  operation it continues the calculation with new sources
+ * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
+ *  on the result of this operation
  */
 enum dma_ctrl_flags {
        DMA_PREP_INTERRUPT = (1 << 0),
@@ -90,8 +98,31 @@ enum dma_ctrl_flags {
        DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
        DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4),
        DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5),
+       DMA_PREP_PQ_DISABLE_P = (1 << 6),
+       DMA_PREP_PQ_DISABLE_Q = (1 << 7),
+       DMA_PREP_CONTINUE = (1 << 8),
+       DMA_PREP_FENCE = (1 << 9),
 };
 
+/**
+ * enum sum_check_bits - bit position of sum_check_flags
+ */
+enum sum_check_bits {
+       SUM_CHECK_P = 0,
+       SUM_CHECK_Q = 1,
+};
+
+/**
+ * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations
+ * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise
+ * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise
+ */
+enum sum_check_flags {
+       SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P),
+       SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q),
+};
+
+
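A minimal helper sketch (not part of the patch) showing how a consumer might test the result bits written back through the *result/*pqres pointers declared above:

    static inline bool pq_result_ok(enum sum_check_flags res)
    {
            /* true when neither the P (xor) nor the Q (reed-solomon) check failed */
            return !(res & (SUM_CHECK_P_RESULT | SUM_CHECK_Q_RESULT));
    }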
 /**
  * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
  * See linux/cpumask.h
@@ -180,8 +211,6 @@ typedef void (*dma_async_tx_callback)(void *dma_async_param);
  * @flags: flags to augment operation preparation, control completion, and
  *     communicate status
  * @phys: physical address of the descriptor
- * @tx_list: driver common field for operations that require multiple
- *     descriptors
  * @chan: target channel for this operation
  * @tx_submit: set the prepared descriptor(s) to be executed by the engine
  * @callback: routine to call after this operation is complete
@@ -195,7 +224,6 @@ struct dma_async_tx_descriptor {
        dma_cookie_t cookie;
        enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */
        dma_addr_t phys;
-       struct list_head tx_list;
        struct dma_chan *chan;
        dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
        dma_async_tx_callback callback;
@@ -213,6 +241,11 @@ struct dma_async_tx_descriptor {
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
  * @max_xor: maximum number of xor sources, 0 if no capability
+ * @max_pq: maximum number of PQ sources and PQ-continue capability
+ * @copy_align: alignment shift for memcpy operations
+ * @xor_align: alignment shift for xor operations
+ * @pq_align: alignment shift for pq operations
+ * @fill_align: alignment shift for memset operations
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
@@ -220,7 +253,9 @@ struct dma_async_tx_descriptor {
  * @device_free_chan_resources: release DMA channel's resources
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
- * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_xor_val: prepares a xor validation operation
+ * @device_prep_dma_pq: prepares a pq operation
+ * @device_prep_dma_pq_val: prepares a pq validation operation
  * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
@@ -235,7 +270,13 @@ struct dma_device {
        struct list_head channels;
        struct list_head global_node;
        dma_cap_mask_t  cap_mask;
-       int max_xor;
+       unsigned short max_xor;
+       unsigned short max_pq;
+       u8 copy_align;
+       u8 xor_align;
+       u8 pq_align;
+       u8 fill_align;
+       #define DMA_HAS_PQ_CONTINUE (1 << 15)
 
        int dev_id;
        struct device *dev;
@@ -249,9 +290,17 @@ struct dma_device {
        struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
                struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
                unsigned int src_cnt, size_t len, unsigned long flags);
-       struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+       struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)(
                struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt,
-               size_t len, u32 *result, unsigned long flags);
+               size_t len, enum sum_check_flags *result, unsigned long flags);
+       struct dma_async_tx_descriptor *(*device_prep_dma_pq)(
+               struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
+               unsigned int src_cnt, const unsigned char *scf,
+               size_t len, unsigned long flags);
+       struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)(
+               struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
+               unsigned int src_cnt, const unsigned char *scf, size_t len,
+               enum sum_check_flags *pqres, unsigned long flags);
        struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
                struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
                unsigned long flags);
@@ -270,6 +319,96 @@ struct dma_device {
        void (*device_issue_pending)(struct dma_chan *chan);
 };
 
+static inline bool dmaengine_check_align(u8 align, size_t off1, size_t off2, size_t len)
+{
+       size_t mask;
+
+       if (!align)
+               return true;
+       mask = (1 << align) - 1;
+       if (mask & (off1 | off2 | len))
+               return false;
+       return true;
+}
+
+static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1,
+                                      size_t off2, size_t len)
+{
+       return dmaengine_check_align(dev->copy_align, off1, off2, len);
+}
+
+static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1,
+                                     size_t off2, size_t len)
+{
+       return dmaengine_check_align(dev->xor_align, off1, off2, len);
+}
+
+static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
+                                    size_t off2, size_t len)
+{
+       return dmaengine_check_align(dev->pq_align, off1, off2, len);
+}
+
+static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
+                                      size_t off2, size_t len)
+{
+       return dmaengine_check_align(dev->fill_align, off1, off2, len);
+}
+
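A standalone mirror (not kernel code, hypothetical numbers) of the arithmetic in dmaengine_check_align(): an alignment shift of 2 yields a mask of 0x3, so both offsets and the length must be multiples of 4.

    #include <stdbool.h>
    #include <stddef.h>

    static bool check_align(unsigned char align, size_t off1, size_t off2, size_t len)
    {
            size_t mask = align ? ((size_t)1 << align) - 1 : 0;

            return !(mask & (off1 | off2 | len));   /* any low bit set => misaligned */
    }
    /* check_align(2, 0, 8, 64) -> true; check_align(2, 0, 6, 64) -> false */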
+static inline void
+dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
+{
+       dma->max_pq = maxpq;
+       if (has_pq_continue)
+               dma->max_pq |= DMA_HAS_PQ_CONTINUE;
+}
+
+static inline bool dmaf_continue(enum dma_ctrl_flags flags)
+{
+       return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE;
+}
+
+static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags)
+{
+       enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P;
+
+       return (flags & mask) == mask;
+}
+
+static inline bool dma_dev_has_pq_continue(struct dma_device *dma)
+{
+       return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE;
+}
+
+static inline unsigned short dma_dev_to_maxpq(struct dma_device *dma)
+{
+       return dma->max_pq & ~DMA_HAS_PQ_CONTINUE;
+}
+
+/* dma_maxpq - reduce maxpq in the face of continued operations
+ * @dma - dma device with PQ capability
+ * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set
+ *
+ * When an engine does not support native continuation we need 3 extra
+ * source slots to reuse P and Q with the following coefficients:
+ * 1/ {00} * P : remove P from Q', but use it as a source for P'
+ * 2/ {01} * Q : use Q to continue Q' calculation
+ * 3/ {00} * Q : subtract Q from P' to cancel (2)
+ *
+ * In the case where P is disabled we only need 1 extra source:
+ * 1/ {01} * Q : use Q to continue Q' calculation
+ */
+static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
+{
+       if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags))
+               return dma_dev_to_maxpq(dma);
+       else if (dmaf_p_disabled_continue(flags))
+               return dma_dev_to_maxpq(dma) - 1;
+       else if (dmaf_continue(flags))
+               return dma_dev_to_maxpq(dma) - 3;
+       BUG();
+}
+
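A worked example of the reduction in dma_maxpq(), assuming a hypothetical engine that advertises max_pq = 8 without native continuation support: a plain request sees 8 sources, a DMA_PREP_CONTINUE request sees 8 - 3 = 5 (the {00}*P, {01}*Q and {00}*Q slots are reserved), and a continued request that also sets DMA_PREP_PQ_DISABLE_P sees 8 - 1 = 7 (only {01}*Q is reserved).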
 /* --- public DMA engine API --- */
 
 #ifdef CONFIG_DMA_ENGINE
@@ -299,7 +438,11 @@ static inline void net_dmaengine_put(void)
 #ifdef CONFIG_ASYNC_TX_DMA
 #define async_dmaengine_get()  dmaengine_get()
 #define async_dmaengine_put()  dmaengine_put()
+#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH
+#define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX)
+#else
 #define async_dma_find_channel(type) dma_find_channel(type)
+#endif /* CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH */
 #else
 static inline void async_dmaengine_get(void)
 {
@@ -312,7 +455,7 @@ async_dma_find_channel(enum dma_transaction_type type)
 {
        return NULL;
 }
-#endif
+#endif /* CONFIG_ASYNC_TX_DMA */
 
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
        void *dest, void *src, size_t len);
index 51803528b095fa70380dc6ef201f11704e229f75..2adaa2529f184fda637a6a2aeea55bda41693104 100644 (file)
@@ -595,6 +595,7 @@ struct address_space_operations {
        int (*launder_page) (struct page *);
        int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                                        unsigned long);
+       int (*error_remove_page)(struct address_space *, struct page *);
 };
 
 /*
@@ -640,7 +641,6 @@ struct block_device {
        struct super_block *    bd_super;
        int                     bd_openers;
        struct mutex            bd_mutex;       /* open/close mutex */
-       struct semaphore        bd_mount_sem;
        struct list_head        bd_inodes;
        void *                  bd_holder;
        int                     bd_holders;
@@ -1315,7 +1315,7 @@ struct super_block {
        unsigned long           s_blocksize;
        unsigned char           s_blocksize_bits;
        unsigned char           s_dirt;
-       unsigned long long      s_maxbytes;     /* Max file size */
+       loff_t                  s_maxbytes;     /* Max file size */
        struct file_system_type *s_type;
        const struct super_operations   *s_op;
        const struct dquot_operations   *dq_op;
@@ -2156,6 +2156,7 @@ extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);
+extern int generic_detach_inode(struct inode *inode);
 
 extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
@@ -2334,6 +2335,7 @@ extern void get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
 extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_active_super(struct block_device *bdev);
 extern struct super_block *user_get_super(dev_t);
 extern void drop_super(struct super_block *sb);
 
@@ -2381,7 +2383,8 @@ extern int buffer_migrate_page(struct address_space *,
 #define buffer_migrate_page NULL
 #endif
 
-extern int inode_change_ok(struct inode *, struct iattr *);
+extern int inode_change_ok(const struct inode *, struct iattr *);
+extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern int __must_check inode_setattr(struct inode *, struct iattr *);
 
 extern void file_update_time(struct file *file);
@@ -2467,7 +2470,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
 
 struct ctl_table;
-int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
+int proc_nr_files(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
 
 int __init get_filesystem_list(char *buf);
index 3c0924a18dafd0bac336e1c45b79a080ea834e55..cd3d2abaf30a806f508d0d98950f97739ba26562 100644 (file)
@@ -19,7 +19,7 @@
 extern int ftrace_enabled;
 extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-                    struct file *filp, void __user *buffer, size_t *lenp,
+                    void __user *buffer, size_t *lenp,
                     loff_t *ppos);
 
 typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
@@ -94,7 +94,7 @@ static inline void ftrace_start(void) { }
 extern int stack_tracer_enabled;
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-                  struct file *file, void __user *buffer, size_t *lenp,
+                  void __user *buffer, size_t *lenp,
                   loff_t *ppos);
 #endif
 
index 34956c8fdebf8df63ab44c1f84b136406b3d0f34..8ec17997d94fa0aafeaab5cc95824194c9e913f3 100644 (file)
@@ -4,11 +4,6 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 
-struct inode;
-struct mm_struct;
-struct task_struct;
-union ktime;
-
 /* Second argument to futex syscall */
 
 
@@ -129,6 +124,11 @@ struct robust_list_head {
 #define FUTEX_BITSET_MATCH_ANY 0xffffffff
 
 #ifdef __KERNEL__
+struct inode;
+struct mm_struct;
+struct task_struct;
+union ktime;
+
 long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
              u32 __user *uaddr2, u32 val2, u32 val3);
 
index 176e7ee73eff5f8c9a97a77b69def8e50cadc5d3..11ab19ac6b3d98af1b4389b62a085c929254c491 100644 (file)
@@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 }
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
-int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
-int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
+int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                        struct page **, struct vm_area_struct **,
index e46a0734ab6e68c9f4df95ce5f9771923af392ea..bf9213b2db8f2da2a443d8788d2d70eb0a003af8 100644 (file)
@@ -118,6 +118,9 @@ static inline bool mem_cgroup_disabled(void)
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                               gfp_t gfp_mask, int nid,
+                                               int zid);
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -276,6 +279,13 @@ static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
 {
 }
 
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                           gfp_t gfp_mask, int nid, int zid)
+{
+       return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
index b6eae5e3144b9dff9ce70e0186c4e7fbbda55475..df08551cb0ad04aa6a16e03c8ab7b40999995bdf 100644 (file)
@@ -695,11 +695,12 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS        0x0002
 #define VM_FAULT_MAJOR 0x0004
 #define VM_FAULT_WRITE 0x0008  /* Special case for get_user_pages */
+#define VM_FAULT_HWPOISON 0x0010       /* Hit poisoned page */
 
 #define VM_FAULT_NOPAGE        0x0100  /* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS)
+#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@ -791,8 +792,14 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
        unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
-extern int vmtruncate(struct inode * inode, loff_t offset);
-extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
+extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern int vmtruncate(struct inode *inode, loff_t offset);
+extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
+
+int truncate_inode_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_page(struct address_space *mapping, struct page *page);
+
+int invalidate_inode_page(struct page *page);
 
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1279,7 +1286,7 @@ int in_gate_area_no_task(unsigned long addr);
 #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
 #endif /* __HAVE_ARCH_GATE_AREA */
 
-int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
+int drop_caches_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                        unsigned long lru_pages);
@@ -1308,5 +1315,12 @@ void vmemmap_populate_print_last(void);
 extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
                                 size_t size);
 extern void refund_locked_memory(struct mm_struct *mm, size_t size);
+
+extern void memory_failure(unsigned long pfn, int trapno);
+extern int __memory_failure(unsigned long pfn, int trapno, int ref);
+extern int sysctl_memory_failure_early_kill;
+extern int sysctl_memory_failure_recovery;
+extern atomic_long_t mce_bad_pages;
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
index 0042090a4d70cd839a97c6b436ea91f8ddf51d40..21d6aa45206aa985a01c1a1fcfb9faaf7ff83307 100644 (file)
@@ -240,6 +240,8 @@ struct mm_struct {
 
        unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
+       struct linux_binfmt *binfmt;
+
        cpumask_t cpu_vm_mask;
 
        /* Architecture-specific MM context */
@@ -259,11 +261,10 @@ struct mm_struct {
        unsigned long flags; /* Must use atomic bitops to access the bits */
 
        struct core_state *core_state; /* coredumping support */
-
-       /* aio bits */
+#ifdef CONFIG_AIO
        spinlock_t              ioctx_lock;
        struct hlist_head       ioctx_list;
-
+#endif
 #ifdef CONFIG_MM_OWNER
        /*
         * "owner" points to a task that is regarded as the canonical
index 652ef01be5823230e8cf89ea96e14b12acd2076a..6f7561730d88c3b8c816e34e76b140cb9612ce8c 100644 (file)
@@ -755,21 +755,20 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-struct file;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
-                       struct file *, void __user *, size_t *, loff_t *);
+                       void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
-                       struct file *, void __user *, size_t *, loff_t *);
+                       void __user *, size_t *, loff_t *);
 
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
-                       struct file *, void __user *, size_t *, loff_t *);
+                       void __user *, size_t *, loff_t *);
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN 16     /* string buffer size */
 
index 13de789f0a5c1b8b47660dd37ca0c7da73dd26fd..6b202b173955541adbe03ebd8137af6749f1c214 100644 (file)
@@ -51,6 +51,9 @@
  * PG_buddy is set to indicate that the page is free and in the buddy system
  * (see mm/page_alloc.c).
  *
+ * PG_hwpoison indicates that a page got corrupted in hardware and contains
+ * data with incorrect ECC bits that triggered a machine check. Accessing is
+ * not safe since it may cause another machine check. Don't touch!
  */
 
 /*
@@ -101,6 +104,9 @@ enum pageflags {
 #endif
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,            /* Page has been mapped as uncached */
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+       PG_hwpoison,            /* hardware poisoned page. Don't touch */
 #endif
        __NR_PAGEFLAGS,
 
@@ -269,6 +275,15 @@ PAGEFLAG(Uncached, uncached)
 PAGEFLAG_FALSE(Uncached)
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+PAGEFLAG(HWPoison, hwpoison)
+TESTSETFLAG(HWPoison, hwpoison)
+#define __PG_HWPOISON (1UL << PG_hwpoison)
+#else
+PAGEFLAG_FALSE(HWPoison)
+#define __PG_HWPOISON 0
+#endif
+
 static inline int PageUptodate(struct page *page)
 {
        int ret = test_bit(PG_uptodate, &(page)->flags);
@@ -393,7 +408,7 @@ static inline void __ClearPageTail(struct page *page)
         1 << PG_private | 1 << PG_private_2 | \
         1 << PG_buddy   | 1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED)
+        1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
index ada779f2417846bcee829c5e4f33a42f79a76b73..4b938d4f3ac2eff57736a6b8ced5a32f0c260d6a 100644 (file)
@@ -38,6 +38,7 @@ enum {
        PCG_LOCK,  /* page cgroup is locked */
        PCG_CACHE, /* charged as cache */
        PCG_USED, /* this object is in use. */
+       PCG_ACCT_LRU, /* page has been accounted for */
 };
 
 #define TESTPCGFLAG(uname, lname)                      \
@@ -52,11 +53,23 @@ static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)      \
        { clear_bit(PCG_##lname, &pc->flags);  }
 
+#define TESTCLEARPCGFLAG(uname, lname)                 \
+static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)   \
+       { return test_and_clear_bit(PCG_##lname, &pc->flags);  }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
+CLEARPCGFLAG(Cache, CACHE)
+SETPCGFLAG(Cache, CACHE)
 
 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+SETPCGFLAG(Used, USED)
+
+SETPCGFLAG(AcctLRU, ACCT_LRU)
+CLEARPCGFLAG(AcctLRU, ACCT_LRU)
+TESTPCGFLAG(AcctLRU, ACCT_LRU)
+TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
 static inline int page_cgroup_nid(struct page_cgroup *pc)
 {
index 7803565aa877a2f3e39d4be29d794989f971ce87..da1fda8623e089fbd8d1a67d40f317216e071a6f 100644 (file)
 #define PCI_DEVICE_ID_INTEL_E7525_MCH  0x359e
 #define PCI_DEVICE_ID_INTEL_IOAT_CNB   0x360b
 #define PCI_DEVICE_ID_INTEL_FBD_CNB    0x360c
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF0  0x3710
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF1  0x3711
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF2  0x3712
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF3  0x3713
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF4  0x3714
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF5  0x3715
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF6  0x3716
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF7  0x3717
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF8  0x3718
+#define PCI_DEVICE_ID_INTEL_IOAT_JSF9  0x3719
 #define PCI_DEVICE_ID_INTEL_ICH10_0    0x3a14
 #define PCI_DEVICE_ID_INTEL_ICH10_1    0x3a16
 #define PCI_DEVICE_ID_INTEL_ICH10_2    0x3a18
index 07bff666e65b695d1062638f8d20f9770d610161..931150566ade8d720f156665b2a5400bbc499f0d 100644 (file)
@@ -88,4 +88,6 @@
 #define PR_TASK_PERF_EVENTS_DISABLE            31
 #define PR_TASK_PERF_EVENTS_ENABLE             32
 
+#define PR_MCE_KILL    33
+
 #endif /* _LINUX_PRCTL_H */
index 953fc055e87567fc4034af88020c114885b78b3c..14a86bc7102b47805843a84c97c8e59ce8b34e02 100644 (file)
@@ -140,7 +140,7 @@ struct rchan_callbacks
         * cause relay_open() to create a single global buffer rather
         * than the default set of per-cpu buffers.
         *
-        * See Documentation/filesystems/relayfs.txt for more info.
+        * See Documentation/filesystems/relay.txt for more info.
         */
        struct dentry *(*create_buf_file)(const char *filename,
                                          struct dentry *parent,
index 511f42fc68166475b063b54a09f596ff74556310..731af71cddc9b1a8f3fb326f08f4d0bec048e08e 100644 (file)
@@ -34,6 +34,10 @@ struct res_counter {
         * the limit that usage cannot exceed
         */
        unsigned long long limit;
+       /*
+        * the limit that usage can exceed
+        */
+       unsigned long long soft_limit;
        /*
         * the number of unsuccessful attempts to consume the resource
         */
@@ -87,6 +91,7 @@ enum {
        RES_MAX_USAGE,
        RES_LIMIT,
        RES_FAILCNT,
+       RES_SOFT_LIMIT,
 };
 
 /*
@@ -109,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
                unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-               unsigned long val, struct res_counter **limit_fail_at);
+               unsigned long val, struct res_counter **limit_fail_at,
+               struct res_counter **soft_limit_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -122,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+                               bool *was_soft_limit_excess);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
@@ -132,6 +139,36 @@ static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
        return false;
 }
 
+static inline bool res_counter_soft_limit_check_locked(struct res_counter *cnt)
+{
+       if (cnt->usage < cnt->soft_limit)
+               return true;
+
+       return false;
+}
+
+/**
+ * Get the difference between the usage and the soft limit
+ * @cnt: The counter
+ *
+ * Returns 0 if usage is less than or equal to the soft limit;
+ * otherwise, returns the difference between usage and the soft limit.
+ */
+static inline unsigned long long
+res_counter_soft_limit_excess(struct res_counter *cnt)
+{
+       unsigned long long excess;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cnt->lock, flags);
+       if (cnt->usage <= cnt->soft_limit)
+               excess = 0;
+       else
+               excess = cnt->usage - cnt->soft_limit;
+       spin_unlock_irqrestore(&cnt->lock, flags);
+       return excess;
+}
+
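A numeric illustration of the helper above (made-up values): with usage = 300 MB and soft_limit = 256 MB, res_counter_soft_limit_excess() returns 44 MB, a value a reclaim pass could use to prefer the group that is furthest over its soft limit; with usage at or below 256 MB it returns 0.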
 /*
  * Helper function to detect if the cgroup is within its limit or
  * not. It's currently called from cgroup_rss_prepare()
@@ -147,6 +184,17 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt)
        return ret;
 }
 
+static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt)
+{
+       bool ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cnt->lock, flags);
+       ret = res_counter_soft_limit_check_locked(cnt);
+       spin_unlock_irqrestore(&cnt->lock, flags);
+       return ret;
+}
+
 static inline void res_counter_reset_max(struct res_counter *cnt)
 {
        unsigned long flags;
@@ -180,4 +228,16 @@ static inline int res_counter_set_limit(struct res_counter *cnt,
        return ret;
 }
 
+static inline int
+res_counter_set_soft_limit(struct res_counter *cnt,
+                               unsigned long long soft_limit)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cnt->lock, flags);
+       cnt->soft_limit = soft_limit;
+       spin_unlock_irqrestore(&cnt->lock, flags);
+       return 0;
+}
+
 #endif
index 477841d29fce238a2888a7c0af0c7d05bc795524..cb0ba7032609d5602a709a54f4b012413b981587 100644 (file)
@@ -81,7 +81,19 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *cnt, unsigned long *vm_flags);
-int try_to_unmap(struct page *, int ignore_refs);
+enum ttu_flags {
+       TTU_UNMAP = 0,                  /* unmap mode */
+       TTU_MIGRATION = 1,              /* migration mode */
+       TTU_MUNLOCK = 2,                /* munlock mode */
+       TTU_ACTION_MASK = 0xff,
+
+       TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
+       TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
+       TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+};
+#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+
+int try_to_unmap(struct page *, enum ttu_flags flags);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
@@ -108,6 +120,13 @@ int page_mkclean(struct page *);
  */
 int try_to_munlock(struct page *);
 
+/*
+ * Called by memory-failure.c to kill processes.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page);
+void page_unlock_anon_vma(struct anon_vma *anon_vma);
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+
 #else  /* !CONFIG_MMU */
 
 #define anon_vma_init()                do {} while (0)
index 848d1f20086e64d53d0023b8ea7538b047ee6adc..75e6e60bf583bb89a7784d4476a32766d10db420 100644 (file)
@@ -309,7 +309,7 @@ extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-                                   struct file *filp, void __user *buffer,
+                                   void __user *buffer,
                                    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern int softlockup_thresh;
@@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-                                        struct file *filp, void __user *buffer,
+                                        void __user *buffer,
                                         size_t *lenp, loff_t *ppos);
 #endif
 
@@ -1271,7 +1271,6 @@ struct task_struct {
        struct mm_struct *mm, *active_mm;
 
 /* task state */
-       struct linux_binfmt *binfmt;
        int exit_state;
        int exit_code, exit_signal;
        int pdeath_signal;  /*  The signal sent when the parent dies  */
@@ -1735,6 +1734,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
 #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
 #define PF_FORKNOEXEC  0x00000040      /* forked but didn't exec */
+#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV   0x00000100      /* used super-user privileges */
 #define PF_DUMPCORE    0x00000200      /* dumped core */
 #define PF_SIGNALED    0x00000400      /* killed by a signal */
@@ -1754,6 +1754,7 @@ extern cputime_t task_gtime(struct task_struct *p);
 #define PF_SPREAD_PAGE 0x01000000      /* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB 0x02000000      /* Spread some slab caches over cpuset */
 #define PF_THREAD_BOUND        0x04000000      /* Thread bound to specific cpu */
+#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY   0x10000000      /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER        0x20000000      /* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP        0x40000000      /* Freezer should not count it as freezeable */
@@ -1906,7 +1907,7 @@ extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-               struct file *file, void __user *buffer, size_t *length,
+               void __user *buffer, size_t *length,
                loff_t *ppos);
 #endif
 #ifdef CONFIG_SCHED_DEBUG
@@ -1924,7 +1925,7 @@ extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
 int sched_rt_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos);
 
 extern unsigned int sysctl_sched_compat_yield;
@@ -2059,6 +2060,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
@@ -2336,7 +2338,10 @@ static inline int signal_pending(struct task_struct *p)
        return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
 }
 
-extern int __fatal_signal_pending(struct task_struct *p);
+static inline int __fatal_signal_pending(struct task_struct *p)
+{
+       return unlikely(sigismember(&p->pending.signal, SIGKILL));
+}
 
 static inline int fatal_signal_pending(struct task_struct *p)
 {
index d050b66ab9ef0415b169122590f7335714ad9b2e..239e40d0450bc02380b30c08e8854ecc3d4ba638 100644 (file)
@@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
                return PAGE_ALIGN(mmap_min_addr);
        return hint;
 }
-extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+extern int mmap_min_addr_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 #ifdef CONFIG_SECURITY
index 0c6a86b795968621a107c2c8c611b5ef7d387516..8366d8f12e537ac9cfb3e9cc053371bcea174025 100644 (file)
@@ -35,6 +35,44 @@ struct seq_operations {
 
 #define SEQ_SKIP 1
 
+/**
+ * seq_get_buf - get buffer to write arbitrary data to
+ * @m: the seq_file handle
+ * @bufp: the beginning of the buffer is stored here
+ *
+ * Return the number of bytes available in the buffer, or zero if
+ * there's no space.
+ */
+static inline size_t seq_get_buf(struct seq_file *m, char **bufp)
+{
+       BUG_ON(m->count > m->size);
+       if (m->count < m->size)
+               *bufp = m->buf + m->count;
+       else
+               *bufp = NULL;
+
+       return m->size - m->count;
+}
+
+/**
+ * seq_commit - commit data to the buffer
+ * @m: the seq_file handle
+ * @num: the number of bytes to commit
+ *
+ * Commit @num bytes of data written to a buffer previously acquired
+ * by seq_get_buf().  To signal an error condition, or that the data
+ * didn't fit in the available space, pass a negative @num value.
+ */
+static inline void seq_commit(struct seq_file *m, int num)
+{
+       if (num < 0) {
+               m->count = m->size;
+       } else {
+               BUG_ON(m->count + num > m->size);
+               m->count += num;
+       }
+}
+
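A hedged sketch of the intended calling pattern for the two helpers above; the ->show() callback, its name and the output format are made up:

    static int example_show(struct seq_file *m, void *v)
    {
            char *buf;
            size_t size = seq_get_buf(m, &buf);
            int n = -1;

            if (size)
                    n = snprintf(buf, size, "value=%d\n", 42);
            /* commit a negative count if nothing fit, so seq_file retries
             * with a larger buffer */
            seq_commit(m, (n >= 0 && (size_t)n < size) ? n : -1);
            return 0;
    }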
 char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
index c7552836bd954a0f0a0235feea5cf9acb5c1a2c8..ab9272cc270c2a4b4f5a8c0d0d479f3eceeeb32d 100644 (file)
@@ -233,6 +233,8 @@ static inline int valid_signal(unsigned long sig)
 }
 
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
+extern int do_send_sig_info(int sig, struct siginfo *info,
+                               struct task_struct *p, bool group);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
 extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
index 6c990e658f4ed33dc2244b1b8ef5d1649cc22a31..4ec90019c1a4c3997c8da80dd3d36f3b31d09b11 100644 (file)
@@ -34,15 +34,37 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT    5
-#ifndef CONFIG_MIGRATION
-#define MAX_SWAPFILES          (1 << MAX_SWAPFILES_SHIFT)
+
+/*
+ * Use some of the swap files numbers for other purposes. This
+ * is a convenient way to hook into the VM to trigger special
+ * actions on faults.
+ */
+
+/*
+ * NUMA node memory migration support
+ */
+#ifdef CONFIG_MIGRATION
+#define SWP_MIGRATION_NUM 2
+#define SWP_MIGRATION_READ     (MAX_SWAPFILES + SWP_HWPOISON_NUM)
+#define SWP_MIGRATION_WRITE    (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
 #else
-/* Use last two entries for page migration swap entries */
-#define MAX_SWAPFILES          ((1 << MAX_SWAPFILES_SHIFT)-2)
-#define SWP_MIGRATION_READ     MAX_SWAPFILES
-#define SWP_MIGRATION_WRITE    (MAX_SWAPFILES + 1)
+#define SWP_MIGRATION_NUM 0
 #endif
 
+/*
+ * Handling of hardware poisoned pages with memory corruption.
+ */
+#ifdef CONFIG_MEMORY_FAILURE
+#define SWP_HWPOISON_NUM 1
+#define SWP_HWPOISON           MAX_SWAPFILES
+#else
+#define SWP_HWPOISON_NUM 0
+#endif
+
+#define MAX_SWAPFILES \
+       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+
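Worked out for the case where both options are enabled (MAX_SWAPFILES_SHIFT = 5): the raw space is 1 << 5 = 32 swap types; migration reserves 2 and hwpoison reserves 1, so MAX_SWAPFILES = 29, SWP_HWPOISON = 29, SWP_MIGRATION_READ = 30 and SWP_MIGRATION_WRITE = 31. Any entry whose type is 29 or above is therefore one of the special, non-swap entries.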
 /*
  * Magic header for a swap area. The first part of the union is
  * what the swap magic looks like for the old (limited to 128MB)
@@ -217,6 +239,11 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
                                                  gfp_t gfp_mask, bool noswap,
                                                  unsigned int swappiness);
+extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+                                               gfp_t gfp_mask, bool noswap,
+                                               unsigned int swappiness,
+                                               struct zone *zone,
+                                               int nid);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
@@ -240,7 +267,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
 extern unsigned long scan_unevictable_pages;
-extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+extern int scan_unevictable_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 extern int scan_unevictable_register_node(struct node *node);
 extern void scan_unevictable_unregister_node(struct node *node);
index 6ec39ab27b4b48aaf31b02a19bf5a164217abcce..cd42e30b7c6eb5e1b04bd69ca393f1c7878b3351 100644 (file)
@@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 
 #endif
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Support for hardware poisoned pages
+ */
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+       BUG_ON(!PageLocked(page));
+       return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+}
+
+static inline int is_hwpoison_entry(swp_entry_t entry)
+{
+       return swp_type(entry) == SWP_HWPOISON;
+}
+#else
+
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+       return swp_entry(0, 0);
+}
+
+static inline int is_hwpoison_entry(swp_entry_t swp)
+{
+       return 0;
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+static inline int non_swap_entry(swp_entry_t entry)
+{
+       return swp_type(entry) >= MAX_SWAPFILES;
+}
+#else
+static inline int non_swap_entry(swp_entry_t entry)
+{
+       return 0;
+}
+#endif
index e76d3b22a46645b503283138b317ced96a0fbbef..1e4743ee6831039d8b6d8e5773d7f5eebafcd33d 100644 (file)
@@ -29,7 +29,6 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 
-struct file;
 struct completion;
 
 #define CTL_MAXNAME 10         /* how many path components do we allow in a
@@ -977,25 +976,25 @@ typedef int ctl_handler (struct ctl_table *table,
                         void __user *oldval, size_t __user *oldlenp,
                         void __user *newval, size_t newlen);
 
-typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp,
+typedef int proc_handler (struct ctl_table *ctl, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos);
 
-extern int proc_dostring(struct ctl_table *, int, struct file *,
+extern int proc_dostring(struct ctl_table *, int,
                         void __user *, size_t *, loff_t *);
-extern int proc_dointvec(struct ctl_table *, int, struct file *,
+extern int proc_dointvec(struct ctl_table *, int,
                         void __user *, size_t *, loff_t *);
-extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_minmax(struct ctl_table *, int,
                                void __user *, size_t *, loff_t *);
-extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_jiffies(struct ctl_table *, int,
                                 void __user *, size_t *, loff_t *);
-extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
-extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *,
+extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
                                    void __user *, size_t *, loff_t *);
-extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
+extern int proc_doulongvec_minmax(struct ctl_table *, int,
                                  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
-                                     struct file *, void __user *, size_t *, loff_t *);
+                                     void __user *, size_t *, loff_t *);
 
 extern int do_sysctl (int __user *name, int nlen,
                      void __user *oldval, size_t __user *oldlenp,
index 56787c0933456c560bc98e860da4a989e8437f18..fe04e5ef6a592ebddf885b2f313323ec5226e452 100644 (file)
@@ -155,6 +155,34 @@ extern void timekeeping_leap_insert(int leapsecond);
 struct tms;
 extern void do_sys_times(struct tms *);
 
+/*
+ * Similar to the struct tm in userspace <time.h>, but it needs to be here so
+ * that the kernel source is self contained.
+ */
+struct tm {
+       /*
+        * the number of seconds after the minute, normally in the range
+        * 0 to 59, but can be up to 60 to allow for leap seconds
+        */
+       int tm_sec;
+       /* the number of minutes after the hour, in the range 0 to 59 */
+       int tm_min;
+       /* the number of hours past midnight, in the range 0 to 23 */
+       int tm_hour;
+       /* the day of the month, in the range 1 to 31 */
+       int tm_mday;
+       /* the number of months since January, in the range 0 to 11 */
+       int tm_mon;
+       /* the number of years since 1900 */
+       long tm_year;
+       /* the number of days since Sunday, in the range 0 to 6 */
+       int tm_wday;
+       /* the number of days since January 1, in the range 0 to 365 */
+       int tm_yday;
+};
+
+void time_to_tm(time_t totalsecs, int offset, struct tm *result);
+
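A brief usage sketch for time_to_tm() (the caller, get_seconds() as the time source, and the log format are illustrative only):

    struct tm tm;

    time_to_tm(get_seconds(), 0, &tm);      /* offset 0 => UTC */
    pr_info("%04ld-%02d-%02d %02d:%02d:%02d UTC\n",
            tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
            tm.tm_hour, tm.tm_min, tm.tm_sec);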
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:                pointer to the timespec variable to be converted
index 17ba82efa4830543bace0639129b7a0f6e2cd3bb..1eb44a924e5643066677f7d3edef90d6ef9cce0d 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Tracing hooks
  *
- * Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -463,22 +463,38 @@ static inline int tracehook_get_signal(struct task_struct *task,
 
 /**
  * tracehook_notify_jctl - report about job control stop/continue
- * @notify:            nonzero if this is the last thread in the group to stop
+ * @notify:            zero, %CLD_STOPPED or %CLD_CONTINUED
  * @why:               %CLD_STOPPED or %CLD_CONTINUED
  *
  * This is called when we might call do_notify_parent_cldstop().
- * It's called when about to stop for job control; we are already in
- * %TASK_STOPPED state, about to call schedule().  It's also called when
- * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made.
  *
- * Return nonzero to generate a %SIGCHLD with @why, which is
- * normal if @notify is nonzero.
+ * @notify is zero if we would not ordinarily send a %SIGCHLD,
+ * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD.
  *
- * Called with no locks held.
+ * @why is %CLD_STOPPED when about to stop for job control;
+ * we are already in %TASK_STOPPED state, about to call schedule().
+ * It might also be that we have just exited (check %PF_EXITING),
+ * but need to report that a group-wide stop is complete.
+ *
+ * @why is %CLD_CONTINUED when waking up after job control stop and
+ * ready to make a delayed @notify report.
+ *
+ * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal.
+ *
+ * Called with the siglock held.
  */
 static inline int tracehook_notify_jctl(int notify, int why)
 {
-       return notify || (current->ptrace & PT_PTRACED);
+       return notify ?: (current->ptrace & PT_PTRACED) ? why : 0;
+}
+
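Reading the new expression: a non-zero @notify is returned unchanged; with @notify == 0, a ptraced task still yields @why so the tracer gets its %SIGCHLD, while an untraced task yields 0 and no signal is generated.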
+/**
+ * tracehook_finish_jctl - report about return from job control stop
+ *
+ * This is called by do_signal_stop() after wakeup.
+ */
+static inline void tracehook_finish_jctl(void)
+{
 }
 
 #define DEATH_REAP                     -1
index 63a3f7a8058032e1069d96e73ce5749b76a8e8da..660a9de96f81dba7ca358b8c4e952dc32df1e05f 100644 (file)
@@ -4,7 +4,7 @@
 /*
  * Kernel Tracepoint API.
  *
- * See Documentation/tracepoint.txt.
+ * See Documentation/trace/tracepoints.txt.
  *
  * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
  *
index 46dd12c5709e60d8b95ae0e151952d16ea02dc94..9356b24223ac3505343deaee610509a681ff3e1c 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_BE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_BE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_be16(const u8 *p)
 {
index 59777e951baf31fda11df5c0d977f745aa64388c..be376fb79b6454a50908f5d5b7cfad66fc083b00 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _LINUX_UNALIGNED_LE_BYTESHIFT_H
 #define _LINUX_UNALIGNED_LE_BYTESHIFT_H
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 static inline u16 __get_unaligned_le16(const u8 *p)
 {
index 75cf58666ff9f4a57c71b2d9512dbacbefc41911..66ebddcff6641f95498aa3c2e7af07cbffaa3a29 100644 (file)
@@ -110,21 +110,20 @@ extern int laptop_mode;
 extern unsigned long determine_dirtyable_memory(void);
 
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos);
 extern int dirty_background_bytes_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos);
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos);
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos);
 
 struct ctl_table;
-struct file;
-int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
+int dirty_writeback_centisecs_handler(struct ctl_table *, int,
                                      void __user *, size_t *, loff_t *);
 
 void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
index 72c36926c26d80363bec2bdc3094d91ecb109e9e..5b26a0bd178ecc031d357786f519bfdacde0f8eb 100644 (file)
@@ -399,7 +399,7 @@ extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
  * fed into the routing cache should use these handlers.
  */
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-                        struct file* filp, void __user *buffer,
+                        void __user *buffer,
                         size_t *lenp, loff_t *ppos);
 int ipv4_doint_and_flush_strategy(ctl_table *table,
                                  void __user *oldval, size_t __user *oldlenp,
index 1459ed3e2697b495bb37226c8190b7f81d428c3c..f76f22d057216bb3e28b68a3e68fd4c002e50b30 100644 (file)
@@ -55,7 +55,6 @@ enum {
 #include <net/neighbour.h>
 
 struct ctl_table;
-struct file;
 struct inet6_dev;
 struct net_device;
 struct net_proto_family;
@@ -139,7 +138,6 @@ extern int                  igmp6_event_report(struct sk_buff *skb);
 #ifdef CONFIG_SYSCTL
 extern int                     ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
                                                           int write,
-                                                          struct file * filp,
                                                           void __user *buffer,
                                                           size_t *lenp,
                                                           loff_t *ppos);
index 40eab7314aeb6c38fcef65ea58cea52421e646ff..7d3704750efc1fd7ef9010acc20fcf2b76e14a28 100644 (file)
@@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_ipc_dointvec(ctl_table *table, int write,
        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table ipc_table;
        memcpy(&ipc_table, table, sizeof(ipc_table));
        ipc_table.data = get_ipc(table);
 
-       return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+       return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 }
 
 static int proc_ipc_callback_dointvec(ctl_table *table, int write,
-       struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table ipc_table;
        size_t lenp_bef = *lenp;
@@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
        memcpy(&ipc_table, table, sizeof(ipc_table));
        ipc_table.data = get_ipc(table);
 
-       rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+       rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
 
        if (write && !rc && lenp_bef == *lenp)
                /*
@@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 }
 
 static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
-       struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table ipc_table;
        memcpy(&ipc_table, table, sizeof(ipc_table));
        ipc_table.data = get_ipc(table);
 
-       return proc_doulongvec_minmax(&ipc_table, write, filp, buffer,
+       return proc_doulongvec_minmax(&ipc_table, write, buffer,
                                        lenp, ppos);
 }
 
@@ -95,7 +95,7 @@ static void ipc_auto_callback(int val)
 }
 
 static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
-       struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table ipc_table;
        size_t lenp_bef = *lenp;
@@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
        ipc_table.data = get_ipc(table);
        oldval = *((int *)(ipc_table.data));
 
-       rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos);
+       rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
        if (write && !rc && lenp_bef == *lenp) {
                int newval = *((int *)(ipc_table.data));
index 24ae46dfe45daaf2ff0a1cb8511c9e4a2ef4e29e..8a058711fc103ad9df3a19c4b59eecc7dfcff686 100644
@@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table)
        return which;
 }
 
-static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp,
+static int proc_mq_dointvec(ctl_table *table, int write,
        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table mq_table;
        memcpy(&mq_table, table, sizeof(mq_table));
        mq_table.data = get_mq(table);
 
-       return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos);
+       return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
 }
 
 static int proc_mq_dointvec_minmax(ctl_table *table, int write,
-       struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table mq_table;
        memcpy(&mq_table, table, sizeof(mq_table));
        mq_table.data = get_mq(table);
 
-       return proc_dointvec_minmax(&mq_table, write, filp, buffer,
+       return proc_dointvec_minmax(&mq_table, write, buffer,
                                        lenp, ppos);
 }
 #else
index 187c89b4783d4a43e8ad13d0a98dbed8ae6df15d..b8d4cd8ac0b9d5d93e303833e70ed55d450582ee 100644
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
index defc2e6f1e3bc9742606d7efc0a0af4e79095b14..5feed232be9d4b55f72f5b38a010e4e6619c7a19 100644
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                break;
        }
        case AUDIT_SIGNAL_INFO:
-               err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
-               if (err)
-                       return err;
+               len = 0;
+               if (audit_sig_sid) {
+                       err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+                       if (err)
+                               return err;
+               }
                sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
                if (!sig_data) {
-                       security_release_secctx(ctx, len);
+                       if (audit_sig_sid)
+                               security_release_secctx(ctx, len);
                        return -ENOMEM;
                }
                sig_data->uid = audit_sig_uid;
                sig_data->pid = audit_sig_pid;
-               memcpy(sig_data->ctx, ctx, len);
-               security_release_secctx(ctx, len);
+               if (audit_sig_sid) {
+                       memcpy(sig_data->ctx, ctx, len);
+                       security_release_secctx(ctx, len);
+               }
                audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
                                0, 0, sig_data, sizeof(*sig_data) + len);
                kfree(sig_data);
index 0e96dbc60ea9b45896a0a451287b406f2592b48c..cc7e87936cbc57f7a2eebebbd212fe297c530154 100644
@@ -45,8 +45,8 @@
 
 struct audit_watch {
        atomic_t                count;  /* reference count */
-       char                    *path;  /* insertion path */
        dev_t                   dev;    /* associated superblock device */
+       char                    *path;  /* insertion path */
        unsigned long           ino;    /* associated inode number */
        struct audit_parent     *parent; /* associated parent */
        struct list_head        wlist;  /* entry in parent->watches list */
index 68d3c6a0ecd635bbeba49c5de2d3e89b50315897..267e484f019817b87187c74bfa7aa12275411b0c 100644
@@ -168,12 +168,12 @@ struct audit_context {
        int                 in_syscall; /* 1 if task is in a syscall */
        enum audit_state    state, current_state;
        unsigned int        serial;     /* serial number for record */
-       struct timespec     ctime;      /* time of syscall entry */
        int                 major;      /* syscall number */
+       struct timespec     ctime;      /* time of syscall entry */
        unsigned long       argv[4];    /* syscall arguments */
-       int                 return_valid; /* return code is valid */
        long                return_code;/* syscall return code */
        u64                 prio;
+       int                 return_valid; /* return code is valid */
        int                 name_count;
        struct audit_names  names[AUDIT_NAMES];
        char *              filterkey;  /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
        char                target_comm[TASK_COMM_LEN];
 
        struct audit_tree_refs *trees, *first_trees;
-       int tree_count;
        struct list_head killed_trees;
+       int tree_count;
 
        int type;
        union {
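The two struct-reordering hunks above (audit_watch and audit_context) only swap member order; the apparent intent, which is my reading and not stated in the patch, is to group same-sized members so the structs carry less alignment padding on 64-bit builds. A small stand-alone C illustration of the effect, with made-up field names:

#include <stdio.h>

/* Hypothetical layouts, not the kernel structs: on an LP64 target,
 * putting a pointer between two ints forces padding around it. */
struct scattered { int a; char *p; int b; };    /* typically 24 bytes */
struct grouped   { char *p; int a; int b; };    /* typically 16 bytes */

int main(void)
{
        printf("scattered: %zu bytes\n", sizeof(struct scattered));
        printf("grouped:   %zu bytes\n", sizeof(struct grouped));
        return 0;
}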
index cd83d9933b6b85cd05bf6db17f424056d2d504fc..7ccba4bc5e3b815a9ac9a9c5a0e40d8e70b5c780 100644
@@ -23,6 +23,7 @@
  */
 
 #include <linux/cgroup.h>
+#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -48,6 +49,8 @@
 #include <linux/namei.h>
 #include <linux/smp_lock.h>
 #include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 
 #include <asm/atomic.h>
 
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
 /*
  * A cgroupfs_root represents the root of a cgroup hierarchy,
  * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
         */
        unsigned long subsys_bits;
 
+       /* Unique id for this hierarchy. */
+       int hierarchy_id;
+
        /* The bitmask of subsystems currently attached to this hierarchy */
        unsigned long actual_subsys_bits;
 
@@ -94,6 +102,9 @@ struct cgroupfs_root {
 
        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];
+
+       /* The name for this hierarchy - may be empty */
+       char name[MAX_CGROUP_ROOT_NAMELEN];
 };
 
 /*
@@ -141,6 +152,10 @@ struct css_id {
 static LIST_HEAD(roots);
 static int root_count;
 
+static DEFINE_IDA(hierarchy_ida);
+static int next_hierarchy_id;
+static DEFINE_SPINLOCK(hierarchy_id_lock);
+
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)
 
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
         * cgroup, anchored on cgroup->css_sets
         */
        struct list_head cgrp_link_list;
+       struct cgroup *cgrp;
        /*
         * List running through cg_cgroup_links pointing at a
         * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
-/* hash table for cgroup groups. This improves the performance to
- * find an existing css_set */
+/*
+ * hash table for cgroup groups. This speeds up finding an existing
+ * css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
 #define CSS_SET_HASH_BITS      7
 #define CSS_SET_TABLE_SIZE     (1 << CSS_SET_HASH_BITS)
 static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
        return &css_set_table[index];
 }
 
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+       struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+       kfree(cg);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-/* When we create or destroy a css_set, the operation simply
- * takes/releases a reference count on all the cgroups referenced
- * by subsystems in this css_set. This can end up multiple-counting
- * some cgroups, but that's OK - the ref-count is just a
- * busy/not-busy indicator; ensuring that we only count each cgroup
- * once would require taking a global lock to ensure that no
- * subsystems moved between hierarchies while we were doing so.
- *
- * Possible TODO: decide at boot time based on the number of
- * registered subsystems and the number of CPUs or NUMA nodes whether
- * it's better for performance to ref-count every subsystem, or to
- * take a global lock and only add one ref count to each hierarchy.
- */
-
-/*
- * unlink a css_set from the list and free it
- */
-static void unlink_css_set(struct css_set *cg)
+static void __put_css_set(struct css_set *cg, int taskexit)
 {
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;
-
-       hlist_del(&cg->hlist);
-       css_set_count--;
-
-       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                                cg_link_list) {
-               list_del(&link->cg_link_list);
-               list_del(&link->cgrp_link_list);
-               kfree(link);
-       }
-}
-
-static void __put_css_set(struct css_set *cg, int taskexit)
-{
-       int i;
        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                write_unlock(&css_set_lock);
                return;
        }
-       unlink_css_set(cg);
-       write_unlock(&css_set_lock);
 
-       rcu_read_lock();
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
+       /* This css_set is dead. unlink it and release cgroup refcounts */
+       hlist_del(&cg->hlist);
+       css_set_count--;
+
+       list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+                                cg_link_list) {
+               struct cgroup *cgrp = link->cgrp;
+               list_del(&link->cg_link_list);
+               list_del(&link->cgrp_link_list);
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
+
+               kfree(link);
        }
-       rcu_read_unlock();
-       kfree(cg);
+
+       write_unlock(&css_set_lock);
+       call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
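free_css_set_rcu() together with the call_rcu() at the end of __put_css_set() is the usual RCU deferred-free idiom: embed an rcu_head in the object and free it from the callback, so readers that found the object under rcu_read_lock() can finish before the memory goes away. A generic sketch of the same pattern; struct foo and its helpers are invented for illustration, not taken from the patch.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct rcu_head rcu_head;
        /* ... payload looked up by readers under rcu_read_lock() ... */
};

static void foo_free_rcu(struct rcu_head *head)
{
        /* recover the enclosing object and free it after a grace period */
        kfree(container_of(head, struct foo, rcu_head));
}

static void foo_release(struct foo *f)
{
        call_rcu(&f->rcu_head, foo_free_rcu);
}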
 
 /*
@@ -337,6 +337,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
        __put_css_set(cg, 1);
 }
 
+/*
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cg: candidate css_set being tested
+ * @old_cg: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cg,
+                            struct css_set *old_cg,
+                            struct cgroup *new_cgrp,
+                            struct cgroup_subsys_state *template[])
+{
+       struct list_head *l1, *l2;
+
+       if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+               /* Not all subsystems matched */
+               return false;
+       }
+
+       /*
+        * Compare cgroup pointers in order to distinguish between
+        * different cgroups in hierarchies with no subsystems. We
+        * could get by with just this check alone (and skip the
+        * memcmp above) but on most setups the memcmp check will
+        * avoid the need for this more expensive check on almost all
+        * candidates.
+        */
+
+       l1 = &cg->cg_links;
+       l2 = &old_cg->cg_links;
+       while (1) {
+               struct cg_cgroup_link *cgl1, *cgl2;
+               struct cgroup *cg1, *cg2;
+
+               l1 = l1->next;
+               l2 = l2->next;
+               /* See if we reached the end - both lists are equal length. */
+               if (l1 == &cg->cg_links) {
+                       BUG_ON(l2 != &old_cg->cg_links);
+                       break;
+               } else {
+                       BUG_ON(l2 == &old_cg->cg_links);
+               }
+               /* Locate the cgroups associated with these links. */
+               cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
+               cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
+               cg1 = cgl1->cgrp;
+               cg2 = cgl2->cgrp;
+               /* Hierarchies should be linked in the same order. */
+               BUG_ON(cg1->root != cg2->root);
+
+               /*
+                * If this hierarchy is the hierarchy of the cgroup
+                * that's changing, then we need to check that this
+                * css_set points to the new cgroup; if it's any other
+                * hierarchy, then this css_set should point to the
+                * same cgroup as the old css_set.
+                */
+               if (cg1->root == new_cgrp->root) {
+                       if (cg1 != new_cgrp)
+                               return false;
+               } else {
+                       if (cg1 != cg2)
+                               return false;
+               }
+       }
+       return true;
+}
+
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
 
        hhead = css_set_hash(template);
        hlist_for_each_entry(cg, node, hhead, hlist) {
-               if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
-                       /* All subsystems matched */
-                       return cg;
-               }
+               if (!compare_css_sets(cg, oldcg, cgrp, template))
+                       continue;
+
+               /* This css_set matches what we need */
+               return cg;
        }
 
        /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
        link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
                                cgrp_link_list);
        link->cg = cg;
+       link->cgrp = cgrp;
+       atomic_inc(&cgrp->count);
        list_move(&link->cgrp_link_list, &cgrp->css_sets);
-       list_add(&link->cg_link_list, &cg->cg_links);
+       /*
+        * Always add links to the tail of the list so that the list
+        * is sorted by order of hierarchy creation
+        */
+       list_add_tail(&link->cg_link_list, &cg->cg_links);
 }
 
 /*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
 {
        struct css_set *res;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-       int i;
 
        struct list_head tmp_cg_links;
 
        struct hlist_head *hhead;
+       struct cg_cgroup_link *link;
 
        /* First see if we already have a cgroup group that matches
         * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
 
        write_lock(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup *cgrp = res->subsys[i]->cgroup;
-               struct cgroup_subsys *ss = subsys[i];
-               atomic_inc(&cgrp->count);
-               /*
-                * We want to add a link once per cgroup, so we
-                * only do it for the first subsystem in each
-                * hierarchy
-                */
-               if (ss->root->subsys_list.next == &ss->sibling)
-                       link_css_set(&tmp_cg_links, res, cgrp);
+       list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+               struct cgroup *c = link->cgrp;
+               if (c->root == cgrp->root)
+                       c = cgrp;
+               link_css_set(&tmp_cg_links, res, c);
        }
-       if (list_empty(&rootnode.subsys_list))
-               link_css_set(&tmp_cg_links, res, dummytop);
 
        BUG_ON(!list_empty(&tmp_cg_links));
 
@@ -517,6 +588,41 @@ static struct css_set *find_css_set(
        return res;
 }
 
+/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+                                           struct cgroupfs_root *root)
+{
+       struct css_set *css;
+       struct cgroup *res = NULL;
+
+       BUG_ON(!mutex_is_locked(&cgroup_mutex));
+       read_lock(&css_set_lock);
+       /*
+        * No need to lock the task - since we hold cgroup_mutex the
+        * task can't change groups, so the only thing that can happen
+        * is that it exits and its css is set back to init_css_set.
+        */
+       css = task->cgroups;
+       if (css == &init_css_set) {
+               res = &root->top_cgroup;
+       } else {
+               struct cg_cgroup_link *link;
+               list_for_each_entry(link, &css->cg_links, cg_link_list) {
+                       struct cgroup *c = link->cgrp;
+                       if (c->root == root) {
+                               res = c;
+                               break;
+                       }
+               }
+       }
+       read_unlock(&css_set_lock);
+       BUG_ON(!res);
+       return res;
+}
+
 /*
  * There is one global cgroup mutex. We also require taking
  * task_lock() when dereferencing a task's cgroup subsys pointers.
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 */
                deactivate_super(cgrp->root->sb);
 
+               /*
+                * if we're getting rid of the cgroup, refcount should ensure
+                * that there are no pidlists left.
+                */
+               BUG_ON(!list_empty(&cgrp->pidlists));
+
                call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
        }
        iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",noprefix");
        if (strlen(root->release_agent_path))
                seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+       if (strlen(root->name))
+               seq_printf(seq, ",name=%s", root->name);
        mutex_unlock(&cgroup_mutex);
        return 0;
 }
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
        unsigned long subsys_bits;
        unsigned long flags;
        char *release_agent;
+       char *name;
+       /* User explicitly requested empty subsystem */
+       bool none;
+
+       struct cgroupfs_root *new_root;
+
 };
 
 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
        mask = ~(1UL << cpuset_subsys_id);
 #endif
 
-       opts->subsys_bits = 0;
-       opts->flags = 0;
-       opts->release_agent = NULL;
+       memset(opts, 0, sizeof(*opts));
 
        while ((token = strsep(&o, ",")) != NULL) {
                if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
                                if (!ss->disabled)
                                        opts->subsys_bits |= 1ul << i;
                        }
+               } else if (!strcmp(token, "none")) {
+                       /* Explicitly have no subsystems */
+                       opts->none = true;
                } else if (!strcmp(token, "noprefix")) {
                        set_bit(ROOT_NOPREFIX, &opts->flags);
                } else if (!strncmp(token, "release_agent=", 14)) {
                        /* Specifying two release agents is forbidden */
                        if (opts->release_agent)
                                return -EINVAL;
-                       opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+                       opts->release_agent =
+                               kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
                        if (!opts->release_agent)
                                return -ENOMEM;
-                       strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
-                       opts->release_agent[PATH_MAX - 1] = 0;
+               } else if (!strncmp(token, "name=", 5)) {
+                       int i;
+                       const char *name = token + 5;
+                       /* Can't specify an empty name */
+                       if (!strlen(name))
+                               return -EINVAL;
+                       /* Must match [\w.-]+ */
+                       for (i = 0; i < strlen(name); i++) {
+                               char c = name[i];
+                               if (isalnum(c))
+                                       continue;
+                               if ((c == '.') || (c == '-') || (c == '_'))
+                                       continue;
+                               return -EINVAL;
+                       }
+                       /* Specifying two names is forbidden */
+                       if (opts->name)
+                               return -EINVAL;
+                       opts->name = kstrndup(name,
+                                             MAX_CGROUP_ROOT_NAMELEN,
+                                             GFP_KERNEL);
+                       if (!opts->name)
+                               return -ENOMEM;
                } else {
                        struct cgroup_subsys *ss;
                        int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
                }
        }
 
+       /* Consistency checks */
+
        /*
         * Option noprefix was introduced just for backward compatibility
         * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
            (opts->subsys_bits & mask))
                return -EINVAL;
 
-       /* We can't have an empty hierarchy */
-       if (!opts->subsys_bits)
+
+       /* Can't specify "none" and some subsystems */
+       if (opts->subsys_bits && opts->none)
+               return -EINVAL;
+
+       /*
+        * We either have to specify by name or by subsystems. (So all
+        * empty hierarchies must have a name).
+        */
+       if (!opts->subsys_bits && !opts->name)
                return -EINVAL;
 
        return 0;
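With the checks parse_cgroupfs_options() now performs, an empty hierarchy can be mounted as long as it is given a name matching [\w.-]+, while "none" cannot be combined with subsystem names. A hedged usage sketch via mount(2); the mount points and the name "tracking" are invented for the example.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* a named hierarchy with no subsystems bound to it */
        if (mount("cgroup", "/mnt/cg", "cgroup", 0, "none,name=tracking"))
                perror("mount named empty hierarchy");

        /* rejected by parse_cgroupfs_options(): "none" plus a subsystem */
        if (mount("cgroup", "/mnt/cg2", "cgroup", 0, "none,cpuset"))
                perror("mount none+cpuset (expected to fail)");
        return 0;
}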
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                goto out_unlock;
        }
 
+       /* Don't allow name to change at remount */
+       if (opts.name && strcmp(opts.name, root->name)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
        ret = rebind_subsystems(root, opts.subsys_bits);
        if (ret)
                goto out_unlock;
@@ -955,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
        kfree(opts.release_agent);
+       kfree(opts.name);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
        unlock_kernel();
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->children);
        INIT_LIST_HEAD(&cgrp->css_sets);
        INIT_LIST_HEAD(&cgrp->release_list);
-       INIT_LIST_HEAD(&cgrp->pids_list);
-       init_rwsem(&cgrp->pids_mutex);
+       INIT_LIST_HEAD(&cgrp->pidlists);
+       mutex_init(&cgrp->pidlist_mutex);
 }
+
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
        struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
        init_cgroup_housekeeping(cgrp);
 }
 
+static bool init_root_id(struct cgroupfs_root *root)
+{
+       int ret = 0;
+
+       do {
+               if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
+                       return false;
+               spin_lock(&hierarchy_id_lock);
+               /* Try to allocate the next unused ID */
+               ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
+                                       &root->hierarchy_id);
+               if (ret == -ENOSPC)
+                       /* Try again starting from 0 */
+                       ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
+               if (!ret) {
+                       next_hierarchy_id = root->hierarchy_id + 1;
+               } else if (ret != -EAGAIN) {
+                       /* Can only get here if the 31-bit IDR is full ... */
+                       BUG_ON(ret);
+               }
+               spin_unlock(&hierarchy_id_lock);
+       } while (ret);
+       return true;
+}
+
 static int cgroup_test_super(struct super_block *sb, void *data)
 {
-       struct cgroupfs_root *new = data;
+       struct cgroup_sb_opts *opts = data;
        struct cgroupfs_root *root = sb->s_fs_info;
 
-       /* First check subsystems */
-       if (new->subsys_bits != root->subsys_bits)
-           return 0;
+       /* If we asked for a name then it must match */
+       if (opts->name && strcmp(opts->name, root->name))
+               return 0;
 
-       /* Next check flags */
-       if (new->flags != root->flags)
+       /*
+        * If we asked for subsystems (or explicitly for no
+        * subsystems) then they must match
+        */
+       if ((opts->subsys_bits || opts->none)
+           && (opts->subsys_bits != root->subsys_bits))
                return 0;
 
        return 1;
 }
 
+static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+{
+       struct cgroupfs_root *root;
+
+       if (!opts->subsys_bits && !opts->none)
+               return NULL;
+
+       root = kzalloc(sizeof(*root), GFP_KERNEL);
+       if (!root)
+               return ERR_PTR(-ENOMEM);
+
+       if (!init_root_id(root)) {
+               kfree(root);
+               return ERR_PTR(-ENOMEM);
+       }
+       init_cgroup_root(root);
+
+       root->subsys_bits = opts->subsys_bits;
+       root->flags = opts->flags;
+       if (opts->release_agent)
+               strcpy(root->release_agent_path, opts->release_agent);
+       if (opts->name)
+               strcpy(root->name, opts->name);
+       return root;
+}
+
+static void cgroup_drop_root(struct cgroupfs_root *root)
+{
+       if (!root)
+               return;
+
+       BUG_ON(!root->hierarchy_id);
+       spin_lock(&hierarchy_id_lock);
+       ida_remove(&hierarchy_ida, root->hierarchy_id);
+       spin_unlock(&hierarchy_id_lock);
+       kfree(root);
+}
+
 static int cgroup_set_super(struct super_block *sb, void *data)
 {
        int ret;
-       struct cgroupfs_root *root = data;
+       struct cgroup_sb_opts *opts = data;
+
+       /* If we don't have a new root, we can't set up a new sb */
+       if (!opts->new_root)
+               return -EINVAL;
+
+       BUG_ON(!opts->subsys_bits && !opts->none);
 
        ret = set_anon_super(sb, NULL);
        if (ret)
                return ret;
 
-       sb->s_fs_info = root;
-       root->sb = sb;
+       sb->s_fs_info = opts->new_root;
+       opts->new_root->sb = sb;
 
        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                         void *data, struct vfsmount *mnt)
 {
        struct cgroup_sb_opts opts;
+       struct cgroupfs_root *root;
        int ret = 0;
        struct super_block *sb;
-       struct cgroupfs_root *root;
-       struct list_head tmp_cg_links;
+       struct cgroupfs_root *new_root;
 
        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
-       if (ret) {
-               kfree(opts.release_agent);
-               return ret;
-       }
-
-       root = kzalloc(sizeof(*root), GFP_KERNEL);
-       if (!root) {
-               kfree(opts.release_agent);
-               return -ENOMEM;
-       }
+       if (ret)
+               goto out_err;
 
-       init_cgroup_root(root);
-       root->subsys_bits = opts.subsys_bits;
-       root->flags = opts.flags;
-       if (opts.release_agent) {
-               strcpy(root->release_agent_path, opts.release_agent);
-               kfree(opts.release_agent);
+       /*
+        * Allocate a new cgroup root. We may not need it if we're
+        * reusing an existing hierarchy.
+        */
+       new_root = cgroup_root_from_opts(&opts);
+       if (IS_ERR(new_root)) {
+               ret = PTR_ERR(new_root);
+               goto out_err;
        }
+       opts.new_root = new_root;
 
-       sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
-
+       /* Locate an existing or new sb for this hierarchy */
+       sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
        if (IS_ERR(sb)) {
-               kfree(root);
-               return PTR_ERR(sb);
+               ret = PTR_ERR(sb);
+               cgroup_drop_root(opts.new_root);
+               goto out_err;
        }
 
-       if (sb->s_fs_info != root) {
-               /* Reusing an existing superblock */
-               BUG_ON(sb->s_root == NULL);
-               kfree(root);
-               root = NULL;
-       } else {
-               /* New superblock */
+       root = sb->s_fs_info;
+       BUG_ON(!root);
+       if (root == opts.new_root) {
+               /* We used the new root structure, so this is a new hierarchy */
+               struct list_head tmp_cg_links;
                struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
+               struct cgroupfs_root *existing_root;
                int i;
 
                BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                mutex_lock(&inode->i_mutex);
                mutex_lock(&cgroup_mutex);
 
+               if (strlen(root->name)) {
+                       /* Check for name clashes with existing mounts */
+                       for_each_active_root(existing_root) {
+                               if (!strcmp(existing_root->name, root->name)) {
+                                       ret = -EBUSY;
+                                       mutex_unlock(&cgroup_mutex);
+                                       mutex_unlock(&inode->i_mutex);
+                                       goto drop_new_super;
+                               }
+                       }
+               }
+
                /*
                 * We're accessing css_set_count without locking
                 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                if (ret == -EBUSY) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
-                       goto free_cg_links;
+                       free_cg_links(&tmp_cg_links);
+                       goto drop_new_super;
                }
 
                /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                BUG_ON(root->number_of_cgroups != 1);
 
                cgroup_populate_dir(root_cgrp);
-               mutex_unlock(&inode->i_mutex);
                mutex_unlock(&cgroup_mutex);
+               mutex_unlock(&inode->i_mutex);
+       } else {
+               /*
+                * We re-used an existing hierarchy - the new root (if
+                * any) is not needed
+                */
+               cgroup_drop_root(opts.new_root);
        }
 
        simple_set_mnt(mnt, sb);
+       kfree(opts.release_agent);
+       kfree(opts.name);
        return 0;
 
- free_cg_links:
-       free_cg_links(&tmp_cg_links);
  drop_new_super:
        deactivate_locked_super(sb);
+ out_err:
+       kfree(opts.release_agent);
+       kfree(opts.name);
+
        return ret;
 }
 
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
        mutex_unlock(&cgroup_mutex);
 
        kill_litter_super(sb);
-       kfree(root);
+       cgroup_drop_root(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        return 0;
 }
 
-/*
- * Return the first subsystem attached to a cgroup's hierarchy, and
- * its subsystem id.
- */
-
-static void get_first_subsys(const struct cgroup *cgrp,
-                       struct cgroup_subsys_state **css, int *subsys_id)
-{
-       const struct cgroupfs_root *root = cgrp->root;
-       const struct cgroup_subsys *test_ss;
-       BUG_ON(list_empty(&root->subsys_list));
-       test_ss = list_entry(root->subsys_list.next,
-                            struct cgroup_subsys, sibling);
-       if (css) {
-               *css = cgrp->subsys[test_ss->subsys_id];
-               BUG_ON(!*css);
-       }
-       if (subsys_id)
-               *subsys_id = test_ss->subsys_id;
-}
-
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        struct css_set *cg;
        struct css_set *newcg;
        struct cgroupfs_root *root = cgrp->root;
-       int subsys_id;
-
-       get_first_subsys(cgrp, NULL, &subsys_id);
 
        /* Nothing to do if the task is already in that cgroup */
-       oldcgrp = task_cgroup(tsk, subsys_id);
+       oldcgrp = task_cgroup_from_root(tsk, root);
        if (cgrp == oldcgrp)
                return 0;
 
        for_each_subsys(root, ss) {
                if (ss->can_attach) {
-                       retval = ss->can_attach(ss, cgrp, tsk);
+                       retval = ss->can_attach(ss, cgrp, tsk, false);
                        if (retval)
                                return retval;
                }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
        for_each_subsys(root, ss) {
                if (ss->attach)
-                       ss->attach(ss, cgrp, oldcgrp, tsk);
+                       ss->attach(ss, cgrp, oldcgrp, tsk, false);
        }
        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
        synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
        return ret;
 }
 
-/* The various types of files and directories in a cgroup file system */
-enum cgroup_filetype {
-       FILE_ROOT,
-       FILE_DIR,
-       FILE_TASKLIST,
-       FILE_NOTIFY_ON_RELEASE,
-       FILE_RELEASE_AGENT,
-};
-
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
@@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
  * the start of a css_set
  */
 static void cgroup_advance_iter(struct cgroup *cgrp,
-                                         struct cgroup_iter *it)
+                               struct cgroup_iter *it)
 {
        struct list_head *l = it->cg_link;
        struct cg_cgroup_link *link;
@@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 }
 
 /*
- * Stuff for reading the 'tasks' file.
+ * Stuff for reading the 'tasks'/'procs' files.
  *
  * Reading this file can return large amounts of data if a cgroup has
  * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 
 /*
- * Load into 'pidarray' up to 'npids' of the tasks using cgroup
- * 'cgrp'.  Return actual number of pids loaded.  No need to
- * task_lock(p) when reading out p->cgroup, since we're in an RCU
- * read section, so the css_set can't go away, and is
- * immutable after creation.
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+       if (PIDLIST_TOO_LARGE(count))
+               return vmalloc(count * sizeof(pid_t));
+       else
+               return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+       if (is_vmalloc_addr(p))
+               vfree(p);
+       else
+               kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+       void *newlist;
+       /* note: if new alloc fails, old p will still be valid either way */
+       if (is_vmalloc_addr(p)) {
+               newlist = vmalloc(newcount * sizeof(pid_t));
+               if (!newlist)
+                       return NULL;
+               memcpy(newlist, p, newcount * sizeof(pid_t));
+               vfree(p);
+       } else {
+               newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+       }
+       return newlist;
+}
+
+/*
+ * pidlist_uniq - given a sorted pid list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
+ */
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
+{
+       int src, dest = 1;
+       pid_t *list = *p;
+       pid_t *newlist;
+
+       /*
+        * we presume the 0th element is unique, so src starts at 1. trivial
+        * edge cases first; no work needs to be done for either
+        */
+       if (length == 0 || length == 1)
+               return length;
+       /* src and dest walk down the list; dest counts unique elements */
+       for (src = 1; src < length; src++) {
+               /* find next unique element */
+               while (list[src] == list[src-1]) {
+                       src++;
+                       if (src == length)
+                               goto after;
+               }
+               /* dest always points to where the next unique element goes */
+               list[dest] = list[src];
+               dest++;
+       }
+after:
+       /*
+        * if the length difference is large enough, we want to allocate a
+        * smaller buffer to save memory. if this fails due to out of memory,
+        * we'll just stay with what we've got.
+        */
+       if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+               newlist = pidlist_resize(list, dest);
+               if (newlist)
+                       *p = newlist;
+       }
+       return dest;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+       return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
  */
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+                                                 enum cgroup_filetype type)
 {
-       int n = 0, pid;
+       struct cgroup_pidlist *l;
+       /* don't need task_nsproxy() if we're looking at ourself */
+       struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+       /*
+        * We can't drop the pidlist_mutex before taking the l->mutex in case
+        * the last ref-holder is trying to remove l from the list at the same
+        * time. Holding the pidlist_mutex precludes somebody taking whichever
+        * list we find out from under us - compare release_pid_array().
+        */
+       mutex_lock(&cgrp->pidlist_mutex);
+       list_for_each_entry(l, &cgrp->pidlists, links) {
+               if (l->key.type == type && l->key.ns == ns) {
+                       /* found a matching list - drop the extra refcount */
+                       put_pid_ns(ns);
+                       /* make sure l doesn't vanish out from under us */
+                       down_write(&l->mutex);
+                       mutex_unlock(&cgrp->pidlist_mutex);
+                       l->use_count++;
+                       return l;
+               }
+       }
+       /* entry not found; create a new one */
+       l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+       if (!l) {
+               mutex_unlock(&cgrp->pidlist_mutex);
+               put_pid_ns(ns);
+               return l;
+       }
+       init_rwsem(&l->mutex);
+       down_write(&l->mutex);
+       l->key.type = type;
+       l->key.ns = ns;
+       l->use_count = 0; /* don't increment here */
+       l->list = NULL;
+       l->owner = cgrp;
+       list_add(&l->links, &cgrp->pidlists);
+       mutex_unlock(&cgrp->pidlist_mutex);
+       return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+                             struct cgroup_pidlist **lp)
+{
+       pid_t *array;
+       int length;
+       int pid, n = 0; /* used for populating the array */
        struct cgroup_iter it;
        struct task_struct *tsk;
+       struct cgroup_pidlist *l;
+
+       /*
+        * If cgroup gets more users after we read count, we won't have
+        * enough space - tough.  This race is indistinguishable to the
+        * caller from the case that the additional cgroup users didn't
+        * show up until sometime later on.
+        */
+       length = cgroup_task_count(cgrp);
+       array = pidlist_allocate(length);
+       if (!array)
+               return -ENOMEM;
+       /* now, populate the array */
        cgroup_iter_start(cgrp, &it);
        while ((tsk = cgroup_iter_next(cgrp, &it))) {
-               if (unlikely(n == npids))
+               if (unlikely(n == length))
                        break;
-               pid = task_pid_vnr(tsk);
-               if (pid > 0)
-                       pidarray[n++] = pid;
+               /* get tgid or pid for procs or tasks file respectively */
+               if (type == CGROUP_FILE_PROCS)
+                       pid = task_tgid_vnr(tsk);
+               else
+                       pid = task_pid_vnr(tsk);
+               if (pid > 0) /* make sure to only use valid results */
+                       array[n++] = pid;
        }
        cgroup_iter_end(cgrp, &it);
-       return n;
+       length = n;
+       /* now sort & (if procs) strip out duplicates */
+       sort(array, length, sizeof(pid_t), cmppid, NULL);
+       if (type == CGROUP_FILE_PROCS)
+               length = pidlist_uniq(&array, length);
+       l = cgroup_pidlist_find(cgrp, type);
+       if (!l) {
+               pidlist_free(array);
+               return -ENOMEM;
+       }
+       /* store array, freeing old if necessary - lock already held */
+       pidlist_free(l->list);
+       l->list = array;
+       l->length = length;
+       l->use_count++;
+       up_write(&l->mutex);
+       *lp = l;
+       return 0;
 }
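pidlist_array_load() above first sorts the raw array with cmppid() and, for the procs file, strips adjacent duplicates with pidlist_uniq(). A stand-alone user-space sketch of that sort-then-dedup step, simplified: plain int instead of pid_t, qsort() instead of the kernel's sort(), and no buffer shrinking.

#include <stdio.h>
#include <stdlib.h>

static int cmppid(const void *a, const void *b)
{
        return *(const int *)a - *(const int *)b;
}

/* in-place dedup of a sorted array; returns the new length,
 * mirroring what pidlist_uniq() does for thread-group ids */
static int uniq(int *list, int length)
{
        int src, dest = 1;

        if (length < 2)
                return length;
        for (src = 1; src < length; src++)
                if (list[src] != list[src - 1])
                        list[dest++] = list[src];
        return dest;
}

int main(void)
{
        int tgids[] = { 311, 100, 100, 204, 100, 204 };
        int n = sizeof(tgids) / sizeof(tgids[0]);

        qsort(tgids, n, sizeof(int), cmppid);
        n = uniq(tgids, n);
        for (int i = 0; i < n; i++)
                printf("%d\n", tgids[i]);       /* prints 100, 204, 311 */
        return 0;
}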
 
 /**
@@ -2216,37 +2604,14 @@ err:
        return ret;
 }
 
-/*
- * Cache pids for all threads in the same pid namespace that are
- * opening the same "tasks" file.
- */
-struct cgroup_pids {
-       /* The node in cgrp->pids_list */
-       struct list_head list;
-       /* The cgroup those pids belong to */
-       struct cgroup *cgrp;
-       /* The namepsace those pids belong to */
-       struct pid_namespace *ns;
-       /* Array of process ids in the cgroup */
-       pid_t *tasks_pids;
-       /* How many files are using the this tasks_pids array */
-       int use_count;
-       /* Length of the current tasks_pids array */
-       int length;
-};
-
-static int cmppid(const void *a, const void *b)
-{
-       return *(pid_t *)a - *(pid_t *)b;
-}
 
 /*
- * seq_file methods for the "tasks" file. The seq_file position is the
+ * seq_file methods for the tasks/procs files. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->tasks_pids array.
+ * in the cgroup->l->list array.
  */
 
-static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 {
        /*
         * Initially we receive a position value that corresponds to
@@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
         * after a seek to the start). Use a binary-search to find the
         * next pid to display, if any
         */
-       struct cgroup_pids *cp = s->private;
-       struct cgroup *cgrp = cp->cgrp;
+       struct cgroup_pidlist *l = s->private;
        int index = 0, pid = *pos;
        int *iter;
 
-       down_read(&cgrp->pids_mutex);
+       down_read(&l->mutex);
        if (pid) {
-               int end = cp->length;
+               int end = l->length;
 
                while (index < end) {
                        int mid = (index + end) / 2;
-                       if (cp->tasks_pids[mid] == pid) {
+                       if (l->list[mid] == pid) {
                                index = mid;
                                break;
-                       } else if (cp->tasks_pids[mid] <= pid)
+                       } else if (l->list[mid] <= pid)
                                index = mid + 1;
                        else
                                end = mid;
                }
        }
        /* If we're off the end of the array, we're done */
-       if (index >= cp->length)
+       if (index >= l->length)
                return NULL;
        /* Update the abstract position to be the actual pid that we found */
-       iter = cp->tasks_pids + index;
+       iter = l->list + index;
        *pos = *iter;
        return iter;
 }
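The binary search in cgroup_pidlist_start() resumes the seq_file at the saved pid if it is still in the list, or at the next larger pid otherwise. A small user-space sketch of that resume rule; the helper name and array contents are made up.

#include <stdio.h>

/* given a sorted pid array and the last position shown, return the index
 * of that pid if still present, else of the next larger one (or len) */
static int resume_index(const int *list, int len, int pos)
{
        int index = 0, end = len;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pos)
                        return mid;
                else if (list[mid] <= pos)
                        index = mid + 1;
                else
                        end = mid;
        }
        return index;   /* may equal len: off the end, iteration stops */
}

int main(void)
{
        int pids[] = { 3, 8, 15, 42, 99 };

        printf("%d\n", resume_index(pids, 5, 15));  /* 2: pid still there */
        printf("%d\n", resume_index(pids, 5, 20));  /* 3: next larger pid */
        return 0;
}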
 
-static void cgroup_tasks_stop(struct seq_file *s, void *v)
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-       struct cgroup_pids *cp = s->private;
-       struct cgroup *cgrp = cp->cgrp;
-       up_read(&cgrp->pids_mutex);
+       struct cgroup_pidlist *l = s->private;
+       up_read(&l->mutex);
 }
 
-static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-       struct cgroup_pids *cp = s->private;
-       int *p = v;
-       int *end = cp->tasks_pids + cp->length;
-
+       struct cgroup_pidlist *l = s->private;
+       pid_t *p = v;
+       pid_t *end = l->list + l->length;
        /*
         * Advance to the next pid in the array. If this goes off the
         * end, we're done
@@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
        }
 }
 
-static int cgroup_tasks_show(struct seq_file *s, void *v)
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
 {
        return seq_printf(s, "%d\n", *(int *)v);
 }
 
-static const struct seq_operations cgroup_tasks_seq_operations = {
-       .start = cgroup_tasks_start,
-       .stop = cgroup_tasks_stop,
-       .next = cgroup_tasks_next,
-       .show = cgroup_tasks_show,
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+       .start = cgroup_pidlist_start,
+       .stop = cgroup_pidlist_stop,
+       .next = cgroup_pidlist_next,
+       .show = cgroup_pidlist_show,
 };
 
-static void release_cgroup_pid_array(struct cgroup_pids *cp)
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-       struct cgroup *cgrp = cp->cgrp;
-
-       down_write(&cgrp->pids_mutex);
-       BUG_ON(!cp->use_count);
-       if (!--cp->use_count) {
-               list_del(&cp->list);
-               put_pid_ns(cp->ns);
-               kfree(cp->tasks_pids);
-               kfree(cp);
+       /*
+        * the case where we're the last user of this particular pidlist will
+        * have us remove it from the cgroup's list, which entails taking the
+        * mutex. since cgroup_pidlist_find() takes the pidlist's mutex while
+        * holding cgroup->pidlist_mutex, we have to take pidlist_mutex first.
+        */
+       mutex_lock(&l->owner->pidlist_mutex);
+       down_write(&l->mutex);
+       BUG_ON(!l->use_count);
+       if (!--l->use_count) {
+               /* we're the last user if refcount is 0; remove and free */
+               list_del(&l->links);
+               mutex_unlock(&l->owner->pidlist_mutex);
+               pidlist_free(l->list);
+               put_pid_ns(l->key.ns);
+               up_write(&l->mutex);
+               kfree(l);
+               return;
        }
-       up_write(&cgrp->pids_mutex);
+       mutex_unlock(&l->owner->pidlist_mutex);
+       up_write(&l->mutex);
 }
 
-static int cgroup_tasks_release(struct inode *inode, struct file *file)
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
 {
-       struct seq_file *seq;
-       struct cgroup_pids *cp;
-
+       struct cgroup_pidlist *l;
        if (!(file->f_mode & FMODE_READ))
                return 0;
-
-       seq = file->private_data;
-       cp = seq->private;
-
-       release_cgroup_pid_array(cp);
+       /*
+        * the seq_file will only be initialized if the file was opened for
+        * reading; hence we check if it's not null only in that case.
+        */
+       l = ((struct seq_file *)file->private_data)->private;
+       cgroup_release_pid_array(l);
        return seq_release(inode, file);
 }
 
-static struct file_operations cgroup_tasks_operations = {
+static const struct file_operations cgroup_pidlist_operations = {
        .read = seq_read,
        .llseek = seq_lseek,
        .write = cgroup_file_write,
-       .release = cgroup_tasks_release,
+       .release = cgroup_pidlist_release,
 };
 
 /*
- * Handle an open on 'tasks' file.  Prepare an array containing the
- * process id's of tasks currently attached to the cgroup being opened.
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
  */
-
-static int cgroup_tasks_open(struct inode *unused, struct file *file)
+/* helper function for the two below it */
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-       struct pid_namespace *ns = current->nsproxy->pid_ns;
-       struct cgroup_pids *cp;
-       pid_t *pidarray;
-       int npids;
+       struct cgroup_pidlist *l;
        int retval;
 
        /* Nothing to do for write-only files */
        if (!(file->f_mode & FMODE_READ))
                return 0;
 
-       /*
-        * If cgroup gets more users after we read count, we won't have
-        * enough space - tough.  This race is indistinguishable to the
-        * caller from the case that the additional cgroup users didn't
-        * show up until sometime later on.
-        */
-       npids = cgroup_task_count(cgrp);
-       pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-       if (!pidarray)
-               return -ENOMEM;
-       npids = pid_array_load(pidarray, npids, cgrp);
-       sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-       /*
-        * Store the array in the cgroup, freeing the old
-        * array if necessary
-        */
-       down_write(&cgrp->pids_mutex);
-
-       list_for_each_entry(cp, &cgrp->pids_list, list) {
-               if (ns == cp->ns)
-                       goto found;
-       }
-
-       cp = kzalloc(sizeof(*cp), GFP_KERNEL);
-       if (!cp) {
-               up_write(&cgrp->pids_mutex);
-               kfree(pidarray);
-               return -ENOMEM;
-       }
-       cp->cgrp = cgrp;
-       cp->ns = ns;
-       get_pid_ns(ns);
-       list_add(&cp->list, &cgrp->pids_list);
-found:
-       kfree(cp->tasks_pids);
-       cp->tasks_pids = pidarray;
-       cp->length = npids;
-       cp->use_count++;
-       up_write(&cgrp->pids_mutex);
-
-       file->f_op = &cgroup_tasks_operations;
+       /* have the array populated */
+       retval = pidlist_array_load(cgrp, type, &l);
+       if (retval)
+               return retval;
+       /* configure file information */
+       file->f_op = &cgroup_pidlist_operations;
 
-       retval = seq_open(file, &cgroup_tasks_seq_operations);
+       retval = seq_open(file, &cgroup_pidlist_seq_operations);
        if (retval) {
-               release_cgroup_pid_array(cp);
+               cgroup_release_pid_array(l);
                return retval;
        }
-       ((struct seq_file *)file->private_data)->private = cp;
+       ((struct seq_file *)file->private_data)->private = l;
        return 0;
 }
+static int cgroup_tasks_open(struct inode *unused, struct file *file)
+{
+       return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)
+{
+       return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
+}
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
                                            struct cftype *cft)
@@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 /*
  * for the common functions, 'private' gives the type of file
  */
+/* for hysterical raisins, we can't put this on the older files */
+#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
 static struct cftype files[] = {
        {
                .name = "tasks",
                .open = cgroup_tasks_open,
                .write_u64 = cgroup_tasks_write,
-               .release = cgroup_tasks_release,
-               .private = FILE_TASKLIST,
+               .release = cgroup_pidlist_release,
                .mode = S_IRUGO | S_IWUSR,
        },
-
+       {
+               .name = CGROUP_FILE_GENERIC_PREFIX "procs",
+               .open = cgroup_procs_open,
+               /* .write_u64 = cgroup_procs_write, TODO */
+               .release = cgroup_pidlist_release,
+               .mode = S_IRUGO,
+       },
        {
                .name = "notify_on_release",
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
-               .private = FILE_NOTIFY_ON_RELEASE,
        },
 };
 
@@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = {
        .read_seq_string = cgroup_release_agent_show,
        .write_string = cgroup_release_agent_write,
        .max_write_len = PATH_MAX,
-       .private = FILE_RELEASE_AGENT,
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void)
        init_task.cgroups = &init_css_set;
 
        init_css_set_link.cg = &init_css_set;
+       init_css_set_link.cgrp = dummytop;
        list_add(&init_css_set_link.cgrp_link_list,
                 &rootnode.top_cgroup.css_sets);
        list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3284,7 @@ int __init cgroup_init(void)
        /* Add init_css_set to the hash table */
        hhead = css_set_hash(init_css_set.subsys);
        hlist_add_head(&init_css_set.hlist, hhead);
-
+       BUG_ON(!init_root_id(&rootnode));
        err = register_filesystem(&cgroup_fs_type);
        if (err < 0)
                goto out;
@@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
        for_each_active_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
-               int subsys_id;
                int count = 0;
 
-               seq_printf(m, "%lu:", root->subsys_bits);
+               seq_printf(m, "%d:", root->hierarchy_id);
                for_each_subsys(root, ss)
                        seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+               if (strlen(root->name))
+                       seq_printf(m, "%sname=%s", count ? "," : "",
+                                  root->name);
                seq_putc(m, ':');
-               get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
-               cgrp = task_cgroup(tsk, subsys_id);
+               cgrp = task_cgroup_from_root(tsk, root);
                retval = cgroup_path(cgrp, buf, PAGE_SIZE);
                if (retval < 0)
                        goto out_unlock;
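
With the switch from subsys_bits to hierarchy_id (plus the optional name= tag), each line of /proc/<pid>/cgroup now reads "hierarchy-id:controllers[,name=NAME]:path". A minimal sketch that splits those fields, assuming nothing beyond the format printed above:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
                perror("/proc/self/cgroup");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                char *id = line;
                char *controllers = strchr(id, ':');
                char *path;

                if (!controllers)
                        continue;
                *controllers++ = '\0';
                path = strchr(controllers, ':');
                if (!path)
                        continue;
                *path++ = '\0';
                /* path still carries the trailing newline from fgets() */
                printf("hierarchy %s, controllers \"%s\", path %s",
                       id, controllers, path);
        }
        fclose(f);
        return 0;
}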
@@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
        mutex_lock(&cgroup_mutex);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
-               seq_printf(m, "%s\t%lu\t%d\t%d\n",
-                          ss->name, ss->root->subsys_bits,
+               seq_printf(m, "%s\t%d\t%d\t%d\n",
+                          ss->name, ss->root->hierarchy_id,
                           ss->root->number_of_cgroups, !ss->disabled);
        }
        mutex_unlock(&cgroup_mutex);
@@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
        int ret;
        struct cgroup *target;
-       int subsys_id;
 
        if (cgrp == dummytop)
                return 1;
 
-       get_first_subsys(cgrp, NULL, &subsys_id);
-       target = task_cgroup(task, subsys_id);
+       target = task_cgroup_from_root(task, cgrp->root);
        while (cgrp != target && cgrp != cgrp->top_cgroup)
                cgrp = cgrp->parent;
        ret = (cgrp == target);
@@ -3693,3 +4043,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
        return ret;
 }
 
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
+                                                  struct cgroup *cont)
+{
+       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+       if (!css)
+               return ERR_PTR(-ENOMEM);
+
+       return css;
+}
+
+static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+       kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+{
+       return atomic_read(&cont->count);
+}
+
+static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+{
+       return cgroup_task_count(cont);
+}
+
+static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+{
+       return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup *cont,
+                                          struct cftype *cft)
+{
+       u64 count;
+
+       rcu_read_lock();
+       count = atomic_read(&current->cgroups->refcount);
+       rcu_read_unlock();
+       return count;
+}
+
+static int current_css_set_cg_links_read(struct cgroup *cont,
+                                        struct cftype *cft,
+                                        struct seq_file *seq)
+{
+       struct cg_cgroup_link *link;
+       struct css_set *cg;
+
+       read_lock(&css_set_lock);
+       rcu_read_lock();
+       cg = rcu_dereference(current->cgroups);
+       list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+               struct cgroup *c = link->cgrp;
+               const char *name;
+
+               if (c->dentry)
+                       name = c->dentry->d_name.name;
+               else
+                       name = "?";
+               seq_printf(seq, "Root %d group %s\n",
+                          c->root->hierarchy_id, name);
+       }
+       rcu_read_unlock();
+       read_unlock(&css_set_lock);
+       return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct cgroup *cont,
+                                struct cftype *cft,
+                                struct seq_file *seq)
+{
+       struct cg_cgroup_link *link;
+
+       read_lock(&css_set_lock);
+       list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
+               struct css_set *cg = link->cg;
+               struct task_struct *task;
+               int count = 0;
+               seq_printf(seq, "css_set %p\n", cg);
+               list_for_each_entry(task, &cg->tasks, cg_list) {
+                       if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
+                               seq_puts(seq, "  ...\n");
+                               break;
+                       } else {
+                               seq_printf(seq, "  task %d\n",
+                                          task_pid_vnr(task));
+                       }
+               }
+       }
+       read_unlock(&css_set_lock);
+       return 0;
+}
+
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
+static struct cftype debug_files[] =  {
+       {
+               .name = "cgroup_refcount",
+               .read_u64 = cgroup_refcount_read,
+       },
+       {
+               .name = "taskcount",
+               .read_u64 = debug_taskcount_read,
+       },
+
+       {
+               .name = "current_css_set",
+               .read_u64 = current_css_set_read,
+       },
+
+       {
+               .name = "current_css_set_refcount",
+               .read_u64 = current_css_set_refcount_read,
+       },
+
+       {
+               .name = "current_css_set_cg_links",
+               .read_seq_string = current_css_set_cg_links_read,
+       },
+
+       {
+               .name = "cgroup_css_links",
+               .read_seq_string = cgroup_css_links_read,
+       },
+
+       {
+               .name = "releasable",
+               .read_u64 = releasable_read,
+       },
+};
+
+static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+       return cgroup_add_files(cont, ss, debug_files,
+                               ARRAY_SIZE(debug_files));
+}
+
+struct cgroup_subsys debug_subsys = {
+       .name = "debug",
+       .create = debug_create,
+       .destroy = debug_destroy,
+       .populate = debug_populate,
+       .subsys_id = debug_subsys_id,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
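
Poking at these files requires CONFIG_CGROUP_DEBUG and a hierarchy with the debug subsystem attached; the mount point below is purely illustrative. This is the C equivalent of "mount -t cgroup -o debug none /mnt/debug-cgroup":

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        const char *target = "/mnt/debug-cgroup";       /* must already exist */

        if (mount("none", target, "cgroup", 0, "debug")) {
                perror("mount");
                return 1;
        }
        printf("debug files (taskcount, current_css_set, ...) are now under %s\n",
               target);
        return 0;
}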
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644 (file)
index 0c92d79..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * kernel/cgroup_debug.c - Example cgroup subsystem that
- * exposes debug info
- *
- * Copyright (C) Google Inc, 2007
- *
- * Developed by Paul Menage (menage@google.com)
- *
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-
-#include <asm/atomic.h>
-
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
-                                                  struct cgroup *cont)
-{
-       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-       if (!css)
-               return ERR_PTR(-ENOMEM);
-
-       return css;
-}
-
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-       kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
-       return atomic_read(&cont->count);
-}
-
-static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
-       u64 count;
-
-       count = cgroup_task_count(cont);
-       return count;
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
-{
-       return (u64)(long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup *cont,
-                                          struct cftype *cft)
-{
-       u64 count;
-
-       rcu_read_lock();
-       count = atomic_read(&current->cgroups->refcount);
-       rcu_read_unlock();
-       return count;
-}
-
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
-static struct cftype files[] =  {
-       {
-               .name = "cgroup_refcount",
-               .read_u64 = cgroup_refcount_read,
-       },
-       {
-               .name = "taskcount",
-               .read_u64 = taskcount_read,
-       },
-
-       {
-               .name = "current_css_set",
-               .read_u64 = current_css_set_read,
-       },
-
-       {
-               .name = "current_css_set_refcount",
-               .read_u64 = current_css_set_refcount_read,
-       },
-
-       {
-               .name = "releasable",
-               .read_u64 = releasable_read,
-       },
-};
-
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-       return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys debug_subsys = {
-       .name = "debug",
-       .create = debug_create,
-       .destroy = debug_destroy,
-       .populate = debug_populate,
-       .subsys_id = debug_subsys_id,
-};
index fb249e2bcada880a19105488c7fdde33e9483126..59e9ef6aab4002e1d99170f50156e733e8f46343 100644 (file)
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
                              struct cgroup *new_cgroup,
-                             struct task_struct *task)
+                             struct task_struct *task, bool threadgroup)
 {
        struct freezer *freezer;
 
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
        if (freezer->state == CGROUP_FROZEN)
                return -EBUSY;
 
+       if (threadgroup) {
+               struct task_struct *c;
+
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+                       if (is_task_frozen_enough(c)) {
+                               rcu_read_unlock();
+                               return -EBUSY;
+                       }
+               }
+               rcu_read_unlock();
+       }
+
        return 0;
 }
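
The set of tasks walked when threadgroup is true is every thread on task->thread_group, i.e. the whole thread group of the leader being attached. From userspace the same set is visible as the entries of /proc/<pid>/task, which this small sketch lists (nothing here is specific to the patch; it only illustrates what "whole threadgroup" means):

#include <dirent.h>
#include <stdio.h>

int main(int argc, char **argv)
{
        char path[64];
        struct dirent *de;
        DIR *d;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/task", argv[1]);
        d = opendir(path);
        if (!d) {
                perror(path);
                return 1;
        }
        while ((de = readdir(d)) != NULL)
                if (de->d_name[0] != '.')       /* skip "." and ".." */
                        printf("thread %s\n", de->d_name);
        closedir(d);
        return 0;
}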
 
index 7e75a41bd50855caf3a33109bed11a52fa967f80..b5cb469d25456b03292d27e22ece4508ddba1ca2 100644 (file)
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 static cpumask_var_t cpus_attach;
 
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss,
-                            struct cgroup *cont, struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+                            struct task_struct *tsk, bool threadgroup)
 {
+       int ret;
        struct cpuset *cs = cgroup_cs(cont);
 
        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
        if (tsk->flags & PF_THREAD_BOUND)
                return -EINVAL;
 
-       return security_task_setscheduler(tsk, 0, NULL);
+       ret = security_task_setscheduler(tsk, 0, NULL);
+       if (ret)
+               return ret;
+       if (threadgroup) {
+               struct task_struct *c;
+
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+                       ret = security_task_setscheduler(c, 0, NULL);
+                       if (ret) {
+                               rcu_read_unlock();
+                               return ret;
+                       }
+               }
+               rcu_read_unlock();
+       }
+       return 0;
+}
+
+static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
+                              struct cpuset *cs)
+{
+       int err;
+       /*
+        * can_attach beforehand should guarantee that this doesn't fail.
+        * TODO: have a better way to handle failure here
+        */
+       err = set_cpus_allowed_ptr(tsk, cpus_attach);
+       WARN_ON_ONCE(err);
+
+       task_lock(tsk);
+       cpuset_change_task_nodemask(tsk, to);
+       task_unlock(tsk);
+       cpuset_update_task_spread_flag(cs, tsk);
+
 }
 
-static void cpuset_attach(struct cgroup_subsys *ss,
-                         struct cgroup *cont, struct cgroup *oldcont,
-                         struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+                         struct cgroup *oldcont, struct task_struct *tsk,
+                         bool threadgroup)
 {
        nodemask_t from, to;
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
-       int err;
 
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
                guarantee_online_cpus(cs, cpus_attach);
                guarantee_online_mems(cs, &to);
        }
-       err = set_cpus_allowed_ptr(tsk, cpus_attach);
-       if (err)
-               return;
 
-       task_lock(tsk);
-       cpuset_change_task_nodemask(tsk, &to);
-       task_unlock(tsk);
-       cpuset_update_task_spread_flag(cs, tsk);
+       /* do per-task migration stuff possibly for each in the threadgroup */
+       cpuset_attach_task(tsk, &to, cs);
+       if (threadgroup) {
+               struct task_struct *c;
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+                       cpuset_attach_task(c, &to, cs);
+               }
+               rcu_read_unlock();
+       }
 
+       /* change mm; only needs to be done once even if threadgroup */
        from = oldcs->mems_allowed;
        to = cs->mems_allowed;
        mm = get_task_mm(tsk);
index 60d6fdcc926509da90ebe38b4e59c275290e8d9f..5859f598c951bde881a9e7ce1c29d19fd66b3463 100644 (file)
@@ -976,8 +976,6 @@ NORET_TYPE void do_exit(long code)
                disassociate_ctty(1);
 
        module_put(task_thread_info(tsk)->exec_domain->module);
-       if (tsk->binfmt)
-               module_put(tsk->binfmt->module);
 
        proc_exit_connector(tsk);
 
@@ -1097,28 +1095,28 @@ struct wait_opts {
        int __user              *wo_stat;
        struct rusage __user    *wo_rusage;
 
+       wait_queue_t            child_wait;
        int                     notask_error;
 };
 
-static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+static inline
+struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 {
-       struct pid *pid = NULL;
-       if (type == PIDTYPE_PID)
-               pid = task->pids[type].pid;
-       else if (type < PIDTYPE_MAX)
-               pid = task->group_leader->pids[type].pid;
-       return pid;
+       if (type != PIDTYPE_PID)
+               task = task->group_leader;
+       return task->pids[type].pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
-       int err;
-
-       if (wo->wo_type < PIDTYPE_MAX) {
-               if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
-                       return 0;
-       }
+       return  wo->wo_type == PIDTYPE_MAX ||
+               task_pid_type(p, wo->wo_type) == wo->wo_pid;
+}
 
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+{
+       if (!eligible_pid(wo, p))
+               return 0;
        /* Wait for all children (clone and not) if __WALL is set;
         * otherwise, wait for clone children *only* if __WCLONE is
         * set; otherwise, wait for non-clone children *only*.  (Note:
@@ -1128,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
            && !(wo->wo_flags & __WALL))
                return 0;
 
-       err = security_task_wait(p);
-       if (err)
-               return err;
-
        return 1;
 }
 
@@ -1144,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 
        put_task_struct(p);
        infop = wo->wo_info;
-       if (!retval)
-               retval = put_user(SIGCHLD, &infop->si_signo);
-       if (!retval)
-               retval = put_user(0, &infop->si_errno);
-       if (!retval)
-               retval = put_user((short)why, &infop->si_code);
-       if (!retval)
-               retval = put_user(pid, &infop->si_pid);
-       if (!retval)
-               retval = put_user(uid, &infop->si_uid);
-       if (!retval)
-               retval = put_user(status, &infop->si_status);
+       if (infop) {
+               if (!retval)
+                       retval = put_user(SIGCHLD, &infop->si_signo);
+               if (!retval)
+                       retval = put_user(0, &infop->si_errno);
+               if (!retval)
+                       retval = put_user((short)why, &infop->si_code);
+               if (!retval)
+                       retval = put_user(pid, &infop->si_pid);
+               if (!retval)
+                       retval = put_user(uid, &infop->si_uid);
+               if (!retval)
+                       retval = put_user(status, &infop->si_status);
+       }
        if (!retval)
                retval = pid;
        return retval;
@@ -1485,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
  * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
-                               int ptrace, struct task_struct *p)
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+                               struct task_struct *p)
 {
        int ret = eligible_child(wo, p);
        if (!ret)
                return ret;
 
+       ret = security_task_wait(p);
        if (unlikely(ret < 0)) {
                /*
                 * If we have not yet seen any eligible child,
@@ -1553,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
                 * Do not consider detached threads.
                 */
                if (!task_detached(p)) {
-                       int ret = wait_consider_task(wo, tsk, 0, p);
+                       int ret = wait_consider_task(wo, 0, p);
                        if (ret)
                                return ret;
                }
@@ -1567,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
        struct task_struct *p;
 
        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-               int ret = wait_consider_task(wo, tsk, 1, p);
+               int ret = wait_consider_task(wo, 1, p);
                if (ret)
                        return ret;
        }
@@ -1575,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
        return 0;
 }
 
+static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+                               int sync, void *key)
+{
+       struct wait_opts *wo = container_of(wait, struct wait_opts,
+                                               child_wait);
+       struct task_struct *p = key;
+
+       if (!eligible_pid(wo, p))
+               return 0;
+
+       if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
+               return 0;
+
+       return default_wake_function(wait, mode, sync, key);
+}
+
+void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
+{
+       __wake_up_sync_key(&parent->signal->wait_chldexit,
+                               TASK_INTERRUPTIBLE, 1, p);
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-       DECLARE_WAITQUEUE(wait, current);
        struct task_struct *tsk;
        int retval;
 
        trace_sched_process_wait(wo->wo_pid);
 
-       add_wait_queue(&current->signal->wait_chldexit,&wait);
+       init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+       wo->child_wait.private = current;
+       add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
@@ -1624,32 +1644,7 @@ notask:
        }
 end:
        __set_current_state(TASK_RUNNING);
-       remove_wait_queue(&current->signal->wait_chldexit,&wait);
-       if (wo->wo_info) {
-               struct siginfo __user *infop = wo->wo_info;
-
-               if (retval > 0)
-                       retval = 0;
-               else {
-                       /*
-                        * For a WNOHANG return, clear out all the fields
-                        * we would set so the user can easily tell the
-                        * difference.
-                        */
-                       if (!retval)
-                               retval = put_user(0, &infop->si_signo);
-                       if (!retval)
-                               retval = put_user(0, &infop->si_errno);
-                       if (!retval)
-                               retval = put_user(0, &infop->si_code);
-                       if (!retval)
-                               retval = put_user(0, &infop->si_pid);
-                       if (!retval)
-                               retval = put_user(0, &infop->si_uid);
-                       if (!retval)
-                               retval = put_user(0, &infop->si_status);
-               }
-       }
+       remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
 }
 
@@ -1694,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
        wo.wo_stat      = NULL;
        wo.wo_rusage    = ru;
        ret = do_wait(&wo);
+
+       if (ret > 0) {
+               ret = 0;
+       } else if (infop) {
+               /*
+                * For a WNOHANG return, clear out all the fields
+                * we would set so the user can easily tell the
+                * difference.
+                */
+               if (!ret)
+                       ret = put_user(0, &infop->si_signo);
+               if (!ret)
+                       ret = put_user(0, &infop->si_errno);
+               if (!ret)
+                       ret = put_user(0, &infop->si_code);
+               if (!ret)
+                       ret = put_user(0, &infop->si_pid);
+               if (!ret)
+                       ret = put_user(0, &infop->si_uid);
+               if (!ret)
+                       ret = put_user(0, &infop->si_status);
+       }
+
        put_pid(pid);
 
        /* avoid REGPARM breakage on x86: */
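
The behaviour being preserved here is visible directly from userspace: with WNOHANG and no child ready, waitid() succeeds but leaves the siginfo fields zeroed so callers can tell it apart from a real exit. A small self-contained check using only standard POSIX calls:

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        siginfo_t info = { 0 };

        if (fork() == 0) {              /* child: not ready to be reaped yet */
                sleep(2);
                _exit(0);
        }
        if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) != 0) {
                perror("waitid");
                return 1;
        }
        if (info.si_pid == 0)
                printf("no child has exited yet; fields were left zeroed\n");
        else
                printf("reaped pid %d, exit status %d\n",
                       (int)info.si_pid, info.si_status);
        return 0;
}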
index 51ad0b0b72664ad327c042c33272e90630ec932b..266c6af6ef1b089a1c64ee96428bd153db39217c 100644 (file)
@@ -434,6 +434,14 @@ __setup("coredump_filter=", coredump_filter_setup);
 
 #include <linux/init_task.h>
 
+static void mm_init_aio(struct mm_struct *mm)
+{
+#ifdef CONFIG_AIO
+       spin_lock_init(&mm->ioctx_lock);
+       INIT_HLIST_HEAD(&mm->ioctx_list);
+#endif
+}
+
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
        atomic_set(&mm->mm_users, 1);
@@ -447,10 +455,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        set_mm_counter(mm, file_rss, 0);
        set_mm_counter(mm, anon_rss, 0);
        spin_lock_init(&mm->page_table_lock);
-       spin_lock_init(&mm->ioctx_lock);
-       INIT_HLIST_HEAD(&mm->ioctx_list);
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
+       mm_init_aio(mm);
        mm_init_owner(mm, p);
 
        if (likely(!mm_alloc_pgd(mm))) {
@@ -511,6 +518,8 @@ void mmput(struct mm_struct *mm)
                        spin_unlock(&mmlist_lock);
                }
                put_swap_token(mm);
+               if (mm->binfmt)
+                       module_put(mm->binfmt->module);
                mmdrop(mm);
        }
 }
@@ -636,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;
 
+       if (mm->binfmt && !try_module_get(mm->binfmt->module))
+               goto free_pt;
+
        return mm;
 
 free_pt:
+       /* don't put binfmt in mmput, we don't hold a module reference yet */
+       mm->binfmt = NULL;
        mmput(mm);
 
 fail_nomem:
@@ -979,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);
 
+       /*
+        * Siblings of global init remain as zombies on exit since they are
+        * not reaped by their parent (swapper). To solve this and to avoid
+        * multi-rooted process trees, prevent global and container-inits
+        * from creating siblings.
+        */
+       if ((clone_flags & CLONE_PARENT) &&
+                               current->signal->flags & SIGNAL_UNKILLABLE)
+               return ERR_PTR(-EINVAL);
+
        retval = security_task_create(clone_flags);
        if (retval)
                goto fork_out;
@@ -1020,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (!try_module_get(task_thread_info(p)->exec_domain->module))
                goto bad_fork_cleanup_count;
 
-       if (p->binfmt && !try_module_get(p->binfmt->module))
-               goto bad_fork_cleanup_put_domain;
-
        p->did_exec = 0;
        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
        copy_flags(clone_flags, p);
@@ -1310,9 +1331,6 @@ bad_fork_cleanup_cgroup:
 #endif
        cgroup_exit(p, cgroup_callbacks_done);
        delayacct_tsk_free(p);
-       if (p->binfmt)
-               module_put(p->binfmt->module);
-bad_fork_cleanup_put_domain:
        module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
index 022a4927b78539f5e123558d13e6c012d0188ead..d4e84174740018f30bdf8aa02d62e8799676a938 100644 (file)
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
  * Process updating of timeout sysctl
  */
 int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-                                 struct file *filp, void __user *buffer,
+                                 void __user *buffer,
                                  size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                goto out;
index 5aa854f9e5ae0cae90d37d7594d265cdbab6fc01..2a5dfec8efe0504fc974a9500e934c51b78a5207 100644 (file)
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
  *       (hence either you are in the same cgroup as task, or in an
  *        ancestor cgroup thereof)
  */
-static int ns_can_attach(struct cgroup_subsys *ss,
-               struct cgroup *new_cgroup, struct task_struct *task)
+static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
+                        struct task_struct *task, bool threadgroup)
 {
        if (current != task) {
                if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
        if (!cgroup_is_descendant(new_cgroup, task))
                return -EPERM;
 
+       if (threadgroup) {
+               struct task_struct *c;
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+                       if (!cgroup_is_descendant(new_cgroup, c)) {
+                               rcu_read_unlock();
+                               return -EPERM;
+                       }
+               }
+               rcu_read_unlock();
+       }
+
        return 0;
 }
 
index 821722ae58a732760f62a00912554e3779fc2d04..86b3796b0436726e5b0208021cc990bfdc7044e2 100644 (file)
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
 {
        if (!(flags & CLONE_NEWPID))
                return get_pid_ns(old_ns);
-       if (flags & CLONE_THREAD)
+       if (flags & (CLONE_THREAD|CLONE_PARENT))
                return ERR_PTR(-EINVAL);
        return create_pid_namespace(old_ns);
 }
index 307c285af59e89141412181d547f0fca7d4c7073..23bd09cd042ea9e6987763bb4261a331de24e4a7 100644 (file)
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
  * or self-reaping.  Do notification now if it would have happened earlier.
  * If it should reap itself, return true.
  *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now; in that case we must also wake up sub-threads sleeping in
+ * do_wait().
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
                if (!task_detached(p) && thread_group_empty(p)) {
                        if (!same_thread_group(p->real_parent, tracer))
                                do_notify_parent(p, p->exit_signal);
-                       else if (ignoring_children(tracer->sighand))
+                       else if (ignoring_children(tracer->sighand)) {
+                               __wake_up_parent(p, tracer);
                                p->exit_signal = -1;
+                       }
                }
                if (task_detached(p)) {
                        /* Mark it as in the process of being reaped. */
index e1338f074314d7e85fca035ac4ab55f20704b8cd..88faec23e83301e5b64490318aba2ac722dfb00c 100644 (file)
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
        spin_lock_init(&counter->lock);
        counter->limit = RESOURCE_MAX;
+       counter->soft_limit = RESOURCE_MAX;
        counter->parent = parent;
 }
 
@@ -36,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-                       struct res_counter **limit_fail_at)
+                       struct res_counter **limit_fail_at,
+                       struct res_counter **soft_limit_fail_at)
 {
        int ret;
        unsigned long flags;
        struct res_counter *c, *u;
 
        *limit_fail_at = NULL;
+       if (soft_limit_fail_at)
+               *soft_limit_fail_at = NULL;
        local_irq_save(flags);
        for (c = counter; c != NULL; c = c->parent) {
                spin_lock(&c->lock);
                ret = res_counter_charge_locked(c, val);
+               /*
+                * With soft limits, we return the highest ancestor
+                * that exceeds its soft limit
+                */
+               if (soft_limit_fail_at &&
+                       !res_counter_soft_limit_check_locked(c))
+                       *soft_limit_fail_at = c;
                spin_unlock(&c->lock);
                if (ret < 0) {
                        *limit_fail_at = c;
@@ -74,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
        counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+                               bool *was_soft_limit_excess)
 {
        unsigned long flags;
        struct res_counter *c;
@@ -82,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
        local_irq_save(flags);
        for (c = counter; c != NULL; c = c->parent) {
                spin_lock(&c->lock);
+               if (was_soft_limit_excess)
+                       *was_soft_limit_excess =
+                               !res_counter_soft_limit_check_locked(c);
                res_counter_uncharge_locked(c, val);
                spin_unlock(&c->lock);
        }
@@ -101,6 +116,8 @@ res_counter_member(struct res_counter *counter, int member)
                return &counter->limit;
        case RES_FAILCNT:
                return &counter->failcnt;
+       case RES_SOFT_LIMIT:
+               return &counter->soft_limit;
        };
 
        BUG();
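
RES_SOFT_LIMIT is what the memory controller exposes as its per-group soft limit knob (assumed here to be the memory.soft_limit_in_bytes file of an already-mounted memcg hierarchy; the directory is supplied by the caller):

#include <stdio.h>

int main(int argc, char **argv)
{
        char path[4096];
        FILE *f;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <memcg directory> <bytes>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "%s/memory.soft_limit_in_bytes", argv[1]);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "%s\n", argv[2]);
        if (fclose(f)) {
                perror(path);
                return 1;
        }
        return 0;
}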
index 2f76e06bea583d28e95d0466f35eb41d569b4986..ee61f454a98b9005ed7055b6f3e4bba5f2036ebe 100644 (file)
@@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret;
@@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
        old_period = sysctl_sched_rt_period;
        old_runtime = sysctl_sched_rt_runtime;
 
-       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
        if (!ret && write) {
                ret = sched_rt_global_constraints();
@@ -10377,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 }
 
 static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                     struct task_struct *tsk)
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
        if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10388,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
        if (tsk->sched_class != &fair_sched_class)
                return -EINVAL;
 #endif
+       return 0;
+}
 
+static int
+cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+                     struct task_struct *tsk, bool threadgroup)
+{
+       int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
+       if (retval)
+               return retval;
+       if (threadgroup) {
+               struct task_struct *c;
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+                       retval = cpu_cgroup_can_attach_task(cgrp, c);
+                       if (retval) {
+                               rcu_read_unlock();
+                               return retval;
+                       }
+               }
+               rcu_read_unlock();
+       }
        return 0;
 }
 
 static void
 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                       struct cgroup *old_cont, struct task_struct *tsk)
+                 struct cgroup *old_cont, struct task_struct *tsk,
+                 bool threadgroup)
 {
        sched_move_task(tsk);
+       if (threadgroup) {
+               struct task_struct *c;
+               rcu_read_lock();
+               list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+                       sched_move_task(c);
+               }
+               rcu_read_unlock();
+       }
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
index ecc637a0d591b74ced8c17eea315631296b0b205..4e777b47eedac1f5f779c002091e2dc8b696abe1 100644 (file)
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 
 #ifdef CONFIG_SCHED_DEBUG
 int sched_nr_latency_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
index 64c5deeaca5d9a70f64f391bc73fde46e300bff1..6705320784fd2b07a518c4d2bb47520d0b342aa8 100644 (file)
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 
                if (why) {
                        /*
-                        * The first thread which returns from finish_stop()
+                        * The first thread which returns from do_signal_stop()
                         * will take ->siglock, notice SIGNAL_CLD_MASK, and
                         * notify its parent. See get_signal_to_deliver().
                         */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
        return send_signal(sig, info, t, 0);
 }
 
+int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+                       bool group)
+{
+       unsigned long flags;
+       int ret = -ESRCH;
+
+       if (lock_task_sighand(p, &flags)) {
+               ret = send_signal(sig, info, p, group);
+               unlock_task_sighand(p, &flags);
+       }
+
+       return ret;
+}
+
 /*
  * Force a signal that the process can't ignore: if necessary
  * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
        }
 }
 
-int __fatal_signal_pending(struct task_struct *tsk)
-{
-       return sigismember(&tsk->pending.signal, SIGKILL);
-}
-EXPORT_SYMBOL(__fatal_signal_pending);
-
 struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
 {
        struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-       unsigned long flags;
-       int ret;
+       int ret = check_kill_permission(sig, info, p);
 
-       ret = check_kill_permission(sig, info, p);
-
-       if (!ret && sig) {
-               ret = -ESRCH;
-               if (lock_task_sighand(p, &flags)) {
-                       ret = __group_send_sig_info(sig, info, p);
-                       unlock_task_sighand(p, &flags);
-               }
-       }
+       if (!ret && sig)
+               ret = do_send_sig_info(sig, info, p, true);
 
        return ret;
 }
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
  * These are for backward compatibility with the rest of the kernel source.
  */
 
-/*
- * The caller must ensure the task can't exit.
- */
 int
 send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
-       int ret;
-       unsigned long flags;
-
        /*
         * Make sure legacy kernel users don't send in bad values
         * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
        if (!valid_signal(sig))
                return -EINVAL;
 
-       spin_lock_irqsave(&p->sighand->siglock, flags);
-       ret = specific_send_sig_info(sig, info, p);
-       spin_unlock_irqrestore(&p->sighand->siglock, flags);
-       return ret;
+       return do_send_sig_info(sig, info, p, false);
 }
 
 #define __si_special(priv) \
@@ -1382,15 +1373,6 @@ ret:
        return ret;
 }
 
-/*
- * Wake up any threads in the parent blocked in wait* syscalls.
- */
-static inline void __wake_up_parent(struct task_struct *p,
-                                   struct task_struct *parent)
-{
-       wake_up_interruptible_sync(&parent->signal->wait_chldexit);
-}
-
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
        spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void
-finish_stop(int stop_count)
-{
-       /*
-        * If there are no other threads in the group, or if there is
-        * a group stop in progress and we are the last to stop,
-        * report to the parent.  When ptraced, every thread reports itself.
-        */
-       if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
-               read_lock(&tasklist_lock);
-               do_notify_parent_cldstop(current, CLD_STOPPED);
-               read_unlock(&tasklist_lock);
-       }
-
-       do {
-               schedule();
-       } while (try_to_freeze());
-       /*
-        * Now we don't run again until continued.
-        */
-       current->exit_code = 0;
-}
-
 /*
  * This performs the stopping for SIGSTOP and other stop signals.
  * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
 static int do_signal_stop(int signr)
 {
        struct signal_struct *sig = current->signal;
-       int stop_count;
+       int notify;
 
-       if (sig->group_stop_count > 0) {
-               /*
-                * There is a group stop in progress.  We don't need to
-                * start another one.
-                */
-               stop_count = --sig->group_stop_count;
-       } else {
+       if (!sig->group_stop_count) {
                struct task_struct *t;
 
                if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
                 */
                sig->group_exit_code = signr;
 
-               stop_count = 0;
+               sig->group_stop_count = 1;
                for (t = next_thread(current); t != current; t = next_thread(t))
                        /*
                         * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
                         */
                        if (!(t->flags & PF_EXITING) &&
                            !task_is_stopped_or_traced(t)) {
-                               stop_count++;
+                               sig->group_stop_count++;
                                signal_wake_up(t, 0);
                        }
-               sig->group_stop_count = stop_count;
        }
+       /*
+        * If there are no other threads in the group, or if there is
+        * a group stop in progress and we are the last to stop, report
+        * to the parent.  When ptraced, every thread reports itself.
+        */
+       notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
+       notify = tracehook_notify_jctl(notify, CLD_STOPPED);
+       /*
+        * tracehook_notify_jctl() can drop and reacquire siglock, so
+        * we keep ->group_stop_count != 0 before the call. If SIGCONT
+        * or SIGKILL comes in between, ->group_stop_count may already
+        * be 0 when we re-check it below.
+        */
+       if (sig->group_stop_count) {
+               if (!--sig->group_stop_count)
+                       sig->flags = SIGNAL_STOP_STOPPED;
+               current->exit_code = sig->group_exit_code;
+               __set_current_state(TASK_STOPPED);
+       }
+       spin_unlock_irq(&current->sighand->siglock);
 
-       if (stop_count == 0)
-               sig->flags = SIGNAL_STOP_STOPPED;
-       current->exit_code = sig->group_exit_code;
-       __set_current_state(TASK_STOPPED);
+       if (notify) {
+               read_lock(&tasklist_lock);
+               do_notify_parent_cldstop(current, notify);
+               read_unlock(&tasklist_lock);
+       }
+
+       /* Now we don't run again until woken by SIGCONT or SIGKILL */
+       do {
+               schedule();
+       } while (try_to_freeze());
+
+       tracehook_finish_jctl();
+       current->exit_code = 0;
 
-       spin_unlock_irq(&current->sighand->siglock);
-       finish_stop(stop_count);
        return 1;
 }
 
@@ -1815,14 +1793,15 @@ relock:
                int why = (signal->flags & SIGNAL_STOP_CONTINUED)
                                ? CLD_CONTINUED : CLD_STOPPED;
                signal->flags &= ~SIGNAL_CLD_MASK;
-               spin_unlock_irq(&sighand->siglock);
 
-               if (unlikely(!tracehook_notify_jctl(1, why)))
-                       goto relock;
+               why = tracehook_notify_jctl(why, CLD_CONTINUED);
+               spin_unlock_irq(&sighand->siglock);
 
-               read_lock(&tasklist_lock);
-               do_notify_parent_cldstop(current->group_leader, why);
-               read_unlock(&tasklist_lock);
+               if (why) {
+                       read_lock(&tasklist_lock);
+                       do_notify_parent_cldstop(current->group_leader, why);
+                       read_unlock(&tasklist_lock);
+               }
                goto relock;
        }
 
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
        if (unlikely(tsk->signal->group_stop_count) &&
                        !--tsk->signal->group_stop_count) {
                tsk->signal->flags = SIGNAL_STOP_STOPPED;
-               group_stop = 1;
+               group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
        }
 out:
        spin_unlock_irq(&tsk->sighand->siglock);
 
-       if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
+       if (unlikely(group_stop)) {
                read_lock(&tasklist_lock);
-               do_notify_parent_cldstop(tsk, CLD_STOPPED);
+               do_notify_parent_cldstop(tsk, group_stop);
                read_unlock(&tasklist_lock);
        }
 }
@@ -2290,7 +2269,6 @@ static int
 do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
        struct task_struct *p;
-       unsigned long flags;
        int error = -ESRCH;
 
        rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
                /*
                 * The null signal is a permissions and process existence
                 * probe.  No signal is actually delivered.
-                *
-                * If lock_task_sighand() fails we pretend the task dies
-                * after receiving the signal. The window is tiny, and the
-                * signal is private anyway.
                 */
-               if (!error && sig && lock_task_sighand(p, &flags)) {
-                       error = specific_send_sig_info(sig, info, p);
-                       unlock_task_sighand(p, &flags);
+               if (!error && sig) {
+                       error = do_send_sig_info(sig, info, p, false);
+                       /*
+                        * If lock_task_sighand() failed we pretend the task
+                        * dies after receiving the signal. The window is tiny,
+                        * and the signal is private anyway.
+                        */
+                       if (unlikely(error == -ESRCH))
+                               error = 0;
                }
        }
        rcu_read_unlock();
index 09d7519557d35181fab35591a17516efe5aba723..0d31135efbf4cab0b98babcaa05355eaf5130dfa 100644 (file)
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
 #ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+static int slow_work_min_threads_sysctl(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 
-static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
                                        void __user *, size_t *, loff_t *);
 #endif
 
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
  * Handle adjustment of the minimum number of threads
  */
 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
-                                       struct file *filp, void __user *buffer,
+                                       void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
 {
-       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        int n;
 
        if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
  * Handle adjustment of the maximum number of threads
  */
 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
-                                       struct file *filp, void __user *buffer,
+                                       void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
 {
-       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        int n;
 
        if (ret == 0) {
index 88796c330838dd66c24b8628f8b56ce27d3a3d23..81324d12eb35a5db7fae8a0ae3d18c76ca38ce67 100644 (file)
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
 
 int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
-                            struct file *filp, void __user *buffer,
+                            void __user *buffer,
                             size_t *lenp, loff_t *ppos)
 {
        touch_all_softlockup_watchdogs();
-       return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
 
 /*
index ebcb15611728c510ef59312e3a0cd398757e0b8f..255475d163e0cdb62602306a28134d67005e87c0 100644 (file)
@@ -1542,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                current->timer_slack_ns = arg2;
                        error = 0;
                        break;
+               case PR_MCE_KILL:
+                       if (arg4 | arg5)
+                               return -EINVAL;
+                       switch (arg2) {
+                       case 0:
+                               if (arg3 != 0)
+                                       return -EINVAL;
+                               current->flags &= ~PF_MCE_PROCESS;
+                               break;
+                       case 1:
+                               current->flags |= PF_MCE_PROCESS;
+                               if (arg3 != 0)
+                                       current->flags |= PF_MCE_EARLY;
+                               else
+                                       current->flags &= ~PF_MCE_EARLY;
+                               break;
+                       default:
+                               return -EINVAL;
+                       }
+                       error = 0;
+                       break;
+
                default:
                        error = -EINVAL;
                        break;
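
From userspace this surfaces as a new prctl(); arg2/arg3 follow the cases above (0 clears the per-process policy, 1 sets it, with arg3 selecting the "early kill" variant). The PR_MCE_KILL fallback value below is an assumption for headers that predate the interface:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33          /* assumed; take the real value from <linux/prctl.h> */
#endif

int main(void)
{
        /* opt this process in to early memory-failure killing */
        if (prctl(PR_MCE_KILL, 1, 1, 0, 0))
                perror("prctl(PR_MCE_KILL, 1, 1)");

        /* drop back to the system-wide default policy */
        if (prctl(PR_MCE_KILL, 0, 0, 0, 0))
                perror("prctl(PR_MCE_KILL, 0, 0)");
        return 0;
}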
index 7f4f57bea4ce5ceb77a677f6415f1f0cfe13f16c..0d949c517412ee16822a5ca7d6e7c79218543741 100644 (file)
@@ -76,6 +76,7 @@ extern int max_threads;
 extern int core_uses_pid;
 extern int suid_dumpable;
 extern char core_pattern[];
+extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
@@ -162,9 +163,9 @@ extern int max_lock_depth;
 #endif
 
 #ifdef CONFIG_PROC_SYSCTL
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 
@@ -423,6 +424,14 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "core_pipe_limit",
+               .data           = &core_pipe_limit,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
 #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
@@ -1389,6 +1398,31 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = &scan_unevictable_handler,
        },
+#ifdef CONFIG_MEMORY_FAILURE
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_early_kill",
+               .data           = &sysctl_memory_failure_early_kill,
+               .maxlen         = sizeof(sysctl_memory_failure_early_kill),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_recovery",
+               .data           = &sysctl_memory_failure_recovery,
+               .maxlen         = sizeof(sysctl_memory_failure_recovery),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
+
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
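
The new knobs land under /proc/sys as usual: core_pipe_limit in kernel/, and the two memory_failure_* entries in vm/ only when CONFIG_MEMORY_FAILURE is set. A trivial reader:

#include <stdio.h>

static void show(const char *path)
{
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);           /* e.g. CONFIG_MEMORY_FAILURE=n */
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/core_pipe_limit");
        show("/proc/sys/vm/memory_failure_early_kill");
        show("/proc/sys/vm/memory_failure_recovery");
        return 0;
}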
@@ -2217,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
 #ifdef CONFIG_PROC_SYSCTL
 
 static int _proc_do_string(void* data, int maxlen, int write,
-                          struct file *filp, void __user *buffer,
+                          void __user *buffer,
                           size_t *lenp, loff_t *ppos)
 {
        size_t len;
@@ -2278,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
  * proc_dostring - read a string sysctl
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2292,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
  *
  * Returns 0 on success.
  */
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       return _proc_do_string(table->data, table->maxlen, write, filp,
+       return _proc_do_string(table->data, table->maxlen, write,
                               buffer, lenp, ppos);
 }
 
@@ -2320,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
 }
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-                 int write, struct file *filp, void __user *buffer,
+                 int write, void __user *buffer,
                  size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
@@ -2427,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 #undef TMPBUFLEN
 }
 
-static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+static int do_proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
                  void *data)
 {
-       return __do_proc_dointvec(table->data, table, write, filp,
+       return __do_proc_dointvec(table->data, table, write,
                        buffer, lenp, ppos, conv, data);
 }
 
@@ -2441,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  * proc_dointvec - read a vector of integers
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2451,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
  *
  * Returns 0 on success.
  */
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            NULL,NULL);
 }
 
@@ -2462,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
  * Taint values can only be increased
  * This means we can safely use a temporary.
  */
-static int proc_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table t;
@@ -2474,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 
        t = *table;
        t.data = &tmptaint;
-       err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+       err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
        if (err < 0)
                return err;
 
@@ -2526,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_minmax - read a vector of integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2539,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct do_proc_dointvec_minmax_conv_param param = {
                .min = (int *) table->extra1,
                .max = (int *) table->extra2,
        };
-       return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+       return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_minmax_conv, &param);
 }
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-                                    struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
@@ -2656,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
-                                    struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
                                     unsigned long convdiv)
 {
        return __do_proc_doulongvec_minmax(table->data, table, write,
-                       filp, buffer, lenp, ppos, convmul, convdiv);
+                       buffer, lenp, ppos, convmul, convdiv);
 }
 
 /**
  * proc_doulongvec_minmax - read a vector of long integers with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2683,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
  *
  * Returns 0 on success.
  */
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
+    return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
 }
 
 /**
  * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2708,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
  * Returns 0 on success.
  */
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-                                     struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
 {
-    return do_proc_doulongvec_minmax(table, write, filp, buffer,
+    return do_proc_doulongvec_minmax(table, write, buffer,
                                     lenp, ppos, HZ, 1000l);
 }
 
@@ -2788,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  * proc_dointvec_jiffies - read a vector of integers as seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2800,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_jiffies_conv,NULL);
 }
 
@@ -2811,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: pointer to the file position
@@ -2823,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
  *
  * Returns 0 on success.
  */
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_userhz_jiffies_conv,NULL);
 }
 
@@ -2834,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
  * @table: the sysctl table
  * @write: %TRUE if this is a write to the sysctl file
- * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
  * @ppos: file position
@@ -2847,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
  *
  * Returns 0 on success.
  */
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+       return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_ms_jiffies_conv, NULL);
 }
 
-static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
+static int proc_do_cad_pid(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct pid *new_pid;
@@ -2863,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
        tmp = pid_vnr(cad_pid);
 
-       r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
+       r = __do_proc_dointvec(&tmp, table, write, buffer,
                               lenp, ppos, NULL, NULL);
        if (r || !write)
                return r;
@@ -2878,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 
 #else /* CONFIG_PROC_FS */
 
-int proc_dostring(struct ctl_table *table, int write, struct file *filp,
+int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
-int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        return -ENOSYS;
 }
 
 int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
-                                     struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
 {
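
For illustration only, a minimal sketch of a handler written against the new filp-less prototype; the handler name, the backing variable and the table entry are hypothetical and only show the changed calling convention:

#include <linux/sysctl.h>
#include <linux/kernel.h>

static int example_value;               /* hypothetical tunable backing store */

/* Same shape as proc_dointvec() after this series: no struct file * argument */
static int example_handler(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (!ret && write)
                pr_info("example_value is now %d\n", example_value);
        return ret;
}

/* Entry that would be hooked into a sysctl table such as kern_table[] */
static struct ctl_table example_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_value",
                .data           = &example_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &example_handler,
        },
        {}
};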
index 0b0a6366c9d482b8cabbbdc3e290087e2622c0bf..ee266620b06ca336a9246489ff9d01d1dd7cd91d 100644 (file)
@@ -1,4 +1,4 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)                += clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)              += tick-common.o
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644 (file)
index 0000000..86628e7
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Paul Eggert (eggert@twinsun.com).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the GNU C Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Converts the calendar time to broken-down time representation
+ * Based on code from glibc-2.6
+ *
+ * 2009-7-14:
+ *   Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+
+/*
+ * Nonzero if YEAR is a leap year (every 4 years,
+ * except every 100th isn't, and every 400th is).
+ */
+static int __isleap(long year)
+{
+       return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
+}
+
+/* do a mathdiv for long type */
+static long math_div(long a, long b)
+{
+       return a / b - (a % b < 0);
+}
+
+/* How many leap years between y1 and y2; y1 must be less than or equal to y2 */
+static long leaps_between(long y1, long y2)
+{
+       long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
+               + math_div(y1 - 1, 400);
+       long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
+               + math_div(y2 - 1, 400);
+       return leaps2 - leaps1;
+}
+
+/* How many days come before each month (0-12). */
+static const unsigned short __mon_yday[2][13] = {
+       /* Normal years. */
+       {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+       /* Leap years. */
+       {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define SECS_PER_HOUR  (60 * 60)
+#define SECS_PER_DAY   (SECS_PER_HOUR * 24)
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs:  the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ *              Coordinated Universal Time (UTC).
+ * @offset:     offset seconds to add to totalsecs.
+ * @result:     pointer to struct tm variable to receive broken-down time
+ */
+void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+       long days, rem, y;
+       const unsigned short *ip;
+
+       days = totalsecs / SECS_PER_DAY;
+       rem = totalsecs % SECS_PER_DAY;
+       rem += offset;
+       while (rem < 0) {
+               rem += SECS_PER_DAY;
+               --days;
+       }
+       while (rem >= SECS_PER_DAY) {
+               rem -= SECS_PER_DAY;
+               ++days;
+       }
+
+       result->tm_hour = rem / SECS_PER_HOUR;
+       rem %= SECS_PER_HOUR;
+       result->tm_min = rem / 60;
+       result->tm_sec = rem % 60;
+
+       /* January 1, 1970 was a Thursday. */
+       result->tm_wday = (4 + days) % 7;
+       if (result->tm_wday < 0)
+               result->tm_wday += 7;
+
+       y = 1970;
+
+       while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
+               /* Guess a corrected year, assuming 365 days per year. */
+               long yg = y + math_div(days, 365);
+
+               /* Adjust DAYS and Y to match the guessed year. */
+               days -= (yg - y) * 365 + leaps_between(y, yg);
+               y = yg;
+       }
+
+       result->tm_year = y - 1900;
+
+       result->tm_yday = days;
+
+       ip = __mon_yday[__isleap(y)];
+       for (y = 11; days < ip[y]; y--)
+               continue;
+       days -= ip[y];
+
+       result->tm_mon = y;
+       result->tm_mday = days + 1;
+}
+EXPORT_SYMBOL(time_to_tm);
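
A minimal in-kernel usage sketch of the new helper, assuming the struct tm definition that accompanies it in <linux/time.h>; the zero offset (UTC) and the log message are arbitrary:

#include <linux/time.h>
#include <linux/kernel.h>

static void example_log_utc_time(void)
{
        struct tm tm;

        /* Break the current epoch time down with a 0-second offset (UTC) */
        time_to_tm(get_seconds(), 0, &tm);

        pr_info("now: %04ld-%02d-%02d %02d:%02d:%02d UTC\n",
                (long)(tm.tm_year + 1900), tm.tm_mon + 1, tm.tm_mday,
                tm.tm_hour, tm.tm_min, tm.tm_sec);
}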
index 23df7771c937acfab46e4f695950a69f35fbe44b..a142579765bf6635ef8f426ac2e3f99b95daaaf0 100644 (file)
@@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-                    struct file *file, void __user *buffer, size_t *lenp,
+                    void __user *buffer, size_t *lenp,
                     loff_t *ppos)
 {
        int ret;
@@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
        mutex_lock(&ftrace_lock);
 
-       ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+       ret  = proc_dointvec(table, write, buffer, lenp, ppos);
 
        if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
                goto out;
index 0f6facb050a11631d182cc1fd85b236fbf8befa8..8504ac71e4e8f9831edd7cc0d3186f98fdcad6c1 100644 (file)
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
 
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
-                  struct file *file, void __user *buffer, size_t *lenp,
+                  void __user *buffer, size_t *lenp,
                   loff_t *ppos)
 {
        int ret;
 
        mutex_lock(&stack_sysctl_mutex);
 
-       ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
        if (ret || !write ||
            (last_stack_tracer_enabled == !!stack_tracer_enabled))
index 92359cc747a7c52cb85e362ab3b9178e4756ee2d..69eae358a726d8a8848d7f3e141c22762493c7b4 100644 (file)
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
  *     Special case of dostring for the UTS structure. This has locks
  *     to observe. Should this be in kernel/sys.c ????
  */
-static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
+static int proc_do_uts_string(ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table uts_table;
        int r;
        memcpy(&uts_table, table, sizeof(uts_table));
        uts_table.data = get_uts(table, write);
-       r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
+       r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
        put_uts(table, write, uts_table.data);
        return r;
 }
index 68dfce59c1b80be510544c0effc3f87669527b8f..fc686c7a0a0da688dd9dee52156dc7f6c0c380ab 100644 (file)
 
 #define GZIP_IOBUF_SIZE (16*1024)
 
+static int nofill(void *buffer, unsigned int len)
+{
+       return -1;
+}
+
 /* Included from initramfs et al code */
 STATIC int INIT gunzip(unsigned char *buf, int len,
                       int(*fill)(void*, unsigned int),
@@ -76,6 +81,9 @@ STATIC int INIT gunzip(unsigned char *buf, int len,
                goto gunzip_nomem4;
        }
 
+       if (!fill)
+               fill = nofill;
+
        if (len == 0)
                len = fill(zbuf, GZIP_IOBUF_SIZE);
 
index 0b954e04bd3015bb08332b2a30a13e1ff6414fea..ca82fde81c8fc48a39a22495fbe9d489142d04aa 100644 (file)
@@ -82,6 +82,11 @@ struct rc {
 #define RC_MODEL_TOTAL_BITS 11
 
 
+static int nofill(void *buffer, unsigned int len)
+{
+       return -1;
+}
+
 /* Called twice: once at startup and once in rc_normalize() */
 static void INIT rc_read(struct rc *rc)
 {
@@ -97,7 +102,10 @@ static inline void INIT rc_init(struct rc *rc,
                                       int (*fill)(void*, unsigned int),
                                       char *buffer, int buffer_size)
 {
-       rc->fill = fill;
+       if (fill)
+               rc->fill = fill;
+       else
+               rc->fill = nofill;
        rc->buffer = (uint8_t *)buffer;
        rc->buffer_size = buffer_size;
        rc->buffer_end = rc->buffer + rc->buffer_size;
index 71eb0b4cce8dbc425aa476d54f2048e5679c7dea..247760729593d37f841655dd54ec4572523255f7 100644 (file)
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR
          /proc/sys/vm/mmap_min_addr tunable.
 
 
+config MEMORY_FAILURE
+       depends on MMU
+       depends on X86_MCE
+       bool "Enable recovery from hardware memory errors"
+       help
+         Enables code to recover from some memory failures on systems
+         with MCA recovery. This allows a system to continue running
+         even when some of its memory has uncorrected errors. This requires
+         special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+       tristate "Poison pages injector"
+       depends on MEMORY_FAILURE && DEBUG_KERNEL
+
 config NOMMU_INITIAL_TRIM_EXCESS
        int "Turn on mmap() excess space trimming before booting"
        depends on !MMU
index 88193d73cd1a30dd623e94eb9b5bed0c96cf08e0..515fd793c17fa989cffe0f3a686c8086e2f7ddca 100644 (file)
@@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
index bcc7372aebbc4375d0763e4f3acd8d096bcb612d..6c84e598b4a9f7a0c2901387f32307c4e96ebaae 100644 (file)
@@ -58,7 +58,7 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock              (vmtruncate)
+ *  ->i_mmap_lock              (truncate_pagecache)
  *    ->private_lock           (__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock            (exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->task->proc_lock
  *    ->dcache_lock            (proc_pid_lookup)
+ *
+ *  (code doesn't rely on that order, so you could switch it around)
+ *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+ *    ->i_mmap_lock
  */
 
 /*
index 815dbd4a6dcb919f28d4331aceae088b8e5f62d7..6f048fcc749ca48b3bae0a1a37a792f15e0c203d 100644 (file)
@@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-                          struct file *file, void __user *buffer,
+                          void __user *buffer,
                           size_t *length, loff_t *ppos)
 {
        struct hstate *h = &default_hstate;
@@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+       proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
        if (write)
                h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
-                       struct file *file, void __user *buffer,
+                       void __user *buffer,
                        size_t *length, loff_t *ppos)
 {
-       proc_dointvec(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, buffer, length, ppos);
        if (hugepages_treat_as_movable)
                htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
        else
@@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 }
 
 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
-                       struct file *file, void __user *buffer,
+                       void __user *buffer,
                        size_t *length, loff_t *ppos)
 {
        struct hstate *h = &default_hstate;
@@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+       proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
        if (write) {
                spin_lock(&hugetlb_lock);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644 (file)
index 0000000..e1d8513
--- /dev/null
@@ -0,0 +1,41 @@
+/* Inject a hwpoison memory failure on an arbitrary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+static struct dentry *hwpoison_dir, *corrupt_pfn;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
+       return __memory_failure(val, 18, 0);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+       if (hwpoison_dir)
+               debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+       hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+       if (hwpoison_dir == NULL)
+               return -ENOMEM;
+       corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+                                         NULL, &hwpoison_fops);
+       if (corrupt_pfn == NULL) {
+               pfn_inject_exit();
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
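
A hypothetical user-space sketch of driving the injector once the module is loaded and debugfs is mounted at /sys/kernel/debug; the pfn comes from the command line and is genuinely poisoned, so this is only for test machines:

#include <stdio.h>

int main(int argc, char **argv)
{
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
                return 1;
        }
        f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");
        if (!f) {
                perror("corrupt-pfn");
                return 1;
        }
        /* The simple attribute parses a number written as text */
        fprintf(f, "%s\n", argv[1]);
        return fclose(f) ? 1 : 0;
}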
index 37cc37325094b20d3c6153bdc27314d3c81398c9..f7edac356f465275031110db70c1e57aafbc5cda 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/mmu_notifier.h>
+#include <linux/swap.h>
 #include <linux/ksm.h>
 
 #include <asm/tlbflush.h>
@@ -162,10 +163,10 @@ static unsigned long ksm_pages_unshared;
 static unsigned long ksm_rmap_items;
 
 /* Limit on the number of unswappable pages used */
-static unsigned long ksm_max_kernel_pages = 2000;
+static unsigned long ksm_max_kernel_pages;
 
 /* Number of pages ksmd should scan in one batch */
-static unsigned int ksm_thread_pages_to_scan = 200;
+static unsigned int ksm_thread_pages_to_scan = 100;
 
 /* Milliseconds ksmd should sleep between batches */
 static unsigned int ksm_thread_sleep_millisecs = 20;
@@ -173,7 +174,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20;
 #define KSM_RUN_STOP   0
 #define KSM_RUN_MERGE  1
 #define KSM_RUN_UNMERGE        2
-static unsigned int ksm_run = KSM_RUN_MERGE;
+static unsigned int ksm_run = KSM_RUN_STOP;
 
 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
@@ -183,6 +184,11 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
                sizeof(struct __struct), __alignof__(struct __struct),\
                (__flags), NULL)
 
+static void __init ksm_init_max_kernel_pages(void)
+{
+       ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
+}
+
 static int __init ksm_slab_init(void)
 {
        rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1667,6 +1673,8 @@ static int __init ksm_init(void)
        struct task_struct *ksm_thread;
        int err;
 
+       ksm_init_max_kernel_pages();
+
        err = ksm_slab_init();
        if (err)
                goto out;
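
Since merging now defaults to off (KSM_RUN_STOP) and the scan batch shrinks to 100 pages, a user-space sketch of re-enabling it, assuming the KSM sysfs knobs live under /sys/kernel/mm/ksm/:

#include <stdio.h>

/* Write a value to one of the assumed /sys/kernel/mm/ksm/ knobs */
static int write_ksm_knob(const char *name, const char *val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        write_ksm_knob("pages_to_scan", "100");
        return write_ksm_knob("run", "1");      /* KSM_RUN_MERGE */
}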
index d9ae2067952e5d2b5d09b260bc443d1ba0899d6a..35b1479b7c9d080ed97b772da44603c4f757093c 100644 (file)
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma,
        return error;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Error injection support for memory error handling.
+ */
+static int madvise_hwpoison(unsigned long start, unsigned long end)
+{
+       int ret = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       for (; start < end; start += PAGE_SIZE) {
+               struct page *p;
+               int ret = get_user_pages(current, current->mm, start, 1,
+                                               0, 0, &p, NULL);
+               if (ret != 1)
+                       return ret;
+               printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+                      page_to_pfn(p), start);
+               /* Ignore return value for now */
+               __memory_failure(page_to_pfn(p), 0, 1);
+               put_page(p);
+       }
+       return ret;
+}
+#endif
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        int write;
        size_t len;
 
+#ifdef CONFIG_MEMORY_FAILURE
+       if (behavior == MADV_HWPOISON)
+               return madvise_hwpoison(start, start+len_in);
+#endif
        if (!madvise_behavior_valid(behavior))
                return error;
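
A hypothetical user-space sketch of exercising MADV_HWPOISON on one anonymous page; the fallback #define assumes the value added to asm-generic/mman-common.h by this series, and the call requires CAP_SYS_ADMIN:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100       /* assumed value from asm-generic/mman-common.h */
#endif

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0, page);     /* fault the page in before poisoning it */

        if (madvise(p, page, MADV_HWPOISON) != 0) {
                perror("madvise(MADV_HWPOISON)");
                return 1;
        }
        printf("page at %p marked hardware-poisoned\n", p);
        return 0;
}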
 
index 9b10d8753784c5e9fbbe33239a277a514fe99c67..e2b98a6875c079b36f103b724130462591094735 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -43,6 +44,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES     5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);   /* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
+       MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+       MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 
        MEM_CGROUP_STAT_NSTATS,
 };
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
        struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+                               enum mem_cgroup_stat_index idx)
+{
+       stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+                               enum mem_cgroup_stat_index idx)
+{
+       return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
        unsigned long           count[NR_LRU_LISTS];
 
        struct zone_reclaim_stat reclaim_stat;
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long long      usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded*/
+       bool                    on_tree;
+       struct mem_cgroup       *mem;           /* Back pointer, we cannot */
+                                               /* use container_of        */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)      ((mz)->count[(idx)])
@@ -129,6 +154,26 @@ struct mem_cgroup_lru_info {
        struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+       struct rb_root rb_root;
+       spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+       struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -186,6 +231,13 @@ struct mem_cgroup {
        struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define        MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
+#define        MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
+
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
 #define PCGF_CACHE     (1UL << PCG_CACHE)
 #define PCGF_USED      (1UL << PCG_USED)
 #define PCGF_LOCK      (1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-       PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-       PCGF_USED | PCGF_LOCK, /* Anon */
-       PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-       0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT      (1UL << PCG_ACCT)
 
 /* for encoding cft->private value on file */
 #define _MEM                   (0)
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 #define MEMFILE_TYPE(val)      (((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT  0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP      (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
+#define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT    0x2
+#define MEM_CGROUP_RECLAIM_SOFT                (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+       return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+       struct mem_cgroup *mem = pc->mem_cgroup;
+       int nid = page_cgroup_nid(pc);
+       int zid = page_cgroup_zid(pc);
+
+       if (!mem)
+               return NULL;
+
+       return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node **p = &mctz->rb_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct mem_cgroup_per_zone *mz_node;
+
+       if (mz->on_tree)
+               return;
+
+       mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+       while (*p) {
+               parent = *p;
+               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                       tree_node);
+               if (mz->usage_in_excess < mz_node->usage_in_excess)
+                       p = &(*p)->rb_left;
+               /*
+                * We can't avoid mem cgroups that are over their soft
+                * limit by the same amount
+                */
+               else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&mz->tree_node, parent, p);
+       rb_insert_color(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       if (!mz->on_tree)
+               return;
+       rb_erase(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       __mem_cgroup_insert_exceeded(mem, mz, mctz);
+       spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       __mem_cgroup_remove_exceeded(mem, mz, mctz);
+       spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+       bool ret = false;
+       int cpu;
+       s64 val;
+       struct mem_cgroup_stat_cpu *cpustat;
+
+       cpu = get_cpu();
+       cpustat = &mem->stat.cpustat[cpu];
+       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+       if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+               __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+               ret = true;
+       }
+       put_cpu();
+       return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+       unsigned long long prev_usage_in_excess, new_usage_in_excess;
+       bool updated_tree = false;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+       mctz = soft_limit_tree_from_page(page);
+
+       /*
+        * We do updates in lazy mode: mem cgroups are removed
+        * lazily from the per-zone, per-node rb tree
+        */
+       prev_usage_in_excess = mz->usage_in_excess;
+
+       new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+       if (prev_usage_in_excess) {
+               mem_cgroup_remove_exceeded(mem, mz, mctz);
+               updated_tree = true;
+       }
+       if (!new_usage_in_excess)
+               goto done;
+       mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+       if (updated_tree) {
+               spin_lock(&mctz->lock);
+               mz->usage_in_excess = new_usage_in_excess;
+               spin_unlock(&mctz->lock);
+       }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+       int node, zone;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       for_each_node_state(node, N_POSSIBLE) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       mz = mem_cgroup_zoneinfo(mem, node, zone);
+                       mctz = soft_limit_tree_node_zone(node, zone);
+                       mem_cgroup_remove_exceeded(mem, mz, mctz);
+               }
+       }
+}
+
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+       return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node *rightmost = NULL;
+       struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+       rightmost = rb_last(&mctz->rb_root);
+       if (!rightmost)
+               goto done;              /* Nothing to reclaim from */
+
+       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+       /*
+        * Remove the node now, but someone else can add it back;
+        * we will add it back at the end of reclaim to its correct
+        * position in the tree.
+        */
+       __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+       if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+               !css_tryget(&mz->mem->css))
+               goto retry;
+done:
+       return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       spin_lock(&mctz->lock);
+       mz = __mem_cgroup_largest_soft_limit_node(mctz);
+       spin_unlock(&mctz->lock);
+       return mz;
+}
+
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+                                        bool charge)
+{
+       int val = (charge) ? 1 : -1;
+       struct mem_cgroup_stat *stat = &mem->stat;
+       struct mem_cgroup_stat_cpu *cpustat;
+       int cpu = get_cpu();
+
+       cpustat = &stat->cpustat[cpu];
+       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+       put_cpu();
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
 {
-       int val = (charge)? 1 : -1;
+       int val = (charge) ? 1 : -1;
        struct mem_cgroup_stat *stat = &mem->stat;
        struct mem_cgroup_stat_cpu *cpustat;
        int cpu = get_cpu();
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        else
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
        put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-       return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-       struct mem_cgroup *mem = pc->mem_cgroup;
-       int nid = page_cgroup_nid(pc);
-       int zid = page_cgroup_zid(pc);
-
-       if (!mem)
-               return NULL;
-
-       return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
 {
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
        return ret;
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+       return (mem == root_mem_cgroup);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 {
        struct page_cgroup *pc;
-       struct mem_cgroup *mem;
        struct mem_cgroup_per_zone *mz;
 
        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
        /* can happen while we handle swapcache. */
-       if (list_empty(&pc->lru) || !pc->mem_cgroup)
+       if (!TestClearPageCgroupAcctLRU(pc))
                return;
+       VM_BUG_ON(!pc->mem_cgroup);
        /*
         * We don't check PCG_USED bit. It's cleared when the "page" is finally
         * removed from global LRU.
         */
        mz = page_cgroup_zoneinfo(pc);
-       mem = pc->mem_cgroup;
        MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+       if (mem_cgroup_is_root(pc->mem_cgroup))
+               return;
+       VM_BUG_ON(list_empty(&pc->lru));
        list_del_init(&pc->lru);
        return;
 }
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
         * For making pc->mem_cgroup visible, insert smp_rmb() here.
         */
        smp_rmb();
-       /* unused page is not rotated. */
-       if (!PageCgroupUsed(pc))
+       /* unused or root page is not rotated. */
+       if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
                return;
        mz = page_cgroup_zoneinfo(pc);
        list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
+       VM_BUG_ON(PageCgroupAcctLRU(pc));
        /*
         * Used bit is set without atomic ops but after smp_wmb().
         * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 
        mz = page_cgroup_zoneinfo(pc);
        MEM_CGROUP_ZSTAT(mz, lru) += 1;
+       SetPageCgroupAcctLRU(pc);
+       if (mem_cgroup_is_root(pc->mem_cgroup))
+               return;
        list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 
        spin_lock_irqsave(&zone->lru_lock, flags);
        /* link when the page is linked to LRU but page_cgroup isn't */
-       if (PageLRU(page) && list_empty(&pc->lru))
+       if (PageLRU(page) && !PageCgroupAcctLRU(pc))
                mem_cgroup_add_lru_list(page, page_lru(page));
        spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, to avoid freeing too much, this returns immediately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
-                                  gfp_t gfp_mask, bool noswap, bool shrink)
+                                               struct zone *zone,
+                                               gfp_t gfp_mask,
+                                               unsigned long reclaim_options)
 {
        struct mem_cgroup *victim;
        int ret, total = 0;
        int loop = 0;
+       bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
+       bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+       bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+       unsigned long excess = mem_cgroup_get_excess(root_mem);
 
        /* If memsw_is_minimum==1, swap-out is of-no-use. */
        if (root_mem->memsw_is_minimum)
                noswap = true;
 
-       while (loop < 2) {
+       while (1) {
                victim = mem_cgroup_select_victim(root_mem);
-               if (victim == root_mem)
+               if (victim == root_mem) {
                        loop++;
+                       if (loop >= 2) {
+                               /*
+                                * If we have not been able to reclaim
+                                * anything, it might be because there are
+                                * no reclaimable pages under this hierarchy
+                                */
+                               if (!check_soft || !total) {
+                                       css_put(&victim->css);
+                                       break;
+                               }
+                               /*
+                                * We want to do more targeted reclaim.
+                                * excess >> 2 is not too excessive, so we
+                                * don't reclaim too much, nor so little that
+                                * we keep coming back to reclaim from this
+                                * cgroup
+                                */
+                               if (total >= (excess >> 2) ||
+                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+                                       css_put(&victim->css);
+                                       break;
+                               }
+                       }
+               }
                if (!mem_cgroup_local_usage(&victim->stat)) {
                        /* this cgroup's local usage == 0 */
                        css_put(&victim->css);
                        continue;
                }
                /* we use swappiness of local cgroup */
-               ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-                                                  get_swappiness(victim));
+               if (check_soft)
+                       ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+                               noswap, get_swappiness(victim), zone,
+                               zone->zone_pgdat->node_id);
+               else
+                       ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+                                               noswap, get_swappiness(victim));
                css_put(&victim->css);
                /*
                 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                if (shrink)
                        return ret;
                total += ret;
-               if (mem_cgroup_check_under_limit(root_mem))
+               if (check_soft) {
+                       if (res_counter_check_under_soft_limit(&root_mem->res))
+                               return total;
+               } else if (mem_cgroup_check_under_limit(root_mem))
                        return 1 + total;
        }
        return total;
@@ -965,11 +1268,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        gfp_t gfp_mask, struct mem_cgroup **memcg,
-                       bool oom)
+                       bool oom, struct page *page)
 {
-       struct mem_cgroup *mem, *mem_over_limit;
+       struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       struct res_counter *fail_res;
+       struct res_counter *fail_res, *soft_fail_res = NULL;
 
        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
                /* Don't account this! */
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
        VM_BUG_ON(css_is_removed(&mem->css));
 
        while (1) {
-               int ret;
-               bool noswap = false;
+               int ret = 0;
+               unsigned long flags = 0;
 
-               ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+               if (mem_cgroup_is_root(mem))
+                       goto done;
+               ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+                                               &soft_fail_res);
                if (likely(!ret)) {
                        if (!do_swap_account)
                                break;
                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-                                                       &fail_res);
+                                                       &fail_res, NULL);
                        if (likely(!ret))
                                break;
                        /* mem+swap counter fails */
-                       res_counter_uncharge(&mem->res, PAGE_SIZE);
-                       noswap = true;
+                       res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+                       flags |= MEM_CGROUP_RECLAIM_NOSWAP;
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        memsw);
                } else
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;
 
-               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-                                                       noswap, false);
+               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+                                               gfp_mask, flags);
                if (ret)
                        continue;
 
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        goto nomem;
                }
        }
+       /*
+        * Insert just the ancestor; we should trickle down to the correct
+        * cgroup for reclaim, since the other nodes will be below their
+        * soft limit
+        */
+       if (soft_fail_res) {
+               mem_over_soft_limit =
+                       mem_cgroup_from_res_counter(soft_fail_res, res);
+               if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+                       mem_cgroup_update_tree(mem_over_soft_limit, page);
+       }
+done:
        return 0;
 nomem:
        css_put(&mem->css);
        return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
-               res_counter_uncharge(&mem->res, PAGE_SIZE);
-               if (do_swap_account)
-                       res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+               if (!mem_cgroup_is_root(mem)) {
+                       res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+                       if (do_swap_account)
+                               res_counter_uncharge(&mem->memsw, PAGE_SIZE,
+                                                       NULL);
+               }
                css_put(&mem->css);
                return;
        }
+
        pc->mem_cgroup = mem;
+       /*
+        * We access a page_cgroup asynchronously without lock_page_cgroup().
+        * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
+        * is accessed after testing the USED bit. To make pc->mem_cgroup visible
+        * before the USED bit, we need a memory barrier here.
+        * See mem_cgroup_add_lru_list(), etc.
+        */
        smp_wmb();
-       pc->flags = pcg_default_flags[ctype];
+       switch (ctype) {
+       case MEM_CGROUP_CHARGE_TYPE_CACHE:
+       case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+               SetPageCgroupCache(pc);
+               SetPageCgroupUsed(pc);
+               break;
+       case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+               ClearPageCgroupCache(pc);
+               SetPageCgroupUsed(pc);
+               break;
+       default:
+               break;
+       }
 
        mem_cgroup_charge_statistics(mem, pc, true);
 
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        if (pc->mem_cgroup != from)
                goto out;
 
-       res_counter_uncharge(&from->res, PAGE_SIZE);
+       if (!mem_cgroup_is_root(from))
+               res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
        mem_cgroup_charge_statistics(from, pc, false);
 
        page = pc->page;
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
                                                1);
        }
 
-       if (do_swap_account)
-               res_counter_uncharge(&from->memsw, PAGE_SIZE);
+       if (do_swap_account && !mem_cgroup_is_root(from))
+               res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
        css_put(&from->css);
 
        css_get(&to->css);
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        parent = mem_cgroup_from_cont(pcg);
 
 
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
        if (ret || !parent)
                return ret;
 
@@ -1268,9 +1609,11 @@ uncharge:
        /* drop extra refcnt by try_charge() */
        css_put(&parent->css);
        /* uncharge if move fails */
-       res_counter_uncharge(&parent->res, PAGE_SIZE);
-       if (do_swap_account)
-               res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+       if (!mem_cgroup_is_root(parent)) {
+               res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+               if (do_swap_account)
+                       res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+       }
        return ret;
 }
 
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        prefetchw(pc);
 
        mem = memcg;
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
        if (ret || !mem)
                return ret;
 
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
        if (!mem)
                goto charge_cur_mm;
        *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
        /* drop extra refcnt from tryget */
        css_put(&mem->css);
        return ret;
 charge_cur_mm:
        if (unlikely(!mm))
                mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, ptr, true);
+       return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                         * This recorded memcg can be obsolete one. So, avoid
                         * calling css_tryget
                         */
-                       res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                       if (!mem_cgroup_is_root(memcg))
+                               res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
+                                                       NULL);
+                       mem_cgroup_swap_statistics(memcg, false);
                        mem_cgroup_put(memcg);
                }
                rcu_read_unlock();
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
                return;
        if (!mem)
                return;
-       res_counter_uncharge(&mem->res, PAGE_SIZE);
-       if (do_swap_account)
-               res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+       if (!mem_cgroup_is_root(mem)) {
+               res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+               if (do_swap_account)
+                       res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+       }
        css_put(&mem->css);
 }
 
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
        struct mem_cgroup_per_zone *mz;
+       bool soft_limit_excess = false;
 
        if (mem_cgroup_disabled())
                return NULL;
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       res_counter_uncharge(&mem->res, PAGE_SIZE);
-       if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-               res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+       if (!mem_cgroup_is_root(mem)) {
+               res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+               if (do_swap_account &&
+                               (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+                       res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+       }
+       if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+               mem_cgroup_swap_statistics(mem, true);
        mem_cgroup_charge_statistics(mem, pc, false);
 
        ClearPageCgroupUsed(pc);
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        mz = page_cgroup_zoneinfo(pc);
        unlock_page_cgroup(pc);
 
+       if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+               mem_cgroup_update_tree(mem, page);
        /* at swapout, this memcg will be accessed to record to swap */
        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                css_put(&mem->css);
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
                 * We uncharge this because swap is freed.
                 * This memcg can be obsolete one. We avoid calling css_tryget
                 */
-               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+               if (!mem_cgroup_is_root(memcg))
+                       res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+               mem_cgroup_swap_statistics(memcg, false);
                mem_cgroup_put(memcg);
        }
        rcu_read_unlock();
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
        unlock_page_cgroup(pc);
 
        if (mem) {
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+                                               page);
                css_put(&mem->css);
        }
        *ptr = mem;
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-                                                  false, true);
+               progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+                                               GFP_KERNEL,
+                                               MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
+               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+                                               MEM_CGROUP_RECLAIM_NOSWAP |
+                                               MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                               gfp_t gfp_mask, int nid,
+                                               int zid)
+{
+       unsigned long nr_reclaimed = 0;
+       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+       unsigned long reclaimed;
+       int loop = 0;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       if (order > 0)
+               return 0;
+
+       mctz = soft_limit_tree_node_zone(nid, zid);
+       /*
+        * This loop can run for a while, especially if mem_cgroups
+        * continuously keep exceeding their soft limit, putting the
+        * system under pressure
+        */
+       do {
+               if (next_mz)
+                       mz = next_mz;
+               else
+                       mz = mem_cgroup_largest_soft_limit_node(mctz);
+               if (!mz)
+                       break;
+
+               reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+                                               gfp_mask,
+                                               MEM_CGROUP_RECLAIM_SOFT);
+               nr_reclaimed += reclaimed;
+               spin_lock(&mctz->lock);
+
+               /*
+                * If we failed to reclaim anything from this memory cgroup
+                * it is time to move on to the next cgroup
+                */
+               next_mz = NULL;
+               if (!reclaimed) {
+                       do {
+                               /*
+                                * Loop until we find yet another one.
+                                *
+                                * By the time we get the soft_limit lock
+                                * again, someone might have added the
+                                * group back on the RB tree. Iterate to
+                                * make sure we get a different mem.
+                                * mem_cgroup_largest_soft_limit_node returns
+                                * NULL if no other cgroup is present on
+                                * the tree
+                                */
+                               next_mz =
+                               __mem_cgroup_largest_soft_limit_node(mctz);
+                               if (next_mz == mz) {
+                                       css_put(&next_mz->mem->css);
+                                       next_mz = NULL;
+                               } else /* next_mz == NULL or other memcg */
+                                       break;
+                       } while (1);
+               }
+               mz->usage_in_excess =
+                       res_counter_soft_limit_excess(&mz->mem->res);
+               __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+               /*
+                * One school of thought says that we should not add
+                * back the node to the tree if reclaim returns 0.
+                * But our reclaim could return 0 simply because, due
+                * to the reclaim priority, we are exposing a smaller
+                * subset of memory to reclaim from. Consider this a
+                * longer-term TODO.
+                */
+               if (mz->usage_in_excess)
+                       __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+               spin_unlock(&mctz->lock);
+               css_put(&mz->mem->css);
+               loop++;
+               /*
+                * Could not reclaim anything and there are no more
+                * mem cgroups to try or we seem to be looping without
+                * reclaiming anything.
+                */
+               if (!nr_reclaimed &&
+                       (next_mz == NULL ||
+                       loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+                       break;
+       } while (!nr_reclaimed);
+       if (next_mz)
+               css_put(&next_mz->mem->css);
+       return nr_reclaimed;
+}
+
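For illustration only, the selection policy of the reclaim loop above reduced
to plain userspace C: the group with the largest soft-limit excess is
reclaimed from first, and a group is revisited only while it still exceeds
its limit.  Group names and numbers are invented; this is a sketch of the
idea, not the kernel implementation (which keeps the groups on per-zone RB
trees).

#include <stdio.h>

struct group { const char *name; long usage, soft_limit; };

static long excess(const struct group *g)
{
        return g->usage > g->soft_limit ? g->usage - g->soft_limit : 0;
}

int main(void)
{
        struct group groups[] = {
                { "A", 900, 500 }, { "B", 300, 400 }, { "C", 700, 100 },
        };
        int i, rounds = 0;

        for (;;) {
                struct group *victim = NULL;

                /* stand-in for mem_cgroup_largest_soft_limit_node() */
                for (i = 0; i < 3; i++)
                        if (excess(&groups[i]) &&
                            (!victim || excess(&groups[i]) > excess(victim)))
                                victim = &groups[i];
                if (!victim || rounds++ > 10)
                        break;

                victim->usage -= 200;   /* stand-in for hierarchical reclaim */
                printf("reclaimed from %s, excess now %ld\n",
                       victim->name, excess(victim));
        }
        return 0;
}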
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
        return retval;
 }
 
+struct mem_cgroup_idx_data {
+       s64 val;
+       enum mem_cgroup_stat_index idx;
+};
+
+static int
+mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+{
+       struct mem_cgroup_idx_data *d = data;
+       d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+       return 0;
+}
+
+static void
+mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+                               enum mem_cgroup_stat_index idx, s64 *val)
+{
+       struct mem_cgroup_idx_data d;
+       d.idx = idx;
+       d.val = 0;
+       mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
+       *val = d.val;
+}
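Because charging of the root group now bypasses the res_counters (see the
mem_cgroup_is_root() checks above), root-level usage is reconstructed from the
per-memcg statistics instead, as the hunk below does for the RES_USAGE reads.
A minimal sketch of that arithmetic in plain C, with made-up page counts:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for the example */

int main(void)
{
        /* totals a mem_cgroup_walk_tree() pass would accumulate */
        unsigned long long cache_pages = 1024;  /* MEM_CGROUP_STAT_CACHE */
        unsigned long long rss_pages   = 2048;  /* MEM_CGROUP_STAT_RSS   */
        unsigned long long usage = (cache_pages + rss_pages) << PAGE_SHIFT;

        printf("root memory.usage_in_bytes: %llu\n", usage);  /* 12582912 */
        return 0;
}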
+
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-       u64 val = 0;
+       u64 idx_val, val;
        int type, name;
 
        type = MEMFILE_TYPE(cft->private);
        name = MEMFILE_ATTR(cft->private);
        switch (type) {
        case _MEM:
-               val = res_counter_read_u64(&mem->res, name);
+               if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+                       mem_cgroup_get_recursive_idx_stat(mem,
+                               MEM_CGROUP_STAT_CACHE, &idx_val);
+                       val = idx_val;
+                       mem_cgroup_get_recursive_idx_stat(mem,
+                               MEM_CGROUP_STAT_RSS, &idx_val);
+                       val += idx_val;
+                       val <<= PAGE_SHIFT;
+               } else
+                       val = res_counter_read_u64(&mem->res, name);
                break;
        case _MEMSWAP:
-               val = res_counter_read_u64(&mem->memsw, name);
+               if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+                       mem_cgroup_get_recursive_idx_stat(mem,
+                               MEM_CGROUP_STAT_CACHE, &idx_val);
+                       val = idx_val;
+                       mem_cgroup_get_recursive_idx_stat(mem,
+                               MEM_CGROUP_STAT_RSS, &idx_val);
+                       val += idx_val;
+                       mem_cgroup_get_recursive_idx_stat(mem,
+                               MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+                       val += idx_val;
+                       val <<= PAGE_SHIFT;
+               } else
+                       val = res_counter_read_u64(&mem->memsw, name);
                break;
        default:
                BUG();
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
        name = MEMFILE_ATTR(cft->private);
        switch (name) {
        case RES_LIMIT:
+               if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+                       ret = -EINVAL;
+                       break;
+               }
                /* This function does all necessary parse...reuse it */
                ret = res_counter_memparse_write_strategy(buffer, &val);
                if (ret)
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                else
                        ret = mem_cgroup_resize_memsw_limit(memcg, val);
                break;
+       case RES_SOFT_LIMIT:
+               ret = res_counter_memparse_write_strategy(buffer, &val);
+               if (ret)
+                       break;
+               /*
+                * For memsw, soft limits are hard to implement in terms
+                * of semantics; for now, we support soft limits only for
+                * memory control without swap
+                */
+               if (type == _MEM)
+                       ret = res_counter_set_soft_limit(&memcg->res, val);
+               else
+                       ret = -EINVAL;
+               break;
        default:
                ret = -EINVAL; /* should be BUG() ? */
                break;
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
                        res_counter_reset_failcnt(&mem->memsw);
                break;
        }
+
        return 0;
 }
 
@@ -2160,6 +2676,7 @@ enum {
        MCS_MAPPED_FILE,
        MCS_PGPGIN,
        MCS_PGPGOUT,
+       MCS_SWAP,
        MCS_INACTIVE_ANON,
        MCS_ACTIVE_ANON,
        MCS_INACTIVE_FILE,
@@ -2181,6 +2698,7 @@ struct {
        {"mapped_file", "total_mapped_file"},
        {"pgpgin", "total_pgpgin"},
        {"pgpgout", "total_pgpgout"},
+       {"swap", "total_swap"},
        {"inactive_anon", "total_inactive_anon"},
        {"active_anon", "total_active_anon"},
        {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
        s->stat[MCS_PGPGIN] += val;
        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
        s->stat[MCS_PGPGOUT] += val;
+       if (do_swap_account) {
+               val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+               s->stat[MCS_SWAP] += val * PAGE_SIZE;
+       }
 
        /* per zone stat */
        val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
        memset(&mystat, 0, sizeof(mystat));
        mem_cgroup_get_local_stat(mem_cont, &mystat);
 
-       for (i = 0; i < NR_MCS_STAT; i++)
+       for (i = 0; i < NR_MCS_STAT; i++) {
+               if (i == MCS_SWAP && !do_swap_account)
+                       continue;
                cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+       }
 
        /* Hierarchical information */
        {
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 
        memset(&mystat, 0, sizeof(mystat));
        mem_cgroup_get_total_stat(mem_cont, &mystat);
-       for (i = 0; i < NR_MCS_STAT; i++)
+       for (i = 0; i < NR_MCS_STAT; i++) {
+               if (i == MCS_SWAP && !do_swap_account)
+                       continue;
                cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
-
+       }
 
 #ifdef CONFIG_DEBUG_VM
        cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2344,6 +2871,12 @@ static struct cftype mem_cgroup_files[] = {
                .write_string = mem_cgroup_write,
                .read_u64 = mem_cgroup_read,
        },
+       {
+               .name = "soft_limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+               .write_string = mem_cgroup_write,
+               .read_u64 = mem_cgroup_read,
+       },
        {
                .name = "failcnt",
                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
                mz = &pn->zoneinfo[zone];
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
+               mz->usage_in_excess = 0;
+               mz->on_tree = false;
+               mz->mem = mem;
        }
        return 0;
 }
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
        int node;
 
+       mem_cgroup_remove_from_trees(mem);
        free_css_id(&mem_cgroup_subsys, &mem->css);
 
        for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+       struct mem_cgroup_tree_per_node *rtpn;
+       struct mem_cgroup_tree_per_zone *rtpz;
+       int tmp, node, zone;
+
+       for_each_node_state(node, N_POSSIBLE) {
+               tmp = node;
+               if (!node_state(node, N_NORMAL_MEMORY))
+                       tmp = -1;
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+               if (!rtpn)
+                       return 1;
+
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+       }
+       return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;
+
        /* root ? */
        if (cont->parent == NULL) {
                enable_swap_cgroup();
                parent = NULL;
+               root_mem_cgroup = mem;
+               if (mem_cgroup_soft_limit_tree_init())
+                       goto free_out;
+
        } else {
                parent = mem_cgroup_from_cont(cont->parent);
                mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        return &mem->css;
 free_out:
        __mem_cgroup_free(mem);
+       root_mem_cgroup = NULL;
        return ERR_PTR(error);
 }
 
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
-                               struct task_struct *p)
+                               struct task_struct *p,
+                               bool threadgroup)
 {
        mutex_lock(&memcg_tasklist);
        /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644 (file)
index 0000000..729d4b1
--- /dev/null
@@ -0,0 +1,832 @@
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
+ * failure.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronously to other VM
+ * users, because memory failures could happen anytime and anywhere,
+ * possibly violating some of their assumptions. This is why this code
+ * has to be extremely careful. Generally it tries to use normal locking
+ * rules, as in get the standard locks, even if that means the
+ * error handling takes potentially a long time.
+ *
+ * The operation to map back from RMAP chains to processes has to walk
+ * the complete process list and has non-linear complexity in the number
+ * of mappings. In short it can be quite slow. But since memory corruptions
+ * are rare we hope to get away with this.
+ */
+
+/*
+ * Notebook:
+ * - hugetlb needs more code
+ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
+ * - pass bad pages to kdump next kernel
+ */
+#define DEBUG 1                /* remove me in 2.6.34 */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include "internal.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0;
+
+int sysctl_memory_failure_recovery __read_mostly = 1;
+
+atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+/*
+ * Send all the processes that have the page mapped an ``action optional''
+ * signal.
+ */
+static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
+                       unsigned long pfn)
+{
+       struct siginfo si;
+       int ret;
+
+       printk(KERN_ERR
+               "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+               pfn, t->comm, t->pid);
+       si.si_signo = SIGBUS;
+       si.si_errno = 0;
+       si.si_code = BUS_MCEERR_AO;
+       si.si_addr = (void *)addr;
+#ifdef __ARCH_SI_TRAPNO
+       si.si_trapno = trapno;
+#endif
+       si.si_addr_lsb = PAGE_SHIFT;
+       /*
+        * Don't use force here, it's convenient if the signal
+        * can be temporarily blocked.
+        * This could cause a loop when the user sets SIGBUS
+        * to SIG_IGN, but hopefully no one will do that?
+        */
+       ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+       if (ret < 0)
+               printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
+                      t->comm, t->pid, ret);
+       return ret;
+}
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+       struct list_head nd;
+       struct task_struct *tsk;
+       unsigned long addr;
+       unsigned addr_valid:1;
+};
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do. We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ * TBD would GFP_NOIO be enough?
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+                      struct vm_area_struct *vma,
+                      struct list_head *to_kill,
+                      struct to_kill **tkc)
+{
+       struct to_kill *tk;
+
+       if (*tkc) {
+               tk = *tkc;
+               *tkc = NULL;
+       } else {
+               tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+               if (!tk) {
+                       printk(KERN_ERR
+               "MCE: Out of memory while machine check handling\n");
+                       return;
+               }
+       }
+       tk->addr = page_address_in_vma(p, vma);
+       tk->addr_valid = 1;
+
+       /*
+        * In theory we don't have to kill when the page was
+        * munmapped. But it could also be a mremap. Since that's
+        * likely very rare, kill anyway just out of paranoia, but use
+        * a SIGKILL because the error is not contained anymore.
+        */
+       if (tk->addr == -EFAULT) {
+               pr_debug("MCE: Unable to find user space address %lx in %s\n",
+                       page_to_pfn(p), tsk->comm);
+               tk->addr_valid = 0;
+       }
+       get_task_struct(tsk);
+       tk->tsk = tsk;
+       list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when DOIT is set, otherwise just free the list
+ * (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ */
+static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
+                         int fail, unsigned long pfn)
+{
+       struct to_kill *tk, *next;
+
+       list_for_each_entry_safe (tk, next, to_kill, nd) {
+               if (doit) {
+                       /*
+                        * In case something went wrong with munmapping,
+                        * make sure the process doesn't catch the
+                        * signal and then access the memory. Just kill it.
+                        */
+                       if (fail || tk->addr_valid == 0) {
+                               printk(KERN_ERR
+               "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+                                       pfn, tk->tsk->comm, tk->tsk->pid);
+                               force_sig(SIGKILL, tk->tsk);
+                       }
+
+                       /*
+                        * In theory the process could have mapped
+                        * something else on the address in-between. We could
+                        * check for that, but we need to tell the
+                        * process anyways.
+                        */
+                       else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
+                                             pfn) < 0)
+                               printk(KERN_ERR
+               "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+                                       pfn, tk->tsk->comm, tk->tsk->pid);
+               }
+               put_task_struct(tk->tsk);
+               kfree(tk);
+       }
+}
+
+static int task_early_kill(struct task_struct *tsk)
+{
+       if (!tsk->mm)
+               return 0;
+       if (tsk->flags & PF_MCE_PROCESS)
+               return !!(tsk->flags & PF_MCE_EARLY);
+       return sysctl_memory_failure_early_kill;
+}
+
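The decision in task_early_kill() above is: kernel threads (no mm) are never
signalled early, a per-task PF_MCE_PROCESS/PF_MCE_EARLY setting takes
precedence, and only otherwise does the vm.memory_failure_early_kill sysctl
apply.  A standalone restatement of that precedence in plain C; the flag
values here are illustrative, not the real task_struct flag bits:

#include <stdio.h>

#define PF_MCE_PROCESS  0x1     /* task set its own MCE kill policy */
#define PF_MCE_EARLY    0x2     /* ... and asked for early kill */

static int early_kill(int has_mm, unsigned int flags, int sysctl_early_kill)
{
        if (!has_mm)
                return 0;
        if (flags & PF_MCE_PROCESS)
                return !!(flags & PF_MCE_EARLY);
        return sysctl_early_kill;
}

int main(void)
{
        printf("%d\n", early_kill(1, 0, 1));                            /* 1 */
        printf("%d\n", early_kill(1, PF_MCE_PROCESS, 1));               /* 0 */
        printf("%d\n", early_kill(1, PF_MCE_PROCESS | PF_MCE_EARLY, 0));/* 1 */
        return 0;
}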
+/*
+ * Collect processes when the error hit an anonymous page.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+                             struct to_kill **tkc)
+{
+       struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct anon_vma *av;
+
+       read_lock(&tasklist_lock);
+       av = page_lock_anon_vma(page);
+       if (av == NULL) /* Not actually mapped anymore */
+               goto out;
+       for_each_process (tsk) {
+               if (!task_early_kill(tsk))
+                       continue;
+               list_for_each_entry (vma, &av->head, anon_vma_node) {
+                       if (!page_mapped_in_vma(page, vma))
+                               continue;
+                       if (vma->vm_mm == tsk->mm)
+                               add_to_kill(tsk, page, vma, to_kill, tkc);
+               }
+       }
+       page_unlock_anon_vma(av);
+out:
+       read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+                             struct to_kill **tkc)
+{
+       struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       struct prio_tree_iter iter;
+       struct address_space *mapping = page->mapping;
+
+       /*
+        * A note on the locking order between the two locks.
+        * We don't rely on this particular order.
+        * If you have some other code that needs a different order
+        * feel free to switch them around. Or add a reverse link
+        * from mm_struct to task_struct, then this could be all
+        * done without taking tasklist_lock and looping over all tasks.
+        */
+
+       read_lock(&tasklist_lock);
+       spin_lock(&mapping->i_mmap_lock);
+       for_each_process(tsk) {
+               pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+               if (!task_early_kill(tsk))
+                       continue;
+
+               vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+                                     pgoff) {
+                       /*
+                        * Send early kill signal to tasks where a vma covers
+                        * the page but the corrupted page is not necessarily
+                        * mapped in its pte.
+                        * Assume applications that requested early kill want
+                        * to be informed of all such data corruptions.
+                        */
+                       if (vma->vm_mm == tsk->mm)
+                               add_to_kill(tsk, page, vma, to_kill, tkc);
+               }
+       }
+       spin_unlock(&mapping->i_mmap_lock);
+       read_unlock(&tasklist_lock);
+}
+
+/*
+ * Collect the processes that have the corrupted page mapped, so they
+ * can be killed.  This is done in two steps for locking reasons.
+ * First preallocate one tokill structure outside the spin locks,
+ * so that we can kill at least one process reasonably reliably.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill)
+{
+       struct to_kill *tk;
+
+       if (!page->mapping)
+               return;
+
+       tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
+       if (!tk)
+               return;
+       if (PageAnon(page))
+               collect_procs_anon(page, tokill, &tk);
+       else
+               collect_procs_file(page, tokill, &tk);
+       kfree(tk);
+}
+
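The tokill preallocation above follows a common pattern: reserve one element
with a blocking-capable allocation before taking the locks, so at least the
first victim can be recorded even if the atomic allocations inside
add_to_kill() fail.  A plain-C sketch of that pattern with invented names:

#include <stdio.h>
#include <stdlib.h>

struct victim { int pid; struct victim *next; };

static void record(int pid, struct victim **list, struct victim **prealloc)
{
        struct victim *v;

        if (*prealloc) {                /* consume the reserved element first */
                v = *prealloc;
                *prealloc = NULL;
        } else {
                v = malloc(sizeof(*v)); /* stand-in for GFP_ATOMIC */
                if (!v)
                        return;         /* best effort, as in the kernel */
        }
        v->pid = pid;
        v->next = *list;
        *list = v;
}

int main(void)
{
        struct victim *list = NULL;
        struct victim *prealloc = malloc(sizeof(*prealloc)); /* "GFP_NOIO" */

        record(101, &list, &prealloc);
        record(102, &list, &prealloc);
        while (list) {
                struct victim *next = list->next;

                printf("would signal pid %d\n", list->pid);
                free(list);
                list = next;
        }
        free(prealloc);
        return 0;
}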
+/*
+ * Error handlers for various types of pages.
+ */
+
+enum outcome {
+       FAILED,         /* Error handling failed */
+       DELAYED,        /* Will be handled later */
+       IGNORED,        /* Error safely ignored */
+       RECOVERED,      /* Successfully recovered */
+};
+
+static const char *action_name[] = {
+       [FAILED] = "Failed",
+       [DELAYED] = "Delayed",
+       [IGNORED] = "Ignored",
+       [RECOVERED] = "Recovered",
+};
+
+/*
+ * Error hit kernel page.
+ * Do nothing; try to be lucky and just not touch it. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+       return DELAYED;
+}
+
+/*
+ * Already poisoned page.
+ */
+static int me_ignore(struct page *p, unsigned long pfn)
+{
+       return IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+       printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+       return FAILED;
+}
+
+/*
+ * Free memory
+ */
+static int me_free(struct page *p, unsigned long pfn)
+{
+       return DELAYED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+       int err;
+       int ret = FAILED;
+       struct address_space *mapping;
+
+       if (!isolate_lru_page(p))
+               page_cache_release(p);
+
+       /*
+        * For anonymous pages we're done; the only reference left
+        * should be the one memory_failure() holds.
+        */
+       if (PageAnon(p))
+               return RECOVERED;
+
+       /*
+        * Now truncate the page in the page cache. This is really
+        * more like a "temporary hole punch".
+        * Don't do this for block devices when someone else
+        * has a reference, because it could be file system metadata
+        * and that's not safe to truncate.
+        */
+       mapping = page_mapping(p);
+       if (!mapping) {
+               /*
+                * Page has been torn down in the meantime
+                */
+               return FAILED;
+       }
+
+       /*
+        * Truncation is a bit tricky. Enable it per file system for now.
+        *
+        * Open: to take i_mutex or not for this? Right now we don't.
+        */
+       if (mapping->a_ops->error_remove_page) {
+               err = mapping->a_ops->error_remove_page(mapping, p);
+               if (err != 0) {
+                       printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
+                                       pfn, err);
+               } else if (page_has_private(p) &&
+                               !try_to_release_page(p, GFP_NOIO)) {
+                       pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+               } else {
+                       ret = RECOVERED;
+               }
+       } else {
+               /*
+                * If the file system doesn't support it, just invalidate.
+                * This fails on dirty pages or anything with private data.
+                */
+               if (invalidate_inode_page(p))
+                       ret = RECOVERED;
+               else
+                       printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
+                               pfn);
+       }
+       return ret;
+}
+
+/*
+ * Dirty page cache page.
+ * Issues: when the error hits a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+       struct address_space *mapping = page_mapping(p);
+
+       SetPageError(p);
+       /* TBD: print more information about the file. */
+       if (mapping) {
+               /*
+                * IO error will be reported by write(), fsync(), etc.
+                * who check the mapping.
+                * This way the application knows that something went
+                * wrong with its dirty file data.
+                *
+                * There's one open issue:
+                *
+                * The EIO will be only reported on the next IO
+                * operation and then cleared through the IO map.
+                * Normally Linux has two mechanisms to pass an IO error:
+                * first through the AS_EIO flag in the address space
+                * and then through the PageError flag in the page.
+                * Since we drop pages on memory failure handling the
+                * only mechanism open to use is through AS_EIO.
+                *
+                * This has the disadvantage that it gets cleared on
+                * the first operation that returns an error, while
+                * the PageError bit is more sticky and only cleared
+                * when the page is reread or dropped.  If an
+                * application assumes it will always get error on
+                * fsync, but does other operations on the fd before
+                * and the page is dropped in between, then the error
+                * will not be properly reported.
+                *
+                * This can already happen even without hwpoisoned
+                * pages: first on metadata IO errors (which only
+                * report through AS_EIO) or when the page is dropped
+                * at the wrong time.
+                *
+                * So right now we assume that the application DTRT on
+                * the first EIO, but we're not worse than other parts
+                * of the kernel.
+                */
+               mapping_set_error(mapping, EIO);
+       }
+
+       return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache (i.e. the page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ *      - clear dirty bit to prevent IO
+ *      - remove from LRU
+ *      - but keep in the swap cache, so that when we return to it on
+ *        a later page fault, we know the application is accessing
+ *        corrupted data and shall be killed (we installed simple
+ *        interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+       int ret = FAILED;
+
+       ClearPageDirty(p);
+       /* Trigger EIO in shmem: */
+       ClearPageUptodate(p);
+
+       if (!isolate_lru_page(p)) {
+               page_cache_release(p);
+               ret = DELAYED;
+       }
+
+       return ret;
+}
+
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+       int ret = FAILED;
+
+       if (!isolate_lru_page(p)) {
+               page_cache_release(p);
+               ret = RECOVERED;
+       }
+       delete_from_swap_cache(p);
+       return ret;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * No rmap support, so we cannot find the original mapper. In theory we could
+ * walk all MMs and look for the mappings, but that would be non-atomic and
+ * racy. We need rmap for hugepages for this. Alternatively we could employ a
+ * heuristic, like just walking the current process and hoping it has it
+ * mapped (that should usually be true for the common "shared database cache"
+ * case).
+ * Should handle free huge pages and dequeue them too, but this needs to
+ * handle huge page accounting correctly.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+       return FAILED;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access a page at any time
+ * in its life cycle, so all accesses have to be extremely careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty          (1UL << PG_dirty)
+#define sc             (1UL << PG_swapcache)
+#define unevict                (1UL << PG_unevictable)
+#define mlock          (1UL << PG_mlocked)
+#define writeback      (1UL << PG_writeback)
+#define lru            (1UL << PG_lru)
+#define swapbacked     (1UL << PG_swapbacked)
+#define head           (1UL << PG_head)
+#define tail           (1UL << PG_tail)
+#define compound       (1UL << PG_compound)
+#define slab           (1UL << PG_slab)
+#define buddy          (1UL << PG_buddy)
+#define reserved       (1UL << PG_reserved)
+
+static struct page_state {
+       unsigned long mask;
+       unsigned long res;
+       char *msg;
+       int (*action)(struct page *p, unsigned long pfn);
+} error_states[] = {
+       { reserved,     reserved,       "reserved kernel",      me_ignore },
+       { buddy,        buddy,          "free kernel",  me_free },
+
+       /*
+        * Could in theory check if slab page is free or if we can drop
+        * currently unused objects without touching them. But just
+        * treat it as standard kernel for now.
+        */
+       { slab,         slab,           "kernel slab",  me_kernel },
+
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+       { head,         head,           "huge",         me_huge_page },
+       { tail,         tail,           "huge",         me_huge_page },
+#else
+       { compound,     compound,       "huge",         me_huge_page },
+#endif
+
+       { sc|dirty,     sc|dirty,       "swapcache",    me_swapcache_dirty },
+       { sc|dirty,     sc,             "swapcache",    me_swapcache_clean },
+
+       { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
+       { unevict,      unevict,        "unevictable LRU", me_pagecache_clean},
+
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
+       { mlock|dirty,  mlock|dirty,    "mlocked LRU",  me_pagecache_dirty },
+       { mlock,        mlock,          "mlocked LRU",  me_pagecache_clean },
+#endif
+
+       { lru|dirty,    lru|dirty,      "LRU",          me_pagecache_dirty },
+       { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
+       { swapbacked,   swapbacked,     "anonymous",    me_pagecache_clean },
+
+       /*
+        * Catchall entry: must be at end.
+        */
+       { 0,            0,              "unknown page state",   me_unknown },
+};
+
+#undef lru
+
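The error_states[] table above is consulted first-match-wins: each entry is a
(mask, result) pair, and the { 0, 0, ... } catch-all at the end guarantees
that the lookup loop in __memory_failure() terminates.  A self-contained
restatement of that lookup in plain C, with invented flag bits:

#include <stdio.h>

#define F_DIRTY (1UL << 0)
#define F_LRU   (1UL << 1)
#define F_SLAB  (1UL << 2)

struct state { unsigned long mask, res; const char *msg; };

static const struct state states[] = {
        { F_SLAB,          F_SLAB,          "kernel slab" },
        { F_LRU | F_DIRTY, F_LRU | F_DIRTY, "dirty LRU"   },
        { F_LRU | F_DIRTY, F_LRU,           "clean LRU"   },
        { 0,               0,               "unknown"     },  /* catch-all */
};

static const char *classify(unsigned long flags)
{
        const struct state *s;

        for (s = states; ; s++)                 /* catch-all always matches */
                if ((flags & s->mask) == s->res)
                        return s->msg;
}

int main(void)
{
        printf("%s\n", classify(F_LRU));            /* clean LRU */
        printf("%s\n", classify(F_LRU | F_DIRTY));  /* dirty LRU */
        printf("%s\n", classify(F_SLAB | F_DIRTY)); /* kernel slab */
        printf("%s\n", classify(0));                /* unknown */
        return 0;
}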
+static void action_result(unsigned long pfn, char *msg, int result)
+{
+       struct page *page = NULL;
+       if (pfn_valid(pfn))
+               page = pfn_to_page(pfn);
+
+       printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
+               pfn,
+               page && PageDirty(page) ? "dirty " : "",
+               msg, action_name[result]);
+}
+
+static int page_action(struct page_state *ps, struct page *p,
+                       unsigned long pfn, int ref)
+{
+       int result;
+
+       result = ps->action(p, pfn);
+       action_result(pfn, ps->msg, result);
+       if (page_count(p) != 1 + ref)
+               printk(KERN_ERR
+                      "MCE %#lx: %s page still referenced by %d users\n",
+                      pfn, ps->msg, page_count(p) - 1);
+
+       /* Could do more checks here if page looks ok */
+       /*
+        * Could adjust zone counters here to correct for the missing page.
+        */
+
+       return result == RECOVERED ? 0 : -EBUSY;
+}
+
+#define N_UNMAP_TRIES 5
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+                                 int trapno)
+{
+       enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+       struct address_space *mapping;
+       LIST_HEAD(tokill);
+       int ret;
+       int i;
+       int kill = 1;
+
+       if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+               return;
+
+       if (!PageLRU(p))
+               lru_add_drain_all();
+
+       /*
+        * This check implies we don't kill processes if their pages
+        * are in the swap cache early. Those are always late kills.
+        */
+       if (!page_mapped(p))
+               return;
+
+       if (PageSwapCache(p)) {
+               printk(KERN_ERR
+                      "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+               ttu |= TTU_IGNORE_HWPOISON;
+       }
+
+       /*
+        * Propagate the dirty bit from PTEs to struct page first, because we
+        * need this to decide if we should kill or just drop the page.
+        */
+       mapping = page_mapping(p);
+       if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
+               if (page_mkclean(p)) {
+                       SetPageDirty(p);
+               } else {
+                       kill = 0;
+                       ttu |= TTU_IGNORE_HWPOISON;
+                       printk(KERN_INFO
+       "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+                               pfn);
+               }
+       }
+
+       /*
+        * First collect all the processes that have the page
+        * mapped in dirty form.  This has to be done before try_to_unmap,
+        * because ttu takes the rmap data structures down.
+        *
+        * Error handling: We ignore errors here because
+        * there's nothing that can be done.
+        */
+       if (kill)
+               collect_procs(p, &tokill);
+
+       /*
+        * try_to_unmap can fail temporarily due to races.
+        * Try a few times (RED-PEN better strategy?)
+        */
+       for (i = 0; i < N_UNMAP_TRIES; i++) {
+               ret = try_to_unmap(p, ttu);
+               if (ret == SWAP_SUCCESS)
+                       break;
+               pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
+       }
+
+       if (ret != SWAP_SUCCESS)
+               printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+                               pfn, page_mapcount(p));
+
+       /*
+        * Now that the dirty bit has been propagated to the
+        * struct page and all unmaps done we can decide if
+        * killing is needed or not.  Only kill when the page
+        * was dirty, otherwise the tokill list is merely
+        * freed.  When there was a problem unmapping earlier
+        * use a more forceful uncatchable kill to prevent
+        * any accesses to the poisoned memory.
+        */
+       kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+                     ret != SWAP_SUCCESS, pfn);
+}
+
+int __memory_failure(unsigned long pfn, int trapno, int ref)
+{
+       struct page_state *ps;
+       struct page *p;
+       int res;
+
+       if (!sysctl_memory_failure_recovery)
+               panic("Memory failure from trap %d on page %lx", trapno, pfn);
+
+       if (!pfn_valid(pfn)) {
+               action_result(pfn, "memory outside kernel control", IGNORED);
+               return -EIO;
+       }
+
+       p = pfn_to_page(pfn);
+       if (TestSetPageHWPoison(p)) {
+               action_result(pfn, "already hardware poisoned", IGNORED);
+               return 0;
+       }
+
+       atomic_long_add(1, &mce_bad_pages);
+
+       /*
+        * We need/can do nothing about count=0 pages.
+        * 1) it's a free page, and therefore in safe hand:
+        *    prep_new_page() will be the gate keeper.
+        * 2) it's part of a non-compound high order page.
+        *    Implies some kernel user: we cannot stop them from
+        *    reading/writing the page; let's pray that the page has been
+        *    used and will be freed some time later.
+        * In fact it's dangerous to directly bump up page count from 0,
+        * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+        */
+       if (!get_page_unless_zero(compound_head(p))) {
+               action_result(pfn, "free or high order kernel", IGNORED);
+               return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
+       }
+
+       /*
+        * Lock the page and wait for writeback to finish.
+        * It's very difficult to mess with pages currently under IO
+        * and in many cases impossible, so we just avoid it here.
+        */
+       lock_page_nosync(p);
+       wait_on_page_writeback(p);
+
+       /*
+        * Now take care of user space mappings.
+        */
+       hwpoison_user_mappings(p, pfn, trapno);
+
+       /*
+        * Torn down by someone else?
+        */
+       if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+               action_result(pfn, "already truncated LRU", IGNORED);
+               res = 0;
+               goto out;
+       }
+
+       res = -EBUSY;
+       for (ps = error_states;; ps++) {
+               if ((p->flags & ps->mask) == ps->res) {
+                       res = page_action(ps, p, pfn, ref);
+                       break;
+               }
+       }
+out:
+       unlock_page(p);
+       return res;
+}
+EXPORT_SYMBOL_GPL(__memory_failure);
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber).
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks held.
+ */
+void memory_failure(unsigned long pfn, int trapno)
+{
+       __memory_failure(pfn, trapno, 0);
+}
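A hedged sketch of how these entry points are meant to be consumed; only the
__memory_failure() signature and its return convention (0 when the page was
handled, negative when it could not be isolated) come from the code above.
The wrapper name and message are invented, the header is assumed to carry the
declarations added by this series, and a real caller must already be in
process context, e.g. work deferred from a machine-check handler:

#include <linux/kernel.h>
#include <linux/mm.h>

/* Hypothetical deferred-work callee; not part of this patch. */
static void example_handle_corrupted_pfn(unsigned long pfn, int trapno)
{
        int ret = __memory_failure(pfn, trapno, 0);

        if (ret)
                printk(KERN_ERR "example: pfn %#lx not isolated (%d)\n",
                       pfn, ret);
}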
index b1443ac07c00a4f1a46de6bb260d00e8f52f99db..7e91b5f9f690e4ae9d7c3e58bc1530c995311089 100644 (file)
@@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                unsigned long addr = vma->vm_start;
 
                /*
-                * Hide vma from rmap and vmtruncate before freeing pgtables
+                * Hide vma from rmap and truncate_pagecache before freeing
+                * pgtables
                 */
                anon_vma_unlink(vma);
                unlink_file_vma(vma);
@@ -1325,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                if (ret & VM_FAULT_ERROR) {
                                        if (ret & VM_FAULT_OOM)
                                                return i ? i : -ENOMEM;
-                                       else if (ret & VM_FAULT_SIGBUS)
+                                       if (ret &
+                                           (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
@@ -2407,7 +2409,7 @@ restart:
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
- * boundary.  Note that this is different from vmtruncate(), which
+ * boundary.  Note that this is different from truncate_pagecache(), which
  * must keep the partial page.  In contrast, we must get rid of
  * partial pages.
  * @holelen: size of prospective hole in bytes.  This will be rounded
@@ -2458,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
-       if (inode->i_size < offset) {
-               unsigned long limit;
-
-               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-               if (limit != RLIM_INFINITY && offset > limit)
-                       goto out_sig;
-               if (offset > inode->i_sb->s_maxbytes)
-                       goto out_big;
-               i_size_write(inode, offset);
-       } else {
-               struct address_space *mapping = inode->i_mapping;
-
-               /*
-                * truncation of in-use swapfiles is disallowed - it would
-                * cause subsequent swapout to scribble on the now-freed
-                * blocks.
-                */
-               if (IS_SWAPFILE(inode))
-                       return -ETXTBSY;
-               i_size_write(inode, offset);
-
-               /*
-                * unmap_mapping_range is called twice, first simply for
-                * efficiency so that truncate_inode_pages does fewer
-                * single-page unmaps.  However after this first call, and
-                * before truncate_inode_pages finishes, it is possible for
-                * private pages to be COWed, which remain after
-                * truncate_inode_pages finishes, hence the second
-                * unmap_mapping_range call must be made for correctness.
-                */
-               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-               truncate_inode_pages(mapping, offset);
-               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-       }
-
-       if (inode->i_op->truncate)
-               inode->i_op->truncate(inode);
-       return 0;
-
-out_sig:
-       send_sig(SIGXFSZ, current, 0);
-out_big:
-       return -EFBIG;
-}
-EXPORT_SYMBOL(vmtruncate);
-
 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 {
        struct address_space *mapping = inode->i_mapping;
@@ -2559,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out;
 
        entry = pte_to_swp_entry(orig_pte);
-       if (is_migration_entry(entry)) {
-               migration_entry_wait(mm, pmd, address);
+       if (unlikely(non_swap_entry(entry))) {
+               if (is_migration_entry(entry)) {
+                       migration_entry_wait(mm, pmd, address);
+               } else if (is_hwpoison_entry(entry)) {
+                       ret = VM_FAULT_HWPOISON;
+               } else {
+                       print_bad_pte(vma, address, orig_pte, NULL);
+                       ret = VM_FAULT_OOM;
+               }
                goto out;
        }
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2584,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
+       } else if (PageHWPoison(page)) {
+               ret = VM_FAULT_HWPOISON;
+               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               goto out;
        }
 
        lock_page(page);
@@ -2760,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
 
+       if (unlikely(PageHWPoison(vmf.page))) {
+               if (ret & VM_FAULT_LOCKED)
+                       unlock_page(vmf.page);
+               return VM_FAULT_HWPOISON;
+       }
+
        /*
         * For consistency in subsequent calls, make the faulted page always
         * locked.
index 16052e80aaacbc182c9ea421bb7f84b02fc3a5b4..1a4bf4813780eb700ee026030bca18fedc2fbae6 100644 (file)
@@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
        }
 
        /* Establish migration ptes or remove ptes */
-       try_to_unmap(page, 1);
+       try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
 skip_unmap:
        if (!page_mapped(page))
index 20a07dba6be04fb20f21bbac5ed1e289a8ccd354..97bff2547719e702150e1cdc4d4a3f6b31a23213 100644 (file)
@@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        if (vma->vm_file) {
                /*
                 * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock vmtruncate out,
-                * since it might clean the dst vma before the src vma,
+                * moving file-based ptes, we must lock truncate_pagecache
+                * out, since it might clean the dst vma before the src vma,
                 * and we propagate stale pages into the dst afterward.
                 */
                mapping = vma->vm_file->f_mapping;
index 8d484241d0345e71750b71b36d37bfa7f21bdd06..56a446f059716ccb95736fa7670dd1f07e6b30b5 100644 (file)
@@ -82,46 +82,6 @@ DECLARE_RWSEM(nommu_region_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page.  Ugly, but necessary.
- */
-int vmtruncate(struct inode *inode, loff_t offset)
-{
-       struct address_space *mapping = inode->i_mapping;
-       unsigned long limit;
-
-       if (inode->i_size < offset)
-               goto do_expand;
-       i_size_write(inode, offset);
-
-       truncate_inode_pages(mapping, offset);
-       goto out_truncate;
-
-do_expand:
-       limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-       if (limit != RLIM_INFINITY && offset > limit)
-               goto out_sig;
-       if (offset > inode->i_sb->s_maxbytes)
-               goto out;
-       i_size_write(inode, offset);
-
-out_truncate:
-       if (inode->i_op->truncate)
-               inode->i_op->truncate(inode);
-       return 0;
-out_sig:
-       send_sig(SIGXFSZ, current, 0);
-out:
-       return -EFBIG;
-}
-
-EXPORT_SYMBOL(vmtruncate);
-
 /*
  * Return the total memory allocated for this pointer, not
  * just what the caller asked for.
index 5f378dd588027c227dc95551e27e4b6431e5df60..d99664e8607e761235a13b2662353df131ac5b41 100644 (file)
@@ -155,37 +155,37 @@ static void update_completion_period(void)
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret;
 
-       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_bytes = 0;
        return ret;
 }
 
 int dirty_background_bytes_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret;
 
-       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_ratio = 0;
        return ret;
 }
 
 int dirty_ratio_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int old_ratio = vm_dirty_ratio;
        int ret;
 
-       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
                update_completion_period();
                vm_dirty_bytes = 0;
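
The same mechanical change runs through the rest of this merge: proc handlers and the proc_do*() helpers lose their unused struct file * argument. As a hedged sketch of the new calling convention (the handler name and the apply hook are hypothetical, not part of this patch), a handler now looks like:

static int example_tunable_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* Note: no struct file * parameter, and none forwarded to the helper. */
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		example_apply_new_value();	/* hypothetical: react to the written value */
	return ret;
}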
@@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
 
 int dirty_bytes_handler(struct ctl_table *table, int write,
-               struct file *filp, void __user *buffer, size_t *lenp,
+               void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        unsigned long old_bytes = vm_dirty_bytes;
        int ret;
 
-       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
                update_completion_period();
                vm_dirty_ratio = 0;
@@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, buffer, length, ppos);
        return 0;
 }
 
@@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 EXPORT_SYMBOL(redirty_page_for_writepage);
 
 /*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory error handling, which prefers a
+ * consistent dirty state. This rule can be broken in some special cases,
+ * but it is better not to.
+ *
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
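
A minimal sketch of the rule documented above, assuming the page is already in the pagecache and a reference is held (this mirrors the shmem_write_end() reordering later in this diff; it is an illustration, not part of the patch):

	lock_page(page);
	/* ... modify the page contents ... */
	set_page_dirty(page);	/* mark dirty while the page lock is still held */
	unlock_page(page);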
index 5717f27a0704b18637221c0bd9dcd820c7bc17c3..bf720550b44d85adc294f7fd0b8ede38f73a8902 100644 (file)
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
        static unsigned long nr_shown;
        static unsigned long nr_unshown;
 
+       /* Don't complain about poisoned pages */
+       if (PageHWPoison(page)) {
+               __ClearPageBuddy(page);
+               return;
+       }
+
        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
@@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
 {
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
@@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
                bad_page(page);
                return 1;
        }
+       return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+       int i;
+
+       for (i = 0; i < (1 << order); i++) {
+               struct page *p = page + i;
+               if (unlikely(check_new_page(p)))
+                       return 1;
+       }
 
        set_page_private(page, 0);
        set_page_refcounted(page);
@@ -2373,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
  * sysctl handler for numa_zonelist_order
  */
 int numa_zonelist_order_handler(ctl_table *table, int write,
-               struct file *file, void __user *buffer, size_t *length,
+               void __user *buffer, size_t *length,
                loff_t *ppos)
 {
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2382,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
        if (write)
                strncpy(saved_string, (char*)table->data,
                        NUMA_ZONELIST_ORDER_LEN);
-       ret = proc_dostring(table, write, file, buffer, length, ppos);
+       ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
                return ret;
        if (write) {
@@ -4706,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
  *     changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, buffer, length, ppos);
        if (write)
                setup_per_zone_wmarks();
        return 0;
@@ -4716,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
        struct zone *zone;
        int rc;
 
-       rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
 
@@ -4732,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 }
 
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
        struct zone *zone;
        int rc;
 
-       rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
 
@@ -4758,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
  * if in function of the boot time zone sizes.
  */
 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       proc_dointvec_minmax(table, write, buffer, length, ppos);
        setup_per_zone_lowmem_reserve();
        return 0;
 }
@@ -4772,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
  */
 
 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       void __user *buffer, size_t *length, loff_t *ppos)
 {
        struct zone *zone;
        unsigned int cpu;
        int ret;
 
-       ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+       ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (!write || (ret == -EINVAL))
                return ret;
        for_each_populated_zone(zone) {
index 720fc03a7bc454de75fa86f542770ab9b9660788..28aafe2b530668b03c766619a83873ee2a91087e 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
  *                           within inode_lock in __sync_single_inode)
+ *
+ * (the code doesn't rely on that order, so it could be switched around)
+ * ->tasklist_lock
+ *   anon_vma->lock      (memory_failure, collect_procs_anon)
+ *     pte map lock
  */
 
 #include <linux/mm.h>
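
A hypothetical sketch (not from this patch) of the nesting documented above, in the order the memory-failure path takes it:

	read_lock(&tasklist_lock);
	anon_vma = page_lock_anon_vma(page);		/* takes anon_vma->lock */
	if (anon_vma) {
		pte = pte_offset_map_lock(mm, pmd, address, &ptl);	/* pte map lock */
		/* ... inspect the mapping ... */
		pte_unmap_unlock(pte, ptl);
		page_unlock_anon_vma(anon_vma);
	}
	read_unlock(&tasklist_lock);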
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
        return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
  * if the page is not mapped into the page tables of this VMA.  Only
  * valid for normal file or anonymous VMAs.
  */
-static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
        unsigned long address;
        pte_t *pte;
@@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page)
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                               int migration)
+                               enum ttu_flags flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
-       if (!migration) {
+       if (!(flags & TTU_IGNORE_MLOCK)) {
                if (vma->vm_flags & VM_LOCKED) {
                        ret = SWAP_MLOCK;
                        goto out_unmap;
                }
+       }
+       if (!(flags & TTU_IGNORE_ACCESS)) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        ret = SWAP_FAIL;
                        goto out_unmap;
@@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
 
-       if (PageAnon(page)) {
+       if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+               if (PageAnon(page))
+                       dec_mm_counter(mm, anon_rss);
+               else
+                       dec_mm_counter(mm, file_rss);
+               set_pte_at(mm, address, pte,
+                               swp_entry_to_pte(make_hwpoison_entry(page)));
+       } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
 
                if (PageSwapCache(page)) {
@@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       BUG_ON(!migration);
+                       BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
                        entry = make_migration_entry(page, pte_write(pteval));
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
-       } else if (PAGE_MIGRATION && migration) {
+       } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
@@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned int mlocked = 0;
        int ret = SWAP_AGAIN;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
@@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
                                continue;  /* must visit all unlocked vmas */
                        ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                break;
                }
@@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
- * @unlock:  request for unlock rather than unmap [unlikely]
- * @migration:  unmapping for migration - ignored if @unlock
+ * @flags: action and flags
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
@@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int unlock, int migration)
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 {
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
        unsigned int mlocked = 0;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
 
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
@@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
                                continue;       /* must visit all vmas */
                        ret = SWAP_MLOCK;
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                goto out;
                }
@@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
                        ret = SWAP_MLOCK;       /* leave mlocked == 0 */
                        goto out;               /* no need to look further */
                }
-               if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+               if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+                       (vma->vm_flags & VM_LOCKED))
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
@@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                       if (!MLOCK_PAGES && !migration &&
+                       if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
                            (vma->vm_flags & VM_LOCKED))
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
@@ -1177,7 +1193,7 @@ out:
 /**
  * try_to_unmap - try to remove all page table mappings to a page
  * @page: the page to get unmapped
- * @migration: migration flag
+ * @flags: action and flags
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
@@ -1188,16 +1204,16 @@ out:
  * SWAP_FAIL   - the page is unswappable
  * SWAP_MLOCK  - page is mlocked.
  */
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
 {
        int ret;
 
        BUG_ON(!PageLocked(page));
 
        if (PageAnon(page))
-               ret = try_to_unmap_anon(page, 0, migration);
+               ret = try_to_unmap_anon(page, flags);
        else
-               ret = try_to_unmap_file(page, 0, migration);
+               ret = try_to_unmap_file(page, flags);
        if (ret != SWAP_MLOCK && !page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page)
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
 
        if (PageAnon(page))
-               return try_to_unmap_anon(page, 1, 0);
+               return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
-               return try_to_unmap_file(page, 1, 0);
+               return try_to_unmap_file(page, TTU_MUNLOCK);
 }
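
With the flag-based interface, callers name the action they want instead of passing bare ints. A hedged sketch of the reclaim-side call (it mirrors the shrink_page_list() hunk in mm/vmscan.c further down; the goto labels are that function's, shown here only for illustration):

	if (page_mapped(page) && mapping) {
		switch (try_to_unmap(page, TTU_UNMAP)) {
		case SWAP_FAIL:
			goto activate_locked;	/* unswappable */
		case SWAP_AGAIN:
			goto keep_locked;	/* transient failure, retry later */
		case SWAP_MLOCK:
			goto cull_mlocked;	/* mlocked vma */
		case SWAP_SUCCESS:
			; /* all ptes removed, fall through to pageout */
		}
	}

Migration passes TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS (see the mm/migrate.c hunk above), and try_to_unmap_one() above only installs a hwpoison swap entry when TTU_IGNORE_HWPOISON is not set.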
 
index b206a7a32e2a4e00bc7446ae839407f656031643..98631c26c20001931a6e4ca13032716992d6808c 100644 (file)
@@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
        if (pos + copied > inode->i_size)
                i_size_write(inode, pos + copied);
 
-       unlock_page(page);
        set_page_dirty(page);
+       unlock_page(page);
        page_cache_release(page);
 
        return copied;
@@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                        iput(inode);
                        return error;
                }
-               unlock_page(page);
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_symlink_inode_operations;
                kaddr = kmap_atomic(page, KM_USER0);
                memcpy(kaddr, symname, len);
                kunmap_atomic(kaddr, KM_USER0);
                set_page_dirty(page);
+               unlock_page(page);
                page_cache_release(page);
        }
        if (dir->i_mode & S_ISGID)
@@ -2420,6 +2420,7 @@ static const struct address_space_operations shmem_aops = {
        .write_end      = shmem_write_end,
 #endif
        .migratepage    = migrate_page,
+       .error_remove_page = generic_error_remove_page,
 };
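
Filesystems opt into hardware-poison page removal by wiring .error_remove_page, as shmem does above; generic_error_remove_page() (added in mm/truncate.c later in this diff) truncates the affected page out of a regular file's pagecache. A hedged sketch for a hypothetical filesystem:

static const struct address_space_operations examplefs_aops = {
	.readpage		= simple_readpage,
	.write_begin		= simple_write_begin,
	.write_end		= simple_write_end,
	.error_remove_page	= generic_error_remove_page,	/* let memory-failure drop the page */
};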
 
 static const struct file_operations shmem_file_operations = {
index f1bf19daadc67143b099518c1bc29aa52ae04227..4de7f02f820b03bfcf36b5fc8d6827b5eecd38cb 100644 (file)
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
        struct swap_info_struct *p;
        struct page *page = NULL;
 
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return 1;
 
        p = swap_info_get(entry);
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
        int count;
        bool has_cache;
 
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return -EINVAL;
 
        type = swp_type(entry);
index ccc3ecf7cb9839a90eddc0086be770796e5b8884..450cebdabfc0470b2bcb0bc3f2de941feb6453c2 100644 (file)
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
-static void
+static int
 truncate_complete_page(struct address_space *mapping, struct page *page)
 {
        if (page->mapping != mapping)
-               return;
+               return -EIO;
 
        if (page_has_private(page))
                do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
        remove_from_page_cache(page);
        ClearPageMappedToDisk(page);
        page_cache_release(page);       /* pagecache ref */
+       return 0;
 }
 
 /*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
        return ret;
 }
 
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+       if (page_mapped(page)) {
+               unmap_mapping_range(mapping,
+                                  (loff_t)page->index << PAGE_CACHE_SHIFT,
+                                  PAGE_CACHE_SIZE, 0);
+       }
+       return truncate_complete_page(mapping, page);
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+       if (!mapping)
+               return -EINVAL;
+       /*
+        * Only punch for normal data pages for now.
+        * Handling other types like directories would need more auditing.
+        */
+       if (!S_ISREG(mapping->host->i_mode))
+               return -EIO;
+       return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+       struct address_space *mapping = page_mapping(page);
+       if (!mapping)
+               return 0;
+       if (PageDirty(page) || PageWriteback(page))
+               return 0;
+       if (page_mapped(page))
+               return 0;
+       return invalidate_complete_page(mapping, page);
+}
+
 /**
  * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
  * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
                                unlock_page(page);
                                continue;
                        }
-                       if (page_mapped(page)) {
-                               unmap_mapping_range(mapping,
-                                 (loff_t)page_index<<PAGE_CACHE_SHIFT,
-                                 PAGE_CACHE_SIZE, 0);
-                       }
-                       truncate_complete_page(mapping, page);
+                       truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
                pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
                                break;
                        lock_page(page);
                        wait_on_page_writeback(page);
-                       if (page_mapped(page)) {
-                               unmap_mapping_range(mapping,
-                                 (loff_t)page->index<<PAGE_CACHE_SHIFT,
-                                 PAGE_CACHE_SIZE, 0);
-                       }
+                       truncate_inode_page(mapping, page);
                        if (page->index > next)
                                next = page->index;
                        next++;
-                       truncate_complete_page(mapping, page);
                        unlock_page(page);
                }
                pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
                        if (lock_failed)
                                continue;
 
-                       if (PageDirty(page) || PageWriteback(page))
-                               goto unlock;
-                       if (page_mapped(page))
-                               goto unlock;
-                       ret += invalidate_complete_page(mapping, page);
-unlock:
+                       ret += invalidate_inode_page(page);
+
                        unlock_page(page);
                        if (next > end)
                                break;
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
        return invalidate_inode_pages2_range(mapping, 0, -1);
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @old: old file offset
+ * @new: new file offset
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (e.g. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
+{
+       if (new < old) {
+               struct address_space *mapping = inode->i_mapping;
+
+               /*
+                * unmap_mapping_range is called twice, first simply for
+                * efficiency so that truncate_inode_pages does fewer
+                * single-page unmaps.  However after this first call, and
+                * before truncate_inode_pages finishes, it is possible for
+                * private pages to be COWed, which remain after
+                * truncate_inode_pages finishes, hence the second
+                * unmap_mapping_range call must be made for correctness.
+                */
+               unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+               truncate_inode_pages(mapping, new);
+               unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+       }
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * NOTE! We have to be ready to update the memory sharing
+ * between the file and the memory map for a potential last
+ * incomplete page.  Ugly, but necessary.
+ */
+int vmtruncate(struct inode *inode, loff_t offset)
+{
+       loff_t oldsize;
+       int error;
+
+       error = inode_newsize_ok(inode, offset);
+       if (error)
+               return error;
+       oldsize = inode->i_size;
+       i_size_write(inode, offset);
+       truncate_pagecache(inode, oldsize, offset);
+       if (inode->i_op->truncate)
+               inode->i_op->truncate(inode);
+
+       return error;
+}
+EXPORT_SYMBOL(vmtruncate);
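
For a filesystem's own truncate path, the intended usage follows the kerneldoc above: update i_size first, call truncate_pagecache(), and only then release the on-disk blocks. A hedged sketch (examplefs_setsize() and examplefs_free_blocks() are hypothetical):

static int examplefs_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;
	int error;

	error = inode_newsize_ok(inode, newsize);
	if (error)
		return error;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);
	/* Pagecache is made coherent with i_size before any blocks go away. */
	truncate_pagecache(inode, oldsize, newsize);
	examplefs_free_blocks(inode, newsize);	/* hypothetical block freeing */
	return 0;
}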
index 613e89f471d92c2710be84e9e0fdcf17b0976d0b..1219ceb8a9b2d992da20bb9a10942e7cef2d98b1 100644 (file)
@@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, 0)) {
+                       switch (try_to_unmap(page, TTU_UNMAP)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+                                               gfp_t gfp_mask, bool noswap,
+                                               unsigned int swappiness,
+                                               struct zone *zone, int nid)
+{
+       struct scan_control sc = {
+               .may_writepage = !laptop_mode,
+               .may_unmap = 1,
+               .may_swap = !noswap,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swappiness = swappiness,
+               .order = 0,
+               .mem_cgroup = mem,
+               .isolate_pages = mem_cgroup_isolate_pages,
+       };
+       nodemask_t nm  = nodemask_of_node(nid);
+
+       sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+                       (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+       sc.nodemask = &nm;
+       sc.nr_reclaimed = 0;
+       sc.nr_scanned = 0;
+       /*
+        * NOTE: Although we can get the priority field, using it
+        * here is not a good idea, since it limits the pages we can scan.
+        * If we don't reclaim here, the shrink_zone call from balance_pgdat
+        * will pick up pages from other mem cgroups as well. We hack
+        * the priority and make it zero.
+        */
+       shrink_zone(0, zone, &sc);
+       return sc.nr_reclaimed;
+}
+
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                           gfp_t gfp_mask,
                                           bool noswap,
                                           unsigned int swappiness)
 {
+       struct zonelist *zonelist;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .isolate_pages = mem_cgroup_isolate_pages,
                .nodemask = NULL, /* we don't care the placement */
        };
-       struct zonelist *zonelist;
 
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1974,6 +2007,7 @@ loop_again:
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;
+                       int nid, zid;
 
                        if (!populated_zone(zone))
                                continue;
@@ -1988,6 +2022,15 @@ loop_again:
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
+
+                       nid = pgdat->node_id;
+                       zid = zone_idx(zone);
+                       /*
+                        * Call soft limit reclaim before calling shrink_zone.
+                        * For now we ignore the return value
+                        */
+                       mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+                                                       nid, zid);
                        /*
                         * We put equal pressure on every zone, unless one
                         * zone has way too many pages free already.
@@ -2801,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
 unsigned long scan_unevictable_pages;
 
 int scan_unevictable_handler(struct ctl_table *table, int write,
-                          struct file *file, void __user *buffer,
+                          void __user *buffer,
                           size_t *length, loff_t *ppos)
 {
-       proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+       proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
        if (write && *(unsigned long *)table->data)
                scan_all_zones_unevictable_pages();
index 907a82e9023d1d4cb09a82c677afad229657ca34..a16a2342f6bf2ee0b6a30329736c91acaff19bb9 100644 (file)
@@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 
 #ifdef CONFIG_SYSCTL
 static
-int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp,
+int brnf_sysctl_call_tables(ctl_table * ctl, int write,
                            void __user * buffer, size_t * lenp, loff_t * ppos)
 {
        int ret;
 
-       ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write && *(int *)(ctl->data))
                *(int *)(ctl->data) = 1;
index 1c6a5bb6f0c8da8cba2e31021ad0f1171f0aeecf..6e1f085db06af33a1f069d22816d9b463f413939 100644 (file)
@@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU
 static int min_priority[1];
 static int max_priority[] = { 127 }; /* From DECnet spec */
 
-static int dn_forwarding_proc(ctl_table *, int, struct file *,
+static int dn_forwarding_proc(ctl_table *, int,
                        void __user *, size_t *, loff_t *);
 static int dn_forwarding_sysctl(ctl_table *table,
                        void __user *oldval, size_t __user *oldlenp,
@@ -274,7 +274,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
 }
 
 static int dn_forwarding_proc(ctl_table *table, int write,
-                               struct file *filep,
                                void __user *buffer,
                                size_t *lenp, loff_t *ppos)
 {
@@ -290,7 +289,7 @@ static int dn_forwarding_proc(ctl_table *table, int write,
        dn_db = dev->dn_ptr;
        old = dn_db->parms.forwarding;
 
-       err = proc_dointvec(table, write, filep, buffer, lenp, ppos);
+       err = proc_dointvec(table, write, buffer, lenp, ppos);
 
        if ((err >= 0) && write) {
                if (dn_db->parms.forwarding < 0)
index 5bcd592ae6dd0f3883d25e4543e0085ca81133ea..26b0ab1e9f560b75d046c687fabf27dfed1b4f48 100644 (file)
@@ -165,7 +165,6 @@ static int dn_node_address_strategy(ctl_table *table,
 }
 
 static int dn_node_address_handler(ctl_table *table, int write,
-                               struct file *filp,
                                void __user *buffer,
                                size_t *lenp, loff_t *ppos)
 {
@@ -276,7 +275,6 @@ static int dn_def_dev_strategy(ctl_table *table,
 
 
 static int dn_def_dev_handler(ctl_table *table, int write,
-                               struct file * filp,
                                void __user *buffer,
                                size_t *lenp, loff_t *ppos)
 {
index 07336c6201f0dede9e93c75f811b64af7d8cc853..e92f1fd28aa5161bc28969ba25b3d9d0bf95418e 100644 (file)
@@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net)
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
-                            struct file *filp, void __user *buffer,
+                            void __user *buffer,
                             size_t *lenp, loff_t *ppos)
 {
-       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write) {
                struct ipv4_devconf *cnf = ctl->extra1;
@@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table *table,
 }
 
 static int devinet_sysctl_forward(ctl_table *ctl, int write,
-                                 struct file *filp, void __user *buffer,
+                                 void __user *buffer,
                                  size_t *lenp, loff_t *ppos)
 {
        int *valp = ctl->data;
        int val = *valp;
-       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write && *valp != val) {
                struct net *net = ctl->extra2;
@@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 }
 
 int ipv4_doint_and_flush(ctl_table *ctl, int write,
-                        struct file *filp, void __user *buffer,
+                        void __user *buffer,
                         size_t *lenp, loff_t *ppos)
 {
        int *valp = ctl->data;
        int val = *valp;
-       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        struct net *net = ctl->extra2;
 
        if (write && *valp != val)
index df9347314538bd353431afde7a4ca49ee48b5e2a..bb41992520268b08c49f345e2170aeb644c56b7d 100644 (file)
@@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 
 #ifdef CONFIG_SYSCTL
 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
-                                       struct file *filp, void __user *buffer,
+                                       void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
 {
        if (write) {
@@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
                memcpy(&ctl, __ctl, sizeof(ctl));
                ctl.data = &flush_delay;
-               proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+               proc_dointvec(&ctl, write, buffer, lenp, ppos);
 
                net = (struct net *)__ctl->extra1;
                rt_cache_flush(net, flush_delay);
@@ -3106,12 +3106,11 @@ static void rt_secret_reschedule(int old)
 }
 
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-                                         struct file *filp,
                                          void __user *buffer, size_t *lenp,
                                          loff_t *ppos)
 {
        int old = ip_rt_secret_interval;
-       int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
 
        rt_secret_reschedule(old);
 
index 4710d219f06ae9df6c2754906ca09663ed4da738..2dcf04d9b005cda549d2d4ee58cd98697163d1bb 100644 (file)
@@ -36,7 +36,7 @@ static void set_local_port_range(int range[2])
 }
 
 /* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
+static int ipv4_local_port_range(ctl_table *table, int write,
                                 void __user *buffer,
                                 size_t *lenp, loff_t *ppos)
 {
@@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
        };
 
        inet_get_local_port_range(range, range + 1);
-       ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
        if (write && ret == 0) {
                if (range[1] < range[0])
@@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table,
 }
 
 
-static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+static int proc_tcp_congestion_control(ctl_table *ctl, int write,
                                       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        char val[TCP_CA_NAME_MAX];
@@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 
        tcp_get_default_congestion_control(val);
 
-       ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        if (write && ret == 0)
                ret = tcp_set_default_congestion_control(val);
        return ret;
@@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table,
 }
 
 static int proc_tcp_available_congestion_control(ctl_table *ctl,
-                                                int write, struct file * filp,
+                                                int write,
                                                 void __user *buffer, size_t *lenp,
                                                 loff_t *ppos)
 {
@@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
        if (!tbl.data)
                return -ENOMEM;
        tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
-       ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        kfree(tbl.data);
        return ret;
 }
 
 static int proc_allowed_congestion_control(ctl_table *ctl,
-                                          int write, struct file * filp,
+                                          int write,
                                           void __user *buffer, size_t *lenp,
                                           loff_t *ppos)
 {
@@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
                return -ENOMEM;
 
        tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
-       ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        if (write && ret == 0)
                ret = tcp_set_allowed_congestion_control(tbl.data);
        kfree(tbl.data);
index 55f486d89c88eeb1d3e1409cd115cf996c9a8b07..1fd0a3d775d26767dec15c78f599dc96b1c3c08d 100644 (file)
@@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 #ifdef CONFIG_SYSCTL
 
 static
-int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_forward(ctl_table *ctl, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int *valp = ctl->data;
        int val = *valp;
        int ret;
 
-       ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write)
                ret = addrconf_fixup_forwarding(ctl, valp, val);
@@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
 }
 
 static
-int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp,
+int addrconf_sysctl_disable(ctl_table *ctl, int write,
                            void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int *valp = ctl->data;
        int val = *valp;
        int ret;
 
-       ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        if (write)
                ret = addrconf_disable_ipv6(ctl, valp, val);
index 7015478797f667be60b94c762302ce24593d1fef..498b9b0b0fade607c0ece1b1ce3edaacc03ef968 100644 (file)
@@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
        }
 }
 
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct net_device *dev = ctl->extra1;
        struct inet6_dev *idev;
@@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f
                ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
 
        if (strcmp(ctl->procname, "retrans_time") == 0)
-               ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+               ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
        else if (strcmp(ctl->procname, "base_reachable_time") == 0)
                ret = proc_dointvec_jiffies(ctl, write,
-                                           filp, buffer, lenp, ppos);
+                                           buffer, lenp, ppos);
 
        else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
                 (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
                ret = proc_dointvec_ms_jiffies(ctl, write,
-                                              filp, buffer, lenp, ppos);
+                                              buffer, lenp, ppos);
        else
                ret = -1;
 
index 77aecbe8ff6caf261c04700d6bea889bb2221a7e..d6fe7646a8ff7d8599c3565e6e6c9deb68732cef 100644 (file)
@@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = {
 #ifdef CONFIG_SYSCTL
 
 static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
+int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
                              void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        struct net *net = current->nsproxy->net_ns;
        int delay = net->ipv6.sysctl.flush_delay;
        if (write) {
-               proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+               proc_dointvec(ctl, write, buffer, lenp, ppos);
                fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
                return 0;
        } else
index 57f8817c3979029a38a680d1ea65d948766d2ef9..5c86567e5a78c3d84c511f038dd18a9834f9efe2 100644 (file)
@@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100;    /* 100us */
 /* For other sysctl, I've no idea of the range. Maybe Dag could help
  * us on that - Jean II */
 
-static int do_devname(ctl_table *table, int write, struct file *filp,
+static int do_devname(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_dostring(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dostring(table, write, buffer, lenp, ppos);
        if (ret == 0 && write) {
                struct ias_value *val;
 
@@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, struct file *filp,
 }
 
 
-static int do_discovery(ctl_table *table, int write, struct file *filp,
+static int do_discovery(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
        if (ret)
               return ret;
 
index fba2892b99e10157a90bcf73969a61647c390605..446e9bd4b4bc2b90aa850f8734ee64ced9f03fe1 100644 (file)
@@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void)
 
 
 static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+proc_do_defense_mode(ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int *valp = table->data;
        int val = *valp;
        int rc;
 
-       rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       rc = proc_dointvec(table, write, buffer, lenp, ppos);
        if (write && (*valp != val)) {
                if ((*valp < 0) || (*valp > 3)) {
                        /* Restore the correct value */
@@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 
 
 static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+proc_do_sync_threshold(ctl_table *table, int write,
                       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int *valp = table->data;
@@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
        /* backup the value first */
        memcpy(val, valp, sizeof(val));
 
-       rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       rc = proc_dointvec(table, write, buffer, lenp, ppos);
        if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
                /* Restore the correct value */
                memcpy(valp, val, sizeof(val));
index 4e620305f28c765548c5c5b35b1a5b833034bf30..c93494fef8ef3cfdc8eb22352c59554d0c751eb2 100644 (file)
@@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
 static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
 static struct ctl_table_header *nf_log_dir_header;
 
-static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
+static int nf_log_proc_dostring(ctl_table *table, int write,
                         void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        const struct nf_logger *logger;
@@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp,
                        table->data = "NONE";
                else
                        table->data = logger->name;
-               r = proc_dostring(table, write, filp, buffer, lenp, ppos);
+               r = proc_dostring(table, write, buffer, lenp, ppos);
                mutex_unlock(&nf_log_mutex);
        }
 
index 7b5749ee2765c221532e9c3624f66d27e0843db5..2220f33223267287b0801830465641331d7602a4 100644 (file)
@@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max)
        } while (read_seqretry(&local_port_range_lock, seq));
 }
 
-static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
+static int proc_local_port_range(ctl_table *table, int write,
                                void __user *buffer,
                                size_t *lenp, loff_t *ppos)
 {
@@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, struct file *filp,
                .extra2 = &local_port_range_max,
        };
 
-       ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
        if (write && ret == 0) {
                if (range[1] < range[0])
index 5231f7aaac0e483ebb8b457c3c6e2218464da409..42f9748ae0939f8f2f22f8c8cb93505d378bf4e3 100644 (file)
@@ -56,7 +56,7 @@ rpc_unregister_sysctl(void)
        }
 }
 
-static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+static int proc_do_xprt(ctl_table *table, int write,
                        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        char tmpbuf[256];
@@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file,
 }
 
 static int
-proc_dodebug(ctl_table *table, int write, struct file *file,
+proc_dodebug(ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        char            tmpbuf[20], c, *s;
index 87101177825b83073ed153bc51367e64132b0a16..35fb68b9c8ec96ca2080530e014135d643aaabac 100644 (file)
@@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep;
  * current value.
  */
 static int read_reset_stat(ctl_table *table, int write,
-                          struct file *filp, void __user *buffer, size_t *lenp,
+                          void __user *buffer, size_t *lenp,
                           loff_t *ppos)
 {
        atomic_t *stat = (atomic_t *)table->data;
index b8186bac8b7eb08088b40914137e4f51b8e403fd..6cf8fd2b79e80df26e142aa94e6fed9d4c3e7015 100644 (file)
@@ -61,7 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-               struct cgroup *new_cgroup, struct task_struct *task)
+               struct cgroup *new_cgroup, struct task_struct *task,
+               bool threadgroup)
 {
        if (current != task && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
index 500aad0ebd6acd8af2e63a7cbc8ae0965bbc5cdc..3bb90b6f1dd3db9d697e02209273008eb7a73cc1 100644 (file)
@@ -187,7 +187,7 @@ static inline void print_ipv6_addr(struct audit_buffer *ab,
                                   char *name1, char *name2)
 {
        if (!ipv6_addr_any(addr))
-               audit_log_format(ab, " %s=%pI6", name1, addr);
+               audit_log_format(ab, " %s=%pI6c", name1, addr);
        if (port)
                audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
index 14cc7b3b8d0379ef0cd17e071aa2821ea6dd4a95..c844eed7915d0d270c058c16d6b3db40ffa576d0 100644 (file)
@@ -28,12 +28,12 @@ static void update_mmap_min_addr(void)
  * sysctl handler which just sets dac_mmap_min_addr = the new value and then
  * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly
  */
-int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp,
+int mmap_min_addr_handler(struct ctl_table *table, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
-       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 
        update_mmap_min_addr();
 
index 417f7c9945229175f79842c5a9637af6fd48fc1d..bb230d5d7085a9612f915edbb50124a0077db4be 100644 (file)
@@ -2411,7 +2411,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
        read_lock(&tasklist_lock);
-       wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+       __wake_up_parent(current, current->real_parent);
        read_unlock(&tasklist_lock);
 }