Merge commit 'linus/master' into merge-linus
author Arjan van de Ven <arjan@linux.intel.com>
Fri, 17 Oct 2008 16:20:26 +0000 (09:20 -0700)
committer Arjan van de Ven <arjan@linux.intel.com>
Fri, 17 Oct 2008 16:20:26 +0000 (09:20 -0700)
Conflicts:

arch/x86/kvm/i8254.c

15 files changed:
arch/ia64/kvm/kvm-ia64.c
arch/x86/kvm/i8254.c
arch/x86/kvm/lapic.c
drivers/s390/crypto/ap_bus.c
fs/compat.c
include/linux/hrtimer.h
include/linux/sched.h
include/linux/time.h
kernel/fork.c
kernel/hrtimer.c
kernel/posix-timers.c
kernel/sched.c
kernel/sys.c
kernel/time/ntp.c
kernel/time/tick-sched.c

diff --combined arch/ia64/kvm/kvm-ia64.c
index cf8eae1855e627bee0bf5d3a2ee123c318fdf26e,c0699f0e35a926936113e42d925b4fd08318c1c3..a312c9e9b9efa2ecd194aa7f88a98a233749dbf6
@@@ -31,6 -31,7 +31,7 @@@
  #include <linux/bitops.h>
  #include <linux/hrtimer.h>
  #include <linux/uaccess.h>
+ #include <linux/intel-iommu.h>
  
  #include <asm/pgtable.h>
  #include <asm/gcc_intrin.h>
  #include <asm/cacheflush.h>
  #include <asm/div64.h>
  #include <asm/tlb.h>
+ #include <asm/elf.h>
  
  #include "misc.h"
  #include "vti.h"
  #include "iodev.h"
  #include "ioapic.h"
  #include "lapic.h"
+ #include "irq.h"
  
  static unsigned long kvm_vmm_base;
  static unsigned long kvm_vsa_base;
@@@ -61,12 -64,6 +64,6 @@@ struct kvm_stats_debugfs_item debugfs_e
        { NULL }
  };
  
- struct fdesc{
-     unsigned long ip;
-     unsigned long gp;
- };
  static void kvm_flush_icache(unsigned long start, unsigned long len)
  {
        int l;
@@@ -184,12 -181,16 +181,16 @@@ int kvm_dev_ioctl_check_extension(long 
        switch (ext) {
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_MP_STATE:
  
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
+       case KVM_CAP_IOMMU:
+               r = intel_iommu_found();
+               break;
        default:
                r = 0;
        }
@@@ -776,6 -777,7 +777,7 @@@ static void kvm_init_vm(struct kvm *kvm
         */
        kvm_build_io_pmt(kvm);
  
+       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
  }
  
  struct  kvm *kvm_arch_create_vm(void)
@@@ -1112,7 -1114,7 +1114,7 @@@ static void kvm_migrate_hlt_timer(struc
        struct hrtimer *p_ht = &vcpu->arch.hlt_timer;
  
        if (hrtimer_cancel(p_ht))
 -              hrtimer_start(p_ht, p_ht->expires, HRTIMER_MODE_ABS);
 +              hrtimer_start_expires(p_ht, HRTIMER_MODE_ABS);
  }
  
  static enum hrtimer_restart hlt_timer_fn(struct hrtimer *data)
@@@ -1339,6 -1341,10 +1341,10 @@@ static void kvm_release_vm_pages(struc
  
  void kvm_arch_destroy_vm(struct kvm *kvm)
  {
+       kvm_iommu_unmap_guest(kvm);
+ #ifdef  KVM_CAP_DEVICE_ASSIGNMENT
+       kvm_free_all_assigned_devices(kvm);
+ #endif
        kfree(kvm->arch.vioapic);
        kvm_release_vm_pages(kvm);
        kvm_free_physmem(kvm);
@@@ -1440,17 -1446,24 +1446,24 @@@ int kvm_arch_set_memory_region(struct k
                int user_alloc)
  {
        unsigned long i;
-       struct page *page;
+       unsigned long pfn;
        int npages = mem->memory_size >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
        unsigned long base_gfn = memslot->base_gfn;
  
        for (i = 0; i < npages; i++) {
-               page = gfn_to_page(kvm, base_gfn + i);
-               kvm_set_pmt_entry(kvm, base_gfn + i,
-                               page_to_pfn(page) << PAGE_SHIFT,
-                               _PAGE_AR_RWX|_PAGE_MA_WB);
-               memslot->rmap[i] = (unsigned long)page;
+               pfn = gfn_to_pfn(kvm, base_gfn + i);
+               if (!kvm_is_mmio_pfn(pfn)) {
+                       kvm_set_pmt_entry(kvm, base_gfn + i,
+                                       pfn << PAGE_SHIFT,
+                               _PAGE_AR_RWX | _PAGE_MA_WB);
+                       memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+               } else {
+                       kvm_set_pmt_entry(kvm, base_gfn + i,
+                                       GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
+                                       _PAGE_MA_UC);
+                       memslot->rmap[i] = 0;
+                       }
        }
  
        return 0;
@@@ -1794,11 -1807,43 +1807,43 @@@ int kvm_arch_vcpu_runnable(struct kvm_v
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       return -EINVAL;
+       vcpu_load(vcpu);
+       mp_state->mp_state = vcpu->arch.mp_state;
+       vcpu_put(vcpu);
+       return 0;
+ }
+ static int vcpu_reset(struct kvm_vcpu *vcpu)
+ {
+       int r;
+       long psr;
+       local_irq_save(psr);
+       r = kvm_insert_vmm_mapping(vcpu);
+       if (r)
+               goto fail;
+       vcpu->arch.launched = 0;
+       kvm_arch_vcpu_uninit(vcpu);
+       r = kvm_arch_vcpu_init(vcpu);
+       if (r)
+               goto fail;
+       kvm_purge_vmm_mapping(vcpu);
+       r = 0;
+ fail:
+       local_irq_restore(psr);
+       return r;
  }
  
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       return -EINVAL;
+       int r = 0;
+       vcpu_load(vcpu);
+       vcpu->arch.mp_state = mp_state->mp_state;
+       if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+               r = vcpu_reset(vcpu);
+       vcpu_put(vcpu);
+       return r;
  }
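
The ia64 hunks above advertise KVM_CAP_MP_STATE and fill in the get/set mp_state vcpu ioctls that used to return -EINVAL. As a rough illustration (not part of the merge), a user-space caller would drive the new ioctls like the sketch below; it assumes an already created vcpu file descriptor and uses only names from <linux/kvm.h>.

/* Sketch only: query a vcpu's mp_state and force the new vcpu_reset() path. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int query_and_reset_mp_state(int vcpu_fd)
{
        struct kvm_mp_state mp;

        if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp) < 0) {
                perror("KVM_GET_MP_STATE");
                return -1;
        }
        printf("vcpu mp_state = %u\n", mp.mp_state);

        /* Writing UNINITIALIZED back takes the vcpu_reset() path added above. */
        mp.mp_state = KVM_MP_STATE_UNINITIALIZED;
        return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}
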
diff --combined arch/x86/kvm/i8254.c
index 1bf8f57a30411d69afc00438456555f9310bd635,634132a9a512391d324def8826390a709d257c80..11c6725fb798b6967d60f07ea920fc594d292994
@@@ -200,13 -200,14 +200,14 @@@ static int __pit_timer_fn(struct kvm_kp
  
        if (!atomic_inc_and_test(&pt->pending))
                set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-       if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-               vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       if (vcpu0 && waitqueue_active(&vcpu0->wq))
                wake_up_interruptible(&vcpu0->wq);
-       }
  
 -      pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 -      pt->scheduled = ktime_to_ns(pt->timer.expires);
 +      hrtimer_add_expires_ns(&pt->timer, pt->period);
-       pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer));
++      pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
+       if (pt->period)
 -              ps->channels[0].count_load_time = pt->timer.expires;
++              ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
  
        return (pt->period == 0 ? 0 : 1);
  }
@@@ -215,12 -216,22 +216,22 @@@ int pit_has_pending_timer(struct kvm_vc
  {
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
  
-       if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+       if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
                return atomic_read(&pit->pit_state.pit_timer.pending);
        return 0;
  }
  
+ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+ {
+       struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+                                                irq_ack_notifier);
+       spin_lock(&ps->inject_lock);
+       if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+               atomic_inc(&ps->pit_timer.pending);
+       ps->irq_ack = 1;
+       spin_unlock(&ps->inject_lock);
+ }
  static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
  {
        struct kvm_kpit_state *ps;
@@@ -246,7 -257,7 +257,7 @@@ void __kvm_migrate_pit_timer(struct kvm
  
        timer = &pit->pit_state.pit_timer.timer;
        if (hrtimer_cancel(timer))
 -              hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 +              hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  }
  
  static void destroy_pit_timer(struct kvm_kpit_timer *pt)
        hrtimer_cancel(&pt->timer);
  }
  
- static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
  {
+       struct kvm_kpit_timer *pt = &ps->pit_timer;
        s64 interval;
  
        interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
        pt->period = (is_period == 0) ? 0 : interval;
        pt->timer.function = pit_timer_fn;
        atomic_set(&pt->pending, 0);
+       ps->irq_ack = 1;
  
        hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
                      HRTIMER_MODE_ABS);
@@@ -302,11 -315,11 +315,11 @@@ static void pit_load_count(struct kvm *
        case 1:
          /* FIXME: enhance mode 4 precision */
        case 4:
-               create_pit_timer(&ps->pit_timer, val, 0);
+               create_pit_timer(ps, val, 0);
                break;
        case 2:
        case 3:
-               create_pit_timer(&ps->pit_timer, val, 1);
+               create_pit_timer(ps, val, 1);
                break;
        default:
                destroy_pit_timer(&ps->pit_timer);
@@@ -520,7 -533,7 +533,7 @@@ void kvm_pit_reset(struct kvm_pit *pit
        mutex_unlock(&pit->pit_state.lock);
  
        atomic_set(&pit->pit_state.pit_timer.pending, 0);
-       pit->pit_state.inject_pending = 1;
+       pit->pit_state.irq_ack = 1;
  }
  
  struct kvm_pit *kvm_create_pit(struct kvm *kvm)
  
        mutex_init(&pit->pit_state.lock);
        mutex_lock(&pit->pit_state.lock);
+       spin_lock_init(&pit->pit_state.inject_lock);
  
        /* Initialize PIO device */
        pit->dev.read = pit_ioport_read;
        pit_state->pit = pit;
        hrtimer_init(&pit_state->pit_timer.timer,
                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       pit_state->irq_ack_notifier.gsi = 0;
+       pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+       kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
        mutex_unlock(&pit->pit_state.lock);
  
        kvm_pit_reset(pit);
@@@ -578,10 -595,8 +595,8 @@@ void kvm_free_pit(struct kvm *kvm
  static void __inject_pit_timer_intr(struct kvm *kvm)
  {
        mutex_lock(&kvm->lock);
-       kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-       kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-       kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-       kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+       kvm_set_irq(kvm, 0, 1);
+       kvm_set_irq(kvm, 0, 0);
        mutex_unlock(&kvm->lock);
  }
  
@@@ -592,37 -607,19 +607,19 @@@ void kvm_inject_pit_timer_irqs(struct k
        struct kvm_kpit_state *ps;
  
        if (vcpu && pit) {
+               int inject = 0;
                ps = &pit->pit_state;
  
-               /* Try to inject pending interrupts when:
-                * 1. Pending exists
-                * 2. Last interrupt was accepted or waited for too long time*/
-               if (atomic_read(&ps->pit_timer.pending) &&
-                   (ps->inject_pending ||
-                   (jiffies - ps->last_injected_time
-                               >= KVM_MAX_PIT_INTR_INTERVAL))) {
-                       ps->inject_pending = 0;
-                       __inject_pit_timer_intr(kvm);
-                       ps->last_injected_time = jiffies;
-               }
-       }
- }
- void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
- {
-       struct kvm_arch *arch = &vcpu->kvm->arch;
-       struct kvm_kpit_state *ps;
-       if (vcpu && arch->vpit) {
-               ps = &arch->vpit->pit_state;
-               if (atomic_read(&ps->pit_timer.pending) &&
-               (((arch->vpic->pics[0].imr & 1) == 0 &&
-                 arch->vpic->pics[0].irq_base == vec) ||
-                 (arch->vioapic->redirtbl[0].fields.vector == vec &&
-                 arch->vioapic->redirtbl[0].fields.mask != 1))) {
-                       ps->inject_pending = 1;
-                       atomic_dec(&ps->pit_timer.pending);
-                       ps->channels[0].count_load_time = ktime_get();
+               /* Try to inject pending interrupts when
+                * last one has been acked.
+                */
+               spin_lock(&ps->inject_lock);
+               if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+                       ps->irq_ack = 0;
+                       inject = 1;
                }
+               spin_unlock(&ps->inject_lock);
+               if (inject)
+                       __inject_pit_timer_intr(kvm);
        }
  }
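
The i8254 rework above replaces the inject_pending/last_injected_time heuristic with an explicitly ack-gated scheme: the PIT registers an irq ack notifier, kvm_pit_ack_irq() consumes a pending tick and re-arms injection, and kvm_inject_pit_timer_irqs() injects only while irq_ack is set. The same pattern, reduced to stand-alone C11 atomics purely for illustration (these names are made up; this is not kernel code and ignores the locking the real code needs):

/* Sketch only: ack-gated interrupt injection, simplified. */
#include <stdatomic.h>
#include <stdbool.h>

struct pit_like_state {
        atomic_int  pending;    /* timer ticks not yet delivered to the guest */
        atomic_bool irq_ack;    /* last injected interrupt has been acked */
};

static void timer_tick(struct pit_like_state *s)
{
        atomic_fetch_add(&s->pending, 1);       /* record work, never inject here */
}

static void guest_ack(struct pit_like_state *s)
{
        if (atomic_load(&s->pending) > 0)       /* consume one delivered tick */
                atomic_fetch_sub(&s->pending, 1);
        atomic_store(&s->irq_ack, true);        /* re-arm injection */
}

static bool try_inject(struct pit_like_state *s)
{
        /* Inject at most one interrupt, and only after the previous was acked. */
        return atomic_load(&s->pending) > 0 &&
               atomic_exchange(&s->irq_ack, false);
}
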
diff --combined arch/x86/kvm/lapic.c
index a5b61de6adf1c7cb4e8b43cef1fe0dfaca0b4e20,6571926bfd339b498c2ca06835b71d8ead494787..0fc3cab48943da8a3c513c7c753a4792c6743198
@@@ -32,6 -32,7 +32,7 @@@
  #include <asm/current.h>
  #include <asm/apicdef.h>
  #include <asm/atomic.h>
+ #include "kvm_cache_regs.h"
  #include "irq.h"
  
  #define PRId64 "d"
@@@ -338,13 -339,7 +339,7 @@@ static int __apic_accept_irq(struct kvm
                } else
                        apic_clear_vector(vector, apic->regs + APIC_TMR);
  
-               if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-                       kvm_vcpu_kick(vcpu);
-               else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-                       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
-               }
+               kvm_vcpu_kick(vcpu);
  
                result = (orig_irr == 0);
                break;
                        vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                        kvm_vcpu_kick(vcpu);
                } else {
-                       printk(KERN_DEBUG
-                              "Ignoring de-assert INIT to vcpu %d\n",
-                              vcpu->vcpu_id);
+                       apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+                                  vcpu->vcpu_id);
                }
                break;
  
        case APIC_DM_STARTUP:
-               printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-                      vcpu->vcpu_id, vector);
+               apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+                          vcpu->vcpu_id, vector);
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        vcpu->arch.sipi_vector = vector;
                        vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
+                       kvm_vcpu_kick(vcpu);
                }
                break;
  
@@@ -438,7 -430,7 +430,7 @@@ struct kvm_vcpu *kvm_get_lowest_prio_vc
  static void apic_set_eoi(struct kvm_lapic *apic)
  {
        int vector = apic_find_highest_isr(apic);
+       int trigger_mode;
        /*
         * Not every write EOI will has corresponding ISR,
         * one example is when Kernel check timer on setup_IO_APIC
        apic_update_ppr(apic);
  
        if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+               trigger_mode = IOAPIC_LEVEL_TRIG;
+       else
+               trigger_mode = IOAPIC_EDGE_TRIG;
+       kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
  }
  
  static void apic_send_ipi(struct kvm_lapic *apic)
@@@ -558,8 -553,7 +553,7 @@@ static void __report_tpr_access(struct 
        struct kvm_run *run = vcpu->run;
  
        set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
-       kvm_x86_ops->cache_regs(vcpu);
-       run->tpr_access.rip = vcpu->arch.rip;
+       run->tpr_access.rip = kvm_rip_read(vcpu);
        run->tpr_access.is_write = write;
  }
  
@@@ -683,9 -677,9 +677,9 @@@ static void apic_mmio_write(struct kvm_
         * Refer SDM 8.4.1
         */
        if (len != 4 || alignment) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "apic write: bad size=%d %lx\n",
-                              len, (long)address);
+               /* Don't shout loud, $infamous_os would cause only noise. */
+               apic_debug("apic write: bad size=%d %lx\n",
+                          len, (long)address);
                return;
        }
  
@@@ -947,13 -941,14 +941,12 @@@ static int __apic_timer_fn(struct kvm_l
  
        if(!atomic_inc_and_test(&apic->timer.pending))
                set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-       if (waitqueue_active(q)) {
-               apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       if (waitqueue_active(q))
                wake_up_interruptible(q);
-       }
        if (apic_lvtt_period(apic)) {
                result = 1;
 -              apic->timer.dev.expires = ktime_add_ns(
 -                                      apic->timer.dev.expires,
 -                                      apic->timer.period);
 +              hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
        }
        return result;
  }
@@@ -1122,7 -1117,7 +1115,7 @@@ void __kvm_migrate_apic_timer(struct kv
  
        timer = &apic->timer.dev;
        if (hrtimer_cancel(timer))
 -              hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 +              hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  }
  
  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
diff --combined drivers/s390/crypto/ap_bus.c
index 6f02f1e674d4ca9438315d07e4fb110b329c965b,326db1e827c4dbe41f70419a9af53b1a997a7e7a..e3fe6838293ad4f238cd9bb81df92a59ec61d72b
@@@ -659,9 -659,9 +659,9 @@@ static ssize_t poll_timeout_store(struc
        hr_time = ktime_set(0, poll_timeout);
  
        if (!hrtimer_is_queued(&ap_poll_timer) ||
 -          !hrtimer_forward(&ap_poll_timer, ap_poll_timer.expires, hr_time)) {
 -              ap_poll_timer.expires = hr_time;
 -              hrtimer_start(&ap_poll_timer, hr_time, HRTIMER_MODE_ABS);
 +          !hrtimer_forward(&ap_poll_timer, hrtimer_get_expires(&ap_poll_timer), hr_time)) {
 +              hrtimer_set_expires(&ap_poll_timer, hr_time);
 +              hrtimer_start_expires(&ap_poll_timer, HRTIMER_MODE_ABS);
        }
        return count;
  }
@@@ -892,8 -892,8 +892,8 @@@ static void ap_scan_bus(struct work_str
  
                ap_dev->device.bus = &ap_bus_type;
                ap_dev->device.parent = ap_root_device;
-               snprintf(ap_dev->device.bus_id, BUS_ID_SIZE, "card%02x",
-                        AP_QID_DEVICE(ap_dev->qid));
+               dev_set_name(&ap_dev->device, "card%02x",
+                            AP_QID_DEVICE(ap_dev->qid));
                ap_dev->device.release = ap_device_release;
                rc = device_register(&ap_dev->device);
                if (rc) {
diff --combined fs/compat.c
index 133ed7f5d681ef111922953fb5a1b2ca99cb5bba,5f9ec449c799854e19a9190b489ccf4a23fc0b03..3b58c32be526301998fe7d0698bbdf98ac84d3a2
@@@ -137,6 -137,45 +137,45 @@@ asmlinkage long compat_sys_utimes(char 
        return compat_sys_futimesat(AT_FDCWD, filename, t);
  }
  
+ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+ {
+       compat_ino_t ino = stat->ino;
+       typeof(ubuf->st_uid) uid = 0;
+       typeof(ubuf->st_gid) gid = 0;
+       int err;
+       SET_UID(uid, stat->uid);
+       SET_GID(gid, stat->gid);
+       if ((u64) stat->size > MAX_NON_LFS ||
+           !old_valid_dev(stat->dev) ||
+           !old_valid_dev(stat->rdev))
+               return -EOVERFLOW;
+       if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+               return -EOVERFLOW;
+       if (clear_user(ubuf, sizeof(*ubuf)))
+               return -EFAULT;
+       err  = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
+       err |= __put_user(ino, &ubuf->st_ino);
+       err |= __put_user(stat->mode, &ubuf->st_mode);
+       err |= __put_user(stat->nlink, &ubuf->st_nlink);
+       err |= __put_user(uid, &ubuf->st_uid);
+       err |= __put_user(gid, &ubuf->st_gid);
+       err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
+       err |= __put_user(stat->size, &ubuf->st_size);
+       err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
+       err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
+       err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
+       err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
+       err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
+       err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
+       err |= __put_user(stat->blksize, &ubuf->st_blksize);
+       err |= __put_user(stat->blocks, &ubuf->st_blocks);
+       return err;
+ }
  asmlinkage long compat_sys_newstat(char __user * filename,
                struct compat_stat __user *statbuf)
  {
@@@ -1239,7 -1278,7 +1278,7 @@@ static int compat_count(compat_uptr_t _
                        if (!p)
                                break;
                        argv++;
-                       if(++i > max)
+                       if (i++ >= max)
                                return -E2BIG;
                }
        }
@@@ -1436,57 -1475,6 +1475,57 @@@ out_ret
  
  #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
  
 +static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 +                                    int timeval, int ret)
 +{
 +      struct timespec ts;
 +
 +      if (!p)
 +              return ret;
 +
 +      if (current->personality & STICKY_TIMEOUTS)
 +              goto sticky;
 +
 +      /* No update for zero timeout */
 +      if (!end_time->tv_sec && !end_time->tv_nsec)
 +              return ret;
 +
 +      ktime_get_ts(&ts);
 +      ts = timespec_sub(*end_time, ts);
 +      if (ts.tv_sec < 0)
 +              ts.tv_sec = ts.tv_nsec = 0;
 +
 +      if (timeval) {
 +              struct compat_timeval rtv;
 +
 +              rtv.tv_sec = ts.tv_sec;
 +              rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
 +
 +              if (!copy_to_user(p, &rtv, sizeof(rtv)))
 +                      return ret;
 +      } else {
 +              struct compat_timespec rts;
 +
 +              rts.tv_sec = ts.tv_sec;
 +              rts.tv_nsec = ts.tv_nsec;
 +
 +              if (!copy_to_user(p, &rts, sizeof(rts)))
 +                      return ret;
 +      }
 +      /*
 +       * If an application puts its timeval in read-only memory, we
 +       * don't want the Linux-specific update to the timeval to
 +       * cause a fault after the select has completed
 +       * successfully. However, because we're not updating the
 +       * timeval, we can't restart the system call.
 +       */
 +
 +sticky:
 +      if (ret == -ERESTARTNOHAND)
 +              ret = -EINTR;
 +      return ret;
 +}
 +
  /*
   * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
   * 64-bit unsigned longs.
@@@ -1568,8 -1556,7 +1607,8 @@@ int compat_set_fd_set(unsigned long nr
        ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
  
  int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 -      compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
 +      compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 +      struct timespec *end_time)
  {
        fd_set_bits fds;
        void *bits;
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);
  
 -      ret = do_select(n, &fds, timeout);
 +      ret = do_select(n, &fds, end_time);
  
        if (ret < 0)
                goto out;
@@@ -1642,7 -1629,7 +1681,7 @@@ asmlinkage long compat_sys_select(int n
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct compat_timeval __user *tvp)
  {
 -      s64 timeout = -1;
 +      struct timespec end_time, *to = NULL;
        struct compat_timeval tv;
        int ret;
  
                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;
  
 -              if (tv.tv_sec < 0 || tv.tv_usec < 0)
 +              to = &end_time;
 +              if (poll_select_set_timeout(to, tv.tv_sec,
 +                                          tv.tv_usec * NSEC_PER_USEC))
                        return -EINVAL;
 -
 -              /* Cast to u64 to make GCC stop complaining */
 -              if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
 -                      timeout = -1;   /* infinite */
 -              else {
 -                      timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
 -                      timeout += tv.tv_sec * HZ;
 -              }
        }
  
 -      ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
 -
 -      if (tvp) {
 -              struct compat_timeval rtv;
 -
 -              if (current->personality & STICKY_TIMEOUTS)
 -                      goto sticky;
 -              rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 -              rtv.tv_sec = timeout;
 -              if (compat_timeval_compare(&rtv, &tv) >= 0)
 -                      rtv = tv;
 -              if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 -sticky:
 -                      /*
 -                       * If an application puts its timeval in read-only
 -                       * memory, we don't want the Linux-specific update to
 -                       * the timeval to cause a fault after the select has
 -                       * completed successfully. However, because we're not
 -                       * updating the timeval, we can't restart the system
 -                       * call.
 -                       */
 -                      if (ret == -ERESTARTNOHAND)
 -                              ret = -EINTR;
 -              }
 -      }
 +      ret = compat_core_sys_select(n, inp, outp, exp, to);
 +      ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
  
        return ret;
  }
@@@ -1670,16 -1686,15 +1709,16 @@@ asmlinkage long compat_sys_pselect7(in
  {
        compat_sigset_t ss32;
        sigset_t ksigmask, sigsaved;
 -      s64 timeout = MAX_SCHEDULE_TIMEOUT;
        struct compat_timespec ts;
 +      struct timespec end_time, *to = NULL;
        int ret;
  
        if (tsp) {
                if (copy_from_user(&ts, tsp, sizeof(ts)))
                        return -EFAULT;
  
 -              if (ts.tv_sec < 0 || ts.tv_nsec < 0)
 +              to = &end_time;
 +              if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
        }
  
                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
        }
  
 -      do {
 -              if (tsp) {
 -                      if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
 -                              timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
 -                              timeout += ts.tv_sec * (unsigned long)HZ;
 -                              ts.tv_sec = 0;
 -                              ts.tv_nsec = 0;
 -                      } else {
 -                              ts.tv_sec -= MAX_SELECT_SECONDS;
 -                              timeout = MAX_SELECT_SECONDS * HZ;
 -                      }
 -              }
 -
 -              ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
 -
 -      } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
 -
 -      if (tsp) {
 -              struct compat_timespec rts;
 -
 -              if (current->personality & STICKY_TIMEOUTS)
 -                      goto sticky;
 -
 -              rts.tv_sec = timeout / HZ;
 -              rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
 -              if (rts.tv_nsec >= NSEC_PER_SEC) {
 -                      rts.tv_sec++;
 -                      rts.tv_nsec -= NSEC_PER_SEC;
 -              }
 -              if (compat_timespec_compare(&rts, &ts) >= 0)
 -                      rts = ts;
 -              if (copy_to_user(tsp, &rts, sizeof(rts))) {
 -sticky:
 -                      /*
 -                       * If an application puts its timeval in read-only
 -                       * memory, we don't want the Linux-specific update to
 -                       * the timeval to cause a fault after the select has
 -                       * completed successfully. However, because we're not
 -                       * updating the timeval, we can't restart the system
 -                       * call.
 -                       */
 -                      if (ret == -ERESTARTNOHAND)
 -                              ret = -EINTR;
 -              }
 -      }
 +      ret = compat_core_sys_select(n, inp, outp, exp, to);
 +      ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
  
        if (ret == -ERESTARTNOHAND) {
                /*
@@@ -1740,16 -1798,18 +1779,16 @@@ asmlinkage long compat_sys_ppoll(struc
        compat_sigset_t ss32;
        sigset_t ksigmask, sigsaved;
        struct compat_timespec ts;
 -      s64 timeout = -1;
 +      struct timespec end_time, *to = NULL;
        int ret;
  
        if (tsp) {
                if (copy_from_user(&ts, tsp, sizeof(ts)))
                        return -EFAULT;
  
 -              /* We assume that ts.tv_sec is always lower than
 -                 the number of seconds that can be expressed in
 -                 an s64. Otherwise the compiler bitches at us */
 -              timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
 -              timeout += ts.tv_sec * HZ;
 +              to = &end_time;
 +              if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 +                      return -EINVAL;
        }
  
        if (sigmask) {
                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
        }
  
 -      ret = do_sys_poll(ufds, nfds, &timeout);
 +      ret = do_sys_poll(ufds, nfds, to);
  
        /* We can restart this syscall, usually */
        if (ret == -EINTR) {
        } else if (sigmask)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  
 -      if (tsp && timeout >= 0) {
 -              struct compat_timespec rts;
 -
 -              if (current->personality & STICKY_TIMEOUTS)
 -                      goto sticky;
 -              /* Yes, we know it's actually an s64, but it's also positive. */
 -              rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 -                                      1000;
 -              rts.tv_sec = timeout;
 -              if (compat_timespec_compare(&rts, &ts) >= 0)
 -                      rts = ts;
 -              if (copy_to_user(tsp, &rts, sizeof(rts))) {
 -sticky:
 -                      /*
 -                       * If an application puts its timeval in read-only
 -                       * memory, we don't want the Linux-specific update to
 -                       * the timeval to cause a fault after the select has
 -                       * completed successfully. However, because we're not
 -                       * updating the timeval, we can't restart the system
 -                       * call.
 -                       */
 -                      if (ret == -ERESTARTNOHAND && timeout >= 0)
 -                              ret = -EINTR;
 -              }
 -      }
 +      ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
  
        return ret;
  }
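
The compat select/poll conversion above drops the jiffies-based s64 countdown in favour of an absolute struct timespec deadline: poll_select_set_timeout() computes it once on entry and the shared poll_select_copy_remaining() helper writes the leftover time back (or turns -ERESTARTNOHAND into -EINTR for sticky timeouts). A rough user-space analogue of that bookkeeping, with invented helper names and nsec assumed to be already below one second:

/* Sketch only: deadline/remaining arithmetic analogous to the new compat paths. */
#include <time.h>

#define NSEC_PER_SEC_L 1000000000L

/* deadline = now + (sec, nsec), as poll_select_set_timeout() does in-kernel */
static void set_deadline(struct timespec *deadline, time_t sec, long nsec)
{
        clock_gettime(CLOCK_MONOTONIC, deadline);
        deadline->tv_sec  += sec;
        deadline->tv_nsec += nsec;
        if (deadline->tv_nsec >= NSEC_PER_SEC_L) {
                deadline->tv_sec++;
                deadline->tv_nsec -= NSEC_PER_SEC_L;
        }
}

/* remaining = max(deadline - now, 0), the value copied back to user space */
static struct timespec time_remaining(const struct timespec *deadline)
{
        struct timespec now, rem;

        clock_gettime(CLOCK_MONOTONIC, &now);
        rem.tv_sec  = deadline->tv_sec  - now.tv_sec;
        rem.tv_nsec = deadline->tv_nsec - now.tv_nsec;
        if (rem.tv_nsec < 0) {
                rem.tv_sec--;
                rem.tv_nsec += NSEC_PER_SEC_L;
        }
        if (rem.tv_sec < 0)
                rem.tv_sec = rem.tv_nsec = 0;
        return rem;
}
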
diff --combined include/linux/hrtimer.h
index 1e6f731381d9ce5faad45fe30aef6de1eebfc116,2f245fe63bda5611ad909c1452aa8a79c4f29eb4..cb25c1cc2352fc2ef5a743a183d1cc90645aff9a
@@@ -20,8 -20,6 +20,8 @@@
  #include <linux/init.h>
  #include <linux/list.h>
  #include <linux/wait.h>
 +#include <linux/percpu.h>
 +
  
  struct hrtimer_clock_base;
  struct hrtimer_cpu_base;
@@@ -49,14 -47,22 +49,22 @@@ enum hrtimer_restart 
   *    HRTIMER_CB_IRQSAFE:             Callback may run in hardirq context
   *    HRTIMER_CB_IRQSAFE_NO_RESTART:  Callback may run in hardirq context and
   *                                    does not restart the timer
-  *    HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:  Callback must run in hardirq context
-  *                                    Special mode for tick emultation
+  *    HRTIMER_CB_IRQSAFE_PERCPU:      Callback must run in hardirq context
+  *                                    Special mode for tick emulation and
+  *                                    scheduler timer. Such timers are per
+  *                                    cpu and not allowed to be migrated on
+  *                                    cpu unplug.
+  *    HRTIMER_CB_IRQSAFE_UNLOCKED:    Callback should run in hardirq context
+  *                                    with timer->base lock unlocked
+  *                                    used for timers which call wakeup to
+  *                                    avoid lock order problems with rq->lock
   */
  enum hrtimer_cb_mode {
        HRTIMER_CB_SOFTIRQ,
        HRTIMER_CB_IRQSAFE,
        HRTIMER_CB_IRQSAFE_NO_RESTART,
-       HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
+       HRTIMER_CB_IRQSAFE_PERCPU,
+       HRTIMER_CB_IRQSAFE_UNLOCKED,
  };
  
  /*
   * 0x02               callback function running
   * 0x04               callback pending (high resolution mode)
   *
-  * Special case:
+  * Special cases:
   * 0x03               callback function running and enqueued
   *            (was requeued on another CPU)
+  * 0x09               timer was migrated on CPU hotunplug
   * The "callback function running and enqueued" status is only possible on
   * SMP. It happens for example when a posix timer expired and the callback
   * queued a signal. Between dropping the lock which protects the posix timer
@@@ -89,6 -96,7 +98,7 @@@
  #define HRTIMER_STATE_ENQUEUED        0x01
  #define HRTIMER_STATE_CALLBACK        0x02
  #define HRTIMER_STATE_PENDING 0x04
+ #define HRTIMER_STATE_MIGRATE 0x08
  
  /**
   * struct hrtimer - the basic hrtimer structure
   */
  struct hrtimer {
        struct rb_node                  node;
 -      ktime_t                         expires;
 +      ktime_t                         _expires;
 +      ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
        unsigned long                   state;
@@@ -200,71 -207,6 +210,71 @@@ struct hrtimer_cpu_base 
  #endif
  };
  
 +static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 +{
 +      timer->_expires = time;
 +      timer->_softexpires = time;
 +}
 +
 +static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
 +{
 +      timer->_softexpires = time;
 +      timer->_expires = ktime_add_safe(time, delta);
 +}
 +
 +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
 +{
 +      timer->_softexpires = time;
 +      timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
 +}
 +
 +static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 +{
 +      timer->_expires.tv64 = tv64;
 +      timer->_softexpires.tv64 = tv64;
 +}
 +
 +static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 +{
 +      timer->_expires = ktime_add_safe(timer->_expires, time);
 +      timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
 +}
 +
 +static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
 +{
 +      timer->_expires = ktime_add_ns(timer->_expires, ns);
 +      timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
 +}
 +
 +static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 +{
 +      return timer->_expires;
 +}
 +
 +static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
 +{
 +      return timer->_softexpires;
 +}
 +
 +static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 +{
 +      return timer->_expires.tv64;
 +}
 +static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
 +{
 +      return timer->_softexpires.tv64;
 +}
 +
 +static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 +{
 +      return ktime_to_ns(timer->_expires);
 +}
 +
 +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 +{
 +    return ktime_sub(timer->_expires, timer->base->get_time());
 +}
 +
  #ifdef CONFIG_HIGH_RES_TIMERS
  struct clock_event_device;
  
@@@ -285,8 -227,6 +295,8 @@@ static inline int hrtimer_is_hres_activ
        return timer->base->cpu_base->hres_active;
  }
  
 +extern void hrtimer_peek_ahead_timers(void);
 +
  /*
   * The resolution of the clocks. The resolution value is returned in
   * the clock_getres() system call to give application programmers an
   * is expired in the next softirq when the clock was advanced.
   */
  static inline void clock_was_set(void) { }
 +static inline void hrtimer_peek_ahead_timers(void) { }
  
  static inline void hres_timers_resume(void) { }
  
@@@ -331,10 -270,6 +341,10 @@@ static inline int hrtimer_is_hres_activ
  extern ktime_t ktime_get(void);
  extern ktime_t ktime_get_real(void);
  
 +
 +DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 +
 +
  /* Exported timer functions: */
  
  /* Initialize timers: */
@@@ -359,25 -294,12 +369,25 @@@ static inline void destroy_hrtimer_on_s
  /* Basic timer operations: */
  extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
                         const enum hrtimer_mode mode);
 +extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 +                      unsigned long range_ns, const enum hrtimer_mode mode);
  extern int hrtimer_cancel(struct hrtimer *timer);
  extern int hrtimer_try_to_cancel(struct hrtimer *timer);
  
 +static inline int hrtimer_start_expires(struct hrtimer *timer,
 +                                              enum hrtimer_mode mode)
 +{
 +      unsigned long delta;
 +      ktime_t soft, hard;
 +      soft = hrtimer_get_softexpires(timer);
 +      hard = hrtimer_get_expires(timer);
 +      delta = ktime_to_ns(ktime_sub(hard, soft));
 +      return hrtimer_start_range_ns(timer, soft, delta, mode);
 +}
 +
  static inline int hrtimer_restart(struct hrtimer *timer)
  {
 -      return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 +      return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  }
  
  /* Query timers: */
@@@ -434,10 -356,6 +444,10 @@@ extern long hrtimer_nanosleep_restart(s
  extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                 struct task_struct *tsk);
  
 +extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
 +                                              const enum hrtimer_mode mode);
 +extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 +
  /* Soft interrupt function to run the hrtimer queues: */
  extern void hrtimer_run_queues(void);
  extern void hrtimer_run_pending(void);
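
The hrtimer.h changes above hide the raw expires field behind the hrtimer_set_expires*/hrtimer_get_expires* accessors and introduce a soft/hard expiry range (hrtimer_start_range_ns(), hrtimer_start_expires(), schedule_hrtimeout_range()). A minimal kernel-style sketch of arming a timer with slack through the new API; the function names come from the declarations above, the callback and the values are illustrative:

/* Sketch only: arm an hrtimer that may fire anywhere in [soft, soft + 100us]. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
        return HRTIMER_NORESTART;
}

static void demo_arm_with_slack(void)
{
        ktime_t soft = ktime_add_ns(ktime_get(), 5 * NSEC_PER_MSEC);

        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        demo_timer.function = demo_timer_fn;
        /* 100us of slack lets the core coalesce this wakeup with nearby timers. */
        hrtimer_set_expires_range_ns(&demo_timer, soft, 100 * NSEC_PER_USEC);
        hrtimer_start_expires(&demo_timer, HRTIMER_MODE_ABS);
}
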
diff --combined include/linux/sched.h
index dcc03fd5a7f3df86a005ac88b1dc2843c5c178b4,c226c7b82946ce1d830853c4fd3b9bad3d92fa0d..de53c109fd04f1de9eb9fd973af5ae3e7f790922
@@@ -352,7 -352,7 +352,7 @@@ arch_get_unmapped_area_topdown(struct f
  extern void arch_unmap_area(struct mm_struct *, unsigned long);
  extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
  
- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ #if USE_SPLIT_PTLOCKS
  /*
   * The mm counters are not protected by its page_table_lock,
   * so must be incremented atomically.
  #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
  #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
  
- #else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+ #else  /* !USE_SPLIT_PTLOCKS */
  /*
   * The mm counters are protected by its page_table_lock,
   * so can be incremented directly.
  #define inc_mm_counter(mm, member) (mm)->_##member++
  #define dec_mm_counter(mm, member) (mm)->_##member--
  
- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+ #endif /* !USE_SPLIT_PTLOCKS */
  
  #define get_mm_rss(mm)                                        \
        (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
@@@ -451,8 -451,8 +451,8 @@@ struct signal_struct 
         * - everyone except group_exit_task is stopped during signal delivery
         *   of fatal signals, group_exit_task processes the signal.
         */
-       struct task_struct      *group_exit_task;
        int                     notify_count;
+       struct task_struct      *group_exit_task;
  
        /* thread group stop support, overloads group_exit_code too */
        int                     group_stop_count;
@@@ -824,6 -824,9 +824,9 @@@ struct sched_domain 
        unsigned int ttwu_move_affine;
        unsigned int ttwu_move_balance;
  #endif
+ #ifdef CONFIG_SCHED_DEBUG
+       char *name;
+ #endif
  };
  
  extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
@@@ -897,7 -900,7 +900,7 @@@ struct sched_class 
        void (*yield_task) (struct rq *rq);
        int  (*select_task_rq)(struct task_struct *p, int sync);
  
-       void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+       void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
  
        struct task_struct * (*pick_next_task) (struct rq *rq);
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
@@@ -1010,8 -1013,8 +1013,8 @@@ struct sched_entity 
  
  struct sched_rt_entity {
        struct list_head run_list;
-       unsigned int time_slice;
        unsigned long timeout;
+       unsigned int time_slice;
        int nr_cpus_allowed;
  
        struct sched_rt_entity *back;
@@@ -1301,12 -1304,6 +1304,12 @@@ struct task_struct 
        int latency_record_count;
        struct latency_record latency_record[LT_SAVECOUNT];
  #endif
 +      /*
 +       * time slack values; these are used to round up poll() and
 +       * select() etc timeout values. These are in nanoseconds.
 +       */
 +      unsigned long timer_slack_ns;
 +      unsigned long default_timer_slack_ns;
  };
  
  /*
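
The two fields added to task_struct above carry the per-task timer slack that the select(), poll() and nanosleep() paths use to widen their hrtimer expiry range. Elsewhere in this merge the slack is exposed to user space via prctl() (kernel/sys.c is in the file list, although its hunk is not shown here); a small sketch, assuming the PR_SET_TIMERSLACK/PR_GET_TIMERSLACK interface from that series:

/* Sketch only: request 5ms of timer slack for the calling task. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
        if (prctl(PR_SET_TIMERSLACK, 5UL * 1000 * 1000, 0, 0, 0) < 0)
                perror("PR_SET_TIMERSLACK");

        /* PR_GET_TIMERSLACK returns the current slack, in nanoseconds. */
        printf("timer slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));
        return 0;
}
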
diff --combined include/linux/time.h
index 726976478480a644c13a75ebf8f048e44a38ef98,51e883df0fa51fe598832747477533fc4303e30a..c911ef69ea87e2dbb2c7cd8988dce850d13b79be
@@@ -29,6 -29,8 +29,8 @@@ struct timezone 
  
  #ifdef __KERNEL__
  
+ extern struct timezone sys_tz;
  /* Parameters used to convert the timespec values: */
  #define MSEC_PER_SEC  1000L
  #define USEC_PER_MSEC 1000L
@@@ -38,8 -40,6 +40,8 @@@
  #define NSEC_PER_SEC  1000000000L
  #define FSEC_PER_SEC  1000000000000000L
  
 +#define TIME_T_MAX    (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 +
  static inline int timespec_equal(const struct timespec *a,
                                   const struct timespec *b)
  {
@@@ -74,8 -74,6 +76,8 @@@ extern unsigned long mktime(const unsig
                            const unsigned int min, const unsigned int sec);
  
  extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 +extern struct timespec timespec_add_safe(const struct timespec lhs,
 +                                       const struct timespec rhs);
  
  /*
   * sub = lhs - rhs, in normalized form
diff --combined kernel/fork.c
index 4308d75f0fa5bc67c36748d31bb0e2518ab58990,30de644a40c4d4d9617d650589f4c90da1e977a2..37b3e150ae3956759b684c6806bbb6dc5ad12376
@@@ -802,6 -802,7 +802,7 @@@ static int copy_signal(unsigned long cl
  
        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = NULL;
+       sig->tty = NULL;
  
        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
        sig->gtime = cputime_zero;
  void __cleanup_signal(struct signal_struct *sig)
  {
        exit_thread_group_keys(sig);
+       tty_kref_put(sig->tty);
        kmem_cache_free(signal_cachep, sig);
  }
  
@@@ -987,8 -989,6 +989,8 @@@ static struct task_struct *copy_process
        p->prev_utime = cputime_zero;
        p->prev_stime = cputime_zero;
  
 +      p->default_timer_slack_ns = current->timer_slack_ns;
 +
  #ifdef CONFIG_DETECT_SOFTLOCKUP
        p->last_switch_count = 0;
        p->last_switch_timestamp = 0;
                                p->nsproxy->pid_ns->child_reaper = p;
  
                        p->signal->leader_pid = pid;
-                       p->signal->tty = current->signal->tty;
+                       tty_kref_put(p->signal->tty);
+                       p->signal->tty = tty_kref_get(current->signal->tty);
                        set_task_pgrp(p, task_pgrp_nr(current));
                        set_task_session(p, task_session_nr(current));
                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --combined kernel/hrtimer.c
index 2bd230be1cb5ae3a778546d2ed0e651e153ba46e,cdec83e722fa1b80ee0af0f828d8e47532431a20..51ee90bca2dedcc8b882f96130c3486fe3111ce5
@@@ -517,7 -517,7 +517,7 @@@ static void hrtimer_force_reprogram(str
                if (!base->first)
                        continue;
                timer = rb_entry(base->first, struct hrtimer, node);
 -              expires = ktime_sub(timer->expires, base->offset);
 +              expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires.tv64 < cpu_base->expires_next.tv64)
                        cpu_base->expires_next = expires;
        }
@@@ -539,10 -539,10 +539,10 @@@ static int hrtimer_reprogram(struct hrt
                             struct hrtimer_clock_base *base)
  {
        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
 -      ktime_t expires = ktime_sub(timer->expires, base->offset);
 +      ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
  
 -      WARN_ON_ONCE(timer->expires.tv64 < 0);
 +      WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
  
        /*
         * When the callback is running, we do not reprogram the clock event
@@@ -672,13 -672,14 +672,14 @@@ static inline int hrtimer_enqueue_repro
                         */
                        BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
                        return 1;
-               case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+               case HRTIMER_CB_IRQSAFE_PERCPU:
+               case HRTIMER_CB_IRQSAFE_UNLOCKED:
                        /*
                         * This is solely for the sched tick emulation with
                         * dynamic tick support to ensure that we do not
                         * restart the tick right on the edge and end up with
                         * the tick timer in the softirq ! The calling site
-                        * takes care of this.
+                        * takes care of this. Also used for hrtimer sleeper !
                         */
                        debug_hrtimer_deactivate(timer);
                        return 1;
@@@ -794,7 -795,7 +795,7 @@@ u64 hrtimer_forward(struct hrtimer *tim
        u64 orun = 1;
        ktime_t delta;
  
 -      delta = ktime_sub(now, timer->expires);
 +      delta = ktime_sub(now, hrtimer_get_expires(timer));
  
        if (delta.tv64 < 0)
                return 0;
                s64 incr = ktime_to_ns(interval);
  
                orun = ktime_divns(delta, incr);
 -              timer->expires = ktime_add_ns(timer->expires, incr * orun);
 -              if (timer->expires.tv64 > now.tv64)
 +              hrtimer_add_expires_ns(timer, incr * orun);
 +              if (hrtimer_get_expires_tv64(timer) > now.tv64)
                        return orun;
                /*
                 * This (and the ktime_add() below) is the
                 */
                orun++;
        }
 -      timer->expires = ktime_add_safe(timer->expires, interval);
 +      hrtimer_add_expires(timer, interval);
  
        return orun;
  }
@@@ -847,8 -848,7 +848,8 @@@ static void enqueue_hrtimer(struct hrti
                 * We dont care about collisions. Nodes with
                 * the same expiry time stay together.
                 */
 -              if (timer->expires.tv64 < entry->expires.tv64) {
 +              if (hrtimer_get_expires_tv64(timer) <
 +                              hrtimer_get_expires_tv64(entry)) {
                        link = &(*link)->rb_left;
                } else {
                        link = &(*link)->rb_right;
@@@ -945,10 -945,9 +946,10 @@@ remove_hrtimer(struct hrtimer *timer, s
  }
  
  /**
 - * hrtimer_start - (re)start an relative timer on the current CPU
 + * hrtimer_start_range_ns - (re)start an relative timer on the current CPU
   * @timer:    the timer to be added
   * @tim:      expiry time
 + * @delta_ns: "slack" range for the timer
   * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
   *
   * Returns:
   *  1 when the timer was active
   */
  int
 -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 +hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
 +                      const enum hrtimer_mode mode)
  {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
  #endif
        }
  
 -      timer->expires = tim;
 +      hrtimer_set_expires_range_ns(timer, tim, delta_ns);
  
        timer_stats_hrtimer_set_start_info(timer);
  
  
        return ret;
  }
 +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 +
 +/**
 + * hrtimer_start - (re)start an relative timer on the current CPU
 + * @timer:    the timer to be added
 + * @tim:      expiry time
 + * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 + *
 + * Returns:
 + *  0 on success
 + *  1 when the timer was active
 + */
 +int
 +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 +{
 +      return hrtimer_start_range_ns(timer, tim, 0, mode);
 +}
  EXPORT_SYMBOL_GPL(hrtimer_start);
  
 +
  /**
   * hrtimer_try_to_cancel - try to deactivate a timer
   * @timer:    hrtimer to stop
@@@ -1097,7 -1077,7 +1098,7 @@@ ktime_t hrtimer_get_remaining(const str
        ktime_t rem;
  
        base = lock_hrtimer_base(timer, &flags);
 -      rem = ktime_sub(timer->expires, base->get_time());
 +      rem = hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);
  
        return rem;
@@@ -1129,7 -1109,7 +1130,7 @@@ ktime_t hrtimer_get_next_event(void
                                continue;
  
                        timer = rb_entry(base->first, struct hrtimer, node);
 -                      delta.tv64 = timer->expires.tv64;
 +                      delta.tv64 = hrtimer_get_expires_tv64(timer);
                        delta = ktime_sub(delta, base->get_time());
                        if (delta.tv64 < mindelta.tv64)
                                mindelta.tv64 = delta.tv64;
@@@ -1266,7 -1246,8 +1267,8 @@@ static void __run_hrtimer(struct hrtime
        timer_stats_account_hrtimer(timer);
  
        fn = timer->function;
-       if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+       if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+           timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
                /*
                 * Used for scheduler timers, avoid lock inversion with
                 * rq->lock and tasklist_lock.
@@@ -1329,23 -1310,10 +1331,23 @@@ void hrtimer_interrupt(struct clock_eve
  
                        timer = rb_entry(node, struct hrtimer, node);
  
 -                      if (basenow.tv64 < timer->expires.tv64) {
 +                      /*
 +                       * The immediate goal for using the softexpires is
 +                       * minimizing wakeups, not running timers at the
 +                       * earliest interrupt after their soft expiration.
 +                       * This allows us to avoid using a Priority Search
 +                       * Tree, which can answer a stabbing query for
 +                       * overlapping intervals and instead use the simple
 +                       * BST we already have.
 +                       * We don't add extra wakeups by delaying timers that
 +                       * are right-of a not yet expired timer, because that
 +                       * timer will have to trigger a wakeup anyway.
 +                       */
 +
 +                      if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
                                ktime_t expires;
  
 -                              expires = ktime_sub(timer->expires,
 +                              expires = ktime_sub(hrtimer_get_expires(timer),
                                                    base->offset);
                                if (expires.tv64 < expires_next.tv64)
                                        expires_next = expires;
                raise_softirq(HRTIMER_SOFTIRQ);
  }
  
 +/**
 + * hrtimer_peek_ahead_timers -- run soft-expired timers now
 + *
 + * hrtimer_peek_ahead_timers will peek at the timer queue of
 + * the current cpu and check if there are any timers for which
 + * the soft expires time has passed. If any such timers exist,
 + * they are run immediately and then removed from the timer queue.
 + *
 + */
 +void hrtimer_peek_ahead_timers(void)
 +{
 +      unsigned long flags;
 +      struct tick_device *td;
 +      struct clock_event_device *dev;
 +
 +      if (!hrtimer_hres_active())
 +              return;
 +
 +      local_irq_save(flags);
 +      td = &__get_cpu_var(tick_cpu_device);
 +      if (!td)
 +              goto out;
 +      dev = td->evtdev;
 +      if (!dev)
 +              goto out;
 +      hrtimer_interrupt(dev);
 +out:
 +      local_irq_restore(flags);
 +}
 +
  static void run_hrtimer_softirq(struct softirq_action *h)
  {
        run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@@ -1478,8 -1416,7 +1480,8 @@@ void hrtimer_run_queues(void
                        struct hrtimer *timer;
  
                        timer = rb_entry(node, struct hrtimer, node);
 -                      if (base->softirq_time.tv64 <= timer->expires.tv64)
 +                      if (base->softirq_time.tv64 <=
 +                                      hrtimer_get_expires_tv64(timer))
                                break;
  
                        if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@@ -1517,7 -1454,7 +1519,7 @@@ void hrtimer_init_sleeper(struct hrtime
        sl->timer.function = hrtimer_wakeup;
        sl->task = task;
  #ifdef CONFIG_HIGH_RES_TIMERS
-       sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
  #endif
  }
  
@@@ -1527,7 -1464,7 +1529,7 @@@ static int __sched do_nanosleep(struct 
  
        do {
                set_current_state(TASK_INTERRUPTIBLE);
 -              hrtimer_start(&t->timer, t->timer.expires, mode);
 +              hrtimer_start_expires(&t->timer, mode);
                if (!hrtimer_active(&t->timer))
                        t->task = NULL;
  
@@@ -1549,7 -1486,7 +1551,7 @@@ static int update_rmtp(struct hrtimer *
        struct timespec rmt;
        ktime_t rem;
  
 -      rem = ktime_sub(timer->expires, timer->base->get_time());
 +      rem = hrtimer_expires_remaining(timer);
        if (rem.tv64 <= 0)
                return 0;
        rmt = ktime_to_timespec(rem);
@@@ -1568,7 -1505,7 +1570,7 @@@ long __sched hrtimer_nanosleep_restart(
  
        hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
                                HRTIMER_MODE_ABS);
 -      t.timer.expires.tv64 = restart->nanosleep.expires;
 +      hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
  
        if (do_nanosleep(&t, HRTIMER_MODE_ABS))
                goto out;
@@@ -1593,14 -1530,9 +1595,14 @@@ long hrtimer_nanosleep(struct timespec 
        struct restart_block *restart;
        struct hrtimer_sleeper t;
        int ret = 0;
 +      unsigned long slack;
 +
 +      slack = current->timer_slack_ns;
 +      if (rt_task(current))
 +              slack = 0;
  
        hrtimer_init_on_stack(&t.timer, clockid, mode);
 -      t.timer.expires = timespec_to_ktime(*rqtp);
 +      hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
                goto out;
  
        restart->fn = hrtimer_nanosleep_restart;
        restart->nanosleep.index = t.timer.base->index;
        restart->nanosleep.rmtp = rmtp;
 -      restart->nanosleep.expires = t.timer.expires.tv64;
 +      restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
  
        ret = -ERESTART_RESTARTBLOCK;
  out:
@@@ -1661,29 -1593,95 +1663,95 @@@ static void __cpuinit init_hrtimers_cpu
  
  #ifdef CONFIG_HOTPLUG_CPU
  
- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-                               struct hrtimer_clock_base *new_base)
+ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+                               struct hrtimer_clock_base *new_base, int dcpu)
  {
        struct hrtimer *timer;
        struct rb_node *node;
+       int raise = 0;
  
        while ((node = rb_first(&old_base->active))) {
                timer = rb_entry(node, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_hrtimer_deactivate(timer);
-               __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+               /*
+                * Should not happen. Per CPU timers should be
+                * canceled _before_ the migration code is called
+                */
+               if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+                       __remove_hrtimer(timer, old_base,
+                                        HRTIMER_STATE_INACTIVE, 0);
+                       WARN(1, "hrtimer (%p %p) active but cpu %d dead\n",
+                            timer, timer->function, dcpu);
+                       continue;
+               }
+               /*
+                * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+                * timer could be seen as !active and just vanish away
+                * under us on another CPU
+                */
+               __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timer. Allow reprogramming of the event device
                 */
                enqueue_hrtimer(timer, new_base, 1);
+ #ifdef CONFIG_HIGH_RES_TIMERS
+               /*
+                * Happens with high res enabled when the timer was
+                * already expired and the callback mode is
+                * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+                * enqueue code does not move them to the soft irq
+                * pending list for performance/latency reasons, but
+                * in the migration state, we need to do that
+                * otherwise we end up with a stale timer.
+                */
+               if (timer->state == HRTIMER_STATE_MIGRATE) {
+                       timer->state = HRTIMER_STATE_PENDING;
+                       list_add_tail(&timer->cb_entry,
+                                     &new_base->cpu_base->cb_pending);
+                       raise = 1;
+               }
+ #endif
+               /* Clear the migration state bit */
+               timer->state &= ~HRTIMER_STATE_MIGRATE;
+       }
+       return raise;
+ }
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                  struct hrtimer_cpu_base *new_base)
+ {
+       struct hrtimer *timer;
+       int raise = 0;
+       while (!list_empty(&old_base->cb_pending)) {
+               timer = list_entry(old_base->cb_pending.next,
+                                  struct hrtimer, cb_entry);
+               __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+               timer->base = &new_base->clock_base[timer->base->index];
+               list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+               raise = 1;
        }
+       return raise;
+ }
+ #else
+ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                  struct hrtimer_cpu_base *new_base)
+ {
+       return 0;
  }
+ #endif
  
  static void migrate_hrtimers(int cpu)
  {
        struct hrtimer_cpu_base *old_base, *new_base;
-       int i;
+       int i, raise = 0;
  
        BUG_ON(cpu_online(cpu));
        old_base = &per_cpu(hrtimer_bases, cpu);
        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
  
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-               migrate_hrtimer_list(&old_base->clock_base[i],
-                                    &new_base->clock_base[i]);
+               if (migrate_hrtimer_list(&old_base->clock_base[i],
+                                        &new_base->clock_base[i], cpu))
+                       raise = 1;
        }
  
+       if (migrate_hrtimer_pending(old_base, new_base))
+               raise = 1;
        spin_unlock(&old_base->lock);
        spin_unlock(&new_base->lock);
        local_irq_enable();
        put_cpu_var(hrtimer_bases);
+       if (raise)
+               hrtimer_raise_softirq();
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -1748,103 -1753,3 +1823,103 @@@ void __init hrtimers_init(void
  #endif
  }
  
 +/**
 + * schedule_hrtimeout_range - sleep until timeout
 + * @expires:  timeout value (ktime_t)
 + * @delta:    slack in expires timeout (ktime_t)
 + * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 + *
 + * Make the current task sleep until the given expiry time has
 + * elapsed. The routine will return immediately unless
 + * the current task state has been set (see set_current_state()).
 + *
 + * The @delta argument gives the kernel the freedom to schedule the
 + * actual wakeup to a time that is both power and performance friendly.
 + * The kernel gives the normal best effort behavior for "@expires+@delta",
 + * but may decide to fire the timer earlier, though never before @expires.
 + *
 + * You can set the task state as follows -
 + *
 + * %TASK_UNINTERRUPTIBLE - at least the @expires timeout is guaranteed
 + * to pass before the routine returns.
 + *
 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 + * delivered to the current task.
 + *
 + * The current task state is guaranteed to be TASK_RUNNING when this
 + * routine returns.
 + *
 + * Returns 0 when the timer has expired, otherwise -EINTR
 + */
 +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
 +                             const enum hrtimer_mode mode)
 +{
 +      struct hrtimer_sleeper t;
 +
 +      /*
 +       * Optimize when a zero timeout value is given. It does not
 +       * matter whether this is an absolute or a relative time.
 +       */
 +      if (expires && !expires->tv64) {
 +              __set_current_state(TASK_RUNNING);
 +              return 0;
 +      }
 +
 +      /*
 +       * A NULL parameter means "infinite"
 +       */
 +      if (!expires) {
 +              schedule();
 +              __set_current_state(TASK_RUNNING);
 +              return -EINTR;
 +      }
 +
 +      hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
 +      hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
 +
 +      hrtimer_init_sleeper(&t, current);
 +
 +      hrtimer_start_expires(&t.timer, mode);
 +      if (!hrtimer_active(&t.timer))
 +              t.task = NULL;
 +
 +      if (likely(t.task))
 +              schedule();
 +
 +      hrtimer_cancel(&t.timer);
 +      destroy_hrtimer_on_stack(&t.timer);
 +
 +      __set_current_state(TASK_RUNNING);
 +
 +      return !t.task ? 0 : -EINTR;
 +}
 +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
 +
 +/**
 + * schedule_hrtimeout - sleep until timeout
 + * @expires:  timeout value (ktime_t)
 + * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
 + *
 + * Make the current task sleep until the given expiry time has
 + * elapsed. The routine will return immediately unless
 + * the current task state has been set (see set_current_state()).
 + *
 + * You can set the task state as follows -
 + *
 + * %TASK_UNINTERRUPTIBLE - at least the @expires timeout is guaranteed
 + * to pass before the routine returns.
 + *
 + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 + * delivered to the current task.
 + *
 + * The current task state is guaranteed to be TASK_RUNNING when this
 + * routine returns.
 + *
 + * Returns 0 when the timer has expired, otherwise -EINTR
 + */
 +int __sched schedule_hrtimeout(ktime_t *expires,
 +                             const enum hrtimer_mode mode)
 +{
 +      return schedule_hrtimeout_range(expires, 0, mode);
 +}
 +EXPORT_SYMBOL_GPL(schedule_hrtimeout);
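
A minimal sketch of how a kernel-side caller is expected to use the new
schedule_hrtimeout_range() interface above. The helper name, the 2 ms deadline
and the 50 us slack are illustrative assumptions, not code from this merge:

    /* Illustrative only; assumes <linux/hrtimer.h> and <linux/sched.h>. */
    static int example_slack_sleep(void)
    {
            /* absolute deadline on the CLOCK_MONOTONIC base the helper uses */
            ktime_t deadline = ktime_add_ns(ktime_get(), 2 * NSEC_PER_MSEC);

            set_current_state(TASK_INTERRUPTIBLE);
            /* the wakeup may land anywhere in [deadline, deadline + 50us] */
            return schedule_hrtimeout_range(&deadline, 50 * NSEC_PER_USEC,
                                            HRTIMER_MODE_ABS);
    }

It returns 0 once the (possibly slack-extended) timeout has elapsed and -EINTR
if the task was woken early, for example by a signal; HRTIMER_MODE_REL with a
relative ktime_t works the same way.
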
diff --combined kernel/posix-timers.c
index f85efcdcab2d0e68037ee1ad03dc0cf2981c9635,5131e5471169226ef8db42f20792c8ffdac6d12b..ee204586149a0c8ab6144808c0610320545f7b8f
@@@ -441,7 -441,7 +441,7 @@@ static struct k_itimer * alloc_posix_ti
                return tmr;
        if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
                kmem_cache_free(posix_timers_cache, tmr);
-               tmr = NULL;
+               return NULL;
        }
        memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
        return tmr;
@@@ -668,7 -668,7 +668,7 @@@ common_timer_get(struct k_itimer *timr
            (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
                timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
  
 -      remaining = ktime_sub(timer->expires, now);
 +      remaining = ktime_sub(hrtimer_get_expires(timer), now);
        /* Return 0 only, when the timer is expired and not pending */
        if (remaining.tv64 <= 0) {
                /*
@@@ -762,7 -762,7 +762,7 @@@ common_timer_set(struct k_itimer *timr
        hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
        timr->it.real.timer.function = posix_timer_fn;
  
 -      timer->expires = timespec_to_ktime(new_setting->it_value);
 +      hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
  
        /* Convert interval */
        timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
        if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
                /* Setup correct expiry time for relative timers */
                if (mode == HRTIMER_MODE_REL) {
 -                      timer->expires =
 -                              ktime_add_safe(timer->expires,
 -                                             timer->base->get_time());
 +                      hrtimer_add_expires(timer, timer->base->get_time());
                }
                return 0;
        }
  
 -      hrtimer_start(timer, timer->expires, mode);
 +      hrtimer_start_expires(timer, mode);
        return 0;
  }
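
The posix-timers conversion above follows the mechanical pattern used
throughout this merge: open-coded reads and writes of timer->expires become
accessor calls (declared in include/linux/hrtimer.h, which is part of this
merge but not shown on this page), so the expiry can carry a slack range.
A schematic before/after, with my_timer and ts as placeholder names rather
than code from the patch:

    static ktime_t example_arm_timer(struct hrtimer *my_timer, struct timespec ts)
    {
            /* was: my_timer->expires = timespec_to_ktime(ts);
             *      hrtimer_start(my_timer, my_timer->expires, HRTIMER_MODE_ABS);
             */
            hrtimer_set_expires(my_timer, timespec_to_ktime(ts));
            hrtimer_start_expires(my_timer, HRTIMER_MODE_ABS);

            /* was: ktime_sub(my_timer->expires, my_timer->base->get_time()); */
            return hrtimer_expires_remaining(my_timer);
    }
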
  
diff --combined kernel/sched.c
index e46b5afa200d6cc63730ddb388c70c856610ea46,6f230596bd0c1d21a2c68ffbff8207e93dcd65b5..eb3c72953615c06d19b08be2480a3144b259c55d
@@@ -201,14 -201,19 +201,19 @@@ void init_rt_bandwidth(struct rt_bandwi
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+ }
+ static inline int rt_bandwidth_enabled(void)
+ {
+       return sysctl_sched_rt_runtime >= 0;
  }
  
  static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  {
        ktime_t now;
  
-       if (rt_b->rt_runtime == RUNTIME_INF)
+       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                return;
  
        if (hrtimer_active(&rt_b->rt_period_timer))
  
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
 -              hrtimer_start(&rt_b->rt_period_timer,
 -                            rt_b->rt_period_timer.expires,
 -                            HRTIMER_MODE_ABS);
 +              hrtimer_start_expires(&rt_b->rt_period_timer,
 +                              HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -297,9 -303,9 +302,9 @@@ static DEFINE_PER_CPU(struct cfs_rq, in
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  #endif /* CONFIG_RT_GROUP_SCHED */
- #else /* !CONFIG_FAIR_GROUP_SCHED */
+ #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
- #endif /* CONFIG_FAIR_GROUP_SCHED */
+ #endif /* CONFIG_USER_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@@ -603,9 -609,9 +608,9 @@@ struct rq 
  
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
  {
-       rq->curr->sched_class->check_preempt_curr(rq, p);
+       rq->curr->sched_class->check_preempt_curr(rq, p, sync);
  }
  
  static inline int cpu_of(struct rq *rq)
@@@ -1057,7 -1063,7 +1062,7 @@@ static void hrtick_start(struct rq *rq
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
  
 -      timer->expires = time;
 +      hrtimer_set_expires(timer, time);
  
        if (rq == this_rq()) {
                hrtimer_restart(timer);
@@@ -1086,7 -1092,7 +1091,7 @@@ hotplug_hrtick(struct notifier_block *n
        return NOTIFY_DONE;
  }
  
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
  {
        hotcpu_notifier(hotplug_hrtick, 0);
  }
@@@ -1101,7 -1107,7 +1106,7 @@@ static void hrtick_start(struct rq *rq
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
  }
  
- static void init_hrtick(void)
+ static inline void init_hrtick(void)
  {
  }
  #endif /* CONFIG_SMP */
@@@ -1118,9 -1124,9 +1123,9 @@@ static void init_rq_hrtick(struct rq *r
  
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
- #else
+ #else /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
  {
  }
@@@ -1132,7 -1138,7 +1137,7 @@@ static inline void init_rq_hrtick(struc
  static inline void init_hrtick(void)
  {
  }
- #endif
+ #endif        /* CONFIG_SCHED_HRTICK */
  
  /*
   * resched_task - mark a task 'to be rescheduled now'.
@@@ -1379,38 -1385,24 +1384,24 @@@ static inline void dec_cpu_load(struct 
        update_load_sub(&rq->load, load);
  }
  
- #ifdef CONFIG_SMP
- static unsigned long source_load(int cpu, int type);
- static unsigned long target_load(int cpu, int type);
- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-       return rq->avg_load_per_task;
- }
- #ifdef CONFIG_FAIR_GROUP_SCHED
- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+ typedef int (*tg_visitor)(struct task_group *, void *);
  
  /*
   * Iterate the full tree, calling @down when first entering a node and @up when
   * leaving it for the final time.
   */
- static void
- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
  {
        struct task_group *parent, *child;
+       int ret;
  
        rcu_read_lock();
        parent = &root_task_group;
  down:
-       (*down)(parent, cpu, sd);
+       ret = (*down)(parent, data);
+       if (ret)
+               goto out_unlock;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;
  up:
                continue;
        }
-       (*up)(parent, cpu, sd);
+       ret = (*up)(parent, data);
+       if (ret)
+               goto out_unlock;
  
        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
+ out_unlock:
        rcu_read_unlock();
+       return ret;
  }
  
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+       return 0;
+ }
+ #endif
+ #ifdef CONFIG_SMP
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       return rq->avg_load_per_task;
+ }
+ #ifdef CONFIG_FAIR_GROUP_SCHED
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
@@@ -1485,11 -1505,11 +1504,11 @@@ __update_group_shares_cpu(struct task_g
   * This needs to be done in a bottom-up fashion because the rq weight of a
   * parent group depends on the shares of its child groups.
   */
- static void
- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_shares_up(struct task_group *tg, void *data)
  {
        unsigned long rq_weight = 0;
        unsigned long shares = 0;
+       struct sched_domain *sd = data;
        int i;
  
        for_each_cpu_mask(i, sd->span) {
                __update_group_shares_cpu(tg, i, shares, rq_weight);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
+       return 0;
  }
  
  /*
   * This needs to be done in a top-down fashion because the load of a child
   * group is a fraction of its parents load.
   */
- static void
- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_load_down(struct task_group *tg, void *data)
  {
        unsigned long load;
+       long cpu = (long)data;
  
        if (!tg->parent) {
                load = cpu_rq(cpu)->load.weight;
        }
  
        tg->cfs_rq[cpu]->h_load = load;
- }
  
- static void
- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
- {
+       return 0;
  }
  
  static void update_shares(struct sched_domain *sd)
  
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+               walk_tg_tree(tg_nop, tg_shares_up, sd);
        }
  }
  
@@@ -1560,9 -1579,9 +1578,9 @@@ static void update_shares_locked(struc
        spin_lock(&rq->lock);
  }
  
- static void update_h_load(int cpu)
+ static void update_h_load(long cpu)
  {
-       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
  #else
@@@ -1920,11 -1939,8 +1938,8 @@@ unsigned long wait_task_inactive(struc
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
-               if (!match_state || p->state == match_state) {
-                       ncsw = p->nivcsw + p->nvcsw;
-                       if (unlikely(!ncsw))
-                               ncsw = 1;
-               }
+               if (!match_state || p->state == match_state)
+                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, &flags);
  
                /*
@@@ -2284,7 -2300,7 +2299,7 @@@ out_running
        trace_mark(kernel_sched_wakeup,
                "pid %d state %ld ## rq %p task %p rq->curr %p",
                p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
@@@ -2419,7 -2435,7 +2434,7 @@@ void wake_up_new_task(struct task_struc
        trace_mark(kernel_sched_wakeup_new,
                "pid %d state %ld ## rq %p task %p rq->curr %p",
                p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
                p->sched_class->task_wake_up(rq, p);
@@@ -2879,7 -2895,7 +2894,7 @@@ static void pull_task(struct rq *src_rq
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
-       check_preempt_curr(this_rq, p);
+       check_preempt_curr(this_rq, p, 0);
  }
  
  /*
@@@ -4626,6 -4642,15 +4641,15 @@@ __wake_up_sync(wait_queue_head_t *q, un
  }
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
  
+ /**
+  * complete: - signals a single thread waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up a single thread waiting on this completion. Threads will be
+  * awakened in the same order in which they were queued.
+  *
+  * See also complete_all(), wait_for_completion() and related routines.
+  */
  void complete(struct completion *x)
  {
        unsigned long flags;
  }
  EXPORT_SYMBOL(complete);
  
+ /**
+  * complete_all: - signals all threads waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up all threads waiting on this particular completion event.
+  */
  void complete_all(struct completion *x)
  {
        unsigned long flags;
@@@ -4657,10 -4688,7 +4687,7 @@@ do_wait_for_common(struct completion *x
                wait.flags |= WQ_FLAG_EXCLUSIVE;
                __add_wait_queue_tail(&x->wait, &wait);
                do {
-                       if ((state == TASK_INTERRUPTIBLE &&
-                            signal_pending(current)) ||
-                           (state == TASK_KILLABLE &&
-                            fatal_signal_pending(current))) {
+                       if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
@@@ -4688,12 -4716,31 +4715,31 @@@ wait_for_common(struct completion *x, l
        return timeout;
  }
  
+ /**
+  * wait_for_completion: - waits for completion of a task
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It is NOT
+  * interruptible and there is no timeout.
+  *
+  * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
+  * and interrupt capability. Also see complete().
+  */
  void __sched wait_for_completion(struct completion *x)
  {
        wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_for_completion);
  
+ /**
+  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. The timeout is in jiffies. It is not
+  * interruptible.
+  */
  unsigned long __sched
  wait_for_completion_timeout(struct completion *x, unsigned long timeout)
  {
  }
  EXPORT_SYMBOL(wait_for_completion_timeout);
  
+ /**
+  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits for completion of a specific task to be signaled. It is
+  * interruptible.
+  */
  int __sched wait_for_completion_interruptible(struct completion *x)
  {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_for_completion_interruptible);
  
+ /**
+  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+  */
  unsigned long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
                                          unsigned long timeout)
  }
  EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  
+ /**
+  * wait_for_completion_killable: - waits for completion of a task (killable)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It can be
+  * interrupted by a kill signal.
+  */
  int __sched wait_for_completion_killable(struct completion *x)
  {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
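
The kernel-doc blocks added above document the long-standing completion API
rather than change its behaviour. For context, a typical producer/consumer
pairing looks like the sketch below; my_work_done and the 100 ms timeout are
illustrative, not part of this merge:

    static DECLARE_COMPLETION(my_work_done);

    static void producer(void)
    {
            /* ... perform the work ... */
            complete(&my_work_done);        /* wakes exactly one waiter */
    }

    static int consumer(void)
    {
            /* wait_for_completion_timeout() returns 0 if the timeout expired */
            if (!wait_for_completion_timeout(&my_work_done,
                                             msecs_to_jiffies(100)))
                    return -ETIMEDOUT;
            return 0;
    }
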
@@@ -5120,7 -5189,8 +5188,8 @@@ recheck
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
                 */
-               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0)
                        return -EPERM;
  #endif
  
@@@ -5956,7 -6026,7 +6025,7 @@@ static int __migrate_task(struct task_s
        set_task_cpu(p, dest_cpu);
        if (on_rq) {
                activate_task(rq_dest, p, 0);
-               check_preempt_curr(rq_dest, p);
+               check_preempt_curr(rq_dest, p, 0);
        }
  done:
        ret = 1;
@@@ -6281,7 -6351,7 +6350,7 @@@ set_table_entry(struct ctl_table *entry
  static struct ctl_table *
  sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
-       struct ctl_table *table = sd_alloc_ctl_entry(12);
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
  
        if (table == NULL)
                return NULL;
                sizeof(int), 0644, proc_dointvec_minmax);
        set_table_entry(&table[10], "flags", &sd->flags,
                sizeof(int), 0644, proc_dointvec_minmax);
-       /* &table[11] is terminator */
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
  
        return table;
  }
@@@ -7193,13 -7265,21 +7264,21 @@@ static void init_sched_groups_power(in
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
  
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(sd, type)               sd->name = #type
+ #else
+ # define SD_INIT_NAME(sd, type)               do { } while (0)
+ #endif
  #define       SD_INIT(sd, type)       sd_init_##type(sd)
  #define SD_INIT_FUNC(type)    \
  static noinline void sd_init_##type(struct sched_domain *sd)  \
  {                                                             \
        memset(sd, 0, sizeof(*sd));                             \
        *sd = SD_##type##_INIT;                                 \
        sd->level = SD_LV_##type;                               \
+       SD_INIT_NAME(sd, type);                                 \
  }
  
  SD_INIT_FUNC(CPU)
@@@ -7695,24 -7775,27 +7774,27 @@@ static int dattrs_equal(struct sched_do
   * and partition_sched_domains() will fallback to the single partition
   * 'fallback_doms', it also forces the domains to be rebuilt.
   *
+  * If doms_new==NULL it will be replaced with cpu_online_map.
+  * ndoms_new==0 is a special case for destroying existing domains.
+  * It will not create the default domain.
+  *
   * Call with hotplug lock held
   */
  void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                             struct sched_domain_attr *dattr_new)
  {
-       int i, j;
+       int i, j, n;
  
        mutex_lock(&sched_domains_mutex);
  
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
  
-       if (doms_new == NULL)
-               ndoms_new = 0;
+       n = doms_new ? ndoms_new : 0;
  
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < ndoms_new; j++) {
+               for (j = 0; j < n; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@@ -7725,7 -7808,6 +7807,6 @@@ match1
  
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               ndoms_new = 1;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
                dattr_new = NULL;
@@@ -7762,8 -7844,13 +7843,13 @@@ match2
  int arch_reinit_sched_domains(void)
  {
        get_online_cpus();
+       /* Destroy domains first to force the rebuild */
+       partition_sched_domains(0, NULL, NULL);
        rebuild_sched_domains();
        put_online_cpus();
        return 0;
  }
  
@@@ -7847,7 -7934,7 +7933,7 @@@ static int update_sched_domains(struct 
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-               partition_sched_domains(0, NULL, NULL);
+               partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
  
        default:
@@@ -8234,20 -8321,25 +8320,25 @@@ void __might_sleep(char *file, int line
  #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
  
-       if ((in_atomic() || irqs_disabled()) &&
-           system_state == SYSTEM_RUNNING && !oops_in_progress) {
-               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-                       return;
-               prev_jiffy = jiffies;
-               printk(KERN_ERR "BUG: sleeping function called from invalid"
-                               " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
-               debug_show_held_locks(current);
-               if (irqs_disabled())
-                       print_irqtrace_events(current);
-               dump_stack();
-       }
+       if ((!in_atomic() && !irqs_disabled()) ||
+                   system_state != SYSTEM_RUNNING || oops_in_progress)
+               return;
+       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+               return;
+       prev_jiffy = jiffies;
+       printk(KERN_ERR
+               "BUG: sleeping function called from invalid context at %s:%d\n",
+                       file, line);
+       printk(KERN_ERR
+               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(),
+                       current->pid, current->comm);
+       debug_show_held_locks(current);
+       if (irqs_disabled())
+               print_irqtrace_events(current);
+       dump_stack();
  #endif
  }
  EXPORT_SYMBOL(__might_sleep);
@@@ -8745,73 -8837,95 +8836,95 @@@ static DEFINE_MUTEX(rt_constraints_mute
  static unsigned long to_ratio(u64 period, u64 runtime)
  {
        if (runtime == RUNTIME_INF)
-               return 1ULL << 16;
+               return 1ULL << 20;
  
-       return div64_u64(runtime << 16, period);
+       return div64_u64(runtime << 20, period);
  }
  
- #ifdef CONFIG_CGROUP_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
  {
-       struct task_group *tgi, *parent = tg->parent;
-       unsigned long total = 0;
+       struct task_struct *g, *p;
  
-       if (!parent) {
-               if (global_rt_period() < period)
-                       return 0;
+       do_each_thread(g, p) {
+               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                       return 1;
+       } while_each_thread(g, p);
  
-               return to_ratio(period, runtime) <
-                       to_ratio(global_rt_period(), global_rt_runtime());
-       }
+       return 0;
+ }
  
-       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-               return 0;
+ struct rt_schedulable_data {
+       struct task_group *tg;
+       u64 rt_period;
+       u64 rt_runtime;
+ };
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-               if (tgi == tg)
-                       continue;
+ static int tg_schedulable(struct task_group *tg, void *data)
+ {
+       struct rt_schedulable_data *d = data;
+       struct task_group *child;
+       unsigned long total, sum = 0;
+       u64 period, runtime;
+       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       runtime = tg->rt_bandwidth.rt_runtime;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       if (tg == d->tg) {
+               period = d->rt_period;
+               runtime = d->rt_runtime;
        }
-       rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) <=
-               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-                               parent->rt_bandwidth.rt_runtime);
- }
- #elif defined CONFIG_USER_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
-       struct task_group *tgi;
-       unsigned long total = 0;
-       unsigned long global_ratio =
-               to_ratio(global_rt_period(), global_rt_runtime());
+       /*
+        * Cannot have more runtime than the period.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &task_groups, list) {
-               if (tgi == tg)
-                       continue;
+       /*
+        * Ensure we don't starve existing RT tasks.
+        */
+       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+               return -EBUSY;
+       total = to_ratio(period, runtime);
+       /*
+        * Nobody can have more than the global setting allows.
+        */
+       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+               return -EINVAL;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       /*
+        * The sum of our children's runtime should not exceed our own.
+        */
+       list_for_each_entry_rcu(child, &tg->children, siblings) {
+               period = ktime_to_ns(child->rt_bandwidth.rt_period);
+               runtime = child->rt_bandwidth.rt_runtime;
+               if (child == d->tg) {
+                       period = d->rt_period;
+                       runtime = d->rt_runtime;
+               }
+               sum += to_ratio(period, runtime);
        }
-       rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) < global_ratio;
+       if (sum > total)
+               return -EINVAL;
+       return 0;
  }
- #endif
  
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_struct *g, *p;
-       do_each_thread(g, p) {
-               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                       return 1;
-       } while_each_thread(g, p);
-       return 0;
+       struct rt_schedulable_data data = {
+               .tg = tg,
+               .rt_period = period,
+               .rt_runtime = runtime,
+       };
+       return walk_tg_tree(tg_schedulable, tg_nop, &data);
  }
  
  static int tg_set_bandwidth(struct task_group *tg,
  
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
-       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-               err = -EBUSY;
-               goto unlock;
-       }
-       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-               err = -EINVAL;
+       err = __rt_schedulable(tg, rt_period, rt_runtime);
+       if (err)
                goto unlock;
-       }
  
        spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
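
As a sanity check on the new fixed-point schedulability test above, plugging in
the usual sysctl defaults (rt_period = 1 s, rt_runtime = 0.95 s; the numbers
are only illustrative):

    to_ratio(1000000000, 950000000) = (950000000 << 20) / 1000000000
                                    ~= 996147, i.e. about 0.95 in 20-bit fixed point

Raising the shift from 16 to 20 bits only increases the resolution of these
comparisons; a group asking for more runtime than its period is rejected by the
runtime > period check before any ratios are compared, and a set of children
whose ratios sum past the parent's fails the sum > total check.
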
@@@ -8897,16 -9006,25 +9005,25 @@@ long sched_group_rt_period(struct task_
  
  static int sched_rt_global_constraints(void)
  {
-       struct task_group *tg = &root_task_group;
-       u64 rt_runtime, rt_period;
+       u64 runtime, period;
        int ret = 0;
  
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+       runtime = global_rt_runtime();
+       period = global_rt_period();
+       /*
+        * Sanity check on the sysctl variables.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
        mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(tg, rt_period, rt_runtime))
-               ret = -EINVAL;
+       read_lock(&tasklist_lock);
+       ret = __rt_schedulable(NULL, 0, 0);
+       read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
  
        return ret;
@@@ -8917,6 -9035,9 +9034,9 @@@ static int sched_rt_global_constraints(
        unsigned long flags;
        int i;
  
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
        spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@@ -8977,7 -9098,6 +9097,6 @@@ cpu_cgroup_create(struct cgroup_subsys 
  
        if (!cgrp->parent) {
                /* This is early initialization for the top cgroup */
-               init_task_group.css.cgroup = cgrp;
                return &init_task_group.css;
        }
  
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
  
-       /* Bind the cgroup to task_group object we just created */
-       tg->css.cgroup = cgrp;
        return &tg->css;
  }
  
diff --combined kernel/sys.c
index 1b96401a0576ab8b3d5b09641eabc3c53080b0e4,0bc8fa3c2288110b49fad4e9eaab2326f52c69f7..fc71f99fb469a22da88b08263e2609409566173b
@@@ -1060,9 -1060,7 +1060,7 @@@ asmlinkage long sys_setsid(void
        group_leader->signal->leader = 1;
        __set_special_pids(sid);
  
-       spin_lock(&group_leader->sighand->siglock);
-       group_leader->signal->tty = NULL;
-       spin_unlock(&group_leader->sighand->siglock);
+       proc_clear_tty(group_leader);
  
        err = session;
  out:
@@@ -1351,8 -1349,10 +1349,10 @@@ asmlinkage long sys_sethostname(char __
        down_write(&uts_sem);
        errno = -EFAULT;
        if (!copy_from_user(tmp, name, len)) {
-               memcpy(utsname()->nodename, tmp, len);
-               utsname()->nodename[len] = 0;
+               struct new_utsname *u = utsname();
+               memcpy(u->nodename, tmp, len);
+               memset(u->nodename + len, 0, sizeof(u->nodename) - len);
                errno = 0;
        }
        up_write(&uts_sem);
  asmlinkage long sys_gethostname(char __user *name, int len)
  {
        int i, errno;
+       struct new_utsname *u;
  
        if (len < 0)
                return -EINVAL;
        down_read(&uts_sem);
-       i = 1 + strlen(utsname()->nodename);
+       u = utsname();
+       i = 1 + strlen(u->nodename);
        if (i > len)
                i = len;
        errno = 0;
-       if (copy_to_user(name, utsname()->nodename, i))
+       if (copy_to_user(name, u->nodename, i))
                errno = -EFAULT;
        up_read(&uts_sem);
        return errno;
@@@ -1397,8 -1399,10 +1399,10 @@@ asmlinkage long sys_setdomainname(char 
        down_write(&uts_sem);
        errno = -EFAULT;
        if (!copy_from_user(tmp, name, len)) {
-               memcpy(utsname()->domainname, tmp, len);
-               utsname()->domainname[len] = 0;
+               struct new_utsname *u = utsname();
+               memcpy(u->domainname, tmp, len);
+               memset(u->domainname + len, 0, sizeof(u->domainname) - len);
                errno = 0;
        }
        up_write(&uts_sem);
@@@ -1452,14 -1456,22 +1456,22 @@@ asmlinkage long sys_setrlimit(unsigned 
                return -EINVAL;
        if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
                return -EFAULT;
-       if (new_rlim.rlim_cur > new_rlim.rlim_max)
-               return -EINVAL;
        old_rlim = current->signal->rlim + resource;
        if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
            !capable(CAP_SYS_RESOURCE))
                return -EPERM;
-       if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
-               return -EPERM;
+       if (resource == RLIMIT_NOFILE) {
+               if (new_rlim.rlim_max == RLIM_INFINITY)
+                       new_rlim.rlim_max = sysctl_nr_open;
+               if (new_rlim.rlim_cur == RLIM_INFINITY)
+                       new_rlim.rlim_cur = sysctl_nr_open;
+               if (new_rlim.rlim_max > sysctl_nr_open)
+                       return -EPERM;
+       }
+       if (new_rlim.rlim_cur > new_rlim.rlim_max)
+               return -EINVAL;
  
        retval = security_task_setrlimit(resource, &new_rlim);
        if (retval)
@@@ -1727,16 -1739,6 +1739,16 @@@ asmlinkage long sys_prctl(int option, u
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
 +              case PR_GET_TIMERSLACK:
 +                      error = current->timer_slack_ns;
 +                      break;
 +              case PR_SET_TIMERSLACK:
 +                      if (arg2 <= 0)
 +                              current->timer_slack_ns =
 +                                      current->default_timer_slack_ns;
 +                      else
 +                              current->timer_slack_ns = arg2;
 +                      break;
                default:
                        error = -EINVAL;
                        break;
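
The two prctl() options above are the userspace-visible half of the timer-slack
work. A minimal user-level sketch, assuming PR_SET_TIMERSLACK and
PR_GET_TIMERSLACK are exported through <linux/prctl.h> by the rest of the
series (that header is not part of this diff):

    #include <stdio.h>
    #include <linux/prctl.h>        /* PR_{SET,GET}_TIMERSLACK, assumed here */
    #include <sys/prctl.h>

    int main(void)
    {
            /* request ~2 ms of slack for this task's timers (value in ns) */
            if (prctl(PR_SET_TIMERSLACK, 2000000UL, 0, 0, 0) != 0)
                    perror("PR_SET_TIMERSLACK");

            /* passing 0 restores default_timer_slack_ns, per the code above */
            printf("timer slack: %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));
            return 0;
    }
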
diff --combined kernel/time/ntp.c
index 4c8d85421d24b84429be27e096611ad86a3e686d,1ad46f3df6e76cd8994403b1c1ca72c14ec3553b..9c114b726ab3353e75fa222e8597e52db9d34a82
@@@ -142,7 -142,8 +142,7 @@@ static enum hrtimer_restart ntp_leap_se
                time_state = TIME_OOP;
                printk(KERN_NOTICE "Clock: "
                       "inserting leap second 23:59:60 UTC\n");
 -              leap_timer.expires = ktime_add_ns(leap_timer.expires,
 -                                                NSEC_PER_SEC);
 +              hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
                res = HRTIMER_RESTART;
                break;
        case TIME_DEL:
@@@ -244,7 -245,7 +244,7 @@@ static void sync_cmos_clock(unsigned lo
        if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
                fail = update_persistent_clock(now);
  
-       next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+       next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
        if (next.tv_nsec <= 0)
                next.tv_nsec += NSEC_PER_SEC;
  
diff --combined kernel/time/tick-sched.c
index b33be61c0f6bec1ece99784ba42f508f1bcaedc5,b711ffcb106c906be19eab01140ec1e8cafd0063..a547be11cf976c9abba879b4fcd1c025c78ca78d
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/profile.h>
  #include <linux/sched.h>
  #include <linux/tick.h>
+ #include <linux/module.h>
  
  #include <asm/irq_regs.h>
  
@@@ -75,6 -76,9 +76,9 @@@ static void tick_do_update_jiffies64(kt
                                                           incr * ticks);
                }
                do_timer(++ticks);
+               /* Keep the tick_next_period variable up to date */
+               tick_next_period = ktime_add(last_jiffies_update, tick_period);
        }
        write_sequnlock(&xtime_lock);
  }
@@@ -187,9 -191,17 +191,17 @@@ u64 get_cpu_idle_time_us(int cpu, u64 *
  {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
  
-       *last_update_time = ktime_to_us(ts->idle_lastupdate);
+       if (!tick_nohz_enabled)
+               return -1;
+       if (ts->idle_active)
+               *last_update_time = ktime_to_us(ts->idle_lastupdate);
+       else
+               *last_update_time = ktime_to_us(ktime_get());
        return ktime_to_us(ts->idle_sleeptime);
  }
+ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
  
  /**
   * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@@ -221,7 -233,7 +233,7 @@@ void tick_nohz_stop_sched_tick(int inid
         */
        if (unlikely(!cpu_online(cpu))) {
                if (cpu == tick_do_timer_cpu)
-                       tick_do_timer_cpu = -1;
+                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
        }
  
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
  
-       if (rcu_needs_cpu(cpu))
+       if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
        /*
         * Do not stop the tick, if we are only one off
                                goto out;
                        }
  
 -                      ts->idle_tick = ts->sched_timer.expires;
 +                      ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
                        rcu_enter_nohz();
                 * invoked.
                 */
                if (cpu == tick_do_timer_cpu)
-                       tick_do_timer_cpu = -1;
+                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
  
                ts->idle_sleeps++;
  
@@@ -419,21 -431,21 +431,21 @@@ void tick_nohz_restart_sched_tick(void
        ts->tick_stopped  = 0;
        ts->idle_exittime = now;
        hrtimer_cancel(&ts->sched_timer);
 -      ts->sched_timer.expires = ts->idle_tick;
 +      hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
  
        while (1) {
                /* Forward the time to expire in the future */
                hrtimer_forward(&ts->sched_timer, now, tick_period);
  
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 -                      hrtimer_start(&ts->sched_timer,
 -                                    ts->sched_timer.expires,
 +                      hrtimer_start_expires(&ts->sched_timer,
                                      HRTIMER_MODE_ABS);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                break;
                } else {
 -                      if (!tick_program_event(ts->sched_timer.expires, 0))
 +                      if (!tick_program_event(
 +                              hrtimer_get_expires(&ts->sched_timer), 0))
                                break;
                }
                /* Update jiffies and reread time */
  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
  {
        hrtimer_forward(&ts->sched_timer, now, tick_period);
 -      return tick_program_event(ts->sched_timer.expires, 0);
 +      return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
  }
  
  /*
@@@ -468,7 -480,7 +480,7 @@@ static void tick_nohz_handler(struct cl
         * this duty, then the jiffies update is still serialized by
         * xtime_lock.
         */
-       if (unlikely(tick_do_timer_cpu == -1))
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                tick_do_timer_cpu = cpu;
  
        /* Check, if the jiffies need an update */
@@@ -529,7 -541,7 +541,7 @@@ static void tick_nohz_switch_to_nohz(vo
        next = tick_init_jiffy_update();
  
        for (;;) {
 -              ts->sched_timer.expires = next;
 +              hrtimer_set_expires(&ts->sched_timer, next);
                if (!tick_program_event(next, 0))
                        break;
                next = ktime_add(next, tick_period);
@@@ -570,7 -582,7 +582,7 @@@ static enum hrtimer_restart tick_sched_
         * this duty, then the jiffies update is still serialized by
         * xtime_lock.
         */
-       if (unlikely(tick_do_timer_cpu == -1))
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                tick_do_timer_cpu = cpu;
  #endif
  
@@@ -622,18 -634,19 +634,18 @@@ void tick_setup_sched_timer(void
         */
        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        ts->sched_timer.function = tick_sched_timer;
-       ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  
        /* Get the next period (per cpu) */
 -      ts->sched_timer.expires = tick_init_jiffy_update();
 +      hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
        offset = ktime_to_ns(tick_period) >> 1;
        do_div(offset, num_possible_cpus());
        offset *= smp_processor_id();
 -      ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
 +      hrtimer_add_expires_ns(&ts->sched_timer, offset);
  
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
 -              hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
 -                            HRTIMER_MODE_ABS);
 +              hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;
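
Finally, get_cpu_idle_time_us() above gains an EXPORT_SYMBOL_GPL and a "nohz
disabled" error return, presumably for in-kernel users such as cpufreq
governors (their side is outside this diff). A hypothetical caller handling
the new -1 case:

    static void sample_idle(int cpu)
    {
            u64 wall;       /* microseconds, from the hrtimer clock */
            u64 idle = get_cpu_idle_time_us(cpu, &wall);

            if (idle == -1ULL)      /* tick_nohz_enabled == 0: no accounting */
                    return;         /* fall back to jiffies-based statistics */

            pr_debug("cpu%d: %llu us idle (as of %llu us)\n", cpu,
                     (unsigned long long)idle, (unsigned long long)wall);
    }
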