kernel/time/timer.c

   1 /*
   2  *  linux/kernel/timer.c
   3  *
   4  *  Kernel internal timers
   5  *
   6  *  Copyright (C) 1991, 1992  Linus Torvalds
   7  *
   8  *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
   9  *
  10  *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
  11  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
  12  *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
  13  *              serialize accesses to xtime/lost_ticks).
  14  *                              Copyright (C) 1998  Andrea Arcangeli
  15  *  1999-03-10  Improved NTP compatibility by Ulrich Windl
  16  *  2002-05-31  Move sys_sysinfo here and make its locking sane, Robert Love
  17  *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
  18  *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
  19  *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
  20  */
  21
  22 #include <linux/kernel_stat.h>
  23 #include <linux/export.h>
  24 #include <linux/interrupt.h>
  25 #include <linux/percpu.h>
  26 #include <linux/init.h>
  27 #include <linux/mm.h>
  28 #include <linux/swap.h>
  29 #include <linux/pid_namespace.h>
  30 #include <linux/notifier.h>
  31 #include <linux/thread_info.h>
  32 #include <linux/time.h>
  33 #include <linux/jiffies.h>
  34 #include <linux/posix-timers.h>
  35 #include <linux/cpu.h>
  36 #include <linux/syscalls.h>
  37 #include <linux/delay.h>
  38 #include <linux/tick.h>
  39 #include <linux/kallsyms.h>
  40 #include <linux/irq_work.h>
  41 #include <linux/sched.h>
  42 #include <linux/sched/sysctl.h>
  43 #include <linux/slab.h>
  44 #include <linux/compat.h>
  45
  46 #include <asm/uaccess.h>
  47 #include <asm/unistd.h>
  48 #include <asm/div64.h>
  49 #include <asm/timex.h>
  50 #include <asm/io.h>
  51
  52 #include "tick-internal.h"
  53
  54 #define CREATE_TRACE_POINTS
  55 #include <trace/events/timer.h>
  56
     /*
      * The 64-bit jiffies counter. It starts at INITIAL_JIFFIES rather than
      * at zero, is cacheline aligned on SMP builds and is exported to modules.
      */
  57 __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
  58
  59 EXPORT_SYMBOL(jiffies_64);
  60
  61 /*
  62  * per-CPU timer vector definitions:
      *
      * TVR_BITS is the number of index bits of the root vector (tv1) and
      * TVN_BITS the number of index bits of each outer vector (tv2..tv5);
      * both shrink when CONFIG_BASE_SMALL is set. The *_SIZE macros are the
      * resulting bucket counts and the *_MASK macros extract a bucket index.
      * MAX_TVAL is the largest expiry delta that fits into
      * TVR_BITS + 4 * TVN_BITS bits; larger deltas are clamped to it when a
      * timer is queued.
  63  */
  64 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
  65 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
  66 #define TVN_SIZE (1 << TVN_BITS)
  67 #define TVR_SIZE (1 << TVR_BITS)
  68 #define TVN_MASK (TVN_SIZE - 1)
  69 #define TVR_MASK (TVR_SIZE - 1)
  70 #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
  71
     /* One outer level of the timer wheel: TVN_SIZE buckets of queued timers. */
  72 struct tvec {
  73         struct hlist_head vec[TVN_SIZE];
  74 };
  75
     /* The innermost (root) level of the timer wheel: TVR_SIZE buckets. */
  76 struct tvec_root {
  77         struct hlist_head vec[TVR_SIZE];
  78 };
  79
     /*
      * Timer base: the lock, bookkeeping and five wheel levels that hold the
      * timers queued on one CPU. Instances are cacheline aligned.
      */
  80 struct tvec_base {
             /* Holding it locks the base and every timer tied to it. */
  81         spinlock_t lock;
             /*
              * Checked by __mod_timer() so that a timer is not moved to another
              * base while it equals this pointer. Presumably the timer whose
              * callback is currently executing; it is set outside this chunk.
              */
  82         struct timer_list *running_timer;
             /*
              * Reference time of the wheel: a timer's level is chosen from
              * expires - timer_jiffies. Reset to jiffies when a timer is added
              * to an empty base or the last timer is removed.
              */
  83         unsigned long timer_jiffies;
             /* Earliest expiry seen among the queued non-deferrable timers. */
  84         unsigned long next_timer;
             /* Number of queued timers without TIMER_DEFERRABLE. */
  85         unsigned long active_timers;
             /* Number of all queued timers, deferrable ones included. */
  86         unsigned long all_timers;
             /* CPU passed to tick_nohz_full_cpu()/wake_up_nohz_cpu() on enqueue. */
  87         int cpu;
             /* Root level, indexed by the low TVR_BITS of the expiry time. */
  88         struct tvec_root tv1;
             /* Outer levels for progressively larger expiry deltas. */
  89         struct tvec tv2;
  90         struct tvec tv3;
  91         struct tvec tv4;
  92         struct tvec tv5;
  93 } ____cacheline_aligned;
  94
  95
     /* The per-CPU timer bases; looked up with per_cpu_ptr(&tvec_bases, cpu). */
  96 static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
  97
  98 static unsigned long round_jiffies_common(unsigned long j, int cpu,
  99                 bool force_up)
 100 {
 101         int rem;
 102         unsigned long original = j;
 103
 104         /*
 105          * We don't want all cpus firing their timers at once hitting the
 106          * same lock or cachelines, so we skew each extra cpu with an extra
 107          * 3 jiffies. This 3 jiffies came originally from the mm/ code which
 108          * already did this.
 109          * The skew is done by adding 3*cpunr, then round, then subtract this
 110          * extra offset again.
 111          */
 112         j += cpu * 3;
 113
 114         rem = j % HZ;
 115
 116         /*
 117          * If the target jiffie is just after a whole second (which can happen
 118          * due to delays of the timer irq, long irq off times etc etc) then
 119          * we should round down to the whole second, not up. Use 1/4th second
 120          * as cutoff for this rounding as an extreme upper bound for this.
 121          * But never round down if @force_up is set.
 122          */
 123         if (rem < HZ/4 && !force_up) /* round down */
 124                 j = j - rem;
 125         else /* round up */
 126                 j = j - rem + HZ;
 127
 128         /* now that we have rounded, subtract the extra skew again */
 129         j -= cpu * 3;
 130
 131         /*
 132          * Make sure j is still in the future. Otherwise return the
 133          * unmodified value.
 134          */
 135         return time_is_after_jiffies(j) ? j : original;
 136 }
 137
 138 /**
 139  * __round_jiffies - function to round jiffies to a full second
 140  * @j: the time in (absolute) jiffies that should be rounded
 141  * @cpu: the processor number on which the timeout will happen
 142  *
 143  * __round_jiffies() rounds an absolute time in the future (in jiffies)
 144  * up or down to (approximately) full seconds. This is useful for timers
 145  * for which the exact time they fire does not matter too much, as long as
 146  * they fire approximately every X seconds.
 147  *
 148  * By rounding these timers to whole seconds, all such timers will fire
 149  * at the same time, rather than at various times spread out. The goal
 150  * of this is to have the CPU wake up less, which saves power.
 151  *
 152  * The exact rounding is skewed for each processor to avoid all
 153  * processors firing at the exact same time, which could lead
 154  * to lock contention or spurious cache line bouncing.
 155  *
 156  * The return value is the rounded version of the @j parameter.
 157  */
 158 unsigned long __round_jiffies(unsigned long j, int cpu)
 159 {
 160         return round_jiffies_common(j, cpu, false);
 161 }
 162 EXPORT_SYMBOL_GPL(__round_jiffies);
 163
 164 /**
 165  * __round_jiffies_relative - function to round jiffies to a full second
 166  * @j: the time in (relative) jiffies that should be rounded
 167  * @cpu: the processor number on which the timeout will happen
 168  *
 169  * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 170  * up or down to (approximately) full seconds. This is useful for timers
 171  * for which the exact time they fire does not matter too much, as long as
 172  * they fire approximately every X seconds.
 173  *
 174  * By rounding these timers to whole seconds, all such timers will fire
 175  * at the same time, rather than at various times spread out. The goal
 176  * of this is to have the CPU wake up less, which saves power.
 177  *
 178  * The exact rounding is skewed for each processor to avoid all
 179  * processors firing at the exact same time, which could lead
 180  * to lock contention or spurious cache line bouncing.
 181  *
 182  * The return value is the rounded version of the @j parameter.
 183  */
 184 unsigned long __round_jiffies_relative(unsigned long j, int cpu)
 185 {
 186         unsigned long j0 = jiffies;
 187
 188         /* Use j0 because jiffies might change while we run */
 189         return round_jiffies_common(j + j0, cpu, false) - j0;
 190 }
 191 EXPORT_SYMBOL_GPL(__round_jiffies_relative);
 192
 193 /**
 194  * round_jiffies - function to round jiffies to a full second
 195  * @j: the time in (absolute) jiffies that should be rounded
 196  *
 197  * round_jiffies() rounds an absolute time in the future (in jiffies)
 198  * up or down to (approximately) full seconds. This is useful for timers
 199  * for which the exact time they fire does not matter too much, as long as
 200  * they fire approximately every X seconds.
 201  *
 202  * By rounding these timers to whole seconds, all such timers will fire
 203  * at the same time, rather than at various times spread out. The goal
 204  * of this is to have the CPU wake up less, which saves power.
 205  *
 206  * The return value is the rounded version of the @j parameter.
 207  */
 208 unsigned long round_jiffies(unsigned long j)
 209 {
 210         return round_jiffies_common(j, raw_smp_processor_id(), false);
 211 }
 212 EXPORT_SYMBOL_GPL(round_jiffies);
 213
 214 /**
 215  * round_jiffies_relative - function to round jiffies to a full second
 216  * @j: the time in (relative) jiffies that should be rounded
 217  *
 218  * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 219  * up or down to (approximately) full seconds. This is useful for timers
 220  * for which the exact time they fire does not matter too much, as long as
 221  * they fire approximately every X seconds.
 222  *
 223  * By rounding these timers to whole seconds, all such timers will fire
 224  * at the same time, rather than at various times spread out. The goal
 225  * of this is to have the CPU wake up less, which saves power.
 226  *
 227  * The return value is the rounded version of the @j parameter.
 228  */
 229 unsigned long round_jiffies_relative(unsigned long j)
 230 {
 231         return __round_jiffies_relative(j, raw_smp_processor_id());
 232 }
 233 EXPORT_SYMBOL_GPL(round_jiffies_relative);
 234
 235 /**
 236  * __round_jiffies_up - function to round jiffies up to a full second
 237  * @j: the time in (absolute) jiffies that should be rounded
 238  * @cpu: the processor number on which the timeout will happen
 239  *
 240  * This is the same as __round_jiffies() except that it will never
 241  * round down.  This is useful for timeouts for which the exact time
 242  * of firing does not matter too much, as long as they don't fire too
 243  * early.
 244  */
 245 unsigned long __round_jiffies_up(unsigned long j, int cpu)
 246 {
 247         return round_jiffies_common(j, cpu, true);
 248 }
 249 EXPORT_SYMBOL_GPL(__round_jiffies_up);
 250
 251 /**
 252  * __round_jiffies_up_relative - function to round jiffies up to a full second
 253  * @j: the time in (relative) jiffies that should be rounded
 254  * @cpu: the processor number on which the timeout will happen
 255  *
 256  * This is the same as __round_jiffies_relative() except that it will never
 257  * round down.  This is useful for timeouts for which the exact time
 258  * of firing does not matter too much, as long as they don't fire too
 259  * early.
 260  */
 261 unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
 262 {
 263         unsigned long j0 = jiffies;
 264
 265         /* Use j0 because jiffies might change while we run */
 266         return round_jiffies_common(j + j0, cpu, true) - j0;
 267 }
 268 EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
 269
 270 /**
 271  * round_jiffies_up - function to round jiffies up to a full second
 272  * @j: the time in (absolute) jiffies that should be rounded
 273  *
 274  * This is the same as round_jiffies() except that it will never
 275  * round down.  This is useful for timeouts for which the exact time
 276  * of firing does not matter too much, as long as they don't fire too
 277  * early.
 278  */
 279 unsigned long round_jiffies_up(unsigned long j)
 280 {
 281         return round_jiffies_common(j, raw_smp_processor_id(), true);
 282 }
 283 EXPORT_SYMBOL_GPL(round_jiffies_up);
 284
 285 /**
 286  * round_jiffies_up_relative - function to round jiffies up to a full second
 287  * @j: the time in (relative) jiffies that should be rounded
 288  *
 289  * This is the same as round_jiffies_relative() except that it will never
 290  * round down.  This is useful for timeouts for which the exact time
 291  * of firing does not matter too much, as long as they don't fire too
 292  * early.
 293  */
 294 unsigned long round_jiffies_up_relative(unsigned long j)
 295 {
 296         return __round_jiffies_up_relative(j, raw_smp_processor_id());
 297 }
 298 EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 299
 300 /**
 301  * set_timer_slack - set the allowed slack for a timer
 302  * @timer: the timer to be modified
 303  * @slack_hz: the amount of time (in jiffies) allowed for rounding
 304  *
 305  * Set the amount of time, in jiffies, that a certain timer has
 306  * in terms of slack. By setting this value, the timer subsystem
 307  * will schedule the actual timer somewhere between
 308  * the time mod_timer() asks for, and that time plus the slack.
 309  *
 310  * By setting the slack to -1, a percentage of the delay is used
 311  * instead.
 312  */
 313 void set_timer_slack(struct timer_list *timer, int slack_hz)
 314 {
             /* Only recorded here; apply_slack() reads it when mod_timer() runs. */
 315         timer->slack = slack_hz;
 316 }
 317 EXPORT_SYMBOL_GPL(set_timer_slack);
 318
 319 static void
 320 __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 321 {
 322         unsigned long expires = timer->expires;
 323         unsigned long idx = expires - base->timer_jiffies;
 324         struct hlist_head *vec;
 325
 326         if (idx < TVR_SIZE) {
 327                 int i = expires & TVR_MASK;
 328                 vec = base->tv1.vec + i;
 329         } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
 330                 int i = (expires >> TVR_BITS) & TVN_MASK;
 331                 vec = base->tv2.vec + i;
 332         } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
 333                 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
 334                 vec = base->tv3.vec + i;
 335         } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
 336                 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
 337                 vec = base->tv4.vec + i;
 338         } else if ((signed long) idx < 0) {
 339                 /*
 340                  * Can happen if you add a timer with expires == jiffies,
 341                  * or you set a timer to go off in the past
 342                  */
 343                 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
 344         } else {
 345                 int i;
 346                 /* If the timeout is larger than MAX_TVAL (on 64-bit
 347                  * architectures or with CONFIG_BASE_SMALL=1) then we
 348                  * use the maximum timeout.
 349                  */
 350                 if (idx > MAX_TVAL) {
 351                         idx = MAX_TVAL;
 352                         expires = idx + base->timer_jiffies;
 353                 }
 354                 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 355                 vec = base->tv5.vec + i;
 356         }
 357
 358         hlist_add_head(&timer->entry, vec);
 359 }
 360
 361 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 362 {
 363         /* Advance base->jiffies, if the base is empty */
 364         if (!base->all_timers++)
 365                 base->timer_jiffies = jiffies;
 366
 367         __internal_add_timer(base, timer);
 368         /*
 369          * Update base->active_timers and base->next_timer
 370          */
 371         if (!(timer->flags & TIMER_DEFERRABLE)) {
 372                 if (!base->active_timers++ ||
 373                     time_before(timer->expires, base->next_timer))
 374                         base->next_timer = timer->expires;
 375         }
 376
 377         /*
 378          * Check whether the other CPU is in dynticks mode and needs
 379          * to be triggered to reevaluate the timer wheel.
 380          * We are protected against the other CPU fiddling
 381          * with the timer by holding the timer base lock. This also
 382          * makes sure that a CPU on the way to stop its tick can not
 383          * evaluate the timer wheel.
 384          *
 385          * Spare the IPI for deferrable timers on idle targets though.
 386          * The next busy ticks will take care of it. Except full dynticks
 387          * require special care against races with idle_cpu(), lets deal
 388          * with that later.
 389          */
 390         if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu))
 391                 wake_up_nohz_cpu(base->cpu);
 392 }
 393
 394 #ifdef CONFIG_TIMER_STATS
 395 void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 396 {
 397         if (timer->start_site)
 398                 return;
 399
 400         timer->start_site = addr;
 401         memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
 402         timer->start_pid = current->pid;
 403 }
 404
 405 static void timer_stats_account_timer(struct timer_list *timer)
 406 {
 407         if (likely(!timer->start_site))
 408                 return;
 409
 410         timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
 411                                  timer->function, timer->start_comm,
 412                                  timer->flags);
 413 }
 414
 415 #else
     /* Without CONFIG_TIMER_STATS there is nothing to account. */
 416 static void timer_stats_account_timer(struct timer_list *timer) {}
 417 #endif
 418
 419 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 420
     /* Forward declaration: the fixup callbacks below refer to the descriptor. */
 421 static struct debug_obj_descr timer_debug_descr;
 422
 423 static void *timer_debug_hint(void *addr)
 424 {
 425         return ((struct timer_list *) addr)->function;
 426 }
 427
 428 /*
 429  * fixup_init is called when:
 430  * - an active object is initialized
 431  */
 432 static int timer_fixup_init(void *addr, enum debug_obj_state state)
 433 {
 434         struct timer_list *timer = addr;
 435
 436         switch (state) {
 437         case ODEBUG_STATE_ACTIVE:
 438                 del_timer_sync(timer);
 439                 debug_object_init(timer, &timer_debug_descr);
 440                 return 1;
 441         default:
 442                 return 0;
 443         }
 444 }
 445
 446 /*
      * Stub timer callback for improperly used timers. The fixup handlers
      * install it on timers that were never set up; it only warns.
      */
 447 static void stub_timer(unsigned long data)
 448 {
 449         WARN_ON(1);
 450 }
 451
 452 /*
 453  * fixup_activate is called when:
 454  * - an active object is activated
 455  * - an unknown object is activated (might be a statically initialized object)
 456  */
 457 static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 458 {
 459         struct timer_list *timer = addr;
 460
 461         switch (state) {
 462
 463         case ODEBUG_STATE_NOTAVAILABLE:
 464                 /*
 465                  * This is not really a fixup. The timer was
 466                  * statically initialized. We just make sure that it
 467                  * is tracked in the object tracker.
 468                  */
 469                 if (timer->entry.pprev == NULL &&
 470                     timer->entry.next == TIMER_ENTRY_STATIC) {
 471                         debug_object_init(timer, &timer_debug_descr);
 472                         debug_object_activate(timer, &timer_debug_descr);
 473                         return 0;
 474                 } else {
 475                         setup_timer(timer, stub_timer, 0);
 476                         return 1;
 477                 }
 478                 return 0;
 479
 480         case ODEBUG_STATE_ACTIVE:
 481                 WARN_ON(1);
 482
 483         default:
 484                 return 0;
 485         }
 486 }
 487
 488 /*
 489  * fixup_free is called when:
 490  * - an active object is freed
 491  */
 492 static int timer_fixup_free(void *addr, enum debug_obj_state state)
 493 {
 494         struct timer_list *timer = addr;
 495
 496         switch (state) {
 497         case ODEBUG_STATE_ACTIVE:
 498                 del_timer_sync(timer);
 499                 debug_object_free(timer, &timer_debug_descr);
 500                 return 1;
 501         default:
 502                 return 0;
 503         }
 504 }
 505
 506 /*
 507  * fixup_assert_init is called when:
 508  * - an untracked/uninit-ed object is found
 509  */
 510 static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 511 {
 512         struct timer_list *timer = addr;
 513
 514         switch (state) {
 515         case ODEBUG_STATE_NOTAVAILABLE:
 516                 if (timer->entry.next == TIMER_ENTRY_STATIC) {
 517                         /*
 518                          * This is not really a fixup. The timer was
 519                          * statically initialized. We just make sure that it
 520                          * is tracked in the object tracker.
 521                          */
 522                         debug_object_init(timer, &timer_debug_descr);
 523                         return 0;
 524                 } else {
 525                         setup_timer(timer, stub_timer, 0);
 526                         return 1;
 527                 }
 528         default:
 529                 return 0;
 530         }
 531 }
 532
     /* debugobjects descriptor that wires up the timer hint and fixup callbacks. */
 533 static struct debug_obj_descr timer_debug_descr = {
 534         .name                   = "timer_list",
 535         .debug_hint             = timer_debug_hint,
 536         .fixup_init             = timer_fixup_init,
 537         .fixup_activate         = timer_fixup_activate,
 538         .fixup_free             = timer_fixup_free,
 539         .fixup_assert_init      = timer_fixup_assert_init,
 540 };
 541
     /* Report the initialisation of @timer to the object tracker. */
 542 static inline void debug_timer_init(struct timer_list *timer)
 543 {
 544         debug_object_init(timer, &timer_debug_descr);
 545 }
 546
     /* Report the activation of @timer to the object tracker. */
 547 static inline void debug_timer_activate(struct timer_list *timer)
 548 {
 549         debug_object_activate(timer, &timer_debug_descr);
 550 }
 551
     /* Report the deactivation of @timer to the object tracker. */
 552 static inline void debug_timer_deactivate(struct timer_list *timer)
 553 {
 554         debug_object_deactivate(timer, &timer_debug_descr);
 555 }
 556
     /* Report the release of @timer to the object tracker. */
 557 static inline void debug_timer_free(struct timer_list *timer)
 558 {
 559         debug_object_free(timer, &timer_debug_descr);
 560 }
 561
     /* Have the object tracker assert that @timer has been initialised. */
 562 static inline void debug_timer_assert_init(struct timer_list *timer)
 563 {
 564         debug_object_assert_init(timer, &timer_debug_descr);
 565 }
 566
     /* Defined further down; declared here for init_timer_on_stack_key(). */
 567 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 568                           const char *name, struct lock_class_key *key);
 569
     /*
      * Initialise a timer that lives on the stack: register it with the
      * object tracker as an on-stack object, then run the common field setup.
      */
 570 void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
 571                              const char *name, struct lock_class_key *key)
 572 {
 573         debug_object_init_on_stack(timer, &timer_debug_descr);
 574         do_init_timer(timer, flags, name, key);
 575 }
 576 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 577
 578 void destroy_timer_on_stack(struct timer_list *timer)
 579 {
 580         debug_object_free(timer, &timer_debug_descr);
 581 }
 582 EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
 583
 584 #else
     /* CONFIG_DEBUG_OBJECTS_TIMERS is off: the debug hooks compile to nothing. */
 585 static inline void debug_timer_init(struct timer_list *timer) { }
 586 static inline void debug_timer_activate(struct timer_list *timer) { }
 587 static inline void debug_timer_deactivate(struct timer_list *timer) { }
 588 static inline void debug_timer_assert_init(struct timer_list *timer) { }
 589 #endif
 590
     /* Timer initialised: notify the object tracker and emit the tracepoint. */
 591 static inline void debug_init(struct timer_list *timer)
 592 {
 593         debug_timer_init(timer);
 594         trace_timer_init(timer);
 595 }
 596
     /*
      * Timer about to be queued: notify the object tracker and emit the start
      * tracepoint with the new @expires and the timer's flags.
      */
 597 static inline void
 598 debug_activate(struct timer_list *timer, unsigned long expires)
 599 {
 600         debug_timer_activate(timer);
 601         trace_timer_start(timer, expires, timer->flags);
 602 }
 603
     /* Timer dequeued: notify the object tracker and emit the cancel tracepoint. */
 604 static inline void debug_deactivate(struct timer_list *timer)
 605 {
 606         debug_timer_deactivate(timer);
 607         trace_timer_cancel(timer);
 608 }
 609
     /* Have the object tracker check that @timer was initialised; no tracepoint. */
 610 static inline void debug_assert_init(struct timer_list *timer)
 611 {
 612         debug_timer_assert_init(timer);
 613 }
 614
 615 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 616                           const char *name, struct lock_class_key *key)
 617 {
 618         timer->entry.pprev = NULL;
 619         timer->flags = flags | raw_smp_processor_id();
 620         timer->slack = -1;
 621 #ifdef CONFIG_TIMER_STATS
 622         timer->start_site = NULL;
 623         timer->start_pid = -1;
 624         memset(timer->start_comm, 0, TASK_COMM_LEN);
 625 #endif
 626         lockdep_init_map(&timer->lockdep_map, name, key, 0);
 627 }
 628
 629 /**
 630  * init_timer_key - initialize a timer
 631  * @timer: the timer to be initialized
 632  * @flags: timer flags
 633  * @name: name of the timer
 634  * @key: lockdep class key of the fake lock used for tracking timer
 635  *       sync lock dependencies
 636  *
 637  * init_timer_key() must be called on a timer prior to calling *any* of the
 638  * other timer functions.
 639  */
 640 void init_timer_key(struct timer_list *timer, unsigned int flags,
 641                     const char *name, struct lock_class_key *key)
 642 {
             /* Debug and trace bookkeeping first, then the actual field setup. */
 643         debug_init(timer);
 644         do_init_timer(timer, flags, name, key);
 645 }
 646 EXPORT_SYMBOL(init_timer_key);
 647
 648 static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 649 {
 650         struct hlist_node *entry = &timer->entry;
 651
 652         debug_deactivate(timer);
 653
 654         __hlist_del(entry);
 655         if (clear_pending)
 656                 entry->pprev = NULL;
 657         entry->next = LIST_POISON2;
 658 }
 659
 660 static inline void
 661 detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 662 {
 663         detach_timer(timer, true);
 664         if (!(timer->flags & TIMER_DEFERRABLE))
 665                 base->active_timers--;
 666         base->all_timers--;
 667 }
 668
 669 static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 670                              bool clear_pending)
 671 {
 672         if (!timer_pending(timer))
 673                 return 0;
 674
 675         detach_timer(timer, clear_pending);
 676         if (!(timer->flags & TIMER_DEFERRABLE)) {
 677                 base->active_timers--;
 678                 if (timer->expires == base->next_timer)
 679                         base->next_timer = base->timer_jiffies;
 680         }
 681         /* If this was the last timer, advance base->jiffies */
 682         if (!--base->all_timers)
 683                 base->timer_jiffies = jiffies;
 684         return 1;
 685 }
 686
 687 /*
 688  * We are using hashed locking: holding per_cpu(tvec_bases).lock
 689  * means that all timers which are tied to this base via timer->base are
 690  * locked, and the base itself is locked too.
 691  *
 692  * So __run_timers/migrate_timers can safely modify all timers which could
 693  * be found on ->tvX lists.
 694  *
 695  * When the timer's base is locked and removed from the list, the
 696  * TIMER_MIGRATING flag is set, FIXME
 697  */
 698 static struct tvec_base *lock_timer_base(struct timer_list *timer,
 699                                         unsigned long *flags)
 700         __acquires(timer->base->lock)
 701 {
 702         for (;;) {
 703                 u32 tf = timer->flags;
 704                 struct tvec_base *base;
 705
 706                 if (!(tf & TIMER_MIGRATING)) {
 707                         base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
 708                         spin_lock_irqsave(&base->lock, *flags);
 709                         if (timer->flags == tf)
 710                                 return base;
 711                         spin_unlock_irqrestore(&base->lock, *flags);
 712                 }
 713                 cpu_relax();
 714         }
 715 }
 716
     /*
      * Common implementation behind the mod_timer*() variants.
      *
      * @timer:        the timer to (re)queue
      * @expires:      new expiry time in jiffies
      * @pending_only: if true, leave a timer that was not pending untouched
      * @pinned:       handed to get_nohz_timer_target() to pick the target CPU
      *
      * Returns 1 if the timer was pending on entry and 0 otherwise.
      */
 717 static inline int
 718 __mod_timer(struct timer_list *timer, unsigned long expires,
 719                                                 bool pending_only, int pinned)
 720 {
 721         struct tvec_base *base, *new_base;
 722         unsigned long flags;
 723         int ret = 0 , cpu;
 724
 725         timer_stats_timer_set_start_info(timer);
 726         BUG_ON(!timer->function);
 727
             /* Returns with base->lock held and interrupts disabled. */
 728         base = lock_timer_base(timer, &flags);
 729
             /*
              * Dequeue the timer if it is pending. clear_pending is false, so
              * detach_timer() leaves entry.pprev as it was.
              */
 730         ret = detach_if_pending(timer, base, false);
 731         if (!ret && pending_only)
 732                 goto out_unlock;
 733
 734         debug_activate(timer, expires);
 735
             /* Pick the CPU, and with it the base, that should hold the timer. */
 736         cpu = get_nohz_timer_target(pinned);
 737         new_base = per_cpu_ptr(&tvec_bases, cpu);
 738
 739         if (base != new_base) {
 740                 /*
 741                  * We are trying to schedule the timer on the local CPU.
 742                  * However we can't change timer's base while it is running,
 743                  * otherwise del_timer_sync() can't detect that the timer's
 744                  * handler yet has not finished. This also guarantees that
 745                  * the timer is serialized wrt itself.
 746                  */
 747                 if (likely(base->running_timer != timer)) {
 748                         /* See the comment in lock_timer_base() */
 749                         timer->flags |= TIMER_MIGRATING;
 750
                             /*
                              * Hand over from the old to the new base lock.
                              * The plain spin_unlock()/spin_lock() pair keeps
                              * interrupts disabled across the switch.
                              */
 751                         spin_unlock(&base->lock);
 752                         base = new_base;
 753                         spin_lock(&base->lock);
                             /*
                              * Replace the base bits with the new CPU number.
                              * NOTE(review): presumably this also clears
                              * TIMER_MIGRATING; TIMER_BASEMASK is defined
                              * outside this file - confirm.
                              */
 754                         timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 755                 }
 756         }
 757
 758         timer->expires = expires;
 759         internal_add_timer(base, timer);
 760
 761 out_unlock:
             /* Restores the interrupt state saved by lock_timer_base(). */
 762         spin_unlock_irqrestore(&base->lock, flags);
 763
 764         return ret;
 765 }
 766
 767 /**
 768  * mod_timer_pending - modify a pending timer's timeout
 769  * @timer: the pending timer to be modified
 770  * @expires: new timeout in jiffies
 771  *
 772  * mod_timer_pending() is the same for pending timers as mod_timer(),
 773  * but will not re-activate and modify already deleted timers.
 774  *
 775  * It is useful for unserialized use of timers.
 776  */
 777 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 778 {
 779         return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
 780 }
 781 EXPORT_SYMBOL(mod_timer_pending);
 782
 783 /*
 784  * Decide where to put the timer while taking the slack into account
 785  *
 786  * Algorithm:
 787  *   1) calculate the maximum (absolute) time
 788  *   2) calculate the highest bit where the expires and new max are different
 789  *   3) use this bit to make a mask
 790  *   4) use the bitmask to round down the maximum time, so that all last
 791  *      bits are zeros
 792  */
 793 static inline
 794 unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
 795 {
 796         unsigned long expires_limit, mask;
 797         int bit;
 798
 799         if (timer->slack >= 0) {
 800                 expires_limit = expires + timer->slack;
 801         } else {
 802                 long delta = expires - jiffies;
 803
 804                 if (delta < 256)
 805                         return expires;
 806
 807                 expires_limit = expires + delta / 256;
 808         }
 809         mask = expires ^ expires_limit;
 810         if (mask == 0)
 811                 return expires;
 812
 813         bit = find_last_bit(&mask, BITS_PER_LONG);
 814
 815         mask = (1UL << bit) - 1;
 816
 817         expires_limit = expires_limit & ~(mask);
 818
 819         return expires_limit;
 820 }
 821
 822 /**
 823  * mod_timer - modify a timer's timeout
 824  * @timer: the timer to be modified
 825  * @expires: new timeout in jiffies
 826  *
 827  * mod_timer() is a more efficient way to update the expire field of an
 828  * active timer (if the timer is inactive it will be activated)
 829  *
 830  * mod_timer(timer, expires) is equivalent to:
 831  *
 832  *     del_timer(timer); timer->expires = expires; add_timer(timer);
 833  *
 834  * Note that if there are multiple unserialized concurrent users of the
 835  * same timer, then mod_timer() is the only safe way to modify the timeout,
 836  * since add_timer() cannot modify an already running timer.
 837  *
 838  * The function returns whether it has modified a pending timer or not.
 839  * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
 840  * active timer returns 1.)
 841  */
 842 int mod_timer(struct timer_list *timer, unsigned long expires)
 843 {
 844         expires = apply_slack(timer, expires);
 845
 846         /*
 847          * This is a common optimization triggered by the
 848          * networking code - if the timer is re-modified
 849          * to be the same thing then just return:
 850          */
 851         if (timer_pending(timer) && timer->expires == expires)
 852                 return 1;
 853
 854         return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
 855 }
 856 EXPORT_SYMBOL(mod_timer);
 857
 858 /**
 859  * mod_timer_pinned - modify a timer's timeout
 860  * @timer: the timer to be modified
 861  * @expires: new timeout in jiffies
 862  *
 863  * mod_timer_pinned() is a way to update the expire field of an
 864  * active timer (if the timer is inactive it will be activated)
 865  * and to ensure that the timer is scheduled on the current CPU.
 866  *
 867  * Note that this does not prevent the timer from being migrated
 868  * when the current CPU goes offline.  If this is a problem for
 869  * you, use CPU-hotplug notifiers to handle it correctly, for
 870  * example, cancelling the timer when the corresponding CPU goes
 871  * offline.
 872  *
 873  * mod_timer_pinned(timer, expires) is equivalent to:
 874  *
 875  *     del_timer(timer); timer->expires = expires; add_timer(timer);
 876  */
 877 int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
 878 {
 879         if (timer->expires == expires && timer_pending(timer))
 880                 return 1;
 881
 882         return __mod_timer(timer, expires, false, TIMER_PINNED);
 883 }
 884 EXPORT_SYMBOL(mod_timer_pinned);
 885
 886 /**
 887  * add_timer - start a timer
 888  * @timer: the timer to be added
 889  *
 890  * The kernel will do a ->function(->data) callback from the
 891  * timer interrupt at the ->expires point in the future. The
 892  * current time is 'jiffies'.
 893  *
 894  * The timer's ->expires, ->function (and if the handler uses it, ->data)
 895  * fields must be set prior calling this function.
 896  *
 897  * Timers with an ->expires field in the past will be executed in the next
 898  * timer tick.
 899  */
 900 void add_timer(struct timer_list *timer)
 901 {
 902         BUG_ON(timer_pending(timer));
 903         mod_timer(timer, timer->expires);
 904 }
 905 EXPORT_SYMBOL(add_timer);
 906
 907 /**
 908  * add_timer_on - start a timer on a particular CPU
 909  * @timer: the timer to be added
 910  * @cpu: the CPU to start it on
 911  *
 912  * This is not very scalable on SMP. Double adds are not possible.
 913  */
 914 void add_timer_on(struct timer_list *timer, int cpu)
 915 {
 916         struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 917         unsigned long flags;
 918
 919         timer_stats_timer_set_start_info(timer);
 920         BUG_ON(timer_pending(timer) || !timer->function);
 921         spin_lock_irqsave(&base->lock, flags);
 922         timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 923         debug_activate(timer, timer->expires);
 924         internal_add_timer(base, timer);
 925         spin_unlock_irqrestore(&base->lock, flags);
 926 }
 927 EXPORT_SYMBOL_GPL(add_timer_on);
 928
 929 /**
 930  * del_timer - deactive a timer.
 931  * @timer: the timer to be deactivated
 932  *
 933  * del_timer() deactivates a timer - this works on both active and inactive
 934  * timers.
 935  *
 936  * The function returns whether it has deactivated a pending timer or not.
 937  * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
 938  * active timer returns 1.)
 939  */
 940 int del_timer(struct timer_list *timer)
 941 {
 942         struct tvec_base *base;
 943         unsigned long flags;
 944         int ret = 0;
 945
 946         debug_assert_init(timer);
 947
 948         timer_stats_timer_clear_start_info(timer);
 949         if (timer_pending(timer)) {
 950                 base = lock_timer_base(timer, &flags);
 951                 ret = detach_if_pending(timer, base, true);
 952                 spin_unlock_irqrestore(&base->lock, flags);
 953         }
 954
 955         return ret;
 956 }
 957 EXPORT_SYMBOL(del_timer);
 958
 959 /**
 960  * try_to_del_timer_sync - Try to deactivate a timer
 961  * @timer: timer do del
 962  *
 963  * This function tries to deactivate a timer. Upon successful (ret >= 0)
 964  * exit the timer is not queued and the handler is not running on any CPU.
 965  */
 966 int try_to_del_timer_sync(struct timer_list *timer)
 967 {
 968         struct tvec_base *base;
 969         unsigned long flags;
 970         int ret = -1;
 971
 972         debug_assert_init(timer);
 973
 974         base = lock_timer_base(timer, &flags);
 975
 976         if (base->running_timer != timer) {
 977                 timer_stats_timer_clear_start_info(timer);
 978                 ret = detach_if_pending(timer, base, true);
 979         }
 980         spin_unlock_irqrestore(&base->lock, flags);
 981
 982         return ret;
 983 }
 984 EXPORT_SYMBOL(try_to_del_timer_sync);
 985
 986 #ifdef CONFIG_SMP
 987 /**
 988  * del_timer_sync - deactivate a timer and wait for the handler to finish.
 989  * @timer: the timer to be deactivated
 990  *
 991  * This function only differs from del_timer() on SMP: besides deactivating
 992  * the timer it also makes sure the handler has finished executing on other
 993  * CPUs.
 994  *
 995  * Synchronization rules: Callers must prevent restarting of the timer,
 996  * otherwise this function is meaningless. It must not be called from
 997  * interrupt contexts unless the timer is an irqsafe one. The caller must
 998  * not hold locks which would prevent completion of the timer's
 999  * handler. The timer's handler must not call add_timer_on(). Upon exit the
1000  * timer is not queued and the handler is not running on any CPU.
1001  *
1002  * Note: For !irqsafe timers, you must not hold locks that are held in
1003  *   interrupt context while calling this function. Even if the lock has
1004  *   nothing to do with the timer in question.  Here's why:
1005  *
1006  *    CPU0                             CPU1
1007  *    ----                             ----
1008  *                                   <SOFTIRQ>
1009  *                                   call_timer_fn();
1010  *                                     base->running_timer = mytimer;
1011  *  spin_lock_irq(somelock);
1012  *                                     <IRQ>
1013  *                                        spin_lock(somelock);
1014  *  del_timer_sync(mytimer);
1015  *   while (base->running_timer == mytimer);
1016  *
1017  * Now del_timer_sync() will never return and never release somelock.
1018  * The interrupt on the other CPU is waiting to grab somelock but
1019  * it has interrupted the softirq that CPU0 is waiting to finish.
1020  *
1021  * The function returns whether it has deactivated a pending timer or not.
1022  */
1023 int del_timer_sync(struct timer_list *timer)
1024 {
1025 #ifdef CONFIG_LOCKDEP
1026         unsigned long flags;
1027
1028         /*
1029          * If lockdep gives a backtrace here, please reference
1030          * the synchronization rules above.
1031          */
1032         local_irq_save(flags);
1033         lock_map_acquire(&timer->lockdep_map);
1034         lock_map_release(&timer->lockdep_map);
1035         local_irq_restore(flags);
1036 #endif
1037         /*
1038          * don't use it in hardirq context, because it
1039          * could lead to deadlock.
1040          */
1041         WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
1042         for (;;) {
1043                 int ret = try_to_del_timer_sync(timer);
1044                 if (ret >= 0)
1045                         return ret;
1046                 cpu_relax();
1047         }
1048 }
1049 EXPORT_SYMBOL(del_timer_sync);
1050 #endif
1051
1052 static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1053 {
1054         /* cascade all the timers from tv up one level */
1055         struct timer_list *timer;
1056         struct hlist_node *tmp;
1057         struct hlist_head tv_list;
1058
1059         hlist_move_list(tv->vec + index, &tv_list);
1060
1061         /*
1062          * We are removing _all_ timers from the list, so we
1063          * don't have to detach them individually.
1064          */
1065         hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1066                 /* No accounting, while moving them */
1067                 __internal_add_timer(base, timer);
1068         }
1069
1070         return index;
1071 }
1072
1073 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1074                           unsigned long data)
1075 {
1076         int count = preempt_count();
1077
1078 #ifdef CONFIG_LOCKDEP
1079         /*
1080          * It is permissible to free the timer from inside the
1081          * function that is called from it, this we need to take into
1082          * account for lockdep too. To avoid bogus "held lock freed"
1083          * warnings as well as problems when looking into
1084          * timer->lockdep_map, make a copy and use that here.
1085          */
1086         struct lockdep_map lockdep_map;
1087
1088         lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1089 #endif
1090         /*
1091          * Couple the lock chain with the lock chain at
1092          * del_timer_sync() by acquiring the lock_map around the fn()
1093          * call here and in del_timer_sync().
1094          */
1095         lock_map_acquire(&lockdep_map);
1096
1097         trace_timer_expire_entry(timer);
1098         fn(data);
1099         trace_timer_expire_exit(timer);
1100
1101         lock_map_release(&lockdep_map);
1102
1103         if (count != preempt_count()) {
1104                 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1105                           fn, count, preempt_count());
1106                 /*
1107                  * Restore the preempt count. That gives us a decent
1108                  * chance to survive and extract information. If the
1109                  * callback kept a lock held, bad luck, but not worse
1110                  * than the BUG() we had.
1111                  */
1112                 preempt_count_set(count);
1113         }
1114 }
1115
1116 #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
1117
1118 /**
1119  * __run_timers - run all expired timers (if any) on this CPU.
1120  * @base: the timer vector to be processed.
1121  *
1122  * This function cascades all vectors and executes all expired timer
1123  * vectors.
1124  */
1125 static inline void __run_timers(struct tvec_base *base)
1126 {
1127         struct timer_list *timer;
1128
1129         spin_lock_irq(&base->lock);
1130
1131         while (time_after_eq(jiffies, base->timer_jiffies)) {
1132                 struct hlist_head work_list;
1133                 struct hlist_head *head = &work_list;
1134                 int index;
1135
1136                 if (!base->all_timers) {
1137                         base->timer_jiffies = jiffies;
1138                         break;
1139                 }
1140
1141                 index = base->timer_jiffies & TVR_MASK;
1142
1143                 /*
1144                  * Cascade timers:
1145                  */
1146                 if (!index &&
1147                         (!cascade(base, &base->tv2, INDEX(0))) &&
1148                                 (!cascade(base, &base->tv3, INDEX(1))) &&
1149                                         !cascade(base, &base->tv4, INDEX(2)))
1150                         cascade(base, &base->tv5, INDEX(3));
1151                 ++base->timer_jiffies;
1152                 hlist_move_list(base->tv1.vec + index, head);
1153                 while (!hlist_empty(head)) {
1154                         void (*fn)(unsigned long);
1155                         unsigned long data;
1156                         bool irqsafe;
1157
1158                         timer = hlist_entry(head->first, struct timer_list, entry);
1159                         fn = timer->function;
1160                         data = timer->data;
1161                         irqsafe = timer->flags & TIMER_IRQSAFE;
1162
1163                         timer_stats_account_timer(timer);
1164
1165                         base->running_timer = timer;
1166                         detach_expired_timer(timer, base);
1167
1168                         if (irqsafe) {
1169                                 spin_unlock(&base->lock);
1170                                 call_timer_fn(timer, fn, data);
1171                                 spin_lock(&base->lock);
1172                         } else {
1173                                 spin_unlock_irq(&base->lock);
1174                                 call_timer_fn(timer, fn, data);
1175                                 spin_lock_irq(&base->lock);
1176                         }
1177                 }
1178         }
1179         base->running_timer = NULL;
1180         spin_unlock_irq(&base->lock);
1181 }
1182
1183 #ifdef CONFIG_NO_HZ_COMMON
1184 /*
1185  * Find out when the next timer event is due to happen. This
1186  * is used on S/390 to stop all activity when a CPU is idle.
1187  * This function needs to be called with interrupts disabled.
1188  */
1189 static unsigned long __next_timer_interrupt(struct tvec_base *base)
1190 {
1191         unsigned long timer_jiffies = base->timer_jiffies;
1192         unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
1193         int index, slot, array, found = 0;
1194         struct timer_list *nte;
1195         struct tvec *varray[4];
1196
1197         /* Look for timer events in tv1. */
1198         index = slot = timer_jiffies & TVR_MASK;
1199         do {
1200                 hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
1201                         if (nte->flags & TIMER_DEFERRABLE)
1202                                 continue;
1203
1204                         found = 1;
1205                         expires = nte->expires;
1206                         /* Look at the cascade bucket(s)? */
1207                         if (!index || slot < index)
1208                                 goto cascade;
1209                         return expires;
1210                 }
1211                 slot = (slot + 1) & TVR_MASK;
1212         } while (slot != index);
1213
1214 cascade:
1215         /* Calculate the next cascade event */
1216         if (index)
1217                 timer_jiffies += TVR_SIZE - index;
1218         timer_jiffies >>= TVR_BITS;
1219
1220         /* Check tv2-tv5. */
1221         varray[0] = &base->tv2;
1222         varray[1] = &base->tv3;
1223         varray[2] = &base->tv4;
1224         varray[3] = &base->tv5;
1225
1226         for (array = 0; array < 4; array++) {
1227                 struct tvec *varp = varray[array];
1228
1229                 index = slot = timer_jiffies & TVN_MASK;
1230                 do {
1231                         hlist_for_each_entry(nte, varp->vec + slot, entry) {
1232                                 if (nte->flags & TIMER_DEFERRABLE)
1233                                         continue;
1234
1235                                 found = 1;
1236                                 if (time_before(nte->expires, expires))
1237                                         expires = nte->expires;
1238                         }
1239                         /*
1240                          * Do we still search for the first timer or are
1241                          * we looking up the cascade buckets ?
1242                          */
1243                         if (found) {
1244                                 /* Look at the cascade bucket(s)? */
1245                                 if (!index || slot < index)
1246                                         break;
1247                                 return expires;
1248                         }
1249                         slot = (slot + 1) & TVN_MASK;
1250                 } while (slot != index);
1251
1252                 if (index)
1253                         timer_jiffies += TVN_SIZE - index;
1254                 timer_jiffies >>= TVN_BITS;
1255         }
1256         return expires;
1257 }
1258
1259 /*
1260  * Check, if the next hrtimer event is before the next timer wheel
1261  * event:
1262  */
1263 static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1264 {
1265         u64 nextevt = hrtimer_get_next_event();
1266
1267         /*
1268          * If high resolution timers are enabled
1269          * hrtimer_get_next_event() returns KTIME_MAX.
1270          */
1271         if (expires <= nextevt)
1272                 return expires;
1273
1274         /*
1275          * If the next timer is already expired, return the tick base
1276          * time so the tick is fired immediately.
1277          */
1278         if (nextevt <= basem)
1279                 return basem;
1280
1281         /*
1282          * Round up to the next jiffie. High resolution timers are
1283          * off, so the hrtimers are expired in the tick and we need to
1284          * make sure that this tick really expires the timer to avoid
1285          * a ping pong of the nohz stop code.
1286          *
1287          * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
1288          */
1289         return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
1290 }
1291
1292 /**
1293  * get_next_timer_interrupt - return the time (clock mono) of the next timer
1294  * @basej:      base time jiffies
1295  * @basem:      base time clock monotonic
1296  *
1297  * Returns the tick aligned clock monotonic time of the next pending
1298  * timer or KTIME_MAX if no timer is pending.
1299  */
1300 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1301 {
1302         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
1303         u64 expires = KTIME_MAX;
1304         unsigned long nextevt;
1305
1306         /*
1307          * Pretend that there is no timer pending if the cpu is offline.
1308          * Possible pending timers will be migrated later to an active cpu.
1309          */
1310         if (cpu_is_offline(smp_processor_id()))
1311                 return expires;
1312
1313         spin_lock(&base->lock);
1314         if (base->active_timers) {
1315                 if (time_before_eq(base->next_timer, base->timer_jiffies))
1316                         base->next_timer = __next_timer_interrupt(base);
1317                 nextevt = base->next_timer;
1318                 if (time_before_eq(nextevt, basej))
1319                         expires = basem;
1320                 else
1321                         expires = basem + (nextevt - basej) * TICK_NSEC;
1322         }
1323         spin_unlock(&base->lock);
1324
1325         return cmp_next_hrtimer_event(basem, expires);
1326 }
1327 #endif
1328
1329 /*
1330  * Called from the timer interrupt handler to charge one tick to the current
1331  * process.  user_tick is 1 if the tick is user time, 0 for system.
1332  */
1333 void update_process_times(int user_tick)
1334 {
1335         struct task_struct *p = current;
1336
1337         /* Note: this timer irq context must be accounted for as well. */
1338         account_process_tick(p, user_tick);
1339         run_local_timers();
1340         rcu_check_callbacks(user_tick);
1341 #ifdef CONFIG_IRQ_WORK
1342         if (in_irq())
1343                 irq_work_tick();
1344 #endif
1345         scheduler_tick();
1346         run_posix_cpu_timers(p);
1347 }
1348
1349 /*
1350  * This function runs timers and the timer-tq in bottom half context.
1351  */
1352 static void run_timer_softirq(struct softirq_action *h)
1353 {
1354         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
1355
1356         if (time_after_eq(jiffies, base->timer_jiffies))
1357                 __run_timers(base);
1358 }
1359
1360 /*
1361  * Called by the local, per-CPU timer interrupt on SMP.
1362  */
1363 void run_local_timers(void)
1364 {
1365         hrtimer_run_queues();
1366         raise_softirq(TIMER_SOFTIRQ);
1367 }
1368
1369 #ifdef __ARCH_WANT_SYS_ALARM
1370
1371 /*
1372  * For backwards compatibility?  This can be done in libc so Alpha
1373  * and all newer ports shouldn't need it.
1374  */
1375 SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1376 {
1377         return alarm_setitimer(seconds);
1378 }
1379
1380 #endif
1381
/*
 * Timer callback used by schedule_timeout(): @__data carries the task
 * pointer that was passed as timer data when the timer was set up.
 */
static void process_timeout(unsigned long __data)
{
        struct task_struct *sleeper = (struct task_struct *)__data;

        wake_up_process(sleeper);
}
1386
1387 /**
1388  * schedule_timeout - sleep until timeout
1389  * @timeout: timeout value in jiffies
1390  *
1391  * Make the current task sleep until @timeout jiffies have
1392  * elapsed. The routine will return immediately unless
1393  * the current task state has been set (see set_current_state()).
1394  *
1395  * You can set the task state as follows -
1396  *
1397  * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1398  * pass before the routine returns. The routine will return 0
1399  *
1400  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1401  * delivered to the current task. In this case the remaining time
1402  * in jiffies will be returned, or 0 if the timer expired in time
1403  *
1404  * The current task state is guaranteed to be TASK_RUNNING when this
1405  * routine returns.
1406  *
1407  * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1408  * the CPU away without a bound on the timeout. In this case the return
1409  * value will be %MAX_SCHEDULE_TIMEOUT.
1410  *
1411  * In all cases the return value is guaranteed to be non-negative.
1412  */
1413 signed long __sched schedule_timeout(signed long timeout)
1414 {
1415         struct timer_list timer;
1416         unsigned long expire;
1417
1418         switch (timeout)
1419         {
1420         case MAX_SCHEDULE_TIMEOUT:
1421                 /*
1422                  * These two special cases are useful to be comfortable
1423                  * in the caller. Nothing more. We could take
1424                  * MAX_SCHEDULE_TIMEOUT from one of the negative value
1425                  * but I' d like to return a valid offset (>=0) to allow
1426                  * the caller to do everything it want with the retval.
1427                  */
1428                 schedule();
1429                 goto out;
1430         default:
1431                 /*
1432                  * Another bit of PARANOID. Note that the retval will be
1433                  * 0 since no piece of kernel is supposed to do a check
1434                  * for a negative retval of schedule_timeout() (since it
1435                  * should never happens anyway). You just have the printk()
1436                  * that will tell you if something is gone wrong and where.
1437                  */
1438                 if (timeout < 0) {
1439                         printk(KERN_ERR "schedule_timeout: wrong timeout "
1440                                 "value %lx\n", timeout);
1441                         dump_stack();
1442                         current->state = TASK_RUNNING;
1443                         goto out;
1444                 }
1445         }
1446
1447         expire = timeout + jiffies;
1448
1449         setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1450         __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1451         schedule();
1452         del_singleshot_timer_sync(&timer);
1453
1454         /* Remove the timer from the object tracker */
1455         destroy_timer_on_stack(&timer);
1456
1457         timeout = expire - jiffies;
1458
1459  out:
1460         return timeout < 0 ? 0 : timeout;
1461 }
1462 EXPORT_SYMBOL(schedule_timeout);
1463
1464 /*
1465  * We can use __set_current_state() here because schedule_timeout() calls
1466  * schedule() unconditionally.
1467  */
1468 signed long __sched schedule_timeout_interruptible(signed long timeout)
1469 {
1470         __set_current_state(TASK_INTERRUPTIBLE);
1471         return schedule_timeout(timeout);
1472 }
1473 EXPORT_SYMBOL(schedule_timeout_interruptible);
1474
1475 signed long __sched schedule_timeout_killable(signed long timeout)
1476 {
1477         __set_current_state(TASK_KILLABLE);
1478         return schedule_timeout(timeout);
1479 }
1480 EXPORT_SYMBOL(schedule_timeout_killable);
1481
1482 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1483 {
1484         __set_current_state(TASK_UNINTERRUPTIBLE);
1485         return schedule_timeout(timeout);
1486 }
1487 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1488
1489 #ifdef CONFIG_HOTPLUG_CPU
1490 static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
1491 {
1492         struct timer_list *timer;
1493         int cpu = new_base->cpu;
1494
1495         while (!hlist_empty(head)) {
1496                 timer = hlist_entry(head->first, struct timer_list, entry);
1497                 /* We ignore the accounting on the dying cpu */
1498                 detach_timer(timer, false);
1499                 timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
1500                 internal_add_timer(new_base, timer);
1501         }
1502 }
1503
1504 static void migrate_timers(int cpu)
1505 {
1506         struct tvec_base *old_base;
1507         struct tvec_base *new_base;
1508         int i;
1509
1510         BUG_ON(cpu_online(cpu));
1511         old_base = per_cpu_ptr(&tvec_bases, cpu);
1512         new_base = this_cpu_ptr(&tvec_bases);
1513         /*
1514          * The caller is globally serialized and nobody else
1515          * takes two locks at once, deadlock is not possible.
1516          */
1517         spin_lock_irq(&new_base->lock);
1518         spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1519
1520         BUG_ON(old_base->running_timer);
1521
1522         for (i = 0; i < TVR_SIZE; i++)
1523                 migrate_timer_list(new_base, old_base->tv1.vec + i);
1524         for (i = 0; i < TVN_SIZE; i++) {
1525                 migrate_timer_list(new_base, old_base->tv2.vec + i);
1526                 migrate_timer_list(new_base, old_base->tv3.vec + i);
1527                 migrate_timer_list(new_base, old_base->tv4.vec + i);
1528                 migrate_timer_list(new_base, old_base->tv5.vec + i);
1529         }
1530
1531         old_base->active_timers = 0;
1532         old_base->all_timers = 0;
1533
1534         spin_unlock(&old_base->lock);
1535         spin_unlock_irq(&new_base->lock);
1536 }
1537
1538 static int timer_cpu_notify(struct notifier_block *self,
1539                                 unsigned long action, void *hcpu)
1540 {
1541         switch (action) {
1542         case CPU_DEAD:
1543         case CPU_DEAD_FROZEN:
1544                 migrate_timers((long)hcpu);
1545                 break;
1546         default:
1547                 break;
1548         }
1549
1550         return NOTIFY_OK;
1551 }
1552
1553 static inline void timer_register_cpu_notifier(void)
1554 {
1555         cpu_notifier(timer_cpu_notify, 0);
1556 }
1557 #else
/* Without CONFIG_HOTPLUG_CPU there is no notifier to register. */
static inline void timer_register_cpu_notifier(void)
{
}
1559 #endif /* CONFIG_HOTPLUG_CPU */
1560
1561 static void __init init_timer_cpu(int cpu)
1562 {
1563         struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
1564
1565         base->cpu = cpu;
1566         spin_lock_init(&base->lock);
1567
1568         base->timer_jiffies = jiffies;
1569         base->next_timer = base->timer_jiffies;
1570 }
1571
1572 static void __init init_timer_cpus(void)
1573 {
1574         int cpu;
1575
1576         for_each_possible_cpu(cpu)
1577                 init_timer_cpu(cpu);
1578 }
1579
1580 void __init init_timers(void)
1581 {
1582         init_timer_cpus();
1583         init_timer_stats();
1584         timer_register_cpu_notifier();
1585         open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1586 }
1587
1588 /**
1589  * msleep - sleep safely even with waitqueue interruptions
1590  * @msecs: Time in milliseconds to sleep for
1591  */
void msleep(unsigned int msecs)
{
        /* One jiffy is added on top of the converted duration. */
        unsigned long remaining = msecs_to_jiffies(msecs) + 1;

        /* Go back to sleep until no jiffies are reported as left. */
        while (remaining) {
                remaining = schedule_timeout_uninterruptible(remaining);
        }
}

EXPORT_SYMBOL(msleep);
1601
1602 /**
1603  * msleep_interruptible - sleep waiting for signals
1604  * @msecs: Time in milliseconds to sleep for
1605  */
1606 unsigned long msleep_interruptible(unsigned int msecs)
1607 {
1608         unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1609
1610         while (timeout && !signal_pending(current))
1611                 timeout = schedule_timeout_interruptible(timeout);
1612         return jiffies_to_msecs(timeout);
1613 }
1614
1615 EXPORT_SYMBOL(msleep_interruptible);
1616
1617 static void __sched do_usleep_range(unsigned long min, unsigned long max)
1618 {
1619         ktime_t kmin;
1620         unsigned long delta;
1621
1622         kmin = ktime_set(0, min * NSEC_PER_USEC);
1623         delta = (max - min) * NSEC_PER_USEC;
1624         schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
1625 }
1626
1627 /**
1628  * usleep_range - Drop in replacement for udelay where wakeup is flexible
1629  * @min: Minimum time in usecs to sleep
1630  * @max: Maximum time in usecs to sleep
1631  */
1632 void __sched usleep_range(unsigned long min, unsigned long max)
1633 {
1634         __set_current_state(TASK_UNINTERRUPTIBLE);
1635         do_usleep_range(min, max);
1636 }
1637 EXPORT_SYMBOL(usleep_range);