drivers/md/raid5.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * raid5.c : Multiple Devices driver for Linux
   4  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   5  *         Copyright (C) 1999, 2000 Ingo Molnar
   6  *         Copyright (C) 2002, 2003 H. Peter Anvin
   7  *
   8  * RAID-4/5/6 management functions.
   9  * Thanks to Penguin Computing for making the RAID-6 development possible
  10  * by donating a test server!
  11  */
  12
  13 /*
  14  * BITMAP UNPLUGGING:
  15  *
  16  * The sequencing for updating the bitmap reliably is a little
  17  * subtle (and I got it wrong the first time) so it deserves some
  18  * explanation.
  19  *
  20  * We group bitmap updates into batches.  Each batch has a number.
  21  * We may write out several batches at once, but that isn't very important.
  22  * conf->seq_write is the number of the last batch successfully written.
  23  * conf->seq_flush is the number of the last batch that was closed to
  24  *    new additions.
  25  * When we discover that we will need to write to any block in a stripe
  26  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  27  * the number of the batch it will be in. This is seq_flush+1.
  28  * When we are ready to do a write, if that batch hasn't been written yet,
  29  *   we plug the array and queue the stripe for later.
  30  * When an unplug happens, we increment bm_flush, thus closing the current
  31  *   batch.
  32  * When we notice that bm_flush > bm_write, we write out all pending updates
  33  * to the bitmap, and advance bm_write to where bm_flush was.
  34  * This may occasionally write a bit out twice, but is sure never to
  35  * miss any bits.
  36  */
  37
  38 #include <linux/blkdev.h>
  39 #include <linux/delay.h>
  40 #include <linux/kthread.h>
  41 #include <linux/raid/pq.h>
  42 #include <linux/async_tx.h>
  43 #include <linux/module.h>
  44 #include <linux/async.h>
  45 #include <linux/seq_file.h>
  46 #include <linux/cpu.h>
  47 #include <linux/slab.h>
  48 #include <linux/ratelimit.h>
  49 #include <linux/nodemask.h>
  50
  51 #include <trace/events/block.h>
  52 #include <linux/list_sort.h>
  53
  54 #include "md.h"
  55 #include "raid5.h"
  56 #include "raid0.h"
  57 #include "md-bitmap.h"
  58 #include "raid5-log.h"
  59
  60 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
  61
  62 #define cpu_to_group(cpu) cpu_to_node(cpu)
  63 #define ANY_GROUP NUMA_NO_NODE
  64
  65 #define RAID5_MAX_REQ_STRIPES 256
  66
  67 static bool devices_handle_discard_safely = false;
  68 module_param(devices_handle_discard_safely, bool, 0644);
  69 MODULE_PARM_DESC(devices_handle_discard_safely,
  70                  "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
  71 static struct workqueue_struct *raid5_wq;
  72
  73 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  74 {
  75         int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
  76         return &conf->stripe_hashtbl[hash];
  77 }
  78
  79 static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
  80 {
  81         return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
  82 }
  83
  84 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
  85         __acquires(&conf->device_lock)
  86 {
  87         spin_lock_irq(conf->hash_locks + hash);
  88         spin_lock(&conf->device_lock);
  89 }
  90
  91 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
  92         __releases(&conf->device_lock)
  93 {
  94         spin_unlock(&conf->device_lock);
  95         spin_unlock_irq(conf->hash_locks + hash);
  96 }
  97
  98 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
  99         __acquires(&conf->device_lock)
 100 {
 101         int i;
 102         spin_lock_irq(conf->hash_locks);
 103         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 104                 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 105         spin_lock(&conf->device_lock);
 106 }
 107
 108 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 109         __releases(&conf->device_lock)
 110 {
 111         int i;
 112         spin_unlock(&conf->device_lock);
 113         for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
 114                 spin_unlock(conf->hash_locks + i);
 115         spin_unlock_irq(conf->hash_locks);
 116 }
 117
 118 /* Find first data disk in a raid6 stripe */
 119 static inline int raid6_d0(struct stripe_head *sh)
 120 {
 121         if (sh->ddf_layout)
 122                 /* ddf always start from first device */
 123                 return 0;
 124         /* md starts just after Q block */
 125         if (sh->qd_idx == sh->disks - 1)
 126                 return 0;
 127         else
 128                 return sh->qd_idx + 1;
 129 }
 130 static inline int raid6_next_disk(int disk, int raid_disks)
 131 {
 132         disk++;
 133         return (disk < raid_disks) ? disk : 0;
 134 }
 135
 136 /* When walking through the disks in a raid5, starting at raid6_d0,
 137  * We need to map each disk to a 'slot', where the data disks are slot
 138  * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 139  * is raid_disks-1.  This help does that mapping.
 140  */
 141 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 142                              int *count, int syndrome_disks)
 143 {
 144         int slot = *count;
 145
 146         if (sh->ddf_layout)
 147                 (*count)++;
 148         if (idx == sh->pd_idx)
 149                 return syndrome_disks;
 150         if (idx == sh->qd_idx)
 151                 return syndrome_disks + 1;
 152         if (!sh->ddf_layout)
 153                 (*count)++;
 154         return slot;
 155 }
 156
 157 static void print_raid5_conf (struct r5conf *conf);
 158
 159 static int stripe_operations_active(struct stripe_head *sh)
 160 {
 161         return sh->check_state || sh->reconstruct_state ||
 162                test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 163                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 164 }
 165
 166 static bool stripe_is_lowprio(struct stripe_head *sh)
 167 {
 168         return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
 169                 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
 170                !test_bit(STRIPE_R5C_CACHING, &sh->state);
 171 }
 172
 173 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 174         __must_hold(&sh->raid_conf->device_lock)
 175 {
 176         struct r5conf *conf = sh->raid_conf;
 177         struct r5worker_group *group;
 178         int thread_cnt;
 179         int i, cpu = sh->cpu;
 180
 181         if (!cpu_online(cpu)) {
 182                 cpu = cpumask_any(cpu_online_mask);
 183                 sh->cpu = cpu;
 184         }
 185
 186         if (list_empty(&sh->lru)) {
 187                 struct r5worker_group *group;
 188                 group = conf->worker_groups + cpu_to_group(cpu);
 189                 if (stripe_is_lowprio(sh))
 190                         list_add_tail(&sh->lru, &group->loprio_list);
 191                 else
 192                         list_add_tail(&sh->lru, &group->handle_list);
 193                 group->stripes_cnt++;
 194                 sh->group = group;
 195         }
 196
 197         if (conf->worker_cnt_per_group == 0) {
 198                 md_wakeup_thread(conf->mddev->thread);
 199                 return;
 200         }
 201
 202         group = conf->worker_groups + cpu_to_group(sh->cpu);
 203
 204         group->workers[0].working = true;
 205         /* at least one worker should run to avoid race */
 206         queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
 207
 208         thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
 209         /* wakeup more workers */
 210         for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
 211                 if (group->workers[i].working == false) {
 212                         group->workers[i].working = true;
 213                         queue_work_on(sh->cpu, raid5_wq,
 214                                       &group->workers[i].work);
 215                         thread_cnt--;
 216                 }
 217         }
 218 }
 219
 220 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 221                               struct list_head *temp_inactive_list)
 222         __must_hold(&conf->device_lock)
 223 {
 224         int i;
 225         int injournal = 0;      /* number of date pages with R5_InJournal */
 226
 227         BUG_ON(!list_empty(&sh->lru));
 228         BUG_ON(atomic_read(&conf->active_stripes)==0);
 229
 230         if (r5c_is_writeback(conf->log))
 231                 for (i = sh->disks; i--; )
 232                         if (test_bit(R5_InJournal, &sh->dev[i].flags))
 233                                 injournal++;
 234         /*
 235          * In the following cases, the stripe cannot be released to cached
 236          * lists. Therefore, we make the stripe write out and set
 237          * STRIPE_HANDLE:
 238          *   1. when quiesce in r5c write back;
 239          *   2. when resync is requested fot the stripe.
 240          */
 241         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
 242             (conf->quiesce && r5c_is_writeback(conf->log) &&
 243              !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 244                 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 245                         r5c_make_stripe_write_out(sh);
 246                 set_bit(STRIPE_HANDLE, &sh->state);
 247         }
 248
 249         if (test_bit(STRIPE_HANDLE, &sh->state)) {
 250                 if (test_bit(STRIPE_DELAYED, &sh->state) &&
 251                     !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 252                         list_add_tail(&sh->lru, &conf->delayed_list);
 253                 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 254                            sh->bm_seq - conf->seq_write > 0)
 255                         list_add_tail(&sh->lru, &conf->bitmap_list);
 256                 else {
 257                         clear_bit(STRIPE_DELAYED, &sh->state);
 258                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
 259                         if (conf->worker_cnt_per_group == 0) {
 260                                 if (stripe_is_lowprio(sh))
 261                                         list_add_tail(&sh->lru,
 262                                                         &conf->loprio_list);
 263                                 else
 264                                         list_add_tail(&sh->lru,
 265                                                         &conf->handle_list);
 266                         } else {
 267                                 raid5_wakeup_stripe_thread(sh);
 268                                 return;
 269                         }
 270                 }
 271                 md_wakeup_thread(conf->mddev->thread);
 272         } else {
 273                 BUG_ON(stripe_operations_active(sh));
 274                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 275                         if (atomic_dec_return(&conf->preread_active_stripes)
 276                             < IO_THRESHOLD)
 277                                 md_wakeup_thread(conf->mddev->thread);
 278                 atomic_dec(&conf->active_stripes);
 279                 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 280                         if (!r5c_is_writeback(conf->log))
 281                                 list_add_tail(&sh->lru, temp_inactive_list);
 282                         else {
 283                                 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
 284                                 if (injournal == 0)
 285                                         list_add_tail(&sh->lru, temp_inactive_list);
 286                                 else if (injournal == conf->raid_disks - conf->max_degraded) {
 287                                         /* full stripe */
 288                                         if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
 289                                                 atomic_inc(&conf->r5c_cached_full_stripes);
 290                                         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
 291                                                 atomic_dec(&conf->r5c_cached_partial_stripes);
 292                                         list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
 293                                         r5c_check_cached_full_stripe(conf);
 294                                 } else
 295                                         /*
 296                                          * STRIPE_R5C_PARTIAL_STRIPE is set in
 297                                          * r5c_try_caching_write(). No need to
 298                                          * set it again.
 299                                          */
 300                                         list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
 301                         }
 302                 }
 303         }
 304 }
 305
 306 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
 307                              struct list_head *temp_inactive_list)
 308         __must_hold(&conf->device_lock)
 309 {
 310         if (atomic_dec_and_test(&sh->count))
 311                 do_release_stripe(conf, sh, temp_inactive_list);
 312 }
 313
 314 /*
 315  * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 316  *
 317  * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 318  * given time. Adding stripes only takes device lock, while deleting stripes
 319  * only takes hash lock.
 320  */
 321 static void release_inactive_stripe_list(struct r5conf *conf,
 322                                          struct list_head *temp_inactive_list,
 323                                          int hash)
 324 {
 325         int size;
 326         bool do_wakeup = false;
 327         unsigned long flags;
 328
 329         if (hash == NR_STRIPE_HASH_LOCKS) {
 330                 size = NR_STRIPE_HASH_LOCKS;
 331                 hash = NR_STRIPE_HASH_LOCKS - 1;
 332         } else
 333                 size = 1;
 334         while (size) {
 335                 struct list_head *list = &temp_inactive_list[size - 1];
 336
 337                 /*
 338                  * We don't hold any lock here yet, raid5_get_active_stripe() might
 339                  * remove stripes from the list
 340                  */
 341                 if (!list_empty_careful(list)) {
 342                         spin_lock_irqsave(conf->hash_locks + hash, flags);
 343                         if (list_empty(conf->inactive_list + hash) &&
 344                             !list_empty(list))
 345                                 atomic_dec(&conf->empty_inactive_list_nr);
 346                         list_splice_tail_init(list, conf->inactive_list + hash);
 347                         do_wakeup = true;
 348                         spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 349                 }
 350                 size--;
 351                 hash--;
 352         }
 353
 354         if (do_wakeup) {
 355                 wake_up(&conf->wait_for_stripe);
 356                 if (atomic_read(&conf->active_stripes) == 0)
 357                         wake_up(&conf->wait_for_quiescent);
 358                 if (conf->retry_read_aligned)
 359                         md_wakeup_thread(conf->mddev->thread);
 360         }
 361 }
 362
 363 static int release_stripe_list(struct r5conf *conf,
 364                                struct list_head *temp_inactive_list)
 365         __must_hold(&conf->device_lock)
 366 {
 367         struct stripe_head *sh, *t;
 368         int count = 0;
 369         struct llist_node *head;
 370
 371         head = llist_del_all(&conf->released_stripes);
 372         head = llist_reverse_order(head);
 373         llist_for_each_entry_safe(sh, t, head, release_list) {
 374                 int hash;
 375
 376                 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
 377                 smp_mb();
 378                 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
 379                 /*
 380                  * Don't worry the bit is set here, because if the bit is set
 381                  * again, the count is always > 1. This is true for
 382                  * STRIPE_ON_UNPLUG_LIST bit too.
 383                  */
 384                 hash = sh->hash_lock_index;
 385                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
 386                 count++;
 387         }
 388
 389         return count;
 390 }
 391
 392 void raid5_release_stripe(struct stripe_head *sh)
 393 {
 394         struct r5conf *conf = sh->raid_conf;
 395         unsigned long flags;
 396         struct list_head list;
 397         int hash;
 398         bool wakeup;
 399
 400         /* Avoid release_list until the last reference.
 401          */
 402         if (atomic_add_unless(&sh->count, -1, 1))
 403                 return;
 404
 405         if (unlikely(!conf->mddev->thread) ||
 406                 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
 407                 goto slow_path;
 408         wakeup = llist_add(&sh->release_list, &conf->released_stripes);
 409         if (wakeup)
 410                 md_wakeup_thread(conf->mddev->thread);
 411         return;
 412 slow_path:
 413         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
 414         if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
 415                 INIT_LIST_HEAD(&list);
 416                 hash = sh->hash_lock_index;
 417                 do_release_stripe(conf, sh, &list);
 418                 spin_unlock_irqrestore(&conf->device_lock, flags);
 419                 release_inactive_stripe_list(conf, &list, hash);
 420         }
 421 }
 422
 423 static inline void remove_hash(struct stripe_head *sh)
 424 {
 425         pr_debug("remove_hash(), stripe %llu\n",
 426                 (unsigned long long)sh->sector);
 427
 428         hlist_del_init(&sh->hash);
 429 }
 430
 431 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 432 {
 433         struct hlist_head *hp = stripe_hash(conf, sh->sector);
 434
 435         pr_debug("insert_hash(), stripe %llu\n",
 436                 (unsigned long long)sh->sector);
 437
 438         hlist_add_head(&sh->hash, hp);
 439 }
 440
 441 /* find an idle stripe, make sure it is unhashed, and return it. */
 442 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 443 {
 444         struct stripe_head *sh = NULL;
 445         struct list_head *first;
 446
 447         if (list_empty(conf->inactive_list + hash))
 448                 goto out;
 449         first = (conf->inactive_list + hash)->next;
 450         sh = list_entry(first, struct stripe_head, lru);
 451         list_del_init(first);
 452         remove_hash(sh);
 453         atomic_inc(&conf->active_stripes);
 454         BUG_ON(hash != sh->hash_lock_index);
 455         if (list_empty(conf->inactive_list + hash))
 456                 atomic_inc(&conf->empty_inactive_list_nr);
 457 out:
 458         return sh;
 459 }
 460
 461 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
 462 static void free_stripe_pages(struct stripe_head *sh)
 463 {
 464         int i;
 465         struct page *p;
 466
 467         /* Have not allocate page pool */
 468         if (!sh->pages)
 469                 return;
 470
 471         for (i = 0; i < sh->nr_pages; i++) {
 472                 p = sh->pages[i];
 473                 if (p)
 474                         put_page(p);
 475                 sh->pages[i] = NULL;
 476         }
 477 }
 478
 479 static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
 480 {
 481         int i;
 482         struct page *p;
 483
 484         for (i = 0; i < sh->nr_pages; i++) {
 485                 /* The page have allocated. */
 486                 if (sh->pages[i])
 487                         continue;
 488
 489                 p = alloc_page(gfp);
 490                 if (!p) {
 491                         free_stripe_pages(sh);
 492                         return -ENOMEM;
 493                 }
 494                 sh->pages[i] = p;
 495         }
 496         return 0;
 497 }
 498
 499 static int
 500 init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
 501 {
 502         int nr_pages, cnt;
 503
 504         if (sh->pages)
 505                 return 0;
 506
 507         /* Each of the sh->dev[i] need one conf->stripe_size */
 508         cnt = PAGE_SIZE / conf->stripe_size;
 509         nr_pages = (disks + cnt - 1) / cnt;
 510
 511         sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 512         if (!sh->pages)
 513                 return -ENOMEM;
 514         sh->nr_pages = nr_pages;
 515         sh->stripes_per_page = cnt;
 516         return 0;
 517 }
 518 #endif
 519
 520 static void shrink_buffers(struct stripe_head *sh)
 521 {
 522         int i;
 523         int num = sh->raid_conf->pool_size;
 524
 525 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 526         for (i = 0; i < num ; i++) {
 527                 struct page *p;
 528
 529                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 530                 p = sh->dev[i].page;
 531                 if (!p)
 532                         continue;
 533                 sh->dev[i].page = NULL;
 534                 put_page(p);
 535         }
 536 #else
 537         for (i = 0; i < num; i++)
 538                 sh->dev[i].page = NULL;
 539         free_stripe_pages(sh); /* Free pages */
 540 #endif
 541 }
 542
 543 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 544 {
 545         int i;
 546         int num = sh->raid_conf->pool_size;
 547
 548 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 549         for (i = 0; i < num; i++) {
 550                 struct page *page;
 551
 552                 if (!(page = alloc_page(gfp))) {
 553                         return 1;
 554                 }
 555                 sh->dev[i].page = page;
 556                 sh->dev[i].orig_page = page;
 557                 sh->dev[i].offset = 0;
 558         }
 559 #else
 560         if (alloc_stripe_pages(sh, gfp))
 561                 return -ENOMEM;
 562
 563         for (i = 0; i < num; i++) {
 564                 sh->dev[i].page = raid5_get_dev_page(sh, i);
 565                 sh->dev[i].orig_page = sh->dev[i].page;
 566                 sh->dev[i].offset = raid5_get_page_offset(sh, i);
 567         }
 568 #endif
 569         return 0;
 570 }
 571
 572 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 573                             struct stripe_head *sh);
 574
 575 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 576 {
 577         struct r5conf *conf = sh->raid_conf;
 578         int i, seq;
 579
 580         BUG_ON(atomic_read(&sh->count) != 0);
 581         BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 582         BUG_ON(stripe_operations_active(sh));
 583         BUG_ON(sh->batch_head);
 584
 585         pr_debug("init_stripe called, stripe %llu\n",
 586                 (unsigned long long)sector);
 587 retry:
 588         seq = read_seqcount_begin(&conf->gen_lock);
 589         sh->generation = conf->generation - previous;
 590         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 591         sh->sector = sector;
 592         stripe_set_idx(sector, conf, previous, sh);
 593         sh->state = 0;
 594
 595         for (i = sh->disks; i--; ) {
 596                 struct r5dev *dev = &sh->dev[i];
 597
 598                 if (dev->toread || dev->read || dev->towrite || dev->written ||
 599                     test_bit(R5_LOCKED, &dev->flags)) {
 600                         pr_err("sector=%llx i=%d %p %p %p %p %d\n",
 601                                (unsigned long long)sh->sector, i, dev->toread,
 602                                dev->read, dev->towrite, dev->written,
 603                                test_bit(R5_LOCKED, &dev->flags));
 604                         WARN_ON(1);
 605                 }
 606                 dev->flags = 0;
 607                 dev->sector = raid5_compute_blocknr(sh, i, previous);
 608         }
 609         if (read_seqcount_retry(&conf->gen_lock, seq))
 610                 goto retry;
 611         sh->overwrite_disks = 0;
 612         insert_hash(conf, sh);
 613         sh->cpu = smp_processor_id();
 614         set_bit(STRIPE_BATCH_READY, &sh->state);
 615 }
 616
 617 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 618                                          short generation)
 619 {
 620         struct stripe_head *sh;
 621
 622         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 623         hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
 624                 if (sh->sector == sector && sh->generation == generation)
 625                         return sh;
 626         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 627         return NULL;
 628 }
 629
 630 static struct stripe_head *find_get_stripe(struct r5conf *conf,
 631                 sector_t sector, short generation, int hash)
 632 {
 633         int inc_empty_inactive_list_flag;
 634         struct stripe_head *sh;
 635
 636         sh = __find_stripe(conf, sector, generation);
 637         if (!sh)
 638                 return NULL;
 639
 640         if (atomic_inc_not_zero(&sh->count))
 641                 return sh;
 642
 643         /*
 644          * Slow path. The reference count is zero which means the stripe must
 645          * be on a list (sh->lru). Must remove the stripe from the list that
 646          * references it with the device_lock held.
 647          */
 648
 649         spin_lock(&conf->device_lock);
 650         if (!atomic_read(&sh->count)) {
 651                 if (!test_bit(STRIPE_HANDLE, &sh->state))
 652                         atomic_inc(&conf->active_stripes);
 653                 BUG_ON(list_empty(&sh->lru) &&
 654                        !test_bit(STRIPE_EXPANDING, &sh->state));
 655                 inc_empty_inactive_list_flag = 0;
 656                 if (!list_empty(conf->inactive_list + hash))
 657                         inc_empty_inactive_list_flag = 1;
 658                 list_del_init(&sh->lru);
 659                 if (list_empty(conf->inactive_list + hash) &&
 660                     inc_empty_inactive_list_flag)
 661                         atomic_inc(&conf->empty_inactive_list_nr);
 662                 if (sh->group) {
 663                         sh->group->stripes_cnt--;
 664                         sh->group = NULL;
 665                 }
 666         }
 667         atomic_inc(&sh->count);
 668         spin_unlock(&conf->device_lock);
 669
 670         return sh;
 671 }
 672
 673 /*
 674  * Need to check if array has failed when deciding whether to:
 675  *  - start an array
 676  *  - remove non-faulty devices
 677  *  - add a spare
 678  *  - allow a reshape
 679  * This determination is simple when no reshape is happening.
 680  * However if there is a reshape, we need to carefully check
 681  * both the before and after sections.
 682  * This is because some failed devices may only affect one
 683  * of the two sections, and some non-in_sync devices may
 684  * be insync in the section most affected by failed devices.
 685  *
 686  * Most calls to this function hold &conf->device_lock. Calls
 687  * in raid5_run() do not require the lock as no other threads
 688  * have been started yet.
 689  */
 690 int raid5_calc_degraded(struct r5conf *conf)
 691 {
 692         int degraded, degraded2;
 693         int i;
 694
 695         rcu_read_lock();
 696         degraded = 0;
 697         for (i = 0; i < conf->previous_raid_disks; i++) {
 698                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 699                 if (rdev && test_bit(Faulty, &rdev->flags))
 700                         rdev = rcu_dereference(conf->disks[i].replacement);
 701                 if (!rdev || test_bit(Faulty, &rdev->flags))
 702                         degraded++;
 703                 else if (test_bit(In_sync, &rdev->flags))
 704                         ;
 705                 else
 706                         /* not in-sync or faulty.
 707                          * If the reshape increases the number of devices,
 708                          * this is being recovered by the reshape, so
 709                          * this 'previous' section is not in_sync.
 710                          * If the number of devices is being reduced however,
 711                          * the device can only be part of the array if
 712                          * we are reverting a reshape, so this section will
 713                          * be in-sync.
 714                          */
 715                         if (conf->raid_disks >= conf->previous_raid_disks)
 716                                 degraded++;
 717         }
 718         rcu_read_unlock();
 719         if (conf->raid_disks == conf->previous_raid_disks)
 720                 return degraded;
 721         rcu_read_lock();
 722         degraded2 = 0;
 723         for (i = 0; i < conf->raid_disks; i++) {
 724                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 725                 if (rdev && test_bit(Faulty, &rdev->flags))
 726                         rdev = rcu_dereference(conf->disks[i].replacement);
 727                 if (!rdev || test_bit(Faulty, &rdev->flags))
 728                         degraded2++;
 729                 else if (test_bit(In_sync, &rdev->flags))
 730                         ;
 731                 else
 732                         /* not in-sync or faulty.
 733                          * If reshape increases the number of devices, this
 734                          * section has already been recovered, else it
 735                          * almost certainly hasn't.
 736                          */
 737                         if (conf->raid_disks <= conf->previous_raid_disks)
 738                                 degraded2++;
 739         }
 740         rcu_read_unlock();
 741         if (degraded2 > degraded)
 742                 return degraded2;
 743         return degraded;
 744 }
 745
 746 static bool has_failed(struct r5conf *conf)
 747 {
 748         int degraded = conf->mddev->degraded;
 749
 750         if (test_bit(MD_BROKEN, &conf->mddev->flags))
 751                 return true;
 752
 753         if (conf->mddev->reshape_position != MaxSector)
 754                 degraded = raid5_calc_degraded(conf);
 755
 756         return degraded > conf->max_degraded;
 757 }
 758
 759 enum stripe_result {
 760         STRIPE_SUCCESS = 0,
 761         STRIPE_RETRY,
 762         STRIPE_SCHEDULE_AND_RETRY,
 763         STRIPE_FAIL,
 764 };
 765
 766 struct stripe_request_ctx {
 767         /* a reference to the last stripe_head for batching */
 768         struct stripe_head *batch_last;
 769
 770         /* first sector in the request */
 771         sector_t first_sector;
 772
 773         /* last sector in the request */
 774         sector_t last_sector;
 775
 776         /*
 777          * bitmap to track stripe sectors that have been added to stripes
 778          * add one to account for unaligned requests
 779          */
 780         DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
 781
 782         /* the request had REQ_PREFLUSH, cleared after the first stripe_head */
 783         bool do_flush;
 784 };
 785
 786 /*
 787  * Block until another thread clears R5_INACTIVE_BLOCKED or
 788  * there are fewer than 3/4 the maximum number of active stripes
 789  * and there is an inactive stripe available.
 790  */
 791 static bool is_inactive_blocked(struct r5conf *conf, int hash)
 792 {
 793         if (list_empty(conf->inactive_list + hash))
 794                 return false;
 795
 796         if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 797                 return true;
 798
 799         return (atomic_read(&conf->active_stripes) <
 800                 (conf->max_nr_stripes * 3 / 4));
 801 }
 802
 803 struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
 804                 struct stripe_request_ctx *ctx, sector_t sector,
 805                 unsigned int flags)
 806 {
 807         struct stripe_head *sh;
 808         int hash = stripe_hash_locks_hash(conf, sector);
 809         int previous = !!(flags & R5_GAS_PREVIOUS);
 810
 811         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 812
 813         spin_lock_irq(conf->hash_locks + hash);
 814
 815         for (;;) {
 816                 if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
 817                         /*
 818                          * Must release the reference to batch_last before
 819                          * waiting, on quiesce, otherwise the batch_last will
 820                          * hold a reference to a stripe and raid5_quiesce()
 821                          * will deadlock waiting for active_stripes to go to
 822                          * zero.
 823                          */
 824                         if (ctx && ctx->batch_last) {
 825                                 raid5_release_stripe(ctx->batch_last);
 826                                 ctx->batch_last = NULL;
 827                         }
 828
 829                         wait_event_lock_irq(conf->wait_for_quiescent,
 830                                             !conf->quiesce,
 831                                             *(conf->hash_locks + hash));
 832                 }
 833
 834                 sh = find_get_stripe(conf, sector, conf->generation - previous,
 835                                      hash);
 836                 if (sh)
 837                         break;
 838
 839                 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 840                         sh = get_free_stripe(conf, hash);
 841                         if (sh) {
 842                                 r5c_check_stripe_cache_usage(conf);
 843                                 init_stripe(sh, sector, previous);
 844                                 atomic_inc(&sh->count);
 845                                 break;
 846                         }
 847
 848                         if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
 849                                 set_bit(R5_ALLOC_MORE, &conf->cache_state);
 850                 }
 851
 852                 if (flags & R5_GAS_NOBLOCK)
 853                         break;
 854
 855                 set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
 856                 r5l_wake_reclaim(conf->log, 0);
 857                 wait_event_lock_irq(conf->wait_for_stripe,
 858                                     is_inactive_blocked(conf, hash),
 859                                     *(conf->hash_locks + hash));
 860                 clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
 861         }
 862
 863         spin_unlock_irq(conf->hash_locks + hash);
 864         return sh;
 865 }
 866
 867 static bool is_full_stripe_write(struct stripe_head *sh)
 868 {
 869         BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
 870         return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 871 }
 872
 873 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 874                 __acquires(&sh1->stripe_lock)
 875                 __acquires(&sh2->stripe_lock)
 876 {
 877         if (sh1 > sh2) {
 878                 spin_lock_irq(&sh2->stripe_lock);
 879                 spin_lock_nested(&sh1->stripe_lock, 1);
 880         } else {
 881                 spin_lock_irq(&sh1->stripe_lock);
 882                 spin_lock_nested(&sh2->stripe_lock, 1);
 883         }
 884 }
 885
 886 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 887                 __releases(&sh1->stripe_lock)
 888                 __releases(&sh2->stripe_lock)
 889 {
 890         spin_unlock(&sh1->stripe_lock);
 891         spin_unlock_irq(&sh2->stripe_lock);
 892 }
 893
 894 /* Only freshly new full stripe normal write stripe can be added to a batch list */
 895 static bool stripe_can_batch(struct stripe_head *sh)
 896 {
 897         struct r5conf *conf = sh->raid_conf;
 898
 899         if (raid5_has_log(conf) || raid5_has_ppl(conf))
 900                 return false;
 901         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 902                 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 903                 is_full_stripe_write(sh);
 904 }
 905
 906 /* we only do back search */
 907 static void stripe_add_to_batch_list(struct r5conf *conf,
 908                 struct stripe_head *sh, struct stripe_head *last_sh)
 909 {
 910         struct stripe_head *head;
 911         sector_t head_sector, tmp_sec;
 912         int hash;
 913         int dd_idx;
 914
 915         /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 916         tmp_sec = sh->sector;
 917         if (!sector_div(tmp_sec, conf->chunk_sectors))
 918                 return;
 919         head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
 920
 921         if (last_sh && head_sector == last_sh->sector) {
 922                 head = last_sh;
 923                 atomic_inc(&head->count);
 924         } else {
 925                 hash = stripe_hash_locks_hash(conf, head_sector);
 926                 spin_lock_irq(conf->hash_locks + hash);
 927                 head = find_get_stripe(conf, head_sector, conf->generation,
 928                                        hash);
 929                 spin_unlock_irq(conf->hash_locks + hash);
 930                 if (!head)
 931                         return;
 932                 if (!stripe_can_batch(head))
 933                         goto out;
 934         }
 935
 936         lock_two_stripes(head, sh);
 937         /* clear_batch_ready clear the flag */
 938         if (!stripe_can_batch(head) || !stripe_can_batch(sh))
 939                 goto unlock_out;
 940
 941         if (sh->batch_head)
 942                 goto unlock_out;
 943
 944         dd_idx = 0;
 945         while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
 946                 dd_idx++;
 947         if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
 948             bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
 949                 goto unlock_out;
 950
 951         if (head->batch_head) {
 952                 spin_lock(&head->batch_head->batch_lock);
 953                 /* This batch list is already running */
 954                 if (!stripe_can_batch(head)) {
 955                         spin_unlock(&head->batch_head->batch_lock);
 956                         goto unlock_out;
 957                 }
 958                 /*
 959                  * We must assign batch_head of this stripe within the
 960                  * batch_lock, otherwise clear_batch_ready of batch head
 961                  * stripe could clear BATCH_READY bit of this stripe and
 962                  * this stripe->batch_head doesn't get assigned, which
 963                  * could confuse clear_batch_ready for this stripe
 964                  */
 965                 sh->batch_head = head->batch_head;
 966
 967                 /*
 968                  * at this point, head's BATCH_READY could be cleared, but we
 969                  * can still add the stripe to batch list
 970                  */
 971                 list_add(&sh->batch_list, &head->batch_list);
 972                 spin_unlock(&head->batch_head->batch_lock);
 973         } else {
 974                 head->batch_head = head;
 975                 sh->batch_head = head->batch_head;
 976                 spin_lock(&head->batch_lock);
 977                 list_add_tail(&sh->batch_list, &head->batch_list);
 978                 spin_unlock(&head->batch_lock);
 979         }
 980
 981         if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 982                 if (atomic_dec_return(&conf->preread_active_stripes)
 983                     < IO_THRESHOLD)
 984                         md_wakeup_thread(conf->mddev->thread);
 985
 986         if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
 987                 int seq = sh->bm_seq;
 988                 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
 989                     sh->batch_head->bm_seq > seq)
 990                         seq = sh->batch_head->bm_seq;
 991                 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
 992                 sh->batch_head->bm_seq = seq;
 993         }
 994
 995         atomic_inc(&sh->count);
 996 unlock_out:
 997         unlock_two_stripes(head, sh);
 998 out:
 999         raid5_release_stripe(head);
1000 }
1001
1002 /* Determine if 'data_offset' or 'new_data_offset' should be used
1003  * in this stripe_head.
1004  */
1005 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
1006 {
1007         sector_t progress = conf->reshape_progress;
1008         /* Need a memory barrier to make sure we see the value
1009          * of conf->generation, or ->data_offset that was set before
1010          * reshape_progress was updated.
1011          */
1012         smp_rmb();
1013         if (progress == MaxSector)
1014                 return 0;
1015         if (sh->generation == conf->generation - 1)
1016                 return 0;
1017         /* We are in a reshape, and this is a new-generation stripe,
1018          * so use new_data_offset.
1019          */
1020         return 1;
1021 }
1022
1023 static void dispatch_bio_list(struct bio_list *tmp)
1024 {
1025         struct bio *bio;
1026
1027         while ((bio = bio_list_pop(tmp)))
1028                 submit_bio_noacct(bio);
1029 }
1030
1031 static int cmp_stripe(void *priv, const struct list_head *a,
1032                       const struct list_head *b)
1033 {
1034         const struct r5pending_data *da = list_entry(a,
1035                                 struct r5pending_data, sibling);
1036         const struct r5pending_data *db = list_entry(b,
1037                                 struct r5pending_data, sibling);
1038         if (da->sector > db->sector)
1039                 return 1;
1040         if (da->sector < db->sector)
1041                 return -1;
1042         return 0;
1043 }
1044
1045 static void dispatch_defer_bios(struct r5conf *conf, int target,
1046                                 struct bio_list *list)
1047 {
1048         struct r5pending_data *data;
1049         struct list_head *first, *next = NULL;
1050         int cnt = 0;
1051
1052         if (conf->pending_data_cnt == 0)
1053                 return;
1054
1055         list_sort(NULL, &conf->pending_list, cmp_stripe);
1056
1057         first = conf->pending_list.next;
1058
1059         /* temporarily move the head */
1060         if (conf->next_pending_data)
1061                 list_move_tail(&conf->pending_list,
1062                                 &conf->next_pending_data->sibling);
1063
1064         while (!list_empty(&conf->pending_list)) {
1065                 data = list_first_entry(&conf->pending_list,
1066                         struct r5pending_data, sibling);
1067                 if (&data->sibling == first)
1068                         first = data->sibling.next;
1069                 next = data->sibling.next;
1070
1071                 bio_list_merge(list, &data->bios);
1072                 list_move(&data->sibling, &conf->free_list);
1073                 cnt++;
1074                 if (cnt >= target)
1075                         break;
1076         }
1077         conf->pending_data_cnt -= cnt;
1078         BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
1079
1080         if (next != &conf->pending_list)
1081                 conf->next_pending_data = list_entry(next,
1082                                 struct r5pending_data, sibling);
1083         else
1084                 conf->next_pending_data = NULL;
1085         /* list isn't empty */
1086         if (first != &conf->pending_list)
1087                 list_move_tail(&conf->pending_list, first);
1088 }
1089
1090 static void flush_deferred_bios(struct r5conf *conf)
1091 {
1092         struct bio_list tmp = BIO_EMPTY_LIST;
1093
1094         if (conf->pending_data_cnt == 0)
1095                 return;
1096
1097         spin_lock(&conf->pending_bios_lock);
1098         dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
1099         BUG_ON(conf->pending_data_cnt != 0);
1100         spin_unlock(&conf->pending_bios_lock);
1101
1102         dispatch_bio_list(&tmp);
1103 }
1104
1105 static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1106                                 struct bio_list *bios)
1107 {
1108         struct bio_list tmp = BIO_EMPTY_LIST;
1109         struct r5pending_data *ent;
1110
1111         spin_lock(&conf->pending_bios_lock);
1112         ent = list_first_entry(&conf->free_list, struct r5pending_data,
1113                                                         sibling);
1114         list_move_tail(&ent->sibling, &conf->pending_list);
1115         ent->sector = sector;
1116         bio_list_init(&ent->bios);
1117         bio_list_merge(&ent->bios, bios);
1118         conf->pending_data_cnt++;
1119         if (conf->pending_data_cnt >= PENDING_IO_MAX)
1120                 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
1121
1122         spin_unlock(&conf->pending_bios_lock);
1123
1124         dispatch_bio_list(&tmp);
1125 }
1126
1127 static void
1128 raid5_end_read_request(struct bio *bi);
1129 static void
1130 raid5_end_write_request(struct bio *bi);
1131
1132 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
1133 {
1134         struct r5conf *conf = sh->raid_conf;
1135         int i, disks = sh->disks;
1136         struct stripe_head *head_sh = sh;
1137         struct bio_list pending_bios = BIO_EMPTY_LIST;
1138         struct r5dev *dev;
1139         bool should_defer;
1140
1141         might_sleep();
1142
1143         if (log_stripe(sh, s) == 0)
1144                 return;
1145
1146         should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1147
1148         for (i = disks; i--; ) {
1149                 enum req_op op;
1150                 blk_opf_t op_flags = 0;
1151                 int replace_only = 0;
1152                 struct bio *bi, *rbi;
1153                 struct md_rdev *rdev, *rrdev = NULL;
1154
1155                 sh = head_sh;
1156                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
1157                         op = REQ_OP_WRITE;
1158                         if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
1159                                 op_flags = REQ_FUA;
1160                         if (test_bit(R5_Discard, &sh->dev[i].flags))
1161                                 op = REQ_OP_DISCARD;
1162                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1163                         op = REQ_OP_READ;
1164                 else if (test_and_clear_bit(R5_WantReplace,
1165                                             &sh->dev[i].flags)) {
1166                         op = REQ_OP_WRITE;
1167                         replace_only = 1;
1168                 } else
1169                         continue;
1170                 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
1171                         op_flags |= REQ_SYNC;
1172
1173 again:
1174                 dev = &sh->dev[i];
1175                 bi = &dev->req;
1176                 rbi = &dev->rreq; /* For writing to replacement */
1177
1178                 rcu_read_lock();
1179                 rrdev = rcu_dereference(conf->disks[i].replacement);
1180                 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
1181                 rdev = rcu_dereference(conf->disks[i].rdev);
1182                 if (!rdev) {
1183                         rdev = rrdev;
1184                         rrdev = NULL;
1185                 }
1186                 if (op_is_write(op)) {
1187                         if (replace_only)
1188                                 rdev = NULL;
1189                         if (rdev == rrdev)
1190                                 /* We raced and saw duplicates */
1191                                 rrdev = NULL;
1192                 } else {
1193                         if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1194                                 rdev = rrdev;
1195                         rrdev = NULL;
1196                 }
1197
1198                 if (rdev && test_bit(Faulty, &rdev->flags))
1199                         rdev = NULL;
1200                 if (rdev)
1201                         atomic_inc(&rdev->nr_pending);
1202                 if (rrdev && test_bit(Faulty, &rrdev->flags))
1203                         rrdev = NULL;
1204                 if (rrdev)
1205                         atomic_inc(&rrdev->nr_pending);
1206                 rcu_read_unlock();
1207
1208                 /* We have already checked bad blocks for reads.  Now
1209                  * need to check for writes.  We never accept write errors
1210                  * on the replacement, so we don't to check rrdev.
1211                  */
1212                 while (op_is_write(op) && rdev &&
1213                        test_bit(WriteErrorSeen, &rdev->flags)) {
1214                         sector_t first_bad;
1215                         int bad_sectors;
1216                         int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
1217                                               &first_bad, &bad_sectors);
1218                         if (!bad)
1219                                 break;
1220
1221                         if (bad < 0) {
1222                                 set_bit(BlockedBadBlocks, &rdev->flags);
1223                                 if (!conf->mddev->external &&
1224                                     conf->mddev->sb_flags) {
1225                                         /* It is very unlikely, but we might
1226                                          * still need to write out the
1227                                          * bad block log - better give it
1228                                          * a chance*/
1229                                         md_check_recovery(conf->mddev);
1230                                 }
1231                                 /*
1232                                  * Because md_wait_for_blocked_rdev
1233                                  * will dec nr_pending, we must
1234                                  * increment it first.
1235                                  */
1236                                 atomic_inc(&rdev->nr_pending);
1237                                 md_wait_for_blocked_rdev(rdev, conf->mddev);
1238                         } else {
1239                                 /* Acknowledged bad block - skip the write */
1240                                 rdev_dec_pending(rdev, conf->mddev);
1241                                 rdev = NULL;
1242                         }
1243                 }
1244
1245                 if (rdev) {
1246                         if (s->syncing || s->expanding || s->expanded
1247                             || s->replacing)
1248                                 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1249
1250                         set_bit(STRIPE_IO_STARTED, &sh->state);
1251
1252                         bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
1253                         bi->bi_end_io = op_is_write(op)
1254                                 ? raid5_end_write_request
1255                                 : raid5_end_read_request;
1256                         bi->bi_private = sh;
1257
1258                         pr_debug("%s: for %llu schedule op %d on disc %d\n",
1259                                 __func__, (unsigned long long)sh->sector,
1260                                 bi->bi_opf, i);
1261                         atomic_inc(&sh->count);
1262                         if (sh != head_sh)
1263                                 atomic_inc(&head_sh->count);
1264                         if (use_new_offset(conf, sh))
1265                                 bi->bi_iter.bi_sector = (sh->sector
1266                                                  + rdev->new_data_offset);
1267                         else
1268                                 bi->bi_iter.bi_sector = (sh->sector
1269                                                  + rdev->data_offset);
1270                         if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1271                                 bi->bi_opf |= REQ_NOMERGE;
1272
1273                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1274                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1275
1276                         if (!op_is_write(op) &&
1277                             test_bit(R5_InJournal, &sh->dev[i].flags))
1278                                 /*
1279                                  * issuing read for a page in journal, this
1280                                  * must be preparing for prexor in rmw; read
1281                                  * the data into orig_page
1282                                  */
1283                                 sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1284                         else
1285                                 sh->dev[i].vec.bv_page = sh->dev[i].page;
1286                         bi->bi_vcnt = 1;
1287                         bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1288                         bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1289                         bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1290                         /*
1291                          * If this is discard request, set bi_vcnt 0. We don't
1292                          * want to confuse SCSI because SCSI will replace payload
1293                          */
1294                         if (op == REQ_OP_DISCARD)
1295                                 bi->bi_vcnt = 0;
1296                         if (rrdev)
1297                                 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1298
1299                         if (conf->mddev->gendisk)
1300                                 trace_block_bio_remap(bi,
1301                                                 disk_devt(conf->mddev->gendisk),
1302                                                 sh->dev[i].sector);
1303                         if (should_defer && op_is_write(op))
1304                                 bio_list_add(&pending_bios, bi);
1305                         else
1306                                 submit_bio_noacct(bi);
1307                 }
1308                 if (rrdev) {
1309                         if (s->syncing || s->expanding || s->expanded
1310                             || s->replacing)
1311                                 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1312
1313                         set_bit(STRIPE_IO_STARTED, &sh->state);
1314
1315                         bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
1316                         BUG_ON(!op_is_write(op));
1317                         rbi->bi_end_io = raid5_end_write_request;
1318                         rbi->bi_private = sh;
1319
1320                         pr_debug("%s: for %llu schedule op %d on "
1321                                  "replacement disc %d\n",
1322                                 __func__, (unsigned long long)sh->sector,
1323                                 rbi->bi_opf, i);
1324                         atomic_inc(&sh->count);
1325                         if (sh != head_sh)
1326                                 atomic_inc(&head_sh->count);
1327                         if (use_new_offset(conf, sh))
1328                                 rbi->bi_iter.bi_sector = (sh->sector
1329                                                   + rrdev->new_data_offset);
1330                         else
1331                                 rbi->bi_iter.bi_sector = (sh->sector
1332                                                   + rrdev->data_offset);
1333                         if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1334                                 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1335                         sh->dev[i].rvec.bv_page = sh->dev[i].page;
1336                         rbi->bi_vcnt = 1;
1337                         rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1338                         rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1339                         rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1340                         /*
1341                          * If this is discard request, set bi_vcnt 0. We don't
1342                          * want to confuse SCSI because SCSI will replace payload
1343                          */
1344                         if (op == REQ_OP_DISCARD)
1345                                 rbi->bi_vcnt = 0;
1346                         if (conf->mddev->gendisk)
1347                                 trace_block_bio_remap(rbi,
1348                                                 disk_devt(conf->mddev->gendisk),
1349                                                 sh->dev[i].sector);
1350                         if (should_defer && op_is_write(op))
1351                                 bio_list_add(&pending_bios, rbi);
1352                         else
1353                                 submit_bio_noacct(rbi);
1354                 }
1355                 if (!rdev && !rrdev) {
1356                         if (op_is_write(op))
1357                                 set_bit(STRIPE_DEGRADED, &sh->state);
1358                         pr_debug("skip op %d on disc %d for sector %llu\n",
1359                                 bi->bi_opf, i, (unsigned long long)sh->sector);
1360                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
1361                         set_bit(STRIPE_HANDLE, &sh->state);
1362                 }
1363
1364                 if (!head_sh->batch_head)
1365                         continue;
1366                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
1367                                       batch_list);
1368                 if (sh != head_sh)
1369                         goto again;
1370         }
1371
1372         if (should_defer && !bio_list_empty(&pending_bios))
1373                 defer_issue_bios(conf, head_sh->sector, &pending_bios);
1374 }
1375
1376 static struct dma_async_tx_descriptor *
1377 async_copy_data(int frombio, struct bio *bio, struct page **page,
1378         unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1379         struct stripe_head *sh, int no_skipcopy)
1380 {
1381         struct bio_vec bvl;
1382         struct bvec_iter iter;
1383         struct page *bio_page;
1384         int page_offset;
1385         struct async_submit_ctl submit;
1386         enum async_tx_flags flags = 0;
1387         struct r5conf *conf = sh->raid_conf;
1388
1389         if (bio->bi_iter.bi_sector >= sector)
1390                 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1391         else
1392                 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1393
1394         if (frombio)
1395                 flags |= ASYNC_TX_FENCE;
1396         init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1397
1398         bio_for_each_segment(bvl, bio, iter) {
1399                 int len = bvl.bv_len;
1400                 int clen;
1401                 int b_offset = 0;
1402
1403                 if (page_offset < 0) {
1404                         b_offset = -page_offset;
1405                         page_offset += b_offset;
1406                         len -= b_offset;
1407                 }
1408
1409                 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1410                         clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1411                 else
1412                         clen = len;
1413
1414                 if (clen > 0) {
1415                         b_offset += bvl.bv_offset;
1416                         bio_page = bvl.bv_page;
1417                         if (frombio) {
1418                                 if (conf->skip_copy &&
1419                                     b_offset == 0 && page_offset == 0 &&
1420                                     clen == RAID5_STRIPE_SIZE(conf) &&
1421                                     !no_skipcopy)
1422                                         *page = bio_page;
1423                                 else
1424                                         tx = async_memcpy(*page, bio_page, page_offset + poff,
1425                                                   b_offset, clen, &submit);
1426                         } else
1427                                 tx = async_memcpy(bio_page, *page, b_offset,
1428                                                   page_offset + poff, clen, &submit);
1429                 }
1430                 /* chain the operations */
1431                 submit.depend_tx = tx;
1432
1433                 if (clen < len) /* hit end of page */
1434                         break;
1435                 page_offset +=  len;
1436         }
1437
1438         return tx;
1439 }
1440
1441 static void ops_complete_biofill(void *stripe_head_ref)
1442 {
1443         struct stripe_head *sh = stripe_head_ref;
1444         int i;
1445         struct r5conf *conf = sh->raid_conf;
1446
1447         pr_debug("%s: stripe %llu\n", __func__,
1448                 (unsigned long long)sh->sector);
1449
1450         /* clear completed biofills */
1451         for (i = sh->disks; i--; ) {
1452                 struct r5dev *dev = &sh->dev[i];
1453
1454                 /* acknowledge completion of a biofill operation */
1455                 /* and check if we need to reply to a read request,
1456                  * new R5_Wantfill requests are held off until
1457                  * !STRIPE_BIOFILL_RUN
1458                  */
1459                 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1460                         struct bio *rbi, *rbi2;
1461
1462                         BUG_ON(!dev->read);
1463                         rbi = dev->read;
1464                         dev->read = NULL;
1465                         while (rbi && rbi->bi_iter.bi_sector <
1466                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1467                                 rbi2 = r5_next_bio(conf, rbi, dev->sector);
1468                                 bio_endio(rbi);
1469                                 rbi = rbi2;
1470                         }
1471                 }
1472         }
1473         clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1474
1475         set_bit(STRIPE_HANDLE, &sh->state);
1476         raid5_release_stripe(sh);
1477 }
1478
1479 static void ops_run_biofill(struct stripe_head *sh)
1480 {
1481         struct dma_async_tx_descriptor *tx = NULL;
1482         struct async_submit_ctl submit;
1483         int i;
1484         struct r5conf *conf = sh->raid_conf;
1485
1486         BUG_ON(sh->batch_head);
1487         pr_debug("%s: stripe %llu\n", __func__,
1488                 (unsigned long long)sh->sector);
1489
1490         for (i = sh->disks; i--; ) {
1491                 struct r5dev *dev = &sh->dev[i];
1492                 if (test_bit(R5_Wantfill, &dev->flags)) {
1493                         struct bio *rbi;
1494                         spin_lock_irq(&sh->stripe_lock);
1495                         dev->read = rbi = dev->toread;
1496                         dev->toread = NULL;
1497                         spin_unlock_irq(&sh->stripe_lock);
1498                         while (rbi && rbi->bi_iter.bi_sector <
1499                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1500                                 tx = async_copy_data(0, rbi, &dev->page,
1501                                                      dev->offset,
1502                                                      dev->sector, tx, sh, 0);
1503                                 rbi = r5_next_bio(conf, rbi, dev->sector);
1504                         }
1505                 }
1506         }
1507
1508         atomic_inc(&sh->count);
1509         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1510         async_trigger_callback(&submit);
1511 }
1512
1513 static void mark_target_uptodate(struct stripe_head *sh, int target)
1514 {
1515         struct r5dev *tgt;
1516
1517         if (target < 0)
1518                 return;
1519
1520         tgt = &sh->dev[target];
1521         set_bit(R5_UPTODATE, &tgt->flags);
1522         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1523         clear_bit(R5_Wantcompute, &tgt->flags);
1524 }
1525
1526 static void ops_complete_compute(void *stripe_head_ref)
1527 {
1528         struct stripe_head *sh = stripe_head_ref;
1529
1530         pr_debug("%s: stripe %llu\n", __func__,
1531                 (unsigned long long)sh->sector);
1532
1533         /* mark the computed target(s) as uptodate */
1534         mark_target_uptodate(sh, sh->ops.target);
1535         mark_target_uptodate(sh, sh->ops.target2);
1536
1537         clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1538         if (sh->check_state == check_state_compute_run)
1539                 sh->check_state = check_state_compute_result;
1540         set_bit(STRIPE_HANDLE, &sh->state);
1541         raid5_release_stripe(sh);
1542 }
1543
1544 /* return a pointer to the address conversion region of the scribble buffer */
1545 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1546 {
1547         return percpu->scribble + i * percpu->scribble_obj_size;
1548 }
1549
1550 /* return a pointer to the address conversion region of the scribble buffer */
1551 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1552                                  struct raid5_percpu *percpu, int i)
1553 {
1554         return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1555 }
1556
1557 /*
1558  * Return a pointer to record offset address.
1559  */
1560 static unsigned int *
1561 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1562 {
1563         return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
1564 }
1565
1566 static struct dma_async_tx_descriptor *
1567 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1568 {
1569         int disks = sh->disks;
1570         struct page **xor_srcs = to_addr_page(percpu, 0);
1571         unsigned int *off_srcs = to_addr_offs(sh, percpu);
1572         int target = sh->ops.target;
1573         struct r5dev *tgt = &sh->dev[target];
1574         struct page *xor_dest = tgt->page;
1575         unsigned int off_dest = tgt->offset;
1576         int count = 0;
1577         struct dma_async_tx_descriptor *tx;
1578         struct async_submit_ctl submit;
1579         int i;
1580
1581         BUG_ON(sh->batch_head);
1582
1583         pr_debug("%s: stripe %llu block: %d\n",
1584                 __func__, (unsigned long long)sh->sector, target);
1585         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1586
1587         for (i = disks; i--; ) {
1588                 if (i != target) {
1589                         off_srcs[count] = sh->dev[i].offset;
1590                         xor_srcs[count++] = sh->dev[i].page;
1591                 }
1592         }
1593
1594         atomic_inc(&sh->count);
1595
1596         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1597                           ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1598         if (unlikely(count == 1))
1599                 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1600                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1601         else
1602                 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1603                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1604
1605         return tx;
1606 }
1607
1608 /* set_syndrome_sources - populate source buffers for gen_syndrome
1609  * @srcs - (struct page *) array of size sh->disks
1610  * @offs - (unsigned int) array of offset for each page
1611  * @sh - stripe_head to parse
1612  *
1613  * Populates srcs in proper layout order for the stripe and returns the
1614  * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1615  * destination buffer is recorded in srcs[count] and the Q destination
1616  * is recorded in srcs[count+1]].
1617  */
1618 static int set_syndrome_sources(struct page **srcs,
1619                                 unsigned int *offs,
1620                                 struct stripe_head *sh,
1621                                 int srctype)
1622 {
1623         int disks = sh->disks;
1624         int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1625         int d0_idx = raid6_d0(sh);
1626         int count;
1627         int i;
1628
1629         for (i = 0; i < disks; i++)
1630                 srcs[i] = NULL;
1631
1632         count = 0;
1633         i = d0_idx;
1634         do {
1635                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1636                 struct r5dev *dev = &sh->dev[i];
1637
1638                 if (i == sh->qd_idx || i == sh->pd_idx ||
1639                     (srctype == SYNDROME_SRC_ALL) ||
1640                     (srctype == SYNDROME_SRC_WANT_DRAIN &&
1641                      (test_bit(R5_Wantdrain, &dev->flags) ||
1642                       test_bit(R5_InJournal, &dev->flags))) ||
1643                     (srctype == SYNDROME_SRC_WRITTEN &&
1644                      (dev->written ||
1645                       test_bit(R5_InJournal, &dev->flags)))) {
1646                         if (test_bit(R5_InJournal, &dev->flags))
1647                                 srcs[slot] = sh->dev[i].orig_page;
1648                         else
1649                                 srcs[slot] = sh->dev[i].page;
1650                         /*
1651                          * For R5_InJournal, PAGE_SIZE must be 4KB and will
1652                          * not shared page. In that case, dev[i].offset
1653                          * is 0.
1654                          */
1655                         offs[slot] = sh->dev[i].offset;
1656                 }
1657                 i = raid6_next_disk(i, disks);
1658         } while (i != d0_idx);
1659
1660         return syndrome_disks;
1661 }
1662
1663 static struct dma_async_tx_descriptor *
1664 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1665 {
1666         int disks = sh->disks;
1667         struct page **blocks = to_addr_page(percpu, 0);
1668         unsigned int *offs = to_addr_offs(sh, percpu);
1669         int target;
1670         int qd_idx = sh->qd_idx;
1671         struct dma_async_tx_descriptor *tx;
1672         struct async_submit_ctl submit;
1673         struct r5dev *tgt;
1674         struct page *dest;
1675         unsigned int dest_off;
1676         int i;
1677         int count;
1678
1679         BUG_ON(sh->batch_head);
1680         if (sh->ops.target < 0)
1681                 target = sh->ops.target2;
1682         else if (sh->ops.target2 < 0)
1683                 target = sh->ops.target;
1684         else
1685                 /* we should only have one valid target */
1686                 BUG();
1687         BUG_ON(target < 0);
1688         pr_debug("%s: stripe %llu block: %d\n",
1689                 __func__, (unsigned long long)sh->sector, target);
1690
1691         tgt = &sh->dev[target];
1692         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1693         dest = tgt->page;
1694         dest_off = tgt->offset;
1695
1696         atomic_inc(&sh->count);
1697
1698         if (target == qd_idx) {
1699                 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1700                 blocks[count] = NULL; /* regenerating p is not necessary */
1701                 BUG_ON(blocks[count+1] != dest); /* q should already be set */
1702                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1703                                   ops_complete_compute, sh,
1704                                   to_addr_conv(sh, percpu, 0));
1705                 tx = async_gen_syndrome(blocks, offs, count+2,
1706                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1707         } else {
1708                 /* Compute any data- or p-drive using XOR */
1709                 count = 0;
1710                 for (i = disks; i-- ; ) {
1711                         if (i == target || i == qd_idx)
1712                                 continue;
1713                         offs[count] = sh->dev[i].offset;
1714                         blocks[count++] = sh->dev[i].page;
1715                 }
1716
1717                 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1718                                   NULL, ops_complete_compute, sh,
1719                                   to_addr_conv(sh, percpu, 0));
1720                 tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1721                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1722         }
1723
1724         return tx;
1725 }
1726
1727 static struct dma_async_tx_descriptor *
1728 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1729 {
1730         int i, count, disks = sh->disks;
1731         int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1732         int d0_idx = raid6_d0(sh);
1733         int faila = -1, failb = -1;
1734         int target = sh->ops.target;
1735         int target2 = sh->ops.target2;
1736         struct r5dev *tgt = &sh->dev[target];
1737         struct r5dev *tgt2 = &sh->dev[target2];
1738         struct dma_async_tx_descriptor *tx;
1739         struct page **blocks = to_addr_page(percpu, 0);
1740         unsigned int *offs = to_addr_offs(sh, percpu);
1741         struct async_submit_ctl submit;
1742
1743         BUG_ON(sh->batch_head);
1744         pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1745                  __func__, (unsigned long long)sh->sector, target, target2);
1746         BUG_ON(target < 0 || target2 < 0);
1747         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1748         BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1749
1750         /* we need to open-code set_syndrome_sources to handle the
1751          * slot number conversion for 'faila' and 'failb'
1752          */
1753         for (i = 0; i < disks ; i++) {
1754                 offs[i] = 0;
1755                 blocks[i] = NULL;
1756         }
1757         count = 0;
1758         i = d0_idx;
1759         do {
1760                 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1761
1762                 offs[slot] = sh->dev[i].offset;
1763                 blocks[slot] = sh->dev[i].page;
1764
1765                 if (i == target)
1766                         faila = slot;
1767                 if (i == target2)
1768                         failb = slot;
1769                 i = raid6_next_disk(i, disks);
1770         } while (i != d0_idx);
1771
1772         BUG_ON(faila == failb);
1773         if (failb < faila)
1774                 swap(faila, failb);
1775         pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1776                  __func__, (unsigned long long)sh->sector, faila, failb);
1777
1778         atomic_inc(&sh->count);
1779
1780         if (failb == syndrome_disks+1) {
1781                 /* Q disk is one of the missing disks */
1782                 if (faila == syndrome_disks) {
1783                         /* Missing P+Q, just recompute */
1784                         init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1785                                           ops_complete_compute, sh,
1786                                           to_addr_conv(sh, percpu, 0));
1787                         return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1788                                                   RAID5_STRIPE_SIZE(sh->raid_conf),
1789                                                   &submit);
1790                 } else {
1791                         struct page *dest;
1792                         unsigned int dest_off;
1793                         int data_target;
1794                         int qd_idx = sh->qd_idx;
1795
1796                         /* Missing D+Q: recompute D from P, then recompute Q */
1797                         if (target == qd_idx)
1798                                 data_target = target2;
1799                         else
1800                                 data_target = target;
1801
1802                         count = 0;
1803                         for (i = disks; i-- ; ) {
1804                                 if (i == data_target || i == qd_idx)
1805                                         continue;
1806                                 offs[count] = sh->dev[i].offset;
1807                                 blocks[count++] = sh->dev[i].page;
1808                         }
1809                         dest = sh->dev[data_target].page;
1810                         dest_off = sh->dev[data_target].offset;
1811                         init_async_submit(&submit,
1812                                           ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1813                                           NULL, NULL, NULL,
1814                                           to_addr_conv(sh, percpu, 0));
1815                         tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1816                                        RAID5_STRIPE_SIZE(sh->raid_conf),
1817                                        &submit);
1818
1819                         count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
1820                         init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1821                                           ops_complete_compute, sh,
1822                                           to_addr_conv(sh, percpu, 0));
1823                         return async_gen_syndrome(blocks, offs, count+2,
1824                                                   RAID5_STRIPE_SIZE(sh->raid_conf),
1825                                                   &submit);
1826                 }
1827         } else {
1828                 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1829                                   ops_complete_compute, sh,
1830                                   to_addr_conv(sh, percpu, 0));
1831                 if (failb == syndrome_disks) {
1832                         /* We're missing D+P. */
1833                         return async_raid6_datap_recov(syndrome_disks+2,
1834                                                 RAID5_STRIPE_SIZE(sh->raid_conf),
1835                                                 faila,
1836                                                 blocks, offs, &submit);
1837                 } else {
1838                         /* We're missing D+D. */
1839                         return async_raid6_2data_recov(syndrome_disks+2,
1840                                                 RAID5_STRIPE_SIZE(sh->raid_conf),
1841                                                 faila, failb,
1842                                                 blocks, offs, &submit);
1843                 }
1844         }
1845 }
1846
1847 static void ops_complete_prexor(void *stripe_head_ref)
1848 {
1849         struct stripe_head *sh = stripe_head_ref;
1850
1851         pr_debug("%s: stripe %llu\n", __func__,
1852                 (unsigned long long)sh->sector);
1853
1854         if (r5c_is_writeback(sh->raid_conf->log))
1855                 /*
1856                  * raid5-cache write back uses orig_page during prexor.
1857                  * After prexor, it is time to free orig_page
1858                  */
1859                 r5c_release_extra_page(sh);
1860 }
1861
1862 static struct dma_async_tx_descriptor *
1863 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1864                 struct dma_async_tx_descriptor *tx)
1865 {
1866         int disks = sh->disks;
1867         struct page **xor_srcs = to_addr_page(percpu, 0);
1868         unsigned int *off_srcs = to_addr_offs(sh, percpu);
1869         int count = 0, pd_idx = sh->pd_idx, i;
1870         struct async_submit_ctl submit;
1871
1872         /* existing parity data subtracted */
1873         unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1874         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1875
1876         BUG_ON(sh->batch_head);
1877         pr_debug("%s: stripe %llu\n", __func__,
1878                 (unsigned long long)sh->sector);
1879
1880         for (i = disks; i--; ) {
1881                 struct r5dev *dev = &sh->dev[i];
1882                 /* Only process blocks that are known to be uptodate */
1883                 if (test_bit(R5_InJournal, &dev->flags)) {
1884                         /*
1885                          * For this case, PAGE_SIZE must be equal to 4KB and
1886                          * page offset is zero.
1887                          */
1888                         off_srcs[count] = dev->offset;
1889                         xor_srcs[count++] = dev->orig_page;
1890                 } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1891                         off_srcs[count] = dev->offset;
1892                         xor_srcs[count++] = dev->page;
1893                 }
1894         }
1895
1896         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1897                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1898         tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1899                         RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1900
1901         return tx;
1902 }
1903
1904 static struct dma_async_tx_descriptor *
1905 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1906                 struct dma_async_tx_descriptor *tx)
1907 {
1908         struct page **blocks = to_addr_page(percpu, 0);
1909         unsigned int *offs = to_addr_offs(sh, percpu);
1910         int count;
1911         struct async_submit_ctl submit;
1912
1913         pr_debug("%s: stripe %llu\n", __func__,
1914                 (unsigned long long)sh->sector);
1915
1916         count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
1917
1918         init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1919                           ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1920         tx = async_gen_syndrome(blocks, offs, count+2,
1921                         RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
1922
1923         return tx;
1924 }
1925
1926 static struct dma_async_tx_descriptor *
1927 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1928 {
1929         struct r5conf *conf = sh->raid_conf;
1930         int disks = sh->disks;
1931         int i;
1932         struct stripe_head *head_sh = sh;
1933
1934         pr_debug("%s: stripe %llu\n", __func__,
1935                 (unsigned long long)sh->sector);
1936
1937         for (i = disks; i--; ) {
1938                 struct r5dev *dev;
1939                 struct bio *chosen;
1940
1941                 sh = head_sh;
1942                 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1943                         struct bio *wbi;
1944
1945 again:
1946                         dev = &sh->dev[i];
1947                         /*
1948                          * clear R5_InJournal, so when rewriting a page in
1949                          * journal, it is not skipped by r5l_log_stripe()
1950                          */
1951                         clear_bit(R5_InJournal, &dev->flags);
1952                         spin_lock_irq(&sh->stripe_lock);
1953                         chosen = dev->towrite;
1954                         dev->towrite = NULL;
1955                         sh->overwrite_disks = 0;
1956                         BUG_ON(dev->written);
1957                         wbi = dev->written = chosen;
1958                         spin_unlock_irq(&sh->stripe_lock);
1959                         WARN_ON(dev->page != dev->orig_page);
1960
1961                         while (wbi && wbi->bi_iter.bi_sector <
1962                                 dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1963                                 if (wbi->bi_opf & REQ_FUA)
1964                                         set_bit(R5_WantFUA, &dev->flags);
1965                                 if (wbi->bi_opf & REQ_SYNC)
1966                                         set_bit(R5_SyncIO, &dev->flags);
1967                                 if (bio_op(wbi) == REQ_OP_DISCARD)
1968                                         set_bit(R5_Discard, &dev->flags);
1969                                 else {
1970                                         tx = async_copy_data(1, wbi, &dev->page,
1971                                                              dev->offset,
1972                                                              dev->sector, tx, sh,
1973                                                              r5c_is_writeback(conf->log));
1974                                         if (dev->page != dev->orig_page &&
1975                                             !r5c_is_writeback(conf->log)) {
1976                                                 set_bit(R5_SkipCopy, &dev->flags);
1977                                                 clear_bit(R5_UPTODATE, &dev->flags);
1978                                                 clear_bit(R5_OVERWRITE, &dev->flags);
1979                                         }
1980                                 }
1981                                 wbi = r5_next_bio(conf, wbi, dev->sector);
1982                         }
1983
1984                         if (head_sh->batch_head) {
1985                                 sh = list_first_entry(&sh->batch_list,
1986                                                       struct stripe_head,
1987                                                       batch_list);
1988                                 if (sh == head_sh)
1989                                         continue;
1990                                 goto again;
1991                         }
1992                 }
1993         }
1994
1995         return tx;
1996 }
1997
1998 static void ops_complete_reconstruct(void *stripe_head_ref)
1999 {
2000         struct stripe_head *sh = stripe_head_ref;
2001         int disks = sh->disks;
2002         int pd_idx = sh->pd_idx;
2003         int qd_idx = sh->qd_idx;
2004         int i;
2005         bool fua = false, sync = false, discard = false;
2006
2007         pr_debug("%s: stripe %llu\n", __func__,
2008                 (unsigned long long)sh->sector);
2009
2010         for (i = disks; i--; ) {
2011                 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
2012                 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
2013                 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
2014         }
2015
2016         for (i = disks; i--; ) {
2017                 struct r5dev *dev = &sh->dev[i];
2018
2019                 if (dev->written || i == pd_idx || i == qd_idx) {
2020                         if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2021                                 set_bit(R5_UPTODATE, &dev->flags);
2022                                 if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2023                                         set_bit(R5_Expanded, &dev->flags);
2024                         }
2025                         if (fua)
2026                                 set_bit(R5_WantFUA, &dev->flags);
2027                         if (sync)
2028                                 set_bit(R5_SyncIO, &dev->flags);
2029                 }
2030         }
2031
2032         if (sh->reconstruct_state == reconstruct_state_drain_run)
2033                 sh->reconstruct_state = reconstruct_state_drain_result;
2034         else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2035                 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2036         else {
2037                 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2038                 sh->reconstruct_state = reconstruct_state_result;
2039         }
2040
2041         set_bit(STRIPE_HANDLE, &sh->state);
2042         raid5_release_stripe(sh);
2043 }
2044
2045 static void
2046 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
2047                      struct dma_async_tx_descriptor *tx)
2048 {
2049         int disks = sh->disks;
2050         struct page **xor_srcs;
2051         unsigned int *off_srcs;
2052         struct async_submit_ctl submit;
2053         int count, pd_idx = sh->pd_idx, i;
2054         struct page *xor_dest;
2055         unsigned int off_dest;
2056         int prexor = 0;
2057         unsigned long flags;
2058         int j = 0;
2059         struct stripe_head *head_sh = sh;
2060         int last_stripe;
2061
2062         pr_debug("%s: stripe %llu\n", __func__,
2063                 (unsigned long long)sh->sector);
2064
2065         for (i = 0; i < sh->disks; i++) {
2066                 if (pd_idx == i)
2067                         continue;
2068                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2069                         break;
2070         }
2071         if (i >= sh->disks) {
2072                 atomic_inc(&sh->count);
2073                 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
2074                 ops_complete_reconstruct(sh);
2075                 return;
2076         }
2077 again:
2078         count = 0;
2079         xor_srcs = to_addr_page(percpu, j);
2080         off_srcs = to_addr_offs(sh, percpu);
2081         /* check if prexor is active which means only process blocks
2082          * that are part of a read-modify-write (written)
2083          */
2084         if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2085                 prexor = 1;
2086                 off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2087                 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2088                 for (i = disks; i--; ) {
2089                         struct r5dev *dev = &sh->dev[i];
2090                         if (head_sh->dev[i].written ||
2091                             test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2092                                 off_srcs[count] = dev->offset;
2093                                 xor_srcs[count++] = dev->page;
2094                         }
2095                 }
2096         } else {
2097                 xor_dest = sh->dev[pd_idx].page;
2098                 off_dest = sh->dev[pd_idx].offset;
2099                 for (i = disks; i--; ) {
2100                         struct r5dev *dev = &sh->dev[i];
2101                         if (i != pd_idx) {
2102                                 off_srcs[count] = dev->offset;
2103                                 xor_srcs[count++] = dev->page;
2104                         }
2105                 }
2106         }
2107
2108         /* 1/ if we prexor'd then the dest is reused as a source
2109          * 2/ if we did not prexor then we are redoing the parity
2110          * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2111          * for the synchronous xor case
2112          */
2113         last_stripe = !head_sh->batch_head ||
2114                 list_first_entry(&sh->batch_list,
2115                                  struct stripe_head, batch_list) == head_sh;
2116         if (last_stripe) {
2117                 flags = ASYNC_TX_ACK |
2118                         (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2119
2120                 atomic_inc(&head_sh->count);
2121                 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
2122                                   to_addr_conv(sh, percpu, j));
2123         } else {
2124                 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2125                 init_async_submit(&submit, flags, tx, NULL, NULL,
2126                                   to_addr_conv(sh, percpu, j));
2127         }
2128
2129         if (unlikely(count == 1))
2130                 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2131                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2132         else
2133                 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2134                                 RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
2135         if (!last_stripe) {
2136                 j++;
2137                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2138                                       batch_list);
2139                 goto again;
2140         }
2141 }
2142
2143 static void
2144 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
2145                      struct dma_async_tx_descriptor *tx)
2146 {
2147         struct async_submit_ctl submit;
2148         struct page **blocks;
2149         unsigned int *offs;
2150         int count, i, j = 0;
2151         struct stripe_head *head_sh = sh;
2152         int last_stripe;
2153         int synflags;
2154         unsigned long txflags;
2155
2156         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2157
2158         for (i = 0; i < sh->disks; i++) {
2159                 if (sh->pd_idx == i || sh->qd_idx == i)
2160                         continue;
2161                 if (!test_bit(R5_Discard, &sh->dev[i].flags))
2162                         break;
2163         }
2164         if (i >= sh->disks) {
2165                 atomic_inc(&sh->count);
2166                 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
2167                 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
2168                 ops_complete_reconstruct(sh);
2169                 return;
2170         }
2171
2172 again:
2173         blocks = to_addr_page(percpu, j);
2174         offs = to_addr_offs(sh, percpu);
2175
2176         if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2177                 synflags = SYNDROME_SRC_WRITTEN;
2178                 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
2179         } else {
2180                 synflags = SYNDROME_SRC_ALL;
2181                 txflags = ASYNC_TX_ACK;
2182         }
2183
2184         count = set_syndrome_sources(blocks, offs, sh, synflags);
2185         last_stripe = !head_sh->batch_head ||
2186                 list_first_entry(&sh->batch_list,
2187                                  struct stripe_head, batch_list) == head_sh;
2188
2189         if (last_stripe) {
2190                 atomic_inc(&head_sh->count);
2191                 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
2192                                   head_sh, to_addr_conv(sh, percpu, j));
2193         } else
2194                 init_async_submit(&submit, 0, tx, NULL, NULL,
2195                                   to_addr_conv(sh, percpu, j));
2196         tx = async_gen_syndrome(blocks, offs, count+2,
2197                         RAID5_STRIPE_SIZE(sh->raid_conf),  &submit);
2198         if (!last_stripe) {
2199                 j++;
2200                 sh = list_first_entry(&sh->batch_list, struct stripe_head,
2201                                       batch_list);
2202                 goto again;
2203         }
2204 }
2205
2206 static void ops_complete_check(void *stripe_head_ref)
2207 {
2208         struct stripe_head *sh = stripe_head_ref;
2209
2210         pr_debug("%s: stripe %llu\n", __func__,
2211                 (unsigned long long)sh->sector);
2212
2213         sh->check_state = check_state_check_result;
2214         set_bit(STRIPE_HANDLE, &sh->state);
2215         raid5_release_stripe(sh);
2216 }
2217
2218 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
2219 {
2220         int disks = sh->disks;
2221         int pd_idx = sh->pd_idx;
2222         int qd_idx = sh->qd_idx;
2223         struct page *xor_dest;
2224         unsigned int off_dest;
2225         struct page **xor_srcs = to_addr_page(percpu, 0);
2226         unsigned int *off_srcs = to_addr_offs(sh, percpu);
2227         struct dma_async_tx_descriptor *tx;
2228         struct async_submit_ctl submit;
2229         int count;
2230         int i;
2231
2232         pr_debug("%s: stripe %llu\n", __func__,
2233                 (unsigned long long)sh->sector);
2234
2235         BUG_ON(sh->batch_head);
2236         count = 0;
2237         xor_dest = sh->dev[pd_idx].page;
2238         off_dest = sh->dev[pd_idx].offset;
2239         off_srcs[count] = off_dest;
2240         xor_srcs[count++] = xor_dest;
2241         for (i = disks; i--; ) {
2242                 if (i == pd_idx || i == qd_idx)
2243                         continue;
2244                 off_srcs[count] = sh->dev[i].offset;
2245                 xor_srcs[count++] = sh->dev[i].page;
2246         }
2247
2248         init_async_submit(&submit, 0, NULL, NULL, NULL,
2249                           to_addr_conv(sh, percpu, 0));
2250         tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2251                            RAID5_STRIPE_SIZE(sh->raid_conf),
2252                            &sh->ops.zero_sum_result, &submit);
2253
2254         atomic_inc(&sh->count);
2255         init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
2256         tx = async_trigger_callback(&submit);
2257 }
2258
2259 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
2260 {
2261         struct page **srcs = to_addr_page(percpu, 0);
2262         unsigned int *offs = to_addr_offs(sh, percpu);
2263         struct async_submit_ctl submit;
2264         int count;
2265
2266         pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2267                 (unsigned long long)sh->sector, checkp);
2268
2269         BUG_ON(sh->batch_head);
2270         count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
2271         if (!checkp)
2272                 srcs[count] = NULL;
2273
2274         atomic_inc(&sh->count);
2275         init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
2276                           sh, to_addr_conv(sh, percpu, 0));
2277         async_syndrome_val(srcs, offs, count+2,
2278                            RAID5_STRIPE_SIZE(sh->raid_conf),
2279                            &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
2280 }
2281
2282 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
2283 {
2284         int overlap_clear = 0, i, disks = sh->disks;
2285         struct dma_async_tx_descriptor *tx = NULL;
2286         struct r5conf *conf = sh->raid_conf;
2287         int level = conf->level;
2288         struct raid5_percpu *percpu;
2289
2290         local_lock(&conf->percpu->lock);
2291         percpu = this_cpu_ptr(conf->percpu);
2292         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2293                 ops_run_biofill(sh);
2294                 overlap_clear++;
2295         }
2296
2297         if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2298                 if (level < 6)
2299                         tx = ops_run_compute5(sh, percpu);
2300                 else {
2301                         if (sh->ops.target2 < 0 || sh->ops.target < 0)
2302                                 tx = ops_run_compute6_1(sh, percpu);
2303                         else
2304                                 tx = ops_run_compute6_2(sh, percpu);
2305                 }
2306                 /* terminate the chain if reconstruct is not set to be run */
2307                 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2308                         async_tx_ack(tx);
2309         }
2310
2311         if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2312                 if (level < 6)
2313                         tx = ops_run_prexor5(sh, percpu, tx);
2314                 else
2315                         tx = ops_run_prexor6(sh, percpu, tx);
2316         }
2317
2318         if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2319                 tx = ops_run_partial_parity(sh, percpu, tx);
2320
2321         if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2322                 tx = ops_run_biodrain(sh, tx);
2323                 overlap_clear++;
2324         }
2325
2326         if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2327                 if (level < 6)
2328                         ops_run_reconstruct5(sh, percpu, tx);
2329                 else
2330                         ops_run_reconstruct6(sh, percpu, tx);
2331         }
2332
2333         if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2334                 if (sh->check_state == check_state_run)
2335                         ops_run_check_p(sh, percpu);
2336                 else if (sh->check_state == check_state_run_q)
2337                         ops_run_check_pq(sh, percpu, 0);
2338                 else if (sh->check_state == check_state_run_pq)
2339                         ops_run_check_pq(sh, percpu, 1);
2340                 else
2341                         BUG();
2342         }
2343
2344         if (overlap_clear && !sh->batch_head) {
2345                 for (i = disks; i--; ) {
2346                         struct r5dev *dev = &sh->dev[i];
2347                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
2348                                 wake_up(&sh->raid_conf->wait_for_overlap);
2349                 }
2350         }
2351         local_unlock(&conf->percpu->lock);
2352 }
2353
2354 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
2355 {
2356 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2357         kfree(sh->pages);
2358 #endif
2359         if (sh->ppl_page)
2360                 __free_page(sh->ppl_page);
2361         kmem_cache_free(sc, sh);
2362 }
2363
2364 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2365         int disks, struct r5conf *conf)
2366 {
2367         struct stripe_head *sh;
2368
2369         sh = kmem_cache_zalloc(sc, gfp);
2370         if (sh) {
2371                 spin_lock_init(&sh->stripe_lock);
2372                 spin_lock_init(&sh->batch_lock);
2373                 INIT_LIST_HEAD(&sh->batch_list);
2374                 INIT_LIST_HEAD(&sh->lru);
2375                 INIT_LIST_HEAD(&sh->r5c);
2376                 INIT_LIST_HEAD(&sh->log_list);
2377                 atomic_set(&sh->count, 1);
2378                 sh->raid_conf = conf;
2379                 sh->log_start = MaxSector;
2380
2381                 if (raid5_has_ppl(conf)) {
2382                         sh->ppl_page = alloc_page(gfp);
2383                         if (!sh->ppl_page) {
2384                                 free_stripe(sc, sh);
2385                                 return NULL;
2386                         }
2387                 }
2388 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2389                 if (init_stripe_shared_pages(sh, conf, disks)) {
2390                         free_stripe(sc, sh);
2391                         return NULL;
2392                 }
2393 #endif
2394         }
2395         return sh;
2396 }
2397 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2398 {
2399         struct stripe_head *sh;
2400
2401         sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
2402         if (!sh)
2403                 return 0;
2404
2405         if (grow_buffers(sh, gfp)) {
2406                 shrink_buffers(sh);
2407                 free_stripe(conf->slab_cache, sh);
2408                 return 0;
2409         }
2410         sh->hash_lock_index =
2411                 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2412         /* we just created an active stripe so... */
2413         atomic_inc(&conf->active_stripes);
2414
2415         raid5_release_stripe(sh);
2416         conf->max_nr_stripes++;
2417         return 1;
2418 }
2419
2420 static int grow_stripes(struct r5conf *conf, int num)
2421 {
2422         struct kmem_cache *sc;
2423         size_t namelen = sizeof(conf->cache_name[0]);
2424         int devs = max(conf->raid_disks, conf->previous_raid_disks);
2425
2426         if (conf->mddev->gendisk)
2427                 snprintf(conf->cache_name[0], namelen,
2428                         "raid%d-%s", conf->level, mdname(conf->mddev));
2429         else
2430                 snprintf(conf->cache_name[0], namelen,
2431                         "raid%d-%p", conf->level, conf->mddev);
2432         snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
2433
2434         conf->active_name = 0;
2435         sc = kmem_cache_create(conf->cache_name[conf->active_name],
2436                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2437                                0, 0, NULL);
2438         if (!sc)
2439                 return 1;
2440         conf->slab_cache = sc;
2441         conf->pool_size = devs;
2442         while (num--)
2443                 if (!grow_one_stripe(conf, GFP_KERNEL))
2444                         return 1;
2445
2446         return 0;
2447 }
2448
2449 /**
2450  * scribble_alloc - allocate percpu scribble buffer for required size
2451  *                  of the scribble region
2452  * @percpu: from for_each_present_cpu() of the caller
2453  * @num: total number of disks in the array
2454  * @cnt: scribble objs count for required size of the scribble region
2455  *
2456  * The scribble buffer size must be enough to contain:
2457  * 1/ a struct page pointer for each device in the array +2
2458  * 2/ room to convert each entry in (1) to its corresponding dma
2459  *    (dma_map_page()) or page (page_address()) address.
2460  *
2461  * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2462  * calculate over all devices (not just the data blocks), using zeros in place
2463  * of the P and Q blocks.
2464  */
2465 static int scribble_alloc(struct raid5_percpu *percpu,
2466                           int num, int cnt)
2467 {
2468         size_t obj_size =
2469                 sizeof(struct page *) * (num + 2) +
2470                 sizeof(addr_conv_t) * (num + 2) +
2471                 sizeof(unsigned int) * (num + 2);
2472         void *scribble;
2473
2474         /*
2475          * If here is in raid array suspend context, it is in memalloc noio
2476          * context as well, there is no potential recursive memory reclaim
2477          * I/Os with the GFP_KERNEL flag.
2478          */
2479         scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2480         if (!scribble)
2481                 return -ENOMEM;
2482
2483         kvfree(percpu->scribble);
2484
2485         percpu->scribble = scribble;
2486         percpu->scribble_obj_size = obj_size;
2487         return 0;
2488 }
2489
2490 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2491 {
2492         unsigned long cpu;
2493         int err = 0;
2494
2495         /*
2496          * Never shrink. And mddev_suspend() could deadlock if this is called
2497          * from raid5d. In that case, scribble_disks and scribble_sectors
2498          * should equal to new_disks and new_sectors
2499          */
2500         if (conf->scribble_disks >= new_disks &&
2501             conf->scribble_sectors >= new_sectors)
2502                 return 0;
2503         mddev_suspend(conf->mddev);
2504         cpus_read_lock();
2505
2506         for_each_present_cpu(cpu) {
2507                 struct raid5_percpu *percpu;
2508
2509                 percpu = per_cpu_ptr(conf->percpu, cpu);
2510                 err = scribble_alloc(percpu, new_disks,
2511                                      new_sectors / RAID5_STRIPE_SECTORS(conf));
2512                 if (err)
2513                         break;
2514         }
2515
2516         cpus_read_unlock();
2517         mddev_resume(conf->mddev);
2518         if (!err) {
2519                 conf->scribble_disks = new_disks;
2520                 conf->scribble_sectors = new_sectors;
2521         }
2522         return err;
2523 }
2524
2525 static int resize_stripes(struct r5conf *conf, int newsize)
2526 {
2527         /* Make all the stripes able to hold 'newsize' devices.
2528          * New slots in each stripe get 'page' set to a new page.
2529          *
2530          * This happens in stages:
2531          * 1/ create a new kmem_cache and allocate the required number of
2532          *    stripe_heads.
2533          * 2/ gather all the old stripe_heads and transfer the pages across
2534          *    to the new stripe_heads.  This will have the side effect of
2535          *    freezing the array as once all stripe_heads have been collected,
2536          *    no IO will be possible.  Old stripe heads are freed once their
2537          *    pages have been transferred over, and the old kmem_cache is
2538          *    freed when all stripes are done.
2539          * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
2540          *    we simple return a failure status - no need to clean anything up.
2541          * 4/ allocate new pages for the new slots in the new stripe_heads.
2542          *    If this fails, we don't bother trying the shrink the
2543          *    stripe_heads down again, we just leave them as they are.
2544          *    As each stripe_head is processed the new one is released into
2545          *    active service.
2546          *
2547          * Once step2 is started, we cannot afford to wait for a write,
2548          * so we use GFP_NOIO allocations.
2549          */
2550         struct stripe_head *osh, *nsh;
2551         LIST_HEAD(newstripes);
2552         struct disk_info *ndisks;
2553         int err = 0;
2554         struct kmem_cache *sc;
2555         int i;
2556         int hash, cnt;
2557
2558         md_allow_write(conf->mddev);
2559
2560         /* Step 1 */
2561         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2562                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2563                                0, 0, NULL);
2564         if (!sc)
2565                 return -ENOMEM;
2566
2567         /* Need to ensure auto-resizing doesn't interfere */
2568         mutex_lock(&conf->cache_size_mutex);
2569
2570         for (i = conf->max_nr_stripes; i; i--) {
2571                 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
2572                 if (!nsh)
2573                         break;
2574
2575                 list_add(&nsh->lru, &newstripes);
2576         }
2577         if (i) {
2578                 /* didn't get enough, give up */
2579                 while (!list_empty(&newstripes)) {
2580                         nsh = list_entry(newstripes.next, struct stripe_head, lru);
2581                         list_del(&nsh->lru);
2582                         free_stripe(sc, nsh);
2583                 }
2584                 kmem_cache_destroy(sc);
2585                 mutex_unlock(&conf->cache_size_mutex);
2586                 return -ENOMEM;
2587         }
2588         /* Step 2 - Must use GFP_NOIO now.
2589          * OK, we have enough stripes, start collecting inactive
2590          * stripes and copying them over
2591          */
2592         hash = 0;
2593         cnt = 0;
2594         list_for_each_entry(nsh, &newstripes, lru) {
2595                 lock_device_hash_lock(conf, hash);
2596                 wait_event_cmd(conf->wait_for_stripe,
2597                                     !list_empty(conf->inactive_list + hash),
2598                                     unlock_device_hash_lock(conf, hash),
2599                                     lock_device_hash_lock(conf, hash));
2600                 osh = get_free_stripe(conf, hash);
2601                 unlock_device_hash_lock(conf, hash);
2602
2603 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2604         for (i = 0; i < osh->nr_pages; i++) {
2605                 nsh->pages[i] = osh->pages[i];
2606                 osh->pages[i] = NULL;
2607         }
2608 #endif
2609                 for(i=0; i<conf->pool_size; i++) {
2610                         nsh->dev[i].page = osh->dev[i].page;
2611                         nsh->dev[i].orig_page = osh->dev[i].page;
2612                         nsh->dev[i].offset = osh->dev[i].offset;
2613                 }
2614                 nsh->hash_lock_index = hash;
2615                 free_stripe(conf->slab_cache, osh);
2616                 cnt++;
2617                 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2618                     !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2619                         hash++;
2620                         cnt = 0;
2621                 }
2622         }
2623         kmem_cache_destroy(conf->slab_cache);
2624
2625         /* Step 3.
2626          * At this point, we are holding all the stripes so the array
2627          * is completely stalled, so now is a good time to resize
2628          * conf->disks and the scribble region
2629          */
2630         ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO);
2631         if (ndisks) {
2632                 for (i = 0; i < conf->pool_size; i++)
2633                         ndisks[i] = conf->disks[i];
2634
2635                 for (i = conf->pool_size; i < newsize; i++) {
2636                         ndisks[i].extra_page = alloc_page(GFP_NOIO);
2637                         if (!ndisks[i].extra_page)
2638                                 err = -ENOMEM;
2639                 }
2640
2641                 if (err) {
2642                         for (i = conf->pool_size; i < newsize; i++)
2643                                 if (ndisks[i].extra_page)
2644                                         put_page(ndisks[i].extra_page);
2645                         kfree(ndisks);
2646                 } else {
2647                         kfree(conf->disks);
2648                         conf->disks = ndisks;
2649                 }
2650         } else
2651                 err = -ENOMEM;
2652
2653         conf->slab_cache = sc;
2654         conf->active_name = 1-conf->active_name;
2655
2656         /* Step 4, return new stripes to service */
2657         while(!list_empty(&newstripes)) {
2658                 nsh = list_entry(newstripes.next, struct stripe_head, lru);
2659                 list_del_init(&nsh->lru);
2660
2661 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2662                 for (i = 0; i < nsh->nr_pages; i++) {
2663                         if (nsh->pages[i])
2664                                 continue;
2665                         nsh->pages[i] = alloc_page(GFP_NOIO);
2666                         if (!nsh->pages[i])
2667                                 err = -ENOMEM;
2668                 }
2669
2670                 for (i = conf->raid_disks; i < newsize; i++) {
2671                         if (nsh->dev[i].page)
2672                                 continue;
2673                         nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2674                         nsh->dev[i].orig_page = nsh->dev[i].page;
2675                         nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2676                 }
2677 #else
2678                 for (i=conf->raid_disks; i < newsize; i++)
2679                         if (nsh->dev[i].page == NULL) {
2680                                 struct page *p = alloc_page(GFP_NOIO);
2681                                 nsh->dev[i].page = p;
2682                                 nsh->dev[i].orig_page = p;
2683                                 nsh->dev[i].offset = 0;
2684                                 if (!p)
2685                                         err = -ENOMEM;
2686                         }
2687 #endif
2688                 raid5_release_stripe(nsh);
2689         }
2690         /* critical section pass, GFP_NOIO no longer needed */
2691
2692         if (!err)
2693                 conf->pool_size = newsize;
2694         mutex_unlock(&conf->cache_size_mutex);
2695
2696         return err;
2697 }
2698
2699 static int drop_one_stripe(struct r5conf *conf)
2700 {
2701         struct stripe_head *sh;
2702         int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2703
2704         spin_lock_irq(conf->hash_locks + hash);
2705         sh = get_free_stripe(conf, hash);
2706         spin_unlock_irq(conf->hash_locks + hash);
2707         if (!sh)
2708                 return 0;
2709         BUG_ON(atomic_read(&sh->count));
2710         shrink_buffers(sh);
2711         free_stripe(conf->slab_cache, sh);
2712         atomic_dec(&conf->active_stripes);
2713         conf->max_nr_stripes--;
2714         return 1;
2715 }
2716
2717 static void shrink_stripes(struct r5conf *conf)
2718 {
2719         while (conf->max_nr_stripes &&
2720                drop_one_stripe(conf))
2721                 ;
2722
2723         kmem_cache_destroy(conf->slab_cache);
2724         conf->slab_cache = NULL;
2725 }
2726
2727 /*
2728  * This helper wraps rcu_dereference_protected() and can be used when
2729  * it is known that the nr_pending of the rdev is elevated.
2730  */
2731 static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
2732 {
2733         return rcu_dereference_protected(rdev,
2734                         atomic_read(&rcu_access_pointer(rdev)->nr_pending));
2735 }
2736
2737 /*
2738  * This helper wraps rcu_dereference_protected() and should be used
2739  * when it is known that the mddev_lock() is held. This is safe
2740  * seeing raid5_remove_disk() has the same lock held.
2741  */
2742 static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
2743                                          struct md_rdev __rcu *rdev)
2744 {
2745         return rcu_dereference_protected(rdev,
2746                         lockdep_is_held(&mddev->reconfig_mutex));
2747 }
2748
2749 static void raid5_end_read_request(struct bio * bi)
2750 {
2751         struct stripe_head *sh = bi->bi_private;
2752         struct r5conf *conf = sh->raid_conf;
2753         int disks = sh->disks, i;
2754         struct md_rdev *rdev = NULL;
2755         sector_t s;
2756
2757         for (i=0 ; i<disks; i++)
2758                 if (bi == &sh->dev[i].req)
2759                         break;
2760
2761         pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2762                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2763                 bi->bi_status);
2764         if (i == disks) {
2765                 BUG();
2766                 return;
2767         }
2768         if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2769                 /* If replacement finished while this request was outstanding,
2770                  * 'replacement' might be NULL already.
2771                  * In that case it moved down to 'rdev'.
2772                  * rdev is not removed until all requests are finished.
2773                  */
2774                 rdev = rdev_pend_deref(conf->disks[i].replacement);
2775         if (!rdev)
2776                 rdev = rdev_pend_deref(conf->disks[i].rdev);
2777
2778         if (use_new_offset(conf, sh))
2779                 s = sh->sector + rdev->new_data_offset;
2780         else
2781                 s = sh->sector + rdev->data_offset;
2782         if (!bi->bi_status) {
2783                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2784                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2785                         /* Note that this cannot happen on a
2786                          * replacement device.  We just fail those on
2787                          * any error
2788                          */
2789                         pr_info_ratelimited(
2790                                 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2791                                 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2792                                 (unsigned long long)s,
2793                                 rdev->bdev);
2794                         atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
2795                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2796                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2797                 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2798                         clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2799
2800                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
2801                         /*
2802                          * end read for a page in journal, this
2803                          * must be preparing for prexor in rmw
2804                          */
2805                         set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2806
2807                 if (atomic_read(&rdev->read_errors))
2808                         atomic_set(&rdev->read_errors, 0);
2809         } else {
2810                 int retry = 0;
2811                 int set_bad = 0;
2812
2813                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2814                 if (!(bi->bi_status == BLK_STS_PROTECTION))
2815                         atomic_inc(&rdev->read_errors);
2816                 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2817                         pr_warn_ratelimited(
2818                                 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2819                                 mdname(conf->mddev),
2820                                 (unsigned long long)s,
2821                                 rdev->bdev);
2822                 else if (conf->mddev->degraded >= conf->max_degraded) {
2823                         set_bad = 1;
2824                         pr_warn_ratelimited(
2825                                 "md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2826                                 mdname(conf->mddev),
2827                                 (unsigned long long)s,
2828                                 rdev->bdev);
2829                 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2830                         /* Oh, no!!! */
2831                         set_bad = 1;
2832                         pr_warn_ratelimited(
2833                                 "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2834                                 mdname(conf->mddev),
2835                                 (unsigned long long)s,
2836                                 rdev->bdev);
2837                 } else if (atomic_read(&rdev->read_errors)
2838                          > conf->max_nr_stripes) {
2839                         if (!test_bit(Faulty, &rdev->flags)) {
2840                                 pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2841                                     mdname(conf->mddev),
2842                                     atomic_read(&rdev->read_errors),
2843                                     conf->max_nr_stripes);
2844                                 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2845                                     mdname(conf->mddev), rdev->bdev);
2846                         }
2847                 } else
2848                         retry = 1;
2849                 if (set_bad && test_bit(In_sync, &rdev->flags)
2850                     && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2851                         retry = 1;
2852                 if (retry)
2853                         if (sh->qd_idx >= 0 && sh->pd_idx == i)
2854                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2855                         else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2856                                 set_bit(R5_ReadError, &sh->dev[i].flags);
2857                                 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2858                         } else
2859                                 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2860                 else {
2861                         clear_bit(R5_ReadError, &sh->dev[i].flags);
2862                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
2863                         if (!(set_bad
2864                               && test_bit(In_sync, &rdev->flags)
2865                               && rdev_set_badblocks(
2866                                       rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
2867                                 md_error(conf->mddev, rdev);
2868                 }
2869         }
2870         rdev_dec_pending(rdev, conf->mddev);
2871         bio_uninit(bi);
2872         clear_bit(R5_LOCKED, &sh->dev[i].flags);
2873         set_bit(STRIPE_HANDLE, &sh->state);
2874         raid5_release_stripe(sh);
2875 }
2876
2877 static void raid5_end_write_request(struct bio *bi)
2878 {
2879         struct stripe_head *sh = bi->bi_private;
2880         struct r5conf *conf = sh->raid_conf;
2881         int disks = sh->disks, i;
2882         struct md_rdev *rdev;
2883         sector_t first_bad;
2884         int bad_sectors;
2885         int replacement = 0;
2886
2887         for (i = 0 ; i < disks; i++) {
2888                 if (bi == &sh->dev[i].req) {
2889                         rdev = rdev_pend_deref(conf->disks[i].rdev);
2890                         break;
2891                 }
2892                 if (bi == &sh->dev[i].rreq) {
2893                         rdev = rdev_pend_deref(conf->disks[i].replacement);
2894                         if (rdev)
2895                                 replacement = 1;
2896                         else
2897                                 /* rdev was removed and 'replacement'
2898                                  * replaced it.  rdev is not removed
2899                                  * until all requests are finished.
2900                                  */
2901                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
2902                         break;
2903                 }
2904         }
2905         pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2906                 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
2907                 bi->bi_status);
2908         if (i == disks) {
2909                 BUG();
2910                 return;
2911         }
2912
2913         if (replacement) {
2914                 if (bi->bi_status)
2915                         md_error(conf->mddev, rdev);
2916                 else if (is_badblock(rdev, sh->sector,
2917                                      RAID5_STRIPE_SECTORS(conf),
2918                                      &first_bad, &bad_sectors))
2919                         set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2920         } else {
2921                 if (bi->bi_status) {
2922                         set_bit(STRIPE_DEGRADED, &sh->state);
2923                         set_bit(WriteErrorSeen, &rdev->flags);
2924                         set_bit(R5_WriteError, &sh->dev[i].flags);
2925                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
2926                                 set_bit(MD_RECOVERY_NEEDED,
2927                                         &rdev->mddev->recovery);
2928                 } else if (is_badblock(rdev, sh->sector,
2929                                        RAID5_STRIPE_SECTORS(conf),
2930                                        &first_bad, &bad_sectors)) {
2931                         set_bit(R5_MadeGood, &sh->dev[i].flags);
2932                         if (test_bit(R5_ReadError, &sh->dev[i].flags))
2933                                 /* That was a successful write so make
2934                                  * sure it looks like we already did
2935                                  * a re-write.
2936                                  */
2937                                 set_bit(R5_ReWrite, &sh->dev[i].flags);
2938                 }
2939         }
2940         rdev_dec_pending(rdev, conf->mddev);
2941
2942         if (sh->batch_head && bi->bi_status && !replacement)
2943                 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2944
2945         bio_uninit(bi);
2946         if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2947                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2948         set_bit(STRIPE_HANDLE, &sh->state);
2949
2950         if (sh->batch_head && sh != sh->batch_head)
2951                 raid5_release_stripe(sh->batch_head);
2952         raid5_release_stripe(sh);
2953 }
2954
2955 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2956 {
2957         struct r5conf *conf = mddev->private;
2958         unsigned long flags;
2959         pr_debug("raid456: error called\n");
2960
2961         pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2962                 mdname(mddev), rdev->bdev);
2963
2964         spin_lock_irqsave(&conf->device_lock, flags);
2965         set_bit(Faulty, &rdev->flags);
2966         clear_bit(In_sync, &rdev->flags);
2967         mddev->degraded = raid5_calc_degraded(conf);
2968
2969         if (has_failed(conf)) {
2970                 set_bit(MD_BROKEN, &conf->mddev->flags);
2971                 conf->recovery_disabled = mddev->recovery_disabled;
2972
2973                 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2974                         mdname(mddev), mddev->degraded, conf->raid_disks);
2975         } else {
2976                 pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2977                         mdname(mddev), conf->raid_disks - mddev->degraded);
2978         }
2979
2980         spin_unlock_irqrestore(&conf->device_lock, flags);
2981         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2982
2983         set_bit(Blocked, &rdev->flags);
2984         set_mask_bits(&mddev->sb_flags, 0,
2985                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2986         r5c_update_on_rdev_error(mddev, rdev);
2987 }
2988
2989 /*
2990  * Input: a 'big' sector number,
2991  * Output: index of the data and parity disk, and the sector # in them.
2992  */
2993 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2994                               int previous, int *dd_idx,
2995                               struct stripe_head *sh)
2996 {
2997         sector_t stripe, stripe2;
2998         sector_t chunk_number;
2999         unsigned int chunk_offset;
3000         int pd_idx, qd_idx;
3001         int ddf_layout = 0;
3002         sector_t new_sector;
3003         int algorithm = previous ? conf->prev_algo
3004                                  : conf->algorithm;
3005         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3006                                          : conf->chunk_sectors;
3007         int raid_disks = previous ? conf->previous_raid_disks
3008                                   : conf->raid_disks;
3009         int data_disks = raid_disks - conf->max_degraded;
3010
3011         /* First compute the information on this sector */
3012
3013         /*
3014          * Compute the chunk number and the sector offset inside the chunk
3015          */
3016         chunk_offset = sector_div(r_sector, sectors_per_chunk);
3017         chunk_number = r_sector;
3018
3019         /*
3020          * Compute the stripe number
3021          */
3022         stripe = chunk_number;
3023         *dd_idx = sector_div(stripe, data_disks);
3024         stripe2 = stripe;
3025         /*
3026          * Select the parity disk based on the user selected algorithm.
3027          */
3028         pd_idx = qd_idx = -1;
3029         switch(conf->level) {
3030         case 4:
3031                 pd_idx = data_disks;
3032                 break;
3033         case 5:
3034                 switch (algorithm) {
3035                 case ALGORITHM_LEFT_ASYMMETRIC:
3036                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
3037                         if (*dd_idx >= pd_idx)
3038                                 (*dd_idx)++;
3039                         break;
3040                 case ALGORITHM_RIGHT_ASYMMETRIC:
3041                         pd_idx = sector_div(stripe2, raid_disks);
3042                         if (*dd_idx >= pd_idx)
3043                                 (*dd_idx)++;
3044                         break;
3045                 case ALGORITHM_LEFT_SYMMETRIC:
3046                         pd_idx = data_disks - sector_div(stripe2, raid_disks);
3047                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3048                         break;
3049                 case ALGORITHM_RIGHT_SYMMETRIC:
3050                         pd_idx = sector_div(stripe2, raid_disks);
3051                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3052                         break;
3053                 case ALGORITHM_PARITY_0:
3054                         pd_idx = 0;
3055                         (*dd_idx)++;
3056                         break;
3057                 case ALGORITHM_PARITY_N:
3058                         pd_idx = data_disks;
3059                         break;
3060                 default:
3061                         BUG();
3062                 }
3063                 break;
3064         case 6:
3065
3066                 switch (algorithm) {
3067                 case ALGORITHM_LEFT_ASYMMETRIC:
3068                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3069                         qd_idx = pd_idx + 1;
3070                         if (pd_idx == raid_disks-1) {
3071                                 (*dd_idx)++;    /* Q D D D P */
3072                                 qd_idx = 0;
3073                         } else if (*dd_idx >= pd_idx)
3074                                 (*dd_idx) += 2; /* D D P Q D */
3075                         break;
3076                 case ALGORITHM_RIGHT_ASYMMETRIC:
3077                         pd_idx = sector_div(stripe2, raid_disks);
3078                         qd_idx = pd_idx + 1;
3079                         if (pd_idx == raid_disks-1) {
3080                                 (*dd_idx)++;    /* Q D D D P */
3081                                 qd_idx = 0;
3082                         } else if (*dd_idx >= pd_idx)
3083                                 (*dd_idx) += 2; /* D D P Q D */
3084                         break;
3085                 case ALGORITHM_LEFT_SYMMETRIC:
3086                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3087                         qd_idx = (pd_idx + 1) % raid_disks;
3088                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3089                         break;
3090                 case ALGORITHM_RIGHT_SYMMETRIC:
3091                         pd_idx = sector_div(stripe2, raid_disks);
3092                         qd_idx = (pd_idx + 1) % raid_disks;
3093                         *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
3094                         break;
3095
3096                 case ALGORITHM_PARITY_0:
3097                         pd_idx = 0;
3098                         qd_idx = 1;
3099                         (*dd_idx) += 2;
3100                         break;
3101                 case ALGORITHM_PARITY_N:
3102                         pd_idx = data_disks;
3103                         qd_idx = data_disks + 1;
3104                         break;
3105
3106                 case ALGORITHM_ROTATING_ZERO_RESTART:
3107                         /* Exactly the same as RIGHT_ASYMMETRIC, but or
3108                          * of blocks for computing Q is different.
3109                          */
3110                         pd_idx = sector_div(stripe2, raid_disks);
3111                         qd_idx = pd_idx + 1;
3112                         if (pd_idx == raid_disks-1) {
3113                                 (*dd_idx)++;    /* Q D D D P */
3114                                 qd_idx = 0;
3115                         } else if (*dd_idx >= pd_idx)
3116                                 (*dd_idx) += 2; /* D D P Q D */
3117                         ddf_layout = 1;
3118                         break;
3119
3120                 case ALGORITHM_ROTATING_N_RESTART:
3121                         /* Same a left_asymmetric, by first stripe is
3122                          * D D D P Q  rather than
3123                          * Q D D D P
3124                          */
3125                         stripe2 += 1;
3126                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3127                         qd_idx = pd_idx + 1;
3128                         if (pd_idx == raid_disks-1) {
3129                                 (*dd_idx)++;    /* Q D D D P */
3130                                 qd_idx = 0;
3131                         } else if (*dd_idx >= pd_idx)
3132                                 (*dd_idx) += 2; /* D D P Q D */
3133                         ddf_layout = 1;
3134                         break;
3135
3136                 case ALGORITHM_ROTATING_N_CONTINUE:
3137                         /* Same as left_symmetric but Q is before P */
3138                         pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
3139                         qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
3140                         *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
3141                         ddf_layout = 1;
3142                         break;
3143
3144                 case ALGORITHM_LEFT_ASYMMETRIC_6:
3145                         /* RAID5 left_asymmetric, with Q on last device */
3146                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3147                         if (*dd_idx >= pd_idx)
3148                                 (*dd_idx)++;
3149                         qd_idx = raid_disks - 1;
3150                         break;
3151
3152                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3153                         pd_idx = sector_div(stripe2, raid_disks-1);
3154                         if (*dd_idx >= pd_idx)
3155                                 (*dd_idx)++;
3156                         qd_idx = raid_disks - 1;
3157                         break;
3158
3159                 case ALGORITHM_LEFT_SYMMETRIC_6:
3160                         pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
3161                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3162                         qd_idx = raid_disks - 1;
3163                         break;
3164
3165                 case ALGORITHM_RIGHT_SYMMETRIC_6:
3166                         pd_idx = sector_div(stripe2, raid_disks-1);
3167                         *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
3168                         qd_idx = raid_disks - 1;
3169                         break;
3170
3171                 case ALGORITHM_PARITY_0_6:
3172                         pd_idx = 0;
3173                         (*dd_idx)++;
3174                         qd_idx = raid_disks - 1;
3175                         break;
3176
3177                 default:
3178                         BUG();
3179                 }
3180                 break;
3181         }
3182
3183         if (sh) {
3184                 sh->pd_idx = pd_idx;
3185                 sh->qd_idx = qd_idx;
3186                 sh->ddf_layout = ddf_layout;
3187         }
3188         /*
3189          * Finally, compute the new sector number
3190          */
3191         new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3192         return new_sector;
3193 }
3194
3195 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
3196 {
3197         struct r5conf *conf = sh->raid_conf;
3198         int raid_disks = sh->disks;
3199         int data_disks = raid_disks - conf->max_degraded;
3200         sector_t new_sector = sh->sector, check;
3201         int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3202                                          : conf->chunk_sectors;
3203         int algorithm = previous ? conf->prev_algo
3204                                  : conf->algorithm;
3205         sector_t stripe;
3206         int chunk_offset;
3207         sector_t chunk_number;
3208         int dummy1, dd_idx = i;
3209         sector_t r_sector;
3210         struct stripe_head sh2;
3211
3212         chunk_offset = sector_div(new_sector, sectors_per_chunk);
3213         stripe = new_sector;
3214
3215         if (i == sh->pd_idx)
3216                 return 0;
3217         switch(conf->level) {
3218         case 4: break;
3219         case 5:
3220                 switch (algorithm) {
3221                 case ALGORITHM_LEFT_ASYMMETRIC:
3222                 case ALGORITHM_RIGHT_ASYMMETRIC:
3223                         if (i > sh->pd_idx)
3224                                 i--;
3225                         break;
3226                 case ALGORITHM_LEFT_SYMMETRIC:
3227                 case ALGORITHM_RIGHT_SYMMETRIC:
3228                         if (i < sh->pd_idx)
3229                                 i += raid_disks;
3230                         i -= (sh->pd_idx + 1);
3231                         break;
3232                 case ALGORITHM_PARITY_0:
3233                         i -= 1;
3234                         break;
3235                 case ALGORITHM_PARITY_N:
3236                         break;
3237                 default:
3238                         BUG();
3239                 }
3240                 break;
3241         case 6:
3242                 if (i == sh->qd_idx)
3243                         return 0; /* It is the Q disk */
3244                 switch (algorithm) {
3245                 case ALGORITHM_LEFT_ASYMMETRIC:
3246                 case ALGORITHM_RIGHT_ASYMMETRIC:
3247                 case ALGORITHM_ROTATING_ZERO_RESTART:
3248                 case ALGORITHM_ROTATING_N_RESTART:
3249                         if (sh->pd_idx == raid_disks-1)
3250                                 i--;    /* Q D D D P */
3251                         else if (i > sh->pd_idx)
3252                                 i -= 2; /* D D P Q D */
3253                         break;
3254                 case ALGORITHM_LEFT_SYMMETRIC:
3255                 case ALGORITHM_RIGHT_SYMMETRIC:
3256                         if (sh->pd_idx == raid_disks-1)
3257                                 i--; /* Q D D D P */
3258                         else {
3259                                 /* D D P Q D */
3260                                 if (i < sh->pd_idx)
3261                                         i += raid_disks;
3262                                 i -= (sh->pd_idx + 2);
3263                         }
3264                         break;
3265                 case ALGORITHM_PARITY_0:
3266                         i -= 2;
3267                         break;
3268                 case ALGORITHM_PARITY_N:
3269                         break;
3270                 case ALGORITHM_ROTATING_N_CONTINUE:
3271                         /* Like left_symmetric, but P is before Q */
3272                         if (sh->pd_idx == 0)
3273                                 i--;    /* P D D D Q */
3274                         else {
3275                                 /* D D Q P D */
3276                                 if (i < sh->pd_idx)
3277                                         i += raid_disks;
3278                                 i -= (sh->pd_idx + 1);
3279                         }
3280                         break;
3281                 case ALGORITHM_LEFT_ASYMMETRIC_6:
3282                 case ALGORITHM_RIGHT_ASYMMETRIC_6:
3283                         if (i > sh->pd_idx)
3284                                 i--;
3285                         break;
3286                 case ALGORITHM_LEFT_SYMMETRIC_6:
3287                 case ALGORITHM_RIGHT_SYMMETRIC_6:
3288                         if (i < sh->pd_idx)
3289                                 i += data_disks + 1;
3290                         i -= (sh->pd_idx + 1);
3291                         break;
3292                 case ALGORITHM_PARITY_0_6:
3293                         i -= 1;
3294                         break;
3295                 default:
3296                         BUG();
3297                 }
3298                 break;
3299         }
3300
3301         chunk_number = stripe * data_disks + i;
3302         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3303
3304         check = raid5_compute_sector(conf, r_sector,
3305                                      previous, &dummy1, &sh2);
3306         if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
3307                 || sh2.qd_idx != sh->qd_idx) {
3308                 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3309                         mdname(conf->mddev));
3310                 return 0;
3311         }
3312         return r_sector;
3313 }
3314
3315 /*
3316  * There are cases where we want handle_stripe_dirtying() and
3317  * schedule_reconstruction() to delay towrite to some dev of a stripe.
3318  *
3319  * This function checks whether we want to delay the towrite. Specifically,
3320  * we delay the towrite when:
3321  *
3322  *   1. degraded stripe has a non-overwrite to the missing dev, AND this
3323  *      stripe has data in journal (for other devices).
3324  *
3325  *      In this case, when reading data for the non-overwrite dev, it is
3326  *      necessary to handle complex rmw of write back cache (prexor with
3327  *      orig_page, and xor with page). To keep read path simple, we would
3328  *      like to flush data in journal to RAID disks first, so complex rmw
3329  *      is handled in the write patch (handle_stripe_dirtying).
3330  *
3331  *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
3332  *
3333  *      It is important to be able to flush all stripes in raid5-cache.
3334  *      Therefore, we need reserve some space on the journal device for
3335  *      these flushes. If flush operation includes pending writes to the
3336  *      stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
3337  *      for the flush out. If we exclude these pending writes from flush
3338  *      operation, we only need (conf->max_degraded + 1) pages per stripe.
3339  *      Therefore, excluding pending writes in these cases enables more
3340  *      efficient use of the journal device.
3341  *
3342  *      Note: To make sure the stripe makes progress, we only delay
3343  *      towrite for stripes with data already in journal (injournal > 0).
3344  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3345  *      no_space_stripes list.
3346  *
3347  *   3. during journal failure
3348  *      In journal failure, we try to flush all cached data to raid disks
3349  *      based on data in stripe cache. The array is read-only to upper
3350  *      layers, so we would skip all pending writes.
3351  *
3352  */
3353 static inline bool delay_towrite(struct r5conf *conf,
3354                                  struct r5dev *dev,
3355                                  struct stripe_head_state *s)
3356 {
3357         /* case 1 above */
3358         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3359             !test_bit(R5_Insync, &dev->flags) && s->injournal)
3360                 return true;
3361         /* case 2 above */
3362         if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3363             s->injournal > 0)
3364                 return true;
3365         /* case 3 above */
3366         if (s->log_failed && s->injournal)
3367                 return true;
3368         return false;
3369 }
3370
3371 static void
3372 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
3373                          int rcw, int expand)
3374 {
3375         int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3376         struct r5conf *conf = sh->raid_conf;
3377         int level = conf->level;
3378
3379         if (rcw) {
3380                 /*
3381                  * In some cases, handle_stripe_dirtying initially decided to
3382                  * run rmw and allocates extra page for prexor. However, rcw is
3383                  * cheaper later on. We need to free the extra page now,
3384                  * because we won't be able to do that in ops_complete_prexor().
3385                  */
3386                 r5c_release_extra_page(sh);
3387
3388                 for (i = disks; i--; ) {
3389                         struct r5dev *dev = &sh->dev[i];
3390
3391                         if (dev->towrite && !delay_towrite(conf, dev, s)) {
3392                                 set_bit(R5_LOCKED, &dev->flags);
3393                                 set_bit(R5_Wantdrain, &dev->flags);
3394                                 if (!expand)
3395                                         clear_bit(R5_UPTODATE, &dev->flags);
3396                                 s->locked++;
3397                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3398                                 set_bit(R5_LOCKED, &dev->flags);
3399                                 s->locked++;
3400                         }
3401                 }
3402                 /* if we are not expanding this is a proper write request, and
3403                  * there will be bios with new data to be drained into the
3404                  * stripe cache
3405                  */
3406                 if (!expand) {
3407                         if (!s->locked)
3408                                 /* False alarm, nothing to do */
3409                                 return;
3410                         sh->reconstruct_state = reconstruct_state_drain_run;
3411                         set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3412                 } else
3413                         sh->reconstruct_state = reconstruct_state_run;
3414
3415                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3416
3417                 if (s->locked + conf->max_degraded == disks)
3418                         if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
3419                                 atomic_inc(&conf->pending_full_writes);
3420         } else {
3421                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
3422                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3423                 BUG_ON(level == 6 &&
3424                         (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
3425                            test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3426
3427                 for (i = disks; i--; ) {
3428                         struct r5dev *dev = &sh->dev[i];
3429                         if (i == pd_idx || i == qd_idx)
3430                                 continue;
3431
3432                         if (dev->towrite &&
3433                             (test_bit(R5_UPTODATE, &dev->flags) ||
3434                              test_bit(R5_Wantcompute, &dev->flags))) {
3435                                 set_bit(R5_Wantdrain, &dev->flags);
3436                                 set_bit(R5_LOCKED, &dev->flags);
3437                                 clear_bit(R5_UPTODATE, &dev->flags);
3438                                 s->locked++;
3439                         } else if (test_bit(R5_InJournal, &dev->flags)) {
3440                                 set_bit(R5_LOCKED, &dev->flags);
3441                                 s->locked++;
3442                         }
3443                 }
3444                 if (!s->locked)
3445                         /* False alarm - nothing to do */
3446                         return;
3447                 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3448                 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3449                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3450                 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3451         }
3452
3453         /* keep the parity disk(s) locked while asynchronous operations
3454          * are in flight
3455          */
3456         set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3457         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3458         s->locked++;
3459
3460         if (level == 6) {
3461                 int qd_idx = sh->qd_idx;
3462                 struct r5dev *dev = &sh->dev[qd_idx];
3463
3464                 set_bit(R5_LOCKED, &dev->flags);
3465                 clear_bit(R5_UPTODATE, &dev->flags);
3466                 s->locked++;
3467         }
3468
3469         if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
3470             test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3471             !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3472             test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3473                 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
3474
3475         pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3476                 __func__, (unsigned long long)sh->sector,
3477                 s->locked, s->ops_request);
3478 }
3479
3480 static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
3481                                 int dd_idx, int forwrite)
3482 {
3483         struct r5conf *conf = sh->raid_conf;
3484         struct bio **bip;
3485
3486         pr_debug("checking bi b#%llu to stripe s#%llu\n",
3487                  bi->bi_iter.bi_sector, sh->sector);
3488
3489         /* Don't allow new IO added to stripes in batch list */
3490         if (sh->batch_head)
3491                 return true;
3492
3493         if (forwrite)
3494                 bip = &sh->dev[dd_idx].towrite;
3495         else
3496                 bip = &sh->dev[dd_idx].toread;
3497
3498         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3499                 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3500                         return true;
3501                 bip = &(*bip)->bi_next;
3502         }
3503
3504         if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3505                 return true;
3506
3507         if (forwrite && raid5_has_ppl(conf)) {
3508                 /*
3509                  * With PPL only writes to consecutive data chunks within a
3510                  * stripe are allowed because for a single stripe_head we can
3511                  * only have one PPL entry at a time, which describes one data
3512                  * range. Not really an overlap, but wait_for_overlap can be
3513                  * used to handle this.
3514                  */
3515                 sector_t sector;
3516                 sector_t first = 0;
3517                 sector_t last = 0;
3518                 int count = 0;
3519                 int i;
3520
3521                 for (i = 0; i < sh->disks; i++) {
3522                         if (i != sh->pd_idx &&
3523                             (i == dd_idx || sh->dev[i].towrite)) {
3524                                 sector = sh->dev[i].sector;
3525                                 if (count == 0 || sector < first)
3526                                         first = sector;
3527                                 if (sector > last)
3528                                         last = sector;
3529                                 count++;
3530                         }
3531                 }
3532
3533                 if (first + conf->chunk_sectors * (count - 1) != last)
3534                         return true;
3535         }
3536
3537         return false;
3538 }
3539
3540 static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3541                              int dd_idx, int forwrite, int previous)
3542 {
3543         struct r5conf *conf = sh->raid_conf;
3544         struct bio **bip;
3545         int firstwrite = 0;
3546
3547         if (forwrite) {
3548                 bip = &sh->dev[dd_idx].towrite;
3549                 if (!*bip)
3550                         firstwrite = 1;
3551         } else {
3552                 bip = &sh->dev[dd_idx].toread;
3553         }
3554
3555         while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3556                 bip = &(*bip)->bi_next;
3557
3558         if (!forwrite || previous)
3559                 clear_bit(STRIPE_BATCH_READY, &sh->state);
3560
3561         BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3562         if (*bip)
3563                 bi->bi_next = *bip;
3564         *bip = bi;
3565         bio_inc_remaining(bi);
3566         md_write_inc(conf->mddev, bi);
3567
3568         if (forwrite) {
3569                 /* check if page is covered */
3570                 sector_t sector = sh->dev[dd_idx].sector;
3571                 for (bi=sh->dev[dd_idx].towrite;
3572                      sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3573                              bi && bi->bi_iter.bi_sector <= sector;
3574                      bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
3575                         if (bio_end_sector(bi) >= sector)
3576                                 sector = bio_end_sector(bi);
3577                 }
3578                 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3579                         if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3580                                 sh->overwrite_disks++;
3581         }
3582
3583         pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3584                  (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3585                  sh->dev[dd_idx].sector);
3586
3587         if (conf->mddev->bitmap && firstwrite) {
3588                 /* Cannot hold spinlock over bitmap_startwrite,
3589                  * but must ensure this isn't added to a batch until
3590                  * we have added to the bitmap and set bm_seq.
3591                  * So set STRIPE_BITMAP_PENDING to prevent
3592                  * batching.
3593                  * If multiple __add_stripe_bio() calls race here they
3594                  * much all set STRIPE_BITMAP_PENDING.  So only the first one
3595                  * to complete "bitmap_startwrite" gets to set
3596                  * STRIPE_BIT_DELAY.  This is important as once a stripe
3597                  * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3598                  * any more.
3599                  */
3600                 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3601                 spin_unlock_irq(&sh->stripe_lock);
3602                 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3603                                      RAID5_STRIPE_SECTORS(conf), 0);
3604                 spin_lock_irq(&sh->stripe_lock);
3605                 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3606                 if (!sh->batch_head) {
3607                         sh->bm_seq = conf->seq_flush+1;
3608                         set_bit(STRIPE_BIT_DELAY, &sh->state);
3609                 }
3610         }
3611 }
3612
3613 /*
3614  * Each stripe/dev can have one or more bios attached.
3615  * toread/towrite point to the first in a chain.
3616  * The bi_next chain must be in order.
3617  */
3618 static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi,
3619                            int dd_idx, int forwrite, int previous)
3620 {
3621         spin_lock_irq(&sh->stripe_lock);
3622
3623         if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3624                 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3625                 spin_unlock_irq(&sh->stripe_lock);
3626                 return false;
3627         }
3628
3629         __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3630         spin_unlock_irq(&sh->stripe_lock);
3631         return true;
3632 }
3633
3634 static void end_reshape(struct r5conf *conf);
3635
3636 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3637                             struct stripe_head *sh)
3638 {
3639         int sectors_per_chunk =
3640                 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3641         int dd_idx;
3642         int chunk_offset = sector_div(stripe, sectors_per_chunk);
3643         int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3644
3645         raid5_compute_sector(conf,
3646                              stripe * (disks - conf->max_degraded)
3647                              *sectors_per_chunk + chunk_offset,
3648                              previous,
3649                              &dd_idx, sh);
3650 }
3651
3652 static void
3653 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3654                      struct stripe_head_state *s, int disks)
3655 {
3656         int i;
3657         BUG_ON(sh->batch_head);
3658         for (i = disks; i--; ) {
3659                 struct bio *bi;
3660                 int bitmap_end = 0;
3661
3662                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3663                         struct md_rdev *rdev;
3664                         rcu_read_lock();
3665                         rdev = rcu_dereference(conf->disks[i].rdev);
3666                         if (rdev && test_bit(In_sync, &rdev->flags) &&
3667                             !test_bit(Faulty, &rdev->flags))
3668                                 atomic_inc(&rdev->nr_pending);
3669                         else
3670                                 rdev = NULL;
3671                         rcu_read_unlock();
3672                         if (rdev) {
3673                                 if (!rdev_set_badblocks(
3674                                             rdev,
3675                                             sh->sector,
3676                                             RAID5_STRIPE_SECTORS(conf), 0))
3677                                         md_error(conf->mddev, rdev);
3678                                 rdev_dec_pending(rdev, conf->mddev);
3679                         }
3680                 }
3681                 spin_lock_irq(&sh->stripe_lock);
3682                 /* fail all writes first */
3683                 bi = sh->dev[i].towrite;
3684                 sh->dev[i].towrite = NULL;
3685                 sh->overwrite_disks = 0;
3686                 spin_unlock_irq(&sh->stripe_lock);
3687                 if (bi)
3688                         bitmap_end = 1;
3689
3690                 log_stripe_write_finished(sh);
3691
3692                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3693                         wake_up(&conf->wait_for_overlap);
3694
3695                 while (bi && bi->bi_iter.bi_sector <
3696                         sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3697                         struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
3698
3699                         md_write_end(conf->mddev);
3700                         bio_io_error(bi);
3701                         bi = nextbi;
3702                 }
3703                 if (bitmap_end)
3704                         md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3705                                            RAID5_STRIPE_SECTORS(conf), 0, 0);
3706                 bitmap_end = 0;
3707                 /* and fail all 'written' */
3708                 bi = sh->dev[i].written;
3709                 sh->dev[i].written = NULL;
3710                 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3711                         WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3712                         sh->dev[i].page = sh->dev[i].orig_page;
3713                 }
3714
3715                 if (bi) bitmap_end = 1;
3716                 while (bi && bi->bi_iter.bi_sector <
3717                        sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3718                         struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
3719
3720                         md_write_end(conf->mddev);
3721                         bio_io_error(bi);
3722                         bi = bi2;
3723                 }
3724
3725                 /* fail any reads if this device is non-operational and
3726                  * the data has not reached the cache yet.
3727                  */
3728                 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3729                     s->failed > conf->max_degraded &&
3730                     (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3731                       test_bit(R5_ReadError, &sh->dev[i].flags))) {
3732                         spin_lock_irq(&sh->stripe_lock);
3733                         bi = sh->dev[i].toread;
3734                         sh->dev[i].toread = NULL;
3735                         spin_unlock_irq(&sh->stripe_lock);
3736                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3737                                 wake_up(&conf->wait_for_overlap);
3738                         if (bi)
3739                                 s->to_read--;
3740                         while (bi && bi->bi_iter.bi_sector <
3741                                sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3742                                 struct bio *nextbi =
3743                                         r5_next_bio(conf, bi, sh->dev[i].sector);
3744
3745                                 bio_io_error(bi);
3746                                 bi = nextbi;
3747                         }
3748                 }
3749                 if (bitmap_end)
3750                         md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3751                                            RAID5_STRIPE_SECTORS(conf), 0, 0);
3752                 /* If we were in the middle of a write the parity block might
3753                  * still be locked - so just clear all R5_LOCKED flags
3754                  */
3755                 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3756         }
3757         s->to_write = 0;
3758         s->written = 0;
3759
3760         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3761                 if (atomic_dec_and_test(&conf->pending_full_writes))
3762                         md_wakeup_thread(conf->mddev->thread);
3763 }
3764
3765 static void
3766 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3767                    struct stripe_head_state *s)
3768 {
3769         int abort = 0;
3770         int i;
3771
3772         BUG_ON(sh->batch_head);
3773         clear_bit(STRIPE_SYNCING, &sh->state);
3774         if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3775                 wake_up(&conf->wait_for_overlap);
3776         s->syncing = 0;
3777         s->replacing = 0;
3778         /* There is nothing more to do for sync/check/repair.
3779          * Don't even need to abort as that is handled elsewhere
3780          * if needed, and not always wanted e.g. if there is a known
3781          * bad block here.
3782          * For recover/replace we need to record a bad block on all
3783          * non-sync devices, or abort the recovery
3784          */
3785         if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3786                 /* During recovery devices cannot be removed, so
3787                  * locking and refcounting of rdevs is not needed
3788                  */
3789                 rcu_read_lock();
3790                 for (i = 0; i < conf->raid_disks; i++) {
3791                         struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3792                         if (rdev
3793                             && !test_bit(Faulty, &rdev->flags)
3794                             && !test_bit(In_sync, &rdev->flags)
3795                             && !rdev_set_badblocks(rdev, sh->sector,
3796                                                    RAID5_STRIPE_SECTORS(conf), 0))
3797                                 abort = 1;
3798                         rdev = rcu_dereference(conf->disks[i].replacement);
3799                         if (rdev
3800                             && !test_bit(Faulty, &rdev->flags)
3801                             && !test_bit(In_sync, &rdev->flags)
3802                             && !rdev_set_badblocks(rdev, sh->sector,
3803                                                    RAID5_STRIPE_SECTORS(conf), 0))
3804                                 abort = 1;
3805                 }
3806                 rcu_read_unlock();
3807                 if (abort)
3808                         conf->recovery_disabled =
3809                                 conf->mddev->recovery_disabled;
3810         }
3811         md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
3812 }
3813
3814 static int want_replace(struct stripe_head *sh, int disk_idx)
3815 {
3816         struct md_rdev *rdev;
3817         int rv = 0;
3818
3819         rcu_read_lock();
3820         rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3821         if (rdev
3822             && !test_bit(Faulty, &rdev->flags)
3823             && !test_bit(In_sync, &rdev->flags)
3824             && (rdev->recovery_offset <= sh->sector
3825                 || rdev->mddev->recovery_cp <= sh->sector))
3826                 rv = 1;
3827         rcu_read_unlock();
3828         return rv;
3829 }
3830
3831 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3832                            int disk_idx, int disks)
3833 {
3834         struct r5dev *dev = &sh->dev[disk_idx];
3835         struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3836                                   &sh->dev[s->failed_num[1]] };
3837         int i;
3838         bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3839
3840
3841         if (test_bit(R5_LOCKED, &dev->flags) ||
3842             test_bit(R5_UPTODATE, &dev->flags))
3843                 /* No point reading this as we already have it or have
3844                  * decided to get it.
3845                  */
3846                 return 0;
3847
3848         if (dev->toread ||
3849             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3850                 /* We need this block to directly satisfy a request */
3851                 return 1;
3852
3853         if (s->syncing || s->expanding ||
3854             (s->replacing && want_replace(sh, disk_idx)))
3855                 /* When syncing, or expanding we read everything.
3856                  * When replacing, we need the replaced block.
3857                  */
3858                 return 1;
3859
3860         if ((s->failed >= 1 && fdev[0]->toread) ||
3861             (s->failed >= 2 && fdev[1]->toread))
3862                 /* If we want to read from a failed device, then
3863                  * we need to actually read every other device.
3864                  */
3865                 return 1;
3866
3867         /* Sometimes neither read-modify-write nor reconstruct-write
3868          * cycles can work.  In those cases we read every block we
3869          * can.  Then the parity-update is certain to have enough to
3870          * work with.
3871          * This can only be a problem when we need to write something,
3872          * and some device has failed.  If either of those tests
3873          * fail we need look no further.
3874          */
3875         if (!s->failed || !s->to_write)
3876                 return 0;
3877
3878         if (test_bit(R5_Insync, &dev->flags) &&
3879             !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3880                 /* Pre-reads at not permitted until after short delay
3881                  * to gather multiple requests.  However if this
3882                  * device is no Insync, the block could only be computed
3883                  * and there is no need to delay that.
3884                  */
3885                 return 0;
3886
3887         for (i = 0; i < s->failed && i < 2; i++) {
3888                 if (fdev[i]->towrite &&
3889                     !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3890                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3891                         /* If we have a partial write to a failed
3892                          * device, then we will need to reconstruct
3893                          * the content of that device, so all other
3894                          * devices must be read.
3895                          */
3896                         return 1;
3897
3898                 if (s->failed >= 2 &&
3899                     (fdev[i]->towrite ||
3900                      s->failed_num[i] == sh->pd_idx ||
3901                      s->failed_num[i] == sh->qd_idx) &&
3902                     !test_bit(R5_UPTODATE, &fdev[i]->flags))
3903                         /* In max degraded raid6, If the failed disk is P, Q,
3904                          * or we want to read the failed disk, we need to do
3905                          * reconstruct-write.
3906                          */
3907                         force_rcw = true;
3908         }
3909
3910         /* If we are forced to do a reconstruct-write, because parity
3911          * cannot be trusted and we are currently recovering it, there
3912          * is extra need to be careful.
3913          * If one of the devices that we would need to read, because
3914          * it is not being overwritten (and maybe not written at all)
3915          * is missing/faulty, then we need to read everything we can.
3916          */
3917         if (!force_rcw &&
3918             sh->sector < sh->raid_conf->mddev->recovery_cp)
3919                 /* reconstruct-write isn't being forced */
3920                 return 0;
3921         for (i = 0; i < s->failed && i < 2; i++) {
3922                 if (s->failed_num[i] != sh->pd_idx &&
3923                     s->failed_num[i] != sh->qd_idx &&
3924                     !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3925                     !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3926                         return 1;
3927         }
3928
3929         return 0;
3930 }
3931
3932 /* fetch_block - checks the given member device to see if its data needs
3933  * to be read or computed to satisfy a request.
3934  *
3935  * Returns 1 when no more member devices need to be checked, otherwise returns
3936  * 0 to tell the loop in handle_stripe_fill to continue
3937  */
3938 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3939                        int disk_idx, int disks)
3940 {
3941         struct r5dev *dev = &sh->dev[disk_idx];
3942
3943         /* is the data in this block needed, and can we get it? */
3944         if (need_this_block(sh, s, disk_idx, disks)) {
3945                 /* we would like to get this block, possibly by computing it,
3946                  * otherwise read it if the backing disk is insync
3947                  */
3948                 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3949                 BUG_ON(test_bit(R5_Wantread, &dev->flags));
3950                 BUG_ON(sh->batch_head);
3951
3952                 /*
3953                  * In the raid6 case if the only non-uptodate disk is P
3954                  * then we already trusted P to compute the other failed
3955                  * drives. It is safe to compute rather than re-read P.
3956                  * In other cases we only compute blocks from failed
3957                  * devices, otherwise check/repair might fail to detect
3958                  * a real inconsistency.
3959                  */
3960
3961                 if ((s->uptodate == disks - 1) &&
3962                     ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
3963                     (s->failed && (disk_idx == s->failed_num[0] ||
3964                                    disk_idx == s->failed_num[1])))) {
3965                         /* have disk failed, and we're requested to fetch it;
3966                          * do compute it
3967                          */
3968                         pr_debug("Computing stripe %llu block %d\n",
3969                                (unsigned long long)sh->sector, disk_idx);
3970                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3971                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3972                         set_bit(R5_Wantcompute, &dev->flags);
3973                         sh->ops.target = disk_idx;
3974                         sh->ops.target2 = -1; /* no 2nd target */
3975                         s->req_compute = 1;
3976                         /* Careful: from this point on 'uptodate' is in the eye
3977                          * of raid_run_ops which services 'compute' operations
3978                          * before writes. R5_Wantcompute flags a block that will
3979                          * be R5_UPTODATE by the time it is needed for a
3980                          * subsequent operation.
3981                          */
3982                         s->uptodate++;
3983                         return 1;
3984                 } else if (s->uptodate == disks-2 && s->failed >= 2) {
3985                         /* Computing 2-failure is *very* expensive; only
3986                          * do it if failed >= 2
3987                          */
3988                         int other;
3989                         for (other = disks; other--; ) {
3990                                 if (other == disk_idx)
3991                                         continue;
3992                                 if (!test_bit(R5_UPTODATE,
3993                                       &sh->dev[other].flags))
3994                                         break;
3995                         }
3996                         BUG_ON(other < 0);
3997                         pr_debug("Computing stripe %llu blocks %d,%d\n",
3998                                (unsigned long long)sh->sector,
3999                                disk_idx, other);
4000                         set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4001                         set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4002                         set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
4003                         set_bit(R5_Wantcompute, &sh->dev[other].flags);
4004                         sh->ops.target = disk_idx;
4005                         sh->ops.target2 = other;
4006                         s->uptodate += 2;
4007                         s->req_compute = 1;
4008                         return 1;
4009                 } else if (test_bit(R5_Insync, &dev->flags)) {
4010                         set_bit(R5_LOCKED, &dev->flags);
4011                         set_bit(R5_Wantread, &dev->flags);
4012                         s->locked++;
4013                         pr_debug("Reading block %d (sync=%d)\n",
4014                                 disk_idx, s->syncing);
4015                 }
4016         }
4017
4018         return 0;
4019 }
4020
4021 /*
4022  * handle_stripe_fill - read or compute data to satisfy pending requests.
4023  */
4024 static void handle_stripe_fill(struct stripe_head *sh,
4025                                struct stripe_head_state *s,
4026                                int disks)
4027 {
4028         int i;
4029
4030         /* look for blocks to read/compute, skip this if a compute
4031          * is already in flight, or if the stripe contents are in the
4032          * midst of changing due to a write
4033          */
4034         if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
4035             !sh->reconstruct_state) {
4036
4037                 /*
4038                  * For degraded stripe with data in journal, do not handle
4039                  * read requests yet, instead, flush the stripe to raid
4040                  * disks first, this avoids handling complex rmw of write
4041                  * back cache (prexor with orig_page, and then xor with
4042                  * page) in the read path
4043                  */
4044                 if (s->to_read && s->injournal && s->failed) {
4045                         if (test_bit(STRIPE_R5C_CACHING, &sh->state))
4046                                 r5c_make_stripe_write_out(sh);
4047                         goto out;
4048                 }
4049
4050                 for (i = disks; i--; )
4051                         if (fetch_block(sh, s, i, disks))
4052                                 break;
4053         }
4054 out:
4055         set_bit(STRIPE_HANDLE, &sh->state);
4056 }
4057
4058 static void break_stripe_batch_list(struct stripe_head *head_sh,
4059                                     unsigned long handle_flags);
4060 /* handle_stripe_clean_event
4061  * any written block on an uptodate or failed drive can be returned.
4062  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
4063  * never LOCKED, so we don't need to test 'failed' directly.
4064  */
4065 static void handle_stripe_clean_event(struct r5conf *conf,
4066         struct stripe_head *sh, int disks)
4067 {
4068         int i;
4069         struct r5dev *dev;
4070         int discard_pending = 0;
4071         struct stripe_head *head_sh = sh;
4072         bool do_endio = false;
4073
4074         for (i = disks; i--; )
4075                 if (sh->dev[i].written) {
4076                         dev = &sh->dev[i];
4077                         if (!test_bit(R5_LOCKED, &dev->flags) &&
4078                             (test_bit(R5_UPTODATE, &dev->flags) ||
4079                              test_bit(R5_Discard, &dev->flags) ||
4080                              test_bit(R5_SkipCopy, &dev->flags))) {
4081                                 /* We can return any write requests */
4082                                 struct bio *wbi, *wbi2;
4083                                 pr_debug("Return write for disc %d\n", i);
4084                                 if (test_and_clear_bit(R5_Discard, &dev->flags))
4085                                         clear_bit(R5_UPTODATE, &dev->flags);
4086                                 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
4087                                         WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4088                                 }
4089                                 do_endio = true;
4090
4091 returnbi:
4092                                 dev->page = dev->orig_page;
4093                                 wbi = dev->written;
4094                                 dev->written = NULL;
4095                                 while (wbi && wbi->bi_iter.bi_sector <
4096                                         dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4097                                         wbi2 = r5_next_bio(conf, wbi, dev->sector);
4098                                         md_write_end(conf->mddev);
4099                                         bio_endio(wbi);
4100                                         wbi = wbi2;
4101                                 }
4102                                 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
4103                                                    RAID5_STRIPE_SECTORS(conf),
4104                                                    !test_bit(STRIPE_DEGRADED, &sh->state),
4105                                                    0);
4106                                 if (head_sh->batch_head) {
4107                                         sh = list_first_entry(&sh->batch_list,
4108                                                               struct stripe_head,
4109                                                               batch_list);
4110                                         if (sh != head_sh) {
4111                                                 dev = &sh->dev[i];
4112                                                 goto returnbi;
4113                                         }
4114                                 }
4115                                 sh = head_sh;
4116                                 dev = &sh->dev[i];
4117                         } else if (test_bit(R5_Discard, &dev->flags))
4118                                 discard_pending = 1;
4119                 }
4120
4121         log_stripe_write_finished(sh);
4122
4123         if (!discard_pending &&
4124             test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4125                 int hash;
4126                 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
4127                 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4128                 if (sh->qd_idx >= 0) {
4129                         clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
4130                         clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
4131                 }
4132                 /* now that discard is done we can proceed with any sync */
4133                 clear_bit(STRIPE_DISCARD, &sh->state);
4134                 /*
4135                  * SCSI discard will change some bio fields and the stripe has
4136                  * no updated data, so remove it from hash list and the stripe
4137                  * will be reinitialized
4138                  */
4139 unhash:
4140                 hash = sh->hash_lock_index;
4141                 spin_lock_irq(conf->hash_locks + hash);
4142                 remove_hash(sh);
4143                 spin_unlock_irq(conf->hash_locks + hash);
4144                 if (head_sh->batch_head) {
4145                         sh = list_first_entry(&sh->batch_list,
4146                                               struct stripe_head, batch_list);
4147                         if (sh != head_sh)
4148                                         goto unhash;
4149                 }
4150                 sh = head_sh;
4151
4152                 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4153                         set_bit(STRIPE_HANDLE, &sh->state);
4154
4155         }
4156
4157         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
4158                 if (atomic_dec_and_test(&conf->pending_full_writes))
4159                         md_wakeup_thread(conf->mddev->thread);
4160
4161         if (head_sh->batch_head && do_endio)
4162                 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4163 }
4164
4165 /*
4166  * For RMW in write back cache, we need extra page in prexor to store the
4167  * old data. This page is stored in dev->orig_page.
4168  *
4169  * This function checks whether we have data for prexor. The exact logic
4170  * is:
4171  *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
4172  */
4173 static inline bool uptodate_for_rmw(struct r5dev *dev)
4174 {
4175         return (test_bit(R5_UPTODATE, &dev->flags)) &&
4176                 (!test_bit(R5_InJournal, &dev->flags) ||
4177                  test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4178 }
4179
4180 static int handle_stripe_dirtying(struct r5conf *conf,
4181                                   struct stripe_head *sh,
4182                                   struct stripe_head_state *s,
4183                                   int disks)
4184 {
4185         int rmw = 0, rcw = 0, i;
4186         sector_t recovery_cp = conf->mddev->recovery_cp;
4187
4188         /* Check whether resync is now happening or should start.
4189          * If yes, then the array is dirty (after unclean shutdown or
4190          * initial creation), so parity in some stripes might be inconsistent.
4191          * In this case, we need to always do reconstruct-write, to ensure
4192          * that in case of drive failure or read-error correction, we
4193          * generate correct data from the parity.
4194          */
4195         if (conf->rmw_level == PARITY_DISABLE_RMW ||
4196             (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4197              s->failed == 0)) {
4198                 /* Calculate the real rcw later - for now make it
4199                  * look like rcw is cheaper
4200                  */
4201                 rcw = 1; rmw = 2;
4202                 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4203                          conf->rmw_level, (unsigned long long)recovery_cp,
4204                          (unsigned long long)sh->sector);
4205         } else for (i = disks; i--; ) {
4206                 /* would I have to read this buffer for read_modify_write */
4207                 struct r5dev *dev = &sh->dev[i];
4208                 if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4209                      i == sh->pd_idx || i == sh->qd_idx ||
4210                      test_bit(R5_InJournal, &dev->flags)) &&
4211                     !test_bit(R5_LOCKED, &dev->flags) &&
4212                     !(uptodate_for_rmw(dev) ||
4213                       test_bit(R5_Wantcompute, &dev->flags))) {
4214                         if (test_bit(R5_Insync, &dev->flags))
4215                                 rmw++;
4216                         else
4217                                 rmw += 2*disks;  /* cannot read it */
4218                 }
4219                 /* Would I have to read this buffer for reconstruct_write */
4220                 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4221                     i != sh->pd_idx && i != sh->qd_idx &&
4222                     !test_bit(R5_LOCKED, &dev->flags) &&
4223                     !(test_bit(R5_UPTODATE, &dev->flags) ||
4224                       test_bit(R5_Wantcompute, &dev->flags))) {
4225                         if (test_bit(R5_Insync, &dev->flags))
4226                                 rcw++;
4227                         else
4228                                 rcw += 2*disks;
4229                 }
4230         }
4231
4232         pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4233                  (unsigned long long)sh->sector, sh->state, rmw, rcw);
4234         set_bit(STRIPE_HANDLE, &sh->state);
4235         if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
4236                 /* prefer read-modify-write, but need to get some data */
4237                 if (conf->mddev->queue)
4238                         blk_add_trace_msg(conf->mddev->queue,
4239                                           "raid5 rmw %llu %d",
4240                                           (unsigned long long)sh->sector, rmw);
4241                 for (i = disks; i--; ) {
4242                         struct r5dev *dev = &sh->dev[i];
4243                         if (test_bit(R5_InJournal, &dev->flags) &&
4244                             dev->page == dev->orig_page &&
4245                             !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4246                                 /* alloc page for prexor */
4247                                 struct page *p = alloc_page(GFP_NOIO);
4248
4249                                 if (p) {
4250                                         dev->orig_page = p;
4251                                         continue;
4252                                 }
4253
4254                                 /*
4255                                  * alloc_page() failed, try use
4256                                  * disk_info->extra_page
4257                                  */
4258                                 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
4259                                                       &conf->cache_state)) {
4260                                         r5c_use_extra_page(sh);
4261                                         break;
4262                                 }
4263
4264                                 /* extra_page in use, add to delayed_list */
4265                                 set_bit(STRIPE_DELAYED, &sh->state);
4266                                 s->waiting_extra_page = 1;
4267                                 return -EAGAIN;
4268                         }
4269                 }
4270
4271                 for (i = disks; i--; ) {
4272                         struct r5dev *dev = &sh->dev[i];
4273                         if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
4274                              i == sh->pd_idx || i == sh->qd_idx ||
4275                              test_bit(R5_InJournal, &dev->flags)) &&
4276                             !test_bit(R5_LOCKED, &dev->flags) &&
4277                             !(uptodate_for_rmw(dev) ||
4278                               test_bit(R5_Wantcompute, &dev->flags)) &&
4279                             test_bit(R5_Insync, &dev->flags)) {
4280                                 if (test_bit(STRIPE_PREREAD_ACTIVE,
4281                                              &sh->state)) {
4282                                         pr_debug("Read_old block %d for r-m-w\n",
4283                                                  i);
4284                                         set_bit(R5_LOCKED, &dev->flags);
4285                                         set_bit(R5_Wantread, &dev->flags);
4286                                         s->locked++;
4287                                 } else
4288                                         set_bit(STRIPE_DELAYED, &sh->state);
4289                         }
4290                 }
4291         }
4292         if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
4293                 /* want reconstruct write, but need to get some data */
4294                 int qread =0;
4295                 rcw = 0;
4296                 for (i = disks; i--; ) {
4297                         struct r5dev *dev = &sh->dev[i];
4298                         if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4299                             i != sh->pd_idx && i != sh->qd_idx &&
4300                             !test_bit(R5_LOCKED, &dev->flags) &&
4301                             !(test_bit(R5_UPTODATE, &dev->flags) ||
4302                               test_bit(R5_Wantcompute, &dev->flags))) {
4303                                 rcw++;
4304                                 if (test_bit(R5_Insync, &dev->flags) &&
4305                                     test_bit(STRIPE_PREREAD_ACTIVE,
4306                                              &sh->state)) {
4307                                         pr_debug("Read_old block "
4308                                                 "%d for Reconstruct\n", i);
4309                                         set_bit(R5_LOCKED, &dev->flags);
4310                                         set_bit(R5_Wantread, &dev->flags);
4311                                         s->locked++;
4312                                         qread++;
4313                                 } else
4314                                         set_bit(STRIPE_DELAYED, &sh->state);
4315                         }
4316                 }
4317                 if (rcw && conf->mddev->queue)
4318                         blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4319                                           (unsigned long long)sh->sector,
4320                                           rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4321         }
4322
4323         if (rcw > disks && rmw > disks &&
4324             !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4325                 set_bit(STRIPE_DELAYED, &sh->state);
4326
4327         /* now if nothing is locked, and if we have enough data,
4328          * we can start a write request
4329          */
4330         /* since handle_stripe can be called at any time we need to handle the
4331          * case where a compute block operation has been submitted and then a
4332          * subsequent call wants to start a write request.  raid_run_ops only
4333          * handles the case where compute block and reconstruct are requested
4334          * simultaneously.  If this is not the case then new writes need to be
4335          * held off until the compute completes.
4336          */
4337         if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4338             (s->locked == 0 && (rcw == 0 || rmw == 0) &&
4339              !test_bit(STRIPE_BIT_DELAY, &sh->state)))
4340                 schedule_reconstruction(sh, s, rcw == 0, 0);
4341         return 0;
4342 }
4343
4344 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
4345                                 struct stripe_head_state *s, int disks)
4346 {
4347         struct r5dev *dev = NULL;
4348
4349         BUG_ON(sh->batch_head);
4350         set_bit(STRIPE_HANDLE, &sh->state);
4351
4352         switch (sh->check_state) {
4353         case check_state_idle:
4354                 /* start a new check operation if there are no failures */
4355                 if (s->failed == 0) {
4356                         BUG_ON(s->uptodate != disks);
4357                         sh->check_state = check_state_run;
4358                         set_bit(STRIPE_OP_CHECK, &s->ops_request);
4359                         clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
4360                         s->uptodate--;
4361                         break;
4362                 }
4363                 dev = &sh->dev[s->failed_num[0]];
4364                 fallthrough;
4365         case check_state_compute_result:
4366                 sh->check_state = check_state_idle;
4367                 if (!dev)
4368                         dev = &sh->dev[sh->pd_idx];
4369
4370                 /* check that a write has not made the stripe insync */
4371                 if (test_bit(STRIPE_INSYNC, &sh->state))
4372                         break;
4373
4374                 /* either failed parity check, or recovery is happening */
4375                 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4376                 BUG_ON(s->uptodate != disks);
4377
4378                 set_bit(R5_LOCKED, &dev->flags);
4379                 s->locked++;
4380                 set_bit(R5_Wantwrite, &dev->flags);
4381
4382                 clear_bit(STRIPE_DEGRADED, &sh->state);
4383                 set_bit(STRIPE_INSYNC, &sh->state);
4384                 break;
4385         case check_state_run:
4386                 break; /* we will be called again upon completion */
4387         case check_state_check_result:
4388                 sh->check_state = check_state_idle;
4389
4390                 /* if a failure occurred during the check operation, leave
4391                  * STRIPE_INSYNC not set and let the stripe be handled again
4392                  */
4393                 if (s->failed)
4394                         break;
4395
4396                 /* handle a successful check operation, if parity is correct
4397                  * we are done.  Otherwise update the mismatch count and repair
4398                  * parity if !MD_RECOVERY_CHECK
4399                  */
4400                 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
4401                         /* parity is correct (on disc,
4402                          * not in buffer any more)
4403                          */
4404                         set_bit(STRIPE_INSYNC, &sh->state);
4405                 else {
4406                         atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4407                         if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4408                                 /* don't try to repair!! */
4409                                 set_bit(STRIPE_INSYNC, &sh->state);
4410                                 pr_warn_ratelimited("%s: mismatch sector in range "
4411                                                     "%llu-%llu\n", mdname(conf->mddev),
4412                                                     (unsigned long long) sh->sector,
4413                                                     (unsigned long long) sh->sector +
4414                                                     RAID5_STRIPE_SECTORS(conf));
4415                         } else {
4416                                 sh->check_state = check_state_compute_run;
4417                                 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4418                                 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4419                                 set_bit(R5_Wantcompute,
4420                                         &sh->dev[sh->pd_idx].flags);
4421                                 sh->ops.target = sh->pd_idx;
4422                                 sh->ops.target2 = -1;
4423                                 s->uptodate++;
4424                         }
4425                 }
4426                 break;
4427         case check_state_compute_run:
4428                 break;
4429         default:
4430                 pr_err("%s: unknown check_state: %d sector: %llu\n",
4431                        __func__, sh->check_state,
4432                        (unsigned long long) sh->sector);
4433                 BUG();
4434         }
4435 }
4436
4437 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4438                                   struct stripe_head_state *s,
4439                                   int disks)
4440 {
4441         int pd_idx = sh->pd_idx;
4442         int qd_idx = sh->qd_idx;
4443         struct r5dev *dev;
4444
4445         BUG_ON(sh->batch_head);
4446         set_bit(STRIPE_HANDLE, &sh->state);
4447
4448         BUG_ON(s->failed > 2);
4449
4450         /* Want to check and possibly repair P and Q.
4451          * However there could be one 'failed' device, in which
4452          * case we can only check one of them, possibly using the
4453          * other to generate missing data
4454          */
4455
4456         switch (sh->check_state) {
4457         case check_state_idle:
4458                 /* start a new check operation if there are < 2 failures */
4459                 if (s->failed == s->q_failed) {
4460                         /* The only possible failed device holds Q, so it
4461                          * makes sense to check P (If anything else were failed,
4462                          * we would have used P to recreate it).
4463                          */
4464                         sh->check_state = check_state_run;
4465                 }
4466                 if (!s->q_failed && s->failed < 2) {
4467                         /* Q is not failed, and we didn't use it to generate
4468                          * anything, so it makes sense to check it
4469                          */
4470                         if (sh->check_state == check_state_run)
4471                                 sh->check_state = check_state_run_pq;
4472                         else
4473                                 sh->check_state = check_state_run_q;
4474                 }
4475
4476                 /* discard potentially stale zero_sum_result */
4477                 sh->ops.zero_sum_result = 0;
4478
4479                 if (sh->check_state == check_state_run) {
4480                         /* async_xor_zero_sum destroys the contents of P */
4481                         clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
4482                         s->uptodate--;
4483                 }
4484                 if (sh->check_state >= check_state_run &&
4485                     sh->check_state <= check_state_run_pq) {
4486                         /* async_syndrome_zero_sum preserves P and Q, so
4487                          * no need to mark them !uptodate here
4488                          */
4489                         set_bit(STRIPE_OP_CHECK, &s->ops_request);
4490                         break;
4491                 }
4492
4493                 /* we have 2-disk failure */
4494                 BUG_ON(s->failed != 2);
4495                 fallthrough;
4496         case check_state_compute_result:
4497                 sh->check_state = check_state_idle;
4498
4499                 /* check that a write has not made the stripe insync */
4500                 if (test_bit(STRIPE_INSYNC, &sh->state))
4501                         break;
4502
4503                 /* now write out any block on a failed drive,
4504                  * or P or Q if they were recomputed
4505                  */
4506                 dev = NULL;
4507                 if (s->failed == 2) {
4508                         dev = &sh->dev[s->failed_num[1]];
4509                         s->locked++;
4510                         set_bit(R5_LOCKED, &dev->flags);
4511                         set_bit(R5_Wantwrite, &dev->flags);
4512                 }
4513                 if (s->failed >= 1) {
4514                         dev = &sh->dev[s->failed_num[0]];
4515                         s->locked++;
4516                         set_bit(R5_LOCKED, &dev->flags);
4517                         set_bit(R5_Wantwrite, &dev->flags);
4518                 }
4519                 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4520                         dev = &sh->dev[pd_idx];
4521                         s->locked++;
4522                         set_bit(R5_LOCKED, &dev->flags);
4523                         set_bit(R5_Wantwrite, &dev->flags);
4524                 }
4525                 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4526                         dev = &sh->dev[qd_idx];
4527                         s->locked++;
4528                         set_bit(R5_LOCKED, &dev->flags);
4529                         set_bit(R5_Wantwrite, &dev->flags);
4530                 }
4531                 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4532                               "%s: disk%td not up to date\n",
4533                               mdname(conf->mddev),
4534                               dev - (struct r5dev *) &sh->dev)) {
4535                         clear_bit(R5_LOCKED, &dev->flags);
4536                         clear_bit(R5_Wantwrite, &dev->flags);
4537                         s->locked--;
4538                 }
4539                 clear_bit(STRIPE_DEGRADED, &sh->state);
4540
4541                 set_bit(STRIPE_INSYNC, &sh->state);
4542                 break;
4543         case check_state_run:
4544         case check_state_run_q:
4545         case check_state_run_pq:
4546                 break; /* we will be called again upon completion */
4547         case check_state_check_result:
4548                 sh->check_state = check_state_idle;
4549
4550                 /* handle a successful check operation, if parity is correct
4551                  * we are done.  Otherwise update the mismatch count and repair
4552                  * parity if !MD_RECOVERY_CHECK
4553                  */
4554                 if (sh->ops.zero_sum_result == 0) {
4555                         /* both parities are correct */
4556                         if (!s->failed)
4557                                 set_bit(STRIPE_INSYNC, &sh->state);
4558                         else {
4559                                 /* in contrast to the raid5 case we can validate
4560                                  * parity, but still have a failure to write
4561                                  * back
4562                                  */
4563                                 sh->check_state = check_state_compute_result;
4564                                 /* Returning at this point means that we may go
4565                                  * off and bring p and/or q uptodate again so
4566                                  * we make sure to check zero_sum_result again
4567                                  * to verify if p or q need writeback
4568                                  */
4569                         }
4570                 } else {
4571                         atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
4572                         if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4573                                 /* don't try to repair!! */
4574                                 set_bit(STRIPE_INSYNC, &sh->state);
4575                                 pr_warn_ratelimited("%s: mismatch sector in range "
4576                                                     "%llu-%llu\n", mdname(conf->mddev),
4577                                                     (unsigned long long) sh->sector,
4578                                                     (unsigned long long) sh->sector +
4579                                                     RAID5_STRIPE_SECTORS(conf));
4580                         } else {
4581                                 int *target = &sh->ops.target;
4582
4583                                 sh->ops.target = -1;
4584                                 sh->ops.target2 = -1;
4585                                 sh->check_state = check_state_compute_run;
4586                                 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4587                                 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4588                                 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4589                                         set_bit(R5_Wantcompute,
4590                                                 &sh->dev[pd_idx].flags);
4591                                         *target = pd_idx;
4592                                         target = &sh->ops.target2;
4593                                         s->uptodate++;
4594                                 }
4595                                 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4596                                         set_bit(R5_Wantcompute,
4597                                                 &sh->dev[qd_idx].flags);
4598                                         *target = qd_idx;
4599                                         s->uptodate++;
4600                                 }
4601                         }
4602                 }
4603                 break;
4604         case check_state_compute_run:
4605                 break;
4606         default:
4607                 pr_warn("%s: unknown check_state: %d sector: %llu\n",
4608                         __func__, sh->check_state,
4609                         (unsigned long long) sh->sector);
4610                 BUG();
4611         }
4612 }
4613
4614 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4615 {
4616         int i;
4617
4618         /* We have read all the blocks in this stripe and now we need to
4619          * copy some of them into a target stripe for expand.
4620          */
4621         struct dma_async_tx_descriptor *tx = NULL;
4622         BUG_ON(sh->batch_head);
4623         clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4624         for (i = 0; i < sh->disks; i++)
4625                 if (i != sh->pd_idx && i != sh->qd_idx) {
4626                         int dd_idx, j;
4627                         struct stripe_head *sh2;
4628                         struct async_submit_ctl submit;
4629
4630                         sector_t bn = raid5_compute_blocknr(sh, i, 1);
4631                         sector_t s = raid5_compute_sector(conf, bn, 0,
4632                                                           &dd_idx, NULL);
4633                         sh2 = raid5_get_active_stripe(conf, NULL, s,
4634                                 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
4635                         if (sh2 == NULL)
4636                                 /* so far only the early blocks of this stripe
4637                                  * have been requested.  When later blocks
4638                                  * get requested, we will try again
4639                                  */
4640                                 continue;
4641                         if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4642                            test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4643                                 /* must have already done this block */
4644                                 raid5_release_stripe(sh2);
4645                                 continue;
4646                         }
4647
4648                         /* place all the copies on one channel */
4649                         init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4650                         tx = async_memcpy(sh2->dev[dd_idx].page,
4651                                           sh->dev[i].page, sh2->dev[dd_idx].offset,
4652                                           sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4653                                           &submit);
4654
4655                         set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4656                         set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4657                         for (j = 0; j < conf->raid_disks; j++)
4658                                 if (j != sh2->pd_idx &&
4659                                     j != sh2->qd_idx &&
4660                                     !test_bit(R5_Expanded, &sh2->dev[j].flags))
4661                                         break;
4662                         if (j == conf->raid_disks) {
4663                                 set_bit(STRIPE_EXPAND_READY, &sh2->state);
4664                                 set_bit(STRIPE_HANDLE, &sh2->state);
4665                         }
4666                         raid5_release_stripe(sh2);
4667
4668                 }
4669         /* done submitting copies, wait for them to complete */
4670         async_tx_quiesce(&tx);
4671 }
4672
4673 /*
4674  * handle_stripe - do things to a stripe.
4675  *
4676  * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4677  * state of various bits to see what needs to be done.
4678  * Possible results:
4679  *    return some read requests which now have data
4680  *    return some write requests which are safely on storage
4681  *    schedule a read on some buffers
4682  *    schedule a write of some buffers
4683  *    return confirmation of parity correctness
4684  *
4685  */
4686
4687 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4688 {
4689         struct r5conf *conf = sh->raid_conf;
4690         int disks = sh->disks;
4691         struct r5dev *dev;
4692         int i;
4693         int do_recovery = 0;
4694
4695         memset(s, 0, sizeof(*s));
4696
4697         s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4698         s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4699         s->failed_num[0] = -1;
4700         s->failed_num[1] = -1;
4701         s->log_failed = r5l_log_disk_error(conf);
4702
4703         /* Now to look around and see what can be done */
4704         rcu_read_lock();
4705         for (i=disks; i--; ) {
4706                 struct md_rdev *rdev;
4707                 sector_t first_bad;
4708                 int bad_sectors;
4709                 int is_bad = 0;
4710
4711                 dev = &sh->dev[i];
4712
4713                 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4714                          i, dev->flags,
4715                          dev->toread, dev->towrite, dev->written);
4716                 /* maybe we can reply to a read
4717                  *
4718                  * new wantfill requests are only permitted while
4719                  * ops_complete_biofill is guaranteed to be inactive
4720                  */
4721                 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4722                     !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4723                         set_bit(R5_Wantfill, &dev->flags);
4724
4725                 /* now count some things */
4726                 if (test_bit(R5_LOCKED, &dev->flags))
4727                         s->locked++;
4728                 if (test_bit(R5_UPTODATE, &dev->flags))
4729                         s->uptodate++;
4730                 if (test_bit(R5_Wantcompute, &dev->flags)) {
4731                         s->compute++;
4732                         BUG_ON(s->compute > 2);
4733                 }
4734
4735                 if (test_bit(R5_Wantfill, &dev->flags))
4736                         s->to_fill++;
4737                 else if (dev->toread)
4738                         s->to_read++;
4739                 if (dev->towrite) {
4740                         s->to_write++;
4741                         if (!test_bit(R5_OVERWRITE, &dev->flags))
4742                                 s->non_overwrite++;
4743                 }
4744                 if (dev->written)
4745                         s->written++;
4746                 /* Prefer to use the replacement for reads, but only
4747                  * if it is recovered enough and has no bad blocks.
4748                  */
4749                 rdev = rcu_dereference(conf->disks[i].replacement);
4750                 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4751                     rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4752                     !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4753                                  &first_bad, &bad_sectors))
4754                         set_bit(R5_ReadRepl, &dev->flags);
4755                 else {
4756                         if (rdev && !test_bit(Faulty, &rdev->flags))
4757                                 set_bit(R5_NeedReplace, &dev->flags);
4758                         else
4759                                 clear_bit(R5_NeedReplace, &dev->flags);
4760                         rdev = rcu_dereference(conf->disks[i].rdev);
4761                         clear_bit(R5_ReadRepl, &dev->flags);
4762                 }
4763                 if (rdev && test_bit(Faulty, &rdev->flags))
4764                         rdev = NULL;
4765                 if (rdev) {
4766                         is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
4767                                              &first_bad, &bad_sectors);
4768                         if (s->blocked_rdev == NULL
4769                             && (test_bit(Blocked, &rdev->flags)
4770                                 || is_bad < 0)) {
4771                                 if (is_bad < 0)
4772                                         set_bit(BlockedBadBlocks,
4773                                                 &rdev->flags);
4774                                 s->blocked_rdev = rdev;
4775                                 atomic_inc(&rdev->nr_pending);
4776                         }
4777                 }
4778                 clear_bit(R5_Insync, &dev->flags);
4779                 if (!rdev)
4780                         /* Not in-sync */;
4781                 else if (is_bad) {
4782                         /* also not in-sync */
4783                         if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4784                             test_bit(R5_UPTODATE, &dev->flags)) {
4785                                 /* treat as in-sync, but with a read error
4786                                  * which we can now try to correct
4787                                  */
4788                                 set_bit(R5_Insync, &dev->flags);
4789                                 set_bit(R5_ReadError, &dev->flags);
4790                         }
4791                 } else if (test_bit(In_sync, &rdev->flags))
4792                         set_bit(R5_Insync, &dev->flags);
4793                 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4794                         /* in sync if before recovery_offset */
4795                         set_bit(R5_Insync, &dev->flags);
4796                 else if (test_bit(R5_UPTODATE, &dev->flags) &&
4797                          test_bit(R5_Expanded, &dev->flags))
4798                         /* If we've reshaped into here, we assume it is Insync.
4799                          * We will shortly update recovery_offset to make
4800                          * it official.
4801                          */
4802                         set_bit(R5_Insync, &dev->flags);
4803
4804                 if (test_bit(R5_WriteError, &dev->flags)) {
4805                         /* This flag does not apply to '.replacement'
4806                          * only to .rdev, so make sure to check that*/
4807                         struct md_rdev *rdev2 = rcu_dereference(
4808                                 conf->disks[i].rdev);
4809                         if (rdev2 == rdev)
4810                                 clear_bit(R5_Insync, &dev->flags);
4811                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4812                                 s->handle_bad_blocks = 1;
4813                                 atomic_inc(&rdev2->nr_pending);
4814                         } else
4815                                 clear_bit(R5_WriteError, &dev->flags);
4816                 }
4817                 if (test_bit(R5_MadeGood, &dev->flags)) {
4818                         /* This flag does not apply to '.replacement'
4819                          * only to .rdev, so make sure to check that*/
4820                         struct md_rdev *rdev2 = rcu_dereference(
4821                                 conf->disks[i].rdev);
4822                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4823                                 s->handle_bad_blocks = 1;
4824                                 atomic_inc(&rdev2->nr_pending);
4825                         } else
4826                                 clear_bit(R5_MadeGood, &dev->flags);
4827                 }
4828                 if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4829                         struct md_rdev *rdev2 = rcu_dereference(
4830                                 conf->disks[i].replacement);
4831                         if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4832                                 s->handle_bad_blocks = 1;
4833                                 atomic_inc(&rdev2->nr_pending);
4834                         } else
4835                                 clear_bit(R5_MadeGoodRepl, &dev->flags);
4836                 }
4837                 if (!test_bit(R5_Insync, &dev->flags)) {
4838                         /* The ReadError flag will just be confusing now */
4839                         clear_bit(R5_ReadError, &dev->flags);
4840                         clear_bit(R5_ReWrite, &dev->flags);
4841                 }
4842                 if (test_bit(R5_ReadError, &dev->flags))
4843                         clear_bit(R5_Insync, &dev->flags);
4844                 if (!test_bit(R5_Insync, &dev->flags)) {
4845                         if (s->failed < 2)
4846                                 s->failed_num[s->failed] = i;
4847                         s->failed++;
4848                         if (rdev && !test_bit(Faulty, &rdev->flags))
4849                                 do_recovery = 1;
4850                         else if (!rdev) {
4851                                 rdev = rcu_dereference(
4852                                     conf->disks[i].replacement);
4853                                 if (rdev && !test_bit(Faulty, &rdev->flags))
4854                                         do_recovery = 1;
4855                         }
4856                 }
4857
4858                 if (test_bit(R5_InJournal, &dev->flags))
4859                         s->injournal++;
4860                 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4861                         s->just_cached++;
4862         }
4863         if (test_bit(STRIPE_SYNCING, &sh->state)) {
4864                 /* If there is a failed device being replaced,
4865                  *     we must be recovering.
4866                  * else if we are after recovery_cp, we must be syncing
4867                  * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4868                  * else we can only be replacing
4869                  * sync and recovery both need to read all devices, and so
4870                  * use the same flag.
4871                  */
4872                 if (do_recovery ||
4873                     sh->sector >= conf->mddev->recovery_cp ||
4874                     test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4875                         s->syncing = 1;
4876                 else
4877                         s->replacing = 1;
4878         }
4879         rcu_read_unlock();
4880 }
4881
4882 /*
4883  * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4884  * a head which can now be handled.
4885  */
4886 static int clear_batch_ready(struct stripe_head *sh)
4887 {
4888         struct stripe_head *tmp;
4889         if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4890                 return (sh->batch_head && sh->batch_head != sh);
4891         spin_lock(&sh->stripe_lock);
4892         if (!sh->batch_head) {
4893                 spin_unlock(&sh->stripe_lock);
4894                 return 0;
4895         }
4896
4897         /*
4898          * this stripe could be added to a batch list before we check
4899          * BATCH_READY, skips it
4900          */
4901         if (sh->batch_head != sh) {
4902                 spin_unlock(&sh->stripe_lock);
4903                 return 1;
4904         }
4905         spin_lock(&sh->batch_lock);
4906         list_for_each_entry(tmp, &sh->batch_list, batch_list)
4907                 clear_bit(STRIPE_BATCH_READY, &tmp->state);
4908         spin_unlock(&sh->batch_lock);
4909         spin_unlock(&sh->stripe_lock);
4910
4911         /*
4912          * BATCH_READY is cleared, no new stripes can be added.
4913          * batch_list can be accessed without lock
4914          */
4915         return 0;
4916 }
4917
4918 static void break_stripe_batch_list(struct stripe_head *head_sh,
4919                                     unsigned long handle_flags)
4920 {
4921         struct stripe_head *sh, *next;
4922         int i;
4923         int do_wakeup = 0;
4924
4925         list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4926
4927                 list_del_init(&sh->batch_list);
4928
4929                 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4930                                           (1 << STRIPE_SYNCING) |
4931                                           (1 << STRIPE_REPLACED) |
4932                                           (1 << STRIPE_DELAYED) |
4933                                           (1 << STRIPE_BIT_DELAY) |
4934                                           (1 << STRIPE_FULL_WRITE) |
4935                                           (1 << STRIPE_BIOFILL_RUN) |
4936                                           (1 << STRIPE_COMPUTE_RUN)  |
4937                                           (1 << STRIPE_DISCARD) |
4938                                           (1 << STRIPE_BATCH_READY) |
4939                                           (1 << STRIPE_BATCH_ERR) |
4940                                           (1 << STRIPE_BITMAP_PENDING)),
4941                         "stripe state: %lx\n", sh->state);
4942                 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4943                                               (1 << STRIPE_REPLACED)),
4944                         "head stripe state: %lx\n", head_sh->state);
4945
4946                 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4947                                             (1 << STRIPE_PREREAD_ACTIVE) |
4948                                             (1 << STRIPE_DEGRADED) |
4949                                             (1 << STRIPE_ON_UNPLUG_LIST)),
4950                               head_sh->state & (1 << STRIPE_INSYNC));
4951
4952                 sh->check_state = head_sh->check_state;
4953                 sh->reconstruct_state = head_sh->reconstruct_state;
4954                 spin_lock_irq(&sh->stripe_lock);
4955                 sh->batch_head = NULL;
4956                 spin_unlock_irq(&sh->stripe_lock);
4957                 for (i = 0; i < sh->disks; i++) {
4958                         if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4959                                 do_wakeup = 1;
4960                         sh->dev[i].flags = head_sh->dev[i].flags &
4961                                 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
4962                 }
4963                 if (handle_flags == 0 ||
4964                     sh->state & handle_flags)
4965                         set_bit(STRIPE_HANDLE, &sh->state);
4966                 raid5_release_stripe(sh);
4967         }
4968         spin_lock_irq(&head_sh->stripe_lock);
4969         head_sh->batch_head = NULL;
4970         spin_unlock_irq(&head_sh->stripe_lock);
4971         for (i = 0; i < head_sh->disks; i++)
4972                 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4973                         do_wakeup = 1;
4974         if (head_sh->state & handle_flags)
4975                 set_bit(STRIPE_HANDLE, &head_sh->state);
4976
4977         if (do_wakeup)
4978                 wake_up(&head_sh->raid_conf->wait_for_overlap);
4979 }
4980
4981 static void handle_stripe(struct stripe_head *sh)
4982 {
4983         struct stripe_head_state s;
4984         struct r5conf *conf = sh->raid_conf;
4985         int i;
4986         int prexor;
4987         int disks = sh->disks;
4988         struct r5dev *pdev, *qdev;
4989
4990         clear_bit(STRIPE_HANDLE, &sh->state);
4991
4992         /*
4993          * handle_stripe should not continue handle the batched stripe, only
4994          * the head of batch list or lone stripe can continue. Otherwise we
4995          * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4996          * is set for the batched stripe.
4997          */
4998         if (clear_batch_ready(sh))
4999                 return;
5000
5001         if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
5002                 /* already being handled, ensure it gets handled
5003                  * again when current action finishes */
5004                 set_bit(STRIPE_HANDLE, &sh->state);
5005                 return;
5006         }
5007
5008         if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
5009                 break_stripe_batch_list(sh, 0);
5010
5011         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
5012                 spin_lock(&sh->stripe_lock);
5013                 /*
5014                  * Cannot process 'sync' concurrently with 'discard'.
5015                  * Flush data in r5cache before 'sync'.
5016                  */
5017                 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
5018                     !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
5019                     !test_bit(STRIPE_DISCARD, &sh->state) &&
5020                     test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
5021                         set_bit(STRIPE_SYNCING, &sh->state);
5022                         clear_bit(STRIPE_INSYNC, &sh->state);
5023                         clear_bit(STRIPE_REPLACED, &sh->state);
5024                 }
5025                 spin_unlock(&sh->stripe_lock);
5026         }
5027         clear_bit(STRIPE_DELAYED, &sh->state);
5028
5029         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
5030                 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
5031                (unsigned long long)sh->sector, sh->state,
5032                atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
5033                sh->check_state, sh->reconstruct_state);
5034
5035         analyse_stripe(sh, &s);
5036
5037         if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
5038                 goto finish;
5039
5040         if (s.handle_bad_blocks ||
5041             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
5042                 set_bit(STRIPE_HANDLE, &sh->state);
5043                 goto finish;
5044         }
5045
5046         if (unlikely(s.blocked_rdev)) {
5047                 if (s.syncing || s.expanding || s.expanded ||
5048                     s.replacing || s.to_write || s.written) {
5049                         set_bit(STRIPE_HANDLE, &sh->state);
5050                         goto finish;
5051                 }
5052                 /* There is nothing for the blocked_rdev to block */
5053                 rdev_dec_pending(s.blocked_rdev, conf->mddev);
5054                 s.blocked_rdev = NULL;
5055         }
5056
5057         if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
5058                 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
5059                 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
5060         }
5061
5062         pr_debug("locked=%d uptodate=%d to_read=%d"
5063                " to_write=%d failed=%d failed_num=%d,%d\n",
5064                s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
5065                s.failed_num[0], s.failed_num[1]);
5066         /*
5067          * check if the array has lost more than max_degraded devices and,
5068          * if so, some requests might need to be failed.
5069          *
5070          * When journal device failed (log_failed), we will only process
5071          * the stripe if there is data need write to raid disks
5072          */
5073         if (s.failed > conf->max_degraded ||
5074             (s.log_failed && s.injournal == 0)) {
5075                 sh->check_state = 0;
5076                 sh->reconstruct_state = 0;
5077                 break_stripe_batch_list(sh, 0);
5078                 if (s.to_read+s.to_write+s.written)
5079                         handle_failed_stripe(conf, sh, &s, disks);
5080                 if (s.syncing + s.replacing)
5081                         handle_failed_sync(conf, sh, &s);
5082         }
5083
5084         /* Now we check to see if any write operations have recently
5085          * completed
5086          */
5087         prexor = 0;
5088         if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5089                 prexor = 1;
5090         if (sh->reconstruct_state == reconstruct_state_drain_result ||
5091             sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5092                 sh->reconstruct_state = reconstruct_state_idle;
5093
5094                 /* All the 'written' buffers and the parity block are ready to
5095                  * be written back to disk
5096                  */
5097                 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5098                        !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5099                 BUG_ON(sh->qd_idx >= 0 &&
5100                        !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5101                        !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5102                 for (i = disks; i--; ) {
5103                         struct r5dev *dev = &sh->dev[i];
5104                         if (test_bit(R5_LOCKED, &dev->flags) &&
5105                                 (i == sh->pd_idx || i == sh->qd_idx ||
5106                                  dev->written || test_bit(R5_InJournal,
5107                                                           &dev->flags))) {
5108                                 pr_debug("Writing block %d\n", i);
5109                                 set_bit(R5_Wantwrite, &dev->flags);
5110                                 if (prexor)
5111                                         continue;
5112                                 if (s.failed > 1)
5113                                         continue;
5114                                 if (!test_bit(R5_Insync, &dev->flags) ||
5115                                     ((i == sh->pd_idx || i == sh->qd_idx)  &&
5116                                      s.failed == 0))
5117                                         set_bit(STRIPE_INSYNC, &sh->state);
5118                         }
5119                 }
5120                 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5121                         s.dec_preread_active = 1;
5122         }
5123
5124         /*
5125          * might be able to return some write requests if the parity blocks
5126          * are safe, or on a failed drive
5127          */
5128         pdev = &sh->dev[sh->pd_idx];
5129         s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
5130                 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
5131         qdev = &sh->dev[sh->qd_idx];
5132         s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
5133                 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
5134                 || conf->level < 6;
5135
5136         if (s.written &&
5137             (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
5138                              && !test_bit(R5_LOCKED, &pdev->flags)
5139                              && (test_bit(R5_UPTODATE, &pdev->flags) ||
5140                                  test_bit(R5_Discard, &pdev->flags))))) &&
5141             (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
5142                              && !test_bit(R5_LOCKED, &qdev->flags)
5143                              && (test_bit(R5_UPTODATE, &qdev->flags) ||
5144                                  test_bit(R5_Discard, &qdev->flags))))))
5145                 handle_stripe_clean_event(conf, sh, disks);
5146
5147         if (s.just_cached)
5148                 r5c_handle_cached_data_endio(conf, sh, disks);
5149         log_stripe_write_finished(sh);
5150
5151         /* Now we might consider reading some blocks, either to check/generate
5152          * parity, or to satisfy requests
5153          * or to load a block that is being partially written.
5154          */
5155         if (s.to_read || s.non_overwrite
5156             || (s.to_write && s.failed)
5157             || (s.syncing && (s.uptodate + s.compute < disks))
5158             || s.replacing
5159             || s.expanding)
5160                 handle_stripe_fill(sh, &s, disks);
5161
5162         /*
5163          * When the stripe finishes full journal write cycle (write to journal
5164          * and raid disk), this is the clean up procedure so it is ready for
5165          * next operation.
5166          */
5167         r5c_finish_stripe_write_out(conf, sh, &s);
5168
5169         /*
5170          * Now to consider new write requests, cache write back and what else,
5171          * if anything should be read.  We do not handle new writes when:
5172          * 1/ A 'write' operation (copy+xor) is already in flight.
5173          * 2/ A 'check' operation is in flight, as it may clobber the parity
5174          *    block.
5175          * 3/ A r5c cache log write is in flight.
5176          */
5177
5178         if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5179                 if (!r5c_is_writeback(conf->log)) {
5180                         if (s.to_write)
5181                                 handle_stripe_dirtying(conf, sh, &s, disks);
5182                 } else { /* write back cache */
5183                         int ret = 0;
5184
5185                         /* First, try handle writes in caching phase */
5186                         if (s.to_write)
5187                                 ret = r5c_try_caching_write(conf, sh, &s,
5188                                                             disks);
5189                         /*
5190                          * If caching phase failed: ret == -EAGAIN
5191                          *    OR
5192                          * stripe under reclaim: !caching && injournal
5193                          *
5194                          * fall back to handle_stripe_dirtying()
5195                          */
5196                         if (ret == -EAGAIN ||
5197                             /* stripe under reclaim: !caching && injournal */
5198                             (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5199                              s.injournal > 0)) {
5200                                 ret = handle_stripe_dirtying(conf, sh, &s,
5201                                                              disks);
5202                                 if (ret == -EAGAIN)
5203                                         goto finish;
5204                         }
5205                 }
5206         }
5207
5208         /* maybe we need to check and possibly fix the parity for this stripe
5209          * Any reads will already have been scheduled, so we just see if enough
5210          * data is available.  The parity check is held off while parity
5211          * dependent operations are in flight.
5212          */
5213         if (sh->check_state ||
5214             (s.syncing && s.locked == 0 &&
5215              !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5216              !test_bit(STRIPE_INSYNC, &sh->state))) {
5217                 if (conf->level == 6)
5218                         handle_parity_checks6(conf, sh, &s, disks);
5219                 else
5220                         handle_parity_checks5(conf, sh, &s, disks);
5221         }
5222
5223         if ((s.replacing || s.syncing) && s.locked == 0
5224             && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5225             && !test_bit(STRIPE_REPLACED, &sh->state)) {
5226                 /* Write out to replacement devices where possible */
5227                 for (i = 0; i < conf->raid_disks; i++)
5228                         if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5229                                 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5230                                 set_bit(R5_WantReplace, &sh->dev[i].flags);
5231                                 set_bit(R5_LOCKED, &sh->dev[i].flags);
5232                                 s.locked++;
5233                         }
5234                 if (s.replacing)
5235                         set_bit(STRIPE_INSYNC, &sh->state);
5236                 set_bit(STRIPE_REPLACED, &sh->state);
5237         }
5238         if ((s.syncing || s.replacing) && s.locked == 0 &&
5239             !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5240             test_bit(STRIPE_INSYNC, &sh->state)) {
5241                 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5242                 clear_bit(STRIPE_SYNCING, &sh->state);
5243                 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
5244                         wake_up(&conf->wait_for_overlap);
5245         }
5246
5247         /* If the failed drives are just a ReadError, then we might need
5248          * to progress the repair/check process
5249          */
5250         if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5251                 for (i = 0; i < s.failed; i++) {
5252                         struct r5dev *dev = &sh->dev[s.failed_num[i]];
5253                         if (test_bit(R5_ReadError, &dev->flags)
5254                             && !test_bit(R5_LOCKED, &dev->flags)
5255                             && test_bit(R5_UPTODATE, &dev->flags)
5256                                 ) {
5257                                 if (!test_bit(R5_ReWrite, &dev->flags)) {
5258                                         set_bit(R5_Wantwrite, &dev->flags);
5259                                         set_bit(R5_ReWrite, &dev->flags);
5260                                 } else
5261                                         /* let's read it back */
5262                                         set_bit(R5_Wantread, &dev->flags);
5263                                 set_bit(R5_LOCKED, &dev->flags);
5264                                 s.locked++;
5265                         }
5266                 }
5267
5268         /* Finish reconstruct operations initiated by the expansion process */
5269         if (sh->reconstruct_state == reconstruct_state_result) {
5270                 struct stripe_head *sh_src
5271                         = raid5_get_active_stripe(conf, NULL, sh->sector,
5272                                         R5_GAS_PREVIOUS | R5_GAS_NOBLOCK |
5273                                         R5_GAS_NOQUIESCE);
5274                 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5275                         /* sh cannot be written until sh_src has been read.
5276                          * so arrange for sh to be delayed a little
5277                          */
5278                         set_bit(STRIPE_DELAYED, &sh->state);
5279                         set_bit(STRIPE_HANDLE, &sh->state);
5280                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
5281                                               &sh_src->state))
5282                                 atomic_inc(&conf->preread_active_stripes);
5283                         raid5_release_stripe(sh_src);
5284                         goto finish;
5285                 }
5286                 if (sh_src)
5287                         raid5_release_stripe(sh_src);
5288
5289                 sh->reconstruct_state = reconstruct_state_idle;
5290                 clear_bit(STRIPE_EXPANDING, &sh->state);
5291                 for (i = conf->raid_disks; i--; ) {
5292                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
5293                         set_bit(R5_LOCKED, &sh->dev[i].flags);
5294                         s.locked++;
5295                 }
5296         }
5297
5298         if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5299             !sh->reconstruct_state) {
5300                 /* Need to write out all blocks after computing parity */
5301                 sh->disks = conf->raid_disks;
5302                 stripe_set_idx(sh->sector, conf, 0, sh);
5303                 schedule_reconstruction(sh, &s, 1, 1);
5304         } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
5305                 clear_bit(STRIPE_EXPAND_READY, &sh->state);
5306                 atomic_dec(&conf->reshape_stripes);
5307                 wake_up(&conf->wait_for_overlap);
5308                 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
5309         }
5310
5311         if (s.expanding && s.locked == 0 &&
5312             !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5313                 handle_stripe_expansion(conf, sh);
5314
5315 finish:
5316         /* wait for this device to become unblocked */
5317         if (unlikely(s.blocked_rdev)) {
5318                 if (conf->mddev->external)
5319                         md_wait_for_blocked_rdev(s.blocked_rdev,
5320                                                  conf->mddev);
5321                 else
5322                         /* Internal metadata will immediately
5323                          * be written by raid5d, so we don't
5324                          * need to wait here.
5325                          */
5326                         rdev_dec_pending(s.blocked_rdev,
5327                                          conf->mddev);
5328         }
5329
5330         if (s.handle_bad_blocks)
5331                 for (i = disks; i--; ) {
5332                         struct md_rdev *rdev;
5333                         struct r5dev *dev = &sh->dev[i];
5334                         if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
5335                                 /* We own a safe reference to the rdev */
5336                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
5337                                 if (!rdev_set_badblocks(rdev, sh->sector,
5338                                                         RAID5_STRIPE_SECTORS(conf), 0))
5339                                         md_error(conf->mddev, rdev);
5340                                 rdev_dec_pending(rdev, conf->mddev);
5341                         }
5342                         if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
5343                                 rdev = rdev_pend_deref(conf->disks[i].rdev);
5344                                 rdev_clear_badblocks(rdev, sh->sector,
5345                                                      RAID5_STRIPE_SECTORS(conf), 0);
5346                                 rdev_dec_pending(rdev, conf->mddev);
5347                         }
5348                         if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
5349                                 rdev = rdev_pend_deref(conf->disks[i].replacement);
5350                                 if (!rdev)
5351                                         /* rdev have been moved down */
5352                                         rdev = rdev_pend_deref(conf->disks[i].rdev);
5353                                 rdev_clear_badblocks(rdev, sh->sector,
5354                                                      RAID5_STRIPE_SECTORS(conf), 0);
5355                                 rdev_dec_pending(rdev, conf->mddev);
5356                         }
5357                 }
5358
5359         if (s.ops_request)
5360                 raid_run_ops(sh, s.ops_request);
5361
5362         ops_run_io(sh, &s);
5363
5364         if (s.dec_preread_active) {
5365                 /* We delay this until after ops_run_io so that if make_request
5366                  * is waiting on a flush, it won't continue until the writes
5367                  * have actually been submitted.
5368                  */
5369                 atomic_dec(&conf->preread_active_stripes);
5370                 if (atomic_read(&conf->preread_active_stripes) <
5371                     IO_THRESHOLD)
5372                         md_wakeup_thread(conf->mddev->thread);
5373         }
5374
5375         clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
5376 }
5377
5378 static void raid5_activate_delayed(struct r5conf *conf)
5379         __must_hold(&conf->device_lock)
5380 {
5381         if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
5382                 while (!list_empty(&conf->delayed_list)) {
5383                         struct list_head *l = conf->delayed_list.next;
5384                         struct stripe_head *sh;
5385                         sh = list_entry(l, struct stripe_head, lru);
5386                         list_del_init(l);
5387                         clear_bit(STRIPE_DELAYED, &sh->state);
5388                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5389                                 atomic_inc(&conf->preread_active_stripes);
5390                         list_add_tail(&sh->lru, &conf->hold_list);
5391                         raid5_wakeup_stripe_thread(sh);
5392                 }
5393         }
5394 }
5395
5396 static void activate_bit_delay(struct r5conf *conf,
5397                 struct list_head *temp_inactive_list)
5398         __must_hold(&conf->device_lock)
5399 {
5400         struct list_head head;
5401         list_add(&head, &conf->bitmap_list);
5402         list_del_init(&conf->bitmap_list);
5403         while (!list_empty(&head)) {
5404                 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
5405                 int hash;
5406                 list_del_init(&sh->lru);
5407                 atomic_inc(&sh->count);
5408                 hash = sh->hash_lock_index;
5409                 __release_stripe(conf, sh, &temp_inactive_list[hash]);
5410         }
5411 }
5412
5413 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
5414 {
5415         struct r5conf *conf = mddev->private;
5416         sector_t sector = bio->bi_iter.bi_sector;
5417         unsigned int chunk_sectors;
5418         unsigned int bio_sectors = bio_sectors(bio);
5419
5420         chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5421         return  chunk_sectors >=
5422                 ((sector & (chunk_sectors - 1)) + bio_sectors);
5423 }
5424
5425 /*
5426  *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
5427  *  later sampled by raid5d.
5428  */
5429 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
5430 {
5431         unsigned long flags;
5432
5433         spin_lock_irqsave(&conf->device_lock, flags);
5434
5435         bi->bi_next = conf->retry_read_aligned_list;
5436         conf->retry_read_aligned_list = bi;
5437
5438         spin_unlock_irqrestore(&conf->device_lock, flags);
5439         md_wakeup_thread(conf->mddev->thread);
5440 }
5441
5442 static struct bio *remove_bio_from_retry(struct r5conf *conf,
5443                                          unsigned int *offset)
5444 {
5445         struct bio *bi;
5446
5447         bi = conf->retry_read_aligned;
5448         if (bi) {
5449                 *offset = conf->retry_read_offset;
5450                 conf->retry_read_aligned = NULL;
5451                 return bi;
5452         }
5453         bi = conf->retry_read_aligned_list;
5454         if(bi) {
5455                 conf->retry_read_aligned_list = bi->bi_next;
5456                 bi->bi_next = NULL;
5457                 *offset = 0;
5458         }
5459
5460         return bi;
5461 }
5462
5463 /*
5464  *  The "raid5_align_endio" should check if the read succeeded and if it
5465  *  did, call bio_endio on the original bio (having bio_put the new bio
5466  *  first).
5467  *  If the read failed..
5468  */
5469 static void raid5_align_endio(struct bio *bi)
5470 {
5471         struct md_io_acct *md_io_acct = bi->bi_private;
5472         struct bio *raid_bi = md_io_acct->orig_bio;
5473         struct mddev *mddev;
5474         struct r5conf *conf;
5475         struct md_rdev *rdev;
5476         blk_status_t error = bi->bi_status;
5477         unsigned long start_time = md_io_acct->start_time;
5478
5479         bio_put(bi);
5480
5481         rdev = (void*)raid_bi->bi_next;
5482         raid_bi->bi_next = NULL;
5483         mddev = rdev->mddev;
5484         conf = mddev->private;
5485
5486         rdev_dec_pending(rdev, conf->mddev);
5487
5488         if (!error) {
5489                 if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
5490                         bio_end_io_acct(raid_bi, start_time);
5491                 bio_endio(raid_bi);
5492                 if (atomic_dec_and_test(&conf->active_aligned_reads))
5493                         wake_up(&conf->wait_for_quiescent);
5494                 return;
5495         }
5496
5497         pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5498
5499         add_bio_to_retry(raid_bi, conf);
5500 }
5501
5502 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
5503 {
5504         struct r5conf *conf = mddev->private;
5505         struct bio *align_bio;
5506         struct md_rdev *rdev;
5507         sector_t sector, end_sector, first_bad;
5508         int bad_sectors, dd_idx;
5509         struct md_io_acct *md_io_acct;
5510         bool did_inc;
5511
5512         if (!in_chunk_boundary(mddev, raid_bio)) {
5513                 pr_debug("%s: non aligned\n", __func__);
5514                 return 0;
5515         }
5516
5517         sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0,
5518                                       &dd_idx, NULL);
5519         end_sector = bio_end_sector(raid_bio);
5520
5521         rcu_read_lock();
5522         if (r5c_big_stripe_cached(conf, sector))
5523                 goto out_rcu_unlock;
5524
5525         rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5526         if (!rdev || test_bit(Faulty, &rdev->flags) ||
5527             rdev->recovery_offset < end_sector) {
5528                 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5529                 if (!rdev)
5530                         goto out_rcu_unlock;
5531                 if (test_bit(Faulty, &rdev->flags) ||
5532                     !(test_bit(In_sync, &rdev->flags) ||
5533                       rdev->recovery_offset >= end_sector))
5534                         goto out_rcu_unlock;
5535         }
5536
5537         atomic_inc(&rdev->nr_pending);
5538         rcu_read_unlock();
5539
5540         if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
5541                         &bad_sectors)) {
5542                 rdev_dec_pending(rdev, mddev);
5543                 return 0;
5544         }
5545
5546         align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
5547                                     &mddev->io_acct_set);
5548         md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
5549         raid_bio->bi_next = (void *)rdev;
5550         if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
5551                 md_io_acct->start_time = bio_start_io_acct(raid_bio);
5552         md_io_acct->orig_bio = raid_bio;
5553
5554         align_bio->bi_end_io = raid5_align_endio;
5555         align_bio->bi_private = md_io_acct;
5556         align_bio->bi_iter.bi_sector = sector;
5557
5558         /* No reshape active, so we can trust rdev->data_offset */
5559         align_bio->bi_iter.bi_sector += rdev->data_offset;
5560
5561         did_inc = false;
5562         if (conf->quiesce == 0) {
5563                 atomic_inc(&conf->active_aligned_reads);
5564                 did_inc = true;
5565         }
5566         /* need a memory barrier to detect the race with raid5_quiesce() */
5567         if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) {
5568                 /* quiesce is in progress, so we need to undo io activation and wait
5569                  * for it to finish
5570                  */
5571                 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads))
5572                         wake_up(&conf->wait_for_quiescent);
5573                 spin_lock_irq(&conf->device_lock);
5574                 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0,
5575                                     conf->device_lock);
5576                 atomic_inc(&conf->active_aligned_reads);
5577                 spin_unlock_irq(&conf->device_lock);
5578         }
5579
5580         if (mddev->gendisk)
5581                 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
5582                                       raid_bio->bi_iter.bi_sector);
5583         submit_bio_noacct(align_bio);
5584         return 1;
5585
5586 out_rcu_unlock:
5587         rcu_read_unlock();
5588         return 0;
5589 }
5590
5591 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5592 {
5593         struct bio *split;
5594         sector_t sector = raid_bio->bi_iter.bi_sector;
5595         unsigned chunk_sects = mddev->chunk_sectors;
5596         unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5597
5598         if (sectors < bio_sectors(raid_bio)) {
5599                 struct r5conf *conf = mddev->private;
5600                 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
5601                 bio_chain(split, raid_bio);
5602                 submit_bio_noacct(raid_bio);
5603                 raid_bio = split;
5604         }
5605
5606         if (!raid5_read_one_chunk(mddev, raid_bio))
5607                 return raid_bio;
5608
5609         return NULL;
5610 }
5611
5612 /* __get_priority_stripe - get the next stripe to process
5613  *
5614  * Full stripe writes are allowed to pass preread active stripes up until
5615  * the bypass_threshold is exceeded.  In general the bypass_count
5616  * increments when the handle_list is handled before the hold_list; however, it
5617  * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
5618  * stripe with in flight i/o.  The bypass_count will be reset when the
5619  * head of the hold_list has changed, i.e. the head was promoted to the
5620  * handle_list.
5621  */
5622 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5623         __must_hold(&conf->device_lock)
5624 {
5625         struct stripe_head *sh, *tmp;
5626         struct list_head *handle_list = NULL;
5627         struct r5worker_group *wg;
5628         bool second_try = !r5c_is_writeback(conf->log) &&
5629                 !r5l_log_disk_error(conf);
5630         bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
5631                 r5l_log_disk_error(conf);
5632
5633 again:
5634         wg = NULL;
5635         sh = NULL;
5636         if (conf->worker_cnt_per_group == 0) {
5637                 handle_list = try_loprio ? &conf->loprio_list :
5638                                         &conf->handle_list;
5639         } else if (group != ANY_GROUP) {
5640                 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5641                                 &conf->worker_groups[group].handle_list;
5642                 wg = &conf->worker_groups[group];
5643         } else {
5644                 int i;
5645                 for (i = 0; i < conf->group_cnt; i++) {
5646                         handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5647                                 &conf->worker_groups[i].handle_list;
5648                         wg = &conf->worker_groups[i];
5649                         if (!list_empty(handle_list))
5650                                 break;
5651                 }
5652         }
5653
5654         pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5655                   __func__,
5656                   list_empty(handle_list) ? "empty" : "busy",
5657                   list_empty(&conf->hold_list) ? "empty" : "busy",
5658                   atomic_read(&conf->pending_full_writes), conf->bypass_count);
5659
5660         if (!list_empty(handle_list)) {
5661                 sh = list_entry(handle_list->next, typeof(*sh), lru);
5662
5663                 if (list_empty(&conf->hold_list))
5664                         conf->bypass_count = 0;
5665                 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5666                         if (conf->hold_list.next == conf->last_hold)
5667                                 conf->bypass_count++;
5668                         else {
5669                                 conf->last_hold = conf->hold_list.next;
5670                                 conf->bypass_count -= conf->bypass_threshold;
5671                                 if (conf->bypass_count < 0)
5672                                         conf->bypass_count = 0;
5673                         }
5674                 }
5675         } else if (!list_empty(&conf->hold_list) &&
5676                    ((conf->bypass_threshold &&
5677                      conf->bypass_count > conf->bypass_threshold) ||
5678                     atomic_read(&conf->pending_full_writes) == 0)) {
5679
5680                 list_for_each_entry(tmp, &conf->hold_list,  lru) {
5681                         if (conf->worker_cnt_per_group == 0 ||
5682                             group == ANY_GROUP ||
5683                             !cpu_online(tmp->cpu) ||
5684                             cpu_to_group(tmp->cpu) == group) {
5685                                 sh = tmp;
5686                                 break;
5687                         }
5688                 }
5689
5690                 if (sh) {
5691                         conf->bypass_count -= conf->bypass_threshold;
5692                         if (conf->bypass_count < 0)
5693                                 conf->bypass_count = 0;
5694                 }
5695                 wg = NULL;
5696         }
5697
5698         if (!sh) {
5699                 if (second_try)
5700                         return NULL;
5701                 second_try = true;
5702                 try_loprio = !try_loprio;
5703                 goto again;
5704         }
5705
5706         if (wg) {
5707                 wg->stripes_cnt--;
5708                 sh->group = NULL;
5709         }
5710         list_del_init(&sh->lru);
5711         BUG_ON(atomic_inc_return(&sh->count) != 1);
5712         return sh;
5713 }
5714
5715 struct raid5_plug_cb {
5716         struct blk_plug_cb      cb;
5717         struct list_head        list;
5718         struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5719 };
5720
5721 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5722 {
5723         struct raid5_plug_cb *cb = container_of(
5724                 blk_cb, struct raid5_plug_cb, cb);
5725         struct stripe_head *sh;
5726         struct mddev *mddev = cb->cb.data;
5727         struct r5conf *conf = mddev->private;
5728         int cnt = 0;
5729         int hash;
5730
5731         if (cb->list.next && !list_empty(&cb->list)) {
5732                 spin_lock_irq(&conf->device_lock);
5733                 while (!list_empty(&cb->list)) {
5734                         sh = list_first_entry(&cb->list, struct stripe_head, lru);
5735                         list_del_init(&sh->lru);
5736                         /*
5737                          * avoid race release_stripe_plug() sees
5738                          * STRIPE_ON_UNPLUG_LIST clear but the stripe
5739                          * is still in our list
5740                          */
5741                         smp_mb__before_atomic();
5742                         clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5743                         /*
5744                          * STRIPE_ON_RELEASE_LIST could be set here. In that
5745                          * case, the count is always > 1 here
5746                          */
5747                         hash = sh->hash_lock_index;
5748                         __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5749                         cnt++;
5750                 }
5751                 spin_unlock_irq(&conf->device_lock);
5752         }
5753         release_inactive_stripe_list(conf, cb->temp_inactive_list,
5754                                      NR_STRIPE_HASH_LOCKS);
5755         if (mddev->queue)
5756                 trace_block_unplug(mddev->queue, cnt, !from_schedule);
5757         kfree(cb);
5758 }
5759
5760 static void release_stripe_plug(struct mddev *mddev,
5761                                 struct stripe_head *sh)
5762 {
5763         struct blk_plug_cb *blk_cb = blk_check_plugged(
5764                 raid5_unplug, mddev,
5765                 sizeof(struct raid5_plug_cb));
5766         struct raid5_plug_cb *cb;
5767
5768         if (!blk_cb) {
5769                 raid5_release_stripe(sh);
5770                 return;
5771         }
5772
5773         cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5774
5775         if (cb->list.next == NULL) {
5776                 int i;
5777                 INIT_LIST_HEAD(&cb->list);
5778                 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5779                         INIT_LIST_HEAD(cb->temp_inactive_list + i);
5780         }
5781
5782         if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5783                 list_add_tail(&sh->lru, &cb->list);
5784         else
5785                 raid5_release_stripe(sh);
5786 }
5787
5788 static void make_discard_request(struct mddev *mddev, struct bio *bi)
5789 {
5790         struct r5conf *conf = mddev->private;
5791         sector_t logical_sector, last_sector;
5792         struct stripe_head *sh;
5793         int stripe_sectors;
5794
5795         /* We need to handle this when io_uring supports discard/trim */
5796         if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5797                 return;
5798
5799         if (mddev->reshape_position != MaxSector)
5800                 /* Skip discard while reshape is happening */
5801                 return;
5802
5803         logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5804         last_sector = bio_end_sector(bi);
5805
5806         bi->bi_next = NULL;
5807
5808         stripe_sectors = conf->chunk_sectors *
5809                 (conf->raid_disks - conf->max_degraded);
5810         logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5811                                                stripe_sectors);
5812         sector_div(last_sector, stripe_sectors);
5813
5814         logical_sector *= conf->chunk_sectors;
5815         last_sector *= conf->chunk_sectors;
5816
5817         for (; logical_sector < last_sector;
5818              logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5819                 DEFINE_WAIT(w);
5820                 int d;
5821         again:
5822                 sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
5823                 prepare_to_wait(&conf->wait_for_overlap, &w,
5824                                 TASK_UNINTERRUPTIBLE);
5825                 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5826                 if (test_bit(STRIPE_SYNCING, &sh->state)) {
5827                         raid5_release_stripe(sh);
5828                         schedule();
5829                         goto again;
5830                 }
5831                 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5832                 spin_lock_irq(&sh->stripe_lock);
5833                 for (d = 0; d < conf->raid_disks; d++) {
5834                         if (d == sh->pd_idx || d == sh->qd_idx)
5835                                 continue;
5836                         if (sh->dev[d].towrite || sh->dev[d].toread) {
5837                                 set_bit(R5_Overlap, &sh->dev[d].flags);
5838                                 spin_unlock_irq(&sh->stripe_lock);
5839                                 raid5_release_stripe(sh);
5840                                 schedule();
5841                                 goto again;
5842                         }
5843                 }
5844                 set_bit(STRIPE_DISCARD, &sh->state);
5845                 finish_wait(&conf->wait_for_overlap, &w);
5846                 sh->overwrite_disks = 0;
5847                 for (d = 0; d < conf->raid_disks; d++) {
5848                         if (d == sh->pd_idx || d == sh->qd_idx)
5849                                 continue;
5850                         sh->dev[d].towrite = bi;
5851                         set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5852                         bio_inc_remaining(bi);
5853                         md_write_inc(mddev, bi);
5854                         sh->overwrite_disks++;
5855                 }
5856                 spin_unlock_irq(&sh->stripe_lock);
5857                 if (conf->mddev->bitmap) {
5858                         for (d = 0;
5859                              d < conf->raid_disks - conf->max_degraded;
5860                              d++)
5861                                 md_bitmap_startwrite(mddev->bitmap,
5862                                                      sh->sector,
5863                                                      RAID5_STRIPE_SECTORS(conf),
5864                                                      0);
5865                         sh->bm_seq = conf->seq_flush + 1;
5866                         set_bit(STRIPE_BIT_DELAY, &sh->state);
5867                 }
5868
5869                 set_bit(STRIPE_HANDLE, &sh->state);
5870                 clear_bit(STRIPE_DELAYED, &sh->state);
5871                 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5872                         atomic_inc(&conf->preread_active_stripes);
5873                 release_stripe_plug(mddev, sh);
5874         }
5875
5876         bio_endio(bi);
5877 }
5878
5879 static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5880                              sector_t reshape_sector)
5881 {
5882         return mddev->reshape_backwards ? sector < reshape_sector :
5883                                           sector >= reshape_sector;
5884 }
5885
5886 static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5887                                    sector_t max, sector_t reshape_sector)
5888 {
5889         return mddev->reshape_backwards ? max < reshape_sector :
5890                                           min >= reshape_sector;
5891 }
5892
5893 static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf,
5894                                     struct stripe_head *sh)
5895 {
5896         sector_t max_sector = 0, min_sector = MaxSector;
5897         bool ret = false;
5898         int dd_idx;
5899
5900         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5901                 if (dd_idx == sh->pd_idx)
5902                         continue;
5903
5904                 min_sector = min(min_sector, sh->dev[dd_idx].sector);
5905                 max_sector = min(max_sector, sh->dev[dd_idx].sector);
5906         }
5907
5908         spin_lock_irq(&conf->device_lock);
5909
5910         if (!range_ahead_of_reshape(mddev, min_sector, max_sector,
5911                                      conf->reshape_progress))
5912                 /* mismatch, need to try again */
5913                 ret = true;
5914
5915         spin_unlock_irq(&conf->device_lock);
5916
5917         return ret;
5918 }
5919
5920 static int add_all_stripe_bios(struct r5conf *conf,
5921                 struct stripe_request_ctx *ctx, struct stripe_head *sh,
5922                 struct bio *bi, int forwrite, int previous)
5923 {
5924         int dd_idx;
5925         int ret = 1;
5926
5927         spin_lock_irq(&sh->stripe_lock);
5928
5929         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5930                 struct r5dev *dev = &sh->dev[dd_idx];
5931
5932                 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5933                         continue;
5934
5935                 if (dev->sector < ctx->first_sector ||
5936                     dev->sector >= ctx->last_sector)
5937                         continue;
5938
5939                 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5940                         set_bit(R5_Overlap, &dev->flags);
5941                         ret = 0;
5942                         continue;
5943                 }
5944         }
5945
5946         if (!ret)
5947                 goto out;
5948
5949         for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
5950                 struct r5dev *dev = &sh->dev[dd_idx];
5951
5952                 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
5953                         continue;
5954
5955                 if (dev->sector < ctx->first_sector ||
5956                     dev->sector >= ctx->last_sector)
5957                         continue;
5958
5959                 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5960                 clear_bit((dev->sector - ctx->first_sector) >>
5961                           RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
5962         }
5963
5964 out:
5965         spin_unlock_irq(&sh->stripe_lock);
5966         return ret;
5967 }
5968
5969 static enum stripe_result make_stripe_request(struct mddev *mddev,
5970                 struct r5conf *conf, struct stripe_request_ctx *ctx,
5971                 sector_t logical_sector, struct bio *bi)
5972 {
5973         const int rw = bio_data_dir(bi);
5974         enum stripe_result ret;
5975         struct stripe_head *sh;
5976         sector_t new_sector;
5977         int previous = 0, flags = 0;
5978         int seq, dd_idx;
5979
5980         seq = read_seqcount_begin(&conf->gen_lock);
5981
5982         if (unlikely(conf->reshape_progress != MaxSector)) {
5983                 /*
5984                  * Spinlock is needed as reshape_progress may be
5985                  * 64bit on a 32bit platform, and so it might be
5986                  * possible to see a half-updated value
5987                  * Of course reshape_progress could change after
5988                  * the lock is dropped, so once we get a reference
5989                  * to the stripe that we think it is, we will have
5990                  * to check again.
5991                  */
5992                 spin_lock_irq(&conf->device_lock);
5993                 if (ahead_of_reshape(mddev, logical_sector,
5994                                      conf->reshape_progress)) {
5995                         previous = 1;
5996                 } else {
5997                         if (ahead_of_reshape(mddev, logical_sector,
5998                                              conf->reshape_safe)) {
5999                                 spin_unlock_irq(&conf->device_lock);
6000                                 return STRIPE_SCHEDULE_AND_RETRY;
6001                         }
6002                 }
6003                 spin_unlock_irq(&conf->device_lock);
6004         }
6005
6006         new_sector = raid5_compute_sector(conf, logical_sector, previous,
6007                                           &dd_idx, NULL);
6008         pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
6009                  new_sector, logical_sector);
6010
6011         if (previous)
6012                 flags |= R5_GAS_PREVIOUS;
6013         if (bi->bi_opf & REQ_RAHEAD)
6014                 flags |= R5_GAS_NOBLOCK;
6015         sh = raid5_get_active_stripe(conf, ctx, new_sector, flags);
6016         if (unlikely(!sh)) {
6017                 /* cannot get stripe, just give-up */
6018                 bi->bi_status = BLK_STS_IOERR;
6019                 return STRIPE_FAIL;
6020         }
6021
6022         if (unlikely(previous) &&
6023             stripe_ahead_of_reshape(mddev, conf, sh)) {
6024                 /*
6025                  * Expansion moved on while waiting for a stripe.
6026                  * Expansion could still move past after this
6027                  * test, but as we are holding a reference to
6028                  * 'sh', we know that if that happens,
6029                  *  STRIPE_EXPANDING will get set and the expansion
6030                  * won't proceed until we finish with the stripe.
6031                  */
6032                 ret = STRIPE_SCHEDULE_AND_RETRY;
6033                 goto out_release;
6034         }
6035
6036         if (read_seqcount_retry(&conf->gen_lock, seq)) {
6037                 /* Might have got the wrong stripe_head by accident */
6038                 ret = STRIPE_RETRY;
6039                 goto out_release;
6040         }
6041
6042         if (test_bit(STRIPE_EXPANDING, &sh->state) ||
6043             !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
6044                 /*
6045                  * Stripe is busy expanding or add failed due to
6046                  * overlap. Flush everything and wait a while.
6047                  */
6048                 md_wakeup_thread(mddev->thread);
6049                 ret = STRIPE_SCHEDULE_AND_RETRY;
6050                 goto out_release;
6051         }
6052
6053         if (stripe_can_batch(sh)) {
6054                 stripe_add_to_batch_list(conf, sh, ctx->batch_last);
6055                 if (ctx->batch_last)
6056                         raid5_release_stripe(ctx->batch_last);
6057                 atomic_inc(&sh->count);
6058                 ctx->batch_last = sh;
6059         }
6060
6061         if (ctx->do_flush) {
6062                 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
6063                 /* we only need flush for one stripe */
6064                 ctx->do_flush = false;
6065         }
6066
6067         set_bit(STRIPE_HANDLE, &sh->state);
6068         clear_bit(STRIPE_DELAYED, &sh->state);
6069         if ((!sh->batch_head || sh == sh->batch_head) &&
6070             (bi->bi_opf & REQ_SYNC) &&
6071             !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
6072                 atomic_inc(&conf->preread_active_stripes);
6073
6074         release_stripe_plug(mddev, sh);
6075         return STRIPE_SUCCESS;
6076
6077 out_release:
6078         raid5_release_stripe(sh);
6079         return ret;
6080 }
6081
6082 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
6083 {
6084         DEFINE_WAIT_FUNC(wait, woken_wake_function);
6085         struct r5conf *conf = mddev->private;
6086         sector_t logical_sector;
6087         struct stripe_request_ctx ctx = {};
6088         const int rw = bio_data_dir(bi);
6089         enum stripe_result res;
6090         int s, stripe_cnt;
6091
6092         if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6093                 int ret = log_handle_flush_request(conf, bi);
6094
6095                 if (ret == 0)
6096                         return true;
6097                 if (ret == -ENODEV) {
6098                         if (md_flush_request(mddev, bi))
6099                                 return true;
6100                 }
6101                 /* ret == -EAGAIN, fallback */
6102                 /*
6103                  * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
6104                  * we need to flush journal device
6105                  */
6106                 ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6107         }
6108
6109         if (!md_write_start(mddev, bi))
6110                 return false;
6111         /*
6112          * If array is degraded, better not do chunk aligned read because
6113          * later we might have to read it again in order to reconstruct
6114          * data on failed drives.
6115          */
6116         if (rw == READ && mddev->degraded == 0 &&
6117             mddev->reshape_position == MaxSector) {
6118                 bi = chunk_aligned_read(mddev, bi);
6119                 if (!bi)
6120                         return true;
6121         }
6122
6123         if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6124                 make_discard_request(mddev, bi);
6125                 md_write_end(mddev);
6126                 return true;
6127         }
6128
6129         logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6130         ctx.first_sector = logical_sector;
6131         ctx.last_sector = bio_end_sector(bi);
6132         bi->bi_next = NULL;
6133
6134         stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6135                                            RAID5_STRIPE_SECTORS(conf));
6136         bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
6137
6138         pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6139                  bi->bi_iter.bi_sector, ctx.last_sector);
6140
6141         /* Bail out if conflicts with reshape and REQ_NOWAIT is set */
6142         if ((bi->bi_opf & REQ_NOWAIT) &&
6143             (conf->reshape_progress != MaxSector) &&
6144             !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
6145             ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
6146                 bio_wouldblock_error(bi);
6147                 if (rw == WRITE)
6148                         md_write_end(mddev);
6149                 return true;
6150         }
6151         md_account_bio(mddev, &bi);
6152
6153         add_wait_queue(&conf->wait_for_overlap, &wait);
6154         while (1) {
6155                 res = make_stripe_request(mddev, conf, &ctx, logical_sector,
6156                                           bi);
6157                 if (res == STRIPE_FAIL)
6158                         break;
6159
6160                 if (res == STRIPE_RETRY)
6161                         continue;
6162
6163                 if (res == STRIPE_SCHEDULE_AND_RETRY) {
6164                         /*
6165                          * Must release the reference to batch_last before
6166                          * scheduling and waiting for work to be done,
6167                          * otherwise the batch_last stripe head could prevent
6168                          * raid5_activate_delayed() from making progress
6169                          * and thus deadlocking.
6170                          */
6171                         if (ctx.batch_last) {
6172                                 raid5_release_stripe(ctx.batch_last);
6173                                 ctx.batch_last = NULL;
6174                         }
6175
6176                         wait_woken(&wait, TASK_UNINTERRUPTIBLE,
6177                                    MAX_SCHEDULE_TIMEOUT);
6178                         continue;
6179                 }
6180
6181                 s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
6182                 if (s == stripe_cnt)
6183                         break;
6184
6185                 logical_sector = ctx.first_sector +
6186                         (s << RAID5_STRIPE_SHIFT(conf));
6187         }
6188         remove_wait_queue(&conf->wait_for_overlap, &wait);
6189
6190         if (ctx.batch_last)
6191                 raid5_release_stripe(ctx.batch_last);
6192
6193         if (rw == WRITE)
6194                 md_write_end(mddev);
6195         bio_endio(bi);
6196         return true;
6197 }
6198
6199 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
6200
6201 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
6202 {
6203         /* reshaping is quite different to recovery/resync so it is
6204          * handled quite separately ... here.
6205          *
6206          * On each call to sync_request, we gather one chunk worth of
6207          * destination stripes and flag them as expanding.
6208          * Then we find all the source stripes and request reads.
6209          * As the reads complete, handle_stripe will copy the data
6210          * into the destination stripe and release that stripe.
6211          */
6212         struct r5conf *conf = mddev->private;
6213         struct stripe_head *sh;
6214         struct md_rdev *rdev;
6215         sector_t first_sector, last_sector;
6216         int raid_disks = conf->previous_raid_disks;
6217         int data_disks = raid_disks - conf->max_degraded;
6218         int new_data_disks = conf->raid_disks - conf->max_degraded;
6219         int i;
6220         int dd_idx;
6221         sector_t writepos, readpos, safepos;
6222         sector_t stripe_addr;
6223         int reshape_sectors;
6224         struct list_head stripes;
6225         sector_t retn;
6226
6227         if (sector_nr == 0) {
6228                 /* If restarting in the middle, skip the initial sectors */
6229                 if (mddev->reshape_backwards &&
6230                     conf->reshape_progress < raid5_size(mddev, 0, 0)) {
6231                         sector_nr = raid5_size(mddev, 0, 0)
6232                                 - conf->reshape_progress;
6233                 } else if (mddev->reshape_backwards &&
6234                            conf->reshape_progress == MaxSector) {
6235                         /* shouldn't happen, but just in case, finish up.*/
6236                         sector_nr = MaxSector;
6237                 } else if (!mddev->reshape_backwards &&
6238                            conf->reshape_progress > 0)
6239                         sector_nr = conf->reshape_progress;
6240                 sector_div(sector_nr, new_data_disks);
6241                 if (sector_nr) {
6242                         mddev->curr_resync_completed = sector_nr;
6243                         sysfs_notify_dirent_safe(mddev->sysfs_completed);
6244                         *skipped = 1;
6245                         retn = sector_nr;
6246                         goto finish;
6247                 }
6248         }
6249
6250         /* We need to process a full chunk at a time.
6251          * If old and new chunk sizes differ, we need to process the
6252          * largest of these
6253          */
6254
6255         reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6256
6257         /* We update the metadata at least every 10 seconds, or when
6258          * the data about to be copied would over-write the source of
6259          * the data at the front of the range.  i.e. one new_stripe
6260          * along from reshape_progress new_maps to after where
6261          * reshape_safe old_maps to
6262          */
6263         writepos = conf->reshape_progress;
6264         sector_div(writepos, new_data_disks);
6265         readpos = conf->reshape_progress;
6266         sector_div(readpos, data_disks);
6267         safepos = conf->reshape_safe;
6268         sector_div(safepos, data_disks);
6269         if (mddev->reshape_backwards) {
6270                 BUG_ON(writepos < reshape_sectors);
6271                 writepos -= reshape_sectors;
6272                 readpos += reshape_sectors;
6273                 safepos += reshape_sectors;
6274         } else {
6275                 writepos += reshape_sectors;
6276                 /* readpos and safepos are worst-case calculations.
6277                  * A negative number is overly pessimistic, and causes
6278                  * obvious problems for unsigned storage.  So clip to 0.
6279                  */
6280                 readpos -= min_t(sector_t, reshape_sectors, readpos);
6281                 safepos -= min_t(sector_t, reshape_sectors, safepos);
6282         }
6283
6284         /* Having calculated the 'writepos' possibly use it
6285          * to set 'stripe_addr' which is where we will write to.
6286          */
6287         if (mddev->reshape_backwards) {
6288                 BUG_ON(conf->reshape_progress == 0);
6289                 stripe_addr = writepos;
6290                 BUG_ON((mddev->dev_sectors &
6291                         ~((sector_t)reshape_sectors - 1))
6292                        - reshape_sectors - stripe_addr
6293                        != sector_nr);
6294         } else {
6295                 BUG_ON(writepos != sector_nr + reshape_sectors);
6296                 stripe_addr = sector_nr;
6297         }
6298
6299         /* 'writepos' is the most advanced device address we might write.
6300          * 'readpos' is the least advanced device address we might read.
6301          * 'safepos' is the least address recorded in the metadata as having
6302          *     been reshaped.
6303          * If there is a min_offset_diff, these are adjusted either by
6304          * increasing the safepos/readpos if diff is negative, or
6305          * increasing writepos if diff is positive.
6306          * If 'readpos' is then behind 'writepos', there is no way that we can
6307          * ensure safety in the face of a crash - that must be done by userspace
6308          * making a backup of the data.  So in that case there is no particular
6309          * rush to update metadata.
6310          * Otherwise if 'safepos' is behind 'writepos', then we really need to
6311          * update the metadata to advance 'safepos' to match 'readpos' so that
6312          * we can be safe in the event of a crash.
6313          * So we insist on updating metadata if safepos is behind writepos and
6314          * readpos is beyond writepos.
6315          * In any case, update the metadata every 10 seconds.
6316          * Maybe that number should be configurable, but I'm not sure it is
6317          * worth it.... maybe it could be a multiple of safemode_delay???
6318          */
6319         if (conf->min_offset_diff < 0) {
6320                 safepos += -conf->min_offset_diff;
6321                 readpos += -conf->min_offset_diff;
6322         } else
6323                 writepos += conf->min_offset_diff;
6324
6325         if ((mddev->reshape_backwards
6326              ? (safepos > writepos && readpos < writepos)
6327              : (safepos < writepos && readpos > writepos)) ||
6328             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
6329                 /* Cannot proceed until we've updated the superblock... */
6330                 wait_event(conf->wait_for_overlap,
6331                            atomic_read(&conf->reshape_stripes)==0
6332                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6333                 if (atomic_read(&conf->reshape_stripes) != 0)
6334                         return 0;
6335                 mddev->reshape_position = conf->reshape_progress;
6336                 mddev->curr_resync_completed = sector_nr;
6337                 if (!mddev->reshape_backwards)
6338                         /* Can update recovery_offset */
6339                         rdev_for_each(rdev, mddev)
6340                                 if (rdev->raid_disk >= 0 &&
6341                                     !test_bit(Journal, &rdev->flags) &&
6342                                     !test_bit(In_sync, &rdev->flags) &&
6343                                     rdev->recovery_offset < sector_nr)
6344                                         rdev->recovery_offset = sector_nr;
6345
6346                 conf->reshape_checkpoint = jiffies;
6347                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6348                 md_wakeup_thread(mddev->thread);
6349                 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
6350                            test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6351                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6352                         return 0;
6353                 spin_lock_irq(&conf->device_lock);
6354                 conf->reshape_safe = mddev->reshape_position;
6355                 spin_unlock_irq(&conf->device_lock);
6356                 wake_up(&conf->wait_for_overlap);
6357                 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6358         }
6359
6360         INIT_LIST_HEAD(&stripes);
6361         for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6362                 int j;
6363                 int skipped_disk = 0;
6364                 sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i,
6365                                              R5_GAS_NOQUIESCE);
6366                 set_bit(STRIPE_EXPANDING, &sh->state);
6367                 atomic_inc(&conf->reshape_stripes);
6368                 /* If any of this stripe is beyond the end of the old
6369                  * array, then we need to zero those blocks
6370                  */
6371                 for (j=sh->disks; j--;) {
6372                         sector_t s;
6373                         if (j == sh->pd_idx)
6374                                 continue;
6375                         if (conf->level == 6 &&
6376                             j == sh->qd_idx)
6377                                 continue;
6378                         s = raid5_compute_blocknr(sh, j, 0);
6379                         if (s < raid5_size(mddev, 0, 0)) {
6380                                 skipped_disk = 1;
6381                                 continue;
6382                         }
6383                         memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
6384                         set_bit(R5_Expanded, &sh->dev[j].flags);
6385                         set_bit(R5_UPTODATE, &sh->dev[j].flags);
6386                 }
6387                 if (!skipped_disk) {
6388                         set_bit(STRIPE_EXPAND_READY, &sh->state);
6389                         set_bit(STRIPE_HANDLE, &sh->state);
6390                 }
6391                 list_add(&sh->lru, &stripes);
6392         }
6393         spin_lock_irq(&conf->device_lock);
6394         if (mddev->reshape_backwards)
6395                 conf->reshape_progress -= reshape_sectors * new_data_disks;
6396         else
6397                 conf->reshape_progress += reshape_sectors * new_data_disks;
6398         spin_unlock_irq(&conf->device_lock);
6399         /* Ok, those stripe are ready. We can start scheduling
6400          * reads on the source stripes.
6401          * The source stripes are determined by mapping the first and last
6402          * block on the destination stripes.
6403          */
6404         first_sector =
6405                 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
6406                                      1, &dd_idx, NULL);
6407         last_sector =
6408                 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
6409                                             * new_data_disks - 1),
6410                                      1, &dd_idx, NULL);
6411         if (last_sector >= mddev->dev_sectors)
6412                 last_sector = mddev->dev_sectors - 1;
6413         while (first_sector <= last_sector) {
6414                 sh = raid5_get_active_stripe(conf, NULL, first_sector,
6415                                 R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE);
6416                 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
6417                 set_bit(STRIPE_HANDLE, &sh->state);
6418                 raid5_release_stripe(sh);
6419                 first_sector += RAID5_STRIPE_SECTORS(conf);
6420         }
6421         /* Now that the sources are clearly marked, we can release
6422          * the destination stripes
6423          */
6424         while (!list_empty(&stripes)) {
6425                 sh = list_entry(stripes.next, struct stripe_head, lru);
6426                 list_del_init(&sh->lru);
6427                 raid5_release_stripe(sh);
6428         }
6429         /* If this takes us to the resync_max point where we have to pause,
6430          * then we need to write out the superblock.
6431          */
6432         sector_nr += reshape_sectors;
6433         retn = reshape_sectors;
6434 finish:
6435         if (mddev->curr_resync_completed > mddev->resync_max ||
6436             (sector_nr - mddev->curr_resync_completed) * 2
6437             >= mddev->resync_max - mddev->curr_resync_completed) {
6438                 /* Cannot proceed until we've updated the superblock... */
6439                 wait_event(conf->wait_for_overlap,
6440                            atomic_read(&conf->reshape_stripes) == 0
6441                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6442                 if (atomic_read(&conf->reshape_stripes) != 0)
6443                         goto ret;
6444                 mddev->reshape_position = conf->reshape_progress;
6445                 mddev->curr_resync_completed = sector_nr;
6446                 if (!mddev->reshape_backwards)
6447                         /* Can update recovery_offset */
6448                         rdev_for_each(rdev, mddev)
6449                                 if (rdev->raid_disk >= 0 &&
6450                                     !test_bit(Journal, &rdev->flags) &&
6451                                     !test_bit(In_sync, &rdev->flags) &&
6452                                     rdev->recovery_offset < sector_nr)
6453                                         rdev->recovery_offset = sector_nr;
6454                 conf->reshape_checkpoint = jiffies;
6455                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6456                 md_wakeup_thread(mddev->thread);
6457                 wait_event(mddev->sb_wait,
6458                            !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6459                            || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6460                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6461                         goto ret;
6462                 spin_lock_irq(&conf->device_lock);
6463                 conf->reshape_safe = mddev->reshape_position;
6464                 spin_unlock_irq(&conf->device_lock);
6465                 wake_up(&conf->wait_for_overlap);
6466                 sysfs_notify_dirent_safe(mddev->sysfs_completed);
6467         }
6468 ret:
6469         return retn;
6470 }
6471
6472 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6473                                           int *skipped)
6474 {
6475         struct r5conf *conf = mddev->private;
6476         struct stripe_head *sh;
6477         sector_t max_sector = mddev->dev_sectors;
6478         sector_t sync_blocks;
6479         int still_degraded = 0;
6480         int i;
6481
6482         if (sector_nr >= max_sector) {
6483                 /* just being told to finish up .. nothing much to do */
6484
6485                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6486                         end_reshape(conf);
6487                         return 0;
6488                 }
6489
6490                 if (mddev->curr_resync < max_sector) /* aborted */
6491                         md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
6492                                            &sync_blocks, 1);
6493                 else /* completed sync */
6494                         conf->fullsync = 0;
6495                 md_bitmap_close_sync(mddev->bitmap);
6496
6497                 return 0;
6498         }
6499
6500         /* Allow raid5_quiesce to complete */
6501         wait_event(conf->wait_for_overlap, conf->quiesce != 2);
6502
6503         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6504                 return reshape_request(mddev, sector_nr, skipped);
6505
6506         /* No need to check resync_max as we never do more than one
6507          * stripe, and as resync_max will always be on a chunk boundary,
6508          * if the check in md_do_sync didn't fire, there is no chance
6509          * of overstepping resync_max here
6510          */
6511
6512         /* if there is too many failed drives and we are trying
6513          * to resync, then assert that we are finished, because there is
6514          * nothing we can do.
6515          */
6516         if (mddev->degraded >= conf->max_degraded &&
6517             test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6518                 sector_t rv = mddev->dev_sectors - sector_nr;
6519                 *skipped = 1;
6520                 return rv;
6521         }
6522         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6523             !conf->fullsync &&
6524             !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6525             sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6526                 /* we can skip this block, and probably more */
6527                 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6528                 *skipped = 1;
6529                 /* keep things rounded to whole stripes */
6530                 return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6531         }
6532
6533         md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
6534
6535         sh = raid5_get_active_stripe(conf, NULL, sector_nr,
6536                                      R5_GAS_NOBLOCK);
6537         if (sh == NULL) {
6538                 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0);
6539                 /* make sure we don't swamp the stripe cache if someone else
6540                  * is trying to get access
6541                  */
6542                 schedule_timeout_uninterruptible(1);
6543         }
6544         /* Need to check if array will still be degraded after recovery/resync
6545          * Note in case of > 1 drive failures it's possible we're rebuilding
6546          * one drive while leaving another faulty drive in array.
6547          */
6548         rcu_read_lock();
6549         for (i = 0; i < conf->raid_disks; i++) {
6550                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6551
6552                 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
6553                         still_degraded = 1;
6554         }
6555         rcu_read_unlock();
6556
6557         md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
6558
6559         set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
6560         set_bit(STRIPE_HANDLE, &sh->state);
6561
6562         raid5_release_stripe(sh);
6563
6564         return RAID5_STRIPE_SECTORS(conf);
6565 }
6566
6567 static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6568                                unsigned int offset)
6569 {
6570         /* We may not be able to submit a whole bio at once as there
6571          * may not be enough stripe_heads available.
6572          * We cannot pre-allocate enough stripe_heads as we may need
6573          * more than exist in the cache (if we allow ever large chunks).
6574          * So we do one stripe head at a time and record in
6575          * ->bi_hw_segments how many have been done.
6576          *
6577          * We *know* that this entire raid_bio is in one chunk, so
6578          * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
6579          */
6580         struct stripe_head *sh;
6581         int dd_idx;
6582         sector_t sector, logical_sector, last_sector;
6583         int scnt = 0;
6584         int handled = 0;
6585
6586         logical_sector = raid_bio->bi_iter.bi_sector &
6587                 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
6588         sector = raid5_compute_sector(conf, logical_sector,
6589                                       0, &dd_idx, NULL);
6590         last_sector = bio_end_sector(raid_bio);
6591
6592         for (; logical_sector < last_sector;
6593              logical_sector += RAID5_STRIPE_SECTORS(conf),
6594                      sector += RAID5_STRIPE_SECTORS(conf),
6595                      scnt++) {
6596
6597                 if (scnt < offset)
6598                         /* already done this stripe */
6599                         continue;
6600
6601                 sh = raid5_get_active_stripe(conf, NULL, sector,
6602                                 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
6603                 if (!sh) {
6604                         /* failed to get a stripe - must wait */
6605                         conf->retry_read_aligned = raid_bio;
6606                         conf->retry_read_offset = scnt;
6607                         return handled;
6608                 }
6609
6610                 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
6611                         raid5_release_stripe(sh);
6612                         conf->retry_read_aligned = raid_bio;
6613                         conf->retry_read_offset = scnt;
6614                         return handled;
6615                 }
6616
6617                 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
6618                 handle_stripe(sh);
6619                 raid5_release_stripe(sh);
6620                 handled++;
6621         }
6622
6623         bio_endio(raid_bio);
6624
6625         if (atomic_dec_and_test(&conf->active_aligned_reads))
6626                 wake_up(&conf->wait_for_quiescent);
6627         return handled;
6628 }
6629
6630 static int handle_active_stripes(struct r5conf *conf, int group,
6631                                  struct r5worker *worker,
6632                                  struct list_head *temp_inactive_list)
6633                 __must_hold(&conf->device_lock)
6634 {
6635         struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6636         int i, batch_size = 0, hash;
6637         bool release_inactive = false;
6638
6639         while (batch_size < MAX_STRIPE_BATCH &&
6640                         (sh = __get_priority_stripe(conf, group)) != NULL)
6641                 batch[batch_size++] = sh;
6642
6643         if (batch_size == 0) {
6644                 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6645                         if (!list_empty(temp_inactive_list + i))
6646                                 break;
6647                 if (i == NR_STRIPE_HASH_LOCKS) {
6648                         spin_unlock_irq(&conf->device_lock);
6649                         log_flush_stripe_to_raid(conf);
6650                         spin_lock_irq(&conf->device_lock);
6651                         return batch_size;
6652                 }
6653                 release_inactive = true;
6654         }
6655         spin_unlock_irq(&conf->device_lock);
6656
6657         release_inactive_stripe_list(conf, temp_inactive_list,
6658                                      NR_STRIPE_HASH_LOCKS);
6659
6660         r5l_flush_stripe_to_raid(conf->log);
6661         if (release_inactive) {
6662                 spin_lock_irq(&conf->device_lock);
6663                 return 0;
6664         }
6665
6666         for (i = 0; i < batch_size; i++)
6667                 handle_stripe(batch[i]);
6668         log_write_stripe_run(conf);
6669
6670         cond_resched();
6671
6672         spin_lock_irq(&conf->device_lock);
6673         for (i = 0; i < batch_size; i++) {
6674                 hash = batch[i]->hash_lock_index;
6675                 __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6676         }
6677         return batch_size;
6678 }
6679
6680 static void raid5_do_work(struct work_struct *work)
6681 {
6682         struct r5worker *worker = container_of(work, struct r5worker, work);
6683         struct r5worker_group *group = worker->group;
6684         struct r5conf *conf = group->conf;
6685         struct mddev *mddev = conf->mddev;
6686         int group_id = group - conf->worker_groups;
6687         int handled;
6688         struct blk_plug plug;
6689
6690         pr_debug("+++ raid5worker active\n");
6691
6692         blk_start_plug(&plug);
6693         handled = 0;
6694         spin_lock_irq(&conf->device_lock);
6695         while (1) {
6696                 int batch_size, released;
6697
6698                 released = release_stripe_list(conf, worker->temp_inactive_list);
6699
6700                 batch_size = handle_active_stripes(conf, group_id, worker,
6701                                                    worker->temp_inactive_list);
6702                 worker->working = false;
6703                 if (!batch_size && !released)
6704                         break;
6705                 handled += batch_size;
6706                 wait_event_lock_irq(mddev->sb_wait,
6707                         !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6708                         conf->device_lock);
6709         }
6710         pr_debug("%d stripes handled\n", handled);
6711
6712         spin_unlock_irq(&conf->device_lock);
6713
6714         flush_deferred_bios(conf);
6715
6716         r5l_flush_stripe_to_raid(conf->log);
6717
6718         async_tx_issue_pending_all();
6719         blk_finish_plug(&plug);
6720
6721         pr_debug("--- raid5worker inactive\n");
6722 }
6723
6724 /*
6725  * This is our raid5 kernel thread.
6726  *
6727  * We scan the hash table for stripes which can be handled now.
6728  * During the scan, completed stripes are saved for us by the interrupt
6729  * handler, so that they will not have to wait for our next wakeup.
6730  */
6731 static void raid5d(struct md_thread *thread)
6732 {
6733         struct mddev *mddev = thread->mddev;
6734         struct r5conf *conf = mddev->private;
6735         int handled;
6736         struct blk_plug plug;
6737
6738         pr_debug("+++ raid5d active\n");
6739
6740         md_check_recovery(mddev);
6741
6742         blk_start_plug(&plug);
6743         handled = 0;
6744         spin_lock_irq(&conf->device_lock);
6745         while (1) {
6746                 struct bio *bio;
6747                 int batch_size, released;
6748                 unsigned int offset;
6749
6750                 released = release_stripe_list(conf, conf->temp_inactive_list);
6751                 if (released)
6752                         clear_bit(R5_DID_ALLOC, &conf->cache_state);
6753
6754                 if (
6755                     !list_empty(&conf->bitmap_list)) {
6756                         /* Now is a good time to flush some bitmap updates */
6757                         conf->seq_flush++;
6758                         spin_unlock_irq(&conf->device_lock);
6759                         md_bitmap_unplug(mddev->bitmap);
6760                         spin_lock_irq(&conf->device_lock);
6761                         conf->seq_write = conf->seq_flush;
6762                         activate_bit_delay(conf, conf->temp_inactive_list);
6763                 }
6764                 raid5_activate_delayed(conf);
6765
6766                 while ((bio = remove_bio_from_retry(conf, &offset))) {
6767                         int ok;
6768                         spin_unlock_irq(&conf->device_lock);
6769                         ok = retry_aligned_read(conf, bio, offset);
6770                         spin_lock_irq(&conf->device_lock);
6771                         if (!ok)
6772                                 break;
6773                         handled++;
6774                 }
6775
6776                 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6777                                                    conf->temp_inactive_list);
6778                 if (!batch_size && !released)
6779                         break;
6780                 handled += batch_size;
6781
6782                 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6783                         spin_unlock_irq(&conf->device_lock);
6784                         md_check_recovery(mddev);
6785                         spin_lock_irq(&conf->device_lock);
6786
6787                         /*
6788                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
6789                          * seeing md_check_recovery() is needed to clear
6790                          * the flag when using mdmon.
6791                          */
6792                         continue;
6793                 }
6794
6795                 wait_event_lock_irq(mddev->sb_wait,
6796                         !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6797                         conf->device_lock);
6798         }
6799         pr_debug("%d stripes handled\n", handled);
6800
6801         spin_unlock_irq(&conf->device_lock);
6802         if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6803             mutex_trylock(&conf->cache_size_mutex)) {
6804                 grow_one_stripe(conf, __GFP_NOWARN);
6805                 /* Set flag even if allocation failed.  This helps
6806                  * slow down allocation requests when mem is short
6807                  */
6808                 set_bit(R5_DID_ALLOC, &conf->cache_state);
6809                 mutex_unlock(&conf->cache_size_mutex);
6810         }
6811
6812         flush_deferred_bios(conf);
6813
6814         r5l_flush_stripe_to_raid(conf->log);
6815
6816         async_tx_issue_pending_all();
6817         blk_finish_plug(&plug);
6818
6819         pr_debug("--- raid5d inactive\n");
6820 }
6821
6822 static ssize_t
6823 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6824 {
6825         struct r5conf *conf;
6826         int ret = 0;
6827         spin_lock(&mddev->lock);
6828         conf = mddev->private;
6829         if (conf)
6830                 ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6831         spin_unlock(&mddev->lock);
6832         return ret;
6833 }
6834
6835 int
6836 raid5_set_cache_size(struct mddev *mddev, int size)
6837 {
6838         int result = 0;
6839         struct r5conf *conf = mddev->private;
6840
6841         if (size <= 16 || size > 32768)
6842                 return -EINVAL;
6843
6844         conf->min_nr_stripes = size;
6845         mutex_lock(&conf->cache_size_mutex);
6846         while (size < conf->max_nr_stripes &&
6847                drop_one_stripe(conf))
6848                 ;
6849         mutex_unlock(&conf->cache_size_mutex);
6850
6851         md_allow_write(mddev);
6852
6853         mutex_lock(&conf->cache_size_mutex);
6854         while (size > conf->max_nr_stripes)
6855                 if (!grow_one_stripe(conf, GFP_KERNEL)) {
6856                         conf->min_nr_stripes = conf->max_nr_stripes;
6857                         result = -ENOMEM;
6858                         break;
6859                 }
6860         mutex_unlock(&conf->cache_size_mutex);
6861
6862         return result;
6863 }
6864 EXPORT_SYMBOL(raid5_set_cache_size);
6865
6866 static ssize_t
6867 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6868 {
6869         struct r5conf *conf;
6870         unsigned long new;
6871         int err;
6872
6873         if (len >= PAGE_SIZE)
6874                 return -EINVAL;
6875         if (kstrtoul(page, 10, &new))
6876                 return -EINVAL;
6877         err = mddev_lock(mddev);
6878         if (err)
6879                 return err;
6880         conf = mddev->private;
6881         if (!conf)
6882                 err = -ENODEV;
6883         else
6884                 err = raid5_set_cache_size(mddev, new);
6885         mddev_unlock(mddev);
6886
6887         return err ?: len;
6888 }
6889
6890 static struct md_sysfs_entry
6891 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6892                                 raid5_show_stripe_cache_size,
6893                                 raid5_store_stripe_cache_size);
6894
6895 static ssize_t
6896 raid5_show_rmw_level(struct mddev  *mddev, char *page)
6897 {
6898         struct r5conf *conf = mddev->private;
6899         if (conf)
6900                 return sprintf(page, "%d\n", conf->rmw_level);
6901         else
6902                 return 0;
6903 }
6904
6905 static ssize_t
6906 raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
6907 {
6908         struct r5conf *conf = mddev->private;
6909         unsigned long new;
6910
6911         if (!conf)
6912                 return -ENODEV;
6913
6914         if (len >= PAGE_SIZE)
6915                 return -EINVAL;
6916
6917         if (kstrtoul(page, 10, &new))
6918                 return -EINVAL;
6919
6920         if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6921                 return -EINVAL;
6922
6923         if (new != PARITY_DISABLE_RMW &&
6924             new != PARITY_ENABLE_RMW &&
6925             new != PARITY_PREFER_RMW)
6926                 return -EINVAL;
6927
6928         conf->rmw_level = new;
6929         return len;
6930 }
6931
6932 static struct md_sysfs_entry
6933 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6934                          raid5_show_rmw_level,
6935                          raid5_store_rmw_level);
6936
6937 static ssize_t
6938 raid5_show_stripe_size(struct mddev  *mddev, char *page)
6939 {
6940         struct r5conf *conf;
6941         int ret = 0;
6942
6943         spin_lock(&mddev->lock);
6944         conf = mddev->private;
6945         if (conf)
6946                 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6947         spin_unlock(&mddev->lock);
6948         return ret;
6949 }
6950
6951 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6952 static ssize_t
6953 raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
6954 {
6955         struct r5conf *conf;
6956         unsigned long new;
6957         int err;
6958         int size;
6959
6960         if (len >= PAGE_SIZE)
6961                 return -EINVAL;
6962         if (kstrtoul(page, 10, &new))
6963                 return -EINVAL;
6964
6965         /*
6966          * The value should not be bigger than PAGE_SIZE. It requires to
6967          * be multiple of DEFAULT_STRIPE_SIZE and the value should be power
6968          * of two.
6969          */
6970         if (new % DEFAULT_STRIPE_SIZE != 0 ||
6971                         new > PAGE_SIZE || new == 0 ||
6972                         new != roundup_pow_of_two(new))
6973                 return -EINVAL;
6974
6975         err = mddev_lock(mddev);
6976         if (err)
6977                 return err;
6978
6979         conf = mddev->private;
6980         if (!conf) {
6981                 err = -ENODEV;
6982                 goto out_unlock;
6983         }
6984
6985         if (new == conf->stripe_size)
6986                 goto out_unlock;
6987
6988         pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6989                         conf->stripe_size, new);
6990
6991         if (mddev->sync_thread ||
6992                 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6993                 mddev->reshape_position != MaxSector ||
6994                 mddev->sysfs_active) {
6995                 err = -EBUSY;
6996                 goto out_unlock;
6997         }
6998
6999         mddev_suspend(mddev);
7000         mutex_lock(&conf->cache_size_mutex);
7001         size = conf->max_nr_stripes;
7002
7003         shrink_stripes(conf);
7004
7005         conf->stripe_size = new;
7006         conf->stripe_shift = ilog2(new) - 9;
7007         conf->stripe_sectors = new >> 9;
7008         if (grow_stripes(conf, size)) {
7009                 pr_warn("md/raid:%s: couldn't allocate buffers\n",
7010                                 mdname(mddev));
7011                 err = -ENOMEM;
7012         }
7013         mutex_unlock(&conf->cache_size_mutex);
7014         mddev_resume(mddev);
7015
7016 out_unlock:
7017         mddev_unlock(mddev);
7018         return err ?: len;
7019 }
7020
7021 static struct md_sysfs_entry
7022 raid5_stripe_size = __ATTR(stripe_size, 0644,
7023                          raid5_show_stripe_size,
7024                          raid5_store_stripe_size);
7025 #else
7026 static struct md_sysfs_entry
7027 raid5_stripe_size = __ATTR(stripe_size, 0444,
7028                          raid5_show_stripe_size,
7029                          NULL);
7030 #endif
7031
7032 static ssize_t
7033 raid5_show_preread_threshold(struct mddev *mddev, char *page)
7034 {
7035         struct r5conf *conf;
7036         int ret = 0;
7037         spin_lock(&mddev->lock);
7038         conf = mddev->private;
7039         if (conf)
7040                 ret = sprintf(page, "%d\n", conf->bypass_threshold);
7041         spin_unlock(&mddev->lock);
7042         return ret;
7043 }
7044
7045 static ssize_t
7046 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
7047 {
7048         struct r5conf *conf;
7049         unsigned long new;
7050         int err;
7051
7052         if (len >= PAGE_SIZE)
7053                 return -EINVAL;
7054         if (kstrtoul(page, 10, &new))
7055                 return -EINVAL;
7056
7057         err = mddev_lock(mddev);
7058         if (err)
7059                 return err;
7060         conf = mddev->private;
7061         if (!conf)
7062                 err = -ENODEV;
7063         else if (new > conf->min_nr_stripes)
7064                 err = -EINVAL;
7065         else
7066                 conf->bypass_threshold = new;
7067         mddev_unlock(mddev);
7068         return err ?: len;
7069 }
7070
7071 static struct md_sysfs_entry
7072 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7073                                         S_IRUGO | S_IWUSR,
7074                                         raid5_show_preread_threshold,
7075                                         raid5_store_preread_threshold);
7076
7077 static ssize_t
7078 raid5_show_skip_copy(struct mddev *mddev, char *page)
7079 {
7080         struct r5conf *conf;
7081         int ret = 0;
7082         spin_lock(&mddev->lock);
7083         conf = mddev->private;
7084         if (conf)
7085                 ret = sprintf(page, "%d\n", conf->skip_copy);
7086         spin_unlock(&mddev->lock);
7087         return ret;
7088 }
7089
7090 static ssize_t
7091 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
7092 {
7093         struct r5conf *conf;
7094         unsigned long new;
7095         int err;
7096
7097         if (len >= PAGE_SIZE)
7098                 return -EINVAL;
7099         if (kstrtoul(page, 10, &new))
7100                 return -EINVAL;
7101         new = !!new;
7102
7103         err = mddev_lock(mddev);
7104         if (err)
7105                 return err;
7106         conf = mddev->private;
7107         if (!conf)
7108                 err = -ENODEV;
7109         else if (new != conf->skip_copy) {
7110                 struct request_queue *q = mddev->queue;
7111
7112                 mddev_suspend(mddev);
7113                 conf->skip_copy = new;
7114                 if (new)
7115                         blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
7116                 else
7117                         blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
7118                 mddev_resume(mddev);
7119         }
7120         mddev_unlock(mddev);
7121         return err ?: len;
7122 }
7123
7124 static struct md_sysfs_entry
7125 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
7126                                         raid5_show_skip_copy,
7127                                         raid5_store_skip_copy);
7128
7129 static ssize_t
7130 stripe_cache_active_show(struct mddev *mddev, char *page)
7131 {
7132         struct r5conf *conf = mddev->private;
7133         if (conf)
7134                 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
7135         else
7136                 return 0;
7137 }
7138
7139 static struct md_sysfs_entry
7140 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7141
7142 static ssize_t
7143 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
7144 {
7145         struct r5conf *conf;
7146         int ret = 0;
7147         spin_lock(&mddev->lock);
7148         conf = mddev->private;
7149         if (conf)
7150                 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
7151         spin_unlock(&mddev->lock);
7152         return ret;
7153 }
7154
7155 static int alloc_thread_groups(struct r5conf *conf, int cnt,
7156                                int *group_cnt,
7157                                struct r5worker_group **worker_groups);
7158 static ssize_t
7159 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
7160 {
7161         struct r5conf *conf;
7162         unsigned int new;
7163         int err;
7164         struct r5worker_group *new_groups, *old_groups;
7165         int group_cnt;
7166
7167         if (len >= PAGE_SIZE)
7168                 return -EINVAL;
7169         if (kstrtouint(page, 10, &new))
7170                 return -EINVAL;
7171         /* 8192 should be big enough */
7172         if (new > 8192)
7173                 return -EINVAL;
7174
7175         err = mddev_lock(mddev);
7176         if (err)
7177                 return err;
7178         conf = mddev->private;
7179         if (!conf)
7180                 err = -ENODEV;
7181         else if (new != conf->worker_cnt_per_group) {
7182                 mddev_suspend(mddev);
7183
7184                 old_groups = conf->worker_groups;
7185                 if (old_groups)
7186                         flush_workqueue(raid5_wq);
7187
7188                 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
7189                 if (!err) {
7190                         spin_lock_irq(&conf->device_lock);
7191                         conf->group_cnt = group_cnt;
7192                         conf->worker_cnt_per_group = new;
7193                         conf->worker_groups = new_groups;
7194                         spin_unlock_irq(&conf->device_lock);
7195
7196                         if (old_groups)
7197                                 kfree(old_groups[0].workers);
7198                         kfree(old_groups);
7199                 }
7200                 mddev_resume(mddev);
7201         }
7202         mddev_unlock(mddev);
7203
7204         return err ?: len;
7205 }
7206
7207 static struct md_sysfs_entry
7208 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
7209                                 raid5_show_group_thread_cnt,
7210                                 raid5_store_group_thread_cnt);
7211
7212 static struct attribute *raid5_attrs[] =  {
7213         &raid5_stripecache_size.attr,
7214         &raid5_stripecache_active.attr,
7215         &raid5_preread_bypass_threshold.attr,
7216         &raid5_group_thread_cnt.attr,
7217         &raid5_skip_copy.attr,
7218         &raid5_rmw_level.attr,
7219         &raid5_stripe_size.attr,
7220         &r5c_journal_mode.attr,
7221         &ppl_write_hint.attr,
7222         NULL,
7223 };
7224 static const struct attribute_group raid5_attrs_group = {
7225         .name = NULL,
7226         .attrs = raid5_attrs,
7227 };
7228
7229 static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
7230                                struct r5worker_group **worker_groups)
7231 {
7232         int i, j, k;
7233         ssize_t size;
7234         struct r5worker *workers;
7235
7236         if (cnt == 0) {
7237                 *group_cnt = 0;
7238                 *worker_groups = NULL;
7239                 return 0;
7240         }
7241         *group_cnt = num_possible_nodes();
7242         size = sizeof(struct r5worker) * cnt;
7243         workers = kcalloc(size, *group_cnt, GFP_NOIO);
7244         *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group),
7245                                  GFP_NOIO);
7246         if (!*worker_groups || !workers) {
7247                 kfree(workers);
7248                 kfree(*worker_groups);
7249                 return -ENOMEM;
7250         }
7251
7252         for (i = 0; i < *group_cnt; i++) {
7253                 struct r5worker_group *group;
7254
7255                 group = &(*worker_groups)[i];
7256                 INIT_LIST_HEAD(&group->handle_list);
7257                 INIT_LIST_HEAD(&group->loprio_list);
7258                 group->conf = conf;
7259                 group->workers = workers + i * cnt;
7260
7261                 for (j = 0; j < cnt; j++) {
7262                         struct r5worker *worker = group->workers + j;
7263                         worker->group = group;
7264                         INIT_WORK(&worker->work, raid5_do_work);
7265
7266                         for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
7267                                 INIT_LIST_HEAD(worker->temp_inactive_list + k);
7268                 }
7269         }
7270
7271         return 0;
7272 }
7273
7274 static void free_thread_groups(struct r5conf *conf)
7275 {
7276         if (conf->worker_groups)
7277                 kfree(conf->worker_groups[0].workers);
7278         kfree(conf->worker_groups);
7279         conf->worker_groups = NULL;
7280 }
7281
7282 static sector_t
7283 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
7284 {
7285         struct r5conf *conf = mddev->private;
7286
7287         if (!sectors)
7288                 sectors = mddev->dev_sectors;
7289         if (!raid_disks)
7290                 /* size is defined by the smallest of previous and new size */
7291                 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7292
7293         sectors &= ~((sector_t)conf->chunk_sectors - 1);
7294         sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
7295         return sectors * (raid_disks - conf->max_degraded);
7296 }
7297
7298 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7299 {
7300         safe_put_page(percpu->spare_page);
7301         percpu->spare_page = NULL;
7302         kvfree(percpu->scribble);
7303         percpu->scribble = NULL;
7304 }
7305
7306 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
7307 {
7308         if (conf->level == 6 && !percpu->spare_page) {
7309                 percpu->spare_page = alloc_page(GFP_KERNEL);
7310                 if (!percpu->spare_page)
7311                         return -ENOMEM;
7312         }
7313
7314         if (scribble_alloc(percpu,
7315                            max(conf->raid_disks,
7316                                conf->previous_raid_disks),
7317                            max(conf->chunk_sectors,
7318                                conf->prev_chunk_sectors)
7319                            / RAID5_STRIPE_SECTORS(conf))) {
7320                 free_scratch_buffer(conf, percpu);
7321                 return -ENOMEM;
7322         }
7323
7324         local_lock_init(&percpu->lock);
7325         return 0;
7326 }
7327
7328 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7329 {
7330         struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7331
7332         free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7333         return 0;
7334 }
7335
7336 static void raid5_free_percpu(struct r5conf *conf)
7337 {
7338         if (!conf->percpu)
7339                 return;
7340
7341         cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7342         free_percpu(conf->percpu);
7343 }
7344
7345 static void free_conf(struct r5conf *conf)
7346 {
7347         int i;
7348
7349         log_exit(conf);
7350
7351         unregister_shrinker(&conf->shrinker);
7352         free_thread_groups(conf);
7353         shrink_stripes(conf);
7354         raid5_free_percpu(conf);
7355         for (i = 0; i < conf->pool_size; i++)
7356                 if (conf->disks[i].extra_page)
7357                         put_page(conf->disks[i].extra_page);
7358         kfree(conf->disks);
7359         bioset_exit(&conf->bio_split);
7360         kfree(conf->stripe_hashtbl);
7361         kfree(conf->pending_data);
7362         kfree(conf);
7363 }
7364
7365 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7366 {
7367         struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
7368         struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7369
7370         if (alloc_scratch_buffer(conf, percpu)) {
7371                 pr_warn("%s: failed memory allocation for cpu%u\n",
7372                         __func__, cpu);
7373                 return -ENOMEM;
7374         }
7375         return 0;
7376 }
7377
7378 static int raid5_alloc_percpu(struct r5conf *conf)
7379 {
7380         int err = 0;
7381
7382         conf->percpu = alloc_percpu(struct raid5_percpu);
7383         if (!conf->percpu)
7384                 return -ENOMEM;
7385
7386         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
7387         if (!err) {
7388                 conf->scribble_disks = max(conf->raid_disks,
7389                         conf->previous_raid_disks);
7390                 conf->scribble_sectors = max(conf->chunk_sectors,
7391                         conf->prev_chunk_sectors);
7392         }
7393         return err;
7394 }
7395
7396 static unsigned long raid5_cache_scan(struct shrinker *shrink,
7397                                       struct shrink_control *sc)
7398 {
7399         struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7400         unsigned long ret = SHRINK_STOP;
7401
7402         if (mutex_trylock(&conf->cache_size_mutex)) {
7403                 ret= 0;
7404                 while (ret < sc->nr_to_scan &&
7405                        conf->max_nr_stripes > conf->min_nr_stripes) {
7406                         if (drop_one_stripe(conf) == 0) {
7407                                 ret = SHRINK_STOP;
7408                                 break;
7409                         }
7410                         ret++;
7411                 }
7412                 mutex_unlock(&conf->cache_size_mutex);
7413         }
7414         return ret;
7415 }
7416
7417 static unsigned long raid5_cache_count(struct shrinker *shrink,
7418                                        struct shrink_control *sc)
7419 {
7420         struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
7421
7422         if (conf->max_nr_stripes < conf->min_nr_stripes)
7423                 /* unlikely, but not impossible */
7424                 return 0;
7425         return conf->max_nr_stripes - conf->min_nr_stripes;
7426 }
7427
7428 static struct r5conf *setup_conf(struct mddev *mddev)
7429 {
7430         struct r5conf *conf;
7431         int raid_disk, memory, max_disks;
7432         struct md_rdev *rdev;
7433         struct disk_info *disk;
7434         char pers_name[6];
7435         int i;
7436         int group_cnt;
7437         struct r5worker_group *new_group;
7438         int ret = -ENOMEM;
7439
7440         if (mddev->new_level != 5
7441             && mddev->new_level != 4
7442             && mddev->new_level != 6) {
7443                 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7444                         mdname(mddev), mddev->new_level);
7445                 return ERR_PTR(-EIO);
7446         }
7447         if ((mddev->new_level == 5
7448              && !algorithm_valid_raid5(mddev->new_layout)) ||
7449             (mddev->new_level == 6
7450              && !algorithm_valid_raid6(mddev->new_layout))) {
7451                 pr_warn("md/raid:%s: layout %d not supported\n",
7452                         mdname(mddev), mddev->new_layout);
7453                 return ERR_PTR(-EIO);
7454         }
7455         if (mddev->new_level == 6 && mddev->raid_disks < 4) {
7456                 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7457                         mdname(mddev), mddev->raid_disks);
7458                 return ERR_PTR(-EINVAL);
7459         }
7460
7461         if (!mddev->new_chunk_sectors ||
7462             (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
7463             !is_power_of_2(mddev->new_chunk_sectors)) {
7464                 pr_warn("md/raid:%s: invalid chunk size %d\n",
7465                         mdname(mddev), mddev->new_chunk_sectors << 9);
7466                 return ERR_PTR(-EINVAL);
7467         }
7468
7469         conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
7470         if (conf == NULL)
7471                 goto abort;
7472
7473 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7474         conf->stripe_size = DEFAULT_STRIPE_SIZE;
7475         conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7476         conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7477 #endif
7478         INIT_LIST_HEAD(&conf->free_list);
7479         INIT_LIST_HEAD(&conf->pending_list);
7480         conf->pending_data = kcalloc(PENDING_IO_MAX,
7481                                      sizeof(struct r5pending_data),
7482                                      GFP_KERNEL);
7483         if (!conf->pending_data)
7484                 goto abort;
7485         for (i = 0; i < PENDING_IO_MAX; i++)
7486                 list_add(&conf->pending_data[i].sibling, &conf->free_list);
7487         /* Don't enable multi-threading by default*/
7488         if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
7489                 conf->group_cnt = group_cnt;
7490                 conf->worker_cnt_per_group = 0;
7491                 conf->worker_groups = new_group;
7492         } else
7493                 goto abort;
7494         spin_lock_init(&conf->device_lock);
7495         seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7496         mutex_init(&conf->cache_size_mutex);
7497
7498         init_waitqueue_head(&conf->wait_for_quiescent);
7499         init_waitqueue_head(&conf->wait_for_stripe);
7500         init_waitqueue_head(&conf->wait_for_overlap);
7501         INIT_LIST_HEAD(&conf->handle_list);
7502         INIT_LIST_HEAD(&conf->loprio_list);
7503         INIT_LIST_HEAD(&conf->hold_list);
7504         INIT_LIST_HEAD(&conf->delayed_list);
7505         INIT_LIST_HEAD(&conf->bitmap_list);
7506         init_llist_head(&conf->released_stripes);
7507         atomic_set(&conf->active_stripes, 0);
7508         atomic_set(&conf->preread_active_stripes, 0);
7509         atomic_set(&conf->active_aligned_reads, 0);
7510         spin_lock_init(&conf->pending_bios_lock);
7511         conf->batch_bio_dispatch = true;
7512         rdev_for_each(rdev, mddev) {
7513                 if (test_bit(Journal, &rdev->flags))
7514                         continue;
7515                 if (bdev_nonrot(rdev->bdev)) {
7516                         conf->batch_bio_dispatch = false;
7517                         break;
7518                 }
7519         }
7520
7521         conf->bypass_threshold = BYPASS_THRESHOLD;
7522         conf->recovery_disabled = mddev->recovery_disabled - 1;
7523
7524         conf->raid_disks = mddev->raid_disks;
7525         if (mddev->reshape_position == MaxSector)
7526                 conf->previous_raid_disks = mddev->raid_disks;
7527         else
7528                 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7529         max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7530
7531         conf->disks = kcalloc(max_disks, sizeof(struct disk_info),
7532                               GFP_KERNEL);
7533
7534         if (!conf->disks)
7535                 goto abort;
7536
7537         for (i = 0; i < max_disks; i++) {
7538                 conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7539                 if (!conf->disks[i].extra_page)
7540                         goto abort;
7541         }
7542
7543         ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
7544         if (ret)
7545                 goto abort;
7546         conf->mddev = mddev;
7547
7548         ret = -ENOMEM;
7549         conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7550         if (!conf->stripe_hashtbl)
7551                 goto abort;
7552
7553         /* We init hash_locks[0] separately to that it can be used
7554          * as the reference lock in the spin_lock_nest_lock() call
7555          * in lock_all_device_hash_locks_irq in order to convince
7556          * lockdep that we know what we are doing.
7557          */
7558         spin_lock_init(conf->hash_locks);
7559         for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
7560                 spin_lock_init(conf->hash_locks + i);
7561
7562         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7563                 INIT_LIST_HEAD(conf->inactive_list + i);
7564
7565         for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
7566                 INIT_LIST_HEAD(conf->temp_inactive_list + i);
7567
7568         atomic_set(&conf->r5c_cached_full_stripes, 0);
7569         INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
7570         atomic_set(&conf->r5c_cached_partial_stripes, 0);
7571         INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
7572         atomic_set(&conf->r5c_flushing_full_stripes, 0);
7573         atomic_set(&conf->r5c_flushing_partial_stripes, 0);
7574
7575         conf->level = mddev->new_level;
7576         conf->chunk_sectors = mddev->new_chunk_sectors;
7577         ret = raid5_alloc_percpu(conf);
7578         if (ret)
7579                 goto abort;
7580
7581         pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7582
7583         ret = -EIO;
7584         rdev_for_each(rdev, mddev) {
7585                 raid_disk = rdev->raid_disk;
7586                 if (raid_disk >= max_disks
7587                     || raid_disk < 0 || test_bit(Journal, &rdev->flags))
7588                         continue;
7589                 disk = conf->disks + raid_disk;
7590
7591                 if (test_bit(Replacement, &rdev->flags)) {
7592                         if (disk->replacement)
7593                                 goto abort;
7594                         RCU_INIT_POINTER(disk->replacement, rdev);
7595                 } else {
7596                         if (disk->rdev)
7597                                 goto abort;
7598                         RCU_INIT_POINTER(disk->rdev, rdev);
7599                 }
7600
7601                 if (test_bit(In_sync, &rdev->flags)) {
7602                         pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7603                                 mdname(mddev), rdev->bdev, raid_disk);
7604                 } else if (rdev->saved_raid_disk != raid_disk)
7605                         /* Cannot rely on bitmap to complete recovery */
7606                         conf->fullsync = 1;
7607         }
7608
7609         conf->level = mddev->new_level;
7610         if (conf->level == 6) {
7611                 conf->max_degraded = 2;
7612                 if (raid6_call.xor_syndrome)
7613                         conf->rmw_level = PARITY_ENABLE_RMW;
7614                 else
7615                         conf->rmw_level = PARITY_DISABLE_RMW;
7616         } else {
7617                 conf->max_degraded = 1;
7618                 conf->rmw_level = PARITY_ENABLE_RMW;
7619         }
7620         conf->algorithm = mddev->new_layout;
7621         conf->reshape_progress = mddev->reshape_position;
7622         if (conf->reshape_progress != MaxSector) {
7623                 conf->prev_chunk_sectors = mddev->chunk_sectors;
7624                 conf->prev_algo = mddev->layout;
7625         } else {
7626                 conf->prev_chunk_sectors = conf->chunk_sectors;
7627                 conf->prev_algo = conf->algorithm;
7628         }
7629
7630         conf->min_nr_stripes = NR_STRIPES;
7631         if (mddev->reshape_position != MaxSector) {
7632                 int stripes = max_t(int,
7633                         ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7634                         ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
7635                 conf->min_nr_stripes = max(NR_STRIPES, stripes);
7636                 if (conf->min_nr_stripes != NR_STRIPES)
7637                         pr_info("md/raid:%s: force stripe size %d for reshape\n",
7638                                 mdname(mddev), conf->min_nr_stripes);
7639         }
7640         memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7641                  max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
7642         atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7643         if (grow_stripes(conf, conf->min_nr_stripes)) {
7644                 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7645                         mdname(mddev), memory);
7646                 ret = -ENOMEM;
7647                 goto abort;
7648         } else
7649                 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7650         /*
7651          * Losing a stripe head costs more than the time to refill it,
7652          * it reduces the queue depth and so can hurt throughput.
7653          * So set it rather large, scaled by number of devices.
7654          */
7655         conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
7656         conf->shrinker.scan_objects = raid5_cache_scan;
7657         conf->shrinker.count_objects = raid5_cache_count;
7658         conf->shrinker.batch = 128;
7659         conf->shrinker.flags = 0;
7660         ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
7661         if (ret) {
7662                 pr_warn("md/raid:%s: couldn't register shrinker.\n",
7663                         mdname(mddev));
7664                 goto abort;
7665         }
7666
7667         sprintf(pers_name, "raid%d", mddev->new_level);
7668         conf->thread = md_register_thread(raid5d, mddev, pers_name);
7669         if (!conf->thread) {
7670                 pr_warn("md/raid:%s: couldn't allocate thread.\n",
7671                         mdname(mddev));
7672                 ret = -ENOMEM;
7673                 goto abort;
7674         }
7675
7676         return conf;
7677
7678  abort:
7679         if (conf)
7680                 free_conf(conf);
7681         return ERR_PTR(ret);
7682 }
7683
7684 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7685 {
7686         switch (algo) {
7687         case ALGORITHM_PARITY_0:
7688                 if (raid_disk < max_degraded)
7689                         return 1;
7690                 break;
7691         case ALGORITHM_PARITY_N:
7692                 if (raid_disk >= raid_disks - max_degraded)
7693                         return 1;
7694                 break;
7695         case ALGORITHM_PARITY_0_6:
7696                 if (raid_disk == 0 ||
7697                     raid_disk == raid_disks - 1)
7698                         return 1;
7699                 break;
7700         case ALGORITHM_LEFT_ASYMMETRIC_6:
7701         case ALGORITHM_RIGHT_ASYMMETRIC_6:
7702         case ALGORITHM_LEFT_SYMMETRIC_6:
7703         case ALGORITHM_RIGHT_SYMMETRIC_6:
7704                 if (raid_disk == raid_disks - 1)
7705                         return 1;
7706         }
7707         return 0;
7708 }
7709
7710 static void raid5_set_io_opt(struct r5conf *conf)
7711 {
7712         blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7713                          (conf->raid_disks - conf->max_degraded));
7714 }
7715
7716 static int raid5_run(struct mddev *mddev)
7717 {
7718         struct r5conf *conf;
7719         int dirty_parity_disks = 0;
7720         struct md_rdev *rdev;
7721         struct md_rdev *journal_dev = NULL;
7722         sector_t reshape_offset = 0;
7723         int i, ret = 0;
7724         long long min_offset_diff = 0;
7725         int first = 1;
7726
7727         if (acct_bioset_init(mddev)) {
7728                 pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
7729                 return -ENOMEM;
7730         }
7731
7732         if (mddev_init_writes_pending(mddev) < 0) {
7733                 ret = -ENOMEM;
7734                 goto exit_acct_set;
7735         }
7736
7737         if (mddev->recovery_cp != MaxSector)
7738                 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7739                           mdname(mddev));
7740
7741         rdev_for_each(rdev, mddev) {
7742                 long long diff;
7743
7744                 if (test_bit(Journal, &rdev->flags)) {
7745                         journal_dev = rdev;
7746                         continue;
7747                 }
7748                 if (rdev->raid_disk < 0)
7749                         continue;
7750                 diff = (rdev->new_data_offset - rdev->data_offset);
7751                 if (first) {
7752                         min_offset_diff = diff;
7753                         first = 0;
7754                 } else if (mddev->reshape_backwards &&
7755                          diff < min_offset_diff)
7756                         min_offset_diff = diff;
7757                 else if (!mddev->reshape_backwards &&
7758                          diff > min_offset_diff)
7759                         min_offset_diff = diff;
7760         }
7761
7762         if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
7763             (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
7764                 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7765                           mdname(mddev));
7766                 ret = -EINVAL;
7767                 goto exit_acct_set;
7768         }
7769
7770         if (mddev->reshape_position != MaxSector) {
7771                 /* Check that we can continue the reshape.
7772                  * Difficulties arise if the stripe we would write to
7773                  * next is at or after the stripe we would read from next.
7774                  * For a reshape that changes the number of devices, this
7775                  * is only possible for a very short time, and mdadm makes
7776                  * sure that time appears to have past before assembling
7777                  * the array.  So we fail if that time hasn't passed.
7778                  * For a reshape that keeps the number of devices the same
7779                  * mdadm must be monitoring the reshape can keeping the
7780                  * critical areas read-only and backed up.  It will start
7781                  * the array in read-only mode, so we check for that.
7782                  */
7783                 sector_t here_new, here_old;
7784                 int old_disks;
7785                 int max_degraded = (mddev->level == 6 ? 2 : 1);
7786                 int chunk_sectors;
7787                 int new_data_disks;
7788
7789                 if (journal_dev) {
7790                         pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7791                                 mdname(mddev));
7792                         ret = -EINVAL;
7793                         goto exit_acct_set;
7794                 }
7795
7796                 if (mddev->new_level != mddev->level) {
7797                         pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7798                                 mdname(mddev));
7799                         ret = -EINVAL;
7800                         goto exit_acct_set;
7801                 }
7802                 old_disks = mddev->raid_disks - mddev->delta_disks;
7803                 /* reshape_position must be on a new-stripe boundary, and one
7804                  * further up in new geometry must map after here in old
7805                  * geometry.
7806                  * If the chunk sizes are different, then as we perform reshape
7807                  * in units of the largest of the two, reshape_position needs
7808                  * be a multiple of the largest chunk size times new data disks.
7809                  */
7810                 here_new = mddev->reshape_position;
7811                 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7812                 new_data_disks = mddev->raid_disks - max_degraded;
7813                 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7814                         pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7815                                 mdname(mddev));
7816                         ret = -EINVAL;
7817                         goto exit_acct_set;
7818                 }
7819                 reshape_offset = here_new * chunk_sectors;
7820                 /* here_new is the stripe we will write to */
7821                 here_old = mddev->reshape_position;
7822                 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7823                 /* here_old is the first stripe that we might need to read
7824                  * from */
7825                 if (mddev->delta_disks == 0) {
7826                         /* We cannot be sure it is safe to start an in-place
7827                          * reshape.  It is only safe if user-space is monitoring
7828                          * and taking constant backups.
7829                          * mdadm always starts a situation like this in
7830                          * readonly mode so it can take control before
7831                          * allowing any writes.  So just check for that.
7832                          */
7833                         if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7834                             abs(min_offset_diff) >= mddev->new_chunk_sectors)
7835                                 /* not really in-place - so OK */;
7836                         else if (mddev->ro == 0) {
7837                                 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7838                                         mdname(mddev));
7839                                 ret = -EINVAL;
7840                                 goto exit_acct_set;
7841                         }
7842                 } else if (mddev->reshape_backwards
7843                     ? (here_new * chunk_sectors + min_offset_diff <=
7844                        here_old * chunk_sectors)
7845                     : (here_new * chunk_sectors >=
7846                        here_old * chunk_sectors + (-min_offset_diff))) {
7847                         /* Reading from the same stripe as writing to - bad */
7848                         pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7849                                 mdname(mddev));
7850                         ret = -EINVAL;
7851                         goto exit_acct_set;
7852                 }
7853                 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7854                 /* OK, we should be able to continue; */
7855         } else {
7856                 BUG_ON(mddev->level != mddev->new_level);
7857                 BUG_ON(mddev->layout != mddev->new_layout);
7858                 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7859                 BUG_ON(mddev->delta_disks != 0);
7860         }
7861
7862         if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7863             test_bit(MD_HAS_PPL, &mddev->flags)) {
7864                 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7865                         mdname(mddev));
7866                 clear_bit(MD_HAS_PPL, &mddev->flags);
7867                 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
7868         }
7869
7870         if (mddev->private == NULL)
7871                 conf = setup_conf(mddev);
7872         else
7873                 conf = mddev->private;
7874
7875         if (IS_ERR(conf)) {
7876                 ret = PTR_ERR(conf);
7877                 goto exit_acct_set;
7878         }
7879
7880         if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7881                 if (!journal_dev) {
7882                         pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7883                                 mdname(mddev));
7884                         mddev->ro = 1;
7885                         set_disk_ro(mddev->gendisk, 1);
7886                 } else if (mddev->recovery_cp == MaxSector)
7887                         set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7888         }
7889
7890         conf->min_offset_diff = min_offset_diff;
7891         mddev->thread = conf->thread;
7892         conf->thread = NULL;
7893         mddev->private = conf;
7894
7895         for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7896              i++) {
7897                 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
7898                 if (!rdev && conf->disks[i].replacement) {
7899                         /* The replacement is all we have yet */
7900                         rdev = rdev_mdlock_deref(mddev,
7901                                                  conf->disks[i].replacement);
7902                         conf->disks[i].replacement = NULL;
7903                         clear_bit(Replacement, &rdev->flags);
7904                         rcu_assign_pointer(conf->disks[i].rdev, rdev);
7905                 }
7906                 if (!rdev)
7907                         continue;
7908                 if (rcu_access_pointer(conf->disks[i].replacement) &&
7909                     conf->reshape_progress != MaxSector) {
7910                         /* replacements and reshape simply do not mix. */
7911                         pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7912                         goto abort;
7913                 }
7914                 if (test_bit(In_sync, &rdev->flags))
7915                         continue;
7916                 /* This disc is not fully in-sync.  However if it
7917                  * just stored parity (beyond the recovery_offset),
7918                  * when we don't need to be concerned about the
7919                  * array being dirty.
7920                  * When reshape goes 'backwards', we never have
7921                  * partially completed devices, so we only need
7922                  * to worry about reshape going forwards.
7923                  */
7924                 /* Hack because v0.91 doesn't store recovery_offset properly. */
7925                 if (mddev->major_version == 0 &&
7926                     mddev->minor_version > 90)
7927                         rdev->recovery_offset = reshape_offset;
7928
7929                 if (rdev->recovery_offset < reshape_offset) {
7930                         /* We need to check old and new layout */
7931                         if (!only_parity(rdev->raid_disk,
7932                                          conf->algorithm,
7933                                          conf->raid_disks,
7934                                          conf->max_degraded))
7935                                 continue;
7936                 }
7937                 if (!only_parity(rdev->raid_disk,
7938                                  conf->prev_algo,
7939                                  conf->previous_raid_disks,
7940                                  conf->max_degraded))
7941                         continue;
7942                 dirty_parity_disks++;
7943         }
7944
7945         /*
7946          * 0 for a fully functional array, 1 or 2 for a degraded array.
7947          */
7948         mddev->degraded = raid5_calc_degraded(conf);
7949
7950         if (has_failed(conf)) {
7951                 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7952                         mdname(mddev), mddev->degraded, conf->raid_disks);
7953                 goto abort;
7954         }
7955
7956         /* device size must be a multiple of chunk size */
7957         mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
7958         mddev->resync_max_sectors = mddev->dev_sectors;
7959
7960         if (mddev->degraded > dirty_parity_disks &&
7961             mddev->recovery_cp != MaxSector) {
7962                 if (test_bit(MD_HAS_PPL, &mddev->flags))
7963                         pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7964                                 mdname(mddev));
7965                 else if (mddev->ok_start_degraded)
7966                         pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7967                                 mdname(mddev));
7968                 else {
7969                         pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7970                                 mdname(mddev));
7971                         goto abort;
7972                 }
7973         }
7974
7975         pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7976                 mdname(mddev), conf->level,
7977                 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7978                 mddev->new_layout);
7979
7980         print_raid5_conf(conf);
7981
7982         if (conf->reshape_progress != MaxSector) {
7983                 conf->reshape_safe = conf->reshape_progress;
7984                 atomic_set(&conf->reshape_stripes, 0);
7985                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7986                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7987                 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7988                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7989                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7990                                                         "reshape");
7991                 if (!mddev->sync_thread)
7992                         goto abort;
7993         }
7994
7995         /* Ok, everything is just fine now */
7996         if (mddev->to_remove == &raid5_attrs_group)
7997                 mddev->to_remove = NULL;
7998         else if (mddev->kobj.sd &&
7999             sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
8000                 pr_warn("raid5: failed to create sysfs attributes for %s\n",
8001                         mdname(mddev));
8002         md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
8003
8004         if (mddev->queue) {
8005                 int chunk_size;
8006                 /* read-ahead size must cover two whole stripes, which
8007                  * is 2 * (datadisks) * chunksize where 'n' is the
8008                  * number of raid devices
8009                  */
8010                 int data_disks = conf->previous_raid_disks - conf->max_degraded;
8011                 int stripe = data_disks *
8012                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
8013
8014                 chunk_size = mddev->chunk_sectors << 9;
8015                 blk_queue_io_min(mddev->queue, chunk_size);
8016                 raid5_set_io_opt(conf);
8017                 mddev->queue->limits.raid_partial_stripes_expensive = 1;
8018                 /*
8019                  * We can only discard a whole stripe. It doesn't make sense to
8020                  * discard data disk but write parity disk
8021                  */
8022                 stripe = stripe * PAGE_SIZE;
8023                 stripe = roundup_pow_of_two(stripe);
8024                 mddev->queue->limits.discard_granularity = stripe;
8025
8026                 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
8027
8028                 rdev_for_each(rdev, mddev) {
8029                         disk_stack_limits(mddev->gendisk, rdev->bdev,
8030                                           rdev->data_offset << 9);
8031                         disk_stack_limits(mddev->gendisk, rdev->bdev,
8032                                           rdev->new_data_offset << 9);
8033                 }
8034
8035                 /*
8036                  * zeroing is required, otherwise data
8037                  * could be lost. Consider a scenario: discard a stripe
8038                  * (the stripe could be inconsistent if
8039                  * discard_zeroes_data is 0); write one disk of the
8040                  * stripe (the stripe could be inconsistent again
8041                  * depending on which disks are used to calculate
8042                  * parity); the disk is broken; The stripe data of this
8043                  * disk is lost.
8044                  *
8045                  * We only allow DISCARD if the sysadmin has confirmed that
8046                  * only safe devices are in use by setting a module parameter.
8047                  * A better idea might be to turn DISCARD into WRITE_ZEROES
8048                  * requests, as that is required to be safe.
8049                  */
8050                 if (!devices_handle_discard_safely ||
8051                     mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
8052                     mddev->queue->limits.discard_granularity < stripe)
8053                         blk_queue_max_discard_sectors(mddev->queue, 0);
8054
8055                 /*
8056                  * Requests require having a bitmap for each stripe.
8057                  * Limit the max sectors based on this.
8058                  */
8059                 blk_queue_max_hw_sectors(mddev->queue,
8060                         RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
8061
8062                 /* No restrictions on the number of segments in the request */
8063                 blk_queue_max_segments(mddev->queue, USHRT_MAX);
8064         }
8065
8066         if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
8067                 goto abort;
8068
8069         return 0;
8070 abort:
8071         md_unregister_thread(&mddev->thread);
8072         print_raid5_conf(conf);
8073         free_conf(conf);
8074         mddev->private = NULL;
8075         pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8076         ret = -EIO;
8077 exit_acct_set:
8078         acct_bioset_exit(mddev);
8079         return ret;
8080 }
8081
8082 static void raid5_free(struct mddev *mddev, void *priv)
8083 {
8084         struct r5conf *conf = priv;
8085
8086         free_conf(conf);
8087         acct_bioset_exit(mddev);
8088         mddev->to_remove = &raid5_attrs_group;
8089 }
8090
8091 static void raid5_status(struct seq_file *seq, struct mddev *mddev)
8092 {
8093         struct r5conf *conf = mddev->private;
8094         int i;
8095
8096         seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
8097                 conf->chunk_sectors / 2, mddev->layout);
8098         seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8099         rcu_read_lock();
8100         for (i = 0; i < conf->raid_disks; i++) {
8101                 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
8102                 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8103         }
8104         rcu_read_unlock();
8105         seq_printf (seq, "]");
8106 }
8107
8108 static void print_raid5_conf (struct r5conf *conf)
8109 {
8110         struct md_rdev *rdev;
8111         int i;
8112
8113         pr_debug("RAID conf printout:\n");
8114         if (!conf) {
8115                 pr_debug("(conf==NULL)\n");
8116                 return;
8117         }
8118         pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8119                conf->raid_disks,
8120                conf->raid_disks - conf->mddev->degraded);
8121
8122         rcu_read_lock();
8123         for (i = 0; i < conf->raid_disks; i++) {
8124                 rdev = rcu_dereference(conf->disks[i].rdev);
8125                 if (rdev)
8126                         pr_debug(" disk %d, o:%d, dev:%pg\n",
8127                                i, !test_bit(Faulty, &rdev->flags),
8128                                rdev->bdev);
8129         }
8130         rcu_read_unlock();
8131 }
8132
8133 static int raid5_spare_active(struct mddev *mddev)
8134 {
8135         int i;
8136         struct r5conf *conf = mddev->private;
8137         struct md_rdev *rdev, *replacement;
8138         int count = 0;
8139         unsigned long flags;
8140
8141         for (i = 0; i < conf->raid_disks; i++) {
8142                 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
8143                 replacement = rdev_mdlock_deref(mddev,
8144                                                 conf->disks[i].replacement);
8145                 if (replacement
8146                     && replacement->recovery_offset == MaxSector
8147                     && !test_bit(Faulty, &replacement->flags)
8148                     && !test_and_set_bit(In_sync, &replacement->flags)) {
8149                         /* Replacement has just become active. */
8150                         if (!rdev
8151                             || !test_and_clear_bit(In_sync, &rdev->flags))
8152                                 count++;
8153                         if (rdev) {
8154                                 /* Replaced device not technically faulty,
8155                                  * but we need to be sure it gets removed
8156                                  * and never re-added.
8157                                  */
8158                                 set_bit(Faulty, &rdev->flags);
8159                                 sysfs_notify_dirent_safe(
8160                                         rdev->sysfs_state);
8161                         }
8162                         sysfs_notify_dirent_safe(replacement->sysfs_state);
8163                 } else if (rdev
8164                     && rdev->recovery_offset == MaxSector
8165                     && !test_bit(Faulty, &rdev->flags)
8166                     && !test_and_set_bit(In_sync, &rdev->flags)) {
8167                         count++;
8168                         sysfs_notify_dirent_safe(rdev->sysfs_state);
8169                 }
8170         }
8171         spin_lock_irqsave(&conf->device_lock, flags);
8172         mddev->degraded = raid5_calc_degraded(conf);
8173         spin_unlock_irqrestore(&conf->device_lock, flags);
8174         print_raid5_conf(conf);
8175         return count;
8176 }
8177
8178 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
8179 {
8180         struct r5conf *conf = mddev->private;
8181         int err = 0;
8182         int number = rdev->raid_disk;
8183         struct md_rdev __rcu **rdevp;
8184         struct disk_info *p;
8185         struct md_rdev *tmp;
8186
8187         print_raid5_conf(conf);
8188         if (test_bit(Journal, &rdev->flags) && conf->log) {
8189                 /*
8190                  * we can't wait pending write here, as this is called in
8191                  * raid5d, wait will deadlock.
8192                  * neilb: there is no locking about new writes here,
8193                  * so this cannot be safe.
8194                  */
8195                 if (atomic_read(&conf->active_stripes) ||
8196                     atomic_read(&conf->r5c_cached_full_stripes) ||
8197                     atomic_read(&conf->r5c_cached_partial_stripes)) {
8198                         return -EBUSY;
8199                 }
8200                 log_exit(conf);
8201                 return 0;
8202         }
8203         if (unlikely(number >= conf->pool_size))
8204                 return 0;
8205         p = conf->disks + number;
8206         if (rdev == rcu_access_pointer(p->rdev))
8207                 rdevp = &p->rdev;
8208         else if (rdev == rcu_access_pointer(p->replacement))
8209                 rdevp = &p->replacement;
8210         else
8211                 return 0;
8212
8213         if (number >= conf->raid_disks &&
8214             conf->reshape_progress == MaxSector)
8215                 clear_bit(In_sync, &rdev->flags);
8216
8217         if (test_bit(In_sync, &rdev->flags) ||
8218             atomic_read(&rdev->nr_pending)) {
8219                 err = -EBUSY;
8220                 goto abort;
8221         }
8222         /* Only remove non-faulty devices if recovery
8223          * isn't possible.
8224          */
8225         if (!test_bit(Faulty, &rdev->flags) &&
8226             mddev->recovery_disabled != conf->recovery_disabled &&
8227             !has_failed(conf) &&
8228             (!rcu_access_pointer(p->replacement) ||
8229              rcu_access_pointer(p->replacement) == rdev) &&
8230             number < conf->raid_disks) {
8231                 err = -EBUSY;
8232                 goto abort;
8233         }
8234         *rdevp = NULL;
8235         if (!test_bit(RemoveSynchronized, &rdev->flags)) {
8236                 lockdep_assert_held(&mddev->reconfig_mutex);
8237                 synchronize_rcu();
8238                 if (atomic_read(&rdev->nr_pending)) {
8239                         /* lost the race, try later */
8240                         err = -EBUSY;
8241                         rcu_assign_pointer(*rdevp, rdev);
8242                 }
8243         }
8244         if (!err) {
8245                 err = log_modify(conf, rdev, false);
8246                 if (err)
8247                         goto abort;
8248         }
8249
8250         tmp = rcu_access_pointer(p->replacement);
8251         if (tmp) {
8252                 /* We must have just cleared 'rdev' */
8253                 rcu_assign_pointer(p->rdev, tmp);
8254                 clear_bit(Replacement, &tmp->flags);
8255                 smp_mb(); /* Make sure other CPUs may see both as identical
8256                            * but will never see neither - if they are careful
8257                            */
8258                 rcu_assign_pointer(p->replacement, NULL);
8259
8260                 if (!err)
8261                         err = log_modify(conf, tmp, true);
8262         }
8263
8264         clear_bit(WantReplacement, &rdev->flags);
8265 abort:
8266
8267         print_raid5_conf(conf);
8268         return err;
8269 }
8270
8271 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
8272 {
8273         struct r5conf *conf = mddev->private;
8274         int ret, err = -EEXIST;
8275         int disk;
8276         struct disk_info *p;
8277         struct md_rdev *tmp;
8278         int first = 0;
8279         int last = conf->raid_disks - 1;
8280
8281         if (test_bit(Journal, &rdev->flags)) {
8282                 if (conf->log)
8283                         return -EBUSY;
8284
8285                 rdev->raid_disk = 0;
8286                 /*
8287                  * The array is in readonly mode if journal is missing, so no
8288                  * write requests running. We should be safe
8289                  */
8290                 ret = log_init(conf, rdev, false);
8291                 if (ret)
8292                         return ret;
8293
8294                 ret = r5l_start(conf->log);
8295                 if (ret)
8296                         return ret;
8297
8298                 return 0;
8299         }
8300         if (mddev->recovery_disabled == conf->recovery_disabled)
8301                 return -EBUSY;
8302
8303         if (rdev->saved_raid_disk < 0 && has_failed(conf))
8304                 /* no point adding a device */
8305                 return -EINVAL;
8306
8307         if (rdev->raid_disk >= 0)
8308                 first = last = rdev->raid_disk;
8309
8310         /*
8311          * find the disk ... but prefer rdev->saved_raid_disk
8312          * if possible.
8313          */
8314         if (rdev->saved_raid_disk >= first &&
8315             rdev->saved_raid_disk <= last &&
8316             conf->disks[rdev->saved_raid_disk].rdev == NULL)
8317                 first = rdev->saved_raid_disk;
8318
8319         for (disk = first; disk <= last; disk++) {
8320                 p = conf->disks + disk;
8321                 if (p->rdev == NULL) {
8322                         clear_bit(In_sync, &rdev->flags);
8323                         rdev->raid_disk = disk;
8324                         if (rdev->saved_raid_disk != disk)
8325                                 conf->fullsync = 1;
8326                         rcu_assign_pointer(p->rdev, rdev);
8327
8328                         err = log_modify(conf, rdev, true);
8329
8330                         goto out;
8331                 }
8332         }
8333         for (disk = first; disk <= last; disk++) {
8334                 p = conf->disks + disk;
8335                 tmp = rdev_mdlock_deref(mddev, p->rdev);
8336                 if (test_bit(WantReplacement, &tmp->flags) &&
8337                     p->replacement == NULL) {
8338                         clear_bit(In_sync, &rdev->flags);
8339                         set_bit(Replacement, &rdev->flags);
8340                         rdev->raid_disk = disk;
8341                         err = 0;
8342                         conf->fullsync = 1;
8343                         rcu_assign_pointer(p->replacement, rdev);
8344                         break;
8345                 }
8346         }
8347 out:
8348         print_raid5_conf(conf);
8349         return err;
8350 }
8351
8352 static int raid5_resize(struct mddev *mddev, sector_t sectors)
8353 {
8354         /* no resync is happening, and there is enough space
8355          * on all devices, so we can resize.
8356          * We need to make sure resync covers any new space.
8357          * If the array is shrinking we should possibly wait until
8358          * any io in the removed space completes, but it hardly seems
8359          * worth it.
8360          */
8361         sector_t newsize;
8362         struct r5conf *conf = mddev->private;
8363
8364         if (raid5_has_log(conf) || raid5_has_ppl(conf))
8365                 return -EINVAL;
8366         sectors &= ~((sector_t)conf->chunk_sectors - 1);
8367         newsize = raid5_size(mddev, sectors, mddev->raid_disks);
8368         if (mddev->external_size &&
8369             mddev->array_sectors > newsize)
8370                 return -EINVAL;
8371         if (mddev->bitmap) {
8372                 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
8373                 if (ret)
8374                         return ret;
8375         }
8376         md_set_array_sectors(mddev, newsize);
8377         if (sectors > mddev->dev_sectors &&
8378             mddev->recovery_cp > mddev->dev_sectors) {
8379                 mddev->recovery_cp = mddev->dev_sectors;
8380                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8381         }
8382         mddev->dev_sectors = sectors;
8383         mddev->resync_max_sectors = sectors;
8384         return 0;
8385 }
8386
8387 static int check_stripe_cache(struct mddev *mddev)
8388 {
8389         /* Can only proceed if there are plenty of stripe_heads.
8390          * We need a minimum of one full stripe,, and for sensible progress
8391          * it is best to have about 4 times that.
8392          * If we require 4 times, then the default 256 4K stripe_heads will
8393          * allow for chunk sizes up to 256K, which is probably OK.
8394          * If the chunk size is greater, user-space should request more
8395          * stripe_heads first.
8396          */
8397         struct r5conf *conf = mddev->private;
8398         if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8399             > conf->min_nr_stripes ||
8400             ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
8401             > conf->min_nr_stripes) {
8402                 pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
8403                         mdname(mddev),
8404                         ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
8405                          / RAID5_STRIPE_SIZE(conf))*4);
8406                 return 0;
8407         }
8408         return 1;
8409 }
8410
8411 static int check_reshape(struct mddev *mddev)
8412 {
8413         struct r5conf *conf = mddev->private;
8414
8415         if (raid5_has_log(conf) || raid5_has_ppl(conf))
8416                 return -EINVAL;
8417         if (mddev->delta_disks == 0 &&
8418             mddev->new_layout == mddev->layout &&
8419             mddev->new_chunk_sectors == mddev->chunk_sectors)
8420                 return 0; /* nothing to do */
8421         if (has_failed(conf))
8422                 return -EINVAL;
8423         if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
8424                 /* We might be able to shrink, but the devices must
8425                  * be made bigger first.
8426                  * For raid6, 4 is the minimum size.
8427                  * Otherwise 2 is the minimum
8428                  */
8429                 int min = 2;
8430                 if (mddev->level == 6)
8431                         min = 4;
8432                 if (mddev->raid_disks + mddev->delta_disks < min)
8433                         return -EINVAL;
8434         }
8435
8436         if (!check_stripe_cache(mddev))
8437                 return -ENOSPC;
8438
8439         if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
8440             mddev->delta_disks > 0)
8441                 if (resize_chunks(conf,
8442                                   conf->previous_raid_disks
8443                                   + max(0, mddev->delta_disks),
8444                                   max(mddev->new_chunk_sectors,
8445                                       mddev->chunk_sectors)
8446                             ) < 0)
8447                         return -ENOMEM;
8448
8449         if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8450                 return 0; /* never bother to shrink */
8451         return resize_stripes(conf, (conf->previous_raid_disks
8452                                      + mddev->delta_disks));
8453 }
8454
8455 static int raid5_start_reshape(struct mddev *mddev)
8456 {
8457         struct r5conf *conf = mddev->private;
8458         struct md_rdev *rdev;
8459         int spares = 0;
8460         unsigned long flags;
8461
8462         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8463                 return -EBUSY;
8464
8465         if (!check_stripe_cache(mddev))
8466                 return -ENOSPC;
8467
8468         if (has_failed(conf))
8469                 return -EINVAL;
8470
8471         rdev_for_each(rdev, mddev) {
8472                 if (!test_bit(In_sync, &rdev->flags)
8473                     && !test_bit(Faulty, &rdev->flags))
8474                         spares++;
8475         }
8476
8477         if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8478                 /* Not enough devices even to make a degraded array
8479                  * of that size
8480                  */
8481                 return -EINVAL;
8482
8483         /* Refuse to reduce size of the array.  Any reductions in
8484          * array size must be through explicit setting of array_size
8485          * attribute.
8486          */
8487         if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
8488             < mddev->array_sectors) {
8489                 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8490                         mdname(mddev));
8491                 return -EINVAL;
8492         }
8493
8494         atomic_set(&conf->reshape_stripes, 0);
8495         spin_lock_irq(&conf->device_lock);
8496         write_seqcount_begin(&conf->gen_lock);
8497         conf->previous_raid_disks = conf->raid_disks;
8498         conf->raid_disks += mddev->delta_disks;
8499         conf->prev_chunk_sectors = conf->chunk_sectors;
8500         conf->chunk_sectors = mddev->new_chunk_sectors;
8501         conf->prev_algo = conf->algorithm;
8502         conf->algorithm = mddev->new_layout;
8503         conf->generation++;
8504         /* Code that selects data_offset needs to see the generation update
8505          * if reshape_progress has been set - so a memory barrier needed.
8506          */
8507         smp_mb();
8508         if (mddev->reshape_backwards)
8509                 conf->reshape_progress = raid5_size(mddev, 0, 0);
8510         else
8511                 conf->reshape_progress = 0;
8512         conf->reshape_safe = conf->reshape_progress;
8513         write_seqcount_end(&conf->gen_lock);
8514         spin_unlock_irq(&conf->device_lock);
8515
8516         /* Now make sure any requests that proceeded on the assumption
8517          * the reshape wasn't running - like Discard or Read - have
8518          * completed.
8519          */
8520         mddev_suspend(mddev);
8521         mddev_resume(mddev);
8522
8523         /* Add some new drives, as many as will fit.
8524          * We know there are enough to make the newly sized array work.
8525          * Don't add devices if we are reducing the number of
8526          * devices in the array.  This is because it is not possible
8527          * to correctly record the "partially reconstructed" state of
8528          * such devices during the reshape and confusion could result.
8529          */
8530         if (mddev->delta_disks >= 0) {
8531                 rdev_for_each(rdev, mddev)
8532                         if (rdev->raid_disk < 0 &&
8533                             !test_bit(Faulty, &rdev->flags)) {
8534                                 if (raid5_add_disk(mddev, rdev) == 0) {
8535                                         if (rdev->raid_disk
8536                                             >= conf->previous_raid_disks)
8537                                                 set_bit(In_sync, &rdev->flags);
8538                                         else
8539                                                 rdev->recovery_offset = 0;
8540
8541                                         /* Failure here is OK */
8542                                         sysfs_link_rdev(mddev, rdev);
8543                                 }
8544                         } else if (rdev->raid_disk >= conf->previous_raid_disks
8545                                    && !test_bit(Faulty, &rdev->flags)) {
8546                                 /* This is a spare that was manually added */
8547                                 set_bit(In_sync, &rdev->flags);
8548                         }
8549
8550                 /* When a reshape changes the number of devices,
8551                  * ->degraded is measured against the larger of the
8552                  * pre and post number of devices.
8553                  */
8554                 spin_lock_irqsave(&conf->device_lock, flags);
8555                 mddev->degraded = raid5_calc_degraded(conf);
8556                 spin_unlock_irqrestore(&conf->device_lock, flags);
8557         }
8558         mddev->raid_disks = conf->raid_disks;
8559         mddev->reshape_position = conf->reshape_progress;
8560         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8561
8562         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8563         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8564         clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8565         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8566         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8567         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
8568                                                 "reshape");
8569         if (!mddev->sync_thread) {
8570                 mddev->recovery = 0;
8571                 spin_lock_irq(&conf->device_lock);
8572                 write_seqcount_begin(&conf->gen_lock);
8573                 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8574                 mddev->new_chunk_sectors =
8575                         conf->chunk_sectors = conf->prev_chunk_sectors;
8576                 mddev->new_layout = conf->algorithm = conf->prev_algo;
8577                 rdev_for_each(rdev, mddev)
8578                         rdev->new_data_offset = rdev->data_offset;
8579                 smp_wmb();
8580                 conf->generation --;
8581                 conf->reshape_progress = MaxSector;
8582                 mddev->reshape_position = MaxSector;
8583                 write_seqcount_end(&conf->gen_lock);
8584                 spin_unlock_irq(&conf->device_lock);
8585                 return -EAGAIN;
8586         }
8587         conf->reshape_checkpoint = jiffies;
8588         md_wakeup_thread(mddev->sync_thread);
8589         md_new_event();
8590         return 0;
8591 }
8592
8593 /* This is called from the reshape thread and should make any
8594  * changes needed in 'conf'
8595  */
8596 static void end_reshape(struct r5conf *conf)
8597 {
8598
8599         if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8600                 struct md_rdev *rdev;
8601
8602                 spin_lock_irq(&conf->device_lock);
8603                 conf->previous_raid_disks = conf->raid_disks;
8604                 md_finish_reshape(conf->mddev);
8605                 smp_wmb();
8606                 conf->reshape_progress = MaxSector;
8607                 conf->mddev->reshape_position = MaxSector;
8608                 rdev_for_each(rdev, conf->mddev)
8609                         if (rdev->raid_disk >= 0 &&
8610                             !test_bit(Journal, &rdev->flags) &&
8611                             !test_bit(In_sync, &rdev->flags))
8612                                 rdev->recovery_offset = MaxSector;
8613                 spin_unlock_irq(&conf->device_lock);
8614                 wake_up(&conf->wait_for_overlap);
8615
8616                 if (conf->mddev->queue)
8617                         raid5_set_io_opt(conf);
8618         }
8619 }
8620
8621 /* This is called from the raid5d thread with mddev_lock held.
8622  * It makes config changes to the device.
8623  */
8624 static void raid5_finish_reshape(struct mddev *mddev)
8625 {
8626         struct r5conf *conf = mddev->private;
8627         struct md_rdev *rdev;
8628
8629         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8630
8631                 if (mddev->delta_disks <= 0) {
8632                         int d;
8633                         spin_lock_irq(&conf->device_lock);
8634                         mddev->degraded = raid5_calc_degraded(conf);
8635                         spin_unlock_irq(&conf->device_lock);
8636                         for (d = conf->raid_disks ;
8637                              d < conf->raid_disks - mddev->delta_disks;
8638                              d++) {
8639                                 rdev = rdev_mdlock_deref(mddev,
8640                                                          conf->disks[d].rdev);
8641                                 if (rdev)
8642                                         clear_bit(In_sync, &rdev->flags);
8643                                 rdev = rdev_mdlock_deref(mddev,
8644                                                 conf->disks[d].replacement);
8645                                 if (rdev)
8646                                         clear_bit(In_sync, &rdev->flags);
8647                         }
8648                 }
8649                 mddev->layout = conf->algorithm;
8650                 mddev->chunk_sectors = conf->chunk_sectors;
8651                 mddev->reshape_position = MaxSector;
8652                 mddev->delta_disks = 0;
8653                 mddev->reshape_backwards = 0;
8654         }
8655 }
8656
8657 static void raid5_quiesce(struct mddev *mddev, int quiesce)
8658 {
8659         struct r5conf *conf = mddev->private;
8660
8661         if (quiesce) {
8662                 /* stop all writes */
8663                 lock_all_device_hash_locks_irq(conf);
8664                 /* '2' tells resync/reshape to pause so that all
8665                  * active stripes can drain
8666                  */
8667                 r5c_flush_cache(conf, INT_MAX);
8668                 /* need a memory barrier to make sure read_one_chunk() sees
8669                  * quiesce started and reverts to slow (locked) path.
8670                  */
8671                 smp_store_release(&conf->quiesce, 2);
8672                 wait_event_cmd(conf->wait_for_quiescent,
8673                                     atomic_read(&conf->active_stripes) == 0 &&
8674                                     atomic_read(&conf->active_aligned_reads) == 0,
8675                                     unlock_all_device_hash_locks_irq(conf),
8676                                     lock_all_device_hash_locks_irq(conf));
8677                 conf->quiesce = 1;
8678                 unlock_all_device_hash_locks_irq(conf);
8679                 /* allow reshape to continue */
8680                 wake_up(&conf->wait_for_overlap);
8681         } else {
8682                 /* re-enable writes */
8683                 lock_all_device_hash_locks_irq(conf);
8684                 conf->quiesce = 0;
8685                 wake_up(&conf->wait_for_quiescent);
8686                 wake_up(&conf->wait_for_overlap);
8687                 unlock_all_device_hash_locks_irq(conf);
8688         }
8689         log_quiesce(conf, quiesce);
8690 }
8691
8692 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
8693 {
8694         struct r0conf *raid0_conf = mddev->private;
8695         sector_t sectors;
8696
8697         /* for raid0 takeover only one zone is supported */
8698         if (raid0_conf->nr_strip_zones > 1) {
8699                 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8700                         mdname(mddev));
8701                 return ERR_PTR(-EINVAL);
8702         }
8703
8704         sectors = raid0_conf->strip_zone[0].zone_end;
8705         sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
8706         mddev->dev_sectors = sectors;
8707         mddev->new_level = level;
8708         mddev->new_layout = ALGORITHM_PARITY_N;
8709         mddev->new_chunk_sectors = mddev->chunk_sectors;
8710         mddev->raid_disks += 1;
8711         mddev->delta_disks = 1;
8712         /* make sure it will be not marked as dirty */
8713         mddev->recovery_cp = MaxSector;
8714
8715         return setup_conf(mddev);
8716 }
8717
8718 static void *raid5_takeover_raid1(struct mddev *mddev)
8719 {
8720         int chunksect;
8721         void *ret;
8722
8723         if (mddev->raid_disks != 2 ||
8724             mddev->degraded > 1)
8725                 return ERR_PTR(-EINVAL);
8726
8727         /* Should check if there are write-behind devices? */
8728
8729         chunksect = 64*2; /* 64K by default */
8730
8731         /* The array must be an exact multiple of chunksize */
8732         while (chunksect && (mddev->array_sectors & (chunksect-1)))
8733                 chunksect >>= 1;
8734
8735         if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8736                 /* array size does not allow a suitable chunk size */
8737                 return ERR_PTR(-EINVAL);
8738
8739         mddev->new_level = 5;
8740         mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8741         mddev->new_chunk_sectors = chunksect;
8742
8743         ret = setup_conf(mddev);
8744         if (!IS_ERR(ret))
8745                 mddev_clear_unsupported_flags(mddev,
8746                         UNSUPPORTED_MDDEV_FLAGS);
8747         return ret;
8748 }
8749
8750 static void *raid5_takeover_raid6(struct mddev *mddev)
8751 {
8752         int new_layout;
8753
8754         switch (mddev->layout) {
8755         case ALGORITHM_LEFT_ASYMMETRIC_6:
8756                 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8757                 break;
8758         case ALGORITHM_RIGHT_ASYMMETRIC_6:
8759                 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8760                 break;
8761         case ALGORITHM_LEFT_SYMMETRIC_6:
8762                 new_layout = ALGORITHM_LEFT_SYMMETRIC;
8763                 break;
8764         case ALGORITHM_RIGHT_SYMMETRIC_6:
8765                 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8766                 break;
8767         case ALGORITHM_PARITY_0_6:
8768                 new_layout = ALGORITHM_PARITY_0;
8769                 break;
8770         case ALGORITHM_PARITY_N:
8771                 new_layout = ALGORITHM_PARITY_N;
8772                 break;
8773         default:
8774                 return ERR_PTR(-EINVAL);
8775         }
8776         mddev->new_level = 5;
8777         mddev->new_layout = new_layout;
8778         mddev->delta_disks = -1;
8779         mddev->raid_disks -= 1;
8780         return setup_conf(mddev);
8781 }
8782
8783 static int raid5_check_reshape(struct mddev *mddev)
8784 {
8785         /* For a 2-drive array, the layout and chunk size can be changed
8786          * immediately as not restriping is needed.
8787          * For larger arrays we record the new value - after validation
8788          * to be used by a reshape pass.
8789          */
8790         struct r5conf *conf = mddev->private;
8791         int new_chunk = mddev->new_chunk_sectors;
8792
8793         if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
8794                 return -EINVAL;
8795         if (new_chunk > 0) {
8796                 if (!is_power_of_2(new_chunk))
8797                         return -EINVAL;
8798                 if (new_chunk < (PAGE_SIZE>>9))
8799                         return -EINVAL;
8800                 if (mddev->array_sectors & (new_chunk-1))
8801                         /* not factor of array size */
8802                         return -EINVAL;
8803         }
8804
8805         /* They look valid */
8806
8807         if (mddev->raid_disks == 2) {
8808                 /* can make the change immediately */
8809                 if (mddev->new_layout >= 0) {
8810                         conf->algorithm = mddev->new_layout;
8811                         mddev->layout = mddev->new_layout;
8812                 }
8813                 if (new_chunk > 0) {
8814                         conf->chunk_sectors = new_chunk ;
8815                         mddev->chunk_sectors = new_chunk;
8816                 }
8817                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8818                 md_wakeup_thread(mddev->thread);
8819         }
8820         return check_reshape(mddev);
8821 }
8822
8823 static int raid6_check_reshape(struct mddev *mddev)
8824 {
8825         int new_chunk = mddev->new_chunk_sectors;
8826
8827         if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
8828                 return -EINVAL;
8829         if (new_chunk > 0) {
8830                 if (!is_power_of_2(new_chunk))
8831                         return -EINVAL;
8832                 if (new_chunk < (PAGE_SIZE >> 9))
8833                         return -EINVAL;
8834                 if (mddev->array_sectors & (new_chunk-1))
8835                         /* not factor of array size */
8836                         return -EINVAL;
8837         }
8838
8839         /* They look valid */
8840         return check_reshape(mddev);
8841 }
8842
8843 static void *raid5_takeover(struct mddev *mddev)
8844 {
8845         /* raid5 can take over:
8846          *  raid0 - if there is only one strip zone - make it a raid4 layout
8847          *  raid1 - if there are two drives.  We need to know the chunk size
8848          *  raid4 - trivial - just use a raid4 layout.
8849          *  raid6 - Providing it is a *_6 layout
8850          */
8851         if (mddev->level == 0)
8852                 return raid45_takeover_raid0(mddev, 5);
8853         if (mddev->level == 1)
8854                 return raid5_takeover_raid1(mddev);
8855         if (mddev->level == 4) {
8856                 mddev->new_layout = ALGORITHM_PARITY_N;
8857                 mddev->new_level = 5;
8858                 return setup_conf(mddev);
8859         }
8860         if (mddev->level == 6)
8861                 return raid5_takeover_raid6(mddev);
8862
8863         return ERR_PTR(-EINVAL);
8864 }
8865
8866 static void *raid4_takeover(struct mddev *mddev)
8867 {
8868         /* raid4 can take over:
8869          *  raid0 - if there is only one strip zone
8870          *  raid5 - if layout is right
8871          */
8872         if (mddev->level == 0)
8873                 return raid45_takeover_raid0(mddev, 4);
8874         if (mddev->level == 5 &&
8875             mddev->layout == ALGORITHM_PARITY_N) {
8876                 mddev->new_layout = 0;
8877                 mddev->new_level = 4;
8878                 return setup_conf(mddev);
8879         }
8880         return ERR_PTR(-EINVAL);
8881 }
8882
8883 static struct md_personality raid5_personality;
8884
8885 static void *raid6_takeover(struct mddev *mddev)
8886 {
8887         /* Currently can only take over a raid5.  We map the
8888          * personality to an equivalent raid6 personality
8889          * with the Q block at the end.
8890          */
8891         int new_layout;
8892
8893         if (mddev->pers != &raid5_personality)
8894                 return ERR_PTR(-EINVAL);
8895         if (mddev->degraded > 1)
8896                 return ERR_PTR(-EINVAL);
8897         if (mddev->raid_disks > 253)
8898                 return ERR_PTR(-EINVAL);
8899         if (mddev->raid_disks < 3)
8900                 return ERR_PTR(-EINVAL);
8901
8902         switch (mddev->layout) {
8903         case ALGORITHM_LEFT_ASYMMETRIC:
8904                 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8905                 break;
8906         case ALGORITHM_RIGHT_ASYMMETRIC:
8907                 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8908                 break;
8909         case ALGORITHM_LEFT_SYMMETRIC:
8910                 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8911                 break;
8912         case ALGORITHM_RIGHT_SYMMETRIC:
8913                 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8914                 break;
8915         case ALGORITHM_PARITY_0:
8916                 new_layout = ALGORITHM_PARITY_0_6;
8917                 break;
8918         case ALGORITHM_PARITY_N:
8919                 new_layout = ALGORITHM_PARITY_N;
8920                 break;
8921         default:
8922                 return ERR_PTR(-EINVAL);
8923         }
8924         mddev->new_level = 6;
8925         mddev->new_layout = new_layout;
8926         mddev->delta_disks = 1;
8927         mddev->raid_disks += 1;
8928         return setup_conf(mddev);
8929 }
8930
8931 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
8932 {
8933         struct r5conf *conf;
8934         int err;
8935
8936         err = mddev_lock(mddev);
8937         if (err)
8938                 return err;
8939         conf = mddev->private;
8940         if (!conf) {
8941                 mddev_unlock(mddev);
8942                 return -ENODEV;
8943         }
8944
8945         if (strncmp(buf, "ppl", 3) == 0) {
8946                 /* ppl only works with RAID 5 */
8947                 if (!raid5_has_ppl(conf) && conf->level == 5) {
8948                         err = log_init(conf, NULL, true);
8949                         if (!err) {
8950                                 err = resize_stripes(conf, conf->pool_size);
8951                                 if (err) {
8952                                         mddev_suspend(mddev);
8953                                         log_exit(conf);
8954                                         mddev_resume(mddev);
8955                                 }
8956                         }
8957                 } else
8958                         err = -EINVAL;
8959         } else if (strncmp(buf, "resync", 6) == 0) {
8960                 if (raid5_has_ppl(conf)) {
8961                         mddev_suspend(mddev);
8962                         log_exit(conf);
8963                         mddev_resume(mddev);
8964                         err = resize_stripes(conf, conf->pool_size);
8965                 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8966                            r5l_log_disk_error(conf)) {
8967                         bool journal_dev_exists = false;
8968                         struct md_rdev *rdev;
8969
8970                         rdev_for_each(rdev, mddev)
8971                                 if (test_bit(Journal, &rdev->flags)) {
8972                                         journal_dev_exists = true;
8973                                         break;
8974                                 }
8975
8976                         if (!journal_dev_exists) {
8977                                 mddev_suspend(mddev);
8978                                 clear_bit(MD_HAS_JOURNAL, &mddev->flags);
8979                                 mddev_resume(mddev);
8980                         } else  /* need remove journal device first */
8981                                 err = -EBUSY;
8982                 } else
8983                         err = -EINVAL;
8984         } else {
8985                 err = -EINVAL;
8986         }
8987
8988         if (!err)
8989                 md_update_sb(mddev, 1);
8990
8991         mddev_unlock(mddev);
8992
8993         return err;
8994 }
8995
8996 static int raid5_start(struct mddev *mddev)
8997 {
8998         struct r5conf *conf = mddev->private;
8999
9000         return r5l_start(conf->log);
9001 }
9002
9003 static struct md_personality raid6_personality =
9004 {
9005         .name           = "raid6",
9006         .level          = 6,
9007         .owner          = THIS_MODULE,
9008         .make_request   = raid5_make_request,
9009         .run            = raid5_run,
9010         .start          = raid5_start,
9011         .free           = raid5_free,
9012         .status         = raid5_status,
9013         .error_handler  = raid5_error,
9014         .hot_add_disk   = raid5_add_disk,
9015         .hot_remove_disk= raid5_remove_disk,
9016         .spare_active   = raid5_spare_active,
9017         .sync_request   = raid5_sync_request,
9018         .resize         = raid5_resize,
9019         .size           = raid5_size,
9020         .check_reshape  = raid6_check_reshape,
9021         .start_reshape  = raid5_start_reshape,
9022         .finish_reshape = raid5_finish_reshape,
9023         .quiesce        = raid5_quiesce,
9024         .takeover       = raid6_takeover,
9025         .change_consistency_policy = raid5_change_consistency_policy,
9026 };
9027 static struct md_personality raid5_personality =
9028 {
9029         .name           = "raid5",
9030         .level          = 5,
9031         .owner          = THIS_MODULE,
9032         .make_request   = raid5_make_request,
9033         .run            = raid5_run,
9034         .start          = raid5_start,
9035         .free           = raid5_free,
9036         .status         = raid5_status,
9037         .error_handler  = raid5_error,
9038         .hot_add_disk   = raid5_add_disk,
9039         .hot_remove_disk= raid5_remove_disk,
9040         .spare_active   = raid5_spare_active,
9041         .sync_request   = raid5_sync_request,
9042         .resize         = raid5_resize,
9043         .size           = raid5_size,
9044         .check_reshape  = raid5_check_reshape,
9045         .start_reshape  = raid5_start_reshape,
9046         .finish_reshape = raid5_finish_reshape,
9047         .quiesce        = raid5_quiesce,
9048         .takeover       = raid5_takeover,
9049         .change_consistency_policy = raid5_change_consistency_policy,
9050 };
9051
9052 static struct md_personality raid4_personality =
9053 {
9054         .name           = "raid4",
9055         .level          = 4,
9056         .owner          = THIS_MODULE,
9057         .make_request   = raid5_make_request,
9058         .run            = raid5_run,
9059         .start          = raid5_start,
9060         .free           = raid5_free,
9061         .status         = raid5_status,
9062         .error_handler  = raid5_error,
9063         .hot_add_disk   = raid5_add_disk,
9064         .hot_remove_disk= raid5_remove_disk,
9065         .spare_active   = raid5_spare_active,
9066         .sync_request   = raid5_sync_request,
9067         .resize         = raid5_resize,
9068         .size           = raid5_size,
9069         .check_reshape  = raid5_check_reshape,
9070         .start_reshape  = raid5_start_reshape,
9071         .finish_reshape = raid5_finish_reshape,
9072         .quiesce        = raid5_quiesce,
9073         .takeover       = raid4_takeover,
9074         .change_consistency_policy = raid5_change_consistency_policy,
9075 };
9076
9077 static int __init raid5_init(void)
9078 {
9079         int ret;
9080
9081         raid5_wq = alloc_workqueue("raid5wq",
9082                 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
9083         if (!raid5_wq)
9084                 return -ENOMEM;
9085
9086         ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
9087                                       "md/raid5:prepare",
9088                                       raid456_cpu_up_prepare,
9089                                       raid456_cpu_dead);
9090         if (ret) {
9091                 destroy_workqueue(raid5_wq);
9092                 return ret;
9093         }
9094         register_md_personality(&raid6_personality);
9095         register_md_personality(&raid5_personality);
9096         register_md_personality(&raid4_personality);
9097         return 0;
9098 }
9099
9100 static void raid5_exit(void)
9101 {
9102         unregister_md_personality(&raid6_personality);
9103         unregister_md_personality(&raid5_personality);
9104         unregister_md_personality(&raid4_personality);
9105         cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
9106         destroy_workqueue(raid5_wq);
9107 }
9108
9109 module_init(raid5_init);
9110 module_exit(raid5_exit);
9111 MODULE_LICENSE("GPL");
9112 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9113 MODULE_ALIAS("md-personality-4"); /* RAID5 */
9114 MODULE_ALIAS("md-raid5");
9115 MODULE_ALIAS("md-raid4");
9116 MODULE_ALIAS("md-level-5");
9117 MODULE_ALIAS("md-level-4");
9118 MODULE_ALIAS("md-personality-8"); /* RAID6 */
9119 MODULE_ALIAS("md-raid6");
9120 MODULE_ALIAS("md-level-6");
9121
9122 /* This used to be two separate modules, they were: */
9123 MODULE_ALIAS("raid5");
9124 MODULE_ALIAS("raid6");