/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

struct btrfs_raid_bio {
        struct btrfs_fs_info *fs_info;
        struct btrfs_bio *bbio;

        /*
         * logical block numbers for the start of each stripe.
         * The last one or two are p/q.  These are sorted,
         * so raid_map[0] is the start of our full stripe
         */
        u64 *raid_map;

        /* while we're doing rmw on a stripe
         * we put it into a hash table so we can
         * lock the stripe and merge more rbios
         * into it.
         */
        struct list_head hash_list;

        /*
         * for scheduling work in the helper threads
         */
        struct btrfs_work work;

        /*
         * bio list and bio_list_lock are used
         * to add more bios into the stripe
         * in hopes of avoiding the full rmw
         */
        struct bio_list bio_list;
        spinlock_t bio_list_lock;

        /*
         * also protected by the bio_list_lock, the
         * stripe locking code uses plug_list to hand off
         * the stripe lock to the next pending IO
         */
        struct list_head plug_list;

        /*
         * flags that tell us if it is safe to
         * merge with this bio
         */
        unsigned long flags;

        /* size of each individual stripe on disk */
        int stripe_len;

        /* number of data stripes (no p/q) */
        int nr_data;

        /*
         * set if we're doing a parity rebuild
         * for a read from higher up, which is handled
         * differently from a parity rebuild as part of
         * rmw
         */
        int read_rebuild;

        /* first bad stripe */
        int faila;

        /* second bad stripe (for raid6 use) */
        int failb;

        /*
         * number of pages needed to represent the full
         * stripe
         */
        int nr_pages;

        /*
         * size of all the bios in the bio_list.  This
         * helps us decide if the rbio maps to a full
         * stripe or not
         */
        int bio_list_bytes;

        atomic_t refs;

        /*
         * these are two arrays of pointers.  We allocate the
         * rbio big enough to hold them both and setup their
         * locations when the rbio is allocated
         */

        /* pointers to pages that we allocated for
         * reading/writing stripes directly from the disk (including P/Q)
         */
        struct page **stripe_pages;

        /*
         * pointers to the pages in the bio_list.  Stored
         * here for faster lookup
         */
        struct page **bio_pages;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
        struct btrfs_stripe_hash_table *table;
        struct btrfs_stripe_hash_table *x;
        struct btrfs_stripe_hash *cur;
        struct btrfs_stripe_hash *h;
        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
        int i;

        if (info->stripe_hash_table)
                return 0;

        table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
        if (!table)
                return -ENOMEM;

        table->table = (void *)(table + 1);
        h = table->table;

        for (i = 0; i < num_entries; i++) {
                cur = h + i;
                INIT_LIST_HEAD(&cur->hash_list);
                spin_lock_init(&cur->lock);
                init_waitqueue_head(&cur->wait);
        }

        x = cmpxchg(&info->stripe_hash_table, NULL, table);
        if (x)
                kfree(x);
        return 0;
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
        u64 num = rbio->raid_map[0];

        /*
         * we shift down quite a bit.  We're using byte
         * addressing, and most of the lower bits are zeros.
         * This tends to upset hash_64, and it consistently
         * returns just one or two different values.
         *
         * shifting off the lower bits fixes things.
         */
        return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
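
/*
 * example, illustrative only: a full stripe starting at logical byte
 * 0x4000000 has all of its low 16 bits clear, so the bucket comes from
 * hash_64(0x4000000 >> 16, BTRFS_STRIPE_HASH_TABLE_BITS).  The locking
 * code below turns a bucket into a hash slot like this:
 *
 *	int bucket = rbio_bucket(rbio);
 *	struct btrfs_stripe_hash *h =
 *		rbio->fs_info->stripe_hash_table->table + bucket;
 */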
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
                       struct btrfs_raid_bio *victim)
{
        bio_list_merge(&dest->bio_list, &victim->bio_list);
        dest->bio_list_bytes += victim->bio_list_bytes;
        bio_list_init(&victim->bio_list);
}
/*
 * free the hash table; called during unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
        if (!info->stripe_hash_table)
                return;
        kfree(info->stripe_hash_table);
        info->stripe_hash_table = NULL;
}
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
        int src_off = 0;
        int xor_src_cnt = 0;
        void *dest = pages[src_cnt];

        while (src_cnt > 0) {
                xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
                xor_blocks(xor_src_cnt, len, dest, pages + src_off);

                src_cnt -= xor_src_cnt;
                src_off += xor_src_cnt;
        }
}
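
/*
 * example usage, taken from the raid5 path in finish_rmw below: with
 * pointers[0..nr_data-1] mapped to data pages, parity is seeded from
 * the first data page and the rest are xor'd into it.  Note that
 * run_xor treats pages[src_cnt] as the destination, so the dest
 * pointer must sit at the end of the array passed in:
 *
 *	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
 *	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
 */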
/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
        unsigned long size = rbio->bio_list_bytes;
        int ret = 1;

        if (size != rbio->nr_data * rbio->stripe_len)
                ret = 0;

        BUG_ON(size > rbio->nr_data * rbio->stripe_len);
        return ret;
}
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
        unsigned long flags;
        int ret;

        spin_lock_irqsave(&rbio->bio_list_lock, flags);
        ret = __rbio_is_full(rbio);
        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
        return ret;
}
/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
                          struct btrfs_raid_bio *cur)
{
        if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
            test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
                return 0;

        if (last->raid_map[0] !=
            cur->raid_map[0])
                return 0;

        /* reads can't merge with writes */
        if (last->read_rebuild !=
            cur->read_rebuild) {
                return 0;
        }

        return 1;
}
/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
        index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
        return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
        if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
                return NULL;

        index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
                PAGE_CACHE_SHIFT;
        return rbio->stripe_pages[index];
}
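
/*
 * sketch of the stripe_pages layout these two helpers index into,
 * assuming a 64K stripe_len, 4K pages and nr_data == 2:
 *
 *	pages  0..15	data stripe 0
 *	pages 16..31	data stripe 1
 *	pages 32..47	P stripe	(rbio_pstripe_page)
 *	pages 48..63	Q stripe	(rbio_qstripe_page, raid6 only)
 */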
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
        int bucket = rbio_bucket(rbio);
        struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
        struct btrfs_raid_bio *cur;
        struct btrfs_raid_bio *pending;
        unsigned long flags;
        struct btrfs_raid_bio *freeit = NULL;
        int ret = 0;

        spin_lock_irqsave(&h->lock, flags);
        list_for_each_entry(cur, &h->hash_list, hash_list) {
                if (cur->raid_map[0] == rbio->raid_map[0]) {
                        spin_lock(&cur->bio_list_lock);

                        /* can we merge into the lock owner? */
                        if (rbio_can_merge(cur, rbio)) {
                                merge_rbio(cur, rbio);
                                spin_unlock(&cur->bio_list_lock);
                                freeit = rbio;
                                ret = 1;
                                goto out;
                        }

                        /*
                         * we couldn't merge with the running
                         * rbio, see if we can merge with the
                         * pending ones.  We don't have to
                         * check for rmw_locked because there
                         * is no way they are inside finish_rmw
                         * right now
                         */
                        list_for_each_entry(pending, &cur->plug_list,
                                            plug_list) {
                                if (rbio_can_merge(pending, rbio)) {
                                        merge_rbio(pending, rbio);
                                        spin_unlock(&cur->bio_list_lock);
                                        freeit = rbio;
                                        ret = 1;
                                        goto out;
                                }
                        }

                        /* no merging, put us on the tail of the plug list,
                         * our rbio will be started when the currently
                         * running rbio unlocks
                         */
                        list_add_tail(&rbio->plug_list, &cur->plug_list);
                        spin_unlock(&cur->bio_list_lock);
                        ret = 1;
                        goto out;
                }
        }

        atomic_inc(&rbio->refs);
        list_add(&rbio->hash_list, &h->hash_list);
out:
        spin_unlock_irqrestore(&h->lock, flags);
        if (freeit)
                __free_raid_bio(freeit);
        return ret;
}
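
/*
 * typical caller pattern (see full_stripe_write below): a return of 0
 * means we still own the rbio and must start the IO ourselves, a
 * return of 1 means the rbio was merged or queued and must not be
 * touched again:
 *
 *	ret = lock_stripe_add(rbio);
 *	if (ret == 0)
 *		finish_rmw(rbio);
 *	return 0;
 */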
/*
 * called as an rmw or parity rebuild completes.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
        int bucket;
        struct btrfs_stripe_hash *h;
        unsigned long flags;

        bucket = rbio_bucket(rbio);
        h = rbio->fs_info->stripe_hash_table->table + bucket;

        spin_lock_irqsave(&h->lock, flags);
        spin_lock(&rbio->bio_list_lock);

        if (!list_empty(&rbio->hash_list)) {

                list_del_init(&rbio->hash_list);
                atomic_dec(&rbio->refs);

                /*
                 * we use the plug list to hold all the rbios
                 * waiting for the chance to lock this stripe.
                 * hand the lock over to one of them.
                 */
                if (!list_empty(&rbio->plug_list)) {
                        struct btrfs_raid_bio *next;
                        struct list_head *head = rbio->plug_list.next;

                        next = list_entry(head, struct btrfs_raid_bio,
                                          plug_list);

                        list_del_init(&rbio->plug_list);

                        list_add(&next->hash_list, &h->hash_list);
                        atomic_inc(&next->refs);
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);

                        if (next->read_rebuild)
                                async_read_rebuild(next);
                        else
                                async_rmw_stripe(next);

                        return;
                } else if (waitqueue_active(&h->wait)) {
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);
                        wake_up(&h->wait);
                        return;
                }
        }

        spin_unlock(&rbio->bio_list_lock);
        spin_unlock_irqrestore(&h->lock, flags);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
        int i;

        WARN_ON(atomic_read(&rbio->refs) < 0);
        if (!atomic_dec_and_test(&rbio->refs))
                return;

        WARN_ON(!list_empty(&rbio->hash_list));
        WARN_ON(!bio_list_empty(&rbio->bio_list));

        for (i = 0; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i]) {
                        __free_page(rbio->stripe_pages[i]);
                        rbio->stripe_pages[i] = NULL;
                }
        }
        kfree(rbio->raid_map);
        kfree(rbio->bbio);
        kfree(rbio);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
        unlock_stripe(rbio);
        __free_raid_bio(rbio);
}
/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
        struct bio *cur = bio_list_get(&rbio->bio_list);
        struct bio *next;

        free_raid_bio(rbio);

        while (cur) {
                next = cur->bi_next;
                cur->bi_next = NULL;
                if (uptodate)
                        set_bit(BIO_UPTODATE, &cur->bi_flags);
                bio_endio(cur, err);
                cur = next;
        }
}
/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        if (err)
                fail_bio_stripe(rbio, bio);

        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        err = 0;

        /* OK, we have written all the stripes we need to. */
        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                err = -EIO;

        rbio_orig_end_io(rbio, err, 0);
}
/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
                                 int index, int pagenr, int bio_list_only)
{
        int chunk_page;
        struct page *p = NULL;

        chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

        spin_lock_irq(&rbio->bio_list_lock);
        p = rbio->bio_pages[chunk_page];
        spin_unlock_irq(&rbio->bio_list_lock);

        if (p || bio_list_only)
                return p;

        return rbio->stripe_pages[chunk_page];
}
/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
        unsigned long nr = stripe_len * nr_stripes;
        return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}
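
/*
 * worked example on a 4K page system: a raid6 layout with
 * stripe_len = 64K and num_stripes = 4 (2 data + P + Q) needs
 * (4 * 65536 + 4095) >> 12 = 64 pages for the full stripe.
 */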
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
                          struct btrfs_bio *bbio, u64 *raid_map,
                          u64 stripe_len)
{
        struct btrfs_raid_bio *rbio;
        int nr_data = 0;
        int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
        void *p;

        rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
                        GFP_NOFS);
        if (!rbio) {
                kfree(raid_map);
                kfree(bbio);
                return ERR_PTR(-ENOMEM);
        }

        bio_list_init(&rbio->bio_list);
        INIT_LIST_HEAD(&rbio->plug_list);
        spin_lock_init(&rbio->bio_list_lock);
        INIT_LIST_HEAD(&rbio->hash_list);
        rbio->bbio = bbio;
        rbio->raid_map = raid_map;
        rbio->fs_info = root->fs_info;
        rbio->stripe_len = stripe_len;
        rbio->nr_pages = num_pages;
        rbio->faila = -1;
        rbio->failb = -1;
        atomic_set(&rbio->refs, 1);

        /*
         * the stripe_pages and bio_pages array point to the extra
         * memory we allocated past the end of the rbio
         */
        p = rbio + 1;
        rbio->stripe_pages = p;
        rbio->bio_pages = p + sizeof(struct page *) * num_pages;

        if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
                nr_data = bbio->num_stripes - 2;
        else
                nr_data = bbio->num_stripes - 1;

        rbio->nr_data = nr_data;
        return rbio;
}
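
/*
 * sketch of the single allocation alloc_rbio makes:
 *
 *	[ struct btrfs_raid_bio | stripe_pages[num_pages] | bio_pages[num_pages] ]
 *
 * both page pointer arrays live in the same kzalloc'd block, so
 * freeing the rbio frees the arrays too; only the pages themselves
 * are allocated (and freed) separately.
 */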
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
        int i;
        struct page *page;

        for (i = 0; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
                ClearPageUptodate(page);
        }
        return 0;
}

/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
        int i;
        struct page *page;

        i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

        for (; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
        }
        return 0;
}
/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
                            struct bio_list *bio_list,
                            struct page *page,
                            int stripe_nr,
                            unsigned long page_index,
                            unsigned long bio_max_len)
{
        struct bio *last = bio_list->tail;
        u64 last_end = 0;
        int ret;
        struct bio *bio;
        struct btrfs_bio_stripe *stripe;
        u64 disk_start;

        stripe = &rbio->bbio->stripes[stripe_nr];
        disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

        /* if the device is missing, just fail this stripe */
        if (!stripe->dev->bdev)
                return fail_rbio_index(rbio, stripe_nr);

        /* see if we can add this page onto our existing bio */
        if (last) {
                last_end = (u64)last->bi_sector << 9;
                last_end += last->bi_size;

                /*
                 * we can't merge these if they are from different
                 * devices or if they are not contiguous
                 */
                if (last_end == disk_start && stripe->dev->bdev &&
                    test_bit(BIO_UPTODATE, &last->bi_flags) &&
                    last->bi_bdev == stripe->dev->bdev) {
                        ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
                        if (ret == PAGE_CACHE_SIZE)
                                return 0;
                }
        }

        /* put a new bio on the list */
        bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
        if (!bio)
                return -ENOMEM;

        bio->bi_size = 0;
        bio->bi_bdev = stripe->dev->bdev;
        bio->bi_sector = disk_start >> 9;
        set_bit(BIO_UPTODATE, &bio->bi_flags);

        bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
        bio_list_add(bio_list, bio);
        return 0;
}
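
/*
 * example caller, as in raid56_rmw_stripe below: queue a read for
 * each page we couldn't find in the bio list:
 *
 *	page = rbio_stripe_page(rbio, stripe, pagenr);
 *	ret = rbio_add_io_page(rbio, &bio_list, page,
 *			       stripe, pagenr, rbio->stripe_len);
 *	if (ret)
 *		goto cleanup;
 */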
/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
        if (rbio->faila >= 0 || rbio->failb >= 0) {
                BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
                __raid56_parity_recover(rbio);
        } else {
                finish_rmw(rbio);
        }
}
/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
        int index;

        index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
        index += page;
        return rbio->stripe_pages[index];
}
/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
        struct bio *bio;
        u64 start;
        unsigned long stripe_offset;
        unsigned long page_index;
        struct page *p;
        int i;

        spin_lock_irq(&rbio->bio_list_lock);
        bio_list_for_each(bio, &rbio->bio_list) {
                start = (u64)bio->bi_sector << 9;
                stripe_offset = start - rbio->raid_map[0];
                page_index = stripe_offset >> PAGE_CACHE_SHIFT;

                for (i = 0; i < bio->bi_vcnt; i++) {
                        p = bio->bi_io_vec[i].bv_page;
                        rbio->bio_pages[page_index + i] = p;
                }
        }
        spin_unlock_irq(&rbio->bio_list_lock);
}
/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
        struct btrfs_bio *bbio = rbio->bbio;
        void *pointers[bbio->num_stripes];
        int stripe_len = rbio->stripe_len;
        int nr_data = rbio->nr_data;
        int stripe;
        int pagenr;
        int p_stripe = -1;
        int q_stripe = -1;
        struct bio_list bio_list;
        struct bio *bio;
        int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
        int ret;

        bio_list_init(&bio_list);

        if (bbio->num_stripes - rbio->nr_data == 1) {
                p_stripe = bbio->num_stripes - 1;
        } else if (bbio->num_stripes - rbio->nr_data == 2) {
                p_stripe = bbio->num_stripes - 2;
                q_stripe = bbio->num_stripes - 1;
        } else {
                BUG();
        }

        /* at this point we either have a full stripe,
         * or we've read the full stripe from the drive.
         * recalculate the parity and write the new results.
         *
         * We're not allowed to add any new bios to the
         * bio list here, anyone else that wants to
         * change this stripe needs to do their own rmw.
         */
        spin_lock_irq(&rbio->bio_list_lock);
        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
        spin_unlock_irq(&rbio->bio_list_lock);

        atomic_set(&rbio->bbio->error, 0);

        /*
         * now that we've set rmw_locked, run through the
         * bio list one last time and map the page pointers
         */
        index_rbio_pages(rbio);

        for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                struct page *p;

                /* first collect one page from each data stripe */
                for (stripe = 0; stripe < nr_data; stripe++) {
                        p = page_in_rbio(rbio, stripe, pagenr, 0);
                        pointers[stripe] = kmap(p);
                }

                /* then add the parity stripe */
                p = rbio_pstripe_page(rbio, pagenr);
                SetPageUptodate(p);
                pointers[stripe++] = kmap(p);

                if (q_stripe != -1) {

                        /*
                         * raid6, add the qstripe and call the
                         * library function to fill in our p/q
                         */
                        p = rbio_qstripe_page(rbio, pagenr);
                        SetPageUptodate(p);
                        pointers[stripe++] = kmap(p);

                        raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
                                                pointers);
                } else {
                        /* raid5 */
                        memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
                        run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
                }

                for (stripe = 0; stripe < bbio->num_stripes; stripe++)
                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
        }

        /*
         * time to start writing.  Make bios for everything from the
         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
         * everything else.
         */
        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
                for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                        struct page *page;

                        if (stripe < rbio->nr_data) {
                                page = page_in_rbio(rbio, stripe, pagenr, 1);
                                if (!page)
                                        continue;
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }

                        ret = rbio_add_io_page(rbio, &bio_list,
                                       page, stripe, pagenr, rbio->stripe_len);
                        if (ret)
                                goto cleanup;
                }
        }

        atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
        BUG_ON(atomic_read(&bbio->stripes_pending) == 0);

        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_write_end_io;
                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(WRITE, bio);
        }
        return;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
}
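
/*
 * the parity math in the loop above, as a sketch.  raid5 parity is a
 * plain xor of the data blocks:
 *
 *	P = D0 ^ D1 ^ ... ^ D(n-1)
 *
 * for raid6, raid6_call.gen_syndrome() fills in both P and Q, where
 * Q is the Reed-Solomon syndrome over GF(2^8) with generator g:
 *
 *	Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*D(n-1)
 */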
/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                           struct bio *bio)
{
        u64 physical = bio->bi_sector;
        u64 stripe_start;
        int i;
        struct btrfs_bio_stripe *stripe;

        physical <<= 9;

        for (i = 0; i < rbio->bbio->num_stripes; i++) {
                stripe = &rbio->bbio->stripes[i];
                stripe_start = stripe->physical;
                if (physical >= stripe_start &&
                    physical < stripe_start + rbio->stripe_len) {
                        return i;
                }
        }
        return -1;
}
/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
                                   struct bio *bio)
{
        u64 logical = bio->bi_sector;
        u64 stripe_start;
        int i;

        logical <<= 9;

        for (i = 0; i < rbio->nr_data; i++) {
                stripe_start = rbio->raid_map[i];
                if (logical >= stripe_start &&
                    logical < stripe_start + rbio->stripe_len) {
                        return i;
                }
        }
        return -1;
}
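
/*
 * example: bi_sector counts 512 byte sectors, so a bio at sector
 * 0x20000 starts at logical byte 0x20000 << 9 == 0x4000000.  If
 * raid_map[1] == 0x4000000, data stripe 1 is the failed one.
 */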
/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(&rbio->bio_list_lock, flags);

        /* we already know this stripe is bad, move on */
        if (rbio->faila == failed || rbio->failb == failed)
                goto out;

        if (rbio->faila == -1) {
                /* first failure on this rbio */
                rbio->faila = failed;
                atomic_inc(&rbio->bbio->error);
        } else if (rbio->failb == -1) {
                /* second failure on this rbio */
                rbio->failb = failed;
                atomic_inc(&rbio->bbio->error);
        } else {
                ret = -EIO;
        }
out:
        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

        return ret;
}
/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
                           struct bio *bio)
{
        int failed = find_bio_stripe(rbio, bio);

        if (failed < 0)
                return -EIO;

        return fail_rbio_index(rbio, failed);
}
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
        int i;
        struct page *p;

        for (i = 0; i < bio->bi_vcnt; i++) {
                p = bio->bi_io_vec[i].bv_page;
                SetPageUptodate(p);
        }
}
/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        if (err)
                fail_bio_stripe(rbio, bio);
        else
                set_bio_pages_uptodate(bio);

        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        err = 0;
        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                goto cleanup;

        /*
         * this will normally call finish_rmw to start our write
         * but if there are any failed stripes we'll reconstruct
         * from parity first
         */
        validate_rbio_for_rmw(rbio);
        return;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
}
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
        rbio->work.flags = 0;
        rbio->work.func = rmw_work;

        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
                           &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
        rbio->work.flags = 0;
        rbio->work.func = read_rebuild_work;

        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
                           &rbio->work);
}
/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
        int bios_to_read = 0;
        struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int pagenr;
        int stripe;
        struct bio *bio;

        bio_list_init(&bio_list);

        ret = alloc_rbio_pages(rbio);
        if (ret)
                goto cleanup;

        index_rbio_pages(rbio);

        atomic_set(&rbio->bbio->error, 0);
        /*
         * build a list of bios to read all the missing parts of this
         * stripe
         */
        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                        struct page *page;

                        /*
                         * we want to find all the pages missing from
                         * the rbio and read them from the disk.  If
                         * page_in_rbio finds a page in the bio list
                         * we don't need to read it off the stripe.
                         */
                        page = page_in_rbio(rbio, stripe, pagenr, 1);
                        if (page)
                                continue;

                        page = rbio_stripe_page(rbio, stripe, pagenr);
                        ret = rbio_add_io_page(rbio, &bio_list, page,
                                       stripe, pagenr, rbio->stripe_len);
                        if (ret)
                                goto cleanup;
                }
        }

        bios_to_read = bio_list_size(&bio_list);
        if (!bios_to_read) {
                /*
                 * this can happen if others have merged with
                 * us, it means there is nothing left to read.
                 * But if there are missing devices it may not be
                 * safe to do the full stripe write yet.
                 */
                goto finish;
        }

        /*
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
        atomic_set(&bbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_rmw_end_io;

                btrfs_bio_wq_end_io(rbio->fs_info, bio,
                                    BTRFS_WQ_ENDIO_RAID56);

                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(READ, bio);
        }
        /* the actual write will happen once the reads are done */
        return 0;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;

finish:
        validate_rbio_for_rmw(rbio);
        return 0;
}
/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
        int ret;

        ret = alloc_rbio_parity_pages(rbio);
        if (ret)
                return ret;

        ret = lock_stripe_add(rbio);
        if (ret == 0)
                finish_rmw(rbio);
        return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
        int ret;

        ret = lock_stripe_add(rbio);
        if (ret == 0)
                async_rmw_stripe(rbio);
        return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
        /* head off into rmw land if we don't have a full stripe */
        if (!rbio_is_full(rbio))
                return partial_stripe_write(rbio);
        return full_stripe_write(rbio);
}
/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                        struct btrfs_bio *bbio, u64 *raid_map,
                        u64 stripe_len)
{
        struct btrfs_raid_bio *rbio;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio))
                return PTR_ERR(rbio);

        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;
        return __raid56_parity_write(rbio);
}
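
/*
 * sketch of the caller side, assuming the mapping code in volumes.c
 * at the time of writing: once the block mapping produces a raid56
 * raid_map, btrfs_map_bio() hands us the whole stripe bio instead of
 * splitting it per device:
 *
 *	if (raid_map) {
 *		if (rw & WRITE)
 *			return raid56_parity_write(root, bio, bbio,
 *						   raid_map, map_length);
 *		...
 *	}
 */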
/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
        int pagenr, stripe;
        void **pointers;
        int faila = -1, failb = -1;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        struct page *page;
        int err;
        int i;

        pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
                           GFP_NOFS);
        if (!pointers) {
                err = -ENOMEM;
                goto cleanup_io;
        }

        faila = rbio->faila;
        failb = rbio->failb;

        if (rbio->read_rebuild) {
                spin_lock_irq(&rbio->bio_list_lock);
                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                spin_unlock_irq(&rbio->bio_list_lock);
        }

        index_rbio_pages(rbio);

        for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                /* setup our array of pointers with pages
                 * from each stripe
                 */
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        pointers[stripe] = kmap(page);
                }

                /* all raid6 handling here */
                if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
                    RAID6_Q_STRIPE) {

                        /*
                         * single failure, rebuild from parity raid5
                         * style
                         */
                        if (failb < 0) {
                                if (faila == rbio->nr_data) {
                                        /*
                                         * Just the P stripe has failed, without
                                         * a bad data or Q stripe.
                                         * TODO, we should redo the xor here.
                                         */
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * a single failure in raid6 is rebuilt
                                 * in the pstripe code below
                                 */
                                goto pstripe;
                        }

                        /* make sure our ps and qs are in order */
                        if (faila > failb) {
                                int tmp = failb;
                                failb = faila;
                                faila = tmp;
                        }

                        /* if the q stripe is failed, do a pstripe reconstruction
                         * from the xors.
                         * If both the q stripe and the P stripe are failed, we're
                         * here due to a crc mismatch and we can't give them the
                         * data they want
                         */
                        if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
                                if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * otherwise we have one bad data stripe and
                                 * a good P stripe.  raid5!
                                 */
                                goto pstripe;
                        }

                        if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
                                raid6_datap_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
                                raid6_2data_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, failb,
                                                  pointers);
                        }
                } else {
                        void *p;

                        /* rebuild from P stripe here (raid5 or raid6) */
                        BUG_ON(failb != -1);
pstripe:
                        /* Copy parity block into failed block to start with */
                        memcpy(pointers[faila],
                               pointers[rbio->nr_data],
                               PAGE_CACHE_SIZE);

                        /* rearrange the pointer array */
                        p = pointers[faila];
                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
                                pointers[stripe] = pointers[stripe + 1];
                        pointers[rbio->nr_data - 1] = p;

                        /* xor in the rest */
                        run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
                }

                /* if we're doing this rebuild as part of an rmw, go through
                 * and set all of our private rbio pages in the
                 * failed stripes as uptodate.  This way finish_rmw will
                 * know they can be trusted.  If this was a read reconstruction,
                 * other endio functions will fiddle the uptodate bits
                 */
                if (!rbio->read_rebuild) {
                        for (i = 0; i < nr_pages; i++) {
                                if (faila != -1) {
                                        page = rbio_stripe_page(rbio, faila, i);
                                        SetPageUptodate(page);
                                }
                                if (failb != -1) {
                                        page = rbio_stripe_page(rbio, failb, i);
                                        SetPageUptodate(page);
                                }
                        }
                }
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        kunmap(page);
                }
        }

        err = 0;
cleanup:
        kfree(pointers);

cleanup_io:
        if (rbio->read_rebuild) {
                rbio_orig_end_io(rbio, err, err == 0);
        } else if (err == 0) {
                rbio->faila = -1;
                rbio->failb = -1;
                finish_rmw(rbio);
        } else {
                rbio_orig_end_io(rbio, err, 0);
        }
}
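
/*
 * the pstripe path above in equation form: if data block Dk is lost,
 * xor'ing the parity with the surviving data blocks recovers it:
 *
 *	Dk = P ^ D0 ^ ... ^ D(k-1) ^ D(k+1) ^ ... ^ D(n-1)
 *
 * which is why the code copies P into the failed slot, rotates the
 * pointer array so the failed slot becomes the run_xor destination,
 * and xors the remaining nr_data - 1 blocks in.
 */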
/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        /*
         * we only read stripe pages off the disk, set them
         * up to date if there were no errors
         */
        if (err)
                fail_bio_stripe(rbio, bio);
        else
                set_bio_pages_uptodate(bio);
        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                rbio_orig_end_io(rbio, -EIO, 0);
        else
                __raid_recover_end_io(rbio);
}
/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
        int bios_to_read = 0;
        struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int pagenr;
        int stripe;
        struct bio *bio;

        bio_list_init(&bio_list);

        ret = alloc_rbio_pages(rbio);
        if (ret)
                goto cleanup;

        atomic_set(&rbio->bbio->error, 0);

        /*
         * read everything that hasn't failed.
         */
        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
                if (rbio->faila == stripe ||
                    rbio->failb == stripe)
                        continue;

                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                        struct page *p;

                        /*
                         * the rmw code may have already read this
                         * page in
                         */
                        p = rbio_stripe_page(rbio, stripe, pagenr);
                        if (PageUptodate(p))
                                continue;

                        ret = rbio_add_io_page(rbio, &bio_list,
                                       rbio_stripe_page(rbio, stripe, pagenr),
                                       stripe, pagenr, rbio->stripe_len);
                        if (ret < 0)
                                goto cleanup;
                }
        }

        bios_to_read = bio_list_size(&bio_list);
        if (!bios_to_read) {
                /*
                 * we might have no bios to read just because the pages
                 * were up to date, or we might have no bios to read because
                 * the devices were gone.
                 */
                if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
                        __raid_recover_end_io(rbio);
                        goto out;
                } else {
                        goto cleanup;
                }
        }

        /*
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
        atomic_set(&bbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_recover_end_io;

                btrfs_bio_wq_end_io(rbio->fs_info, bio,
                                    BTRFS_WQ_ENDIO_RAID56);

                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(READ, bio);
        }
out:
        return 0;

cleanup:
        if (rbio->read_rebuild)
                rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;
}
/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                          struct btrfs_bio *bbio, u64 *raid_map,
                          u64 stripe_len, int mirror_num)
{
        struct btrfs_raid_bio *rbio;
        int ret;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio))
                return PTR_ERR(rbio);

        rbio->read_rebuild = 1;
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;

        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
                kfree(rbio);
                return -EIO;
        }

        /*
         * reconstruct from the q stripe if they are
         * asking for mirror 3
         */
        if (mirror_num == 3)
                rbio->failb = bbio->num_stripes - 2;

        ret = lock_stripe_add(rbio);

        /*
         * __raid56_parity_recover will end the bio with
         * any errors it hits.  We don't want to return
         * its error value up the stack because our caller
         * will end up calling bio_endio with any nonzero
         * return
         */
        if (ret == 0)
                __raid56_parity_recover(rbio);
        /*
         * our rbio has been added to the list of
         * rbios that will be handled after the
         * current lock owner is done
         */
        return 0;
}
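
/*
 * mirror numbering sketch, assuming the convention visible here:
 * mirror 1 is the plain data read that already failed, mirror 2
 * rebuilds the failed data stripe from parity via the faila found
 * above, and mirror 3 (raid6 only) additionally marks the P stripe
 * (num_stripes - 2) failed so the rebuild is forced to use Q.
 */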
static void rmw_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        __raid56_parity_recover(rbio);
}