pnfs/blocklayout: remove read-modify-write handling in bl_write_pagelist
[linux-2.6-block.git] / fs / nfs / blocklayout / blocklayout.c
CommitLineData
155e7524
FI
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
9549ec01 32
155e7524
FI
33#include <linux/module.h>
34#include <linux/init.h>
fe0a9b74
JR
35#include <linux/mount.h>
36#include <linux/namei.h>
9549ec01 37#include <linux/bio.h> /* struct bio */
88c9e421 38#include <linux/prefetch.h>
6296556f 39#include <linux/pagevec.h>
155e7524 40
10bd295a 41#include "../pnfs.h"
76e697ba 42#include "../nfs4session.h"
10bd295a 43#include "../internal.h"
155e7524
FI
44#include "blocklayout.h"
45
46#define NFSDBG_FACILITY NFSDBG_PNFS_LD
47
48MODULE_LICENSE("GPL");
49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
51
9549ec01
FI
/* Dump the state bits of @page via dprintk() for debugging. */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk("	PagePrivate %d\n", PagePrivate(page));
	dprintk("	PageUptodate %d\n", PageUptodate(page));
	dprintk("	PageError %d\n", PageError(page));
	dprintk("	PageDirty %d\n", PageDirty(page));
	dprintk("	PageReferenced %d\n", PageReferenced(page));
	dprintk("	PageLocked %d\n", PageLocked(page));
	dprintk("	PageWriteback %d\n", PageWriteback(page));
	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}
65
66/* Given the be associated with isect, determine if page data needs to be
67 * initialized.
68 */
69static int is_hole(struct pnfs_block_extent *be, sector_t isect)
70{
71 if (be->be_state == PNFS_BLOCK_NONE_DATA)
72 return 1;
73 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
74 return 0;
75 else
76 return !bl_is_sector_init(be->be_inval, isect);
77}
78
650e2d39
FI
79/* Given the be associated with isect, determine if page data can be
80 * written to disk.
81 */
82static int is_writable(struct pnfs_block_extent *be, sector_t isect)
83{
71cdd40f
PT
84 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
85 be->be_state == PNFS_BLOCK_INVALID_DATA);
650e2d39
FI
86}
87
9549ec01
FI
88/* The data we are handed might be spread across several bios. We need
89 * to track when the last one is finished.
90 */
91struct parallel_io {
92 struct kref refcnt;
7c5465d6 93 void (*pnfs_callback) (void *data, int num_se);
9549ec01 94 void *data;
7c5465d6 95 int bse_count;
9549ec01
FI
96};
97
98static inline struct parallel_io *alloc_parallel(void *data)
99{
100 struct parallel_io *rv;
101
102 rv = kmalloc(sizeof(*rv), GFP_NOFS);
103 if (rv) {
104 rv->data = data;
105 kref_init(&rv->refcnt);
7c5465d6 106 rv->bse_count = 0;
9549ec01
FI
107 }
108 return rv;
109}
110
111static inline void get_parallel(struct parallel_io *p)
112{
113 kref_get(&p->refcnt);
114}
115
116static void destroy_parallel(struct kref *kref)
117{
118 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
119
120 dprintk("%s enter\n", __func__);
7c5465d6 121 p->pnfs_callback(p->data, p->bse_count);
9549ec01
FI
122 kfree(p);
123}
124
125static inline void put_parallel(struct parallel_io *p)
126{
127 kref_put(&p->refcnt, destroy_parallel);
128}
129
130static struct bio *
131bl_submit_bio(int rw, struct bio *bio)
132{
133 if (bio) {
134 get_parallel(bio->bi_private);
135 dprintk("%s submitting %s bio %u@%llu\n", __func__,
4f024f37
KO
136 rw == READ ? "read" : "write", bio->bi_iter.bi_size,
137 (unsigned long long)bio->bi_iter.bi_sector);
9549ec01
FI
138 submit_bio(rw, bio);
139 }
140 return NULL;
141}
142
143static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
144 struct pnfs_block_extent *be,
145 void (*end_io)(struct bio *, int err),
146 struct parallel_io *par)
147{
148 struct bio *bio;
149
74a6eeb4 150 npg = min(npg, BIO_MAX_PAGES);
9549ec01 151 bio = bio_alloc(GFP_NOIO, npg);
74a6eeb4
PT
152 if (!bio && (current->flags & PF_MEMALLOC)) {
153 while (!bio && (npg /= 2))
154 bio = bio_alloc(GFP_NOIO, npg);
155 }
9549ec01 156
74a6eeb4 157 if (bio) {
4f024f37
KO
158 bio->bi_iter.bi_sector = isect - be->be_f_offset +
159 be->be_v_offset;
74a6eeb4
PT
160 bio->bi_bdev = be->be_mdev;
161 bio->bi_end_io = end_io;
162 bio->bi_private = par;
163 }
9549ec01
FI
164 return bio;
165}
166
fe6e1e8d 167static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
9549ec01
FI
168 sector_t isect, struct page *page,
169 struct pnfs_block_extent *be,
170 void (*end_io)(struct bio *, int err),
fe6e1e8d
PT
171 struct parallel_io *par,
172 unsigned int offset, int len)
9549ec01 173{
fe6e1e8d
PT
174 isect = isect + (offset >> SECTOR_SHIFT);
175 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
176 npg, rw, (unsigned long long)isect, offset, len);
9549ec01
FI
177retry:
178 if (!bio) {
179 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
180 if (!bio)
181 return ERR_PTR(-ENOMEM);
182 }
fe6e1e8d 183 if (bio_add_page(bio, page, len, offset) < len) {
9549ec01
FI
184 bio = bl_submit_bio(rw, bio);
185 goto retry;
186 }
187 return bio;
188}
189
9549ec01
FI
190/* This is basically copied from mpage_end_io_read */
191static void bl_end_io_read(struct bio *bio, int err)
192{
193 struct parallel_io *par = bio->bi_private;
2c30c71b
KO
194 struct bio_vec *bvec;
195 int i;
9549ec01 196
2c30c71b
KO
197 if (!err)
198 bio_for_each_segment_all(bvec, bio, i)
199 SetPageUptodate(bvec->bv_page);
9549ec01 200
2c30c71b 201 if (err) {
d45f60c6 202 struct nfs_pgio_header *header = par->data;
cd841605
FI
203
204 if (!header->pnfs_error)
205 header->pnfs_error = -EIO;
206 pnfs_set_lo_fail(header->lseg);
9549ec01
FI
207 }
208 bio_put(bio);
209 put_parallel(par);
210}
211
212static void bl_read_cleanup(struct work_struct *work)
213{
214 struct rpc_task *task;
d45f60c6 215 struct nfs_pgio_header *hdr;
9549ec01
FI
216 dprintk("%s enter\n", __func__);
217 task = container_of(work, struct rpc_task, u.tk_work);
d45f60c6
WAA
218 hdr = container_of(task, struct nfs_pgio_header, task);
219 pnfs_ld_read_done(hdr);
9549ec01
FI
220}
221
222static void
7c5465d6 223bl_end_par_io_read(void *data, int unused)
9549ec01 224{
d45f60c6 225 struct nfs_pgio_header *hdr = data;
9549ec01 226
d45f60c6
WAA
227 hdr->task.tk_status = hdr->pnfs_error;
228 INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
229 schedule_work(&hdr->task.u.tk_work);
9549ec01
FI
230}
231
155e7524 232static enum pnfs_try_status
d45f60c6 233bl_read_pagelist(struct nfs_pgio_header *hdr)
155e7524 234{
d45f60c6 235 struct nfs_pgio_header *header = hdr;
9549ec01
FI
236 int i, hole;
237 struct bio *bio = NULL;
238 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
239 sector_t isect, extent_length = 0;
240 struct parallel_io *par;
d45f60c6
WAA
241 loff_t f_offset = hdr->args.offset;
242 size_t bytes_left = hdr->args.count;
f742dc4a 243 unsigned int pg_offset, pg_len;
d45f60c6
WAA
244 struct page **pages = hdr->args.pages;
245 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
f742dc4a 246 const bool is_dio = (header->dreq != NULL);
be98fd0a 247 struct blk_plug plug;
9549ec01 248
6f00866d 249 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
d45f60c6
WAA
250 hdr->page_array.npages, f_offset,
251 (unsigned int)hdr->args.count);
9549ec01 252
d45f60c6 253 par = alloc_parallel(hdr);
9549ec01
FI
254 if (!par)
255 goto use_mds;
9549ec01
FI
256 par->pnfs_callback = bl_end_par_io_read;
257 /* At this point, we can no longer jump to use_mds */
258
be98fd0a
CH
259 blk_start_plug(&plug);
260
9549ec01
FI
261 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
262 /* Code assumes extents are page-aligned */
d45f60c6 263 for (i = pg_index; i < hdr->page_array.npages; i++) {
921b81a8 264 if (extent_length <= 0) {
9549ec01
FI
265 /* We've used up the previous extent */
266 bl_put_extent(be);
267 bl_put_extent(cow_read);
268 bio = bl_submit_bio(READ, bio);
269 /* Get the next one */
cd841605 270 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
9549ec01
FI
271 isect, &cow_read);
272 if (!be) {
cd841605 273 header->pnfs_error = -EIO;
9549ec01
FI
274 goto out;
275 }
276 extent_length = be->be_length -
277 (isect - be->be_f_offset);
278 if (cow_read) {
279 sector_t cow_length = cow_read->be_length -
280 (isect - cow_read->be_f_offset);
281 extent_length = min(extent_length, cow_length);
282 }
283 }
f742dc4a 284
3a6fd1f0 285 pg_offset = f_offset & ~PAGE_CACHE_MASK;
f742dc4a 286 if (is_dio) {
f742dc4a
PT
287 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
288 pg_len = PAGE_CACHE_SIZE - pg_offset;
289 else
290 pg_len = bytes_left;
291
292 f_offset += pg_len;
293 bytes_left -= pg_len;
294 isect += (pg_offset >> SECTOR_SHIFT);
921b81a8 295 extent_length -= (pg_offset >> SECTOR_SHIFT);
f742dc4a 296 } else {
3a6fd1f0 297 BUG_ON(pg_offset != 0);
f742dc4a
PT
298 pg_len = PAGE_CACHE_SIZE;
299 }
300
9549ec01
FI
301 hole = is_hole(be, isect);
302 if (hole && !cow_read) {
303 bio = bl_submit_bio(READ, bio);
304 /* Fill hole w/ zeroes w/o accessing device */
305 dprintk("%s Zeroing page for hole\n", __func__);
f742dc4a 306 zero_user_segment(pages[i], pg_offset, pg_len);
9549ec01
FI
307 print_page(pages[i]);
308 SetPageUptodate(pages[i]);
309 } else {
310 struct pnfs_block_extent *be_read;
311
312 be_read = (hole && cow_read) ? cow_read : be;
823b0c9d 313 bio = do_add_page_to_bio(bio,
d45f60c6 314 hdr->page_array.npages - i,
30dd374f 315 READ,
9549ec01 316 isect, pages[i], be_read,
f742dc4a
PT
317 bl_end_io_read, par,
318 pg_offset, pg_len);
9549ec01 319 if (IS_ERR(bio)) {
cd841605 320 header->pnfs_error = PTR_ERR(bio);
e6d05a75 321 bio = NULL;
9549ec01
FI
322 goto out;
323 }
324 }
f742dc4a 325 isect += (pg_len >> SECTOR_SHIFT);
921b81a8 326 extent_length -= (pg_len >> SECTOR_SHIFT);
9549ec01 327 }
cd841605 328 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
d45f60c6
WAA
329 hdr->res.eof = 1;
330 hdr->res.count = header->inode->i_size - hdr->args.offset;
9549ec01 331 } else {
d45f60c6 332 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
9549ec01
FI
333 }
334out:
335 bl_put_extent(be);
336 bl_put_extent(cow_read);
337 bl_submit_bio(READ, bio);
be98fd0a 338 blk_finish_plug(&plug);
9549ec01
FI
339 put_parallel(par);
340 return PNFS_ATTEMPTED;
341
342 use_mds:
343 dprintk("Giving up and using normal NFS\n");
155e7524
FI
344 return PNFS_NOT_ATTEMPTED;
345}
346
31e6306a
FI
347static void mark_extents_written(struct pnfs_block_layout *bl,
348 __u64 offset, __u32 count)
349{
350 sector_t isect, end;
351 struct pnfs_block_extent *be;
7c5465d6 352 struct pnfs_block_short_extent *se;
31e6306a
FI
353
354 dprintk("%s(%llu, %u)\n", __func__, offset, count);
355 if (count == 0)
356 return;
357 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
358 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
359 end >>= SECTOR_SHIFT;
360 while (isect < end) {
361 sector_t len;
362 be = bl_find_get_extent(bl, isect, NULL);
363 BUG_ON(!be); /* FIXME */
364 len = min(end, be->be_f_offset + be->be_length) - isect;
7c5465d6
PT
365 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
366 se = bl_pop_one_short_extent(be->be_inval);
367 BUG_ON(!se);
368 bl_mark_for_commit(be, isect, len, se);
369 }
31e6306a
FI
370 isect += len;
371 bl_put_extent(be);
372 }
373}
374
650e2d39
FI
375static void bl_end_io_write(struct bio *bio, int err)
376{
377 struct parallel_io *par = bio->bi_private;
378 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
d45f60c6 379 struct nfs_pgio_header *header = par->data;
650e2d39
FI
380
381 if (!uptodate) {
cd841605
FI
382 if (!header->pnfs_error)
383 header->pnfs_error = -EIO;
384 pnfs_set_lo_fail(header->lseg);
650e2d39
FI
385 }
386 bio_put(bio);
387 put_parallel(par);
388}
389
390/* Function scheduled for call during bl_end_par_io_write,
391 * it marks sectors as written and extends the commitlist.
392 */
393static void bl_write_cleanup(struct work_struct *work)
394{
395 struct rpc_task *task;
d45f60c6 396 struct nfs_pgio_header *hdr;
650e2d39
FI
397 dprintk("%s enter\n", __func__);
398 task = container_of(work, struct rpc_task, u.tk_work);
d45f60c6
WAA
399 hdr = container_of(task, struct nfs_pgio_header, task);
400 if (likely(!hdr->pnfs_error)) {
31e6306a 401 /* Marks for LAYOUTCOMMIT */
d45f60c6
WAA
402 mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
403 hdr->args.offset, hdr->args.count);
31e6306a 404 }
d45f60c6 405 pnfs_ld_write_done(hdr);
650e2d39
FI
406}
407
408/* Called when last of bios associated with a bl_write_pagelist call finishes */
7c5465d6 409static void bl_end_par_io_write(void *data, int num_se)
650e2d39 410{
d45f60c6 411 struct nfs_pgio_header *hdr = data;
650e2d39 412
d45f60c6
WAA
413 if (unlikely(hdr->pnfs_error)) {
414 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
7c5465d6
PT
415 num_se);
416 }
417
d45f60c6 418 hdr->task.tk_status = hdr->pnfs_error;
c65e6254 419 hdr->verf.committed = NFS_FILE_SYNC;
d45f60c6
WAA
420 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
421 schedule_work(&hdr->task.u.tk_work);
650e2d39
FI
422}
423
155e7524 424static enum pnfs_try_status
d45f60c6 425bl_write_pagelist(struct nfs_pgio_header *header, int sync)
155e7524 426{
3a6fd1f0 427 int i, ret;
650e2d39 428 struct bio *bio = NULL;
3a6fd1f0
CH
429 struct pnfs_block_extent *be = NULL;
430 sector_t isect, extent_length = 0;
96c9eae6 431 struct parallel_io *par = NULL;
d45f60c6
WAA
432 loff_t offset = header->args.offset;
433 size_t count = header->args.count;
d45f60c6 434 struct page **pages = header->args.pages;
3a6fd1f0 435 int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
be98fd0a 436 struct blk_plug plug;
650e2d39
FI
437
438 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
96c9eae6 439
d45f60c6 440 /* At this point, header->page_aray is a (sequential) list of nfs_pages.
71cdd40f
PT
441 * We want to write each, and if there is an error set pnfs_error
442 * to have it redone using nfs.
650e2d39 443 */
d45f60c6 444 par = alloc_parallel(header);
650e2d39 445 if (!par)
7c5465d6 446 goto out_mds;
650e2d39
FI
447 par->pnfs_callback = bl_end_par_io_write;
448 /* At this point, have to be more careful with error handling */
449
3a6fd1f0 450 blk_start_plug(&plug);
71cdd40f 451
3a6fd1f0
CH
452 /* we always write out the whole page */
453 offset = offset & (loff_t)PAGE_CACHE_MASK;
454 isect = offset >> SECTOR_SHIFT;
71cdd40f 455
d45f60c6 456 for (i = pg_index; i < header->page_array.npages; i++) {
921b81a8 457 if (extent_length <= 0) {
650e2d39
FI
458 /* We've used up the previous extent */
459 bl_put_extent(be);
460 bio = bl_submit_bio(WRITE, bio);
461 /* Get the next one */
cd841605 462 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
3a6fd1f0 463 isect, NULL);
650e2d39 464 if (!be || !is_writable(be, isect)) {
cd841605 465 header->pnfs_error = -EINVAL;
650e2d39
FI
466 goto out;
467 }
7c5465d6
PT
468 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
469 if (likely(!bl_push_one_short_extent(
470 be->be_inval)))
471 par->bse_count++;
472 else {
cd841605 473 header->pnfs_error = -ENOMEM;
7c5465d6
PT
474 goto out;
475 }
476 }
650e2d39 477 extent_length = be->be_length -
71cdd40f 478 (isect - be->be_f_offset);
650e2d39 479 }
fe6e1e8d 480
3a6fd1f0 481 BUG_ON(offset & ~PAGE_CACHE_MASK);
fe6e1e8d 482
fe6e1e8d
PT
483 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
484 !bl_is_sector_init(be->be_inval, isect)) {
71cdd40f 485 ret = bl_mark_sectors_init(be->be_inval, isect,
60c52e3a 486 PAGE_CACHE_SECTORS);
71cdd40f
PT
487 if (unlikely(ret)) {
488 dprintk("%s bl_mark_sectors_init fail %d\n",
489 __func__, ret);
cd841605 490 header->pnfs_error = ret;
71cdd40f 491 goto out;
650e2d39 492 }
71cdd40f 493 }
fe6e1e8d 494
d45f60c6 495 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
3a6fd1f0 496 WRITE, isect, pages[i], be,
fe6e1e8d 497 bl_end_io_write, par,
3a6fd1f0 498 0, PAGE_CACHE_SIZE);
71cdd40f 499 if (IS_ERR(bio)) {
cd841605 500 header->pnfs_error = PTR_ERR(bio);
e6d05a75 501 bio = NULL;
71cdd40f 502 goto out;
650e2d39 503 }
3a6fd1f0
CH
504 offset += PAGE_CACHE_SIZE;
505 count -= PAGE_CACHE_SIZE;
650e2d39
FI
506 isect += PAGE_CACHE_SECTORS;
507 extent_length -= PAGE_CACHE_SECTORS;
508 }
71cdd40f 509
d45f60c6 510 header->res.count = header->args.count;
650e2d39
FI
511out:
512 bl_put_extent(be);
513 bl_submit_bio(WRITE, bio);
be98fd0a 514 blk_finish_plug(&plug);
650e2d39
FI
515 put_parallel(par);
516 return PNFS_ATTEMPTED;
7c5465d6 517out_mds:
7c5465d6 518 return PNFS_NOT_ATTEMPTED;
155e7524
FI
519}
520
9e692969 521/* FIXME - range ignored */
155e7524 522static void
9e692969 523release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
155e7524 524{
9e692969
FI
525 int i;
526 struct pnfs_block_extent *be;
527
528 spin_lock(&bl->bl_ext_lock);
529 for (i = 0; i < EXTENT_LISTS; i++) {
530 while (!list_empty(&bl->bl_extents[i])) {
531 be = list_first_entry(&bl->bl_extents[i],
532 struct pnfs_block_extent,
533 be_node);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 }
537 }
538 spin_unlock(&bl->bl_ext_lock);
155e7524
FI
539}
540
155e7524
FI
541static void
542release_inval_marks(struct pnfs_inval_markings *marks)
543{
c1c2a4cd 544 struct pnfs_inval_tracking *pos, *temp;
7c5465d6 545 struct pnfs_block_short_extent *se, *stemp;
c1c2a4cd
FI
546
547 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
548 list_del(&pos->it_link);
549 kfree(pos);
550 }
7c5465d6
PT
551
552 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
553 list_del(&se->bse_node);
554 kfree(se);
555 }
155e7524
FI
556 return;
557}
558
559static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
560{
561 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
562
563 dprintk("%s enter\n", __func__);
564 release_extents(bl, NULL);
565 release_inval_marks(&bl->bl_inval);
566 kfree(bl);
567}
568
569static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
570 gfp_t gfp_flags)
571{
572 struct pnfs_block_layout *bl;
573
574 dprintk("%s enter\n", __func__);
575 bl = kzalloc(sizeof(*bl), gfp_flags);
576 if (!bl)
577 return NULL;
578 spin_lock_init(&bl->bl_ext_lock);
579 INIT_LIST_HEAD(&bl->bl_extents[0]);
580 INIT_LIST_HEAD(&bl->bl_extents[1]);
581 INIT_LIST_HEAD(&bl->bl_commit);
582 INIT_LIST_HEAD(&bl->bl_committing);
583 bl->bl_count = 0;
584 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
585 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
586 return &bl->bl_layout;
587}
588
/* Free a layout segment; all real state lives in the layout header. */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}
594
a60d2ebd
FI
595/* We pretty much ignore lseg, and store all data layout wide, so we
596 * can correctly merge.
597 */
598static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
599 struct nfs4_layoutget_res *lgr,
600 gfp_t gfp_flags)
155e7524 601{
a60d2ebd
FI
602 struct pnfs_layout_segment *lseg;
603 int status;
604
605 dprintk("%s enter\n", __func__);
606 lseg = kzalloc(sizeof(*lseg), gfp_flags);
607 if (!lseg)
608 return ERR_PTR(-ENOMEM);
609 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
610 if (status) {
611 /* We don't want to call the full-blown bl_free_lseg,
612 * since on error extents were not touched.
613 */
614 kfree(lseg);
615 return ERR_PTR(status);
616 }
617 return lseg;
155e7524
FI
618}
619
/* Encode the driver-specific portion of a LAYOUTCOMMIT request. */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}
627
628static void
629bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
630{
b2be7811
FI
631 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
632
633 dprintk("%s enter\n", __func__);
634 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
155e7524
FI
635}
636
2f9fd182
FI
637static void free_blk_mountid(struct block_mount_id *mid)
638{
639 if (mid) {
93a3844e
PT
640 struct pnfs_block_dev *dev, *tmp;
641
642 /* No need to take bm_lock as we are last user freeing bm_devlist */
643 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
2f9fd182
FI
644 list_del(&dev->bm_node);
645 bl_free_block_dev(dev);
646 }
2f9fd182
FI
647 kfree(mid);
648 }
649}
650
78e4e05c 651/* This is mostly copied from the filelayout_get_device_info function.
2f9fd182
FI
652 * It seems much of this should be at the generic pnfs level.
653 */
654static struct pnfs_block_dev *
655nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
656 struct nfs4_deviceid *d_id)
657{
658 struct pnfs_device *dev;
516f2e24 659 struct pnfs_block_dev *rv;
2f9fd182
FI
660 u32 max_resp_sz;
661 int max_pages;
662 struct page **pages = NULL;
663 int i, rc;
664
665 /*
666 * Use the session max response size as the basis for setting
667 * GETDEVICEINFO's maxcount
668 */
669 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
10bd295a 670 max_pages = nfs_page_array_len(0, max_resp_sz);
2f9fd182
FI
671 dprintk("%s max_resp_sz %u max_pages %d\n",
672 __func__, max_resp_sz, max_pages);
673
674 dev = kmalloc(sizeof(*dev), GFP_NOFS);
675 if (!dev) {
676 dprintk("%s kmalloc failed\n", __func__);
516f2e24 677 return ERR_PTR(-ENOMEM);
2f9fd182
FI
678 }
679
f15b5041 680 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
2f9fd182
FI
681 if (pages == NULL) {
682 kfree(dev);
516f2e24 683 return ERR_PTR(-ENOMEM);
2f9fd182
FI
684 }
685 for (i = 0; i < max_pages; i++) {
686 pages[i] = alloc_page(GFP_NOFS);
516f2e24
JR
687 if (!pages[i]) {
688 rv = ERR_PTR(-ENOMEM);
2f9fd182 689 goto out_free;
516f2e24 690 }
2f9fd182
FI
691 }
692
693 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
694 dev->layout_type = LAYOUT_BLOCK_VOLUME;
695 dev->pages = pages;
696 dev->pgbase = 0;
697 dev->pglen = PAGE_SIZE * max_pages;
698 dev->mincount = 0;
968fe252 699 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
2f9fd182
FI
700
701 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
cd5875fe 702 rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
2f9fd182 703 dprintk("%s getdevice info returns %d\n", __func__, rc);
516f2e24
JR
704 if (rc) {
705 rv = ERR_PTR(rc);
2f9fd182 706 goto out_free;
516f2e24 707 }
2f9fd182
FI
708
709 rv = nfs4_blk_decode_device(server, dev);
710 out_free:
711 for (i = 0; i < max_pages; i++)
712 __free_page(pages[i]);
713 kfree(pages);
714 kfree(dev);
715 return rv;
716}
717
155e7524
FI
718static int
719bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
720{
2f9fd182
FI
721 struct block_mount_id *b_mt_id = NULL;
722 struct pnfs_devicelist *dlist = NULL;
723 struct pnfs_block_dev *bdev;
724 LIST_HEAD(block_disklist);
516f2e24 725 int status, i;
2f9fd182 726
155e7524 727 dprintk("%s enter\n", __func__);
2f9fd182
FI
728
729 if (server->pnfs_blksize == 0) {
730 dprintk("%s Server did not return blksize\n", __func__);
731 return -EINVAL;
732 }
e3aaf7f2
CH
733 if (server->pnfs_blksize > PAGE_SIZE) {
734 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
735 __func__, server->pnfs_blksize);
736 return -EINVAL;
737 }
738
2f9fd182
FI
739 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
740 if (!b_mt_id) {
741 status = -ENOMEM;
742 goto out_error;
743 }
744 /* Initialize nfs4 block layout mount id */
745 spin_lock_init(&b_mt_id->bm_lock);
746 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
747
748 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
749 if (!dlist) {
750 status = -ENOMEM;
751 goto out_error;
752 }
753 dlist->eof = 0;
754 while (!dlist->eof) {
755 status = nfs4_proc_getdevicelist(server, fh, dlist);
756 if (status)
757 goto out_error;
758 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
759 __func__, dlist->num_devs, dlist->eof);
760 for (i = 0; i < dlist->num_devs; i++) {
761 bdev = nfs4_blk_get_deviceinfo(server, fh,
762 &dlist->dev_id[i]);
516f2e24
JR
763 if (IS_ERR(bdev)) {
764 status = PTR_ERR(bdev);
2f9fd182
FI
765 goto out_error;
766 }
767 spin_lock(&b_mt_id->bm_lock);
768 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
769 spin_unlock(&b_mt_id->bm_lock);
770 }
771 }
772 dprintk("%s SUCCESS\n", __func__);
773 server->pnfs_ld_data = b_mt_id;
774
775 out_return:
776 kfree(dlist);
777 return status;
778
779 out_error:
780 free_blk_mountid(b_mt_id);
781 goto out_return;
155e7524
FI
782}
783
784static int
785bl_clear_layoutdriver(struct nfs_server *server)
786{
2f9fd182
FI
787 struct block_mount_id *b_mt_id = server->pnfs_ld_data;
788
155e7524 789 dprintk("%s enter\n", __func__);
2f9fd182
FI
790 free_blk_mountid(b_mt_id);
791 dprintk("%s RETURNS\n", __func__);
155e7524
FI
792 return 0;
793}
794
f742dc4a 795static bool
3a6fd1f0
CH
796is_aligned_req(struct nfs_pageio_descriptor *pgio,
797 struct nfs_page *req, unsigned int alignment)
f742dc4a 798{
3a6fd1f0
CH
799 /*
800 * Always accept buffered writes, higher layers take care of the
801 * right alignment.
802 */
803 if (pgio->pg_dreq == NULL)
804 return true;
805
806 if (!IS_ALIGNED(req->wb_offset, alignment))
807 return false;
808
809 if (IS_ALIGNED(req->wb_bytes, alignment))
810 return true;
811
812 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
813 /*
814 * If the write goes up to the inode size, just write
815 * the full page. Data past the inode size is
816 * guaranteed to be zeroed by the higher level client
817 * code, and this behaviour is mandated by RFC 5663
818 * section 2.3.2.
819 */
820 return true;
821 }
822
823 return false;
f742dc4a
PT
824}
825
826static void
827bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
828{
3a6fd1f0 829 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
f742dc4a 830 nfs_pageio_reset_read_mds(pgio);
3a6fd1f0
CH
831 return;
832 }
833
834 pnfs_generic_pg_init_read(pgio, req);
f742dc4a
PT
835}
836
b4fdac1a
WAA
837/*
838 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
839 * of bytes (maximum @req->wb_bytes) that can be coalesced.
840 */
841static size_t
f742dc4a
PT
842bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
843 struct nfs_page *req)
844{
3a6fd1f0 845 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
b4fdac1a 846 return 0;
f742dc4a
PT
847 return pnfs_generic_pg_test(pgio, prev, req);
848}
849
6296556f
PT
850/*
851 * Return the number of contiguous bytes for a given inode
852 * starting at page frame idx.
853 */
854static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
855{
856 struct address_space *mapping = inode->i_mapping;
857 pgoff_t end;
858
859 /* Optimize common case that writes from 0 to end of file */
860 end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
861 if (end != NFS_I(inode)->npages) {
862 rcu_read_lock();
e7b563bb 863 end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
6296556f
PT
864 rcu_read_unlock();
865 }
866
867 if (!end)
868 return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
869 else
870 return (end - idx) << PAGE_CACHE_SHIFT;
871}
872
6f018efa 873static void
96c9eae6
PT
874bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
875{
3a6fd1f0
CH
876 u64 wb_size;
877
878 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
96c9eae6 879 nfs_pageio_reset_write_mds(pgio);
3a6fd1f0 880 return;
6296556f 881 }
3a6fd1f0
CH
882
883 if (pgio->pg_dreq == NULL)
884 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
885 req->wb_index);
886 else
887 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
888
889 pnfs_generic_pg_init_write(pgio, req, wb_size);
96c9eae6
PT
890}
891
b4fdac1a
WAA
892/*
893 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
894 * of bytes (maximum @req->wb_bytes) that can be coalesced.
895 */
896static size_t
96c9eae6
PT
897bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
898 struct nfs_page *req)
899{
3a6fd1f0 900 if (!is_aligned_req(pgio, req, PAGE_SIZE))
b4fdac1a 901 return 0;
96c9eae6
PT
902 return pnfs_generic_pg_test(pgio, prev, req);
903}
904
e9643fe8 905static const struct nfs_pageio_ops bl_pg_read_ops = {
f742dc4a
PT
906 .pg_init = bl_pg_init_read,
907 .pg_test = bl_pg_test_read,
e9643fe8
BH
908 .pg_doio = pnfs_generic_pg_readpages,
909};
910
911static const struct nfs_pageio_ops bl_pg_write_ops = {
96c9eae6
PT
912 .pg_init = bl_pg_init_write,
913 .pg_test = bl_pg_test_write,
e9643fe8
BH
914 .pg_doio = pnfs_generic_pg_writepages,
915};
916
155e7524
FI
917static struct pnfs_layoutdriver_type blocklayout_type = {
918 .id = LAYOUT_BLOCK_VOLUME,
919 .name = "LAYOUT_BLOCK_VOLUME",
5a12cca6 920 .owner = THIS_MODULE,
3a6fd1f0 921 .flags = PNFS_READ_WHOLE_PAGE,
155e7524
FI
922 .read_pagelist = bl_read_pagelist,
923 .write_pagelist = bl_write_pagelist,
924 .alloc_layout_hdr = bl_alloc_layout_hdr,
925 .free_layout_hdr = bl_free_layout_hdr,
926 .alloc_lseg = bl_alloc_lseg,
927 .free_lseg = bl_free_lseg,
928 .encode_layoutcommit = bl_encode_layoutcommit,
929 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
930 .set_layoutdriver = bl_set_layoutdriver,
931 .clear_layoutdriver = bl_clear_layoutdriver,
e9643fe8
BH
932 .pg_read_ops = &bl_pg_read_ops,
933 .pg_write_ops = &bl_pg_write_ops,
155e7524
FI
934};
935
fe0a9b74 936static const struct rpc_pipe_ops bl_upcall_ops = {
c1225158 937 .upcall = rpc_pipe_generic_upcall,
fe0a9b74
JR
938 .downcall = bl_pipe_downcall,
939 .destroy_msg = bl_pipe_destroy_msg,
940};
941
332dfab6
SK
942static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
943 struct rpc_pipe *pipe)
944{
945 struct dentry *dir, *dentry;
946
947 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
948 if (dir == NULL)
949 return ERR_PTR(-ENOENT);
950 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
951 dput(dir);
952 return dentry;
953}
954
955static void nfs4blocklayout_unregister_sb(struct super_block *sb,
956 struct rpc_pipe *pipe)
957{
958 if (pipe->dentry)
959 rpc_unlink(pipe->dentry);
960}
961
627f3066
SK
962static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
963 void *ptr)
964{
965 struct super_block *sb = ptr;
966 struct net *net = sb->s_fs_info;
967 struct nfs_net *nn = net_generic(net, nfs_net_id);
968 struct dentry *dentry;
969 int ret = 0;
970
971 if (!try_module_get(THIS_MODULE))
972 return 0;
973
974 if (nn->bl_device_pipe == NULL) {
975 module_put(THIS_MODULE);
976 return 0;
977 }
978
979 switch (event) {
980 case RPC_PIPEFS_MOUNT:
981 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
982 if (IS_ERR(dentry)) {
983 ret = PTR_ERR(dentry);
984 break;
985 }
986 nn->bl_device_pipe->dentry = dentry;
987 break;
988 case RPC_PIPEFS_UMOUNT:
989 if (nn->bl_device_pipe->dentry)
990 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
991 break;
992 default:
993 ret = -ENOTSUPP;
994 break;
995 }
996 module_put(THIS_MODULE);
997 return ret;
998}
999
1000static struct notifier_block nfs4blocklayout_block = {
1001 .notifier_call = rpc_pipefs_event,
1002};
1003
332dfab6
SK
1004static struct dentry *nfs4blocklayout_register_net(struct net *net,
1005 struct rpc_pipe *pipe)
1006{
1007 struct super_block *pipefs_sb;
1008 struct dentry *dentry;
1009
1010 pipefs_sb = rpc_get_sb_net(net);
1011 if (!pipefs_sb)
2561d618 1012 return NULL;
332dfab6
SK
1013 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1014 rpc_put_sb_net(net);
1015 return dentry;
1016}
1017
/* Remove the pipe from @net's pipefs superblock, if one is mounted. */
static void nfs4blocklayout_unregister_net(struct net *net,
					   struct rpc_pipe *pipe)
{
	struct super_block *pipefs_sb;

	pipefs_sb = rpc_get_sb_net(net);
	if (pipefs_sb) {
		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
		rpc_put_sb_net(net);
	}
}
1029
9e2e74db
SK
1030static int nfs4blocklayout_net_init(struct net *net)
1031{
1032 struct nfs_net *nn = net_generic(net, nfs_net_id);
1033 struct dentry *dentry;
1034
5ffaf855 1035 init_waitqueue_head(&nn->bl_wq);
9e2e74db
SK
1036 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1037 if (IS_ERR(nn->bl_device_pipe))
1038 return PTR_ERR(nn->bl_device_pipe);
1039 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1040 if (IS_ERR(dentry)) {
1041 rpc_destroy_pipe_data(nn->bl_device_pipe);
1042 return PTR_ERR(dentry);
1043 }
1044 nn->bl_device_pipe->dentry = dentry;
1045 return 0;
1046}
1047
1048static void nfs4blocklayout_net_exit(struct net *net)
1049{
1050 struct nfs_net *nn = net_generic(net, nfs_net_id);
1051
1052 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1053 rpc_destroy_pipe_data(nn->bl_device_pipe);
1054 nn->bl_device_pipe = NULL;
1055}
1056
1057static struct pernet_operations nfs4blocklayout_net_ops = {
1058 .init = nfs4blocklayout_net_init,
1059 .exit = nfs4blocklayout_net_exit,
1060};
1061
155e7524
FI
1062static int __init nfs4blocklayout_init(void)
1063{
1064 int ret;
1065
1066 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
1067
1068 ret = pnfs_register_layoutdriver(&blocklayout_type);
fe0a9b74
JR
1069 if (ret)
1070 goto out;
1071
627f3066 1072 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
9e2e74db
SK
1073 if (ret)
1074 goto out_remove;
627f3066
SK
1075 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
1076 if (ret)
1077 goto out_notifier;
fe0a9b74
JR
1078out:
1079 return ret;
1080
627f3066
SK
1081out_notifier:
1082 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
fe0a9b74
JR
1083out_remove:
1084 pnfs_unregister_layoutdriver(&blocklayout_type);
155e7524
FI
1085 return ret;
1086}
1087
1088static void __exit nfs4blocklayout_exit(void)
1089{
1090 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1091 __func__);
1092
627f3066 1093 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
9e2e74db 1094 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
155e7524
FI
1095 pnfs_unregister_layoutdriver(&blocklayout_type);
1096}
1097
1098MODULE_ALIAS("nfs-layouttype4-3");
1099
1100module_init(nfs4blocklayout_init);
1101module_exit(nfs4blocklayout_exit);