Commit | Line | Data |
---|---|---|
c93407d0 BH |
1 | /* |
2 | * pNFS Objects layout implementation over open-osd initiator library | |
3 | * | |
4 | * Copyright (C) 2009 Panasas Inc. [year of first publication] | |
5 | * All rights reserved. | |
6 | * | |
7 | * Benny Halevy <bhalevy@panasas.com> | |
8 | * Boaz Harrosh <bharrosh@panasas.com> | |
9 | * | |
10 | * This program is free software; you can redistribute it and/or modify | |
11 | * it under the terms of the GNU General Public License version 2 | |
12 | * See the file COPYING included with this distribution for more details. | |
13 | * | |
14 | * Redistribution and use in source and binary forms, with or without | |
15 | * modification, are permitted provided that the following conditions | |
16 | * are met: | |
17 | * | |
18 | * 1. Redistributions of source code must retain the above copyright | |
19 | * notice, this list of conditions and the following disclaimer. | |
20 | * 2. Redistributions in binary form must reproduce the above copyright | |
21 | * notice, this list of conditions and the following disclaimer in the | |
22 | * documentation and/or other materials provided with the distribution. | |
23 | * 3. Neither the name of the Panasas company nor the names of its | |
24 | * contributors may be used to endorse or promote products derived | |
25 | * from this software without specific prior written permission. | |
26 | * | |
27 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED | |
28 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | |
29 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
30 | * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
31 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
32 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
33 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR | |
34 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
35 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
36 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
37 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
38 | */ | |
39 | ||
40 | #include <linux/module.h> | |
09f5bf4e BH |
41 | #include <scsi/osd_initiator.h> |
42 | ||
43 | #include "objlayout.h" | |
44 | ||
45 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | |
46 | ||
/*
 * Widen an integer expression to unsigned long long so it can be printed
 * with "%llx"/"%llu". The argument is parenthesized so the cast applies to
 * the whole expression, not just its first operand.
 */
#define _LLU(x) ((unsigned long long)(x))
48 | ||
04f83450 BH |
/*
 * Number of struct bio_vec entries that fit in one page next to a
 * struct bio header. _add_stripe_unit() uses it as the upper bound for
 * the vector count it passes to bio_kmalloc().
 */
49 | enum { BIO_MAX_PAGES_KMALLOC = |
50 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | |
51 | }; | |
52 | ||
b6c05f16 BH |
/*
 * One cached OSD device. The generic device-id node is embedded so the
 * entry can be recovered with container_of(); od is the osd_dev handle
 * that objio_free_deviceid_node() releases with osduld_put_device().
 */
53 | struct objio_dev_ent { |
54 | struct nfs4_deviceid_node id_node; | |
55 | struct osd_dev *od; | |
56 | }; | |
57 | ||
58 | static void | |
59 | objio_free_deviceid_node(struct nfs4_deviceid_node *d) | |
60 | { | |
61 | struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); | |
62 | ||
63 | dprintk("%s: free od=%p\n", __func__, de->od); | |
64 | osduld_put_device(de->od); | |
65 | kfree(de); | |
66 | } | |
67 | ||
68 | static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, | |
69 | const struct nfs4_deviceid *d_id) | |
70 | { | |
71 | struct nfs4_deviceid_node *d; | |
72 | struct objio_dev_ent *de; | |
73 | ||
74 | d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); | |
75 | if (!d) | |
76 | return NULL; | |
77 | ||
78 | de = container_of(d, struct objio_dev_ent, id_node); | |
79 | return de; | |
80 | } | |
81 | ||
82 | static struct objio_dev_ent * | |
83 | _dev_list_add(const struct nfs_server *nfss, | |
84 | const struct nfs4_deviceid *d_id, struct osd_dev *od, | |
85 | gfp_t gfp_flags) | |
86 | { | |
87 | struct nfs4_deviceid_node *d; | |
88 | struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); | |
89 | struct objio_dev_ent *n; | |
90 | ||
91 | if (!de) { | |
92 | dprintk("%s: -ENOMEM od=%p\n", __func__, od); | |
93 | return NULL; | |
94 | } | |
95 | ||
96 | dprintk("%s: Adding od=%p\n", __func__, od); | |
97 | nfs4_init_deviceid_node(&de->id_node, | |
98 | nfss->pnfs_curr_ld, | |
99 | nfss->nfs_client, | |
100 | d_id); | |
101 | de->od = od; | |
102 | ||
103 | d = nfs4_insert_deviceid_node(&de->id_node); | |
104 | n = container_of(d, struct objio_dev_ent, id_node); | |
105 | if (n != de) { | |
106 | dprintk("%s: Race with other n->od=%p\n", __func__, n->od); | |
107 | objio_free_deviceid_node(&de->id_node); | |
108 | de = n; | |
109 | } | |
110 | ||
111 | atomic_inc(&de->id_node.ref); | |
112 | return de; | |
113 | } | |
114 | ||
09f5bf4e BH |
/*
 * Per-component backing storage for the two credential blobs.
 * copy_single_comp() copies a decoded component's capability key and
 * capability into these arrays and repoints the component at them.
 */
115 | struct caps_buffers { |
116 | u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; | |
117 | u8 creds[OSD_CAP_LEN]; | |
118 | }; | |
119 | ||
/*
 * Driver-private layout segment. objio_alloc_lseg() allocates it in one
 * chunk: this struct, then ods[], then the comps array, then one
 * struct caps_buffers per component.
 */
120 | struct objio_segment { | |
/* Embedded generic segment; OBJIO_LSEG() maps it back to this struct. */
121 | struct pnfs_layout_segment lseg; | |
122 | ||
/* Decoded component credentials; points just past ods[] in the same allocation. */
123 | struct pnfs_osd_object_cred *comps; | |
124 | ||
/* Striping geometry copied from the layout's data map; mirrors_p1 is odm_mirror_cnt + 1. */
125 | unsigned mirrors_p1; | |
126 | unsigned stripe_unit; | |
127 | unsigned group_width; /* Data stripe_units without integrity comps */ | |
/* Set to -1 (with group_count 1) when the data map has no group width. */
128 | u64 group_depth; | |
129 | unsigned group_count; | |
130 | ||
/* comps_index is the device number of ods[0] (see _io_od()); num_comps is the entry count. */
131 | unsigned comps_index; | |
132 | unsigned num_comps; | |
133 | /* variable length */ | |
134 | struct objio_dev_ent *ods[]; | |
135 | }; | |
136 | ||
137 | static inline struct objio_segment * | |
138 | OBJIO_LSEG(struct pnfs_layout_segment *lseg) | |
139 | { | |
140 | return container_of(lseg, struct objio_segment, lseg); | |
141 | } | |
142 | ||
04f83450 BH |
/*
 * Per-IO state. objio_done_fn is the completion callback type; _last_io()
 * invokes ios->done once the last reference on ios->kref is dropped.
 */
143 | struct objio_state; |
144 | typedef ssize_t (*objio_done_fn)(struct objio_state *ios); | |
145 | ||
146 | struct objio_state { | |
147 | /* Generic layer */ | |
148 | struct objlayout_io_state ol_state; | |
149 | ||
/* Layout segment this IO runs against (set in objio_alloc_io_state()). */
150 | struct objio_segment *layout; | |
151 | ||
/* kref counts in-flight requests; private carries the completion in sync mode (see _io_exec()). */
152 | struct kref kref; | |
153 | objio_done_fn done; | |
154 | void *private; | |
155 | ||
/* Total bytes prepared so far; accumulated in _prepare_one_group(). */
156 | unsigned long length; | |
157 | unsigned numdevs; /* Actually used devs in this IO */ | |
158 | /* A per-device variable array of size numdevs */ | |
/* NOTE(review): objio_alloc_io_state() allocates layout->num_comps slots; numdevs is how many are in use. */
159 | struct _objio_per_comp { | |
160 | struct bio *bio; | |
161 | struct osd_request *or; | |
162 | unsigned long length; | |
163 | u64 offset; | |
164 | unsigned dev; | |
165 | } per_dev[]; | |
166 | }; | |
167 | ||
b6c05f16 BH |
168 | /* Send and wait for a get_device_info of devices in the layout, |
169 | then look them up with the osd_initiator library */ | |
170 | static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, | |
171 | struct objio_segment *objio_seg, unsigned comp, | |
172 | gfp_t gfp_flags) | |
173 | { | |
174 | struct pnfs_osd_deviceaddr *deviceaddr; | |
175 | struct nfs4_deviceid *d_id; | |
176 | struct objio_dev_ent *ode; | |
177 | struct osd_dev *od; | |
178 | struct osd_dev_info odi; | |
179 | int err; | |
180 | ||
181 | d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; | |
182 | ||
183 | ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); | |
184 | if (ode) | |
185 | return ode; | |
186 | ||
187 | err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); | |
188 | if (unlikely(err)) { | |
189 | dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", | |
190 | __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); | |
191 | return ERR_PTR(err); | |
192 | } | |
193 | ||
194 | odi.systemid_len = deviceaddr->oda_systemid.len; | |
195 | if (odi.systemid_len > sizeof(odi.systemid)) { | |
196 | err = -EINVAL; | |
197 | goto out; | |
198 | } else if (odi.systemid_len) | |
199 | memcpy(odi.systemid, deviceaddr->oda_systemid.data, | |
200 | odi.systemid_len); | |
201 | odi.osdname_len = deviceaddr->oda_osdname.len; | |
202 | odi.osdname = (u8 *)deviceaddr->oda_osdname.data; | |
203 | ||
204 | if (!odi.osdname_len && !odi.systemid_len) { | |
205 | dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", | |
206 | __func__); | |
207 | err = -ENODEV; | |
208 | goto out; | |
209 | } | |
210 | ||
211 | od = osduld_info_lookup(&odi); | |
212 | if (unlikely(IS_ERR(od))) { | |
213 | err = PTR_ERR(od); | |
214 | dprintk("%s: osduld_info_lookup => %d\n", __func__, err); | |
215 | goto out; | |
216 | } | |
217 | ||
218 | ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, | |
219 | gfp_flags); | |
220 | ||
221 | out: | |
222 | dprintk("%s: return=%d\n", __func__, err); | |
223 | objlayout_put_deviceinfo(deviceaddr); | |
224 | return err ? ERR_PTR(err) : ode; | |
225 | } | |
226 | ||
227 | static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, | |
228 | struct objio_segment *objio_seg, | |
229 | gfp_t gfp_flags) | |
230 | { | |
231 | unsigned i; | |
232 | int err; | |
233 | ||
234 | /* lookup all devices */ | |
235 | for (i = 0; i < objio_seg->num_comps; i++) { | |
236 | struct objio_dev_ent *ode; | |
237 | ||
238 | ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); | |
239 | if (unlikely(IS_ERR(ode))) { | |
240 | err = PTR_ERR(ode); | |
241 | goto out; | |
242 | } | |
243 | objio_seg->ods[i] = ode; | |
244 | } | |
245 | err = 0; | |
246 | ||
247 | out: | |
248 | dprintk("%s: return=%d\n", __func__, err); | |
249 | return err; | |
250 | } | |
251 | ||
09f5bf4e BH |
252 | static int _verify_data_map(struct pnfs_osd_layout *layout) |
253 | { | |
254 | struct pnfs_osd_data_map *data_map = &layout->olo_map; | |
255 | u64 stripe_length; | |
256 | u32 group_width; | |
257 | ||
258 | /* FIXME: Only raid0 for now. if not go through MDS */ | |
259 | if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { | |
260 | printk(KERN_ERR "Only RAID_0 for now\n"); | |
261 | return -ENOTSUPP; | |
262 | } | |
263 | if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { | |
264 | printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", | |
265 | data_map->odm_num_comps, data_map->odm_mirror_cnt); | |
266 | return -EINVAL; | |
267 | } | |
268 | ||
269 | if (data_map->odm_group_width) | |
270 | group_width = data_map->odm_group_width; | |
271 | else | |
272 | group_width = data_map->odm_num_comps / | |
273 | (data_map->odm_mirror_cnt + 1); | |
274 | ||
275 | stripe_length = (u64)data_map->odm_stripe_unit * group_width; | |
276 | if (stripe_length >= (1ULL << 32)) { | |
277 | printk(KERN_ERR "Total Stripe length(0x%llx)" | |
278 | " >= 32bit is not supported\n", _LLU(stripe_length)); | |
279 | return -ENOTSUPP; | |
280 | } | |
281 | ||
282 | if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { | |
283 | printk(KERN_ERR "Stripe Unit(0x%llx)" | |
284 | " must be Multples of PAGE_SIZE(0x%lx)\n", | |
285 | _LLU(data_map->odm_stripe_unit), PAGE_SIZE); | |
286 | return -ENOTSUPP; | |
287 | } | |
288 | ||
289 | return 0; | |
290 | } | |
291 | ||
292 | static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, | |
293 | struct pnfs_osd_object_cred *src_comp, | |
294 | struct caps_buffers *caps_p) | |
295 | { | |
296 | WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); | |
297 | WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); | |
298 | ||
299 | *cur_comp = *src_comp; | |
300 | ||
301 | memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, | |
302 | sizeof(caps_p->caps_key)); | |
303 | cur_comp->oc_cap_key.cred = caps_p->caps_key; | |
304 | ||
305 | memcpy(caps_p->creds, src_comp->oc_cap.cred, | |
306 | sizeof(caps_p->creds)); | |
307 | cur_comp->oc_cap.cred = caps_p->creds; | |
308 | } | |
309 | ||
310 | int objio_alloc_lseg(struct pnfs_layout_segment **outp, | |
311 | struct pnfs_layout_hdr *pnfslay, | |
312 | struct pnfs_layout_range *range, | |
313 | struct xdr_stream *xdr, | |
314 | gfp_t gfp_flags) | |
315 | { | |
316 | struct objio_segment *objio_seg; | |
317 | struct pnfs_osd_xdr_decode_layout_iter iter; | |
318 | struct pnfs_osd_layout layout; | |
319 | struct pnfs_osd_object_cred *cur_comp, src_comp; | |
320 | struct caps_buffers *caps_p; | |
321 | int err; | |
322 | ||
323 | err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); | |
324 | if (unlikely(err)) | |
325 | return err; | |
326 | ||
327 | err = _verify_data_map(&layout); | |
328 | if (unlikely(err)) | |
329 | return err; | |
330 | ||
331 | objio_seg = kzalloc(sizeof(*objio_seg) + | |
332 | sizeof(objio_seg->ods[0]) * layout.olo_num_comps + | |
333 | sizeof(*objio_seg->comps) * layout.olo_num_comps + | |
334 | sizeof(struct caps_buffers) * layout.olo_num_comps, | |
335 | gfp_flags); | |
336 | if (!objio_seg) | |
337 | return -ENOMEM; | |
338 | ||
339 | objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); | |
340 | cur_comp = objio_seg->comps; | |
341 | caps_p = (void *)(cur_comp + layout.olo_num_comps); | |
342 | while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) | |
343 | copy_single_comp(cur_comp++, &src_comp, caps_p++); | |
344 | if (unlikely(err)) | |
345 | goto err; | |
346 | ||
347 | objio_seg->num_comps = layout.olo_num_comps; | |
348 | objio_seg->comps_index = layout.olo_comps_index; | |
b6c05f16 BH |
349 | err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); |
350 | if (err) | |
351 | goto err; | |
09f5bf4e BH |
352 | |
353 | objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; | |
354 | objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; | |
355 | if (layout.olo_map.odm_group_width) { | |
356 | objio_seg->group_width = layout.olo_map.odm_group_width; | |
357 | objio_seg->group_depth = layout.olo_map.odm_group_depth; | |
358 | objio_seg->group_count = layout.olo_map.odm_num_comps / | |
359 | objio_seg->mirrors_p1 / | |
360 | objio_seg->group_width; | |
361 | } else { | |
362 | objio_seg->group_width = layout.olo_map.odm_num_comps / | |
363 | objio_seg->mirrors_p1; | |
364 | objio_seg->group_depth = -1; | |
365 | objio_seg->group_count = 1; | |
366 | } | |
367 | ||
368 | *outp = &objio_seg->lseg; | |
369 | return 0; | |
370 | ||
371 | err: | |
372 | kfree(objio_seg); | |
373 | dprintk("%s: Error: return %d\n", __func__, err); | |
374 | *outp = NULL; | |
375 | return err; | |
376 | } | |
377 | ||
378 | void objio_free_lseg(struct pnfs_layout_segment *lseg) | |
379 | { | |
b6c05f16 | 380 | int i; |
09f5bf4e BH |
381 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); |
382 | ||
b6c05f16 BH |
383 | for (i = 0; i < objio_seg->num_comps; i++) { |
384 | if (!objio_seg->ods[i]) | |
385 | break; | |
386 | nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); | |
387 | } | |
09f5bf4e BH |
388 | kfree(objio_seg); |
389 | } | |
390 | ||
04f83450 BH |
391 | int objio_alloc_io_state(struct pnfs_layout_segment *lseg, |
392 | struct objlayout_io_state **outp, | |
393 | gfp_t gfp_flags) | |
394 | { | |
395 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); | |
396 | struct objio_state *ios; | |
397 | const unsigned first_size = sizeof(*ios) + | |
398 | objio_seg->num_comps * sizeof(ios->per_dev[0]); | |
399 | ||
400 | ios = kzalloc(first_size, gfp_flags); | |
401 | if (unlikely(!ios)) | |
402 | return -ENOMEM; | |
403 | ||
404 | ios->layout = objio_seg; | |
405 | ||
406 | *outp = &ios->ol_state; | |
407 | return 0; | |
408 | } | |
409 | ||
410 | void objio_free_io_state(struct objlayout_io_state *ol_state) | |
411 | { | |
412 | struct objio_state *ios = container_of(ol_state, struct objio_state, | |
413 | ol_state); | |
414 | ||
415 | kfree(ios); | |
416 | } | |
417 | ||
418 | static void _clear_bio(struct bio *bio) | |
419 | { | |
420 | struct bio_vec *bv; | |
421 | unsigned i; | |
422 | ||
423 | __bio_for_each_segment(bv, bio, i, 0) { | |
424 | unsigned this_count = bv->bv_len; | |
425 | ||
426 | if (likely(PAGE_SIZE == this_count)) | |
427 | clear_highpage(bv->bv_page); | |
428 | else | |
429 | zero_user(bv->bv_page, bv->bv_offset, this_count); | |
430 | } | |
431 | } | |
432 | ||
433 | static int _io_check(struct objio_state *ios, bool is_write) | |
434 | { | |
435 | enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; | |
436 | int lin_ret = 0; | |
437 | int i; | |
438 | ||
439 | for (i = 0; i < ios->numdevs; i++) { | |
440 | struct osd_sense_info osi; | |
441 | struct osd_request *or = ios->per_dev[i].or; | |
442 | unsigned dev; | |
443 | int ret; | |
444 | ||
445 | if (!or) | |
446 | continue; | |
447 | ||
448 | ret = osd_req_decode_sense(or, &osi); | |
449 | if (likely(!ret)) | |
450 | continue; | |
451 | ||
452 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | |
453 | /* start read offset passed endof file */ | |
454 | BUG_ON(is_write); | |
455 | _clear_bio(ios->per_dev[i].bio); | |
456 | dprintk("%s: start read offset passed end of file " | |
457 | "offset=0x%llx, length=0x%lx\n", __func__, | |
458 | _LLU(ios->per_dev[i].offset), | |
459 | ios->per_dev[i].length); | |
460 | ||
461 | continue; /* we recovered */ | |
462 | } | |
463 | dev = ios->per_dev[i].dev; | |
464 | ||
465 | if (osi.osd_err_pri >= oep) { | |
466 | oep = osi.osd_err_pri; | |
467 | lin_ret = ret; | |
468 | } | |
469 | } | |
470 | ||
471 | return lin_ret; | |
472 | } | |
473 | ||
474 | /* | |
475 | * Common IO state helpers. | |
476 | */ | |
477 | static void _io_free(struct objio_state *ios) | |
478 | { | |
479 | unsigned i; | |
480 | ||
481 | for (i = 0; i < ios->numdevs; i++) { | |
482 | struct _objio_per_comp *per_dev = &ios->per_dev[i]; | |
483 | ||
484 | if (per_dev->or) { | |
485 | osd_end_request(per_dev->or); | |
486 | per_dev->or = NULL; | |
487 | } | |
488 | ||
489 | if (per_dev->bio) { | |
490 | bio_put(per_dev->bio); | |
491 | per_dev->bio = NULL; | |
492 | } | |
493 | } | |
494 | } | |
495 | ||
496 | struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) | |
497 | { | |
498 | unsigned min_dev = ios->layout->comps_index; | |
499 | unsigned max_dev = min_dev + ios->layout->num_comps; | |
500 | ||
501 | BUG_ON(dev < min_dev || max_dev <= dev); | |
502 | return ios->layout->ods[dev - min_dev]->od; | |
503 | } | |
504 | ||
/*
 * Result of _calc_stripe_info() for one file offset:
 *   obj_offset   - byte offset inside the target object
 *   group_length - bytes from the file offset to the end of its group
 *   dev          - device index, already multiplied by mirrors_p1
 *   unit_off     - offset of the file offset within its stripe unit
 */
505 | struct _striping_info { | |
506 | u64 obj_offset; | |
507 | u64 group_length; | |
508 | unsigned dev; | |
509 | unsigned unit_off; | |
510 | }; | |
511 | ||
512 | static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, | |
513 | struct _striping_info *si) | |
514 | { | |
515 | u32 stripe_unit = ios->layout->stripe_unit; | |
516 | u32 group_width = ios->layout->group_width; | |
517 | u64 group_depth = ios->layout->group_depth; | |
518 | u32 U = stripe_unit * group_width; | |
519 | ||
520 | u64 T = U * group_depth; | |
521 | u64 S = T * ios->layout->group_count; | |
522 | u64 M = div64_u64(file_offset, S); | |
523 | ||
524 | /* | |
525 | G = (L - (M * S)) / T | |
526 | H = (L - (M * S)) % T | |
527 | */ | |
528 | u64 LmodU = file_offset - M * S; | |
529 | u32 G = div64_u64(LmodU, T); | |
530 | u64 H = LmodU - G * T; | |
531 | ||
532 | u32 N = div_u64(H, U); | |
533 | ||
534 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | |
535 | si->obj_offset = si->unit_off + (N * stripe_unit) + | |
536 | (M * group_depth * stripe_unit); | |
537 | ||
538 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | |
539 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | |
540 | si->dev *= ios->layout->mirrors_p1; | |
541 | ||
542 | si->group_length = T - H; | |
543 | } | |
544 | ||
545 | static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, | |
546 | unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, | |
547 | gfp_t gfp_flags) | |
548 | { | |
549 | unsigned pg = *cur_pg; | |
550 | struct request_queue *q = | |
551 | osd_request_queue(_io_od(ios, per_dev->dev)); | |
552 | ||
553 | per_dev->length += cur_len; | |
554 | ||
555 | if (per_dev->bio == NULL) { | |
556 | unsigned stripes = ios->layout->num_comps / | |
557 | ios->layout->mirrors_p1; | |
558 | unsigned pages_in_stripe = stripes * | |
559 | (ios->layout->stripe_unit / PAGE_SIZE); | |
560 | unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / | |
561 | stripes; | |
562 | ||
563 | if (BIO_MAX_PAGES_KMALLOC < bio_size) | |
564 | bio_size = BIO_MAX_PAGES_KMALLOC; | |
565 | ||
566 | per_dev->bio = bio_kmalloc(gfp_flags, bio_size); | |
567 | if (unlikely(!per_dev->bio)) { | |
568 | dprintk("Faild to allocate BIO size=%u\n", bio_size); | |
569 | return -ENOMEM; | |
570 | } | |
571 | } | |
572 | ||
573 | while (cur_len > 0) { | |
574 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | |
575 | unsigned added_len; | |
576 | ||
577 | BUG_ON(ios->ol_state.nr_pages <= pg); | |
578 | cur_len -= pglen; | |
579 | ||
580 | added_len = bio_add_pc_page(q, per_dev->bio, | |
581 | ios->ol_state.pages[pg], pglen, pgbase); | |
582 | if (unlikely(pglen != added_len)) | |
583 | return -ENOMEM; | |
584 | pgbase = 0; | |
585 | ++pg; | |
586 | } | |
587 | BUG_ON(cur_len); | |
588 | ||
589 | *cur_pg = pg; | |
590 | return 0; | |
591 | } | |
592 | ||
/*
 * Spread @length bytes over the devices of one group, one stripe unit at
 * a time, starting at the position described by @si.
 *
 * Devices are visited in steps of mirrors_p1 and wrap around inside the
 * group [first_dev, first_dev + devs_in_group). A device slot that has no
 * data yet (per_dev->length == 0) first gets its device number and object
 * offset; the pages are then attached through _add_stripe_unit().
 *
 * ios->length grows by every unit added. ios->numdevs and *@last_pg are
 * updated on exit, including when an error cuts the loop short.
 *
 * Returns 0, or the error from _add_stripe_unit().
 */
593 | static int _prepare_one_group(struct objio_state *ios, u64 length, | |
594 | struct _striping_info *si, unsigned *last_pg, | |
595 | gfp_t gfp_flags) | |
596 | { | |
597 | unsigned stripe_unit = ios->layout->stripe_unit; | |
598 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | |
599 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | |
600 | unsigned dev = si->dev; | |
601 | unsigned first_dev = dev - (dev % devs_in_group); | |
/* Highest device slot used so far; continues from an earlier group's numdevs. */
602 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | |
603 | unsigned cur_pg = *last_pg; | |
604 | int ret = 0; | |
605 | ||
606 | while (length) { | |
607 | struct _objio_per_comp *per_dev = &ios->per_dev[dev]; | |
608 | unsigned cur_len, page_off = 0; | |
609 | ||
/* First visit of this device in the IO: pick its starting object offset. */
610 | if (!per_dev->length) { | |
611 | per_dev->dev = dev; | |
/* Devices before si->dev start at the beginning of their next stripe unit. */
612 | if (dev < si->dev) { | |
613 | per_dev->offset = si->obj_offset + stripe_unit - | |
614 | si->unit_off; | |
615 | cur_len = stripe_unit; | |
/* The device holding the file offset itself may start mid-unit. */
616 | } else if (dev == si->dev) { | |
617 | per_dev->offset = si->obj_offset; | |
618 | cur_len = stripe_unit - si->unit_off; | |
619 | page_off = si->unit_off & ~PAGE_MASK; | |
620 | BUG_ON(page_off && | |
621 | (page_off != ios->ol_state.pgbase)); | |
622 | } else { /* dev > si->dev */ | |
623 | per_dev->offset = si->obj_offset - si->unit_off; | |
624 | cur_len = stripe_unit; | |
625 | } | |
626 | ||
627 | if (max_comp < dev) | |
628 | max_comp = dev; | |
629 | } else { | |
630 | cur_len = stripe_unit; | |
631 | } | |
/* Never add more than what is left of the request. */
632 | if (cur_len >= length) | |
633 | cur_len = length; | |
634 | ||
635 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | |
636 | cur_len, gfp_flags); | |
637 | if (unlikely(ret)) | |
638 | goto out; | |
639 | ||
/* Advance to the next data device, wrapping inside this group. */
640 | dev += mirrors_p1; | |
641 | dev = (dev % devs_in_group) + first_dev; | |
642 | ||
643 | length -= cur_len; | |
644 | ios->length += cur_len; | |
645 | } | |
646 | out: | |
647 | ios->numdevs = max_comp + mirrors_p1; | |
648 | *last_pg = cur_pg; | |
649 | return ret; | |
650 | } | |
651 | ||
652 | static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) | |
653 | { | |
654 | u64 length = ios->ol_state.count; | |
655 | u64 offset = ios->ol_state.offset; | |
656 | struct _striping_info si; | |
657 | unsigned last_pg = 0; | |
658 | int ret = 0; | |
659 | ||
660 | while (length) { | |
661 | _calc_stripe_info(ios, offset, &si); | |
662 | ||
663 | if (length < si.group_length) | |
664 | si.group_length = length; | |
665 | ||
666 | ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); | |
667 | if (unlikely(ret)) | |
668 | goto out; | |
669 | ||
670 | offset += si.group_length; | |
671 | length -= si.group_length; | |
672 | } | |
673 | ||
674 | out: | |
675 | if (!ios->length) | |
676 | return ret; | |
677 | ||
678 | return 0; | |
679 | } | |
680 | ||
681 | static ssize_t _sync_done(struct objio_state *ios) | |
682 | { | |
683 | struct completion *waiting = ios->private; | |
684 | ||
685 | complete(waiting); | |
686 | return 0; | |
687 | } | |
688 | ||
689 | static void _last_io(struct kref *kref) | |
690 | { | |
691 | struct objio_state *ios = container_of(kref, struct objio_state, kref); | |
692 | ||
693 | ios->done(ios); | |
694 | } | |
695 | ||
696 | static void _done_io(struct osd_request *or, void *p) | |
697 | { | |
698 | struct objio_state *ios = p; | |
699 | ||
700 | kref_put(&ios->kref, _last_io); | |
701 | } | |
702 | ||
703 | static ssize_t _io_exec(struct objio_state *ios) | |
704 | { | |
705 | DECLARE_COMPLETION_ONSTACK(wait); | |
706 | ssize_t status = 0; /* sync status */ | |
707 | unsigned i; | |
708 | objio_done_fn saved_done_fn = ios->done; | |
709 | bool sync = ios->ol_state.sync; | |
710 | ||
711 | if (sync) { | |
712 | ios->done = _sync_done; | |
713 | ios->private = &wait; | |
714 | } | |
715 | ||
716 | kref_init(&ios->kref); | |
717 | ||
718 | for (i = 0; i < ios->numdevs; i++) { | |
719 | struct osd_request *or = ios->per_dev[i].or; | |
720 | ||
721 | if (!or) | |
722 | continue; | |
723 | ||
724 | kref_get(&ios->kref); | |
725 | osd_execute_request_async(or, _done_io, ios); | |
726 | } | |
727 | ||
728 | kref_put(&ios->kref, _last_io); | |
729 | ||
730 | if (sync) { | |
731 | wait_for_completion(&wait); | |
732 | status = saved_done_fn(ios); | |
733 | } | |
734 | ||
735 | return status; | |
736 | } | |
737 | ||
738 | /* | |
739 | * read | |
740 | */ | |
741 | static ssize_t _read_done(struct objio_state *ios) | |
742 | { | |
743 | ssize_t status; | |
744 | int ret = _io_check(ios, false); | |
745 | ||
746 | _io_free(ios); | |
747 | ||
748 | if (likely(!ret)) | |
749 | status = ios->length; | |
750 | else | |
751 | status = ret; | |
752 | ||
753 | objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); | |
754 | return status; | |
755 | } | |
756 | ||
757 | static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) | |
758 | { | |
759 | struct osd_request *or = NULL; | |
760 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | |
761 | unsigned dev = per_dev->dev; | |
762 | struct pnfs_osd_object_cred *cred = | |
763 | &ios->layout->comps[dev]; | |
764 | struct osd_obj_id obj = { | |
765 | .partition = cred->oc_object_id.oid_partition_id, | |
766 | .id = cred->oc_object_id.oid_object_id, | |
767 | }; | |
768 | int ret; | |
769 | ||
770 | or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); | |
771 | if (unlikely(!or)) { | |
772 | ret = -ENOMEM; | |
773 | goto err; | |
774 | } | |
775 | per_dev->or = or; | |
776 | ||
777 | osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); | |
778 | ||
779 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | |
780 | if (ret) { | |
781 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | |
782 | __func__, ret); | |
783 | goto err; | |
784 | } | |
785 | ||
786 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | |
787 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | |
788 | per_dev->length); | |
789 | ||
790 | err: | |
791 | return ret; | |
792 | } | |
793 | ||
794 | static ssize_t _read_exec(struct objio_state *ios) | |
795 | { | |
796 | unsigned i; | |
797 | int ret; | |
798 | ||
799 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | |
800 | if (!ios->per_dev[i].length) | |
801 | continue; | |
802 | ret = _read_mirrors(ios, i); | |
803 | if (unlikely(ret)) | |
804 | goto err; | |
805 | } | |
806 | ||
807 | ios->done = _read_done; | |
808 | return _io_exec(ios); /* In sync mode exec returns the io status */ | |
809 | ||
810 | err: | |
811 | _io_free(ios); | |
812 | return ret; | |
813 | } | |
814 | ||
815 | ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) | |
816 | { | |
817 | struct objio_state *ios = container_of(ol_state, struct objio_state, | |
818 | ol_state); | |
819 | int ret; | |
820 | ||
821 | ret = _io_rw_pagelist(ios, GFP_KERNEL); | |
822 | if (unlikely(ret)) | |
823 | return ret; | |
824 | ||
825 | return _read_exec(ios); | |
826 | } | |
827 | ||
828 | /* | |
829 | * write | |
830 | */ | |
831 | static ssize_t _write_done(struct objio_state *ios) | |
832 | { | |
833 | ssize_t status; | |
834 | int ret = _io_check(ios, true); | |
835 | ||
836 | _io_free(ios); | |
837 | ||
838 | if (likely(!ret)) { | |
839 | /* FIXME: should be based on the OSD's persistence model | |
840 | * See OSD2r05 Section 4.13 Data persistence model */ | |
841 | ios->ol_state.committed = NFS_FILE_SYNC; | |
842 | status = ios->length; | |
843 | } else { | |
844 | status = ret; | |
845 | } | |
846 | ||
847 | objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); | |
848 | return status; | |
849 | } | |
850 | ||
851 | static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) | |
852 | { | |
853 | struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; | |
854 | unsigned dev = ios->per_dev[cur_comp].dev; | |
855 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | |
856 | int ret; | |
857 | ||
858 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | |
859 | struct osd_request *or = NULL; | |
860 | struct pnfs_osd_object_cred *cred = | |
861 | &ios->layout->comps[dev]; | |
862 | struct osd_obj_id obj = { | |
863 | .partition = cred->oc_object_id.oid_partition_id, | |
864 | .id = cred->oc_object_id.oid_object_id, | |
865 | }; | |
866 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | |
867 | struct bio *bio; | |
868 | ||
869 | or = osd_start_request(_io_od(ios, dev), GFP_NOFS); | |
870 | if (unlikely(!or)) { | |
871 | ret = -ENOMEM; | |
872 | goto err; | |
873 | } | |
874 | per_dev->or = or; | |
875 | ||
876 | if (per_dev != master_dev) { | |
877 | bio = bio_kmalloc(GFP_NOFS, | |
878 | master_dev->bio->bi_max_vecs); | |
879 | if (unlikely(!bio)) { | |
880 | dprintk("Faild to allocate BIO size=%u\n", | |
881 | master_dev->bio->bi_max_vecs); | |
882 | ret = -ENOMEM; | |
883 | goto err; | |
884 | } | |
885 | ||
886 | __bio_clone(bio, master_dev->bio); | |
887 | bio->bi_bdev = NULL; | |
888 | bio->bi_next = NULL; | |
889 | per_dev->bio = bio; | |
890 | per_dev->dev = dev; | |
891 | per_dev->length = master_dev->length; | |
892 | per_dev->offset = master_dev->offset; | |
893 | } else { | |
894 | bio = master_dev->bio; | |
895 | bio->bi_rw |= REQ_WRITE; | |
896 | } | |
897 | ||
898 | osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); | |
899 | ||
900 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | |
901 | if (ret) { | |
902 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | |
903 | __func__, ret); | |
904 | goto err; | |
905 | } | |
906 | ||
907 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | |
908 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | |
909 | per_dev->length); | |
910 | } | |
911 | ||
912 | err: | |
913 | return ret; | |
914 | } | |
915 | ||
916 | static ssize_t _write_exec(struct objio_state *ios) | |
917 | { | |
918 | unsigned i; | |
919 | int ret; | |
920 | ||
921 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | |
922 | if (!ios->per_dev[i].length) | |
923 | continue; | |
924 | ret = _write_mirrors(ios, i); | |
925 | if (unlikely(ret)) | |
926 | goto err; | |
927 | } | |
928 | ||
929 | ios->done = _write_done; | |
930 | return _io_exec(ios); /* In sync mode exec returns the io->status */ | |
931 | ||
932 | err: | |
933 | _io_free(ios); | |
934 | return ret; | |
935 | } | |
936 | ||
937 | ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) | |
938 | { | |
939 | struct objio_state *ios = container_of(ol_state, struct objio_state, | |
940 | ol_state); | |
941 | int ret; | |
942 | ||
943 | /* TODO: ios->stable = stable; */ | |
944 | ret = _io_rw_pagelist(ios, GFP_NOFS); | |
945 | if (unlikely(ret)) | |
946 | return ret; | |
947 | ||
948 | return _write_exec(ios); | |
949 | } | |
950 | ||
/*
 * objlayout_pg_test(). Called by nfs_can_coalesce_requests()
 *
 * Result convention:
 *   1 - the page may be coalesced
 *   0 - the page must not be coalesced
 *
 * This implementation allows coalescing unconditionally.
 */
int
objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		  struct nfs_page *req)
{
	const int coalesce = 1;

	return coalesce;
}
c93407d0 BH |
963 | |
/*
 * Layout driver descriptor handed to pnfs_register_layoutdriver() in
 * objlayout_init() and to pnfs_unregister_layoutdriver() in
 * objlayout_exit(). Apart from free_deviceid_node and pg_test, the
 * callbacks it names are not defined in this file.
 */
964 | static struct pnfs_layoutdriver_type objlayout_type = { | |
965 | .id = LAYOUT_OSD2_OBJECTS, | |
966 | .name = "LAYOUT_OSD2_OBJECTS", | |
8a1636c4 | 967 | .flags = PNFS_LAYOUTRET_ON_SETATTR, |
09f5bf4e | 968 | |
e51b841d BH |
969 | .alloc_layout_hdr = objlayout_alloc_layout_hdr, |
970 | .free_layout_hdr = objlayout_free_layout_hdr, | |
971 | ||
09f5bf4e BH |
972 | .alloc_lseg = objlayout_alloc_lseg, |
973 | .free_lseg = objlayout_free_lseg, | |
b6c05f16 | 974 | |
04f83450 BH |
975 | .read_pagelist = objlayout_read_pagelist, |
976 | .write_pagelist = objlayout_write_pagelist, | |
977 | .pg_test = objlayout_pg_test, | |
978 | ||
/* Destructor for the objio_dev_ent entries this driver puts in the device-id cache. */
b6c05f16 | 979 | .free_deviceid_node = objio_free_deviceid_node, |
c93407d0 BH |
980 | }; |
981 | ||
/* Module metadata: description, author and license strings. */
982 | MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); | |
983 | MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); | |
984 | MODULE_LICENSE("GPL"); | |
985 | ||
986 | static int __init | |
987 | objlayout_init(void) | |
988 | { | |
989 | int ret = pnfs_register_layoutdriver(&objlayout_type); | |
990 | ||
991 | if (ret) | |
992 | printk(KERN_INFO | |
993 | "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", | |
994 | __func__, ret); | |
995 | else | |
996 | printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", | |
997 | __func__); | |
998 | return ret; | |
999 | } | |
1000 | ||
1001 | static void __exit | |
1002 | objlayout_exit(void) | |
1003 | { | |
1004 | pnfs_unregister_layoutdriver(&objlayout_type); | |
1005 | printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", | |
1006 | __func__); | |
1007 | } | |
1008 | ||
/* Hook objlayout_init()/objlayout_exit() up as the module's load and unload handlers. */
1009 | module_init(objlayout_init); | |
1010 | module_exit(objlayout_exit); |