nfs41: pull data server cache from file layout to generic pnfs
[linux-2.6-block.git] / fs / nfs / filelayout / filelayoutdev.c
CommitLineData
16b374ca
AA
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h>
98fc685a 33#include <linux/module.h>
5976687a 34#include <linux/sunrpc/addr.h>
16b374ca 35
b5968725
TH
36#include "../internal.h"
37#include "../nfs4session.h"
38#include "filelayout.h"
16b374ca
AA
39
40#define NFSDBG_FACILITY NFSDBG_PNFS_LD
41
98fc685a
AA
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44
d83217c1
AA
45/*
46 * Create an rpc connection to the nfs4_pnfs_ds data server
35dbbc99 47 * Currently only supports IPv4 and IPv6 addresses
d83217c1
AA
48 */
49static int
50nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
51{
7e574f0d 52 struct nfs_client *clp = ERR_PTR(-EIO);
14f9a607 53 struct nfs4_pnfs_ds_addr *da;
d83217c1
AA
54 int status = 0;
55
14f9a607 56 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
d83217c1
AA
57 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
58
7e574f0d
WAA
59 list_for_each_entry(da, &ds->ds_addrs, da_node) {
60 dprintk("%s: DS %s: trying address %s\n",
61 __func__, ds->ds_remotestr, da->da_remotestr);
14f9a607 62
7e574f0d 63 clp = nfs4_set_ds_client(mds_srv->nfs_client,
98fc685a
AA
64 (struct sockaddr *)&da->da_addr,
65 da->da_addrlen, IPPROTO_TCP,
66 dataserver_timeo, dataserver_retrans);
7e574f0d
WAA
67 if (!IS_ERR(clp))
68 break;
69 }
70
d83217c1
AA
71 if (IS_ERR(clp)) {
72 status = PTR_ERR(clp);
73 goto out;
74 }
75
7b38c368 76 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
d83217c1
AA
77 if (status)
78 goto out_put;
79
acd65e5b 80 smp_wmb();
d83217c1 81 ds->ds_clp = clp;
c9895cb6 82 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
d83217c1
AA
83out:
84 return status;
85out_put:
86 nfs_put_client(clp);
87 goto out;
88}
89
1775bc34 90void
16b374ca
AA
91nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
92{
93 struct nfs4_pnfs_ds *ds;
94 int i;
95
a1eaecbc 96 nfs4_print_deviceid(&dsaddr->id_node.deviceid);
16b374ca
AA
97
98 for (i = 0; i < dsaddr->ds_num; i++) {
99 ds = dsaddr->ds_list[i];
875ae069
PT
100 if (ds != NULL)
101 nfs4_pnfs_ds_put(ds);
16b374ca
AA
102 }
103 kfree(dsaddr->stripe_indices);
104 kfree(dsaddr);
105}
106
16b374ca 107/*
c9895cb6 108 * Currently only supports ipv4, ipv6 and one multi-path address.
16b374ca 109 */
14f9a607 110static struct nfs4_pnfs_ds_addr *
17094272 111decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
16b374ca 112{
14f9a607 113 struct nfs4_pnfs_ds_addr *da = NULL;
c9895cb6 114 char *buf, *portstr;
13fff2f3 115 __be16 port;
c9895cb6 116 int nlen, rlen;
16b374ca 117 int tmp[2];
35124a09 118 __be32 *p;
c9895cb6 119 char *netid, *match_netid;
14f9a607
WAA
120 size_t len, match_netid_len;
121 char *startsep = "";
122 char *endsep = "";
123
16b374ca
AA
124
125 /* r_netid */
35124a09
WAA
126 p = xdr_inline_decode(streamp, 4);
127 if (unlikely(!p))
128 goto out_err;
16b374ca 129 nlen = be32_to_cpup(p++);
16b374ca 130
35124a09
WAA
131 p = xdr_inline_decode(streamp, nlen);
132 if (unlikely(!p))
133 goto out_err;
16b374ca 134
c9895cb6
WAA
135 netid = kmalloc(nlen+1, gfp_flags);
136 if (unlikely(!netid))
16b374ca 137 goto out_err;
16b374ca 138
c9895cb6
WAA
139 netid[nlen] = '\0';
140 memcpy(netid, p, nlen);
141
142 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
35124a09
WAA
143 p = xdr_inline_decode(streamp, 4);
144 if (unlikely(!p))
c9895cb6 145 goto out_free_netid;
35124a09
WAA
146 rlen = be32_to_cpup(p);
147
148 p = xdr_inline_decode(streamp, rlen);
149 if (unlikely(!p))
c9895cb6 150 goto out_free_netid;
35124a09 151
c9895cb6
WAA
152 /* port is ".ABC.DEF", 8 chars max */
153 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
ad3d2eed 154 dprintk("%s: Invalid address, length %d\n", __func__,
16b374ca 155 rlen);
c9895cb6 156 goto out_free_netid;
16b374ca 157 }
a75b9df9 158 buf = kmalloc(rlen + 1, gfp_flags);
b9f81057
SF
159 if (!buf) {
160 dprintk("%s: Not enough memory\n", __func__);
c9895cb6 161 goto out_free_netid;
b9f81057 162 }
16b374ca 163 buf[rlen] = '\0';
35124a09 164 memcpy(buf, p, rlen);
16b374ca 165
c9895cb6
WAA
166 /* replace port '.' with '-' */
167 portstr = strrchr(buf, '.');
168 if (!portstr) {
169 dprintk("%s: Failed finding expected dot in port\n",
170 __func__);
171 goto out_free_buf;
172 }
173 *portstr = '-';
174
175 /* find '.' between address and port */
176 portstr = strrchr(buf, '.');
177 if (!portstr) {
178 dprintk("%s: Failed finding expected dot between address and "
179 "port\n", __func__);
180 goto out_free_buf;
16b374ca 181 }
c9895cb6 182 *portstr = '\0';
16b374ca 183
14f9a607
WAA
184 da = kzalloc(sizeof(*da), gfp_flags);
185 if (unlikely(!da))
c9895cb6 186 goto out_free_buf;
14f9a607
WAA
187
188 INIT_LIST_HEAD(&da->da_node);
189
17094272 190 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
14f9a607
WAA
191 sizeof(da->da_addr))) {
192 dprintk("%s: error parsing address %s\n", __func__, buf);
193 goto out_free_da;
16b374ca
AA
194 }
195
c9895cb6
WAA
196 portstr++;
197 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
16b374ca
AA
198 port = htons((tmp[0] << 8) | (tmp[1]));
199
14f9a607 200 switch (da->da_addr.ss_family) {
c9895cb6 201 case AF_INET:
14f9a607
WAA
202 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
203 da->da_addrlen = sizeof(struct sockaddr_in);
c9895cb6
WAA
204 match_netid = "tcp";
205 match_netid_len = 3;
206 break;
207
208 case AF_INET6:
14f9a607
WAA
209 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
210 da->da_addrlen = sizeof(struct sockaddr_in6);
c9895cb6
WAA
211 match_netid = "tcp6";
212 match_netid_len = 4;
14f9a607
WAA
213 startsep = "[";
214 endsep = "]";
c9895cb6
WAA
215 break;
216
217 default:
218 dprintk("%s: unsupported address family: %u\n",
14f9a607
WAA
219 __func__, da->da_addr.ss_family);
220 goto out_free_da;
c9895cb6
WAA
221 }
222
223 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
224 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
225 __func__, netid, match_netid);
14f9a607 226 goto out_free_da;
c9895cb6
WAA
227 }
228
14f9a607
WAA
229 /* save human readable address */
230 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
231 da->da_remotestr = kzalloc(len, gfp_flags);
232
233 /* NULL is ok, only used for dprintk */
234 if (da->da_remotestr)
235 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
236 buf, endsep, ntohs(port));
237
238 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
239 kfree(buf);
240 kfree(netid);
241 return da;
242
243out_free_da:
244 kfree(da);
c9895cb6 245out_free_buf:
14f9a607 246 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
16b374ca 247 kfree(buf);
c9895cb6
WAA
248out_free_netid:
249 kfree(netid);
16b374ca 250out_err:
14f9a607 251 return NULL;
16b374ca
AA
252}
253
254/* Decode opaque device data and return the result */
661373b1
CH
255struct nfs4_file_layout_dsaddr *
256nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
257 gfp_t gfp_flags)
16b374ca 258{
35124a09 259 int i;
16b374ca
AA
260 u32 cnt, num;
261 u8 *indexp;
35124a09
WAA
262 __be32 *p;
263 u8 *stripe_indices;
264 u8 max_stripe_index;
265 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
266 struct xdr_stream stream;
f7da7a12 267 struct xdr_buf buf;
35124a09 268 struct page *scratch;
14f9a607
WAA
269 struct list_head dsaddrs;
270 struct nfs4_pnfs_ds_addr *da;
35124a09
WAA
271
272 /* set up xdr stream */
a75b9df9 273 scratch = alloc_page(gfp_flags);
35124a09
WAA
274 if (!scratch)
275 goto out_err;
276
f7da7a12 277 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
35124a09 278 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
16b374ca
AA
279
280 /* Get the stripe count (number of stripe index) */
35124a09
WAA
281 p = xdr_inline_decode(&stream, 4);
282 if (unlikely(!p))
283 goto out_err_free_scratch;
284
285 cnt = be32_to_cpup(p);
16b374ca
AA
286 dprintk("%s stripe count %d\n", __func__, cnt);
287 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
a030889a 288 printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
16b374ca
AA
289 "supported maximum %d\n", __func__,
290 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
35124a09
WAA
291 goto out_err_free_scratch;
292 }
293
294 /* read stripe indices */
a75b9df9 295 stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
35124a09
WAA
296 if (!stripe_indices)
297 goto out_err_free_scratch;
298
299 p = xdr_inline_decode(&stream, cnt << 2);
300 if (unlikely(!p))
301 goto out_err_free_stripe_indices;
302
303 indexp = &stripe_indices[0];
304 max_stripe_index = 0;
305 for (i = 0; i < cnt; i++) {
306 *indexp = be32_to_cpup(p++);
307 max_stripe_index = max(max_stripe_index, *indexp);
308 indexp++;
16b374ca
AA
309 }
310
311 /* Check the multipath list count */
35124a09
WAA
312 p = xdr_inline_decode(&stream, 4);
313 if (unlikely(!p))
314 goto out_err_free_stripe_indices;
315
316 num = be32_to_cpup(p);
16b374ca
AA
317 dprintk("%s ds_num %u\n", __func__, num);
318 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
a030889a 319 printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
16b374ca
AA
320 "supported maximum %d\n", __func__,
321 num, NFS4_PNFS_MAX_MULTI_CNT);
35124a09 322 goto out_err_free_stripe_indices;
16b374ca 323 }
35124a09
WAA
324
325 /* validate stripe indices are all < num */
326 if (max_stripe_index >= num) {
a030889a 327 printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
35124a09
WAA
328 __func__, max_stripe_index, num);
329 goto out_err_free_stripe_indices;
330 }
331
16b374ca
AA
332 dsaddr = kzalloc(sizeof(*dsaddr) +
333 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
a75b9df9 334 gfp_flags);
16b374ca 335 if (!dsaddr)
35124a09 336 goto out_err_free_stripe_indices;
16b374ca
AA
337
338 dsaddr->stripe_count = cnt;
35124a09
WAA
339 dsaddr->stripe_indices = stripe_indices;
340 stripe_indices = NULL;
16b374ca 341 dsaddr->ds_num = num;
661373b1 342 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
16b374ca 343
14f9a607
WAA
344 INIT_LIST_HEAD(&dsaddrs);
345
16b374ca
AA
346 for (i = 0; i < dsaddr->ds_num; i++) {
347 int j;
35124a09
WAA
348 u32 mp_count;
349
350 p = xdr_inline_decode(&stream, 4);
351 if (unlikely(!p))
352 goto out_err_free_deviceid;
16b374ca 353
35124a09 354 mp_count = be32_to_cpup(p); /* multipath count */
35124a09 355 for (j = 0; j < mp_count; j++) {
661373b1 356 da = decode_ds_addr(server->nfs_client->cl_net,
17094272 357 &stream, gfp_flags);
14f9a607
WAA
358 if (da)
359 list_add_tail(&da->da_node, &dsaddrs);
360 }
361 if (list_empty(&dsaddrs)) {
362 dprintk("%s: no suitable DS addresses found\n",
363 __func__);
364 goto out_err_free_deviceid;
365 }
366
367 dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
368 if (!dsaddr->ds_list[i])
369 goto out_err_drain_dsaddrs;
370
371 /* If DS was already in cache, free ds addrs */
372 while (!list_empty(&dsaddrs)) {
373 da = list_first_entry(&dsaddrs,
374 struct nfs4_pnfs_ds_addr,
375 da_node);
376 list_del_init(&da->da_node);
377 kfree(da->da_remotestr);
378 kfree(da);
16b374ca
AA
379 }
380 }
35124a09
WAA
381
382 __free_page(scratch);
16b374ca
AA
383 return dsaddr;
384
14f9a607
WAA
385out_err_drain_dsaddrs:
386 while (!list_empty(&dsaddrs)) {
387 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
388 da_node);
389 list_del_init(&da->da_node);
390 kfree(da->da_remotestr);
391 kfree(da);
392 }
35124a09 393out_err_free_deviceid:
16b374ca 394 nfs4_fl_free_deviceid(dsaddr);
35124a09
WAA
395 /* stripe_indicies was part of dsaddr */
396 goto out_err_free_scratch;
397out_err_free_stripe_indices:
398 kfree(stripe_indices);
399out_err_free_scratch:
400 __free_page(scratch);
16b374ca
AA
401out_err:
402 dprintk("%s ERROR: returning NULL\n", __func__);
403 return NULL;
404}
405
ea8eecdd
CH
406void
407nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
16b374ca 408{
1775bc34 409 nfs4_put_deviceid_node(&dsaddr->id_node);
16b374ca 410}
cfe7f412
FI
411
412/*
413 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
414 * Then: ((res + fsi) % dsaddr->stripe_count)
415 */
416u32
417nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
418{
419 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
420 u64 tmp;
421
422 tmp = offset - flseg->pattern_offset;
423 do_div(tmp, flseg->stripe_unit);
424 tmp += flseg->first_stripe_index;
425 return do_div(tmp, flseg->dsaddr->stripe_count);
426}
427
428u32
429nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
430{
431 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
432}
433
434struct nfs_fh *
435nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
436{
437 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
438 u32 i;
439
440 if (flseg->stripe_type == STRIPE_SPARSE) {
441 if (flseg->num_fh == 1)
442 i = 0;
443 else if (flseg->num_fh == 0)
444 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
445 return NULL;
446 else
447 i = nfs4_fl_calc_ds_index(lseg, j);
448 } else
449 i = j;
450 return flseg->fh_array[i];
451}
452
c23266d5
AA
453static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
454{
455 might_sleep();
74316201
N
456 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
457 nfs_wait_bit_killable, TASK_KILLABLE);
c23266d5
AA
458}
459
460static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
461{
4e857c58 462 smp_mb__before_atomic();
c23266d5 463 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
4e857c58 464 smp_mb__after_atomic();
c23266d5
AA
465 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
466}
467
468
cfe7f412
FI
469struct nfs4_pnfs_ds *
470nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
471{
472 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
473 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
554d458d 474 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
52b26a3e 475 struct nfs4_pnfs_ds *ret = ds;
cfe7f412
FI
476
477 if (ds == NULL) {
a030889a 478 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
cfe7f412 479 __func__, ds_idx);
f54bcf2e 480 pnfs_generic_mark_devid_invalid(devid);
52b26a3e 481 goto out;
cfe7f412 482 }
acd65e5b 483 smp_rmb();
c23266d5 484 if (ds->ds_clp)
52b26a3e 485 goto out_test_devid;
cfe7f412 486
c23266d5 487 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
568e8c49 488 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
cfe7f412
FI
489 int err;
490
568e8c49 491 err = nfs4_ds_connect(s, ds);
52b26a3e 492 if (err)
1dfed273 493 nfs4_mark_deviceid_unavailable(devid);
c23266d5
AA
494 nfs4_clear_ds_conn_bit(ds);
495 } else {
496 /* Either ds is connected, or ds is NULL */
497 nfs4_wait_ds_connect(ds);
cfe7f412 498 }
52b26a3e
TM
499out_test_devid:
500 if (filelayout_test_devid_unavailable(devid))
501 ret = NULL;
502out:
503 return ret;
cfe7f412 504}
98fc685a
AA
505
506module_param(dataserver_retrans, uint, 0644);
507MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
508 "retries a request before it attempts further "
509 " recovery action.");
510module_param(dataserver_timeo, uint, 0644);
511MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
512 "NFSv4.1 client waits for a response from a "
513 " data server before it retries an NFS request.");