Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux
[linux-2.6-block.git] / fs / afs / server.c
/* AFS server record management
 *
 * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
11
12 #include <linux/sched.h>
13 #include <linux/slab.h>
14 #include "afs_fs.h"
15 #include "internal.h"
16
/* Tunables, both expressed in seconds. */
static unsigned int afs_server_gc_delay = 10;		/* Idle time before an unused server record expires */
static unsigned int afs_server_update_delay = 30;	/* Interval until the next VLDB recheck */
19
20 static void afs_inc_servers_outstanding(struct afs_net *net)
21 {
22         atomic_inc(&net->servers_outstanding);
23 }
24
25 static void afs_dec_servers_outstanding(struct afs_net *net)
26 {
27         if (atomic_dec_and_test(&net->servers_outstanding))
28                 wake_up_var(&net->servers_outstanding);
29 }
30
31 /*
32  * Find a server by one of its addresses.
33  */
34 struct afs_server *afs_find_server(struct afs_net *net,
35                                    const struct sockaddr_rxrpc *srx)
36 {
37         const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
38         const struct afs_addr_list *alist;
39         struct afs_server *server = NULL;
40         unsigned int i;
41         bool ipv6 = true;
42         int seq = 0, diff;
43
44         if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 ||
45             srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 ||
46             srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff))
47                 ipv6 = false;
48
49         rcu_read_lock();
50
51         do {
52                 if (server)
53                         afs_put_server(net, server);
54                 server = NULL;
55                 read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
56
57                 if (ipv6) {
58                         hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
59                                 alist = rcu_dereference(server->addresses);
60                                 for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
61                                         b = &alist->addrs[i].transport.sin6;
62                                         diff = ((u16 __force)a->sin6_port -
63                                                 (u16 __force)b->sin6_port);
64                                         if (diff == 0)
65                                                 diff = memcmp(&a->sin6_addr,
66                                                               &b->sin6_addr,
67                                                               sizeof(struct in6_addr));
68                                         if (diff == 0)
69                                                 goto found;
70                                         if (diff < 0) {
71                                                 // TODO: Sort the list
72                                                 //if (i == alist->nr_ipv4)
73                                                 //      goto not_found;
74                                                 break;
75                                         }
76                                 }
77                         }
78                 } else {
79                         hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
80                                 alist = rcu_dereference(server->addresses);
81                                 for (i = 0; i < alist->nr_ipv4; i++) {
82                                         b = &alist->addrs[i].transport.sin6;
83                                         diff = ((u16 __force)a->sin6_port -
84                                                 (u16 __force)b->sin6_port);
85                                         if (diff == 0)
86                                                 diff = ((u32 __force)a->sin6_addr.s6_addr32[3] -
87                                                         (u32 __force)b->sin6_addr.s6_addr32[3]);
88                                         if (diff == 0)
89                                                 goto found;
90                                         if (diff < 0) {
91                                                 // TODO: Sort the list
92                                                 //if (i == 0)
93                                                 //      goto not_found;
94                                                 break;
95                                         }
96                                 }
97                         }
98                 }
99
100         //not_found:
101                 server = NULL;
102         found:
103                 if (server && !atomic_inc_not_zero(&server->usage))
104                         server = NULL;
105
106         } while (need_seqretry(&net->fs_addr_lock, seq));
107
108         done_seqretry(&net->fs_addr_lock, seq);
109
110         rcu_read_unlock();
111         return server;
112 }
113
114 /*
115  * Look up a server by its UUID
116  */
117 struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
118 {
119         struct afs_server *server = NULL;
120         struct rb_node *p;
121         int diff, seq = 0;
122
123         _enter("%pU", uuid);
124
125         do {
126                 /* Unfortunately, rbtree walking doesn't give reliable results
127                  * under just the RCU read lock, so we have to check for
128                  * changes.
129                  */
130                 if (server)
131                         afs_put_server(net, server);
132                 server = NULL;
133
134                 read_seqbegin_or_lock(&net->fs_lock, &seq);
135
136                 p = net->fs_servers.rb_node;
137                 while (p) {
138                         server = rb_entry(p, struct afs_server, uuid_rb);
139
140                         diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
141                         if (diff < 0) {
142                                 p = p->rb_left;
143                         } else if (diff > 0) {
144                                 p = p->rb_right;
145                         } else {
146                                 afs_get_server(server);
147                                 break;
148                         }
149
150                         server = NULL;
151                 }
152         } while (need_seqretry(&net->fs_lock, seq));
153
154         done_seqretry(&net->fs_lock, seq);
155
156         _leave(" = %p", server);
157         return server;
158 }
159
160 /*
161  * Install a server record in the namespace tree
162  */
163 static struct afs_server *afs_install_server(struct afs_net *net,
164                                              struct afs_server *candidate)
165 {
166         const struct afs_addr_list *alist;
167         struct afs_server *server;
168         struct rb_node **pp, *p;
169         int ret = -EEXIST, diff;
170
171         _enter("%p", candidate);
172
173         write_seqlock(&net->fs_lock);
174
175         /* Firstly install the server in the UUID lookup tree */
176         pp = &net->fs_servers.rb_node;
177         p = NULL;
178         while (*pp) {
179                 p = *pp;
180                 _debug("- consider %p", p);
181                 server = rb_entry(p, struct afs_server, uuid_rb);
182                 diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
183                 if (diff < 0)
184                         pp = &(*pp)->rb_left;
185                 else if (diff > 0)
186                         pp = &(*pp)->rb_right;
187                 else
188                         goto exists;
189         }
190
191         server = candidate;
192         rb_link_node(&server->uuid_rb, p, pp);
193         rb_insert_color(&server->uuid_rb, &net->fs_servers);
194         hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
195
196         write_seqlock(&net->fs_addr_lock);
197         alist = rcu_dereference_protected(server->addresses,
198                                           lockdep_is_held(&net->fs_addr_lock.lock));
199
200         /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
201          * it in the IPv4 and/or IPv6 reverse-map lists.
202          *
203          * TODO: For speed we want to use something other than a flat list
204          * here; even sorting the list in terms of lowest address would help a
205          * bit, but anything we might want to do gets messy and memory
206          * intensive.
207          */
208         if (alist->nr_ipv4 > 0)
209                 hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
210         if (alist->nr_addrs > alist->nr_ipv4)
211                 hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
212
213         write_sequnlock(&net->fs_addr_lock);
214         ret = 0;
215
216 exists:
217         afs_get_server(server);
218         write_sequnlock(&net->fs_lock);
219         return server;
220 }
221
222 /*
223  * allocate a new server record
224  */
225 static struct afs_server *afs_alloc_server(struct afs_net *net,
226                                            const uuid_t *uuid,
227                                            struct afs_addr_list *alist)
228 {
229         struct afs_server *server;
230
231         _enter("");
232
233         server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
234         if (!server)
235                 goto enomem;
236
237         atomic_set(&server->usage, 1);
238         RCU_INIT_POINTER(server->addresses, alist);
239         server->addr_version = alist->version;
240         server->uuid = *uuid;
241         server->flags = (1UL << AFS_SERVER_FL_NEW);
242         server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
243         rwlock_init(&server->fs_lock);
244         INIT_LIST_HEAD(&server->cb_interests);
245         rwlock_init(&server->cb_break_lock);
246
247         afs_inc_servers_outstanding(net);
248         _leave(" = %p", server);
249         return server;
250
251 enomem:
252         _leave(" = NULL [nomem]");
253         return NULL;
254 }
255
/*
 * Look up an address record for a server.
 *
 * Iterates over the cell's volume location servers, asking each in turn for
 * the address list of the fileserver identified by @uuid.  Unreachable or
 * connection-refusing VL servers are skipped; any other failure ends the
 * search.
 *
 * Returns the address list on success or an ERR_PTR() carrying the value
 * returned by afs_end_cursor() on failure.
 */
static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
						 struct key *key, const uuid_t *uuid)
{
	struct afs_addr_cursor ac;
	struct afs_addr_list *alist;
	int ret;

	ret = afs_set_vl_cursor(&ac, cell);
	if (ret < 0)
		return ERR_PTR(ret);

	while (afs_iterate_addresses(&ac)) {
		/* The per-address yfs bit selects which RPC is issued. */
		alist = test_bit(ac.index, &ac.alist->yfs) ?
			afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid) :
			afs_vl_get_addrs_u(cell->net, &ac, key, uuid);

		switch (ac.error) {
		case 0:
			afs_end_cursor(&ac);
			return alist;

		case -ENETUNREACH:
		case -EHOSTUNREACH:
		case -ECONNREFUSED:
			/* Try the next address. */
			break;

		case -ENOMEM:
		case -ENONET:
			goto fail;

		case -ECONNABORTED:
			ac.error = afs_abort_to_error(ac.abort_code);
			goto fail;

		default:
			ac.error = -EIO;
			goto fail;
		}
	}

fail:
	return ERR_PTR(afs_end_cursor(&ac));
}
298
299 /*
300  * Get or create a fileserver record.
301  */
302 struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
303                                      const uuid_t *uuid)
304 {
305         struct afs_addr_list *alist;
306         struct afs_server *server, *candidate;
307
308         _enter("%p,%pU", cell->net, uuid);
309
310         server = afs_find_server_by_uuid(cell->net, uuid);
311         if (server)
312                 return server;
313
314         alist = afs_vl_lookup_addrs(cell, key, uuid);
315         if (IS_ERR(alist))
316                 return ERR_CAST(alist);
317
318         candidate = afs_alloc_server(cell->net, uuid, alist);
319         if (!candidate) {
320                 afs_put_addrlist(alist);
321                 return ERR_PTR(-ENOMEM);
322         }
323
324         server = afs_install_server(cell->net, candidate);
325         if (server != candidate) {
326                 afs_put_addrlist(alist);
327                 kfree(candidate);
328         }
329
330         _leave(" = %p{%d}", server, atomic_read(&server->usage));
331         return server;
332 }
333
334 /*
335  * Set the server timer to fire after a given delay, assuming it's not already
336  * set for an earlier time.
337  */
338 static void afs_set_server_timer(struct afs_net *net, time64_t delay)
339 {
340         if (net->live) {
341                 afs_inc_servers_outstanding(net);
342                 if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
343                         afs_dec_servers_outstanding(net);
344         }
345 }
346
347 /*
348  * Server management timer.  We have an increment on fs_outstanding that we
349  * need to pass along to the work item.
350  */
351 void afs_servers_timer(struct timer_list *timer)
352 {
353         struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
354
355         _enter("");
356         if (!queue_work(afs_wq, &net->fs_manager))
357                 afs_dec_servers_outstanding(net);
358 }
359
360 /*
361  * Release a reference on a server record.
362  */
363 void afs_put_server(struct afs_net *net, struct afs_server *server)
364 {
365         unsigned int usage;
366
367         if (!server)
368                 return;
369
370         server->put_time = ktime_get_real_seconds();
371
372         usage = atomic_dec_return(&server->usage);
373
374         _enter("{%u}", usage);
375
376         if (likely(usage > 0))
377                 return;
378
379         afs_set_server_timer(net, afs_server_gc_delay);
380 }
381
382 static void afs_server_rcu(struct rcu_head *rcu)
383 {
384         struct afs_server *server = container_of(rcu, struct afs_server, rcu);
385
386         afs_put_addrlist(rcu_access_pointer(server->addresses));
387         kfree(server);
388 }
389
390 /*
391  * destroy a dead server
392  */
393 static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
394 {
395         struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
396         struct afs_addr_cursor ac = {
397                 .alist  = alist,
398                 .addr   = &alist->addrs[0],
399                 .start  = alist->index,
400                 .index  = alist->index,
401                 .error  = 0,
402         };
403         _enter("%p", server);
404
405         afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
406         call_rcu(&server->rcu, afs_server_rcu);
407         afs_dec_servers_outstanding(net);
408 }
409
410 /*
411  * Garbage collect any expired servers.
412  */
413 static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
414 {
415         struct afs_server *server;
416         bool deleted;
417         int usage;
418
419         while ((server = gc_list)) {
420                 gc_list = server->gc_next;
421
422                 write_seqlock(&net->fs_lock);
423                 usage = 1;
424                 deleted = atomic_try_cmpxchg(&server->usage, &usage, 0);
425                 if (deleted) {
426                         rb_erase(&server->uuid_rb, &net->fs_servers);
427                         hlist_del_rcu(&server->proc_link);
428                 }
429                 write_sequnlock(&net->fs_lock);
430
431                 if (deleted)
432                         afs_destroy_server(net, server);
433         }
434 }
435
436 /*
437  * Manage the records of servers known to be within a network namespace.  This
438  * includes garbage collecting unused servers.
439  *
440  * Note also that we were given an increment on net->servers_outstanding by
441  * whoever queued us that we need to deal with before returning.
442  */
443 void afs_manage_servers(struct work_struct *work)
444 {
445         struct afs_net *net = container_of(work, struct afs_net, fs_manager);
446         struct afs_server *gc_list = NULL;
447         struct rb_node *cursor;
448         time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
449         bool purging = !net->live;
450
451         _enter("");
452
453         /* Trawl the server list looking for servers that have expired from
454          * lack of use.
455          */
456         read_seqlock_excl(&net->fs_lock);
457
458         for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
459                 struct afs_server *server =
460                         rb_entry(cursor, struct afs_server, uuid_rb);
461                 int usage = atomic_read(&server->usage);
462
463                 _debug("manage %pU %u", &server->uuid, usage);
464
465                 ASSERTCMP(usage, >=, 1);
466                 ASSERTIFCMP(purging, usage, ==, 1);
467
468                 if (usage == 1) {
469                         time64_t expire_at = server->put_time;
470
471                         if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
472                             !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
473                                 expire_at += afs_server_gc_delay;
474                         if (purging || expire_at <= now) {
475                                 server->gc_next = gc_list;
476                                 gc_list = server;
477                         } else if (expire_at < next_manage) {
478                                 next_manage = expire_at;
479                         }
480                 }
481         }
482
483         read_sequnlock_excl(&net->fs_lock);
484
485         /* Update the timer on the way out.  We have to pass an increment on
486          * servers_outstanding in the namespace that we are in to the timer or
487          * the work scheduler.
488          */
489         if (!purging && next_manage < TIME64_MAX) {
490                 now = ktime_get_real_seconds();
491
492                 if (next_manage - now <= 0) {
493                         if (queue_work(afs_wq, &net->fs_manager))
494                                 afs_inc_servers_outstanding(net);
495                 } else {
496                         afs_set_server_timer(net, next_manage - now);
497                 }
498         }
499
500         afs_gc_servers(net, gc_list);
501
502         afs_dec_servers_outstanding(net);
503         _leave(" [%d]", atomic_read(&net->servers_outstanding));
504 }
505
506 static void afs_queue_server_manager(struct afs_net *net)
507 {
508         afs_inc_servers_outstanding(net);
509         if (!queue_work(afs_wq, &net->fs_manager))
510                 afs_dec_servers_outstanding(net);
511 }
512
513 /*
514  * Purge list of servers.
515  */
516 void afs_purge_servers(struct afs_net *net)
517 {
518         _enter("");
519
520         if (del_timer_sync(&net->fs_timer))
521                 atomic_dec(&net->servers_outstanding);
522
523         afs_queue_server_manager(net);
524
525         _debug("wait");
526         wait_var_event(&net->servers_outstanding,
527                        !atomic_read(&net->servers_outstanding));
528         _leave("");
529 }
530
/*
 * Probe a fileserver to find its capabilities.
 *
 * Resets the address cursor embedded in @fc to start at the address list's
 * current index, then issues a get-capabilities request to each address in
 * turn until one succeeds.  Unreachable, refusing or timed-out addresses are
 * skipped; any other error ends the probe.
 *
 * Returns true (and sets AFS_SERVER_FL_PROBED on the server) on success, or
 * false on failure.
 *
 * TODO: Try service upgrade.
 */
static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
{
	struct afs_addr_cursor *ac = &fc->ac;

	_enter("");

	ac->addr = NULL;
	ac->start = READ_ONCE(ac->alist->index);
	ac->index = ac->start;
	ac->error = 0;
	ac->begun = false;

	while (afs_iterate_addresses(ac)) {
		afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
					ac, fc->key);
		switch (ac->error) {
		case 0:
			afs_end_cursor(ac);
			set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
			return true;

		case -ENETUNREACH:
		case -EHOSTUNREACH:
		case -ECONNREFUSED:
		case -ETIMEDOUT:
		case -ETIME:
			/* Try the next address. */
			break;

		case -ENOMEM:
		case -ENONET:
			goto fail;

		case -ECONNABORTED:
			ac->error = afs_abort_to_error(ac->abort_code);
			goto fail;

		default:
			ac->error = -EIO;
			goto fail;
		}
	}

fail:
	afs_end_cursor(ac);
	return false;
}
576
577 /*
578  * If we haven't already, try probing the fileserver to get its capabilities.
579  * We try not to instigate parallel probes, but it's possible that the parallel
580  * probes will fail due to authentication failure when ours would succeed.
581  *
582  * TODO: Try sending an anonymous probe if an authenticated probe fails.
583  */
584 bool afs_probe_fileserver(struct afs_fs_cursor *fc)
585 {
586         bool success;
587         int ret, retries = 0;
588
589         _enter("");
590
591 retry:
592         if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
593                 _leave(" = t");
594                 return true;
595         }
596
597         if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
598                 success = afs_do_probe_fileserver(fc);
599                 clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
600                 wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
601                 _leave(" = t");
602                 return success;
603         }
604
605         _debug("wait");
606         ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
607                           TASK_INTERRUPTIBLE);
608         if (ret == -ERESTARTSYS) {
609                 fc->ac.error = ret;
610                 _leave(" = f [%d]", ret);
611                 return false;
612         }
613
614         retries++;
615         if (retries == 4) {
616                 fc->ac.error = -ESTALE;
617                 _leave(" = f [stale]");
618                 return false;
619         }
620         _debug("retry");
621         goto retry;
622 }
623
624 /*
625  * Get an update for a server's address list.
626  */
627 static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
628 {
629         struct afs_addr_list *alist, *discard;
630
631         _enter("");
632
633         alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key,
634                                     &server->uuid);
635         if (IS_ERR(alist)) {
636                 fc->ac.error = PTR_ERR(alist);
637                 _leave(" = f [%d]", fc->ac.error);
638                 return false;
639         }
640
641         discard = alist;
642         if (server->addr_version != alist->version) {
643                 write_lock(&server->fs_lock);
644                 discard = rcu_dereference_protected(server->addresses,
645                                                     lockdep_is_held(&server->fs_lock));
646                 rcu_assign_pointer(server->addresses, alist);
647                 server->addr_version = alist->version;
648                 write_unlock(&server->fs_lock);
649         }
650
651         server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
652         afs_put_addrlist(discard);
653         _leave(" = t");
654         return true;
655 }
656
657 /*
658  * See if a server's address list needs updating.
659  */
660 bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
661 {
662         time64_t now = ktime_get_real_seconds();
663         long diff;
664         bool success;
665         int ret, retries = 0;
666
667         _enter("");
668
669         ASSERT(server);
670
671 retry:
672         diff = READ_ONCE(server->update_at) - now;
673         if (diff > 0) {
674                 _leave(" = t [not now %ld]", diff);
675                 return true;
676         }
677
678         if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) {
679                 success = afs_update_server_record(fc, server);
680                 clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags);
681                 wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING);
682                 _leave(" = %d", success);
683                 return success;
684         }
685
686         ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
687                           TASK_INTERRUPTIBLE);
688         if (ret == -ERESTARTSYS) {
689                 fc->ac.error = ret;
690                 _leave(" = f [intr]");
691                 return false;
692         }
693
694         retries++;
695         if (retries == 4) {
696                 _leave(" = f [stale]");
697                 ret = -ESTALE;
698                 return false;
699         }
700         goto retry;
701 }