ipc/sem.c: spelling fix
[linux-block.git] / fs / ocfs2 / dlm / dlmmaster.c
CommitLineData
328970de 1// SPDX-License-Identifier: GPL-2.0-or-later
6714d8e8
KH
2/* -*- mode: c; c-basic-offset: 8; -*-
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * dlmmod.c
6 *
7 * standalone DLM module
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
6714d8e8
KH
10 */
11
12
13#include <linux/module.h>
14#include <linux/fs.h>
15#include <linux/types.h>
16#include <linux/slab.h>
17#include <linux/highmem.h>
6714d8e8
KH
18#include <linux/init.h>
19#include <linux/sysctl.h>
20#include <linux/random.h>
21#include <linux/blkdev.h>
22#include <linux/socket.h>
23#include <linux/inet.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26
27
ca322fb6
MY
28#include "../cluster/heartbeat.h"
29#include "../cluster/nodemanager.h"
30#include "../cluster/tcp.h"
6714d8e8
KH
31
32#include "dlmapi.h"
33#include "dlmcommon.h"
82353b59 34#include "dlmdomain.h"
e5a0334c 35#include "dlmdebug.h"
6714d8e8
KH
36
37#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
ca322fb6 38#include "../cluster/masklog.h"
6714d8e8 39
6714d8e8
KH
40static void dlm_mle_node_down(struct dlm_ctxt *dlm,
41 struct dlm_master_list_entry *mle,
42 struct o2nm_node *node,
43 int idx);
44static void dlm_mle_node_up(struct dlm_ctxt *dlm,
45 struct dlm_master_list_entry *mle,
46 struct o2nm_node *node,
47 int idx);
48
49static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
ba2bf218
KH
50static int dlm_do_assert_master(struct dlm_ctxt *dlm,
51 struct dlm_lock_resource *res,
52 void *nodemap, u32 flags);
f3f85464 53static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
6714d8e8
KH
54
55static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
56 struct dlm_master_list_entry *mle,
57 const char *name,
58 unsigned int namelen)
59{
6714d8e8
KH
60 if (dlm != mle->dlm)
61 return 0;
62
7141514b
SM
63 if (namelen != mle->mnamelen ||
64 memcmp(name, mle->mname, namelen) != 0)
f77a9a78
SM
65 return 0;
66
6714d8e8
KH
67 return 1;
68}
69
1a5c4e2a
FF
70static struct kmem_cache *dlm_lockres_cache;
71static struct kmem_cache *dlm_lockname_cache;
72static struct kmem_cache *dlm_mle_cache;
6714d8e8 73
6714d8e8
KH
74static void dlm_mle_release(struct kref *kref);
75static void dlm_init_mle(struct dlm_master_list_entry *mle,
76 enum dlm_mle_type type,
77 struct dlm_ctxt *dlm,
78 struct dlm_lock_resource *res,
79 const char *name,
80 unsigned int namelen);
81static void dlm_put_mle(struct dlm_master_list_entry *mle);
82static void __dlm_put_mle(struct dlm_master_list_entry *mle);
83static int dlm_find_mle(struct dlm_ctxt *dlm,
84 struct dlm_master_list_entry **mle,
85 char *name, unsigned int namelen);
86
ba2bf218
KH
87static int dlm_do_master_request(struct dlm_lock_resource *res,
88 struct dlm_master_list_entry *mle, int to);
6714d8e8
KH
89
90
91static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
92 struct dlm_lock_resource *res,
93 struct dlm_master_list_entry *mle,
94 int *blocked);
95static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
96 struct dlm_lock_resource *res,
97 struct dlm_master_list_entry *mle,
98 int blocked);
99static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
100 struct dlm_lock_resource *res,
101 struct dlm_master_list_entry *mle,
102 struct dlm_master_list_entry **oldmle,
103 const char *name, unsigned int namelen,
104 u8 new_master, u8 master);
105
106static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
107 struct dlm_lock_resource *res);
108static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
109 struct dlm_lock_resource *res);
110static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
111 struct dlm_lock_resource *res,
112 u8 target);
c03872f5
KH
113static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
114 struct dlm_lock_resource *res);
6714d8e8
KH
115
116
/*
 * Classify a negative error code as "the remote host is unreachable".
 *
 * @err: a negated errno value (e.g. -EPIPE).
 *
 * Returns 1 if @err is one of the connection/network failure codes
 * listed below, 0 for anything else (including 0 and positive values).
 *
 * The parameter is deliberately not called "errno": that identifier is
 * reserved for the <errno.h> macro by the C standard.
 */
int dlm_is_host_down(int err)
{
	switch (err) {
	case -EBADF:
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -ECONNRESET:
	case -EPIPE:
	case -EHOSTDOWN:
	case -EHOSTUNREACH:
	case -ETIMEDOUT:
	case -ECONNABORTED:
	case -ENETDOWN:
	case -ENETUNREACH:
	case -ENETRESET:
	case -ESHUTDOWN:
	case -ENOPROTOOPT:
	case -EINVAL:   /* if returned from our tcp code,
			   this means there is no socket */
		return 1;
	default:
		return 0;
	}
}
140
141
142/*
143 * MASTER LIST FUNCTIONS
144 */
145
146
147/*
148 * regarding master list entries and heartbeat callbacks:
149 *
150 * in order to avoid sleeping and allocation that occurs in
151 * heartbeat, master list entries are simply attached to the
152 * dlm's established heartbeat callbacks. the mle is attached
153 * when it is created, and since the dlm->spinlock is held at
154 * that time, any heartbeat event will be properly discovered
155 * by the mle. the mle needs to be detached from the
156 * dlm->mle_hb_events list as soon as heartbeat events are no
157 * longer useful to the mle, and before the mle is freed.
158 *
159 * as a general rule, heartbeat events are no longer needed by
160 * the mle once an "answer" regarding the lock master has been
161 * received.
162 */
163static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
164 struct dlm_master_list_entry *mle)
165{
166 assert_spin_locked(&dlm->spinlock);
167
168 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
169}
170
171
172static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
173 struct dlm_master_list_entry *mle)
174{
175 if (!list_empty(&mle->hb_events))
176 list_del_init(&mle->hb_events);
177}
178
179
180static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
181 struct dlm_master_list_entry *mle)
182{
183 spin_lock(&dlm->spinlock);
184 __dlm_mle_detach_hb_events(dlm, mle);
185 spin_unlock(&dlm->spinlock);
186}
187
a2bf0477
KH
188static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
189{
190 struct dlm_ctxt *dlm;
191 dlm = mle->dlm;
192
193 assert_spin_locked(&dlm->spinlock);
194 assert_spin_locked(&dlm->master_lock);
195 mle->inuse++;
196 kref_get(&mle->mle_refs);
197}
198
199static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
200{
201 struct dlm_ctxt *dlm;
202 dlm = mle->dlm;
203
204 spin_lock(&dlm->spinlock);
205 spin_lock(&dlm->master_lock);
206 mle->inuse--;
207 __dlm_put_mle(mle);
208 spin_unlock(&dlm->master_lock);
209 spin_unlock(&dlm->spinlock);
210
211}
212
6714d8e8
KH
213/* remove from list and free */
214static void __dlm_put_mle(struct dlm_master_list_entry *mle)
215{
216 struct dlm_ctxt *dlm;
217 dlm = mle->dlm;
218
219 assert_spin_locked(&dlm->spinlock);
220 assert_spin_locked(&dlm->master_lock);
2c935bc5 221 if (!kref_read(&mle->mle_refs)) {
aa852354
KH
222 /* this may or may not crash, but who cares.
223 * it's a BUG. */
224 mlog(ML_ERROR, "bad mle: %p\n", mle);
225 dlm_print_one_mle(mle);
226 BUG();
227 } else
228 kref_put(&mle->mle_refs, dlm_mle_release);
6714d8e8
KH
229}
230
231
232/* must not have any spinlocks coming in */
233static void dlm_put_mle(struct dlm_master_list_entry *mle)
234{
235 struct dlm_ctxt *dlm;
236 dlm = mle->dlm;
237
238 spin_lock(&dlm->spinlock);
239 spin_lock(&dlm->master_lock);
240 __dlm_put_mle(mle);
241 spin_unlock(&dlm->master_lock);
242 spin_unlock(&dlm->spinlock);
243}
244
245static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
246{
247 kref_get(&mle->mle_refs);
248}
249
250static void dlm_init_mle(struct dlm_master_list_entry *mle,
251 enum dlm_mle_type type,
252 struct dlm_ctxt *dlm,
253 struct dlm_lock_resource *res,
254 const char *name,
255 unsigned int namelen)
256{
257 assert_spin_locked(&dlm->spinlock);
258
259 mle->dlm = dlm;
260 mle->type = type;
2ed6c750 261 INIT_HLIST_NODE(&mle->master_hash_node);
6714d8e8
KH
262 INIT_LIST_HEAD(&mle->hb_events);
263 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
264 spin_lock_init(&mle->spinlock);
265 init_waitqueue_head(&mle->wq);
266 atomic_set(&mle->woken, 0);
267 kref_init(&mle->mle_refs);
268 memset(mle->response_map, 0, sizeof(mle->response_map));
269 mle->master = O2NM_MAX_NODES;
270 mle->new_master = O2NM_MAX_NODES;
a2bf0477 271 mle->inuse = 0;
6714d8e8 272
f77a9a78
SM
273 BUG_ON(mle->type != DLM_MLE_BLOCK &&
274 mle->type != DLM_MLE_MASTER &&
275 mle->type != DLM_MLE_MIGRATION);
276
6714d8e8
KH
277 if (mle->type == DLM_MLE_MASTER) {
278 BUG_ON(!res);
7141514b
SM
279 mle->mleres = res;
280 memcpy(mle->mname, res->lockname.name, res->lockname.len);
281 mle->mnamelen = res->lockname.len;
282 mle->mnamehash = res->lockname.hash;
f77a9a78 283 } else {
6714d8e8 284 BUG_ON(!name);
7141514b
SM
285 mle->mleres = NULL;
286 memcpy(mle->mname, name, namelen);
287 mle->mnamelen = namelen;
288 mle->mnamehash = dlm_lockid_hash(name, namelen);
6714d8e8
KH
289 }
290
2041d8fd
SM
291 atomic_inc(&dlm->mle_tot_count[mle->type]);
292 atomic_inc(&dlm->mle_cur_count[mle->type]);
293
6714d8e8
KH
294 /* copy off the node_map and register hb callbacks on our copy */
295 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
296 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
297 clear_bit(dlm->node_num, mle->vote_map);
298 clear_bit(dlm->node_num, mle->node_map);
299
300 /* attach the mle to the domain node up/down events */
301 __dlm_mle_attach_hb_events(dlm, mle);
302}
303
1c084577
SM
304void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
305{
306 assert_spin_locked(&dlm->spinlock);
307 assert_spin_locked(&dlm->master_lock);
308
2ed6c750
SM
309 if (!hlist_unhashed(&mle->master_hash_node))
310 hlist_del_init(&mle->master_hash_node);
1c084577
SM
311}
312
313void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
314{
2ed6c750 315 struct hlist_head *bucket;
2ed6c750 316
1c084577
SM
317 assert_spin_locked(&dlm->master_lock);
318
7141514b 319 bucket = dlm_master_hash(dlm, mle->mnamehash);
2ed6c750 320 hlist_add_head(&mle->master_hash_node, bucket);
1c084577 321}
6714d8e8
KH
322
323/* returns 1 if found, 0 if not */
324static int dlm_find_mle(struct dlm_ctxt *dlm,
325 struct dlm_master_list_entry **mle,
326 char *name, unsigned int namelen)
327{
328 struct dlm_master_list_entry *tmpmle;
2ed6c750 329 struct hlist_head *bucket;
2ed6c750 330 unsigned int hash;
6714d8e8
KH
331
332 assert_spin_locked(&dlm->master_lock);
333
2ed6c750
SM
334 hash = dlm_lockid_hash(name, namelen);
335 bucket = dlm_master_hash(dlm, hash);
df53cd3b 336 hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
6714d8e8
KH
337 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
338 continue;
339 dlm_get_mle(tmpmle);
340 *mle = tmpmle;
341 return 1;
342 }
343 return 0;
344}
345
346void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
347{
348 struct dlm_master_list_entry *mle;
6714d8e8
KH
349
350 assert_spin_locked(&dlm->spinlock);
2bd63216 351
800deef3 352 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
6714d8e8
KH
353 if (node_up)
354 dlm_mle_node_up(dlm, mle, NULL, idx);
355 else
356 dlm_mle_node_down(dlm, mle, NULL, idx);
357 }
358}
359
360static void dlm_mle_node_down(struct dlm_ctxt *dlm,
361 struct dlm_master_list_entry *mle,
362 struct o2nm_node *node, int idx)
363{
364 spin_lock(&mle->spinlock);
365
366 if (!test_bit(idx, mle->node_map))
367 mlog(0, "node %u already removed from nodemap!\n", idx);
368 else
369 clear_bit(idx, mle->node_map);
370
371 spin_unlock(&mle->spinlock);
372}
373
374static void dlm_mle_node_up(struct dlm_ctxt *dlm,
375 struct dlm_master_list_entry *mle,
376 struct o2nm_node *node, int idx)
377{
378 spin_lock(&mle->spinlock);
379
380 if (test_bit(idx, mle->node_map))
381 mlog(0, "node %u already in node map!\n", idx);
382 else
383 set_bit(idx, mle->node_map);
384
385 spin_unlock(&mle->spinlock);
386}
387
388
389int dlm_init_mle_cache(void)
390{
12eb0035 391 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
6714d8e8
KH
392 sizeof(struct dlm_master_list_entry),
393 0, SLAB_HWCACHE_ALIGN,
20c2df83 394 NULL);
6714d8e8
KH
395 if (dlm_mle_cache == NULL)
396 return -ENOMEM;
397 return 0;
398}
399
400void dlm_destroy_mle_cache(void)
401{
a17b485a 402 kmem_cache_destroy(dlm_mle_cache);
6714d8e8
KH
403}
404
405static void dlm_mle_release(struct kref *kref)
406{
407 struct dlm_master_list_entry *mle;
408 struct dlm_ctxt *dlm;
409
6714d8e8
KH
410 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
411 dlm = mle->dlm;
412
6714d8e8
KH
413 assert_spin_locked(&dlm->spinlock);
414 assert_spin_locked(&dlm->master_lock);
415
7141514b
SM
416 mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
417 mle->type);
2ed6c750 418
6714d8e8 419 /* remove from list if not already */
1c084577 420 __dlm_unlink_mle(dlm, mle);
6714d8e8
KH
421
422 /* detach the mle from the domain node up/down events */
423 __dlm_mle_detach_hb_events(dlm, mle);
424
2041d8fd
SM
425 atomic_dec(&dlm->mle_cur_count[mle->type]);
426
6714d8e8
KH
427 /* NOTE: kfree under spinlock here.
428 * if this is bad, we can move this to a freelist. */
429 kmem_cache_free(dlm_mle_cache, mle);
430}
431
432
433/*
434 * LOCK RESOURCE FUNCTIONS
435 */
436
724bdca9
SM
437int dlm_init_master_caches(void)
438{
439 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
440 sizeof(struct dlm_lock_resource),
441 0, SLAB_HWCACHE_ALIGN, NULL);
442 if (!dlm_lockres_cache)
443 goto bail;
444
445 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
446 DLM_LOCKID_NAME_MAX, 0,
447 SLAB_HWCACHE_ALIGN, NULL);
448 if (!dlm_lockname_cache)
449 goto bail;
450
451 return 0;
452bail:
453 dlm_destroy_master_caches();
454 return -ENOMEM;
455}
456
457void dlm_destroy_master_caches(void)
458{
a17b485a 459 kmem_cache_destroy(dlm_lockname_cache);
460 dlm_lockname_cache = NULL;
724bdca9 461
a17b485a 462 kmem_cache_destroy(dlm_lockres_cache);
463 dlm_lockres_cache = NULL;
724bdca9
SM
464}
465
6714d8e8
KH
466static void dlm_lockres_release(struct kref *kref)
467{
468 struct dlm_lock_resource *res;
b0d4f817 469 struct dlm_ctxt *dlm;
6714d8e8
KH
470
471 res = container_of(kref, struct dlm_lock_resource, refs);
b0d4f817 472 dlm = res->dlm;
6714d8e8
KH
473
474 /* This should not happen -- all lockres' have a name
475 * associated with them at init time. */
476 BUG_ON(!res->lockname.name);
477
478 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
479 res->lockname.name);
480
6800791a
SM
481 atomic_dec(&dlm->res_cur_count);
482
a7f90d83
KH
483 if (!hlist_unhashed(&res->hash_node) ||
484 !list_empty(&res->granted) ||
485 !list_empty(&res->converting) ||
486 !list_empty(&res->blocked) ||
487 !list_empty(&res->dirty) ||
488 !list_empty(&res->recovering) ||
489 !list_empty(&res->purge)) {
490 mlog(ML_ERROR,
491 "Going to BUG for resource %.*s."
492 " We're on a list! [%c%c%c%c%c%c%c]\n",
493 res->lockname.len, res->lockname.name,
494 !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
495 !list_empty(&res->granted) ? 'G' : ' ',
496 !list_empty(&res->converting) ? 'C' : ' ',
497 !list_empty(&res->blocked) ? 'B' : ' ',
498 !list_empty(&res->dirty) ? 'D' : ' ',
499 !list_empty(&res->recovering) ? 'R' : ' ',
500 !list_empty(&res->purge) ? 'P' : ' ');
501
502 dlm_print_one_lock_resource(res);
503 }
504
6714d8e8
KH
505 /* By the time we're ready to blow this guy away, we shouldn't
506 * be on any lists. */
81f2094a 507 BUG_ON(!hlist_unhashed(&res->hash_node));
6714d8e8
KH
508 BUG_ON(!list_empty(&res->granted));
509 BUG_ON(!list_empty(&res->converting));
510 BUG_ON(!list_empty(&res->blocked));
511 BUG_ON(!list_empty(&res->dirty));
512 BUG_ON(!list_empty(&res->recovering));
513 BUG_ON(!list_empty(&res->purge));
514
724bdca9 515 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
6714d8e8 516
724bdca9 517 kmem_cache_free(dlm_lockres_cache, res);
6714d8e8
KH
518}
519
6714d8e8
KH
520void dlm_lockres_put(struct dlm_lock_resource *res)
521{
522 kref_put(&res->refs, dlm_lockres_release);
523}
524
525static void dlm_init_lockres(struct dlm_ctxt *dlm,
526 struct dlm_lock_resource *res,
527 const char *name, unsigned int namelen)
528{
529 char *qname;
530
531 /* If we memset here, we lose our reference to the kmalloc'd
532 * res->lockname.name, so be sure to init every field
533 * correctly! */
534
535 qname = (char *) res->lockname.name;
536 memcpy(qname, name, namelen);
537
538 res->lockname.len = namelen;
a3d33291 539 res->lockname.hash = dlm_lockid_hash(name, namelen);
6714d8e8
KH
540
541 init_waitqueue_head(&res->wq);
542 spin_lock_init(&res->spinlock);
81f2094a 543 INIT_HLIST_NODE(&res->hash_node);
6714d8e8
KH
544 INIT_LIST_HEAD(&res->granted);
545 INIT_LIST_HEAD(&res->converting);
546 INIT_LIST_HEAD(&res->blocked);
547 INIT_LIST_HEAD(&res->dirty);
548 INIT_LIST_HEAD(&res->recovering);
549 INIT_LIST_HEAD(&res->purge);
29576f8b 550 INIT_LIST_HEAD(&res->tracking);
6714d8e8
KH
551 atomic_set(&res->asts_reserved, 0);
552 res->migration_pending = 0;
ba2bf218 553 res->inflight_locks = 0;
ac4fef4d 554 res->inflight_assert_workers = 0;
6714d8e8 555
b0d4f817
SM
556 res->dlm = dlm;
557
6714d8e8
KH
558 kref_init(&res->refs);
559
6800791a
SM
560 atomic_inc(&dlm->res_tot_count);
561 atomic_inc(&dlm->res_cur_count);
562
6714d8e8
KH
563 /* just for consistency */
564 spin_lock(&res->spinlock);
565 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
566 spin_unlock(&res->spinlock);
567
568 res->state = DLM_LOCK_RES_IN_PROGRESS;
569
570 res->last_used = 0;
571
cbe355f5 572 spin_lock(&dlm->track_lock);
29576f8b 573 list_add_tail(&res->tracking, &dlm->tracking_list);
cbe355f5 574 spin_unlock(&dlm->track_lock);
29576f8b 575
6714d8e8 576 memset(res->lvb, 0, DLM_LVB_LEN);
ba2bf218 577 memset(res->refmap, 0, sizeof(res->refmap));
6714d8e8
KH
578}
579
580struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
581 const char *name,
582 unsigned int namelen)
583{
724bdca9 584 struct dlm_lock_resource *res = NULL;
6714d8e8 585
3914ed0c 586 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
6714d8e8 587 if (!res)
724bdca9 588 goto error;
6714d8e8 589
3914ed0c 590 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
724bdca9
SM
591 if (!res->lockname.name)
592 goto error;
6714d8e8
KH
593
594 dlm_init_lockres(dlm, res, name, namelen);
595 return res;
724bdca9
SM
596
597error:
724bdca9
SM
598 if (res)
599 kmem_cache_free(dlm_lockres_cache, res);
600 return NULL;
6714d8e8
KH
601}
602
8d400b81
SM
603void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
604 struct dlm_lock_resource *res, int bit)
ba2bf218 605{
8d400b81
SM
606 assert_spin_locked(&res->spinlock);
607
608 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
609 res->lockname.name, bit, __builtin_return_address(0));
610
611 set_bit(bit, res->refmap);
612}
613
614void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
615 struct dlm_lock_resource *res, int bit)
616{
617 assert_spin_locked(&res->spinlock);
618
619 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
620 res->lockname.name, bit, __builtin_return_address(0));
621
622 clear_bit(bit, res->refmap);
623}
624
5760a97c 625static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
8d400b81
SM
626 struct dlm_lock_resource *res)
627{
ba2bf218 628 res->inflight_locks++;
ff0a522e 629
8d400b81
SM
630 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
631 res->lockname.len, res->lockname.name, res->inflight_locks,
632 __builtin_return_address(0));
ba2bf218
KH
633}
634
5760a97c
JQ
635void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
636 struct dlm_lock_resource *res)
637{
638 assert_spin_locked(&res->spinlock);
639 __dlm_lockres_grab_inflight_ref(dlm, res);
640}
641
8d400b81
SM
642void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
643 struct dlm_lock_resource *res)
ba2bf218
KH
644{
645 assert_spin_locked(&res->spinlock);
646
647 BUG_ON(res->inflight_locks == 0);
8d400b81 648
ba2bf218 649 res->inflight_locks--;
ff0a522e 650
8d400b81
SM
651 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
652 res->lockname.len, res->lockname.name, res->inflight_locks,
653 __builtin_return_address(0));
654
ba2bf218
KH
655 wake_up(&res->wq);
656}
657
ac4fef4d
X
658void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
659 struct dlm_lock_resource *res)
660{
661 assert_spin_locked(&res->spinlock);
662 res->inflight_assert_workers++;
663 mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
664 dlm->name, res->lockname.len, res->lockname.name,
665 res->inflight_assert_workers);
666}
667
ac4fef4d
X
668static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
669 struct dlm_lock_resource *res)
670{
671 assert_spin_locked(&res->spinlock);
672 BUG_ON(res->inflight_assert_workers == 0);
673 res->inflight_assert_workers--;
674 mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
675 dlm->name, res->lockname.len, res->lockname.name,
676 res->inflight_assert_workers);
677}
678
679static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
680 struct dlm_lock_resource *res)
681{
682 spin_lock(&res->spinlock);
683 __dlm_lockres_drop_inflight_worker(dlm, res);
684 spin_unlock(&res->spinlock);
685}
686
6714d8e8
KH
687/*
688 * lookup a lock resource by name.
689 * may already exist in the hashtable.
690 * lockid is null terminated
691 *
692 * if not, allocate enough for the lockres and for
693 * the temporary structure used in doing the mastering.
694 *
695 * also, do a lookup in the dlm->master_list to see
696 * if another node has begun mastering the same lock.
697 * if so, there should be a block entry in there
698 * for this name, and we should *not* attempt to master
699 * the lock here. need to wait around for that node
700 * to assert_master (or die).
701 *
702 */
703struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
704 const char *lockid,
3384f3df 705 int namelen,
6714d8e8
KH
706 int flags)
707{
708 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
709 struct dlm_master_list_entry *mle = NULL;
710 struct dlm_master_list_entry *alloc_mle = NULL;
711 int blocked = 0;
712 int ret, nodenum;
713 struct dlm_node_iter iter;
3384f3df 714 unsigned int hash;
6714d8e8 715 int tries = 0;
c03872f5 716 int bit, wait_on_recovery = 0;
6714d8e8
KH
717
718 BUG_ON(!lockid);
719
a3d33291 720 hash = dlm_lockid_hash(lockid, namelen);
6714d8e8
KH
721
722 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
723
724lookup:
725 spin_lock(&dlm->spinlock);
ba2bf218 726 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
6714d8e8 727 if (tmpres) {
7b791d68 728 spin_unlock(&dlm->spinlock);
ba2bf218 729 spin_lock(&tmpres->spinlock);
b1432a2a
JB
730
731 /*
732 * Right after dlm spinlock was released, dlm_thread could have
733 * purged the lockres. Check if lockres got unhashed. If so
734 * start over.
735 */
736 if (hlist_unhashed(&tmpres->hash_node)) {
737 spin_unlock(&tmpres->spinlock);
738 dlm_lockres_put(tmpres);
739 tmpres = NULL;
740 goto lookup;
741 }
742
ff0a522e 743 /* Wait on the thread that is mastering the resource */
7b791d68
SM
744 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
745 __dlm_wait_on_lockres(tmpres);
746 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
ff0a522e
SM
747 spin_unlock(&tmpres->spinlock);
748 dlm_lockres_put(tmpres);
749 tmpres = NULL;
750 goto lookup;
7b791d68
SM
751 }
752
ff0a522e
SM
753 /* Wait on the resource purge to complete before continuing */
754 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
755 BUG_ON(tmpres->owner == dlm->node_num);
756 __dlm_wait_on_lockres_flags(tmpres,
757 DLM_LOCK_RES_DROPPING_REF);
ba2bf218
KH
758 spin_unlock(&tmpres->spinlock);
759 dlm_lockres_put(tmpres);
760 tmpres = NULL;
761 goto lookup;
762 }
763
ff0a522e
SM
764 /* Grab inflight ref to pin the resource */
765 dlm_lockres_grab_inflight_ref(dlm, tmpres);
766
767 spin_unlock(&tmpres->spinlock);
f57a22dd
YJ
768 if (res) {
769 spin_lock(&dlm->track_lock);
770 if (!list_empty(&res->tracking))
771 list_del_init(&res->tracking);
772 else
773 mlog(ML_ERROR, "Resource %.*s not "
774 "on the Tracking list\n",
775 res->lockname.len,
776 res->lockname.name);
777 spin_unlock(&dlm->track_lock);
6714d8e8 778 dlm_lockres_put(res);
f57a22dd 779 }
6714d8e8
KH
780 res = tmpres;
781 goto leave;
782 }
783
784 if (!res) {
785 spin_unlock(&dlm->spinlock);
786 mlog(0, "allocating a new resource\n");
787 /* nothing found and we need to allocate one. */
3914ed0c 788 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
6714d8e8
KH
789 if (!alloc_mle)
790 goto leave;
791 res = dlm_new_lockres(dlm, lockid, namelen);
792 if (!res)
793 goto leave;
794 goto lookup;
795 }
796
797 mlog(0, "no lockres found, allocated our own: %p\n", res);
798
799 if (flags & LKM_LOCAL) {
800 /* caller knows it's safe to assume it's not mastered elsewhere
801 * DONE! return right away */
802 spin_lock(&res->spinlock);
803 dlm_change_lockres_owner(dlm, res, dlm->node_num);
804 __dlm_insert_lockres(dlm, res);
ba2bf218 805 dlm_lockres_grab_inflight_ref(dlm, res);
6714d8e8
KH
806 spin_unlock(&res->spinlock);
807 spin_unlock(&dlm->spinlock);
808 /* lockres still marked IN_PROGRESS */
809 goto wake_waiters;
810 }
811
812 /* check master list to see if another node has started mastering it */
813 spin_lock(&dlm->master_lock);
814
815 /* if we found a block, wait for lock to be mastered by another node */
816 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
817 if (blocked) {
ba2bf218 818 int mig;
6714d8e8
KH
819 if (mle->type == DLM_MLE_MASTER) {
820 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
821 BUG();
ba2bf218
KH
822 }
823 mig = (mle->type == DLM_MLE_MIGRATION);
824 /* if there is a migration in progress, let the migration
825 * finish before continuing. we can wait for the absence
826 * of the MIGRATION mle: either the migrate finished or
827 * one of the nodes died and the mle was cleaned up.
828 * if there is a BLOCK here, but it already has a master
829 * set, we are too late. the master does not have a ref
830 * for us in the refmap. detach the mle and drop it.
831 * either way, go back to the top and start over. */
832 if (mig || mle->master != O2NM_MAX_NODES) {
833 BUG_ON(mig && mle->master == dlm->node_num);
834 /* we arrived too late. the master does not
835 * have a ref for us. retry. */
836 mlog(0, "%s:%.*s: late on %s\n",
837 dlm->name, namelen, lockid,
838 mig ? "MIGRATION" : "BLOCK");
6714d8e8 839 spin_unlock(&dlm->master_lock);
6714d8e8
KH
840 spin_unlock(&dlm->spinlock);
841
842 /* master is known, detach */
ba2bf218
KH
843 if (!mig)
844 dlm_mle_detach_hb_events(dlm, mle);
6714d8e8
KH
845 dlm_put_mle(mle);
846 mle = NULL;
25985edc 847 /* this is lame, but we can't wait on either
ba2bf218
KH
848 * the mle or lockres waitqueue here */
849 if (mig)
850 msleep(100);
851 goto lookup;
6714d8e8
KH
852 }
853 } else {
854 /* go ahead and try to master lock on this node */
855 mle = alloc_mle;
856 /* make sure this does not get freed below */
857 alloc_mle = NULL;
858 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
859 set_bit(dlm->node_num, mle->maybe_map);
1c084577 860 __dlm_insert_mle(dlm, mle);
c03872f5
KH
861
862 /* still holding the dlm spinlock, check the recovery map
2bd63216 863 * to see if there are any nodes that still need to be
c03872f5
KH
864 * considered. these will not appear in the mle nodemap
865 * but they might own this lockres. wait on them. */
866 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
867 if (bit < O2NM_MAX_NODES) {
8decab3c
SM
868 mlog(0, "%s: res %.*s, At least one node (%d) "
869 "to recover before lock mastery can begin\n",
c03872f5
KH
870 dlm->name, namelen, (char *)lockid, bit);
871 wait_on_recovery = 1;
872 }
6714d8e8
KH
873 }
874
875 /* at this point there is either a DLM_MLE_BLOCK or a
876 * DLM_MLE_MASTER on the master list, so it's safe to add the
877 * lockres to the hashtable. anyone who finds the lock will
878 * still have to wait on the IN_PROGRESS. */
879
880 /* finally add the lockres to its hash bucket */
881 __dlm_insert_lockres(dlm, res);
8d400b81 882
5760a97c
JQ
883 /* since this lockres is new it doesn't not require the spinlock */
884 __dlm_lockres_grab_inflight_ref(dlm, res);
ba2bf218 885
6714d8e8
KH
886 /* get an extra ref on the mle in case this is a BLOCK
887 * if so, the creator of the BLOCK may try to put the last
888 * ref at this time in the assert master handler, so we
889 * need an extra one to keep from a bad ptr deref. */
a2bf0477 890 dlm_get_mle_inuse(mle);
6714d8e8
KH
891 spin_unlock(&dlm->master_lock);
892 spin_unlock(&dlm->spinlock);
893
e7e69eb3 894redo_request:
c03872f5
KH
895 while (wait_on_recovery) {
896 /* any cluster changes that occurred after dropping the
897 * dlm spinlock would be detectable be a change on the mle,
898 * so we only need to clear out the recovery map once. */
899 if (dlm_is_recovery_lock(lockid, namelen)) {
8decab3c
SM
900 mlog(0, "%s: Recovery map is not empty, but must "
901 "master $RECOVERY lock now\n", dlm->name);
c03872f5
KH
902 if (!dlm_pre_master_reco_lockres(dlm, res))
903 wait_on_recovery = 0;
904 else {
905 mlog(0, "%s: waiting 500ms for heartbeat state "
906 "change\n", dlm->name);
907 msleep(500);
908 }
909 continue;
2bd63216 910 }
c03872f5
KH
911
912 dlm_kick_recovery_thread(dlm);
aa087b84 913 msleep(1000);
c03872f5
KH
914 dlm_wait_for_recovery(dlm);
915
916 spin_lock(&dlm->spinlock);
917 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
918 if (bit < O2NM_MAX_NODES) {
8decab3c
SM
919 mlog(0, "%s: res %.*s, At least one node (%d) "
920 "to recover before lock mastery can begin\n",
c03872f5
KH
921 dlm->name, namelen, (char *)lockid, bit);
922 wait_on_recovery = 1;
923 } else
924 wait_on_recovery = 0;
925 spin_unlock(&dlm->spinlock);
b7084ab5
KH
926
927 if (wait_on_recovery)
928 dlm_wait_for_node_recovery(dlm, bit, 10000);
c03872f5
KH
929 }
930
6714d8e8
KH
931 /* must wait for lock to be mastered elsewhere */
932 if (blocked)
933 goto wait;
934
6714d8e8
KH
935 ret = -EINVAL;
936 dlm_node_iter_init(mle->vote_map, &iter);
937 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
ba2bf218 938 ret = dlm_do_master_request(res, mle, nodenum);
6714d8e8
KH
939 if (ret < 0)
940 mlog_errno(ret);
941 if (mle->master != O2NM_MAX_NODES) {
942 /* found a master ! */
9c6510a5
KH
943 if (mle->master <= nodenum)
944 break;
945 /* if our master request has not reached the master
946 * yet, keep going until it does. this is how the
947 * master will know that asserts are needed back to
948 * the lower nodes. */
8decab3c
SM
949 mlog(0, "%s: res %.*s, Requests only up to %u but "
950 "master is %u, keep going\n", dlm->name, namelen,
9c6510a5 951 lockid, nodenum, mle->master);
6714d8e8
KH
952 }
953 }
954
955wait:
956 /* keep going until the response map includes all nodes */
957 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
958 if (ret < 0) {
e7e69eb3 959 wait_on_recovery = 1;
8decab3c
SM
960 mlog(0, "%s: res %.*s, Node map changed, redo the master "
961 "request now, blocked=%d\n", dlm->name, res->lockname.len,
6714d8e8
KH
962 res->lockname.name, blocked);
963 if (++tries > 20) {
8decab3c
SM
964 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
965 "dlm_wait_for_lock_mastery, blocked = %d\n",
2bd63216 966 dlm->name, res->lockname.len,
6714d8e8
KH
967 res->lockname.name, blocked);
968 dlm_print_one_lock_resource(res);
8a9343fa 969 dlm_print_one_mle(mle);
6714d8e8
KH
970 tries = 0;
971 }
972 goto redo_request;
973 }
974
8decab3c
SM
975 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
976 res->lockname.name, res->owner);
6714d8e8
KH
977 /* make sure we never continue without this */
978 BUG_ON(res->owner == O2NM_MAX_NODES);
979
980 /* master is known, detach if not already detached */
981 dlm_mle_detach_hb_events(dlm, mle);
982 dlm_put_mle(mle);
983 /* put the extra ref */
a2bf0477 984 dlm_put_mle_inuse(mle);
6714d8e8
KH
985
986wake_waiters:
987 spin_lock(&res->spinlock);
988 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
989 spin_unlock(&res->spinlock);
990 wake_up(&res->wq);
991
992leave:
993 /* need to free the unused mle */
994 if (alloc_mle)
995 kmem_cache_free(dlm_mle_cache, alloc_mle);
996
997 return res;
998}
999
1000
1001#define DLM_MASTERY_TIMEOUT_MS 5000
1002
1003static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
1004 struct dlm_lock_resource *res,
1005 struct dlm_master_list_entry *mle,
1006 int *blocked)
1007{
1008 u8 m;
1009 int ret, bit;
1010 int map_changed, voting_done;
1011 int assert, sleep;
1012
1013recheck:
1014 ret = 0;
1015 assert = 0;
1016
1017 /* check if another node has already become the owner */
1018 spin_lock(&res->spinlock);
1019 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
9c6510a5
KH
1020 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
1021 res->lockname.len, res->lockname.name, res->owner);
6714d8e8 1022 spin_unlock(&res->spinlock);
9c6510a5
KH
1023 /* this will cause the master to re-assert across
1024 * the whole cluster, freeing up mles */
588e0090 1025 if (res->owner != dlm->node_num) {
ba2bf218 1026 ret = dlm_do_master_request(res, mle, res->owner);
588e0090
KH
1027 if (ret < 0) {
1028 /* give recovery a chance to run */
1029 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
1030 msleep(500);
1031 goto recheck;
1032 }
9c6510a5
KH
1033 }
1034 ret = 0;
6714d8e8
KH
1035 goto leave;
1036 }
1037 spin_unlock(&res->spinlock);
1038
1039 spin_lock(&mle->spinlock);
1040 m = mle->master;
1041 map_changed = (memcmp(mle->vote_map, mle->node_map,
1042 sizeof(mle->vote_map)) != 0);
1043 voting_done = (memcmp(mle->vote_map, mle->response_map,
1044 sizeof(mle->vote_map)) == 0);
1045
1046 /* restart if we hit any errors */
1047 if (map_changed) {
1048 int b;
1049 mlog(0, "%s: %.*s: node map changed, restarting\n",
1050 dlm->name, res->lockname.len, res->lockname.name);
1051 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1052 b = (mle->type == DLM_MLE_BLOCK);
1053 if ((*blocked && !b) || (!*blocked && b)) {
2bd63216 1054 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
6714d8e8
KH
1055 dlm->name, res->lockname.len, res->lockname.name,
1056 *blocked, b);
1057 *blocked = b;
1058 }
1059 spin_unlock(&mle->spinlock);
1060 if (ret < 0) {
1061 mlog_errno(ret);
1062 goto leave;
1063 }
1064 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
1065 "rechecking now\n", dlm->name, res->lockname.len,
1066 res->lockname.name);
1067 goto recheck;
aa852354
KH
1068 } else {
1069 if (!voting_done) {
1070 mlog(0, "map not changed and voting not done "
1071 "for %s:%.*s\n", dlm->name, res->lockname.len,
1072 res->lockname.name);
1073 }
6714d8e8
KH
1074 }
1075
1076 if (m != O2NM_MAX_NODES) {
1077 /* another node has done an assert!
1078 * all done! */
1079 sleep = 0;
1080 } else {
1081 sleep = 1;
1082 /* have all nodes responded? */
1083 if (voting_done && !*blocked) {
1084 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
1085 if (dlm->node_num <= bit) {
1086 /* my node number is lowest.
1087 * now tell other nodes that I am
1088 * mastering this. */
1089 mle->master = dlm->node_num;
ba2bf218
KH
1090 /* ref was grabbed in get_lock_resource
1091 * will be dropped in dlmlock_master */
6714d8e8
KH
1092 assert = 1;
1093 sleep = 0;
1094 }
1095 /* if voting is done, but we have not received
1096 * an assert master yet, we must sleep */
1097 }
1098 }
1099
1100 spin_unlock(&mle->spinlock);
1101
1102 /* sleep if we haven't finished voting yet */
1103 if (sleep) {
1104 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
6714d8e8
KH
1105 atomic_set(&mle->woken, 0);
1106 (void)wait_event_timeout(mle->wq,
1107 (atomic_read(&mle->woken) == 1),
1108 timeo);
1109 if (res->owner == O2NM_MAX_NODES) {
ba2bf218
KH
1110 mlog(0, "%s:%.*s: waiting again\n", dlm->name,
1111 res->lockname.len, res->lockname.name);
6714d8e8
KH
1112 goto recheck;
1113 }
1114 mlog(0, "done waiting, master is %u\n", res->owner);
1115 ret = 0;
1116 goto leave;
1117 }
1118
1119 ret = 0; /* done */
1120 if (assert) {
1121 m = dlm->node_num;
1122 mlog(0, "about to master %.*s here, this=%u\n",
1123 res->lockname.len, res->lockname.name, m);
ba2bf218 1124 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
6714d8e8
KH
1125 if (ret) {
1126 /* This is a failure in the network path,
1127 * not in the response to the assert_master
1128 * (any nonzero response is a BUG on this node).
1129 * Most likely a socket just got disconnected
1130 * due to node death. */
1131 mlog_errno(ret);
1132 }
1133 /* no longer need to restart lock mastery.
1134 * all living nodes have been contacted. */
1135 ret = 0;
1136 }
1137
1138 /* set the lockres owner */
1139 spin_lock(&res->spinlock);
ba2bf218
KH
1140 /* mastery reference obtained either during
1141 * assert_master_handler or in get_lock_resource */
6714d8e8
KH
1142 dlm_change_lockres_owner(dlm, res, m);
1143 spin_unlock(&res->spinlock);
1144
1145leave:
1146 return ret;
1147}
1148
1149struct dlm_bitmap_diff_iter
1150{
1151 int curnode;
1152 unsigned long *orig_bm;
1153 unsigned long *cur_bm;
1154 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
1155};
1156
/*
 * Direction of a node's membership change as reported by
 * dlm_bitmap_diff_iter_next().
 */
enum dlm_node_state_change {
	NODE_DOWN = -1,		/* bit was set in the original bitmap */
	NODE_NO_CHANGE = 0,
	NODE_UP = 1,		/* bit was clear in the original bitmap */
};
1163
1164static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
1165 unsigned long *orig_bm,
1166 unsigned long *cur_bm)
1167{
1168 unsigned long p1, p2;
1169 int i;
1170
1171 iter->curnode = -1;
1172 iter->orig_bm = orig_bm;
1173 iter->cur_bm = cur_bm;
1174
1175 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1176 p1 = *(iter->orig_bm + i);
1177 p2 = *(iter->cur_bm + i);
1178 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1179 }
1180}
1181
1182static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1183 enum dlm_node_state_change *state)
1184{
1185 int bit;
1186
1187 if (iter->curnode >= O2NM_MAX_NODES)
1188 return -ENOENT;
1189
1190 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1191 iter->curnode+1);
1192 if (bit >= O2NM_MAX_NODES) {
1193 iter->curnode = O2NM_MAX_NODES;
1194 return -ENOENT;
1195 }
1196
1197 /* if it was there in the original then this node died */
1198 if (test_bit(bit, iter->orig_bm))
1199 *state = NODE_DOWN;
1200 else
1201 *state = NODE_UP;
1202
1203 iter->curnode = bit;
1204 return bit;
1205}
1206
1207
1208static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1209 struct dlm_lock_resource *res,
1210 struct dlm_master_list_entry *mle,
1211 int blocked)
1212{
1213 struct dlm_bitmap_diff_iter bdi;
1214 enum dlm_node_state_change sc;
1215 int node;
1216 int ret = 0;
1217
1218 mlog(0, "something happened such that the "
1219 "master process may need to be restarted!\n");
1220
1221 assert_spin_locked(&mle->spinlock);
1222
1223 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1224 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1225 while (node >= 0) {
1226 if (sc == NODE_UP) {
e2faea4c
KH
1227 /* a node came up. clear any old vote from
1228 * the response map and set it in the vote map
1229 * then restart the mastery. */
1230 mlog(ML_NOTICE, "node %d up while restarting\n", node);
6714d8e8
KH
1231
1232 /* redo the master request, but only for the new node */
1233 mlog(0, "sending request to new node\n");
1234 clear_bit(node, mle->response_map);
1235 set_bit(node, mle->vote_map);
1236 } else {
1237 mlog(ML_ERROR, "node down! %d\n", node);
6714d8e8
KH
1238 if (blocked) {
1239 int lowest = find_next_bit(mle->maybe_map,
1240 O2NM_MAX_NODES, 0);
1241
1242 /* act like it was never there */
1243 clear_bit(node, mle->maybe_map);
1244
e7e69eb3
KH
1245 if (node == lowest) {
1246 mlog(0, "expected master %u died"
1247 " while this node was blocked "
1248 "waiting on it!\n", node);
1249 lowest = find_next_bit(mle->maybe_map,
1250 O2NM_MAX_NODES,
1251 lowest+1);
1252 if (lowest < O2NM_MAX_NODES) {
1253 mlog(0, "%s:%.*s:still "
1254 "blocked. waiting on %u "
1255 "now\n", dlm->name,
1256 res->lockname.len,
1257 res->lockname.name,
1258 lowest);
1259 } else {
1260 /* mle is an MLE_BLOCK, but
1261 * there is now nothing left to
1262 * block on. we need to return
1263 * all the way back out and try
1264 * again with an MLE_MASTER.
1265 * dlm_do_local_recovery_cleanup
1266 * has already run, so the mle
1267 * refcount is ok */
1268 mlog(0, "%s:%.*s: no "
1269 "longer blocking. try to "
1270 "master this here\n",
1271 dlm->name,
1272 res->lockname.len,
1273 res->lockname.name);
1274 mle->type = DLM_MLE_MASTER;
7141514b 1275 mle->mleres = res;
e7e69eb3 1276 }
6714d8e8 1277 }
6714d8e8
KH
1278 }
1279
e7e69eb3
KH
1280 /* now blank out everything, as if we had never
1281 * contacted anyone */
1282 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1283 memset(mle->response_map, 0, sizeof(mle->response_map));
1284 /* reset the vote_map to the current node_map */
1285 memcpy(mle->vote_map, mle->node_map,
1286 sizeof(mle->node_map));
1287 /* put myself into the maybe map */
1288 if (mle->type != DLM_MLE_BLOCK)
1289 set_bit(dlm->node_num, mle->maybe_map);
6714d8e8
KH
1290 }
1291 ret = -EAGAIN;
6714d8e8
KH
1292 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1293 }
1294 return ret;
1295}
1296
1297
1298/*
1299 * DLM_MASTER_REQUEST_MSG
1300 *
1301 * returns: 0 on success,
1302 * -errno on a network error
1303 *
1304 * on error, the caller should assume the target node is "dead"
1305 *
1306 */
1307
ba2bf218
KH
1308static int dlm_do_master_request(struct dlm_lock_resource *res,
1309 struct dlm_master_list_entry *mle, int to)
6714d8e8
KH
1310{
1311 struct dlm_ctxt *dlm = mle->dlm;
1312 struct dlm_master_request request;
1313 int ret, response=0, resend;
1314
1315 memset(&request, 0, sizeof(request));
1316 request.node_idx = dlm->node_num;
1317
1318 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1319
7141514b
SM
1320 request.namelen = (u8)mle->mnamelen;
1321 memcpy(request.name, mle->mname, request.namelen);
6714d8e8
KH
1322
1323again:
1324 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1325 sizeof(request), to, &response);
1326 if (ret < 0) {
1327 if (ret == -ESRCH) {
1328 /* should never happen */
1329 mlog(ML_ERROR, "TCP stack not ready!\n");
1330 BUG();
1331 } else if (ret == -EINVAL) {
1332 mlog(ML_ERROR, "bad args passed to o2net!\n");
1333 BUG();
1334 } else if (ret == -ENOMEM) {
1335 mlog(ML_ERROR, "out of memory while trying to send "
1336 "network message! retrying\n");
1337 /* this is totally crude */
1338 msleep(50);
1339 goto again;
1340 } else if (!dlm_is_host_down(ret)) {
1341 /* not a network error. bad. */
1342 mlog_errno(ret);
1343 mlog(ML_ERROR, "unhandled error!");
1344 BUG();
1345 }
1346 /* all other errors should be network errors,
1347 * and likely indicate node death */
1348 mlog(ML_ERROR, "link to %d went down!\n", to);
1349 goto out;
1350 }
1351
1352 ret = 0;
1353 resend = 0;
1354 spin_lock(&mle->spinlock);
1355 switch (response) {
1356 case DLM_MASTER_RESP_YES:
1357 set_bit(to, mle->response_map);
1358 mlog(0, "node %u is the master, response=YES\n", to);
ba2bf218
KH
1359 mlog(0, "%s:%.*s: master node %u now knows I have a "
1360 "reference\n", dlm->name, res->lockname.len,
1361 res->lockname.name, to);
6714d8e8
KH
1362 mle->master = to;
1363 break;
1364 case DLM_MASTER_RESP_NO:
1365 mlog(0, "node %u not master, response=NO\n", to);
1366 set_bit(to, mle->response_map);
1367 break;
1368 case DLM_MASTER_RESP_MAYBE:
1369 mlog(0, "node %u not master, response=MAYBE\n", to);
1370 set_bit(to, mle->response_map);
1371 set_bit(to, mle->maybe_map);
1372 break;
1373 case DLM_MASTER_RESP_ERROR:
1374 mlog(0, "node %u hit an error, resending\n", to);
1375 resend = 1;
1376 response = 0;
1377 break;
1378 default:
1379 mlog(ML_ERROR, "bad response! %u\n", response);
1380 BUG();
1381 }
1382 spin_unlock(&mle->spinlock);
1383 if (resend) {
1384 /* this is also totally crude */
1385 msleep(50);
1386 goto again;
1387 }
1388
1389out:
1390 return ret;
1391}
1392
1393/*
1394 * locks that can be taken here:
1395 * dlm->spinlock
1396 * res->spinlock
1397 * mle->spinlock
1398 * dlm->master_list
1399 *
1400 * if possible, TRIM THIS DOWN!!!
1401 */
d74c9803
KH
1402int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1403 void **ret_data)
6714d8e8
KH
1404{
1405 u8 response = DLM_MASTER_RESP_MAYBE;
1406 struct dlm_ctxt *dlm = data;
9c6510a5 1407 struct dlm_lock_resource *res = NULL;
6714d8e8
KH
1408 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1409 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1410 char *name;
a3d33291 1411 unsigned int namelen, hash;
6714d8e8
KH
1412 int found, ret;
1413 int set_maybe;
9c6510a5 1414 int dispatch_assert = 0;
012572d4 1415 int dispatched = 0;
6714d8e8
KH
1416
1417 if (!dlm_grab(dlm))
1418 return DLM_MASTER_RESP_NO;
1419
1420 if (!dlm_domain_fully_joined(dlm)) {
1421 response = DLM_MASTER_RESP_NO;
1422 goto send_response;
1423 }
1424
1425 name = request->name;
1426 namelen = request->namelen;
a3d33291 1427 hash = dlm_lockid_hash(name, namelen);
6714d8e8
KH
1428
1429 if (namelen > DLM_LOCKID_NAME_MAX) {
1430 response = DLM_IVBUFLEN;
1431 goto send_response;
1432 }
1433
1434way_up_top:
1435 spin_lock(&dlm->spinlock);
a3d33291 1436 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
6714d8e8
KH
1437 if (res) {
1438 spin_unlock(&dlm->spinlock);
1439
1440 /* take care of the easy cases up front */
1441 spin_lock(&res->spinlock);
cb79662b
SE
1442
1443 /*
1444 * Right after dlm spinlock was released, dlm_thread could have
1445 * purged the lockres. Check if lockres got unhashed. If so
1446 * start over.
1447 */
1448 if (hlist_unhashed(&res->hash_node)) {
1449 spin_unlock(&res->spinlock);
1450 dlm_lockres_put(res);
1451 goto way_up_top;
1452 }
1453
1cd04dbe
KH
1454 if (res->state & (DLM_LOCK_RES_RECOVERING|
1455 DLM_LOCK_RES_MIGRATING)) {
6714d8e8
KH
1456 spin_unlock(&res->spinlock);
1457 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1cd04dbe 1458 "being recovered/migrated\n");
6714d8e8
KH
1459 response = DLM_MASTER_RESP_ERROR;
1460 if (mle)
1461 kmem_cache_free(dlm_mle_cache, mle);
1462 goto send_response;
1463 }
1464
1465 if (res->owner == dlm->node_num) {
8d400b81 1466 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
6714d8e8 1467 spin_unlock(&res->spinlock);
6714d8e8
KH
1468 response = DLM_MASTER_RESP_YES;
1469 if (mle)
1470 kmem_cache_free(dlm_mle_cache, mle);
1471
1472 /* this node is the owner.
1473 * there is some extra work that needs to
1474 * happen now. the requesting node has
1475 * caused all nodes up to this one to
1476 * create mles. this node now needs to
1477 * go back and clean those up. */
9c6510a5 1478 dispatch_assert = 1;
6714d8e8
KH
1479 goto send_response;
1480 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1481 spin_unlock(&res->spinlock);
1482 // mlog(0, "node %u is the master\n", res->owner);
1483 response = DLM_MASTER_RESP_NO;
1484 if (mle)
1485 kmem_cache_free(dlm_mle_cache, mle);
1486 goto send_response;
1487 }
1488
1489 /* ok, there is no owner. either this node is
1490 * being blocked, or it is actively trying to
1491 * master this lock. */
1492 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1493 mlog(ML_ERROR, "lock with no owner should be "
1494 "in-progress!\n");
1495 BUG();
1496 }
1497
1498 // mlog(0, "lockres is in progress...\n");
1499 spin_lock(&dlm->master_lock);
1500 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1501 if (!found) {
1502 mlog(ML_ERROR, "no mle found for this lock!\n");
1503 BUG();
1504 }
1505 set_maybe = 1;
1506 spin_lock(&tmpmle->spinlock);
1507 if (tmpmle->type == DLM_MLE_BLOCK) {
1508 // mlog(0, "this node is waiting for "
1509 // "lockres to be mastered\n");
1510 response = DLM_MASTER_RESP_NO;
1511 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1512 mlog(0, "node %u is master, but trying to migrate to "
1513 "node %u.\n", tmpmle->master, tmpmle->new_master);
1514 if (tmpmle->master == dlm->node_num) {
6714d8e8
KH
1515 mlog(ML_ERROR, "no owner on lockres, but this "
1516 "node is trying to migrate it to %u?!\n",
1517 tmpmle->new_master);
1518 BUG();
1519 } else {
1520 /* the real master can respond on its own */
1521 response = DLM_MASTER_RESP_NO;
1522 }
1523 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1524 set_maybe = 0;
9c6510a5 1525 if (tmpmle->master == dlm->node_num) {
6714d8e8 1526 response = DLM_MASTER_RESP_YES;
9c6510a5
KH
1527 /* this node will be the owner.
1528 * go back and clean the mles on any
1529 * other nodes */
1530 dispatch_assert = 1;
8d400b81
SM
1531 dlm_lockres_set_refmap_bit(dlm, res,
1532 request->node_idx);
9c6510a5 1533 } else
6714d8e8
KH
1534 response = DLM_MASTER_RESP_NO;
1535 } else {
1536 // mlog(0, "this node is attempting to "
1537 // "master lockres\n");
1538 response = DLM_MASTER_RESP_MAYBE;
1539 }
1540 if (set_maybe)
1541 set_bit(request->node_idx, tmpmle->maybe_map);
1542 spin_unlock(&tmpmle->spinlock);
1543
1544 spin_unlock(&dlm->master_lock);
1545 spin_unlock(&res->spinlock);
1546
1547 /* keep the mle attached to heartbeat events */
1548 dlm_put_mle(tmpmle);
1549 if (mle)
1550 kmem_cache_free(dlm_mle_cache, mle);
1551 goto send_response;
1552 }
1553
1554 /*
1555 * lockres doesn't exist on this node
1556 * if there is an MLE_BLOCK, return NO
1557 * if there is an MLE_MASTER, return MAYBE
1558 * otherwise, add an MLE_BLOCK, return NO
1559 */
1560 spin_lock(&dlm->master_lock);
1561 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1562 if (!found) {
1563 /* this lockid has never been seen on this node yet */
1564 // mlog(0, "no mle found\n");
1565 if (!mle) {
1566 spin_unlock(&dlm->master_lock);
1567 spin_unlock(&dlm->spinlock);
1568
3914ed0c 1569 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
6714d8e8 1570 if (!mle) {
6714d8e8 1571 response = DLM_MASTER_RESP_ERROR;
9c6510a5 1572 mlog_errno(-ENOMEM);
6714d8e8
KH
1573 goto send_response;
1574 }
6714d8e8
KH
1575 goto way_up_top;
1576 }
1577
1578 // mlog(0, "this is second time thru, already allocated, "
1579 // "add the block.\n");
41b8c8a1 1580 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
6714d8e8 1581 set_bit(request->node_idx, mle->maybe_map);
1c084577 1582 __dlm_insert_mle(dlm, mle);
6714d8e8
KH
1583 response = DLM_MASTER_RESP_NO;
1584 } else {
6714d8e8 1585 spin_lock(&tmpmle->spinlock);
9c6510a5
KH
1586 if (tmpmle->master == dlm->node_num) {
1587 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
1588 BUG();
1589 }
6714d8e8
KH
1590 if (tmpmle->type == DLM_MLE_BLOCK)
1591 response = DLM_MASTER_RESP_NO;
1592 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1593 mlog(0, "migration mle was found (%u->%u)\n",
1594 tmpmle->master, tmpmle->new_master);
6714d8e8
KH
1595 /* real master can respond on its own */
1596 response = DLM_MASTER_RESP_NO;
9c6510a5
KH
1597 } else
1598 response = DLM_MASTER_RESP_MAYBE;
aa7b5859 1599 set_bit(request->node_idx, tmpmle->maybe_map);
6714d8e8
KH
1600 spin_unlock(&tmpmle->spinlock);
1601 }
1602 spin_unlock(&dlm->master_lock);
1603 spin_unlock(&dlm->spinlock);
1604
1605 if (found) {
1606 /* keep the mle attached to heartbeat events */
1607 dlm_put_mle(tmpmle);
1608 }
1609send_response:
b31cfc02
SM
1610 /*
1611 * __dlm_lookup_lockres() grabbed a reference to this lockres.
1612 * The reference is released by dlm_assert_master_worker() under
1613 * the call to dlm_dispatch_assert_master(). If
1614 * dlm_assert_master_worker() isn't called, we drop it here.
1615 */
9c6510a5 1616 if (dispatch_assert) {
9c6510a5
KH
1617 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1618 dlm->node_num, res->lockname.len, res->lockname.name);
1e589581 1619 spin_lock(&res->spinlock);
2bd63216 1620 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
9c6510a5
KH
1621 DLM_ASSERT_MASTER_MLE_CLEANUP);
1622 if (ret < 0) {
1623 mlog(ML_ERROR, "failed to dispatch assert master work\n");
1624 response = DLM_MASTER_RESP_ERROR;
b67de018 1625 spin_unlock(&res->spinlock);
b31cfc02 1626 dlm_lockres_put(res);
012572d4
JQ
1627 } else {
1628 dispatched = 1;
1e589581 1629 __dlm_lockres_grab_inflight_worker(dlm, res);
b67de018 1630 spin_unlock(&res->spinlock);
012572d4 1631 }
b31cfc02
SM
1632 } else {
1633 if (res)
1634 dlm_lockres_put(res);
9c6510a5
KH
1635 }
1636
012572d4
JQ
1637 if (!dispatched)
1638 dlm_put(dlm);
6714d8e8
KH
1639 return response;
1640}
1641
1642/*
1643 * DLM_ASSERT_MASTER_MSG
1644 */
1645
1646
1647/*
1648 * NOTE: this can be used for debugging
1649 * can periodically run all locks owned by this node
1650 * and re-assert across the cluster...
1651 */
05488bbe
AB
1652static int dlm_do_assert_master(struct dlm_ctxt *dlm,
1653 struct dlm_lock_resource *res,
1654 void *nodemap, u32 flags)
6714d8e8
KH
1655{
1656 struct dlm_assert_master assert;
1657 int to, tmpret;
1658 struct dlm_node_iter iter;
1659 int ret = 0;
9c6510a5 1660 int reassert;
ba2bf218
KH
1661 const char *lockname = res->lockname.name;
1662 unsigned int namelen = res->lockname.len;
6714d8e8
KH
1663
1664 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
f3f85464
SM
1665
1666 spin_lock(&res->spinlock);
1667 res->state |= DLM_LOCK_RES_SETREF_INPROG;
1668 spin_unlock(&res->spinlock);
1669
9c6510a5
KH
1670again:
1671 reassert = 0;
6714d8e8
KH
1672
1673 /* note that if this nodemap is empty, it returns 0 */
1674 dlm_node_iter_init(nodemap, &iter);
1675 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1676 int r = 0;
a9ee4c8a
KH
1677 struct dlm_master_list_entry *mle = NULL;
1678
6714d8e8
KH
1679 mlog(0, "sending assert master to %d (%.*s)\n", to,
1680 namelen, lockname);
1681 memset(&assert, 0, sizeof(assert));
1682 assert.node_idx = dlm->node_num;
1683 assert.namelen = namelen;
1684 memcpy(assert.name, lockname, namelen);
1685 assert.flags = cpu_to_be32(flags);
1686
1687 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1688 &assert, sizeof(assert), to, &r);
1689 if (tmpret < 0) {
a5196ec5
WW
1690 mlog(ML_ERROR, "Error %d when sending message %u (key "
1691 "0x%x) to node %u\n", tmpret,
1692 DLM_ASSERT_MASTER_MSG, dlm->key, to);
6714d8e8 1693 if (!dlm_is_host_down(tmpret)) {
3b3b84a8 1694 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
6714d8e8
KH
1695 BUG();
1696 }
1697 /* a node died. finish out the rest of the nodes. */
3b3b84a8 1698 mlog(0, "link to %d went down!\n", to);
6714d8e8
KH
1699 /* any nonzero status return will do */
1700 ret = tmpret;
ba2bf218 1701 r = 0;
6714d8e8
KH
1702 } else if (r < 0) {
1703 /* ok, something horribly messed. kill thyself. */
1704 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1705 "got %d.\n", namelen, lockname, to, r);
a9ee4c8a
KH
1706 spin_lock(&dlm->spinlock);
1707 spin_lock(&dlm->master_lock);
1708 if (dlm_find_mle(dlm, &mle, (char *)lockname,
1709 namelen)) {
1710 dlm_print_one_mle(mle);
1711 __dlm_put_mle(mle);
1712 }
1713 spin_unlock(&dlm->master_lock);
1714 spin_unlock(&dlm->spinlock);
6714d8e8 1715 BUG();
ba2bf218
KH
1716 }
1717
1718 if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1719 !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1720 mlog(ML_ERROR, "%.*s: very strange, "
1721 "master MLE but no lockres on %u\n",
1722 namelen, lockname, to);
1723 }
1724
1725 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
9c6510a5 1726 mlog(0, "%.*s: node %u create mles on other "
2bd63216 1727 "nodes and requests a re-assert\n",
9c6510a5
KH
1728 namelen, lockname, to);
1729 reassert = 1;
6714d8e8 1730 }
ba2bf218
KH
1731 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1732 mlog(0, "%.*s: node %u has a reference to this "
1733 "lockres, set the bit in the refmap\n",
1734 namelen, lockname, to);
1735 spin_lock(&res->spinlock);
8d400b81 1736 dlm_lockres_set_refmap_bit(dlm, res, to);
ba2bf218
KH
1737 spin_unlock(&res->spinlock);
1738 }
6714d8e8
KH
1739 }
1740
9c6510a5
KH
1741 if (reassert)
1742 goto again;
1743
f3f85464
SM
1744 spin_lock(&res->spinlock);
1745 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1746 spin_unlock(&res->spinlock);
1747 wake_up(&res->wq);
1748
6714d8e8
KH
1749 return ret;
1750}
1751
1752/*
1753 * locks that can be taken here:
1754 * dlm->spinlock
1755 * res->spinlock
1756 * mle->spinlock
1757 * dlm->master_list
1758 *
1759 * if possible, TRIM THIS DOWN!!!
1760 */
d74c9803
KH
1761int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1762 void **ret_data)
6714d8e8
KH
1763{
1764 struct dlm_ctxt *dlm = data;
1765 struct dlm_master_list_entry *mle = NULL;
1766 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1767 struct dlm_lock_resource *res = NULL;
1768 char *name;
a3d33291 1769 unsigned int namelen, hash;
6714d8e8 1770 u32 flags;
ba2bf218 1771 int master_request = 0, have_lockres_ref = 0;
9c6510a5 1772 int ret = 0;
6714d8e8
KH
1773
1774 if (!dlm_grab(dlm))
1775 return 0;
1776
1777 name = assert->name;
1778 namelen = assert->namelen;
a3d33291 1779 hash = dlm_lockid_hash(name, namelen);
6714d8e8
KH
1780 flags = be32_to_cpu(assert->flags);
1781
1782 if (namelen > DLM_LOCKID_NAME_MAX) {
1783 mlog(ML_ERROR, "Invalid name length!");
1784 goto done;
1785 }
1786
1787 spin_lock(&dlm->spinlock);
1788
1789 if (flags)
1790 mlog(0, "assert_master with flags: %u\n", flags);
1791
1792 /* find the MLE */
1793 spin_lock(&dlm->master_lock);
1794 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1795 /* not an error, could be master just re-asserting */
1796 mlog(0, "just got an assert_master from %u, but no "
1797 "MLE for it! (%.*s)\n", assert->node_idx,
1798 namelen, name);
1799 } else {
1800 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1801 if (bit >= O2NM_MAX_NODES) {
1802 /* not necessarily an error, though less likely.
1803 * could be master just re-asserting. */
aa852354 1804 mlog(0, "no bits set in the maybe_map, but %u "
6714d8e8
KH
1805 "is asserting! (%.*s)\n", assert->node_idx,
1806 namelen, name);
1807 } else if (bit != assert->node_idx) {
1808 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1809 mlog(0, "master %u was found, %u should "
1810 "back off\n", assert->node_idx, bit);
1811 } else {
1812 /* with the fix for bug 569, a higher node
1813 * number winning the mastery will respond
1814 * YES to mastery requests, but this node
1815 * had no way of knowing. let it pass. */
aa852354 1816 mlog(0, "%u is the lowest node, "
6714d8e8
KH
1817 "%u is asserting. (%.*s) %u must "
1818 "have begun after %u won.\n", bit,
1819 assert->node_idx, namelen, name, bit,
1820 assert->node_idx);
1821 }
1822 }
2d1a868c
KH
1823 if (mle->type == DLM_MLE_MIGRATION) {
1824 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1825 mlog(0, "%s:%.*s: got cleanup assert"
1826 " from %u for migration\n",
1827 dlm->name, namelen, name,
1828 assert->node_idx);
1829 } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1830 mlog(0, "%s:%.*s: got unrelated assert"
1831 " from %u for migration, ignoring\n",
1832 dlm->name, namelen, name,
1833 assert->node_idx);
1834 __dlm_put_mle(mle);
1835 spin_unlock(&dlm->master_lock);
1836 spin_unlock(&dlm->spinlock);
1837 goto done;
2bd63216 1838 }
2d1a868c 1839 }
6714d8e8
KH
1840 }
1841 spin_unlock(&dlm->master_lock);
1842
1843 /* ok everything checks out with the MLE
1844 * now check to see if there is a lockres */
a3d33291 1845 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
6714d8e8
KH
1846 if (res) {
1847 spin_lock(&res->spinlock);
1848 if (res->state & DLM_LOCK_RES_RECOVERING) {
1849 mlog(ML_ERROR, "%u asserting but %.*s is "
1850 "RECOVERING!\n", assert->node_idx, namelen, name);
1851 goto kill;
1852 }
1853 if (!mle) {
dc2ed195
KH
1854 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1855 res->owner != assert->node_idx) {
53ecd25e
SM
1856 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1857 "but current owner is %u! (%.*s)\n",
1858 assert->node_idx, res->owner, namelen,
1859 name);
1860 __dlm_print_one_lock_resource(res);
1861 BUG();
6714d8e8
KH
1862 }
1863 } else if (mle->type != DLM_MLE_MIGRATION) {
1864 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1865 /* owner is just re-asserting */
1866 if (res->owner == assert->node_idx) {
1867 mlog(0, "owner %u re-asserting on "
1868 "lock %.*s\n", assert->node_idx,
1869 namelen, name);
1870 goto ok;
1871 }
1872 mlog(ML_ERROR, "got assert_master from "
1873 "node %u, but %u is the owner! "
1874 "(%.*s)\n", assert->node_idx,
1875 res->owner, namelen, name);
1876 goto kill;
1877 }
1878 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1879 mlog(ML_ERROR, "got assert from %u, but lock "
1880 "with no owner should be "
1881 "in-progress! (%.*s)\n",
1882 assert->node_idx,
1883 namelen, name);
1884 goto kill;
1885 }
1886 } else /* mle->type == DLM_MLE_MIGRATION */ {
1887 /* should only be getting an assert from new master */
1888 if (assert->node_idx != mle->new_master) {
1889 mlog(ML_ERROR, "got assert from %u, but "
1890 "new master is %u, and old master "
1891 "was %u (%.*s)\n",
1892 assert->node_idx, mle->new_master,
1893 mle->master, namelen, name);
1894 goto kill;
1895 }
1896
1897 }
1898ok:
1899 spin_unlock(&res->spinlock);
1900 }
6714d8e8
KH
1901
1902 // mlog(0, "woo! got an assert_master from node %u!\n",
1903 // assert->node_idx);
1904 if (mle) {
9c6510a5
KH
1905 int extra_ref = 0;
1906 int nn = -1;
a2bf0477 1907 int rr, err = 0;
2bd63216 1908
6714d8e8 1909 spin_lock(&mle->spinlock);
9c6510a5
KH
1910 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1911 extra_ref = 1;
1912 else {
1913 /* MASTER mle: if any bits set in the response map
1914 * then the calling node needs to re-assert to clear
1915 * up nodes that this node contacted */
2bd63216 1916 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
9c6510a5 1917 nn+1)) < O2NM_MAX_NODES) {
728b9805 1918 if (nn != dlm->node_num && nn != assert->node_idx) {
9c6510a5 1919 master_request = 1;
728b9805
JB
1920 break;
1921 }
9c6510a5
KH
1922 }
1923 }
6714d8e8
KH
1924 mle->master = assert->node_idx;
1925 atomic_set(&mle->woken, 1);
1926 wake_up(&mle->wq);
1927 spin_unlock(&mle->spinlock);
1928
a2bf0477 1929 if (res) {
a6fa3640 1930 int wake = 0;
6714d8e8 1931 spin_lock(&res->spinlock);
a2bf0477
KH
1932 if (mle->type == DLM_MLE_MIGRATION) {
1933 mlog(0, "finishing off migration of lockres %.*s, "
1934 "from %u to %u\n",
1935 res->lockname.len, res->lockname.name,
1936 dlm->node_num, mle->new_master);
1937 res->state &= ~DLM_LOCK_RES_MIGRATING;
a6fa3640 1938 wake = 1;
a2bf0477
KH
1939 dlm_change_lockres_owner(dlm, res, mle->new_master);
1940 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1941 } else {
1942 dlm_change_lockres_owner(dlm, res, mle->master);
1943 }
6714d8e8 1944 spin_unlock(&res->spinlock);
ba2bf218 1945 have_lockres_ref = 1;
a6fa3640
KH
1946 if (wake)
1947 wake_up(&res->wq);
6714d8e8 1948 }
a2bf0477
KH
1949
1950 /* master is known, detach if not already detached.
1951 * ensures that only one assert_master call will happen
1952 * on this mle. */
a2bf0477
KH
1953 spin_lock(&dlm->master_lock);
1954
2c935bc5 1955 rr = kref_read(&mle->mle_refs);
a2bf0477
KH
1956 if (mle->inuse > 0) {
1957 if (extra_ref && rr < 3)
1958 err = 1;
1959 else if (!extra_ref && rr < 2)
1960 err = 1;
1961 } else {
1962 if (extra_ref && rr < 2)
1963 err = 1;
1964 else if (!extra_ref && rr < 1)
1965 err = 1;
1966 }
1967 if (err) {
1968 mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1969 "that will mess up this node, refs=%d, extra=%d, "
1970 "inuse=%d\n", dlm->name, namelen, name,
1971 assert->node_idx, rr, extra_ref, mle->inuse);
1972 dlm_print_one_mle(mle);
1973 }
1c084577 1974 __dlm_unlink_mle(dlm, mle);
a2bf0477
KH
1975 __dlm_mle_detach_hb_events(dlm, mle);
1976 __dlm_put_mle(mle);
6714d8e8
KH
1977 if (extra_ref) {
1978 /* the assert master message now balances the extra
1979 * ref given by the master / migration request message.
1980 * if this is the last put, it will be removed
1981 * from the list. */
a2bf0477
KH
1982 __dlm_put_mle(mle);
1983 }
1984 spin_unlock(&dlm->master_lock);
a2bf0477
KH
1985 } else if (res) {
1986 if (res->owner != assert->node_idx) {
1987 mlog(0, "assert_master from %u, but current "
1988 "owner is %u (%.*s), no mle\n", assert->node_idx,
1989 res->owner, namelen, name);
6714d8e8
KH
1990 }
1991 }
14741472 1992 spin_unlock(&dlm->spinlock);
6714d8e8
KH
1993
1994done:
9c6510a5 1995 ret = 0;
3b8118cf
KH
1996 if (res) {
1997 spin_lock(&res->spinlock);
1998 res->state |= DLM_LOCK_RES_SETREF_INPROG;
1999 spin_unlock(&res->spinlock);
2000 *ret_data = (void *)res;
2001 }
6714d8e8 2002 dlm_put(dlm);
9c6510a5
KH
2003 if (master_request) {
2004 mlog(0, "need to tell master to reassert\n");
ba2bf218
KH
2005 /* positive. negative would shoot down the node. */
2006 ret |= DLM_ASSERT_RESPONSE_REASSERT;
2007 if (!have_lockres_ref) {
2008 mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2009 "mle present here for %s:%.*s, but no lockres!\n",
2010 assert->node_idx, dlm->name, namelen, name);
2011 }
2012 }
2013 if (have_lockres_ref) {
2014 /* let the master know we have a reference to the lockres */
2015 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2016 mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2017 dlm->name, namelen, name, assert->node_idx);
9c6510a5
KH
2018 }
2019 return ret;
6714d8e8
KH
2020
2021kill:
2022 /* kill the caller! */
a9ee4c8a
KH
2023 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
2024 "and killing the other node now! This node is OK and can continue.\n");
2025 __dlm_print_one_lock_resource(res);
6714d8e8 2026 spin_unlock(&res->spinlock);
55dacd22 2027 spin_lock(&dlm->master_lock);
2028 if (mle)
2029 __dlm_put_mle(mle);
2030 spin_unlock(&dlm->master_lock);
6714d8e8 2031 spin_unlock(&dlm->spinlock);
2bd63216 2032 *ret_data = (void *)res;
6714d8e8
KH
2033 dlm_put(dlm);
2034 return -EINVAL;
2035}
2036
3b8118cf
KH
2037void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2038{
2039 struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2040
2041 if (ret_data) {
2042 spin_lock(&res->spinlock);
2043 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2044 spin_unlock(&res->spinlock);
2045 wake_up(&res->wq);
2046 dlm_lockres_put(res);
2047 }
2048 return;
2049}
2050
6714d8e8
KH
2051int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2052 struct dlm_lock_resource *res,
2053 int ignore_higher, u8 request_from, u32 flags)
2054{
2055 struct dlm_work_item *item;
b24ae0b5 2056 item = kzalloc(sizeof(*item), GFP_ATOMIC);
6714d8e8
KH
2057 if (!item)
2058 return -ENOMEM;
2059
2060
2061 /* queue up work for dlm_assert_master_worker */
6714d8e8
KH
2062 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
2063 item->u.am.lockres = res; /* already have a ref */
2064 /* can optionally ignore node numbers higher than this node */
2065 item->u.am.ignore_higher = ignore_higher;
2066 item->u.am.request_from = request_from;
2067 item->u.am.flags = flags;
2068
2bd63216
SM
2069 if (ignore_higher)
2070 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
9c6510a5 2071 res->lockname.name);
2bd63216 2072
6714d8e8
KH
2073 spin_lock(&dlm->work_lock);
2074 list_add_tail(&item->list, &dlm->work_list);
2075 spin_unlock(&dlm->work_lock);
2076
3156d267 2077 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
6714d8e8
KH
2078 return 0;
2079}
2080
2081static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2082{
2083 struct dlm_ctxt *dlm = data;
2084 int ret = 0;
2085 struct dlm_lock_resource *res;
2086 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
2087 int ignore_higher;
2088 int bit;
2089 u8 request_from;
2090 u32 flags;
2091
2092 dlm = item->dlm;
2093 res = item->u.am.lockres;
2094 ignore_higher = item->u.am.ignore_higher;
2095 request_from = item->u.am.request_from;
2096 flags = item->u.am.flags;
2097
2098 spin_lock(&dlm->spinlock);
2099 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
2100 spin_unlock(&dlm->spinlock);
2101
2102 clear_bit(dlm->node_num, nodemap);
2103 if (ignore_higher) {
2104 /* if is this just to clear up mles for nodes below
2105 * this node, do not send the message to the original
2106 * caller or any node number higher than this */
2107 clear_bit(request_from, nodemap);
2108 bit = dlm->node_num;
2109 while (1) {
2110 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
2111 bit+1);
2112 if (bit >= O2NM_MAX_NODES)
2113 break;
2114 clear_bit(bit, nodemap);
2115 }
2116 }
2117
36407488
KH
2118 /*
2119 * If we're migrating this lock to someone else, we are no
2120 * longer allowed to assert out own mastery. OTOH, we need to
2121 * prevent migration from starting while we're still asserting
2122 * our dominance. The reserved ast delays migration.
2123 */
2124 spin_lock(&res->spinlock);
2125 if (res->state & DLM_LOCK_RES_MIGRATING) {
2126 mlog(0, "Someone asked us to assert mastery, but we're "
2127 "in the middle of migration. Skipping assert, "
2128 "the new master will handle that.\n");
2129 spin_unlock(&res->spinlock);
2130 goto put;
2131 } else
2132 __dlm_lockres_reserve_ast(res);
2133 spin_unlock(&res->spinlock);
2134
6714d8e8
KH
2135 /* this call now finishes out the nodemap
2136 * even if one or more nodes die */
2137 mlog(0, "worker about to master %.*s here, this=%u\n",
2138 res->lockname.len, res->lockname.name, dlm->node_num);
ba2bf218 2139 ret = dlm_do_assert_master(dlm, res, nodemap, flags);
6714d8e8
KH
2140 if (ret < 0) {
2141 /* no need to restart, we are done */
3b3b84a8
KH
2142 if (!dlm_is_host_down(ret))
2143 mlog_errno(ret);
6714d8e8
KH
2144 }
2145
36407488
KH
2146 /* Ok, we've asserted ourselves. Let's let migration start. */
2147 dlm_lockres_release_ast(dlm, res);
2148
2149put:
ac4fef4d
X
2150 dlm_lockres_drop_inflight_worker(dlm, res);
2151
6714d8e8
KH
2152 dlm_lockres_put(res);
2153
2154 mlog(0, "finished with dlm_assert_master_worker\n");
2155}
2156
c03872f5
KH
2157/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
2158 * We cannot wait for node recovery to complete to begin mastering this
2159 * lockres because this lockres is used to kick off recovery! ;-)
2160 * So, do a pre-check on all living nodes to see if any of those nodes
2161 * think that $RECOVERY is currently mastered by a dead node. If so,
2162 * we wait a short time to allow that node to get notified by its own
2163 * heartbeat stack, then check again. All $RECOVERY lock resources
e926d8a1 2164 * mastered by dead nodes are purged when the heartbeat callback is
c03872f5
KH
2165 * fired, so we can know for sure that it is safe to continue once
2166 * the node returns a live node or no node. */
2167static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2168 struct dlm_lock_resource *res)
2169{
2170 struct dlm_node_iter iter;
2171 int nodenum;
2172 int ret = 0;
2173 u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
2174
2175 spin_lock(&dlm->spinlock);
2176 dlm_node_iter_init(dlm->domain_map, &iter);
2177 spin_unlock(&dlm->spinlock);
2178
2179 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2180 /* do not send to self */
2181 if (nodenum == dlm->node_num)
2182 continue;
2183 ret = dlm_do_master_requery(dlm, res, nodenum, &master);
2184 if (ret < 0) {
2185 mlog_errno(ret);
2186 if (!dlm_is_host_down(ret))
2187 BUG();
2188 /* host is down, so answer for that node would be
2189 * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
f42a100b 2190 ret = 0;
c03872f5
KH
2191 }
2192
2193 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
2194 /* check to see if this master is in the recovery map */
2195 spin_lock(&dlm->spinlock);
2196 if (test_bit(master, dlm->recovery_map)) {
2197 mlog(ML_NOTICE, "%s: node %u has not seen "
2198 "node %u go down yet, and thinks the "
2199 "dead node is mastering the recovery "
2200 "lock. must wait.\n", dlm->name,
2201 nodenum, master);
2202 ret = -EAGAIN;
2203 }
2204 spin_unlock(&dlm->spinlock);
2bd63216 2205 mlog(0, "%s: reco lock master is %u\n", dlm->name,
c03872f5
KH
2206 master);
2207 break;
2208 }
2209 }
2210 return ret;
2211}
2212
ba2bf218
KH
2213/*
2214 * DLM_DEREF_LOCKRES_MSG
2215 */
2216
2217int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2218{
2219 struct dlm_deref_lockres deref;
2220 int ret = 0, r;
2221 const char *lockname;
2222 unsigned int namelen;
2223
2224 lockname = res->lockname.name;
2225 namelen = res->lockname.len;
2226 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2227
ba2bf218
KH
2228 memset(&deref, 0, sizeof(deref));
2229 deref.node_idx = dlm->node_num;
2230 deref.namelen = namelen;
2231 memcpy(deref.name, lockname, namelen);
2232
2233 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2234 &deref, sizeof(deref), res->owner, &r);
2235 if (ret < 0)
8decab3c
SM
2236 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2237 dlm->name, namelen, lockname, ret, res->owner);
ba2bf218
KH
2238 else if (r < 0) {
2239 /* BAD. other node says I did not have a ref. */
8decab3c
SM
2240 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2241 dlm->name, namelen, lockname, res->owner, r);
ba2bf218 2242 dlm_print_one_lock_resource(res);
309e9191 2243 if (r == -ENOMEM)
2244 BUG();
2245 } else
2246 ret = r;
2247
2248 return ret;
ba2bf218
KH
2249}
2250
d74c9803
KH
2251int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2252 void **ret_data)
ba2bf218
KH
2253{
2254 struct dlm_ctxt *dlm = data;
2255 struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2256 struct dlm_lock_resource *res = NULL;
2257 char *name;
2258 unsigned int namelen;
2259 int ret = -EINVAL;
2260 u8 node;
2261 unsigned int hash;
f3f85464
SM
2262 struct dlm_work_item *item;
2263 int cleared = 0;
2264 int dispatch = 0;
ba2bf218
KH
2265
2266 if (!dlm_grab(dlm))
2267 return 0;
2268
2269 name = deref->name;
2270 namelen = deref->namelen;
2271 node = deref->node_idx;
2272
2273 if (namelen > DLM_LOCKID_NAME_MAX) {
2274 mlog(ML_ERROR, "Invalid name length!");
2275 goto done;
2276 }
2277 if (deref->node_idx >= O2NM_MAX_NODES) {
2278 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2279 goto done;
2280 }
2281
2282 hash = dlm_lockid_hash(name, namelen);
2283
2284 spin_lock(&dlm->spinlock);
2285 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2286 if (!res) {
2287 spin_unlock(&dlm->spinlock);
2288 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2289 dlm->name, namelen, name);
2290 goto done;
2291 }
2292 spin_unlock(&dlm->spinlock);
2293
2294 spin_lock(&res->spinlock);
f3f85464
SM
2295 if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2296 dispatch = 1;
2297 else {
2298 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2299 if (test_bit(node, res->refmap)) {
8d400b81 2300 dlm_lockres_clear_refmap_bit(dlm, res, node);
f3f85464
SM
2301 cleared = 1;
2302 }
ba2bf218
KH
2303 }
2304 spin_unlock(&res->spinlock);
2305
f3f85464
SM
2306 if (!dispatch) {
2307 if (cleared)
2308 dlm_lockres_calc_usage(dlm, res);
2309 else {
2310 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2311 "but it is already dropped!\n", dlm->name,
2312 res->lockname.len, res->lockname.name, node);
2af37ce8 2313 dlm_print_one_lock_resource(res);
f3f85464 2314 }
842b90b6 2315 ret = DLM_DEREF_RESPONSE_DONE;
f3f85464
SM
2316 goto done;
2317 }
2318
2319 item = kzalloc(sizeof(*item), GFP_NOFS);
2320 if (!item) {
2321 ret = -ENOMEM;
2322 mlog_errno(ret);
2323 goto done;
2324 }
2325
2326 dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2327 item->u.dl.deref_res = res;
2328 item->u.dl.deref_node = node;
2329
2330 spin_lock(&dlm->work_lock);
2331 list_add_tail(&item->list, &dlm->work_list);
2332 spin_unlock(&dlm->work_lock);
2333
2334 queue_work(dlm->dlm_worker, &dlm->dispatched_work);
842b90b6 2335 return DLM_DEREF_RESPONSE_INPROG;
f3f85464 2336
ba2bf218
KH
2337done:
2338 if (res)
2339 dlm_lockres_put(res);
2340 dlm_put(dlm);
f3f85464 2341
ba2bf218
KH
2342 return ret;
2343}
2344
60d663cb 2345int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
2346 void **ret_data)
2347{
2348 struct dlm_ctxt *dlm = data;
2349 struct dlm_deref_lockres_done *deref
2350 = (struct dlm_deref_lockres_done *)msg->buf;
2351 struct dlm_lock_resource *res = NULL;
2352 char *name;
2353 unsigned int namelen;
2354 int ret = -EINVAL;
2355 u8 node;
2356 unsigned int hash;
2357
2358 if (!dlm_grab(dlm))
2359 return 0;
2360
2361 name = deref->name;
2362 namelen = deref->namelen;
2363 node = deref->node_idx;
2364
2365 if (namelen > DLM_LOCKID_NAME_MAX) {
2366 mlog(ML_ERROR, "Invalid name length!");
2367 goto done;
2368 }
2369 if (deref->node_idx >= O2NM_MAX_NODES) {
2370 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2371 goto done;
2372 }
2373
2374 hash = dlm_lockid_hash(name, namelen);
2375
2376 spin_lock(&dlm->spinlock);
2377 res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2378 if (!res) {
2379 spin_unlock(&dlm->spinlock);
2380 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2381 dlm->name, namelen, name);
2382 goto done;
2383 }
2384
2385 spin_lock(&res->spinlock);
86b652b9 2386 if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
2387 spin_unlock(&res->spinlock);
2388 spin_unlock(&dlm->spinlock);
2389 mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
2390 "but it is already derefed!\n", dlm->name,
2391 res->lockname.len, res->lockname.name, node);
86b652b9 2392 ret = 0;
2393 goto done;
2394 }
2395
ee8f7fcb 2396 __dlm_do_purge_lockres(dlm, res);
60d663cb 2397 spin_unlock(&res->spinlock);
2398 wake_up(&res->wq);
2399
60d663cb 2400 spin_unlock(&dlm->spinlock);
2401
b7341364 2402 ret = 0;
60d663cb 2403done:
ee8f7fcb 2404 if (res)
2405 dlm_lockres_put(res);
60d663cb 2406 dlm_put(dlm);
2407 return ret;
2408}
2409
2410static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
2411 struct dlm_lock_resource *res, u8 node)
2412{
2413 struct dlm_deref_lockres_done deref;
2414 int ret = 0, r;
2415 const char *lockname;
2416 unsigned int namelen;
2417
2418 lockname = res->lockname.name;
2419 namelen = res->lockname.len;
2420 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2421
2422 memset(&deref, 0, sizeof(deref));
2423 deref.node_idx = dlm->node_num;
2424 deref.namelen = namelen;
2425 memcpy(deref.name, lockname, namelen);
2426
2427 ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
2428 &deref, sizeof(deref), node, &r);
2429 if (ret < 0) {
2430 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
2431 " to node %u\n", dlm->name, namelen,
2432 lockname, ret, node);
2433 } else if (r < 0) {
2434 /* ignore the error */
2435 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2436 dlm->name, namelen, lockname, node, r);
2437 dlm_print_one_lock_resource(res);
2438 }
2439}
2440
f3f85464
SM
2441static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2442{
2443 struct dlm_ctxt *dlm;
2444 struct dlm_lock_resource *res;
2445 u8 node;
2446 u8 cleared = 0;
2447
2448 dlm = item->dlm;
2449 res = item->u.dl.deref_res;
2450 node = item->u.dl.deref_node;
2451
2452 spin_lock(&res->spinlock);
2453 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
b5560143 2454 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
f3f85464 2455 if (test_bit(node, res->refmap)) {
8d400b81 2456 dlm_lockres_clear_refmap_bit(dlm, res, node);
f3f85464
SM
2457 cleared = 1;
2458 }
2459 spin_unlock(&res->spinlock);
2460
842b90b6 2461 dlm_drop_lockres_ref_done(dlm, res, node);
2462
f3f85464
SM
2463 if (cleared) {
2464 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2465 dlm->name, res->lockname.len, res->lockname.name, node);
2466 dlm_lockres_calc_usage(dlm, res);
2467 } else {
2468 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2469 "but it is already dropped!\n", dlm->name,
2470 res->lockname.len, res->lockname.name, node);
2af37ce8 2471 dlm_print_one_lock_resource(res);
f3f85464
SM
2472 }
2473
2474 dlm_lockres_put(res);
2475}
2476
9f62e960 2477/*
baa31b89 2478 * A migratable resource is one that is :
9f62e960
SM
2479 * 1. locally mastered, and,
2480 * 2. zero local locks, and,
2481 * 3. one or more non-local locks, or, one or more references
2482 * Returns 1 if yes, 0 if not.
2f5bf1f2 2483 */
baa31b89 2484static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
9f62e960 2485 struct dlm_lock_resource *res)
2f5bf1f2 2486{
9f62e960
SM
2487 enum dlm_lockres_list idx;
2488 int nonlocal = 0, node_ref;
800deef3 2489 struct list_head *queue;
2f5bf1f2 2490 struct dlm_lock *lock;
9f62e960 2491 u64 cookie;
2f5bf1f2
SM
2492
2493 assert_spin_locked(&res->spinlock);
2494
fae477b6
X
2495 /* delay migration when the lockres is in MIGRATING state */
2496 if (res->state & DLM_LOCK_RES_MIGRATING)
2497 return 0;
2498
bba1cb17 2499 /* delay migration when the lockres is in RECOCERING state */
814ce694
JX
2500 if (res->state & (DLM_LOCK_RES_RECOVERING|
2501 DLM_LOCK_RES_RECOVERY_WAITING))
bba1cb17
TS
2502 return 0;
2503
9f62e960
SM
2504 if (res->owner != dlm->node_num)
2505 return 0;
2f5bf1f2 2506
9f62e960
SM
2507 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2508 queue = dlm_list_idx_to_ptr(res, idx);
800deef3 2509 list_for_each_entry(lock, queue, list) {
9f62e960
SM
2510 if (lock->ml.node != dlm->node_num) {
2511 nonlocal++;
2512 continue;
2f5bf1f2 2513 }
9f62e960 2514 cookie = be64_to_cpu(lock->ml.cookie);
baa31b89 2515 mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
9f62e960
SM
2516 "%s list\n", dlm->name, res->lockname.len,
2517 res->lockname.name,
2518 dlm_get_lock_cookie_node(cookie),
2519 dlm_get_lock_cookie_seq(cookie),
2520 dlm_list_in_text(idx));
2521 return 0;
2f5bf1f2 2522 }
2f5bf1f2
SM
2523 }
2524
9f62e960
SM
2525 if (!nonlocal) {
2526 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2527 if (node_ref >= O2NM_MAX_NODES)
2528 return 0;
2529 }
388c4bcb 2530
510c4879 2531 mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
9f62e960 2532 res->lockname.name);
2f5bf1f2 2533
9f62e960 2534 return 1;
2f5bf1f2 2535}
6714d8e8
KH
2536
2537/*
2538 * DLM_MIGRATE_LOCKRES
2539 */
2540
2541
/*
 * Migrate mastery of a lock resource from this node to `target`.
 *
 * Steps, in order: preallocate a page (mres) and an mle; add a migration
 * mle to the master list under dlm->spinlock + dlm->master_lock; mark the
 * lockres MIGRATING via dlm_mark_lockres_migrating(); flush the dlm worker
 * queue; send the lock state to the target with DLM_MRES_MIGRATION; then
 * wait for mle->woken (or res->owner == target).  On success the owner is
 * set to target, MIGRATING is cleared and non-local locks are removed.
 *
 * Returns 0 on success.  Returns -EINVAL if the domain cannot be grabbed,
 * if marking the lockres fails, or if the target is found dead while
 * waiting; -ENOMEM if preallocation fails; -EEXIST (from
 * dlm_add_migration_mle) if a migration is already in progress; or the
 * error from dlm_send_one_lockres().  On any negative return reached via
 * `leave`, the lockres is re-kicked with dlm_kick_thread().
 */
faf0ec9f 2542static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
66effd3c 2543 struct dlm_lock_resource *res, u8 target)
6714d8e8
KH
2544{
2545 struct dlm_master_list_entry *mle = NULL;
2546 struct dlm_master_list_entry *oldmle = NULL;
2547 struct dlm_migratable_lockres *mres = NULL;
2f5bf1f2 2548 int ret = 0;
6714d8e8
KH
2549 const char *name;
2550 unsigned int namelen;
2551 int mle_added = 0;
2f5bf1f2 2552 int wake = 0;
6714d8e8
KH
2553
2554 if (!dlm_grab(dlm))
2555 return -EINVAL;
2556
2557 name = res->lockname.name;
2558 namelen = res->lockname.len;
2559
66effd3c
SM
2560 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2561 target);
6714d8e8 2562
66effd3c 2563 /* preallocate up front. if this fails, abort */
6714d8e8 2564 ret = -ENOMEM;
ad8100e0 2565 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
6714d8e8
KH
2566 if (!mres) {
2567 mlog_errno(ret);
2568 goto leave;
2569 }
2570
3914ed0c 2571 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
6714d8e8
KH
2572 if (!mle) {
2573 mlog_errno(ret);
2574 goto leave;
2575 }
2576 ret = 0;
2577
6714d8e8
KH
2578 /*
2579 * clear any existing master requests and
2580 * add the migration mle to the list
2581 */
66effd3c 2582 spin_lock(&dlm->spinlock);
6714d8e8
KH
2583 spin_lock(&dlm->master_lock);
2584 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2585 namelen, target, dlm->node_num);
bef5502d 2586 /* get an extra reference on the mle.
2587 * otherwise the assert_master from the new
2588 * master will destroy this.
2589 */
3db409fa
CG
2590 if (ret != -EEXIST)
2591 dlm_get_mle_inuse(mle);
2592
6714d8e8
KH
2593 spin_unlock(&dlm->master_lock);
2594 spin_unlock(&dlm->spinlock);
2595
2596 if (ret == -EEXIST) {
2597 mlog(0, "another process is already migrating it\n");
2598 goto fail;
2599 }
2600 mle_added = 1;
2601
2602 /*
2603 * set the MIGRATING flag and flush asts
2604 * if we fail after this we need to re-dirty the lockres
2605 */
2606 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
2607 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
2608 "the target went down.\n", res->lockname.len,
2609 res->lockname.name, target);
2610 spin_lock(&res->spinlock);
2611 res->state &= ~DLM_LOCK_RES_MIGRATING;
a6fa3640 2612 wake = 1;
6714d8e8
KH
2613 spin_unlock(&res->spinlock);
2614 ret = -EINVAL;
2615 }
2616
/* reached by fall-through as well as by the -EEXIST goto above */
2617fail:
32e49326 2618 if (ret != -EEXIST && oldmle) {
6714d8e8
KH
2619 /* master is known, detach if not already detached */
2620 dlm_mle_detach_hb_events(dlm, oldmle);
2621 dlm_put_mle(oldmle);
2622 }
2623
2624 if (ret < 0) {
/* mle_added: the mle is on the master list and carries the inuse
 * count taken above, so undo both; otherwise it was never added
 * and is handed straight back to the cache */
2625 if (mle_added) {
2626 dlm_mle_detach_hb_events(dlm, mle);
2627 dlm_put_mle(mle);
bef5502d 2628 dlm_put_mle_inuse(mle);
6714d8e8
KH
2629 } else if (mle) {
2630 kmem_cache_free(dlm_mle_cache, mle);
66effd3c 2631 mle = NULL;
6714d8e8
KH
2632 }
2633 goto leave;
2634 }
2635
2636 /*
2637 * at this point, we have a migration target, an mle
2638 * in the master list, and the MIGRATING flag set on
2639 * the lockres
2640 */
2641
1cd04dbe
KH
2642 /* now that remote nodes are spinning on the MIGRATING flag,
2643 * ensure that all assert_master work is flushed. */
2644 flush_workqueue(dlm->dlm_worker);
6714d8e8 2645
6714d8e8
KH
2646 /* notify new node and send all lock state */
2647 /* call send_one_lockres with migration flag.
2648 * this serves as notice to the target node that a
2649 * migration is starting. */
2650 ret = dlm_send_one_lockres(dlm, res, mres, target,
2651 DLM_MRES_MIGRATION);
2652
2653 if (ret < 0) {
2654 mlog(0, "migration to node %u failed with %d\n",
2655 target, ret);
2656 /* migration failed, detach and clean up mle */
2657 dlm_mle_detach_hb_events(dlm, mle);
2658 dlm_put_mle(mle);
a2bf0477
KH
2659 dlm_put_mle_inuse(mle);
2660 spin_lock(&res->spinlock);
2661 res->state &= ~DLM_LOCK_RES_MIGRATING;
a6fa3640 2662 wake = 1;
a2bf0477 2663 spin_unlock(&res->spinlock);
df016c66
SM
2664 if (dlm_is_host_down(ret))
2665 dlm_wait_for_node_death(dlm, target,
2666 DLM_NODE_DEATH_WAIT_MAX);
6714d8e8
KH
2667 goto leave;
2668 }
2669
2670 /* at this point, the target sends a message to all nodes,
2671 * (using dlm_do_migrate_request). this node is skipped since
2672 * we had to put an mle in the list to begin the process. this
2673 * node now waits for target to do an assert master. this node
2674 * will be the last one notified, ensuring that the migration
2675 * is complete everywhere. if the target dies while this is
2676 * going on, some nodes could potentially see the target as the
2677 * master, so it is important that my recovery finds the migration
af901ca1 2678 * mle and sets the master to UNKNOWN. */
6714d8e8
KH
2679
2680
2681 /* wait for new node to assert master */
/* the loop is left only by the break (woken, or owner already set to
 * target) or by the dead-target goto; a timeout with a live target,
 * or a signal (ret < 0), just logs and waits again */
2682 while (1) {
2683 ret = wait_event_interruptible_timeout(mle->wq,
2684 (atomic_read(&mle->woken) == 1),
2685 msecs_to_jiffies(5000));
2686
2687 if (ret >= 0) {
2688 if (atomic_read(&mle->woken) == 1 ||
2689 res->owner == target)
2690 break;
2691
1cd04dbe
KH
2692 mlog(0, "%s:%.*s: timed out during migration\n",
2693 dlm->name, res->lockname.len, res->lockname.name);
2bd63216 2694 /* avoid hang during shutdown when migrating lockres
e2faea4c
KH
2695 * to a node which also goes down */
2696 if (dlm_is_node_dead(dlm, target)) {
aa852354
KH
2697 mlog(0, "%s:%.*s: expected migration "
2698 "target %u is no longer up, restarting\n",
e2faea4c
KH
2699 dlm->name, res->lockname.len,
2700 res->lockname.name, target);
1cd04dbe
KH
2701 ret = -EINVAL;
2702 /* migration failed, detach and clean up mle */
2703 dlm_mle_detach_hb_events(dlm, mle);
2704 dlm_put_mle(mle);
2705 dlm_put_mle_inuse(mle);
2706 spin_lock(&res->spinlock);
2707 res->state &= ~DLM_LOCK_RES_MIGRATING;
a6fa3640 2708 wake = 1;
1cd04dbe
KH
2709 spin_unlock(&res->spinlock);
2710 goto leave;
e2faea4c 2711 }
1cd04dbe
KH
2712 } else
2713 mlog(0, "%s:%.*s: caught signal during migration\n",
2714 dlm->name, res->lockname.len, res->lockname.name);
6714d8e8
KH
2715 }
2716
2717 /* all done, set the owner, clear the flag */
2718 spin_lock(&res->spinlock);
2719 dlm_set_lockres_owner(dlm, res, target);
2720 res->state &= ~DLM_LOCK_RES_MIGRATING;
2721 dlm_remove_nonlocal_locks(dlm, res);
2722 spin_unlock(&res->spinlock);
2723 wake_up(&res->wq);
2724
2725 /* master is known, detach if not already detached */
2726 dlm_mle_detach_hb_events(dlm, mle);
a2bf0477 2727 dlm_put_mle_inuse(mle);
6714d8e8
KH
2728 ret = 0;
2729
2730 dlm_lockres_calc_usage(dlm, res);
2731
2732leave:
2733 /* re-dirty the lockres if we failed */
2734 if (ret < 0)
2735 dlm_kick_thread(dlm, res);
2736
a6fa3640
KH
2737 /* wake up waiters if the MIGRATING flag got set
2738 * but migration failed */
2739 if (wake)
2740 wake_up(&res->wq);
2741
6714d8e8
KH
2742 if (mres)
2743 free_page((unsigned long)mres);
2744
2745 dlm_put(dlm);
2746
9f62e960
SM
2747 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2748 name, target, ret);
6714d8e8
KH
2749 return ret;
2750}
6714d8e8 2751
9f62e960
SM
2752/*
2753 * Should be called only after beginning the domain leave process.
ba2bf218
KH
2754 * There should not be any remaining locks on nonlocal lock resources,
2755 * and there should be no local locks left on locally mastered resources.
2756 *
2757 * Called with the dlm spinlock held, may drop it to do migration, but
2758 * will re-acquire before exit.
2759 *
9f62e960
SM
2760 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2761 */
ba2bf218 2762int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
8f745e62 2763 __must_hold(&dlm->spinlock)
ba2bf218 2764{
66effd3c 2765 int ret;
ba2bf218 2766 int lock_dropped = 0;
66effd3c 2767 u8 target = O2NM_MAX_NODES;
ba2bf218 2768
9f62e960 2769 assert_spin_locked(&dlm->spinlock);
2f5bf1f2 2770
9f62e960 2771 spin_lock(&res->spinlock);
baa31b89 2772 if (dlm_is_lockres_migratable(dlm, res))
66effd3c 2773 target = dlm_pick_migration_target(dlm, res);
b36c3f84 2774 spin_unlock(&res->spinlock);
66effd3c
SM
2775
2776 if (target == O2NM_MAX_NODES)
9f62e960 2777 goto leave;
ba2bf218
KH
2778
2779 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2780 spin_unlock(&dlm->spinlock);
2781 lock_dropped = 1;
66effd3c
SM
2782 ret = dlm_migrate_lockres(dlm, res, target);
2783 if (ret)
2784 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2785 dlm->name, res->lockname.len, res->lockname.name,
2786 target, ret);
ba2bf218
KH
2787 spin_lock(&dlm->spinlock);
2788leave:
2789 return lock_dropped;
2790}
2791
6714d8e8
KH
2792int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2793{
2794 int ret;
2795 spin_lock(&dlm->ast_lock);
2796 spin_lock(&lock->spinlock);
2797 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2798 spin_unlock(&lock->spinlock);
2799 spin_unlock(&dlm->ast_lock);
2800 return ret;
2801}
2802
2803static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2804 struct dlm_lock_resource *res,
2805 u8 mig_target)
2806{
2807 int can_proceed;
2808 spin_lock(&res->spinlock);
2809 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2810 spin_unlock(&res->spinlock);
2811
2bd63216 2812 /* target has died, so make the caller break out of the
6714d8e8
KH
2813 * wait_event, but caller must recheck the domain_map */
2814 spin_lock(&dlm->spinlock);
2815 if (!test_bit(mig_target, dlm->domain_map))
2816 can_proceed = 1;
2817 spin_unlock(&dlm->spinlock);
2818 return can_proceed;
2819}
2820
faf0ec9f
AB
2821static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2822 struct dlm_lock_resource *res)
6714d8e8
KH
2823{
2824 int ret;
2825 spin_lock(&res->spinlock);
2826 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2827 spin_unlock(&res->spinlock);
2828 return ret;
2829}
2830
2831
2832static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2833 struct dlm_lock_resource *res,
2834 u8 target)
2835{
2836 int ret = 0;
2837
2838 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2839 res->lockname.len, res->lockname.name, dlm->node_num,
2840 target);
2841 /* need to set MIGRATING flag on lockres. this is done by
2842 * ensuring that all asts have been flushed for this lockres. */
2843 spin_lock(&res->spinlock);
2844 BUG_ON(res->migration_pending);
2845 res->migration_pending = 1;
2846 /* strategy is to reserve an extra ast then release
2847 * it below, letting the release do all of the work */
2848 __dlm_lockres_reserve_ast(res);
2849 spin_unlock(&res->spinlock);
2850
ddc09c8d 2851 /* now flush all the pending asts */
6714d8e8 2852 dlm_kick_thread(dlm, res);
ddc09c8d
KH
2853 /* before waiting on DIRTY, block processes which may
2854 * try to dirty the lockres before MIGRATING is set */
2855 spin_lock(&res->spinlock);
2856 BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2857 res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2858 spin_unlock(&res->spinlock);
2859 /* now wait on any pending asts and the DIRTY state */
6714d8e8
KH
2860 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2861 dlm_lockres_release_ast(dlm, res);
2862
2863 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2864 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2865 /* if the extra ref we just put was the final one, this
2866 * will pass thru immediately. otherwise, we need to wait
2867 * for the last ast to finish. */
2868again:
2869 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2870 dlm_migration_can_proceed(dlm, res, target),
2871 msecs_to_jiffies(1000));
2872 if (ret < 0) {
2873 mlog(0, "woken again: migrating? %s, dead? %s\n",
2874 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2875 test_bit(target, dlm->domain_map) ? "no":"yes");
2876 } else {
2877 mlog(0, "all is well: migrating? %s, dead? %s\n",
2878 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2879 test_bit(target, dlm->domain_map) ? "no":"yes");
2880 }
2881 if (!dlm_migration_can_proceed(dlm, res, target)) {
2882 mlog(0, "trying again...\n");
2883 goto again;
2884 }
2885
a39953dd 2886 ret = 0;
6714d8e8
KH
2887 /* did the target go down or die? */
2888 spin_lock(&dlm->spinlock);
2889 if (!test_bit(target, dlm->domain_map)) {
2890 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2891 target);
2892 ret = -EHOSTDOWN;
2893 }
2894 spin_unlock(&dlm->spinlock);
2895
a39953dd
WW
2896 /*
2897 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2898 * another try; otherwise, we are sure the MIGRATING state is there,
4d39f0ac 2899 * drop the unneeded state which blocked threads trying to DIRTY
a39953dd
WW
2900 */
2901 spin_lock(&res->spinlock);
2902 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2903 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2904 if (!ret)
2905 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
cc28d6d8 2906 else
2907 res->migration_pending = 0;
a39953dd
WW
2908 spin_unlock(&res->spinlock);
2909
6714d8e8
KH
2910 /*
2911 * at this point:
2912 *
a39953dd 2913 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
6714d8e8
KH
2914 * o there are no pending asts on this lockres
2915 * o all processes trying to reserve an ast on this
2916 * lockres must wait for the MIGRATING flag to clear
2917 */
2918 return ret;
2919}
2920
2921/* last step in the migration process.
2922 * original master calls this to free all of the dlm_lock
2923 * structures that used to be for other nodes. */
2924static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2925 struct dlm_lock_resource *res)
2926{
6714d8e8 2927 struct list_head *queue = &res->granted;
ba2bf218 2928 int i, bit;
800deef3 2929 struct dlm_lock *lock, *next;
6714d8e8
KH
2930
2931 assert_spin_locked(&res->spinlock);
2932
2933 BUG_ON(res->owner == dlm->node_num);
2934
2935 for (i=0; i<3; i++) {
800deef3 2936 list_for_each_entry_safe(lock, next, queue, list) {
6714d8e8
KH
2937 if (lock->ml.node != dlm->node_num) {
2938 mlog(0, "putting lock for node %u\n",
2939 lock->ml.node);
2940 /* be extra careful */
2941 BUG_ON(!list_empty(&lock->ast_list));
2942 BUG_ON(!list_empty(&lock->bast_list));
2943 BUG_ON(lock->ast_pending);
2944 BUG_ON(lock->bast_pending);
8d400b81
SM
2945 dlm_lockres_clear_refmap_bit(dlm, res,
2946 lock->ml.node);
6714d8e8
KH
2947 list_del_init(&lock->list);
2948 dlm_lock_put(lock);
2c5c54ac
SM
2949 /* In a normal unlock, we would have added a
2950 * DLM_UNLOCK_FREE_LOCK action. Force it. */
2951 dlm_lock_put(lock);
6714d8e8
KH
2952 }
2953 }
2954 queue++;
2955 }
ba2bf218
KH
2956 bit = 0;
2957 while (1) {
2958 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2959 if (bit >= O2NM_MAX_NODES)
2960 break;
2961 /* do not clear the local node reference, if there is a
2962 * process holding this, let it drop the ref itself */
2963 if (bit != dlm->node_num) {
2964 mlog(0, "%s:%.*s: node %u had a ref to this "
2965 "migrating lockres, clearing\n", dlm->name,
2966 res->lockname.len, res->lockname.name, bit);
8d400b81 2967 dlm_lockres_clear_refmap_bit(dlm, res, bit);
ba2bf218
KH
2968 }
2969 bit++;
2970 }
6714d8e8
KH
2971}
2972
66effd3c
SM
2973/*
2974 * Pick a node to migrate the lock resource to. This function selects a
2975 * potential target based first on the locks and then on refmap. It skips
2976 * nodes that are in the process of exiting the domain.
2977 */
6714d8e8
KH
2978static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2979 struct dlm_lock_resource *res)
2980{
66effd3c 2981 enum dlm_lockres_list idx;
6714d8e8 2982 struct list_head *queue = &res->granted;
6714d8e8 2983 struct dlm_lock *lock;
66effd3c
SM
2984 int noderef;
2985 u8 nodenum = O2NM_MAX_NODES;
6714d8e8
KH
2986
2987 assert_spin_locked(&dlm->spinlock);
66effd3c 2988 assert_spin_locked(&res->spinlock);
6714d8e8 2989
66effd3c
SM
2990 /* Go through all the locks */
2991 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2992 queue = dlm_list_idx_to_ptr(res, idx);
800deef3 2993 list_for_each_entry(lock, queue, list) {
66effd3c
SM
2994 if (lock->ml.node == dlm->node_num)
2995 continue;
2996 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2997 continue;
2998 nodenum = lock->ml.node;
2999 goto bail;
6714d8e8 3000 }
6714d8e8 3001 }
388c4bcb 3002
66effd3c
SM
3003 /* Go thru the refmap */
3004 noderef = -1;
6714d8e8 3005 while (1) {
66effd3c
SM
3006 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
3007 noderef + 1);
3008 if (noderef >= O2NM_MAX_NODES)
6714d8e8 3009 break;
66effd3c
SM
3010 if (noderef == dlm->node_num)
3011 continue;
3012 if (test_bit(noderef, dlm->exit_domain_map))
3013 continue;
3014 nodenum = noderef;
3015 goto bail;
6714d8e8
KH
3016 }
3017
66effd3c
SM
3018bail:
3019 return nodenum;
6714d8e8
KH
3020}
3021
6714d8e8
KH
3022/* this is called by the new master once all lockres
3023 * data has been received */
3024static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
3025 struct dlm_lock_resource *res,
3026 u8 master, u8 new_master,
3027 struct dlm_node_iter *iter)
3028{
3029 struct dlm_migrate_request migrate;
2b832564 3030 int ret, skip, status = 0;
6714d8e8
KH
3031 int nodenum;
3032
3033 memset(&migrate, 0, sizeof(migrate));
3034 migrate.namelen = res->lockname.len;
3035 memcpy(migrate.name, res->lockname.name, migrate.namelen);
3036 migrate.new_master = new_master;
3037 migrate.master = master;
3038
3039 ret = 0;
3040
3041 /* send message to all nodes, except the master and myself */
3042 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
3043 if (nodenum == master ||
3044 nodenum == new_master)
3045 continue;
3046
2b832564
SM
3047 /* We could race exit domain. If exited, skip. */
3048 spin_lock(&dlm->spinlock);
3049 skip = (!test_bit(nodenum, dlm->domain_map));
3050 spin_unlock(&dlm->spinlock);
3051 if (skip) {
3052 clear_bit(nodenum, iter->node_map);
3053 continue;
3054 }
3055
6714d8e8
KH
3056 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
3057 &migrate, sizeof(migrate), nodenum,
3058 &status);
2b832564 3059 if (ret < 0) {
8decab3c
SM
3060 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
3061 "MIGRATE_REQUEST to node %u\n", dlm->name,
3062 migrate.namelen, migrate.name, ret, nodenum);
2b832564
SM
3063 if (!dlm_is_host_down(ret)) {
3064 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3065 BUG();
3066 }
3067 clear_bit(nodenum, iter->node_map);
3068 ret = 0;
3069 } else if (status < 0) {
6714d8e8
KH
3070 mlog(0, "migrate request (node %u) returned %d!\n",
3071 nodenum, status);
3072 ret = status;
ba2bf218
KH
3073 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
3074 /* during the migration request we short-circuited
3075 * the mastery of the lockres. make sure we have
3076 * a mastery ref for nodenum */
3077 mlog(0, "%s:%.*s: need ref for node %u\n",
3078 dlm->name, res->lockname.len, res->lockname.name,
3079 nodenum);
3080 spin_lock(&res->spinlock);
8d400b81 3081 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
ba2bf218 3082 spin_unlock(&res->spinlock);
6714d8e8
KH
3083 }
3084 }
3085
3086 if (ret < 0)
3087 mlog_errno(ret);
3088
3089 mlog(0, "returning ret=%d\n", ret);
3090 return ret;
3091}
3092
3093
3094/* if there is an existing mle for this lockres, we now know who the master is.
3095 * (the one who sent us *this* message) we can clear it up right away.
3096 * since the process that put the mle on the list still has a reference to it,
3097 * we can unhash it now, set the master and wake the process. as a result,
3098 * we will have no mle in the list to start with. now we can add an mle for
3099 * the migration and this should be the only one found for those scanning the
3100 * list. */
d74c9803
KH
3101int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3102 void **ret_data)
6714d8e8
KH
3103{
3104 struct dlm_ctxt *dlm = data;
3105 struct dlm_lock_resource *res = NULL;
3106 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
3107 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
3108 const char *name;
a3d33291 3109 unsigned int namelen, hash;
6714d8e8
KH
3110 int ret = 0;
3111
3112 if (!dlm_grab(dlm))
c372f219 3113 return 0;
6714d8e8
KH
3114
3115 name = migrate->name;
3116 namelen = migrate->namelen;
a3d33291 3117 hash = dlm_lockid_hash(name, namelen);
6714d8e8
KH
3118
3119 /* preallocate.. if this fails, abort */
3914ed0c 3120 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
6714d8e8
KH
3121
3122 if (!mle) {
3123 ret = -ENOMEM;
3124 goto leave;
3125 }
3126
3127 /* check for pre-existing lock */
3128 spin_lock(&dlm->spinlock);
a3d33291 3129 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
6714d8e8
KH
3130 if (res) {
3131 spin_lock(&res->spinlock);
3132 if (res->state & DLM_LOCK_RES_RECOVERING) {
3133 /* if all is working ok, this can only mean that we got
3134 * a migrate request from a node that we now see as
3135 * dead. what can we do here? drop it to the floor? */
3136 spin_unlock(&res->spinlock);
3137 mlog(ML_ERROR, "Got a migrate request, but the "
3138 "lockres is marked as recovering!");
3139 kmem_cache_free(dlm_mle_cache, mle);
3140 ret = -EINVAL; /* need a better solution */
3141 goto unlock;
3142 }
3143 res->state |= DLM_LOCK_RES_MIGRATING;
3144 spin_unlock(&res->spinlock);
3145 }
3146
6d98c3cc 3147 spin_lock(&dlm->master_lock);
6714d8e8
KH
3148 /* ignore status. only nonzero status would BUG. */
3149 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3150 name, namelen,
3151 migrate->new_master,
3152 migrate->master);
3153
0cc482ee
G
3154 if (ret < 0)
3155 kmem_cache_free(dlm_mle_cache, mle);
3156
6714d8e8 3157 spin_unlock(&dlm->master_lock);
6d98c3cc 3158unlock:
6714d8e8
KH
3159 spin_unlock(&dlm->spinlock);
3160
3161 if (oldmle) {
3162 /* master is known, detach if not already detached */
3163 dlm_mle_detach_hb_events(dlm, oldmle);
3164 dlm_put_mle(oldmle);
3165 }
3166
3167 if (res)
3168 dlm_lockres_put(res);
3169leave:
3170 dlm_put(dlm);
3171 return ret;
3172}
3173
3174/* must be holding dlm->spinlock and dlm->master_lock
3175 * when adding a migration mle, we can clear any other mles
3176 * in the master list because we know with certainty that
3177 * the master is "master". so we remove any old mle from
3178 * the list after setting it's master field, and then add
3179 * the new migration mle. this way we can hold with the rule
3180 * of having only one mle for a given lock name at all times. */
3181static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3182 struct dlm_lock_resource *res,
3183 struct dlm_master_list_entry *mle,
3184 struct dlm_master_list_entry **oldmle,
3185 const char *name, unsigned int namelen,
3186 u8 new_master, u8 master)
3187{
3188 int found;
3189 int ret = 0;
3190
3191 *oldmle = NULL;
3192
6714d8e8
KH
3193 assert_spin_locked(&dlm->spinlock);
3194 assert_spin_locked(&dlm->master_lock);
3195
3196 /* caller is responsible for any ref taken here on oldmle */
3197 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
3198 if (found) {
3199 struct dlm_master_list_entry *tmp = *oldmle;
3200 spin_lock(&tmp->spinlock);
3201 if (tmp->type == DLM_MLE_MIGRATION) {
3202 if (master == dlm->node_num) {
3203 /* ah another process raced me to it */
3204 mlog(0, "tried to migrate %.*s, but some "
3205 "process beat me to it\n",
3206 namelen, name);
32e49326 3207 spin_unlock(&tmp->spinlock);
3208 return -EEXIST;
6714d8e8
KH
3209 } else {
3210 /* bad. 2 NODES are trying to migrate! */
3211 mlog(ML_ERROR, "migration error mle: "
3212 "master=%u new_master=%u // request: "
3213 "master=%u new_master=%u // "
3214 "lockres=%.*s\n",
3215 tmp->master, tmp->new_master,
3216 master, new_master,
3217 namelen, name);
3218 BUG();
3219 }
3220 } else {
3221 /* this is essentially what assert_master does */
3222 tmp->master = master;
3223 atomic_set(&tmp->woken, 1);
3224 wake_up(&tmp->wq);
1c084577
SM
3225 /* remove it so that only one mle will be found */
3226 __dlm_unlink_mle(dlm, tmp);
ba2bf218 3227 __dlm_mle_detach_hb_events(dlm, tmp);
b9aaac5a 3228 if (tmp->type == DLM_MLE_MASTER) {
3229 ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3230 mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3231 "telling master to get ref "
3232 "for cleared out mle during "
3233 "migration\n", dlm->name,
3234 namelen, name, master,
3235 new_master);
3236 }
6714d8e8
KH
3237 }
3238 spin_unlock(&tmp->spinlock);
3239 }
3240
3241 /* now add a migration mle to the tail of the list */
3242 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
3243 mle->new_master = new_master;
ba2bf218
KH
3244 /* the new master will be sending an assert master for this.
3245 * at that point we will get the refmap reference */
6714d8e8
KH
3246 mle->master = master;
3247 /* do this for consistency with other mle types */
3248 set_bit(new_master, mle->maybe_map);
1c084577 3249 __dlm_insert_mle(dlm, mle);
6714d8e8
KH
3250
3251 return ret;
3252}
3253
c2cd4a44
SM
3254/*
3255 * Sets the owner of the lockres, associated to the mle, to UNKNOWN
3256 */
3257static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3258 struct dlm_master_list_entry *mle)
3259{
3260 struct dlm_lock_resource *res;
c2cd4a44
SM
3261
3262 /* Find the lockres associated to the mle and set its owner to UNK */
7141514b
SM
3263 res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3264 mle->mnamehash);
c2cd4a44
SM
3265 if (res) {
3266 spin_unlock(&dlm->master_lock);
3267
3268 /* move lockres onto recovery list */
3269 spin_lock(&res->spinlock);
3270 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3271 dlm_move_lockres_to_recovery_list(dlm, res);
3272 spin_unlock(&res->spinlock);
3273 dlm_lockres_put(res);
3274
3275 /* about to get rid of mle, detach from heartbeat */
3276 __dlm_mle_detach_hb_events(dlm, mle);
3277
3278 /* dump the mle */
3279 spin_lock(&dlm->master_lock);
3280 __dlm_put_mle(mle);
3281 spin_unlock(&dlm->master_lock);
3282 }
3283
3284 return res;
3285}
3286
3287static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3288 struct dlm_master_list_entry *mle)
3289{
3290 __dlm_mle_detach_hb_events(dlm, mle);
3291
3292 spin_lock(&mle->spinlock);
3293 __dlm_unlink_mle(dlm, mle);
3294 atomic_set(&mle->woken, 1);
3295 spin_unlock(&mle->spinlock);
3296
3297 wake_up(&mle->wq);
3298}
3299
3300static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3301 struct dlm_master_list_entry *mle, u8 dead_node)
3302{
3303 int bit;
3304
3305 BUG_ON(mle->type != DLM_MLE_BLOCK);
3306
3307 spin_lock(&mle->spinlock);
3308 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3309 if (bit != dead_node) {
3310 mlog(0, "mle found, but dead node %u would not have been "
3311 "master\n", dead_node);
3312 spin_unlock(&mle->spinlock);
3313 } else {
3314 /* Must drop the refcount by one since the assert_master will
3315 * never arrive. This may result in the mle being unlinked and
3316 * freed, but there may still be a process waiting in the
3317 * dlmlock path which is fine. */
3318 mlog(0, "node %u was expected master\n", dead_node);
3319 atomic_set(&mle->woken, 1);
3320 spin_unlock(&mle->spinlock);
3321 wake_up(&mle->wq);
3322
3323 /* Do not need events any longer, so detach from heartbeat */
3324 __dlm_mle_detach_hb_events(dlm, mle);
3325 __dlm_put_mle(mle);
3326 }
3327}
6714d8e8
KH
3328
3329void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3330{
2ed6c750 3331 struct dlm_master_list_entry *mle;
6714d8e8 3332 struct dlm_lock_resource *res;
2ed6c750 3333 struct hlist_head *bucket;
df53cd3b 3334 struct hlist_node *tmp;
2ed6c750 3335 unsigned int i;
6714d8e8 3336
ef6b689b 3337 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
6714d8e8
KH
3338top:
3339 assert_spin_locked(&dlm->spinlock);
3340
3341 /* clean the master list */
3342 spin_lock(&dlm->master_lock);
2ed6c750
SM
3343 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3344 bucket = dlm_master_hash(dlm, i);
df53cd3b 3345 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
67ae1f06
SM
3346 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3347 mle->type != DLM_MLE_MASTER &&
3348 mle->type != DLM_MLE_MIGRATION);
3349
3350 /* MASTER mles are initiated locally. The waiting
3351 * process will notice the node map change shortly.
3352 * Let that happen as normal. */
3353 if (mle->type == DLM_MLE_MASTER)
3354 continue;
3355
3356 /* BLOCK mles are initiated by other nodes. Need to
3357 * clean up if the dead node would have been the
3358 * master. */
3359 if (mle->type == DLM_MLE_BLOCK) {
3360 dlm_clean_block_mle(dlm, mle, dead_node);
3361 continue;
3362 }
6714d8e8 3363
67ae1f06
SM
3364 /* Everything else is a MIGRATION mle */
3365
3366 /* The rule for MIGRATION mles is that the master
3367 * becomes UNKNOWN if *either* the original or the new
3368 * master dies. All UNKNOWN lockres' are sent to
3369 * whichever node becomes the recovery master. The new
3370 * master is responsible for determining if there is
3371 * still a master for this lockres, or if he needs to
3372 * take over mastery. Either way, this node should
3373 * expect another message to resolve this. */
3374
3375 if (mle->master != dead_node &&
3376 mle->new_master != dead_node)
3377 continue;
3378
bef5502d 3379 if (mle->new_master == dead_node && mle->inuse) {
3380 mlog(ML_NOTICE, "%s: target %u died during "
3381 "migration from %u, the MLE is "
3382 "still keep used, ignore it!\n",
3383 dlm->name, dead_node,
3384 mle->master);
3385 continue;
3386 }
3387
67ae1f06
SM
3388 /* If we have reached this point, this mle needs to be
3389 * removed from the list and freed. */
3390 dlm_clean_migration_mle(dlm, mle);
3391
3392 mlog(0, "%s: node %u died during migration from "
3393 "%u to %u!\n", dlm->name, dead_node, mle->master,
3394 mle->new_master);
3395
3396 /* If we find a lockres associated with the mle, we've
3397 * hit this rare case that messes up our lock ordering.
3398 * If so, we need to drop the master lock so that we can
3399 * take the lockres lock, meaning that we will have to
3400 * restart from the head of list. */
3401 res = dlm_reset_mleres_owner(dlm, mle);
3402 if (res)
3403 /* restart */
3404 goto top;
3405
3406 /* This may be the last reference */
3407 __dlm_put_mle(mle);
6714d8e8 3408 }
2ed6c750 3409 }
6714d8e8
KH
3410 spin_unlock(&dlm->master_lock);
3411}
3412
6714d8e8
KH
3413int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3414 u8 old_master)
3415{
3416 struct dlm_node_iter iter;
3417 int ret = 0;
3418
3419 spin_lock(&dlm->spinlock);
3420 dlm_node_iter_init(dlm->domain_map, &iter);
3421 clear_bit(old_master, iter.node_map);
3422 clear_bit(dlm->node_num, iter.node_map);
3423 spin_unlock(&dlm->spinlock);
3424
ba2bf218
KH
3425 /* ownership of the lockres is changing. account for the
3426 * mastery reference here since old_master will briefly have
3427 * a reference after the migration completes */
3428 spin_lock(&res->spinlock);
8d400b81 3429 dlm_lockres_set_refmap_bit(dlm, res, old_master);
ba2bf218
KH
3430 spin_unlock(&res->spinlock);
3431
6714d8e8
KH
3432 mlog(0, "now time to do a migrate request to other nodes\n");
3433 ret = dlm_do_migrate_request(dlm, res, old_master,
3434 dlm->node_num, &iter);
3435 if (ret < 0) {
3436 mlog_errno(ret);
3437 goto leave;
3438 }
3439
3440 mlog(0, "doing assert master of %.*s to all except the original node\n",
3441 res->lockname.len, res->lockname.name);
3442 /* this call now finishes out the nodemap
3443 * even if one or more nodes die */
ba2bf218 3444 ret = dlm_do_assert_master(dlm, res, iter.node_map,
6714d8e8
KH
3445 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3446 if (ret < 0) {
3447 /* no longer need to retry. all living nodes contacted. */
3448 mlog_errno(ret);
3449 ret = 0;
3450 }
3451
3452 memset(iter.node_map, 0, sizeof(iter.node_map));
3453 set_bit(old_master, iter.node_map);
3454 mlog(0, "doing assert master of %.*s back to %u\n",
3455 res->lockname.len, res->lockname.name, old_master);
ba2bf218 3456 ret = dlm_do_assert_master(dlm, res, iter.node_map,
6714d8e8
KH
3457 DLM_ASSERT_MASTER_FINISH_MIGRATION);
3458 if (ret < 0) {
3459 mlog(0, "assert master to original master failed "
3460 "with %d.\n", ret);
3461 /* the only nonzero status here would be because of
3462 * a dead original node. we're done. */
3463 ret = 0;
3464 }
3465
3466 /* all done, set the owner, clear the flag */
3467 spin_lock(&res->spinlock);
3468 dlm_set_lockres_owner(dlm, res, dlm->node_num);
3469 res->state &= ~DLM_LOCK_RES_MIGRATING;
3470 spin_unlock(&res->spinlock);
3471 /* re-dirty it on the new master */
3472 dlm_kick_thread(dlm, res);
3473 wake_up(&res->wq);
3474leave:
3475 return ret;
3476}
3477
3478/*
3479 * LOCKRES AST REFCOUNT
3480 * this is integral to migration
3481 */
3482
3483/* for future intent to call an ast, reserve one ahead of time.
3484 * this should be called only after waiting on the lockres
3485 * with dlm_wait_on_lockres, and while still holding the
3486 * spinlock after the call. */
3487void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
3488{
3489 assert_spin_locked(&res->spinlock);
3490 if (res->state & DLM_LOCK_RES_MIGRATING) {
3491 __dlm_print_one_lock_resource(res);
3492 }
3493 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3494
3495 atomic_inc(&res->asts_reserved);
3496}
3497
3498/*
3499 * used to drop the reserved ast, either because it went unused,
3500 * or because the ast/bast was actually called.
3501 *
3502 * also, if there is a pending migration on this lockres,
3503 * and this was the last pending ast on the lockres,
3504 * atomically set the MIGRATING flag before we drop the lock.
3505 * this is how we ensure that migration can proceed with no
3506 * asts in progress. note that it is ok if the state of the
3507 * queues is such that a lock should be granted in the future
3508 * or that a bast should be fired, because the new master will
3509 * shuffle the lists on this lockres as soon as it is migrated.
3510 */
3511void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3512 struct dlm_lock_resource *res)
3513{
3514 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
3515 return;
3516
3517 if (!res->migration_pending) {
3518 spin_unlock(&res->spinlock);
3519 return;
3520 }
3521
3522 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3523 res->migration_pending = 0;
3524 res->state |= DLM_LOCK_RES_MIGRATING;
3525 spin_unlock(&res->spinlock);
3526 wake_up(&res->wq);
3527 wake_up(&dlm->migration_wq);
3528}
5dad6c39
SE
3529
3530void dlm_force_free_mles(struct dlm_ctxt *dlm)
3531{
3532 int i;
3533 struct hlist_head *bucket;
3534 struct dlm_master_list_entry *mle;
df53cd3b 3535 struct hlist_node *tmp;
5dad6c39
SE
3536
3537 /*
3538 * We notified all other nodes that we are exiting the domain and
3539 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3540 * around we force free them and wake any processes that are waiting
3541 * on the mles
3542 */
3543 spin_lock(&dlm->spinlock);
3544 spin_lock(&dlm->master_lock);
3545
3546 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3547 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3548
3549 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3550 bucket = dlm_master_hash(dlm, i);
df53cd3b 3551 hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
5dad6c39
SE
3552 if (mle->type != DLM_MLE_BLOCK) {
3553 mlog(ML_ERROR, "bad mle: %p\n", mle);
3554 dlm_print_one_mle(mle);
3555 }
3556 atomic_set(&mle->woken, 1);
3557 wake_up(&mle->wq);
3558
3559 __dlm_unlink_mle(dlm, mle);
3560 __dlm_mle_detach_hb_events(dlm, mle);
3561 __dlm_put_mle(mle);
3562 }
3563 }
3564 spin_unlock(&dlm->master_lock);
3565 spin_unlock(&dlm->spinlock);
3566}