RDMA/core: Implement compat device/sysfs tree in net namespace
[linux-block.git] / drivers / infiniband / core / device.c
1da177e4
LT
1/*
2 * Copyright (c) 2004 Topspin Communications. All rights reserved.
2a1d9b7f 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
1da177e4
LT
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
1da177e4
LT
32 */
33
34#include <linux/module.h>
35#include <linux/string.h>
36#include <linux/errno.h>
9a6b090c 37#include <linux/kernel.h>
1da177e4
LT
38#include <linux/slab.h>
39#include <linux/init.h>
9268f72d 40#include <linux/netdevice.h>
4e0f7b90
PP
41#include <net/net_namespace.h>
42#include <net/netns/generic.h>
8f408ab6
DJ
43#include <linux/security.h>
44#include <linux/notifier.h>
324e227e 45#include <linux/hashtable.h>
b2cbae2c 46#include <rdma/rdma_netlink.h>
03db3a2d
MB
47#include <rdma/ib_addr.h>
48#include <rdma/ib_cache.h>
1da177e4
LT
49
50#include "core_priv.h"
41eda65c 51#include "restrack.h"
1da177e4
LT
52
53MODULE_AUTHOR("Roland Dreier");
54MODULE_DESCRIPTION("core kernel InfiniBand API");
55MODULE_LICENSE("Dual BSD/GPL");
56
14d3a3b2 57struct workqueue_struct *ib_comp_wq;
f794809a 58struct workqueue_struct *ib_comp_unbound_wq;
f0626710
TH
59struct workqueue_struct *ib_wq;
60EXPORT_SYMBOL_GPL(ib_wq);
61
921eab11
JG
62/*
63 * Each of the three rwsem locks (devices, clients, client_data) protects the
64 * xarray of the same name. Specifically it allows the caller to assert that
65 * the MARK will/will not be changing under the lock, and for devices and
66 * clients, that the value in the xarray is still a valid pointer. Change of
67 * the MARK is linked to the object state, so holding the lock and testing the
68 * MARK also asserts that the contained object is in a certain state.
69 *
70 * This is used to build a two stage register/unregister flow where objects
71 * can continue to be in the xarray even though they are still in progress to
72 * register/unregister.
73 *
74 * The xarray itself provides additional locking, and restartable iteration,
75 * which is also relied on.
76 *
77 * Locks should not be nested, with the exception of client_data, which is
78 * allowed to nest under the read side of the other two locks.
79 *
80 * The devices_rwsem also protects the device name list; any change or
81 * assignment of a device name must also hold the write side to guarantee unique
82 * names.
83 */
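/*
 * A minimal sketch of the pattern described above (illustrative only; the
 * index value is hypothetical): holding the read side of devices_rwsem and
 * testing DEVICE_REGISTERED pins both the pointer and the registered state.
 *
 *   struct ib_device *dev;
 *
 *   down_read(&devices_rwsem);
 *   dev = xa_load(&devices, index);
 *   if (dev && xa_get_mark(&devices, index, DEVICE_REGISTERED)) {
 *           ...   use dev, it cannot become unregistered while the lock is held
 *   }
 *   up_read(&devices_rwsem);
 */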
84
0df91bb6
JG
85/*
86 * devices contains devices that have had their names assigned. The
87 * devices may not be registered. Users that care about the registration
88 * status need to call ib_device_try_get() on the device to ensure it is
89 * registered, and keep it registered, for the required duration.
90 *
91 */
92static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
921eab11 93static DECLARE_RWSEM(devices_rwsem);
0df91bb6
JG
94#define DEVICE_REGISTERED XA_MARK_1
95
1da177e4 96static LIST_HEAD(client_list);
e59178d8
JG
97#define CLIENT_REGISTERED XA_MARK_1
98static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
921eab11 99static DECLARE_RWSEM(clients_rwsem);
1da177e4
LT
100
101/*
0df91bb6
JG
102 * If client_data is registered then the corresponding client must also still
103 * be registered.
104 */
105#define CLIENT_DATA_REGISTERED XA_MARK_1
4e0f7b90
PP
106
107/**
108 * struct rdma_dev_net - rdma net namespace metadata for a net
109 * @net: Pointer to owner net namespace
110 * @id: xarray id to identify the net namespace.
111 */
112struct rdma_dev_net {
113 possible_net_t net;
114 u32 id;
115};
116
117static unsigned int rdma_dev_net_id;
118
119/*
120 * A list of net namespaces is maintained in an xarray. This is necessary
121 * because we can't get the locking right using the existing net ns list. We
122 * would require an init_net callback after the list is updated.
123 */
124static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
125/*
126 * rwsem to protect accessing the rdma_nets xarray entries.
127 */
128static DECLARE_RWSEM(rdma_nets_rwsem);
129
0df91bb6
JG
130/*
131 * xarray has this behavior where it won't iterate over NULL values stored in
132 * allocated arrays. So we need our own iterator to see all values stored in
133 * the array. This does the same thing as xa_for_each except that it also
134 * returns NULL valued entries if the array is allocating. Simplified to only
135 * work on simple xarrays.
136 */
137static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
138 xa_mark_t filter)
139{
140 XA_STATE(xas, xa, *indexp);
141 void *entry;
142
143 rcu_read_lock();
144 do {
145 entry = xas_find_marked(&xas, ULONG_MAX, filter);
146 if (xa_is_zero(entry))
147 break;
148 } while (xas_retry(&xas, entry));
149 rcu_read_unlock();
150
151 if (entry) {
152 *indexp = xas.xa_index;
153 if (xa_is_zero(entry))
154 return NULL;
155 return entry;
156 }
157 return XA_ERROR(-ENOENT);
158}
159#define xan_for_each_marked(xa, index, entry, filter) \
160 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
161 !xa_is_err(entry); \
162 (index)++, entry = xan_find_marked(xa, &(index), filter))
163
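/*
 * Usage sketch for the iterator above (illustrative only): unlike
 * xa_for_each(), reserved slots holding NULL are visited too, so callers
 * must be prepared for NULL entries.
 *
 *   struct ib_device *dev;
 *   unsigned long index;
 *
 *   xan_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 *           if (!dev)
 *                   continue;        a reserved (still NULL) slot
 *           ...
 *   }
 */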
324e227e
JG
164/* RCU hash table mapping netdevice pointers to struct ib_port_data */
165static DEFINE_SPINLOCK(ndev_hash_lock);
166static DECLARE_HASHTABLE(ndev_hash, 5);
167
c2261dd7 168static void free_netdevs(struct ib_device *ib_dev);
d0899892
JG
169static void ib_unregister_work(struct work_struct *work);
170static void __ib_unregister_device(struct ib_device *device);
8f408ab6
DJ
171static int ib_security_change(struct notifier_block *nb, unsigned long event,
172 void *lsm_data);
173static void ib_policy_change_task(struct work_struct *work);
174static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
175
176static struct notifier_block ibdev_lsm_nb = {
177 .notifier_call = ib_security_change,
178};
1da177e4 179
324e227e
JG
180/* Pointer to the RCU head at the start of the ib_port_data array */
181struct ib_port_data_rcu {
182 struct rcu_head rcu_head;
183 struct ib_port_data pdata[];
184};
185
1da177e4
LT
186static int ib_device_check_mandatory(struct ib_device *device)
187{
3023a1e9 188#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
1da177e4
LT
189 static const struct {
190 size_t offset;
191 char *name;
192 } mandatory_table[] = {
193 IB_MANDATORY_FUNC(query_device),
194 IB_MANDATORY_FUNC(query_port),
195 IB_MANDATORY_FUNC(query_pkey),
1da177e4
LT
196 IB_MANDATORY_FUNC(alloc_pd),
197 IB_MANDATORY_FUNC(dealloc_pd),
1da177e4
LT
198 IB_MANDATORY_FUNC(create_qp),
199 IB_MANDATORY_FUNC(modify_qp),
200 IB_MANDATORY_FUNC(destroy_qp),
201 IB_MANDATORY_FUNC(post_send),
202 IB_MANDATORY_FUNC(post_recv),
203 IB_MANDATORY_FUNC(create_cq),
204 IB_MANDATORY_FUNC(destroy_cq),
205 IB_MANDATORY_FUNC(poll_cq),
206 IB_MANDATORY_FUNC(req_notify_cq),
207 IB_MANDATORY_FUNC(get_dma_mr),
7738613e
IW
208 IB_MANDATORY_FUNC(dereg_mr),
209 IB_MANDATORY_FUNC(get_port_immutable)
1da177e4
LT
210 };
211 int i;
212
6780c4fa 213 device->kverbs_provider = true;
9a6b090c 214 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
3023a1e9
KH
215 if (!*(void **) ((void *) &device->ops +
216 mandatory_table[i].offset)) {
6780c4fa
GP
217 device->kverbs_provider = false;
218 break;
1da177e4
LT
219 }
220 }
221
222 return 0;
223}
224
f8978bd9 225/*
01b67117
PP
226 * Caller must perform ib_device_put() to return the device reference count
227 * when ib_device_get_by_index() returns a valid device pointer.
f8978bd9
LR
228 */
229struct ib_device *ib_device_get_by_index(u32 index)
230{
231 struct ib_device *device;
232
921eab11 233 down_read(&devices_rwsem);
0df91bb6 234 device = xa_load(&devices, index);
01b67117 235 if (device) {
d79af724 236 if (!ib_device_try_get(device))
01b67117
PP
237 device = NULL;
238 }
921eab11 239 up_read(&devices_rwsem);
f8978bd9
LR
240 return device;
241}
242
d79af724
JG
243/**
244 * ib_device_put - Release IB device reference
245 * @device: device whose reference is to be released
246 *
247 * ib_device_put() releases the reference to the IB device to allow it to be
248 * unregistered and eventually freed.
249 */
01b67117
PP
250void ib_device_put(struct ib_device *device)
251{
252 if (refcount_dec_and_test(&device->refcount))
253 complete(&device->unreg_completion);
254}
d79af724 255EXPORT_SYMBOL(ib_device_put);
01b67117 256
1da177e4
LT
257static struct ib_device *__ib_device_get_by_name(const char *name)
258{
259 struct ib_device *device;
0df91bb6 260 unsigned long index;
1da177e4 261
0df91bb6 262 xa_for_each (&devices, index, device)
896de009 263 if (!strcmp(name, dev_name(&device->dev)))
1da177e4
LT
264 return device;
265
266 return NULL;
267}
268
6cc2c8e5
JG
269/**
270 * ib_device_get_by_name - Find an IB device by name
271 * @name: The name to look for
272 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
273 *
274 * Find and hold an ib_device by its name. The caller must call
275 * ib_device_put() on the returned pointer.
276 */
277struct ib_device *ib_device_get_by_name(const char *name,
278 enum rdma_driver_id driver_id)
279{
280 struct ib_device *device;
281
282 down_read(&devices_rwsem);
283 device = __ib_device_get_by_name(name);
284 if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
285 device->driver_id != driver_id)
286 device = NULL;
287
288 if (device) {
289 if (!ib_device_try_get(device))
290 device = NULL;
291 }
292 up_read(&devices_rwsem);
293 return device;
294}
295EXPORT_SYMBOL(ib_device_get_by_name);
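/*
 * Caller sketch (the name "mlx5_0" and the surrounding code are hypothetical):
 * look the device up, use it while the reference is held, then drop it.
 *
 *   struct ib_device *dev;
 *
 *   dev = ib_device_get_by_name("mlx5_0", RDMA_DRIVER_UNKNOWN);
 *   if (!dev)
 *           return -ENODEV;
 *   ...   dev is guaranteed to stay registered here
 *   ib_device_put(dev);
 */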
296
4e0f7b90
PP
297static int rename_compat_devs(struct ib_device *device)
298{
299 struct ib_core_device *cdev;
300 unsigned long index;
301 int ret = 0;
302
303 mutex_lock(&device->compat_devs_mutex);
304 xa_for_each (&device->compat_devs, index, cdev) {
305 ret = device_rename(&cdev->dev, dev_name(&device->dev));
306 if (ret) {
307 dev_warn(&cdev->dev,
308 "Fail to rename compatdev to new name %s\n",
309 dev_name(&device->dev));
310 break;
311 }
312 }
313 mutex_unlock(&device->compat_devs_mutex);
314 return ret;
315}
316
d21943dd
LR
317int ib_device_rename(struct ib_device *ibdev, const char *name)
318{
e3593b56 319 int ret;
d21943dd 320
921eab11 321 down_write(&devices_rwsem);
e3593b56
JG
322 if (!strcmp(name, dev_name(&ibdev->dev))) {
323 ret = 0;
324 goto out;
325 }
326
344684e6
JG
327 if (__ib_device_get_by_name(name)) {
328 ret = -EEXIST;
329 goto out;
d21943dd
LR
330 }
331
332 ret = device_rename(&ibdev->dev, name);
333 if (ret)
334 goto out;
335 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
4e0f7b90 336 ret = rename_compat_devs(ibdev);
d21943dd 337out:
921eab11 338 up_write(&devices_rwsem);
d21943dd
LR
339 return ret;
340}
341
e349f858 342static int alloc_name(struct ib_device *ibdev, const char *name)
1da177e4 343{
1da177e4 344 struct ib_device *device;
0df91bb6 345 unsigned long index;
3b88afd3
JG
346 struct ida inuse;
347 int rc;
1da177e4
LT
348 int i;
349
921eab11 350 lockdep_assert_held_exclusive(&devices_rwsem);
3b88afd3 351 ida_init(&inuse);
0df91bb6 352 xa_for_each (&devices, index, device) {
e349f858
JG
353 char buf[IB_DEVICE_NAME_MAX];
354
896de009 355 if (sscanf(dev_name(&device->dev), name, &i) != 1)
1da177e4 356 continue;
3b88afd3 357 if (i < 0 || i >= INT_MAX)
1da177e4
LT
358 continue;
359 snprintf(buf, sizeof buf, name, i);
3b88afd3
JG
360 if (strcmp(buf, dev_name(&device->dev)) != 0)
361 continue;
362
363 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
364 if (rc < 0)
365 goto out;
1da177e4
LT
366 }
367
3b88afd3
JG
368 rc = ida_alloc(&inuse, GFP_KERNEL);
369 if (rc < 0)
370 goto out;
1da177e4 371
3b88afd3
JG
372 rc = dev_set_name(&ibdev->dev, name, rc);
373out:
374 ida_destroy(&inuse);
375 return rc;
1da177e4
LT
376}
377
55aeed06
JG
378static void ib_device_release(struct device *device)
379{
380 struct ib_device *dev = container_of(device, struct ib_device, dev);
381
c2261dd7 382 free_netdevs(dev);
652432f3 383 WARN_ON(refcount_read(&dev->refcount));
d45f89d5 384 ib_cache_release_one(dev);
b34b269a 385 ib_security_release_port_pkey_list(dev);
4e0f7b90 386 xa_destroy(&dev->compat_devs);
0df91bb6 387 xa_destroy(&dev->client_data);
324e227e
JG
388 if (dev->port_data)
389 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
390 pdata[0]),
391 rcu_head);
392 kfree_rcu(dev, rcu_head);
55aeed06
JG
393}
394
395static int ib_device_uevent(struct device *device,
396 struct kobj_uevent_env *env)
397{
896de009 398 if (add_uevent_var(env, "NAME=%s", dev_name(device)))
55aeed06
JG
399 return -ENOMEM;
400
401 /*
402 * It would be nice to pass the node GUID with the event...
403 */
404
405 return 0;
406}
407
62dfa795
PP
408static const void *net_namespace(struct device *d)
409{
4e0f7b90
PP
410 struct ib_core_device *coredev =
411 container_of(d, struct ib_core_device, dev);
412
413 return read_pnet(&coredev->rdma_net);
62dfa795
PP
414}
415
55aeed06
JG
416static struct class ib_class = {
417 .name = "infiniband",
418 .dev_release = ib_device_release,
419 .dev_uevent = ib_device_uevent,
62dfa795
PP
420 .ns_type = &net_ns_type_operations,
421 .namespace = net_namespace,
55aeed06
JG
422};
423
cebe556b 424static void rdma_init_coredev(struct ib_core_device *coredev,
4e0f7b90 425 struct ib_device *dev, struct net *net)
cebe556b
PP
426{
427 /* This BUILD_BUG_ON is intended to catch layout change
428 * of union of ib_core_device and device.
429 * dev must be the first element as both ib_core and provider
430 * drivers use it. Adding anything in ib_core_device before
431 * device will break this assumption.
432 */
433 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
434 offsetof(struct ib_device, dev));
435
436 coredev->dev.class = &ib_class;
437 coredev->dev.groups = dev->groups;
438 device_initialize(&coredev->dev);
439 coredev->owner = dev;
440 INIT_LIST_HEAD(&coredev->port_list);
4e0f7b90 441 write_pnet(&coredev->rdma_net, net);
cebe556b
PP
442}
443
1da177e4 444/**
459cc69f 445 * _ib_alloc_device - allocate an IB device struct
1da177e4
LT
446 * @size:size of structure to allocate
447 *
448 * Low-level drivers should use ib_alloc_device() to allocate &struct
449 * ib_device. @size is the size of the structure to be allocated,
450 * including any private data used by the low-level driver.
451 * ib_dealloc_device() must be used to free structures allocated with
452 * ib_alloc_device().
453 */
459cc69f 454struct ib_device *_ib_alloc_device(size_t size)
1da177e4 455{
55aeed06
JG
456 struct ib_device *device;
457
458 if (WARN_ON(size < sizeof(struct ib_device)))
459 return NULL;
460
461 device = kzalloc(size, GFP_KERNEL);
462 if (!device)
463 return NULL;
464
41eda65c
LR
465 if (rdma_restrack_init(device)) {
466 kfree(device);
467 return NULL;
468 }
02d8883f 469
5f8f5499 470 device->groups[0] = &ib_dev_attr_group;
4e0f7b90 471 rdma_init_coredev(&device->coredev, device, &init_net);
55aeed06 472
55aeed06
JG
473 INIT_LIST_HEAD(&device->event_handler_list);
474 spin_lock_init(&device->event_handler_lock);
d0899892 475 mutex_init(&device->unregistration_lock);
0df91bb6
JG
476 /*
477 * client_data needs to be an allocating xarray because we don't want our
478 * mark to be destroyed if the user stores NULL in the client data.
479 */
480 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
921eab11 481 init_rwsem(&device->client_data_rwsem);
4e0f7b90
PP
482 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
483 mutex_init(&device->compat_devs_mutex);
01b67117 484 init_completion(&device->unreg_completion);
d0899892 485 INIT_WORK(&device->unregistration_work, ib_unregister_work);
1da177e4 486
55aeed06 487 return device;
1da177e4 488}
459cc69f 489EXPORT_SYMBOL(_ib_alloc_device);
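/*
 * Provider-side sketch (the "foo" names are hypothetical): drivers embed
 * struct ib_device as the first member of their private structure and go
 * through the ib_alloc_device() macro from <rdma/ib_verbs.h>, which wraps
 * _ib_alloc_device() above.
 *
 *   struct foo_dev {
 *           struct ib_device ibdev;    must stay the first member
 *           u32 private_state;
 *   };
 *
 *   struct foo_dev *fdev = ib_alloc_device(foo_dev, ibdev);
 *
 *   if (!fdev)
 *           return -ENOMEM;
 *   ...
 *   ib_dealloc_device(&fdev->ibdev);     on an error path before registering
 */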
1da177e4
LT
490
491/**
492 * ib_dealloc_device - free an IB device struct
493 * @device:structure to free
494 *
495 * Free a structure allocated with ib_alloc_device().
496 */
497void ib_dealloc_device(struct ib_device *device)
498{
d0899892
JG
499 if (device->ops.dealloc_driver)
500 device->ops.dealloc_driver(device);
501
502 /*
503 * ib_unregister_driver() requires all devices to remain in the xarray
504 * while their ops are callable. The last op we call is dealloc_driver
505 * above. This is needed to create a fence on op callbacks prior to
506 * allowing the driver module to unload.
507 */
508 down_write(&devices_rwsem);
509 if (xa_load(&devices, device->index) == device)
510 xa_erase(&devices, device->index);
511 up_write(&devices_rwsem);
512
c2261dd7
JG
513 /* Expedite releasing netdev references */
514 free_netdevs(device);
515
4e0f7b90 516 WARN_ON(!xa_empty(&device->compat_devs));
0df91bb6 517 WARN_ON(!xa_empty(&device->client_data));
652432f3 518 WARN_ON(refcount_read(&device->refcount));
0ad699c0 519 rdma_restrack_clean(device);
e155755e 520 /* Balances with device_initialize */
924b8900 521 put_device(&device->dev);
1da177e4
LT
522}
523EXPORT_SYMBOL(ib_dealloc_device);
524
921eab11
JG
525/*
526 * add_client_context() and remove_client_context() must be safe against
527 * parallel calls on the same device - registration/unregistration of both the
528 * device and client can be occurring in parallel.
529 *
530 * The routines need to be a fence, any caller must not return until the add
531 * or remove is fully completed.
532 */
533static int add_client_context(struct ib_device *device,
534 struct ib_client *client)
1da177e4 535{
921eab11 536 int ret = 0;
1da177e4 537
6780c4fa 538 if (!device->kverbs_provider && !client->no_kverbs_req)
921eab11
JG
539 return 0;
540
541 down_write(&device->client_data_rwsem);
542 /*
543 * Another caller to add_client_context got here first and has already
544 * completely initialized the context.
545 */
546 if (xa_get_mark(&device->client_data, client->client_id,
547 CLIENT_DATA_REGISTERED))
548 goto out;
549
550 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
551 GFP_KERNEL));
552 if (ret)
553 goto out;
554 downgrade_write(&device->client_data_rwsem);
555 if (client->add)
556 client->add(device);
557
558 /* Readers shall not see a client until add has been completed */
559 xa_set_mark(&device->client_data, client->client_id,
560 CLIENT_DATA_REGISTERED);
561 up_read(&device->client_data_rwsem);
562 return 0;
563
564out:
565 up_write(&device->client_data_rwsem);
566 return ret;
567}
568
569static void remove_client_context(struct ib_device *device,
570 unsigned int client_id)
571{
572 struct ib_client *client;
573 void *client_data;
6780c4fa 574
921eab11
JG
575 down_write(&device->client_data_rwsem);
576 if (!xa_get_mark(&device->client_data, client_id,
577 CLIENT_DATA_REGISTERED)) {
578 up_write(&device->client_data_rwsem);
579 return;
580 }
581 client_data = xa_load(&device->client_data, client_id);
582 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
583 client = xa_load(&clients, client_id);
584 downgrade_write(&device->client_data_rwsem);
1da177e4 585
921eab11
JG
586 /*
587 * Notice we cannot be holding any exclusive locks when calling the
588 * remove callback as the remove callback can recurse back into any
589 * public functions in this module and thus try for any locks those
590 * functions take.
591 *
592 * For this reason clients and drivers should not call the
593 * unregistration functions while holding any locks.
594 *
595 * It is tempting to drop the client_data_rwsem too, but this is required
596 * to ensure that unregister_client does not return until all clients
597 * are completely unregistered, which is required to avoid module
598 * unloading races.
599 */
600 if (client->remove)
601 client->remove(device, client_data);
602
603 xa_erase(&device->client_data, client_id);
604 up_read(&device->client_data_rwsem);
1da177e4
LT
605}
606
c2261dd7 607static int alloc_port_data(struct ib_device *device)
5eb620c8 608{
324e227e 609 struct ib_port_data_rcu *pdata_rcu;
ea1075ed 610 unsigned int port;
c2261dd7
JG
611
612 if (device->port_data)
613 return 0;
614
615 /* This can only be called once the physical port range is defined */
616 if (WARN_ON(!device->phys_port_cnt))
617 return -EINVAL;
7738613e 618
8ceb1357
JG
619 /*
620 * device->port_data is indexed directly by the port number to make
7738613e
IW
621 * access to this data as efficient as possible.
622 *
8ceb1357
JG
623 * Therefore port_data is declared as a 1 based array with potential
624 * empty slots at the beginning.
7738613e 625 */
324e227e
JG
626 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
627 rdma_end_port(device) + 1),
628 GFP_KERNEL);
629 if (!pdata_rcu)
55aeed06 630 return -ENOMEM;
324e227e
JG
631 /*
632 * The rcu_head is put in front of the port data array and the stored
633 * pointer is adjusted since we never need to see that member until
634 * kfree_rcu.
635 */
636 device->port_data = pdata_rcu->pdata;
5eb620c8 637
ea1075ed 638 rdma_for_each_port (device, port) {
8ceb1357
JG
639 struct ib_port_data *pdata = &device->port_data[port];
640
324e227e 641 pdata->ib_dev = device;
8ceb1357
JG
642 spin_lock_init(&pdata->pkey_list_lock);
643 INIT_LIST_HEAD(&pdata->pkey_list);
c2261dd7 644 spin_lock_init(&pdata->netdev_lock);
324e227e 645 INIT_HLIST_NODE(&pdata->ndev_hash_link);
c2261dd7
JG
646 }
647 return 0;
648}
649
650static int verify_immutable(const struct ib_device *dev, u8 port)
651{
652 return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
653 rdma_max_mad_size(dev, port) != 0);
654}
655
656static int setup_port_data(struct ib_device *device)
657{
658 unsigned int port;
659 int ret;
660
661 ret = alloc_port_data(device);
662 if (ret)
663 return ret;
664
665 rdma_for_each_port (device, port) {
666 struct ib_port_data *pdata = &device->port_data[port];
8ceb1357
JG
667
668 ret = device->ops.get_port_immutable(device, port,
669 &pdata->immutable);
5eb620c8 670 if (ret)
55aeed06 671 return ret;
337877a4 672
55aeed06
JG
673 if (verify_immutable(device, port))
674 return -EINVAL;
5eb620c8 675 }
55aeed06 676 return 0;
5eb620c8
YE
677}
678
9abb0d1b 679void ib_get_device_fw_str(struct ib_device *dev, char *str)
5fa76c20 680{
3023a1e9
KH
681 if (dev->ops.get_dev_fw_str)
682 dev->ops.get_dev_fw_str(dev, str);
5fa76c20
IW
683 else
684 str[0] = '\0';
685}
686EXPORT_SYMBOL(ib_get_device_fw_str);
687
8f408ab6
DJ
688static void ib_policy_change_task(struct work_struct *work)
689{
690 struct ib_device *dev;
0df91bb6 691 unsigned long index;
8f408ab6 692
921eab11 693 down_read(&devices_rwsem);
0df91bb6 694 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
ea1075ed 695 unsigned int i;
8f408ab6 696
ea1075ed 697 rdma_for_each_port (dev, i) {
8f408ab6
DJ
698 u64 sp;
699 int ret = ib_get_cached_subnet_prefix(dev,
700 i,
701 &sp);
702
703 WARN_ONCE(ret,
704 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
705 ret);
a750cfde
DJ
706 if (!ret)
707 ib_security_cache_change(dev, i, sp);
8f408ab6
DJ
708 }
709 }
921eab11 710 up_read(&devices_rwsem);
8f408ab6
DJ
711}
712
713static int ib_security_change(struct notifier_block *nb, unsigned long event,
714 void *lsm_data)
715{
716 if (event != LSM_POLICY_CHANGE)
717 return NOTIFY_DONE;
718
719 schedule_work(&ib_policy_change_work);
c66f6741 720 ib_mad_agent_security_change();
8f408ab6
DJ
721
722 return NOTIFY_OK;
723}
724
4e0f7b90
PP
725static void compatdev_release(struct device *dev)
726{
727 struct ib_core_device *cdev =
728 container_of(dev, struct ib_core_device, dev);
729
730 kfree(cdev);
731}
732
733static int add_one_compat_dev(struct ib_device *device,
734 struct rdma_dev_net *rnet)
735{
736 struct ib_core_device *cdev;
737 int ret;
738
739 /*
740 * Create and add compat device in all namespaces other than where it
741 * is currently bound to.
742 */
743 if (net_eq(read_pnet(&rnet->net),
744 read_pnet(&device->coredev.rdma_net)))
745 return 0;
746
747 /*
748 * The first of init_net() or ib_register_device() to take the
749 * compat_devs_mutex wins and gets to add the device. Others will wait
750 * for completion here.
751 */
752 mutex_lock(&device->compat_devs_mutex);
753 cdev = xa_load(&device->compat_devs, rnet->id);
754 if (cdev) {
755 ret = 0;
756 goto done;
757 }
758 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
759 if (ret)
760 goto done;
761
762 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
763 if (!cdev) {
764 ret = -ENOMEM;
765 goto cdev_err;
766 }
767
768 cdev->dev.parent = device->dev.parent;
769 rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
770 cdev->dev.release = compatdev_release;
771 dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
772
773 ret = device_add(&cdev->dev);
774 if (ret)
775 goto add_err;
776
777 ret = xa_err(xa_store(&device->compat_devs, rnet->id,
778 cdev, GFP_KERNEL));
779 if (ret)
780 goto insert_err;
781
782 mutex_unlock(&device->compat_devs_mutex);
783 return 0;
784
785insert_err:
786 device_del(&cdev->dev);
787add_err:
788 put_device(&cdev->dev);
789cdev_err:
790 xa_release(&device->compat_devs, rnet->id);
791done:
792 mutex_unlock(&device->compat_devs_mutex);
793 return ret;
794}
795
796static void remove_one_compat_dev(struct ib_device *device, u32 id)
797{
798 struct ib_core_device *cdev;
799
800 mutex_lock(&device->compat_devs_mutex);
801 cdev = xa_erase(&device->compat_devs, id);
802 mutex_unlock(&device->compat_devs_mutex);
803 if (cdev) {
804 device_del(&cdev->dev);
805 put_device(&cdev->dev);
806 }
807}
808
809static void remove_compat_devs(struct ib_device *device)
810{
811 struct ib_core_device *cdev;
812 unsigned long index;
813
814 xa_for_each (&device->compat_devs, index, cdev)
815 remove_one_compat_dev(device, index);
816}
817
818static int add_compat_devs(struct ib_device *device)
819{
820 struct rdma_dev_net *rnet;
821 unsigned long index;
822 int ret = 0;
823
824 down_read(&rdma_nets_rwsem);
825 xa_for_each (&rdma_nets, index, rnet) {
826 ret = add_one_compat_dev(device, rnet);
827 if (ret)
828 break;
829 }
830 up_read(&rdma_nets_rwsem);
831 return ret;
832}
833
834static void rdma_dev_exit_net(struct net *net)
835{
836 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
837 struct ib_device *dev;
838 unsigned long index;
839 int ret;
840
841 down_write(&rdma_nets_rwsem);
842 /*
843 * Prevent the ID from being re-used and hide the id from xa_for_each.
844 */
845 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
846 WARN_ON(ret);
847 up_write(&rdma_nets_rwsem);
848
849 down_read(&devices_rwsem);
850 xa_for_each (&devices, index, dev) {
851 get_device(&dev->dev);
852 /*
853 * Release the devices_rwsem so that the potentially blocking
854 * device_del doesn't hold the devices_rwsem for too long.
855 */
856 up_read(&devices_rwsem);
857
858 remove_one_compat_dev(dev, rnet->id);
859
860 put_device(&dev->dev);
861 down_read(&devices_rwsem);
862 }
863 up_read(&devices_rwsem);
864
865 xa_erase(&rdma_nets, rnet->id);
866}
867
868static __net_init int rdma_dev_init_net(struct net *net)
869{
870 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
871 unsigned long index;
872 struct ib_device *dev;
873 int ret;
874
875 /* No need to create any compat devices in default init_net. */
876 if (net_eq(net, &init_net))
877 return 0;
878
879 write_pnet(&rnet->net, net);
880
881 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
882 if (ret)
883 return ret;
884
885 down_read(&devices_rwsem);
886 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
887 ret = add_one_compat_dev(dev, rnet);
888 if (ret)
889 break;
890 }
891 up_read(&devices_rwsem);
892
893 if (ret)
894 rdma_dev_exit_net(net);
895
896 return ret;
897}
898
0df91bb6 899/*
d0899892
JG
900 * Assign the unique string device name and the unique device index. This is
901 * undone by ib_dealloc_device.
ecc82c53 902 */
0df91bb6 903static int assign_name(struct ib_device *device, const char *name)
ecc82c53 904{
0df91bb6
JG
905 static u32 last_id;
906 int ret;
ecc82c53 907
921eab11 908 down_write(&devices_rwsem);
0df91bb6
JG
909 /* Assign a unique name to the device */
910 if (strchr(name, '%'))
911 ret = alloc_name(device, name);
912 else
913 ret = dev_set_name(&device->dev, name);
914 if (ret)
915 goto out;
916
917 if (__ib_device_get_by_name(dev_name(&device->dev))) {
918 ret = -ENFILE;
919 goto out;
920 }
921 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
ecc82c53 922
ea295481
LT
923 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
924 &last_id, GFP_KERNEL);
925 if (ret > 0)
926 ret = 0;
921eab11 927
0df91bb6 928out:
921eab11 929 up_write(&devices_rwsem);
0df91bb6
JG
930 return ret;
931}
932
548cb4fb 933static void setup_dma_device(struct ib_device *device)
1da177e4 934{
99db9494
BVA
935 struct device *parent = device->dev.parent;
936
0957c29f
BVA
937 WARN_ON_ONCE(device->dma_device);
938 if (device->dev.dma_ops) {
939 /*
940 * The caller provided custom DMA operations. Copy the
941 * DMA-related fields that are used by e.g. dma_alloc_coherent()
942 * into device->dev.
943 */
944 device->dma_device = &device->dev;
02ee9da3
BVA
945 if (!device->dev.dma_mask) {
946 if (parent)
947 device->dev.dma_mask = parent->dma_mask;
948 else
949 WARN_ON_ONCE(true);
950 }
951 if (!device->dev.coherent_dma_mask) {
952 if (parent)
953 device->dev.coherent_dma_mask =
954 parent->coherent_dma_mask;
955 else
956 WARN_ON_ONCE(true);
957 }
0957c29f
BVA
958 } else {
959 /*
960 * The caller did not provide custom DMA operations. Use the
961 * DMA mapping operations of the parent device.
962 */
02ee9da3 963 WARN_ON_ONCE(!parent);
0957c29f
BVA
964 device->dma_device = parent;
965 }
548cb4fb 966}
1da177e4 967
921eab11
JG
968/*
969 * setup_device() allocates memory and sets up data that requires calling the
970 * device ops, this is the only reason these actions are not done during
971 * ib_alloc_device. It is undone by ib_dealloc_device().
972 */
548cb4fb
PP
973static int setup_device(struct ib_device *device)
974{
975 struct ib_udata uhw = {.outlen = 0, .inlen = 0};
976 int ret;
1da177e4 977
921eab11
JG
978 setup_dma_device(device);
979
548cb4fb
PP
980 ret = ib_device_check_mandatory(device);
981 if (ret)
982 return ret;
1da177e4 983
8ceb1357 984 ret = setup_port_data(device);
5eb620c8 985 if (ret) {
8ceb1357 986 dev_warn(&device->dev, "Couldn't create per-port data\n");
548cb4fb
PP
987 return ret;
988 }
989
990 memset(&device->attrs, 0, sizeof(device->attrs));
3023a1e9 991 ret = device->ops.query_device(device, &device->attrs, &uhw);
548cb4fb
PP
992 if (ret) {
993 dev_warn(&device->dev,
994 "Couldn't query the device attributes\n");
d45f89d5 995 return ret;
5eb620c8
YE
996 }
997
d45f89d5 998 return 0;
548cb4fb
PP
999}
1000
921eab11
JG
1001static void disable_device(struct ib_device *device)
1002{
1003 struct ib_client *client;
1004
1005 WARN_ON(!refcount_read(&device->refcount));
1006
1007 down_write(&devices_rwsem);
1008 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
1009 up_write(&devices_rwsem);
1010
1011 down_read(&clients_rwsem);
1012 list_for_each_entry_reverse(client, &client_list, list)
1013 remove_client_context(device, client->client_id);
1014 up_read(&clients_rwsem);
1015
1016 /* Pairs with refcount_set in enable_device */
1017 ib_device_put(device);
1018 wait_for_completion(&device->unreg_completion);
c2261dd7 1019
4e0f7b90
PP
1020 /*
1021 * compat devices must be removed after device refcount drops to zero.
1022 * Otherwise init_net() may add more compatdevs after removing compat
1023 * devices and before device is disabled.
1024 */
1025 remove_compat_devs(device);
1026
c2261dd7
JG
1027 /* Expedite removing unregistered pointers from the hash table */
1028 free_netdevs(device);
921eab11
JG
1029}
1030
1031/*
1032 * An enabled device is visible to all clients and to all the public facing
d0899892
JG
1033 * APIs that return a device pointer. This always returns with a new get, even
1034 * if it fails.
921eab11 1035 */
d0899892 1036static int enable_device_and_get(struct ib_device *device)
921eab11
JG
1037{
1038 struct ib_client *client;
1039 unsigned long index;
d0899892 1040 int ret = 0;
921eab11 1041
d0899892
JG
1042 /*
1043 * One ref belongs to the xa and the other belongs to this
1044 * thread. This is needed to guard against parallel unregistration.
1045 */
1046 refcount_set(&device->refcount, 2);
921eab11
JG
1047 down_write(&devices_rwsem);
1048 xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
d0899892
JG
1049
1050 /*
1051 * By using downgrade_write() we ensure that no other thread can clear
1052 * DEVICE_REGISTERED while we are completing the client setup.
1053 */
1054 downgrade_write(&devices_rwsem);
921eab11 1055
ca22354b
JG
1056 if (device->ops.enable_driver) {
1057 ret = device->ops.enable_driver(device);
1058 if (ret)
1059 goto out;
1060 }
1061
921eab11
JG
1062 down_read(&clients_rwsem);
1063 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1064 ret = add_client_context(device, client);
d0899892
JG
1065 if (ret)
1066 break;
921eab11
JG
1067 }
1068 up_read(&clients_rwsem);
4e0f7b90
PP
1069 if (!ret)
1070 ret = add_compat_devs(device);
ca22354b 1071out:
d0899892
JG
1072 up_read(&devices_rwsem);
1073 return ret;
921eab11
JG
1074}
1075
548cb4fb
PP
1076/**
1077 * ib_register_device - Register an IB device with IB core
1078 * @device:Device to register
1079 *
1080 * Low-level drivers use ib_register_device() to register their
1081 * devices with the IB core. All registered clients will receive a
1082 * callback for each device that is added. @device must be allocated
1083 * with ib_alloc_device().
d0899892
JG
1084 *
1085 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
1086 * asynchronously then the device pointer may become freed as soon as this
1087 * function returns.
548cb4fb 1088 */
ea4baf7f 1089int ib_register_device(struct ib_device *device, const char *name)
548cb4fb
PP
1090{
1091 int ret;
548cb4fb 1092
0df91bb6
JG
1093 ret = assign_name(device, name);
1094 if (ret)
921eab11 1095 return ret;
548cb4fb
PP
1096
1097 ret = setup_device(device);
1098 if (ret)
d0899892 1099 return ret;
03db3a2d 1100
d45f89d5
JG
1101 ret = ib_cache_setup_one(device);
1102 if (ret) {
1103 dev_warn(&device->dev,
1104 "Couldn't set up InfiniBand P_Key/GID cache\n");
d0899892 1105 return ret;
d45f89d5
JG
1106 }
1107
7527a7b1 1108 ib_device_register_rdmacg(device);
3e153a93 1109
5f8f5499
PP
1110 ret = device_add(&device->dev);
1111 if (ret)
1112 goto cg_cleanup;
1113
ea4baf7f 1114 ret = ib_device_register_sysfs(device);
1da177e4 1115 if (ret) {
43c7c851
JG
1116 dev_warn(&device->dev,
1117 "Couldn't register device with driver model\n");
5f8f5499 1118 goto dev_cleanup;
1da177e4
LT
1119 }
1120
d0899892
JG
1121 ret = enable_device_and_get(device);
1122 if (ret) {
1123 void (*dealloc_fn)(struct ib_device *);
1124
1125 /*
1126 * If we hit this error flow then we don't want to
1127 * automatically dealloc the device since the caller is
1128 * expected to call ib_dealloc_device() after
1129 * ib_register_device() fails. This is tricky due to the
1130 * possibility for a parallel unregistration along with this
1131 * error flow. Since we have a refcount here we know any
1132 * parallel flow is stopped in disable_device and will see the
1133 * NULL pointers, causing the responsibility to
1134 * ib_dealloc_device() to revert back to this thread.
1135 */
1136 dealloc_fn = device->ops.dealloc_driver;
1137 device->ops.dealloc_driver = NULL;
1138 ib_device_put(device);
1139 __ib_unregister_device(device);
1140 device->ops.dealloc_driver = dealloc_fn;
1141 return ret;
1142 }
1143 ib_device_put(device);
1da177e4 1144
4be3a4fa
PP
1145 return 0;
1146
5f8f5499
PP
1147dev_cleanup:
1148 device_del(&device->dev);
2fb4f4ea
PP
1149cg_cleanup:
1150 ib_device_unregister_rdmacg(device);
d45f89d5 1151 ib_cache_cleanup_one(device);
1da177e4
LT
1152 return ret;
1153}
1154EXPORT_SYMBOL(ib_register_device);
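/*
 * Typical provider call-site sketch (hypothetical "foo" driver): a '%d' in
 * the name asks the core to pick a unique index, and on failure the caller
 * still owns the structure and must release it with ib_dealloc_device().
 *
 *   ret = ib_register_device(&fdev->ibdev, "foo%d");
 *   if (ret) {
 *           ib_dealloc_device(&fdev->ibdev);
 *           return ret;
 *   }
 */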
1155
d0899892
JG
1156/* Callers must hold a get on the device. */
1157static void __ib_unregister_device(struct ib_device *ib_dev)
1158{
1159 /*
1160 * We have a registration lock so that all the calls to unregister are
1161 * fully fenced, once any unregister returns the device is truly
1162 * unregistered even if multiple callers are unregistering it at the
1163 * same time. This also interacts with the registration flow and
1164 * provides sane semantics if register and unregister are racing.
1165 */
1166 mutex_lock(&ib_dev->unregistration_lock);
1167 if (!refcount_read(&ib_dev->refcount))
1168 goto out;
1169
1170 disable_device(ib_dev);
1171 ib_device_unregister_sysfs(ib_dev);
1172 device_del(&ib_dev->dev);
1173 ib_device_unregister_rdmacg(ib_dev);
1174 ib_cache_cleanup_one(ib_dev);
1175
1176 /*
1177 * Drivers using the new flow may not call ib_dealloc_device except
1178 * in error unwind prior to registration success.
1179 */
1180 if (ib_dev->ops.dealloc_driver) {
1181 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
1182 ib_dealloc_device(ib_dev);
1183 }
1184out:
1185 mutex_unlock(&ib_dev->unregistration_lock);
1186}
1187
1da177e4
LT
1188/**
1189 * ib_unregister_device - Unregister an IB device
d0899892 1190 * @ib_dev: The device to unregister
1da177e4
LT
1191 *
1192 * Unregister an IB device. All clients will receive a remove callback.
d0899892
JG
1193 *
1194 * Callers should call this routine only once, and protect against races with
1195 * registration. Typically it should only be called as part of a remove
1196 * callback in an implementation of driver core's struct device_driver and
1197 * related.
1198 *
1199 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
1200 * this function.
1da177e4 1201 */
d0899892 1202void ib_unregister_device(struct ib_device *ib_dev)
1da177e4 1203{
d0899892
JG
1204 get_device(&ib_dev->dev);
1205 __ib_unregister_device(ib_dev);
1206 put_device(&ib_dev->dev);
1da177e4
LT
1207}
1208EXPORT_SYMBOL(ib_unregister_device);
1209
d0899892
JG
1210/**
1211 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
1212 * @ib_dev: The device to unregister
1213 *
1214 * This is the same as ib_unregister_device(), except it includes an internal
1215 * ib_device_put() that should match a 'get' obtained by the caller.
1216 *
1217 * It is safe to call this routine concurrently from multiple threads while
1218 * holding the 'get'. When the function returns the device is fully
1219 * unregistered.
1220 *
1221 * Drivers using this flow MUST use the driver_unregister callback to clean up
1222 * their resources associated with the device and dealloc it.
1223 */
1224void ib_unregister_device_and_put(struct ib_device *ib_dev)
1225{
1226 WARN_ON(!ib_dev->ops.dealloc_driver);
1227 get_device(&ib_dev->dev);
1228 ib_device_put(ib_dev);
1229 __ib_unregister_device(ib_dev);
1230 put_device(&ib_dev->dev);
1231}
1232EXPORT_SYMBOL(ib_unregister_device_and_put);
1233
1234/**
1235 * ib_unregister_driver - Unregister all IB devices for a driver
1236 * @driver_id: The driver to unregister
1237 *
1238 * This implements a fence for device unregistration. It only returns once all
1239 * devices associated with the driver_id have fully completed their
1240 * unregistration and returned from ib_unregister_device*().
1241 *
1242 * If devices are not yet unregistered, it goes ahead and starts unregistering
1243 * them.
1244 *
1245 * This does not block creation of new devices with the given driver_id, that
1246 * is the responsibility of the caller.
1247 */
1248void ib_unregister_driver(enum rdma_driver_id driver_id)
1249{
1250 struct ib_device *ib_dev;
1251 unsigned long index;
1252
1253 down_read(&devices_rwsem);
1254 xa_for_each (&devices, index, ib_dev) {
1255 if (ib_dev->driver_id != driver_id)
1256 continue;
1257
1258 get_device(&ib_dev->dev);
1259 up_read(&devices_rwsem);
1260
1261 WARN_ON(!ib_dev->ops.dealloc_driver);
1262 __ib_unregister_device(ib_dev);
1263
1264 put_device(&ib_dev->dev);
1265 down_read(&devices_rwsem);
1266 }
1267 up_read(&devices_rwsem);
1268}
1269EXPORT_SYMBOL(ib_unregister_driver);
1270
1271static void ib_unregister_work(struct work_struct *work)
1272{
1273 struct ib_device *ib_dev =
1274 container_of(work, struct ib_device, unregistration_work);
1275
1276 __ib_unregister_device(ib_dev);
1277 put_device(&ib_dev->dev);
1278}
1279
1280/**
1281 * ib_unregister_device_queued - Unregister a device using a work queue
1282 * @ib_dev: The device to unregister
1283 *
1284 * This schedules an asynchronous unregistration using a WQ for the device. A
1285 * driver should use this to avoid holding locks while doing unregistration,
1286 * such as holding the RTNL lock.
1287 *
1288 * Drivers using this API must use ib_unregister_driver before module unload
1289 * to ensure that all scheduled unregistrations have completed.
1290 */
1291void ib_unregister_device_queued(struct ib_device *ib_dev)
1292{
1293 WARN_ON(!refcount_read(&ib_dev->refcount));
1294 WARN_ON(!ib_dev->ops.dealloc_driver);
1295 get_device(&ib_dev->dev);
1296 if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
1297 put_device(&ib_dev->dev);
1298}
1299EXPORT_SYMBOL(ib_unregister_device_queued);
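/*
 * Sketch of the queued flow (hypothetical "foo" driver; substitute the
 * driver's own enum rdma_driver_id value for RDMA_DRIVER_RXE):
 *
 *   From a context that must not block, e.g. while holding the RTNL lock:
 *
 *       ib_unregister_device_queued(&fdev->ibdev);
 *
 *   And in the module_exit() handler, to fence any queued work:
 *
 *       ib_unregister_driver(RDMA_DRIVER_RXE);
 */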
1300
4e0f7b90
PP
1301static struct pernet_operations rdma_dev_net_ops = {
1302 .init = rdma_dev_init_net,
1303 .exit = rdma_dev_exit_net,
1304 .id = &rdma_dev_net_id,
1305 .size = sizeof(struct rdma_dev_net),
1306};
1307
e59178d8
JG
1308static int assign_client_id(struct ib_client *client)
1309{
1310 int ret;
1311
921eab11 1312 down_write(&clients_rwsem);
e59178d8
JG
1313 /*
1314 * The add/remove callbacks must be called in FIFO/LIFO order. To
1315 * achieve this we assign client_ids so they are sorted in
1316 * registration order, and retain a linked list we can reverse iterate
1317 * to get the LIFO order. The extra linked list can go away if xarray
1318 * learns to reverse iterate.
1319 */
ea295481 1320 if (list_empty(&client_list)) {
e59178d8 1321 client->client_id = 0;
ea295481
LT
1322 } else {
1323 struct ib_client *last;
1324
1325 last = list_last_entry(&client_list, struct ib_client, list);
1326 client->client_id = last->client_id + 1;
4512acd0 1327 }
ea295481 1328 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
e59178d8
JG
1329 if (ret)
1330 goto out;
1331
921eab11
JG
1332 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
1333 list_add_tail(&client->list, &client_list);
1334
e59178d8 1335out:
921eab11 1336 up_write(&clients_rwsem);
e59178d8
JG
1337 return ret;
1338}
1339
1da177e4
LT
1340/**
1341 * ib_register_client - Register an IB client
1342 * @client:Client to register
1343 *
1344 * Upper level users of the IB drivers can use ib_register_client() to
1345 * register callbacks for IB device addition and removal. When an IB
1346 * device is added, each registered client's add method will be called
1347 * (in the order the clients were registered), and when a device is
1348 * removed, each client's remove method will be called (in the reverse
1349 * order that clients were registered). In addition, when
1350 * ib_register_client() is called, the client will receive an add
1351 * callback for all devices already registered.
1352 */
1353int ib_register_client(struct ib_client *client)
1354{
1355 struct ib_device *device;
0df91bb6 1356 unsigned long index;
e59178d8 1357 int ret;
1da177e4 1358
e59178d8 1359 ret = assign_client_id(client);
921eab11 1360 if (ret)
e59178d8 1361 return ret;
1da177e4 1362
921eab11
JG
1363 down_read(&devices_rwsem);
1364 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
1365 ret = add_client_context(device, client);
1366 if (ret) {
1367 up_read(&devices_rwsem);
1368 ib_unregister_client(client);
1369 return ret;
1370 }
1371 }
1372 up_read(&devices_rwsem);
1da177e4
LT
1373 return 0;
1374}
1375EXPORT_SYMBOL(ib_register_client);
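/*
 * Minimal client sketch (all "foo" names are hypothetical): add()/remove()
 * run once per registered device, and per-device state is attached with
 * ib_set_client_data().
 *
 *   static struct ib_client foo_client;
 *
 *   static void foo_add_one(struct ib_device *device)
 *   {
 *           struct foo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);
 *
 *           if (st)
 *                   ib_set_client_data(device, &foo_client, st);
 *   }
 *
 *   static void foo_remove_one(struct ib_device *device, void *client_data)
 *   {
 *           kfree(client_data);
 *   }
 *
 *   static struct ib_client foo_client = {
 *           .name   = "foo",
 *           .add    = foo_add_one,
 *           .remove = foo_remove_one,
 *   };
 *
 *   ret = ib_register_client(&foo_client);
 */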
1376
1377/**
1378 * ib_unregister_client - Unregister an IB client
1379 * @client:Client to unregister
1380 *
1381 * Upper level users use ib_unregister_client() to remove their client
1382 * registration. When ib_unregister_client() is called, the client
1383 * will receive a remove callback for each IB device still registered.
921eab11
JG
1384 *
1385 * This is a full fence, once it returns no client callbacks will be called,
1386 * or are running in another thread.
1da177e4
LT
1387 */
1388void ib_unregister_client(struct ib_client *client)
1389{
1da177e4 1390 struct ib_device *device;
0df91bb6 1391 unsigned long index;
1da177e4 1392
921eab11 1393 down_write(&clients_rwsem);
e59178d8 1394 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
921eab11
JG
1395 up_write(&clients_rwsem);
1396 /*
1397 * Every device still known must be serialized to make sure we are
1398 * done with the client callbacks before we return.
1399 */
1400 down_read(&devices_rwsem);
1401 xa_for_each (&devices, index, device)
1402 remove_client_context(device, client->client_id);
1403 up_read(&devices_rwsem);
1da177e4 1404
921eab11 1405 down_write(&clients_rwsem);
e59178d8
JG
1406 list_del(&client->list);
1407 xa_erase(&clients, client->client_id);
921eab11 1408 up_write(&clients_rwsem);
1da177e4
LT
1409}
1410EXPORT_SYMBOL(ib_unregister_client);
1411
1da177e4 1412/**
9cd330d3 1413 * ib_set_client_data - Set IB client context
1da177e4
LT
1414 * @device:Device to set context for
1415 * @client:Client to set context for
1416 * @data:Context to set
1417 *
0df91bb6
JG
1418 * ib_set_client_data() sets client context data that can be retrieved with
1419 * ib_get_client_data(). This can only be called while the client is
1420 * registered to the device; once the ib_client remove() callback returns this
1421 * cannot be called.
1da177e4
LT
1422 */
1423void ib_set_client_data(struct ib_device *device, struct ib_client *client,
1424 void *data)
1425{
0df91bb6 1426 void *rc;
1da177e4 1427
0df91bb6
JG
1428 if (WARN_ON(IS_ERR(data)))
1429 data = NULL;
1da177e4 1430
0df91bb6
JG
1431 rc = xa_store(&device->client_data, client->client_id, data,
1432 GFP_KERNEL);
1433 WARN_ON(xa_is_err(rc));
1da177e4
LT
1434}
1435EXPORT_SYMBOL(ib_set_client_data);
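/*
 * Retrieval-side sketch (same hypothetical "foo" client as above): only
 * valid between the client's add() and remove() callbacks.
 *
 *   struct foo_state *st = ib_get_client_data(device, &foo_client);
 */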
1436
1437/**
1438 * ib_register_event_handler - Register an IB event handler
1439 * @event_handler:Handler to register
1440 *
1441 * ib_register_event_handler() registers an event handler that will be
1442 * called back when asynchronous IB events occur (as defined in
1443 * chapter 11 of the InfiniBand Architecture Specification). This
1444 * callback may occur in interrupt context.
1445 */
dcc9881e 1446void ib_register_event_handler(struct ib_event_handler *event_handler)
1da177e4
LT
1447{
1448 unsigned long flags;
1449
1450 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1451 list_add_tail(&event_handler->list,
1452 &event_handler->device->event_handler_list);
1453 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1da177e4
LT
1454}
1455EXPORT_SYMBOL(ib_register_event_handler);
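/*
 * Usage sketch (handler and field names are hypothetical):
 * INIT_IB_EVENT_HANDLER() from <rdma/ib_verbs.h> fills in the device and
 * callback before registration. The callback may run in interrupt context.
 *
 *   static void foo_event(struct ib_event_handler *handler,
 *                         struct ib_event *event)
 *   {
 *           if (event->event == IB_EVENT_PORT_ERR)
 *                   ...;
 *   }
 *
 *   INIT_IB_EVENT_HANDLER(&priv->event_handler, device, foo_event);
 *   ib_register_event_handler(&priv->event_handler);
 */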
1456
1457/**
1458 * ib_unregister_event_handler - Unregister an event handler
1459 * @event_handler:Handler to unregister
1460 *
1461 * Unregister an event handler registered with
1462 * ib_register_event_handler().
1463 */
dcc9881e 1464void ib_unregister_event_handler(struct ib_event_handler *event_handler)
1da177e4
LT
1465{
1466 unsigned long flags;
1467
1468 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1469 list_del(&event_handler->list);
1470 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1da177e4
LT
1471}
1472EXPORT_SYMBOL(ib_unregister_event_handler);
1473
1474/**
1475 * ib_dispatch_event - Dispatch an asynchronous event
1476 * @event:Event to dispatch
1477 *
1478 * Low-level drivers must call ib_dispatch_event() to dispatch the
1479 * event to all registered event handlers when an asynchronous event
1480 * occurs.
1481 */
1482void ib_dispatch_event(struct ib_event *event)
1483{
1484 unsigned long flags;
1485 struct ib_event_handler *handler;
1486
1487 spin_lock_irqsave(&event->device->event_handler_lock, flags);
1488
1489 list_for_each_entry(handler, &event->device->event_handler_list, list)
1490 handler->handler(handler, event);
1491
1492 spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
1493}
1494EXPORT_SYMBOL(ib_dispatch_event);
1495
1da177e4
LT
1496/**
1497 * ib_query_port - Query IB port attributes
1498 * @device:Device to query
1499 * @port_num:Port number to query
1500 * @port_attr:Port attributes
1501 *
1502 * ib_query_port() returns the attributes of a port through the
1503 * @port_attr pointer.
1504 */
1505int ib_query_port(struct ib_device *device,
1506 u8 port_num,
1507 struct ib_port_attr *port_attr)
1508{
fad61ad4
EC
1509 union ib_gid gid;
1510 int err;
1511
24dc831b 1512 if (!rdma_is_port_valid(device, port_num))
116c0074
RD
1513 return -EINVAL;
1514
fad61ad4 1515 memset(port_attr, 0, sizeof(*port_attr));
3023a1e9 1516 err = device->ops.query_port(device, port_num, port_attr);
fad61ad4
EC
1517 if (err || port_attr->subnet_prefix)
1518 return err;
1519
d7012467
EC
1520 if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
1521 return 0;
1522
3023a1e9 1523 err = device->ops.query_gid(device, port_num, 0, &gid);
fad61ad4
EC
1524 if (err)
1525 return err;
1526
1527 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
1528 return 0;
1da177e4
LT
1529}
1530EXPORT_SYMBOL(ib_query_port);
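/*
 * Caller sketch (a minimal example, error handling left to the caller):
 * query every valid port of a device into a stack attribute structure.
 *
 *   struct ib_port_attr attr;
 *   unsigned int port;
 *
 *   rdma_for_each_port (device, port) {
 *           if (ib_query_port(device, port, &attr))
 *                   continue;
 *           pr_info("port %u state %d\n", port, attr.state);
 *   }
 */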
1531
324e227e
JG
1532static void add_ndev_hash(struct ib_port_data *pdata)
1533{
1534 unsigned long flags;
1535
1536 might_sleep();
1537
1538 spin_lock_irqsave(&ndev_hash_lock, flags);
1539 if (hash_hashed(&pdata->ndev_hash_link)) {
1540 hash_del_rcu(&pdata->ndev_hash_link);
1541 spin_unlock_irqrestore(&ndev_hash_lock, flags);
1542 /*
1543 * We cannot do hash_add_rcu after a hash_del_rcu until the
1544 * grace period has elapsed.
1545 */
1546 synchronize_rcu();
1547 spin_lock_irqsave(&ndev_hash_lock, flags);
1548 }
1549 if (pdata->netdev)
1550 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
1551 (uintptr_t)pdata->netdev);
1552 spin_unlock_irqrestore(&ndev_hash_lock, flags);
1553}
1554
c2261dd7
JG
1555/**
1556 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
1557 * @ib_dev: Device to modify
1558 * @ndev: net_device to affiliate, may be NULL
1559 * @port: IB port the net_device is connected to
1560 *
1561 * Drivers should use this to link the ib_device to a netdev so the netdev
1562 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
1563 * affiliated with any port.
1564 *
1565 * The caller must ensure that the given ndev is not unregistered or
1566 * unregistering, and that either the ib_device is unregistered or
1567 * ib_device_set_netdev() is called with NULL when the ndev sends a
1568 * NETDEV_UNREGISTER event.
1569 */
1570int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
1571 unsigned int port)
1572{
1573 struct net_device *old_ndev;
1574 struct ib_port_data *pdata;
1575 unsigned long flags;
1576 int ret;
1577
1578 /*
1579 * Drivers wish to call this before ib_register_driver, so we have to
1580 * setup the port data early.
1581 */
1582 ret = alloc_port_data(ib_dev);
1583 if (ret)
1584 return ret;
1585
1586 if (!rdma_is_port_valid(ib_dev, port))
1587 return -EINVAL;
1588
1589 pdata = &ib_dev->port_data[port];
1590 spin_lock_irqsave(&pdata->netdev_lock, flags);
324e227e
JG
1591 old_ndev = rcu_dereference_protected(
1592 pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
1593 if (old_ndev == ndev) {
c2261dd7
JG
1594 spin_unlock_irqrestore(&pdata->netdev_lock, flags);
1595 return 0;
1596 }
c2261dd7
JG
1597
1598 if (ndev)
1599 dev_hold(ndev);
324e227e 1600 rcu_assign_pointer(pdata->netdev, ndev);
c2261dd7
JG
1601 spin_unlock_irqrestore(&pdata->netdev_lock, flags);
1602
324e227e 1603 add_ndev_hash(pdata);
c2261dd7
JG
1604 if (old_ndev)
1605 dev_put(old_ndev);
1606
1607 return 0;
1608}
1609EXPORT_SYMBOL(ib_device_set_netdev);
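/*
 * Driver-side sketch (hypothetical "fdev" fields): link the netdev when the
 * port is set up and clear the association again on NETDEV_UNREGISTER.
 *
 *   ib_device_set_netdev(&fdev->ibdev, fdev->netdev, 1);
 *   ...
 *   ib_device_set_netdev(&fdev->ibdev, NULL, 1);
 */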
1610
1611static void free_netdevs(struct ib_device *ib_dev)
1612{
1613 unsigned long flags;
1614 unsigned int port;
1615
1616 rdma_for_each_port (ib_dev, port) {
1617 struct ib_port_data *pdata = &ib_dev->port_data[port];
324e227e 1618 struct net_device *ndev;
c2261dd7
JG
1619
1620 spin_lock_irqsave(&pdata->netdev_lock, flags);
324e227e
JG
1621 ndev = rcu_dereference_protected(
1622 pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
1623 if (ndev) {
1624 spin_lock(&ndev_hash_lock);
1625 hash_del_rcu(&pdata->ndev_hash_link);
1626 spin_unlock(&ndev_hash_lock);
1627
1628 /*
1629 * If this is the last dev_put there is still a
1630 * synchronize_rcu before the netdev is kfreed, so we
1631 * can continue to rely on unlocked pointer
1632 * comparisons after the put
1633 */
1634 rcu_assign_pointer(pdata->netdev, NULL);
1635 dev_put(ndev);
c2261dd7
JG
1636 }
1637 spin_unlock_irqrestore(&pdata->netdev_lock, flags);
1638 }
1639}
1640
1641struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
1642 unsigned int port)
1643{
1644 struct ib_port_data *pdata;
1645 struct net_device *res;
1646
1647 if (!rdma_is_port_valid(ib_dev, port))
1648 return NULL;
1649
1650 pdata = &ib_dev->port_data[port];
1651
1652 /*
1653 * New drivers should use ib_device_set_netdev() not the legacy
1654 * get_netdev().
1655 */
1656 if (ib_dev->ops.get_netdev)
1657 res = ib_dev->ops.get_netdev(ib_dev, port);
1658 else {
1659 spin_lock(&pdata->netdev_lock);
324e227e
JG
1660 res = rcu_dereference_protected(
1661 pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
c2261dd7
JG
1662 if (res)
1663 dev_hold(res);
1664 spin_unlock(&pdata->netdev_lock);
1665 }
1666
1667 /*
1668 * If we are starting to unregister, expedite things by preventing
1669 * propagation of an unregistering netdev.
1670 */
1671 if (res && res->reg_state != NETREG_REGISTERED) {
1672 dev_put(res);
1673 return NULL;
1674 }
1675
1676 return res;
1677}
1678
324e227e
JG
1679/**
1680 * ib_device_get_by_netdev - Find an IB device associated with a netdev
1681 * @ndev: netdev to locate
1682 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
1683 *
1684 * Find and hold an ib_device that is associated with a netdev via
1685 * ib_device_set_netdev(). The caller must call ib_device_put() on the
1686 * returned pointer.
1687 */
1688struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
1689 enum rdma_driver_id driver_id)
1690{
1691 struct ib_device *res = NULL;
1692 struct ib_port_data *cur;
1693
1694 rcu_read_lock();
1695 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
1696 (uintptr_t)ndev) {
1697 if (rcu_access_pointer(cur->netdev) == ndev &&
1698 (driver_id == RDMA_DRIVER_UNKNOWN ||
1699 cur->ib_dev->driver_id == driver_id) &&
1700 ib_device_try_get(cur->ib_dev)) {
1701 res = cur->ib_dev;
1702 break;
1703 }
1704 }
1705 rcu_read_unlock();
1706
1707 return res;
1708}
1709EXPORT_SYMBOL(ib_device_get_by_netdev);
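/*
 * Lookup sketch (illustrative only): resolve a netdev back to its ib_device
 * and drop the reference when done (RDMA_DRIVER_UNKNOWN matches any driver).
 *
 *   struct ib_device *ibdev;
 *
 *   ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *   if (ibdev) {
 *           ...
 *           ib_device_put(ibdev);
 *   }
 */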
1710
03db3a2d
MB
1711/**
1712 * ib_enum_roce_netdev - enumerate all RoCE ports
1713 * @ib_dev : IB device we want to query
1714 * @filter: Should we call the callback?
1715 * @filter_cookie: Cookie passed to filter
1716 * @cb: Callback to call for each found RoCE port
1717 * @cookie: Cookie passed back to the callback
1718 *
1719 * Enumerates all of the physical RoCE ports of ib_dev
1720 * which are related to a netdevice and calls callback() on each
1721 * device for which the filter() function returns non-zero.
1722 */
1723void ib_enum_roce_netdev(struct ib_device *ib_dev,
1724 roce_netdev_filter filter,
1725 void *filter_cookie,
1726 roce_netdev_callback cb,
1727 void *cookie)
1728{
ea1075ed 1729 unsigned int port;
03db3a2d 1730
ea1075ed 1731 rdma_for_each_port (ib_dev, port)
03db3a2d 1732 if (rdma_protocol_roce(ib_dev, port)) {
c2261dd7
JG
1733 struct net_device *idev =
1734 ib_device_get_netdev(ib_dev, port);
03db3a2d
MB
1735
1736 if (filter(ib_dev, port, idev, filter_cookie))
1737 cb(ib_dev, port, idev, cookie);
1738
1739 if (idev)
1740 dev_put(idev);
1741 }
1742}
1743
1744/**
1745 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
1746 * @filter: Should we call the callback?
1747 * @filter_cookie: Cookie passed to filter
 1748 * @cb: Callback to call for each found RoCE port
1749 * @cookie: Cookie passed back to the callback
1750 *
 1751 * Enumerates the physical ports of all registered RoCE devices that
 1752 * are associated with a netdevice and calls cb() on each port for
 1753 * which filter() returns non-zero.
1754 */
1755void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
1756 void *filter_cookie,
1757 roce_netdev_callback cb,
1758 void *cookie)
1759{
1760 struct ib_device *dev;
0df91bb6 1761 unsigned long index;
03db3a2d 1762
921eab11 1763 down_read(&devices_rwsem);
0df91bb6 1764 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
03db3a2d 1765 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
921eab11 1766 up_read(&devices_rwsem);
1767}
1768
1769/**
1770 * ib_enum_all_devs - enumerate all ib_devices
 1771 * @nldev_cb: Callback to call for each found ib_device
 1772 * @skb: Netlink buffer and @cb: netlink callback, passed to @nldev_cb
 1773 * Enumerates all registered ib_devices; stops when nldev_cb() returns non-zero.
1774 */
1775int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
1776 struct netlink_callback *cb)
1777{
0df91bb6 1778 unsigned long index;
1779 struct ib_device *dev;
1780 unsigned int idx = 0;
1781 int ret = 0;
1782
921eab11 1783 down_read(&devices_rwsem);
0df91bb6 1784 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1785 ret = nldev_cb(dev, skb, cb, idx);
1786 if (ret)
1787 break;
1788 idx++;
1789 }
921eab11 1790 up_read(&devices_rwsem);
8030c835 1791 return ret;
1792}
1793
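/*
 * Usage sketch, not part of the original file: a minimal nldev-style callback
 * matching the nldev_cb(dev, skb, cb, idx) invocation above.  A netlink dump
 * handler would pass it to ib_enum_all_devs() together with its skb and
 * netlink_callback; returning non-zero stops the iteration early.
 */
static int example_dump_one(struct ib_device *device, struct sk_buff *skb,
			    struct netlink_callback *cb, unsigned int idx)
{
	pr_debug("dump idx %u: %s\n", idx, dev_name(&device->dev));
	return 0;	/* keep iterating */
}
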
1794/**
1795 * ib_query_pkey - Get P_Key table entry
 1796 * @device: Device to query
 1797 * @port_num: Port number to query
 1798 * @index: P_Key table index to query
 1799 * @pkey: Returned P_Key
1800 *
1801 * ib_query_pkey() fetches the specified P_Key table entry.
1802 */
1803int ib_query_pkey(struct ib_device *device,
1804 u8 port_num, u16 index, u16 *pkey)
1805{
1806 if (!rdma_is_port_valid(device, port_num))
1807 return -EINVAL;
1808
3023a1e9 1809 return device->ops.query_pkey(device, port_num, index, pkey);
1810}
1811EXPORT_SYMBOL(ib_query_pkey);
1812
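/*
 * Usage sketch, not part of the original file: read the P_Key at index 0 of a
 * port and report whether it denotes full or limited membership (bit 15 set
 * means full member).  example_show_default_pkey() is a hypothetical name.
 */
static int example_show_default_pkey(struct ib_device *device, u8 port_num)
{
	u16 pkey;
	int ret;

	ret = ib_query_pkey(device, port_num, 0, &pkey);
	if (ret)
		return ret;

	pr_info("port %u pkey[0] = 0x%04x (%s member)\n", port_num, pkey,
		(pkey & 0x8000) ? "full" : "limited");
	return 0;
}
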
1813/**
1814 * ib_modify_device - Change IB device attributes
 1815 * @device: Device to modify
 1816 * @device_modify_mask: Mask of attributes to change
 1817 * @device_modify: New attribute values
1818 *
1819 * ib_modify_device() changes a device's attributes as specified by
1820 * the @device_modify_mask and @device_modify structure.
1821 */
1822int ib_modify_device(struct ib_device *device,
1823 int device_modify_mask,
1824 struct ib_device_modify *device_modify)
1825{
3023a1e9 1826 if (!device->ops.modify_device)
1827 return -ENOSYS;
1828
1829 return device->ops.modify_device(device, device_modify_mask,
1830 device_modify);
1831}
1832EXPORT_SYMBOL(ib_modify_device);
1833
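/*
 * Usage sketch, not part of the original file and assuming the
 * IB_DEVICE_MODIFY_NODE_DESC mask bit and the node_desc field of
 * struct ib_device_modify: update the node description string, which only
 * succeeds when the driver implements modify_device.
 */
static int example_set_node_desc(struct ib_device *device, const char *desc)
{
	struct ib_device_modify mod = {};

	strscpy(mod.node_desc, desc, sizeof(mod.node_desc));
	return ib_modify_device(device, IB_DEVICE_MODIFY_NODE_DESC, &mod);
}
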
1834/**
1835 * ib_modify_port - Modifies the attributes for the specified port.
1836 * @device: The device to modify.
1837 * @port_num: The number of the port to modify.
1838 * @port_modify_mask: Mask used to specify which attributes of the port
1839 * to change.
1840 * @port_modify: New attribute values for the port.
1841 *
1842 * ib_modify_port() changes a port's attributes as specified by the
1843 * @port_modify_mask and @port_modify structure.
1844 */
1845int ib_modify_port(struct ib_device *device,
1846 u8 port_num, int port_modify_mask,
1847 struct ib_port_modify *port_modify)
1848{
61e0962d 1849 int rc;
10e1b54b 1850
24dc831b 1851 if (!rdma_is_port_valid(device, port_num))
1852 return -EINVAL;
1853
1854 if (device->ops.modify_port)
1855 rc = device->ops.modify_port(device, port_num,
1856 port_modify_mask,
1857 port_modify);
1858 else
1859 rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
1860 return rc;
1861}
1862EXPORT_SYMBOL(ib_modify_port);
1863
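/*
 * Usage sketch, not part of the original file, modelled on how in-tree ULPs
 * advertise capability bits: set or clear IB_PORT_CM_SUP in the port
 * capability mask.  No port_modify_mask bits are needed for capability mask
 * updates, so 0 is passed.  example_advertise_cm() is a hypothetical name.
 */
static int example_advertise_cm(struct ib_device *device, u8 port_num,
				bool enable)
{
	struct ib_port_modify port_modify = {};

	if (enable)
		port_modify.set_port_cap_mask = IB_PORT_CM_SUP;
	else
		port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;

	return ib_modify_port(device, port_num, 0, &port_modify);
}
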
1864/**
1865 * ib_find_gid - Returns the port number and GID table index where
dbb12562 1866 * a specified GID value occurs. It searches only ports using the IB link layer.
1867 * @device: The device to query.
1868 * @gid: The GID value to search for.
1869 * @port_num: The port number of the device where the GID value was found.
1870 * @index: The index into the GID table where the GID was found. This
1871 * parameter may be NULL.
1872 */
1873int ib_find_gid(struct ib_device *device, union ib_gid *gid,
b26c4a11 1874 u8 *port_num, u16 *index)
1875{
1876 union ib_gid tmp_gid;
1877 unsigned int port;
1878 int ret, i;
5eb620c8 1879
ea1075ed 1880 rdma_for_each_port (device, port) {
22d24f75 1881 if (!rdma_protocol_ib(device, port))
1882 continue;
1883
1884 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
1885 ++i) {
1dfce294 1886 ret = rdma_query_gid(device, port, i, &tmp_gid);
1887 if (ret)
1888 return ret;
1889 if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
1890 *port_num = port;
1891 if (index)
1892 *index = i;
1893 return 0;
1894 }
1895 }
1896 }
1897
1898 return -ENOENT;
1899}
1900EXPORT_SYMBOL(ib_find_gid);
1901
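/*
 * Usage sketch, not part of the original file: find which IB port advertises
 * a given GID.  Because ib_find_gid() only scans IB link-layer ports, RoCE
 * GIDs are never matched here.  example_port_for_gid() is a hypothetical name.
 */
static int example_port_for_gid(struct ib_device *device,
				const union ib_gid *gid, u8 *port_num)
{
	union ib_gid tmp = *gid;	/* ib_find_gid() takes a non-const GID */
	u16 index;

	return ib_find_gid(device, &tmp, port_num, &index);
}
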
1902/**
1903 * ib_find_pkey - Returns the PKey table index where a specified
1904 * PKey value occurs.
1905 * @device: The device to query.
1906 * @port_num: The port number of the device to search for the PKey.
1907 * @pkey: The PKey value to search for.
1908 * @index: The index into the PKey table where the PKey was found.
1909 */
1910int ib_find_pkey(struct ib_device *device,
1911 u8 port_num, u16 pkey, u16 *index)
1912{
1913 int ret, i;
1914 u16 tmp_pkey;
ff7166c4 1915 int partial_ix = -1;
5eb620c8 1916
1917 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
1918 ++i) {
1919 ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
1920 if (ret)
1921 return ret;
36026ecc 1922 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
 1923			/* if there is a full-member pkey, take it */
1924 if (tmp_pkey & 0x8000) {
1925 *index = i;
1926 return 0;
1927 }
1928 if (partial_ix < 0)
1929 partial_ix = i;
1930 }
1931 }
1932
 1933	/* no full member; if a limited member exists, take it */
1934 if (partial_ix >= 0) {
1935 *index = partial_ix;
1936 return 0;
1937 }
1938 return -ENOENT;
1939}
1940EXPORT_SYMBOL(ib_find_pkey);
1941
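/*
 * Usage sketch, not part of the original file: translate a P_Key value, for
 * example one carried in a received MAD, into a local table index.  Full
 * membership is preferred by ib_find_pkey(); -ENOENT means the partition is
 * not configured on this port.  example_pkey_to_index() is a hypothetical name.
 */
static int example_pkey_to_index(struct ib_device *device, u8 port_num,
				 u16 pkey, u16 *index)
{
	int ret = ib_find_pkey(device, port_num, pkey, index);

	if (ret == -ENOENT)
		pr_debug("pkey 0x%04x not found on port %u\n", pkey, port_num);
	return ret;
}
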
1942/**
1943 * ib_get_net_dev_by_params() - Return the appropriate net_dev
1944 * for a received CM request
1945 * @dev: An RDMA device on which the request has been received.
1946 * @port: Port number on the RDMA device.
 1947 * @pkey: The P_Key the request came on.
1948 * @gid: A GID that the net_dev uses to communicate.
1949 * @addr: Contains the IP address that the request specified as its
1950 * destination.
921eab11 1951 *
1952 */
1953struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
1954 u8 port,
1955 u16 pkey,
1956 const union ib_gid *gid,
1957 const struct sockaddr *addr)
1958{
1959 struct net_device *net_dev = NULL;
1960 unsigned long index;
1961 void *client_data;
1962
1963 if (!rdma_protocol_ib(dev, port))
1964 return NULL;
1965
1966 /*
1967 * Holding the read side guarantees that the client will not become
1968 * unregistered while we are calling get_net_dev_by_params()
1969 */
1970 down_read(&dev->client_data_rwsem);
1971 xan_for_each_marked (&dev->client_data, index, client_data,
1972 CLIENT_DATA_REGISTERED) {
1973 struct ib_client *client = xa_load(&clients, index);
9268f72d 1974
0df91bb6 1975 if (!client || !client->get_net_dev_by_params)
1976 continue;
1977
1978 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
1979 addr, client_data);
1980 if (net_dev)
1981 break;
9268f72d 1982 }
921eab11 1983 up_read(&dev->client_data_rwsem);
1984
1985 return net_dev;
1986}
1987EXPORT_SYMBOL(ib_get_net_dev_by_params);
1988
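/*
 * Usage sketch, not part of the original file and using hypothetical values:
 * ask the registered clients which net_device should answer a CM request that
 * arrived on @port for @pkey/@gid with an IPv4 destination address.  The
 * returned net_device, if any, is held and must be released with dev_put().
 */
static struct net_device *example_netdev_for_req(struct ib_device *dev,
						 u8 port, u16 pkey,
						 const union ib_gid *gid,
						 __be32 daddr)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = daddr,
	};

	return ib_get_net_dev_by_params(dev, port, pkey, gid,
					(const struct sockaddr *)&addr);
}
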
1989void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
1990{
3023a1e9 1991 struct ib_device_ops *dev_ops = &dev->ops;
1992#define SET_DEVICE_OP(ptr, name) \
1993 do { \
1994 if (ops->name) \
1995 if (!((ptr)->name)) \
1996 (ptr)->name = ops->name; \
1997 } while (0)
1998
1999#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
2000
3023a1e9 2001 SET_DEVICE_OP(dev_ops, add_gid);
2f1927b0 2002 SET_DEVICE_OP(dev_ops, advise_mr);
2003 SET_DEVICE_OP(dev_ops, alloc_dm);
2004 SET_DEVICE_OP(dev_ops, alloc_fmr);
2005 SET_DEVICE_OP(dev_ops, alloc_hw_stats);
2006 SET_DEVICE_OP(dev_ops, alloc_mr);
2007 SET_DEVICE_OP(dev_ops, alloc_mw);
2008 SET_DEVICE_OP(dev_ops, alloc_pd);
2009 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
2010 SET_DEVICE_OP(dev_ops, alloc_ucontext);
2011 SET_DEVICE_OP(dev_ops, alloc_xrcd);
2012 SET_DEVICE_OP(dev_ops, attach_mcast);
2013 SET_DEVICE_OP(dev_ops, check_mr_status);
2014 SET_DEVICE_OP(dev_ops, create_ah);
2015 SET_DEVICE_OP(dev_ops, create_counters);
2016 SET_DEVICE_OP(dev_ops, create_cq);
2017 SET_DEVICE_OP(dev_ops, create_flow);
2018 SET_DEVICE_OP(dev_ops, create_flow_action_esp);
2019 SET_DEVICE_OP(dev_ops, create_qp);
2020 SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
2021 SET_DEVICE_OP(dev_ops, create_srq);
2022 SET_DEVICE_OP(dev_ops, create_wq);
2023 SET_DEVICE_OP(dev_ops, dealloc_dm);
d0899892 2024 SET_DEVICE_OP(dev_ops, dealloc_driver);
2025 SET_DEVICE_OP(dev_ops, dealloc_fmr);
2026 SET_DEVICE_OP(dev_ops, dealloc_mw);
2027 SET_DEVICE_OP(dev_ops, dealloc_pd);
2028 SET_DEVICE_OP(dev_ops, dealloc_ucontext);
2029 SET_DEVICE_OP(dev_ops, dealloc_xrcd);
2030 SET_DEVICE_OP(dev_ops, del_gid);
2031 SET_DEVICE_OP(dev_ops, dereg_mr);
2032 SET_DEVICE_OP(dev_ops, destroy_ah);
2033 SET_DEVICE_OP(dev_ops, destroy_counters);
2034 SET_DEVICE_OP(dev_ops, destroy_cq);
2035 SET_DEVICE_OP(dev_ops, destroy_flow);
2036 SET_DEVICE_OP(dev_ops, destroy_flow_action);
2037 SET_DEVICE_OP(dev_ops, destroy_qp);
2038 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
2039 SET_DEVICE_OP(dev_ops, destroy_srq);
2040 SET_DEVICE_OP(dev_ops, destroy_wq);
2041 SET_DEVICE_OP(dev_ops, detach_mcast);
2042 SET_DEVICE_OP(dev_ops, disassociate_ucontext);
2043 SET_DEVICE_OP(dev_ops, drain_rq);
2044 SET_DEVICE_OP(dev_ops, drain_sq);
ca22354b 2045 SET_DEVICE_OP(dev_ops, enable_driver);
02da3750 2046 SET_DEVICE_OP(dev_ops, fill_res_entry);
2047 SET_DEVICE_OP(dev_ops, get_dev_fw_str);
2048 SET_DEVICE_OP(dev_ops, get_dma_mr);
2049 SET_DEVICE_OP(dev_ops, get_hw_stats);
2050 SET_DEVICE_OP(dev_ops, get_link_layer);
2051 SET_DEVICE_OP(dev_ops, get_netdev);
2052 SET_DEVICE_OP(dev_ops, get_port_immutable);
2053 SET_DEVICE_OP(dev_ops, get_vector_affinity);
2054 SET_DEVICE_OP(dev_ops, get_vf_config);
2055 SET_DEVICE_OP(dev_ops, get_vf_stats);
ea4baf7f 2056 SET_DEVICE_OP(dev_ops, init_port);
2057 SET_DEVICE_OP(dev_ops, map_mr_sg);
2058 SET_DEVICE_OP(dev_ops, map_phys_fmr);
2059 SET_DEVICE_OP(dev_ops, mmap);
2060 SET_DEVICE_OP(dev_ops, modify_ah);
2061 SET_DEVICE_OP(dev_ops, modify_cq);
2062 SET_DEVICE_OP(dev_ops, modify_device);
2063 SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
2064 SET_DEVICE_OP(dev_ops, modify_port);
2065 SET_DEVICE_OP(dev_ops, modify_qp);
2066 SET_DEVICE_OP(dev_ops, modify_srq);
2067 SET_DEVICE_OP(dev_ops, modify_wq);
2068 SET_DEVICE_OP(dev_ops, peek_cq);
2069 SET_DEVICE_OP(dev_ops, poll_cq);
2070 SET_DEVICE_OP(dev_ops, post_recv);
2071 SET_DEVICE_OP(dev_ops, post_send);
2072 SET_DEVICE_OP(dev_ops, post_srq_recv);
2073 SET_DEVICE_OP(dev_ops, process_mad);
2074 SET_DEVICE_OP(dev_ops, query_ah);
2075 SET_DEVICE_OP(dev_ops, query_device);
2076 SET_DEVICE_OP(dev_ops, query_gid);
2077 SET_DEVICE_OP(dev_ops, query_pkey);
2078 SET_DEVICE_OP(dev_ops, query_port);
2079 SET_DEVICE_OP(dev_ops, query_qp);
2080 SET_DEVICE_OP(dev_ops, query_srq);
2081 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
2082 SET_DEVICE_OP(dev_ops, read_counters);
2083 SET_DEVICE_OP(dev_ops, reg_dm_mr);
2084 SET_DEVICE_OP(dev_ops, reg_user_mr);
2085 SET_DEVICE_OP(dev_ops, req_ncomp_notif);
2086 SET_DEVICE_OP(dev_ops, req_notify_cq);
2087 SET_DEVICE_OP(dev_ops, rereg_user_mr);
2088 SET_DEVICE_OP(dev_ops, resize_cq);
2089 SET_DEVICE_OP(dev_ops, set_vf_guid);
2090 SET_DEVICE_OP(dev_ops, set_vf_link_state);
2091 SET_DEVICE_OP(dev_ops, unmap_fmr);
2092
2093 SET_OBJ_SIZE(dev_ops, ib_pd);
a2a074ef 2094 SET_OBJ_SIZE(dev_ops, ib_ucontext);
2095}
2096EXPORT_SYMBOL(ib_set_device_ops);
2097
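/*
 * Usage sketch, not part of the original file: a hypothetical driver declares
 * a const ib_device_ops table and installs it once before registering the
 * device.  SET_DEVICE_OP() above only fills callbacks that are still NULL, so
 * anything assigned to dev->ops earlier is preserved.
 */
static int example_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index,
			      u16 *pkey)
{
	*pkey = 0xffff;		/* report a single full-member default P_Key */
	return 0;
}

static const struct ib_device_ops example_dev_ops = {
	.query_pkey = example_query_pkey,
};

static void example_init_device(struct ib_device *ibdev)
{
	ib_set_device_ops(ibdev, &example_dev_ops);
}
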
d0e312fe 2098static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
735c631a 2099 [RDMA_NL_LS_OP_RESOLVE] = {
647c75ac 2100 .doit = ib_nl_handle_resolve_resp,
2101 .flags = RDMA_NL_ADMIN_PERM,
2102 },
735c631a 2103 [RDMA_NL_LS_OP_SET_TIMEOUT] = {
647c75ac 2104 .doit = ib_nl_handle_set_timeout,
2105 .flags = RDMA_NL_ADMIN_PERM,
2106 },
ae43f828 2107 [RDMA_NL_LS_OP_IP_RESOLVE] = {
647c75ac 2108 .doit = ib_nl_handle_ip_res_resp,
2109 .flags = RDMA_NL_ADMIN_PERM,
2110 },
2111};
2112
2113static int __init ib_core_init(void)
2114{
2115 int ret;
2116
2117 ib_wq = alloc_workqueue("infiniband", 0, 0);
2118 if (!ib_wq)
2119 return -ENOMEM;
2120
14d3a3b2 2121 ib_comp_wq = alloc_workqueue("ib-comp-wq",
b7363e67 2122 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
2123 if (!ib_comp_wq) {
2124 ret = -ENOMEM;
2125 goto err;
2126 }
2127
2128 ib_comp_unbound_wq =
2129 alloc_workqueue("ib-comp-unb-wq",
2130 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
2131 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
2132 if (!ib_comp_unbound_wq) {
2133 ret = -ENOMEM;
2134 goto err_comp;
2135 }
2136
55aeed06 2137 ret = class_register(&ib_class);
fd75c789 2138 if (ret) {
aba25a3e 2139 pr_warn("Couldn't create InfiniBand device class\n");
f794809a 2140 goto err_comp_unbound;
fd75c789 2141 }
1da177e4 2142
c9901724 2143 ret = rdma_nl_init();
b2cbae2c 2144 if (ret) {
c9901724 2145 pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
2146 goto err_sysfs;
2147 }
2148
2149 ret = addr_init();
2150 if (ret) {
2151 pr_warn("Could't init IB address resolution\n");
2152 goto err_ibnl;
2153 }
2154
2155 ret = ib_mad_init();
2156 if (ret) {
2157 pr_warn("Couldn't init IB MAD\n");
2158 goto err_addr;
2159 }
2160
2161 ret = ib_sa_init();
2162 if (ret) {
2163 pr_warn("Couldn't init SA\n");
2164 goto err_mad;
2165 }
2166
2167 ret = register_lsm_notifier(&ibdev_lsm_nb);
2168 if (ret) {
2169 pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
c9901724 2170 goto err_sa;
2171 }
2172
2173 ret = register_pernet_device(&rdma_dev_net_ops);
2174 if (ret) {
2175 pr_warn("Couldn't init compat dev. ret %d\n", ret);
2176 goto err_compat;
2177 }
2178
6c80b41a 2179 nldev_init();
c9901724 2180 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
5ef8c0c1 2181 roce_gid_mgmt_init();
1da177e4 2182
2183 return 0;
2184
2185err_compat:
2186 unregister_lsm_notifier(&ibdev_lsm_nb);
2187err_sa:
2188 ib_sa_cleanup();
2189err_mad:
2190 ib_mad_cleanup();
2191err_addr:
2192 addr_cleanup();
e3f20f02 2193err_ibnl:
c9901724 2194 rdma_nl_exit();
fd75c789 2195err_sysfs:
55aeed06 2196 class_unregister(&ib_class);
2197err_comp_unbound:
2198 destroy_workqueue(ib_comp_unbound_wq);
2199err_comp:
2200 destroy_workqueue(ib_comp_wq);
2201err:
2202 destroy_workqueue(ib_wq);
2203 return ret;
2204}
2205
2206static void __exit ib_core_cleanup(void)
2207{
5ef8c0c1 2208 roce_gid_mgmt_cleanup();
6c80b41a 2209 nldev_exit();
c9901724 2210 rdma_nl_unregister(RDMA_NL_LS);
4e0f7b90 2211 unregister_pernet_device(&rdma_dev_net_ops);
c9901724 2212 unregister_lsm_notifier(&ibdev_lsm_nb);
c2e49c92 2213 ib_sa_cleanup();
4c2cb422 2214 ib_mad_cleanup();
e3f20f02 2215 addr_cleanup();
c9901724 2216 rdma_nl_exit();
55aeed06 2217 class_unregister(&ib_class);
f794809a 2218 destroy_workqueue(ib_comp_unbound_wq);
14d3a3b2 2219 destroy_workqueue(ib_comp_wq);
f7c6a7b5 2220 /* Make sure that any pending umem accounting work is done. */
f0626710 2221 destroy_workqueue(ib_wq);
d0899892 2222 flush_workqueue(system_unbound_wq);
e59178d8 2223 WARN_ON(!xa_empty(&clients));
0df91bb6 2224 WARN_ON(!xa_empty(&devices));
2225}
2226
2227MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
2228
 2229/* The ib core relies on the netdev stack having registered the
 2230 * net_ns_type_operations ns kobject type before ib_core initializes.
2231 */
2232fs_initcall(ib_core_init);
1da177e4 2233module_exit(ib_core_cleanup);