forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-10-09 244b2c5ca8b14627e4a17755e5922221e121c771
kernel/drivers/infiniband/core/device.c
....@@ -37,73 +37,241 @@
3737 #include <linux/kernel.h>
3838 #include <linux/slab.h>
3939 #include <linux/init.h>
40
-#include <linux/mutex.h>
4140 #include <linux/netdevice.h>
41
+#include <net/net_namespace.h>
4242 #include <linux/security.h>
4343 #include <linux/notifier.h>
44
+#include <linux/hashtable.h>
4445 #include <rdma/rdma_netlink.h>
4546 #include <rdma/ib_addr.h>
4647 #include <rdma/ib_cache.h>
48
+#include <rdma/rdma_counter.h>
4749
4850 #include "core_priv.h"
51
+#include "restrack.h"
4952
5053 MODULE_AUTHOR("Roland Dreier");
5154 MODULE_DESCRIPTION("core kernel InfiniBand API");
5255 MODULE_LICENSE("Dual BSD/GPL");
53
-
54
-struct ib_client_data {
55
- struct list_head list;
56
- struct ib_client *client;
57
- void * data;
58
- /* The device or client is going down. Do not call client or device
59
- * callbacks other than remove(). */
60
- bool going_down;
61
-};
6256
6357 struct workqueue_struct *ib_comp_wq;
6458 struct workqueue_struct *ib_comp_unbound_wq;
6559 struct workqueue_struct *ib_wq;
6660 EXPORT_SYMBOL_GPL(ib_wq);
6761
68
-/* The device_list and client_list contain devices and clients after their
69
- * registration has completed, and the devices and clients are removed
70
- * during unregistration. */
71
-static LIST_HEAD(device_list);
72
-static LIST_HEAD(client_list);
62
+/*
63
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
64
+ * xarray of the same name. Specifically it allows the caller to assert that
65
+ * the MARK will/will not be changing under the lock, and for devices and
66
+ * clients, that the value in the xarray is still a valid pointer. Change of
67
+ * the MARK is linked to the object state, so holding the lock and testing the
68
+ * MARK also asserts that the contained object is in a certain state.
69
+ *
70
+ * This is used to build a two stage register/unregister flow where objects
71
+ * can continue to be in the xarray even though they are still in progress to
72
+ * register/unregister.
73
+ *
74
+ * The xarray itself provides additional locking, and restartable iteration,
75
+ * which is also relied on.
76
+ *
77
+ * Locks should not be nested, with the exception of client_data, which is
78
+ * allowed to nest under the read side of the other two locks.
79
+ *
80
+ * The devices_rwsem also protects the device name list, any change or
81
+ * assignment of device name must also hold the write side to guarantee unique
82
+ * names.
83
+ */
7384
7485 /*
75
- * device_mutex and lists_rwsem protect access to both device_list and
76
- * client_list. device_mutex protects writer access by device and client
77
- * registration / de-registration. lists_rwsem protects reader access to
78
- * these lists. Iterators of these lists must lock it for read, while updates
79
- * to the lists must be done with a write lock. A special case is when the
80
- * device_mutex is locked. In this case locking the lists for read access is
81
- * not necessary as the device_mutex implies it.
86
+ * devices contains devices that have had their names assigned. The
87
+ * devices may not be registered. Users that care about the registration
88
+ * status need to call ib_device_try_get() on the device to ensure it is
89
+ * registered, and keep it registered, for the required duration.
8290 *
83
- * lists_rwsem also protects access to the client data list.
8491 */
85
-static DEFINE_MUTEX(device_mutex);
86
-static DECLARE_RWSEM(lists_rwsem);
92
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
93
+static DECLARE_RWSEM(devices_rwsem);
94
+#define DEVICE_REGISTERED XA_MARK_1
8795
96
+static u32 highest_client_id;
97
+#define CLIENT_REGISTERED XA_MARK_1
98
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
99
+static DECLARE_RWSEM(clients_rwsem);
100
+
101
+static void ib_client_put(struct ib_client *client)
102
+{
103
+ if (refcount_dec_and_test(&client->uses))
104
+ complete(&client->uses_zero);
105
+}
106
+
107
+/*
108
+ * If client_data is registered then the corresponding client must also still
109
+ * be registered.
110
+ */
111
+#define CLIENT_DATA_REGISTERED XA_MARK_1
112
+
113
+unsigned int rdma_dev_net_id;
114
+
115
+/*
116
+ * A list of net namespaces is maintained in an xarray. This is necessary
117
+ * because we can't get the locking right using the existing net ns list. We
118
+ * would require a init_net callback after the list is updated.
119
+ */
120
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
121
+/*
122
+ * rwsem to protect accessing the rdma_nets xarray entries.
123
+ */
124
+static DECLARE_RWSEM(rdma_nets_rwsem);
125
+
126
+bool ib_devices_shared_netns = true;
127
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
128
+MODULE_PARM_DESC(netns_mode,
129
+ "Share device among net namespaces; default=1 (shared)");
130
+/**
131
+ * rdma_dev_access_netns() - Return whether an rdma device can be accessed
132
+ * from a specified net namespace or not.
133
+ * @dev: Pointer to rdma device which needs to be checked
134
+ * @net: Pointer to net namesapce for which access to be checked
135
+ *
136
+ * When the rdma device is in shared mode, it ignores the net namespace.
137
+ * When the rdma device is exclusive to a net namespace, rdma device net
138
+ * namespace is checked against the specified one.
139
+ */
140
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
141
+{
142
+ return (ib_devices_shared_netns ||
143
+ net_eq(read_pnet(&dev->coredev.rdma_net), net));
144
+}
145
+EXPORT_SYMBOL(rdma_dev_access_netns);
146
+
147
+/*
148
+ * xarray has this behavior where it won't iterate over NULL values stored in
149
+ * allocated arrays. So we need our own iterator to see all values stored in
150
+ * the array. This does the same thing as xa_for_each except that it also
151
+ * returns NULL valued entries if the array is allocating. Simplified to only
152
+ * work on simple xarrays.
153
+ */
154
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
155
+ xa_mark_t filter)
156
+{
157
+ XA_STATE(xas, xa, *indexp);
158
+ void *entry;
159
+
160
+ rcu_read_lock();
161
+ do {
162
+ entry = xas_find_marked(&xas, ULONG_MAX, filter);
163
+ if (xa_is_zero(entry))
164
+ break;
165
+ } while (xas_retry(&xas, entry));
166
+ rcu_read_unlock();
167
+
168
+ if (entry) {
169
+ *indexp = xas.xa_index;
170
+ if (xa_is_zero(entry))
171
+ return NULL;
172
+ return entry;
173
+ }
174
+ return XA_ERROR(-ENOENT);
175
+}
176
+#define xan_for_each_marked(xa, index, entry, filter) \
177
+ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
178
+ !xa_is_err(entry); \
179
+ (index)++, entry = xan_find_marked(xa, &(index), filter))
180
+
181
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
182
+static DEFINE_SPINLOCK(ndev_hash_lock);
183
+static DECLARE_HASHTABLE(ndev_hash, 5);
184
+
185
+static void free_netdevs(struct ib_device *ib_dev);
186
+static void ib_unregister_work(struct work_struct *work);
187
+static void __ib_unregister_device(struct ib_device *device);
88188 static int ib_security_change(struct notifier_block *nb, unsigned long event,
89189 void *lsm_data);
90190 static void ib_policy_change_task(struct work_struct *work);
91191 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
92192
193
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
194
+ struct va_format *vaf)
195
+{
196
+ if (ibdev && ibdev->dev.parent)
197
+ dev_printk_emit(level[1] - '0',
198
+ ibdev->dev.parent,
199
+ "%s %s %s: %pV",
200
+ dev_driver_string(ibdev->dev.parent),
201
+ dev_name(ibdev->dev.parent),
202
+ dev_name(&ibdev->dev),
203
+ vaf);
204
+ else if (ibdev)
205
+ printk("%s%s: %pV",
206
+ level, dev_name(&ibdev->dev), vaf);
207
+ else
208
+ printk("%s(NULL ib_device): %pV", level, vaf);
209
+}
210
+
211
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
212
+ const char *format, ...)
213
+{
214
+ struct va_format vaf;
215
+ va_list args;
216
+
217
+ va_start(args, format);
218
+
219
+ vaf.fmt = format;
220
+ vaf.va = &args;
221
+
222
+ __ibdev_printk(level, ibdev, &vaf);
223
+
224
+ va_end(args);
225
+}
226
+EXPORT_SYMBOL(ibdev_printk);
227
+
228
+#define define_ibdev_printk_level(func, level) \
229
+void func(const struct ib_device *ibdev, const char *fmt, ...) \
230
+{ \
231
+ struct va_format vaf; \
232
+ va_list args; \
233
+ \
234
+ va_start(args, fmt); \
235
+ \
236
+ vaf.fmt = fmt; \
237
+ vaf.va = &args; \
238
+ \
239
+ __ibdev_printk(level, ibdev, &vaf); \
240
+ \
241
+ va_end(args); \
242
+} \
243
+EXPORT_SYMBOL(func);
244
+
245
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
246
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
247
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
248
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
249
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
250
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
251
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
252
+
93253 static struct notifier_block ibdev_lsm_nb = {
94254 .notifier_call = ib_security_change,
95255 };
96256
97
-static int ib_device_check_mandatory(struct ib_device *device)
257
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
258
+ struct net *net);
259
+
260
+/* Pointer to the RCU head at the start of the ib_port_data array */
261
+struct ib_port_data_rcu {
262
+ struct rcu_head rcu_head;
263
+ struct ib_port_data pdata[];
264
+};
265
+
266
+static void ib_device_check_mandatory(struct ib_device *device)
98267 {
99
-#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
268
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
100269 static const struct {
101270 size_t offset;
102271 char *name;
103272 } mandatory_table[] = {
104273 IB_MANDATORY_FUNC(query_device),
105274 IB_MANDATORY_FUNC(query_port),
106
- IB_MANDATORY_FUNC(query_pkey),
107275 IB_MANDATORY_FUNC(alloc_pd),
108276 IB_MANDATORY_FUNC(dealloc_pd),
109277 IB_MANDATORY_FUNC(create_qp),
....@@ -121,110 +289,228 @@
121289 };
122290 int i;
123291
292
+ device->kverbs_provider = true;
124293 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
125
- if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
126
- pr_warn("Device %s is missing mandatory function %s\n",
127
- device->name, mandatory_table[i].name);
128
- return -EINVAL;
294
+ if (!*(void **) ((void *) &device->ops +
295
+ mandatory_table[i].offset)) {
296
+ device->kverbs_provider = false;
297
+ break;
129298 }
130299 }
131
-
132
- return 0;
133
-}
134
-
135
-static struct ib_device *__ib_device_get_by_index(u32 index)
136
-{
137
- struct ib_device *device;
138
-
139
- list_for_each_entry(device, &device_list, core_list)
140
- if (device->index == index)
141
- return device;
142
-
143
- return NULL;
144300 }
145301
146302 /*
147
- * Caller is responsible to return refrerence count by calling put_device()
303
+ * Caller must perform ib_device_put() to return the device reference count
304
+ * when ib_device_get_by_index() returns valid device pointer.
148305 */
149
-struct ib_device *ib_device_get_by_index(u32 index)
306
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
150307 {
151308 struct ib_device *device;
152309
153
- down_read(&lists_rwsem);
154
- device = __ib_device_get_by_index(index);
155
- if (device)
156
- get_device(&device->dev);
310
+ down_read(&devices_rwsem);
311
+ device = xa_load(&devices, index);
312
+ if (device) {
313
+ if (!rdma_dev_access_netns(device, net)) {
314
+ device = NULL;
315
+ goto out;
316
+ }
157317
158
- up_read(&lists_rwsem);
318
+ if (!ib_device_try_get(device))
319
+ device = NULL;
320
+ }
321
+out:
322
+ up_read(&devices_rwsem);
159323 return device;
160324 }
325
+
326
+/**
327
+ * ib_device_put - Release IB device reference
328
+ * @device: device whose reference to be released
329
+ *
330
+ * ib_device_put() releases reference to the IB device to allow it to be
331
+ * unregistered and eventually free.
332
+ */
333
+void ib_device_put(struct ib_device *device)
334
+{
335
+ if (refcount_dec_and_test(&device->refcount))
336
+ complete(&device->unreg_completion);
337
+}
338
+EXPORT_SYMBOL(ib_device_put);
161339
162340 static struct ib_device *__ib_device_get_by_name(const char *name)
163341 {
164342 struct ib_device *device;
343
+ unsigned long index;
165344
166
- list_for_each_entry(device, &device_list, core_list)
167
- if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
345
+ xa_for_each (&devices, index, device)
346
+ if (!strcmp(name, dev_name(&device->dev)))
168347 return device;
169348
170349 return NULL;
171350 }
172351
173
-static int alloc_name(char *name)
352
+/**
353
+ * ib_device_get_by_name - Find an IB device by name
354
+ * @name: The name to look for
355
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
356
+ *
357
+ * Find and hold an ib_device by its name. The caller must call
358
+ * ib_device_put() on the returned pointer.
359
+ */
360
+struct ib_device *ib_device_get_by_name(const char *name,
361
+ enum rdma_driver_id driver_id)
174362 {
175
- unsigned long *inuse;
176
- char buf[IB_DEVICE_NAME_MAX];
177363 struct ib_device *device;
178
- int i;
179364
180
- inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
181
- if (!inuse)
182
- return -ENOMEM;
365
+ down_read(&devices_rwsem);
366
+ device = __ib_device_get_by_name(name);
367
+ if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
368
+ device->ops.driver_id != driver_id)
369
+ device = NULL;
183370
184
- list_for_each_entry(device, &device_list, core_list) {
185
- if (!sscanf(device->name, name, &i))
186
- continue;
187
- if (i < 0 || i >= PAGE_SIZE * 8)
188
- continue;
189
- snprintf(buf, sizeof buf, name, i);
190
- if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
191
- set_bit(i, inuse);
371
+ if (device) {
372
+ if (!ib_device_try_get(device))
373
+ device = NULL;
374
+ }
375
+ up_read(&devices_rwsem);
376
+ return device;
377
+}
378
+EXPORT_SYMBOL(ib_device_get_by_name);
379
+
380
+static int rename_compat_devs(struct ib_device *device)
381
+{
382
+ struct ib_core_device *cdev;
383
+ unsigned long index;
384
+ int ret = 0;
385
+
386
+ mutex_lock(&device->compat_devs_mutex);
387
+ xa_for_each (&device->compat_devs, index, cdev) {
388
+ ret = device_rename(&cdev->dev, dev_name(&device->dev));
389
+ if (ret) {
390
+ dev_warn(&cdev->dev,
391
+ "Fail to rename compatdev to new name %s\n",
392
+ dev_name(&device->dev));
393
+ break;
394
+ }
395
+ }
396
+ mutex_unlock(&device->compat_devs_mutex);
397
+ return ret;
398
+}
399
+
400
+int ib_device_rename(struct ib_device *ibdev, const char *name)
401
+{
402
+ unsigned long index;
403
+ void *client_data;
404
+ int ret;
405
+
406
+ down_write(&devices_rwsem);
407
+ if (!strcmp(name, dev_name(&ibdev->dev))) {
408
+ up_write(&devices_rwsem);
409
+ return 0;
192410 }
193411
194
- i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
195
- free_page((unsigned long) inuse);
196
- snprintf(buf, sizeof buf, name, i);
412
+ if (__ib_device_get_by_name(name)) {
413
+ up_write(&devices_rwsem);
414
+ return -EEXIST;
415
+ }
197416
198
- if (__ib_device_get_by_name(buf))
199
- return -ENFILE;
417
+ ret = device_rename(&ibdev->dev, name);
418
+ if (ret) {
419
+ up_write(&devices_rwsem);
420
+ return ret;
421
+ }
200422
201
- strlcpy(name, buf, IB_DEVICE_NAME_MAX);
423
+ strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
424
+ ret = rename_compat_devs(ibdev);
425
+
426
+ downgrade_write(&devices_rwsem);
427
+ down_read(&ibdev->client_data_rwsem);
428
+ xan_for_each_marked(&ibdev->client_data, index, client_data,
429
+ CLIENT_DATA_REGISTERED) {
430
+ struct ib_client *client = xa_load(&clients, index);
431
+
432
+ if (!client || !client->rename)
433
+ continue;
434
+
435
+ client->rename(ibdev, client_data);
436
+ }
437
+ up_read(&ibdev->client_data_rwsem);
438
+ up_read(&devices_rwsem);
202439 return 0;
440
+}
441
+
442
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
443
+{
444
+ if (use_dim > 1)
445
+ return -EINVAL;
446
+ ibdev->use_cq_dim = use_dim;
447
+
448
+ return 0;
449
+}
450
+
451
+static int alloc_name(struct ib_device *ibdev, const char *name)
452
+{
453
+ struct ib_device *device;
454
+ unsigned long index;
455
+ struct ida inuse;
456
+ int rc;
457
+ int i;
458
+
459
+ lockdep_assert_held_write(&devices_rwsem);
460
+ ida_init(&inuse);
461
+ xa_for_each (&devices, index, device) {
462
+ char buf[IB_DEVICE_NAME_MAX];
463
+
464
+ if (sscanf(dev_name(&device->dev), name, &i) != 1)
465
+ continue;
466
+ if (i < 0 || i >= INT_MAX)
467
+ continue;
468
+ snprintf(buf, sizeof buf, name, i);
469
+ if (strcmp(buf, dev_name(&device->dev)) != 0)
470
+ continue;
471
+
472
+ rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
473
+ if (rc < 0)
474
+ goto out;
475
+ }
476
+
477
+ rc = ida_alloc(&inuse, GFP_KERNEL);
478
+ if (rc < 0)
479
+ goto out;
480
+
481
+ rc = dev_set_name(&ibdev->dev, name, rc);
482
+out:
483
+ ida_destroy(&inuse);
484
+ return rc;
203485 }
204486
205487 static void ib_device_release(struct device *device)
206488 {
207489 struct ib_device *dev = container_of(device, struct ib_device, dev);
208490
209
- WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
210
- if (dev->reg_state == IB_DEV_UNREGISTERED) {
211
- /*
212
- * In IB_DEV_UNINITIALIZED state, cache or port table
213
- * is not even created. Free cache and port table only when
214
- * device reaches UNREGISTERED state.
215
- */
491
+ free_netdevs(dev);
492
+ WARN_ON(refcount_read(&dev->refcount));
493
+ if (dev->port_data) {
216494 ib_cache_release_one(dev);
217
- kfree(dev->port_immutable);
495
+ ib_security_release_port_pkey_list(dev);
496
+ rdma_counter_release(dev);
497
+ kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
498
+ pdata[0]),
499
+ rcu_head);
218500 }
219
- kfree(dev);
501
+
502
+ mutex_destroy(&dev->unregistration_lock);
503
+ mutex_destroy(&dev->compat_devs_mutex);
504
+
505
+ xa_destroy(&dev->compat_devs);
506
+ xa_destroy(&dev->client_data);
507
+ kfree_rcu(dev, rcu_head);
220508 }
221509
222510 static int ib_device_uevent(struct device *device,
223511 struct kobj_uevent_env *env)
224512 {
225
- struct ib_device *dev = container_of(device, struct ib_device, dev);
226
-
227
- if (add_uevent_var(env, "NAME=%s", dev->name))
513
+ if (add_uevent_var(env, "NAME=%s", dev_name(device)))
228514 return -ENOMEM;
229515
230516 /*
....@@ -234,14 +520,44 @@
234520 return 0;
235521 }
236522
523
+static const void *net_namespace(struct device *d)
524
+{
525
+ struct ib_core_device *coredev =
526
+ container_of(d, struct ib_core_device, dev);
527
+
528
+ return read_pnet(&coredev->rdma_net);
529
+}
530
+
237531 static struct class ib_class = {
238532 .name = "infiniband",
239533 .dev_release = ib_device_release,
240534 .dev_uevent = ib_device_uevent,
535
+ .ns_type = &net_ns_type_operations,
536
+ .namespace = net_namespace,
241537 };
242538
539
+static void rdma_init_coredev(struct ib_core_device *coredev,
540
+ struct ib_device *dev, struct net *net)
541
+{
542
+ /* This BUILD_BUG_ON is intended to catch layout change
543
+ * of union of ib_core_device and device.
544
+ * dev must be the first element as ib_core and providers
545
+ * driver uses it. Adding anything in ib_core_device before
546
+ * device will break this assumption.
547
+ */
548
+ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
549
+ offsetof(struct ib_device, dev));
550
+
551
+ coredev->dev.class = &ib_class;
552
+ coredev->dev.groups = dev->groups;
553
+ device_initialize(&coredev->dev);
554
+ coredev->owner = dev;
555
+ INIT_LIST_HEAD(&coredev->port_list);
556
+ write_pnet(&coredev->rdma_net, net);
557
+}
558
+
243559 /**
244
- * ib_alloc_device - allocate an IB device struct
560
+ * _ib_alloc_device - allocate an IB device struct
245561 * @size:size of structure to allocate
246562 *
247563 * Low-level drivers should use ib_alloc_device() to allocate &struct
....@@ -250,7 +566,7 @@
250566 * ib_dealloc_device() must be used to free structures allocated with
251567 * ib_alloc_device().
252568 */
253
-struct ib_device *ib_alloc_device(size_t size)
569
+struct ib_device *_ib_alloc_device(size_t size)
254570 {
255571 struct ib_device *device;
256572
....@@ -261,22 +577,43 @@
261577 if (!device)
262578 return NULL;
263579
264
- rdma_restrack_init(&device->res);
580
+ if (rdma_restrack_init(device)) {
581
+ kfree(device);
582
+ return NULL;
583
+ }
265584
266
- device->dev.class = &ib_class;
267
- device_initialize(&device->dev);
268
-
269
- dev_set_drvdata(&device->dev, device);
585
+ device->groups[0] = &ib_dev_attr_group;
586
+ rdma_init_coredev(&device->coredev, device, &init_net);
270587
271588 INIT_LIST_HEAD(&device->event_handler_list);
272
- spin_lock_init(&device->event_handler_lock);
273
- spin_lock_init(&device->client_data_lock);
274
- INIT_LIST_HEAD(&device->client_data_list);
275
- INIT_LIST_HEAD(&device->port_list);
589
+ spin_lock_init(&device->qp_open_list_lock);
590
+ init_rwsem(&device->event_handler_rwsem);
591
+ mutex_init(&device->unregistration_lock);
592
+ /*
593
+ * client_data needs to be alloc because we don't want our mark to be
594
+ * destroyed if the user stores NULL in the client data.
595
+ */
596
+ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
597
+ init_rwsem(&device->client_data_rwsem);
598
+ xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
599
+ mutex_init(&device->compat_devs_mutex);
600
+ init_completion(&device->unreg_completion);
601
+ INIT_WORK(&device->unregistration_work, ib_unregister_work);
602
+
603
+ device->uverbs_ex_cmd_mask =
604
+ BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
605
+ BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
606
+ BIT_ULL(IB_USER_VERBS_EX_CMD_CREATE_WQ) |
607
+ BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
608
+ BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL) |
609
+ BIT_ULL(IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
610
+ BIT_ULL(IB_USER_VERBS_EX_CMD_MODIFY_CQ) |
611
+ BIT_ULL(IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
612
+ BIT_ULL(IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
276613
277614 return device;
278615 }
279
-EXPORT_SYMBOL(ib_alloc_device);
616
+EXPORT_SYMBOL(_ib_alloc_device);
280617
281618 /**
282619 * ib_dealloc_device - free an IB device struct
....@@ -286,32 +623,173 @@
286623 */
287624 void ib_dealloc_device(struct ib_device *device)
288625 {
289
- WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
290
- device->reg_state != IB_DEV_UNINITIALIZED);
291
- rdma_restrack_clean(&device->res);
626
+ if (device->ops.dealloc_driver)
627
+ device->ops.dealloc_driver(device);
628
+
629
+ /*
630
+ * ib_unregister_driver() requires all devices to remain in the xarray
631
+ * while their ops are callable. The last op we call is dealloc_driver
632
+ * above. This is needed to create a fence on op callbacks prior to
633
+ * allowing the driver module to unload.
634
+ */
635
+ down_write(&devices_rwsem);
636
+ if (xa_load(&devices, device->index) == device)
637
+ xa_erase(&devices, device->index);
638
+ up_write(&devices_rwsem);
639
+
640
+ /* Expedite releasing netdev references */
641
+ free_netdevs(device);
642
+
643
+ WARN_ON(!xa_empty(&device->compat_devs));
644
+ WARN_ON(!xa_empty(&device->client_data));
645
+ WARN_ON(refcount_read(&device->refcount));
646
+ rdma_restrack_clean(device);
647
+ /* Balances with device_initialize */
292648 put_device(&device->dev);
293649 }
294650 EXPORT_SYMBOL(ib_dealloc_device);
295651
296
-static int add_client_context(struct ib_device *device, struct ib_client *client)
652
+/*
653
+ * add_client_context() and remove_client_context() must be safe against
654
+ * parallel calls on the same device - registration/unregistration of both the
655
+ * device and client can be occurring in parallel.
656
+ *
657
+ * The routines need to be a fence, any caller must not return until the add
658
+ * or remove is fully completed.
659
+ */
660
+static int add_client_context(struct ib_device *device,
661
+ struct ib_client *client)
297662 {
298
- struct ib_client_data *context;
299
- unsigned long flags;
663
+ int ret = 0;
300664
301
- context = kmalloc(sizeof *context, GFP_KERNEL);
302
- if (!context)
665
+ if (!device->kverbs_provider && !client->no_kverbs_req)
666
+ return 0;
667
+
668
+ down_write(&device->client_data_rwsem);
669
+ /*
670
+ * So long as the client is registered hold both the client and device
671
+ * unregistration locks.
672
+ */
673
+ if (!refcount_inc_not_zero(&client->uses))
674
+ goto out_unlock;
675
+ refcount_inc(&device->refcount);
676
+
677
+ /*
678
+ * Another caller to add_client_context got here first and has already
679
+ * completely initialized context.
680
+ */
681
+ if (xa_get_mark(&device->client_data, client->client_id,
682
+ CLIENT_DATA_REGISTERED))
683
+ goto out;
684
+
685
+ ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
686
+ GFP_KERNEL));
687
+ if (ret)
688
+ goto out;
689
+ downgrade_write(&device->client_data_rwsem);
690
+ if (client->add) {
691
+ if (client->add(device)) {
692
+ /*
693
+ * If a client fails to add then the error code is
694
+ * ignored, but we won't call any more ops on this
695
+ * client.
696
+ */
697
+ xa_erase(&device->client_data, client->client_id);
698
+ up_read(&device->client_data_rwsem);
699
+ ib_device_put(device);
700
+ ib_client_put(client);
701
+ return 0;
702
+ }
703
+ }
704
+
705
+ /* Readers shall not see a client until add has been completed */
706
+ xa_set_mark(&device->client_data, client->client_id,
707
+ CLIENT_DATA_REGISTERED);
708
+ up_read(&device->client_data_rwsem);
709
+ return 0;
710
+
711
+out:
712
+ ib_device_put(device);
713
+ ib_client_put(client);
714
+out_unlock:
715
+ up_write(&device->client_data_rwsem);
716
+ return ret;
717
+}
718
+
719
+static void remove_client_context(struct ib_device *device,
720
+ unsigned int client_id)
721
+{
722
+ struct ib_client *client;
723
+ void *client_data;
724
+
725
+ down_write(&device->client_data_rwsem);
726
+ if (!xa_get_mark(&device->client_data, client_id,
727
+ CLIENT_DATA_REGISTERED)) {
728
+ up_write(&device->client_data_rwsem);
729
+ return;
730
+ }
731
+ client_data = xa_load(&device->client_data, client_id);
732
+ xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
733
+ client = xa_load(&clients, client_id);
734
+ up_write(&device->client_data_rwsem);
735
+
736
+ /*
737
+ * Notice we cannot be holding any exclusive locks when calling the
738
+ * remove callback as the remove callback can recurse back into any
739
+ * public functions in this module and thus try for any locks those
740
+ * functions take.
741
+ *
742
+ * For this reason clients and drivers should not call the
743
+ * unregistration functions will holdling any locks.
744
+ */
745
+ if (client->remove)
746
+ client->remove(device, client_data);
747
+
748
+ xa_erase(&device->client_data, client_id);
749
+ ib_device_put(device);
750
+ ib_client_put(client);
751
+}
752
+
753
+static int alloc_port_data(struct ib_device *device)
754
+{
755
+ struct ib_port_data_rcu *pdata_rcu;
756
+ unsigned int port;
757
+
758
+ if (device->port_data)
759
+ return 0;
760
+
761
+ /* This can only be called once the physical port range is defined */
762
+ if (WARN_ON(!device->phys_port_cnt))
763
+ return -EINVAL;
764
+
765
+ /*
766
+ * device->port_data is indexed directly by the port number to make
767
+ * access to this data as efficient as possible.
768
+ *
769
+ * Therefore port_data is declared as a 1 based array with potential
770
+ * empty slots at the beginning.
771
+ */
772
+ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
773
+ rdma_end_port(device) + 1),
774
+ GFP_KERNEL);
775
+ if (!pdata_rcu)
303776 return -ENOMEM;
777
+ /*
778
+ * The rcu_head is put in front of the port data array and the stored
779
+ * pointer is adjusted since we never need to see that member until
780
+ * kfree_rcu.
781
+ */
782
+ device->port_data = pdata_rcu->pdata;
304783
305
- context->client = client;
306
- context->data = NULL;
307
- context->going_down = false;
784
+ rdma_for_each_port (device, port) {
785
+ struct ib_port_data *pdata = &device->port_data[port];
308786
309
- down_write(&lists_rwsem);
310
- spin_lock_irqsave(&device->client_data_lock, flags);
311
- list_add(&context->list, &device->client_data_list);
312
- spin_unlock_irqrestore(&device->client_data_lock, flags);
313
- up_write(&lists_rwsem);
314
-
787
+ pdata->ib_dev = device;
788
+ spin_lock_init(&pdata->pkey_list_lock);
789
+ INIT_LIST_HEAD(&pdata->pkey_list);
790
+ spin_lock_init(&pdata->netdev_lock);
791
+ INIT_HLIST_NODE(&pdata->ndev_hash_link);
792
+ }
315793 return 0;
316794 }
317795
....@@ -321,29 +799,20 @@
321799 rdma_max_mad_size(dev, port) != 0);
322800 }
323801
324
-static int read_port_immutable(struct ib_device *device)
802
+static int setup_port_data(struct ib_device *device)
325803 {
804
+ unsigned int port;
326805 int ret;
327
- u8 start_port = rdma_start_port(device);
328
- u8 end_port = rdma_end_port(device);
329
- u8 port;
330806
331
- /**
332
- * device->port_immutable is indexed directly by the port number to make
333
- * access to this data as efficient as possible.
334
- *
335
- * Therefore port_immutable is declared as a 1 based array with
336
- * potential empty slots at the beginning.
337
- */
338
- device->port_immutable = kcalloc(end_port + 1,
339
- sizeof(*device->port_immutable),
340
- GFP_KERNEL);
341
- if (!device->port_immutable)
342
- return -ENOMEM;
807
+ ret = alloc_port_data(device);
808
+ if (ret)
809
+ return ret;
343810
344
- for (port = start_port; port <= end_port; ++port) {
345
- ret = device->get_port_immutable(device, port,
346
- &device->port_immutable[port]);
811
+ rdma_for_each_port (device, port) {
812
+ struct ib_port_data *pdata = &device->port_data[port];
813
+
814
+ ret = device->ops.get_port_immutable(device, port,
815
+ &pdata->immutable);
347816 if (ret)
348817 return ret;
349818
....@@ -355,46 +824,23 @@
355824
356825 void ib_get_device_fw_str(struct ib_device *dev, char *str)
357826 {
358
- if (dev->get_dev_fw_str)
359
- dev->get_dev_fw_str(dev, str);
827
+ if (dev->ops.get_dev_fw_str)
828
+ dev->ops.get_dev_fw_str(dev, str);
360829 else
361830 str[0] = '\0';
362831 }
363832 EXPORT_SYMBOL(ib_get_device_fw_str);
364833
365
-static int setup_port_pkey_list(struct ib_device *device)
366
-{
367
- int i;
368
-
369
- /**
370
- * device->port_pkey_list is indexed directly by the port number,
371
- * Therefore it is declared as a 1 based array with potential empty
372
- * slots at the beginning.
373
- */
374
- device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
375
- sizeof(*device->port_pkey_list),
376
- GFP_KERNEL);
377
-
378
- if (!device->port_pkey_list)
379
- return -ENOMEM;
380
-
381
- for (i = 0; i < (rdma_end_port(device) + 1); i++) {
382
- spin_lock_init(&device->port_pkey_list[i].list_lock);
383
- INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
384
- }
385
-
386
- return 0;
387
-}
388
-
389834 static void ib_policy_change_task(struct work_struct *work)
390835 {
391836 struct ib_device *dev;
837
+ unsigned long index;
392838
393
- down_read(&lists_rwsem);
394
- list_for_each_entry(dev, &device_list, core_list) {
395
- int i;
839
+ down_read(&devices_rwsem);
840
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
841
+ unsigned int i;
396842
397
- for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
843
+ rdma_for_each_port (dev, i) {
398844 u64 sp;
399845 int ret = ib_get_cached_subnet_prefix(dev,
400846 i,
....@@ -407,7 +853,7 @@
407853 ib_security_cache_change(dev, i, sp);
408854 }
409855 }
410
- up_read(&lists_rwsem);
856
+ up_read(&devices_rwsem);
411857 }
412858
413859 static int ib_security_change(struct notifier_block *nb, unsigned long event,
....@@ -417,208 +863,857 @@
417863 return NOTIFY_DONE;
418864
419865 schedule_work(&ib_policy_change_work);
866
+ ib_mad_agent_security_change();
420867
421868 return NOTIFY_OK;
422869 }
423870
424
-/**
425
- * __dev_new_index - allocate an device index
426
- *
427
- * Returns a suitable unique value for a new device interface
428
- * number. It assumes that there are less than 2^32-1 ib devices
429
- * will be present in the system.
430
- */
431
-static u32 __dev_new_index(void)
871
+static void compatdev_release(struct device *dev)
432872 {
873
+ struct ib_core_device *cdev =
874
+ container_of(dev, struct ib_core_device, dev);
875
+
876
+ kfree(cdev);
877
+}
878
+
879
+static int add_one_compat_dev(struct ib_device *device,
880
+ struct rdma_dev_net *rnet)
881
+{
882
+ struct ib_core_device *cdev;
883
+ int ret;
884
+
885
+ lockdep_assert_held(&rdma_nets_rwsem);
886
+ if (!ib_devices_shared_netns)
887
+ return 0;
888
+
433889 /*
434
- * The device index to allow stable naming.
435
- * Similar to struct net -> ifindex.
890
+ * Create and add compat device in all namespaces other than where it
891
+ * is currently bound to.
436892 */
437
- static u32 index;
893
+ if (net_eq(read_pnet(&rnet->net),
894
+ read_pnet(&device->coredev.rdma_net)))
895
+ return 0;
438896
439
- for (;;) {
440
- if (!(++index))
441
- index = 1;
442
-
443
- if (!__ib_device_get_by_index(index))
444
- return index;
897
+ /*
898
+ * The first of init_net() or ib_register_device() to take the
899
+ * compat_devs_mutex wins and gets to add the device. Others will wait
900
+ * for completion here.
901
+ */
902
+ mutex_lock(&device->compat_devs_mutex);
903
+ cdev = xa_load(&device->compat_devs, rnet->id);
904
+ if (cdev) {
905
+ ret = 0;
906
+ goto done;
445907 }
908
+ ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
909
+ if (ret)
910
+ goto done;
911
+
912
+ cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
913
+ if (!cdev) {
914
+ ret = -ENOMEM;
915
+ goto cdev_err;
916
+ }
917
+
918
+ cdev->dev.parent = device->dev.parent;
919
+ rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
920
+ cdev->dev.release = compatdev_release;
921
+ ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
922
+ if (ret)
923
+ goto add_err;
924
+
925
+ ret = device_add(&cdev->dev);
926
+ if (ret)
927
+ goto add_err;
928
+ ret = ib_setup_port_attrs(cdev);
929
+ if (ret)
930
+ goto port_err;
931
+
932
+ ret = xa_err(xa_store(&device->compat_devs, rnet->id,
933
+ cdev, GFP_KERNEL));
934
+ if (ret)
935
+ goto insert_err;
936
+
937
+ mutex_unlock(&device->compat_devs_mutex);
938
+ return 0;
939
+
940
+insert_err:
941
+ ib_free_port_attrs(cdev);
942
+port_err:
943
+ device_del(&cdev->dev);
944
+add_err:
945
+ put_device(&cdev->dev);
946
+cdev_err:
947
+ xa_release(&device->compat_devs, rnet->id);
948
+done:
949
+ mutex_unlock(&device->compat_devs_mutex);
950
+ return ret;
951
+}
952
+
953
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
954
+{
955
+ struct ib_core_device *cdev;
956
+
957
+ mutex_lock(&device->compat_devs_mutex);
958
+ cdev = xa_erase(&device->compat_devs, id);
959
+ mutex_unlock(&device->compat_devs_mutex);
960
+ if (cdev) {
961
+ ib_free_port_attrs(cdev);
962
+ device_del(&cdev->dev);
963
+ put_device(&cdev->dev);
964
+ }
965
+}
966
+
967
+static void remove_compat_devs(struct ib_device *device)
968
+{
969
+ struct ib_core_device *cdev;
970
+ unsigned long index;
971
+
972
+ xa_for_each (&device->compat_devs, index, cdev)
973
+ remove_one_compat_dev(device, index);
974
+}
975
+
976
+static int add_compat_devs(struct ib_device *device)
977
+{
978
+ struct rdma_dev_net *rnet;
979
+ unsigned long index;
980
+ int ret = 0;
981
+
982
+ lockdep_assert_held(&devices_rwsem);
983
+
984
+ down_read(&rdma_nets_rwsem);
985
+ xa_for_each (&rdma_nets, index, rnet) {
986
+ ret = add_one_compat_dev(device, rnet);
987
+ if (ret)
988
+ break;
989
+ }
990
+ up_read(&rdma_nets_rwsem);
991
+ return ret;
992
+}
993
+
994
+static void remove_all_compat_devs(void)
995
+{
996
+ struct ib_compat_device *cdev;
997
+ struct ib_device *dev;
998
+ unsigned long index;
999
+
1000
+ down_read(&devices_rwsem);
1001
+ xa_for_each (&devices, index, dev) {
1002
+ unsigned long c_index = 0;
1003
+
1004
+ /* Hold nets_rwsem so that any other thread modifying this
1005
+ * system param can sync with this thread.
1006
+ */
1007
+ down_read(&rdma_nets_rwsem);
1008
+ xa_for_each (&dev->compat_devs, c_index, cdev)
1009
+ remove_one_compat_dev(dev, c_index);
1010
+ up_read(&rdma_nets_rwsem);
1011
+ }
1012
+ up_read(&devices_rwsem);
1013
+}
1014
+
1015
+static int add_all_compat_devs(void)
1016
+{
1017
+ struct rdma_dev_net *rnet;
1018
+ struct ib_device *dev;
1019
+ unsigned long index;
1020
+ int ret = 0;
1021
+
1022
+ down_read(&devices_rwsem);
1023
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1024
+ unsigned long net_index = 0;
1025
+
1026
+ /* Hold nets_rwsem so that any other thread modifying this
1027
+ * system param can sync with this thread.
1028
+ */
1029
+ down_read(&rdma_nets_rwsem);
1030
+ xa_for_each (&rdma_nets, net_index, rnet) {
1031
+ ret = add_one_compat_dev(dev, rnet);
1032
+ if (ret)
1033
+ break;
1034
+ }
1035
+ up_read(&rdma_nets_rwsem);
1036
+ }
1037
+ up_read(&devices_rwsem);
1038
+ if (ret)
1039
+ remove_all_compat_devs();
1040
+ return ret;
1041
+}
1042
+
1043
+int rdma_compatdev_set(u8 enable)
1044
+{
1045
+ struct rdma_dev_net *rnet;
1046
+ unsigned long index;
1047
+ int ret = 0;
1048
+
1049
+ down_write(&rdma_nets_rwsem);
1050
+ if (ib_devices_shared_netns == enable) {
1051
+ up_write(&rdma_nets_rwsem);
1052
+ return 0;
1053
+ }
1054
+
1055
+ /* enable/disable of compat devices is not supported
1056
+ * when more than default init_net exists.
1057
+ */
1058
+ xa_for_each (&rdma_nets, index, rnet) {
1059
+ ret++;
1060
+ break;
1061
+ }
1062
+ if (!ret)
1063
+ ib_devices_shared_netns = enable;
1064
+ up_write(&rdma_nets_rwsem);
1065
+ if (ret)
1066
+ return -EBUSY;
1067
+
1068
+ if (enable)
1069
+ ret = add_all_compat_devs();
1070
+ else
1071
+ remove_all_compat_devs();
1072
+ return ret;
1073
+}
1074
+
1075
+static void rdma_dev_exit_net(struct net *net)
1076
+{
1077
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1078
+ struct ib_device *dev;
1079
+ unsigned long index;
1080
+ int ret;
1081
+
1082
+ down_write(&rdma_nets_rwsem);
1083
+ /*
1084
+ * Prevent the ID from being re-used and hide the id from xa_for_each.
1085
+ */
1086
+ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
1087
+ WARN_ON(ret);
1088
+ up_write(&rdma_nets_rwsem);
1089
+
1090
+ down_read(&devices_rwsem);
1091
+ xa_for_each (&devices, index, dev) {
1092
+ get_device(&dev->dev);
1093
+ /*
1094
+ * Release the devices_rwsem so that pontentially blocking
1095
+ * device_del, doesn't hold the devices_rwsem for too long.
1096
+ */
1097
+ up_read(&devices_rwsem);
1098
+
1099
+ remove_one_compat_dev(dev, rnet->id);
1100
+
1101
+ /*
1102
+ * If the real device is in the NS then move it back to init.
1103
+ */
1104
+ rdma_dev_change_netns(dev, net, &init_net);
1105
+
1106
+ put_device(&dev->dev);
1107
+ down_read(&devices_rwsem);
1108
+ }
1109
+ up_read(&devices_rwsem);
1110
+
1111
+ rdma_nl_net_exit(rnet);
1112
+ xa_erase(&rdma_nets, rnet->id);
1113
+}
1114
+
1115
+static __net_init int rdma_dev_init_net(struct net *net)
1116
+{
1117
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1118
+ unsigned long index;
1119
+ struct ib_device *dev;
1120
+ int ret;
1121
+
1122
+ write_pnet(&rnet->net, net);
1123
+
1124
+ ret = rdma_nl_net_init(rnet);
1125
+ if (ret)
1126
+ return ret;
1127
+
1128
+ /* No need to create any compat devices in default init_net. */
1129
+ if (net_eq(net, &init_net))
1130
+ return 0;
1131
+
1132
+ ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
1133
+ if (ret) {
1134
+ rdma_nl_net_exit(rnet);
1135
+ return ret;
1136
+ }
1137
+
1138
+ down_read(&devices_rwsem);
1139
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1140
+ /* Hold nets_rwsem so that netlink command cannot change
1141
+ * system configuration for device sharing mode.
1142
+ */
1143
+ down_read(&rdma_nets_rwsem);
1144
+ ret = add_one_compat_dev(dev, rnet);
1145
+ up_read(&rdma_nets_rwsem);
1146
+ if (ret)
1147
+ break;
1148
+ }
1149
+ up_read(&devices_rwsem);
1150
+
1151
+ if (ret)
1152
+ rdma_dev_exit_net(net);
1153
+
1154
+ return ret;
1155
+}
1156
+
1157
+/*
1158
+ * Assign the unique string device name and the unique device index. This is
1159
+ * undone by ib_dealloc_device.
1160
+ */
1161
+static int assign_name(struct ib_device *device, const char *name)
1162
+{
1163
+ static u32 last_id;
1164
+ int ret;
1165
+
1166
+ down_write(&devices_rwsem);
1167
+ /* Assign a unique name to the device */
1168
+ if (strchr(name, '%'))
1169
+ ret = alloc_name(device, name);
1170
+ else
1171
+ ret = dev_set_name(&device->dev, name);
1172
+ if (ret)
1173
+ goto out;
1174
+
1175
+ if (__ib_device_get_by_name(dev_name(&device->dev))) {
1176
+ ret = -ENFILE;
1177
+ goto out;
1178
+ }
1179
+ strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
1180
+
1181
+ ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
1182
+ &last_id, GFP_KERNEL);
1183
+ if (ret > 0)
1184
+ ret = 0;
1185
+
1186
+out:
1187
+ up_write(&devices_rwsem);
1188
+ return ret;
1189
+}
1190
+
1191
+/*
1192
+ * setup_device() allocates memory and sets up data that requires calling the
1193
+ * device ops, this is the only reason these actions are not done during
1194
+ * ib_alloc_device. It is undone by ib_dealloc_device().
1195
+ */
1196
+static int setup_device(struct ib_device *device)
1197
+{
1198
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
1199
+ int ret;
1200
+
1201
+ ib_device_check_mandatory(device);
1202
+
1203
+ ret = setup_port_data(device);
1204
+ if (ret) {
1205
+ dev_warn(&device->dev, "Couldn't create per-port data\n");
1206
+ return ret;
1207
+ }
1208
+
1209
+ memset(&device->attrs, 0, sizeof(device->attrs));
1210
+ ret = device->ops.query_device(device, &device->attrs, &uhw);
1211
+ if (ret) {
1212
+ dev_warn(&device->dev,
1213
+ "Couldn't query the device attributes\n");
1214
+ return ret;
1215
+ }
1216
+
1217
+ return 0;
1218
+}
1219
+
1220
+static void disable_device(struct ib_device *device)
1221
+{
1222
+ u32 cid;
1223
+
1224
+ WARN_ON(!refcount_read(&device->refcount));
1225
+
1226
+ down_write(&devices_rwsem);
1227
+ xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
1228
+ up_write(&devices_rwsem);
1229
+
1230
+ /*
1231
+ * Remove clients in LIFO order, see assign_client_id. This could be
1232
+ * more efficient if xarray learns to reverse iterate. Since no new
1233
+ * clients can be added to this ib_device past this point we only need
1234
+ * the maximum possible client_id value here.
1235
+ */
1236
+ down_read(&clients_rwsem);
1237
+ cid = highest_client_id;
1238
+ up_read(&clients_rwsem);
1239
+ while (cid) {
1240
+ cid--;
1241
+ remove_client_context(device, cid);
1242
+ }
1243
+
1244
+ ib_cq_pool_destroy(device);
1245
+
1246
+ /* Pairs with refcount_set in enable_device */
1247
+ ib_device_put(device);
1248
+ wait_for_completion(&device->unreg_completion);
1249
+
1250
+ /*
1251
+ * compat devices must be removed after device refcount drops to zero.
1252
+ * Otherwise init_net() may add more compatdevs after removing compat
1253
+ * devices and before device is disabled.
1254
+ */
1255
+ remove_compat_devs(device);
1256
+}
1257
+
1258
+/*
1259
+ * An enabled device is visible to all clients and to all the public facing
1260
+ * APIs that return a device pointer. This always returns with a new get, even
1261
+ * if it fails.
1262
+ */
1263
+static int enable_device_and_get(struct ib_device *device)
1264
+{
1265
+ struct ib_client *client;
1266
+ unsigned long index;
1267
+ int ret = 0;
1268
+
1269
+ /*
1270
+ * One ref belongs to the xa and the other belongs to this
1271
+ * thread. This is needed to guard against parallel unregistration.
1272
+ */
1273
+ refcount_set(&device->refcount, 2);
1274
+ down_write(&devices_rwsem);
1275
+ xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
1276
+
1277
+ /*
1278
+ * By using downgrade_write() we ensure that no other thread can clear
1279
+ * DEVICE_REGISTERED while we are completing the client setup.
1280
+ */
1281
+ downgrade_write(&devices_rwsem);
1282
+
1283
+ if (device->ops.enable_driver) {
1284
+ ret = device->ops.enable_driver(device);
1285
+ if (ret)
1286
+ goto out;
1287
+ }
1288
+
1289
+ ib_cq_pool_init(device);
1290
+
1291
+ down_read(&clients_rwsem);
1292
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1293
+ ret = add_client_context(device, client);
1294
+ if (ret)
1295
+ break;
1296
+ }
1297
+ up_read(&clients_rwsem);
1298
+ if (!ret)
1299
+ ret = add_compat_devs(device);
1300
+out:
1301
+ up_read(&devices_rwsem);
1302
+ return ret;
1303
+}
1304
+
1305
+static void prevent_dealloc_device(struct ib_device *ib_dev)
1306
+{
4461307 }
4471308
4481309 /**
4491310 * ib_register_device - Register an IB device with IB core
450
- * @device:Device to register
1311
+ * @device: Device to register
1312
+ * @name: unique string device name. This may include a '%' which will
1313
+ * cause a unique index to be added to the passed device name.
1314
+ * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
1315
+ * device will be used. In this case the caller should fully
1316
+ * setup the ibdev for DMA. This usually means using dma_virt_ops.
4511317 *
4521318 * Low-level drivers use ib_register_device() to register their
4531319 * devices with the IB core. All registered clients will receive a
4541320 * callback for each device that is added. @device must be allocated
4551321 * with ib_alloc_device().
1322
+ *
1323
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
1324
+ * asynchronously then the device pointer may become freed as soon as this
1325
+ * function returns.
4561326 */
457
-int ib_register_device(struct ib_device *device,
458
- int (*port_callback)(struct ib_device *,
459
- u8, struct kobject *))
1327
+int ib_register_device(struct ib_device *device, const char *name,
1328
+ struct device *dma_device)
4601329 {
4611330 int ret;
462
- struct ib_client *client;
463
- struct ib_udata uhw = {.outlen = 0, .inlen = 0};
464
- struct device *parent = device->dev.parent;
4651331
466
- WARN_ON_ONCE(device->dma_device);
467
- if (device->dev.dma_ops) {
468
- /*
469
- * The caller provided custom DMA operations. Copy the
470
- * DMA-related fields that are used by e.g. dma_alloc_coherent()
471
- * into device->dev.
472
- */
473
- device->dma_device = &device->dev;
474
- if (!device->dev.dma_mask) {
475
- if (parent)
476
- device->dev.dma_mask = parent->dma_mask;
477
- else
478
- WARN_ON_ONCE(true);
479
- }
480
- if (!device->dev.coherent_dma_mask) {
481
- if (parent)
482
- device->dev.coherent_dma_mask =
483
- parent->coherent_dma_mask;
484
- else
485
- WARN_ON_ONCE(true);
486
- }
487
- } else {
488
- /*
489
- * The caller did not provide custom DMA operations. Use the
490
- * DMA mapping operations of the parent device.
491
- */
492
- WARN_ON_ONCE(!parent);
493
- device->dma_device = parent;
494
- }
1332
+ ret = assign_name(device, name);
1333
+ if (ret)
1334
+ return ret;
4951335
496
- mutex_lock(&device_mutex);
1336
+ /*
1337
+ * If the caller does not provide a DMA capable device then the IB core
1338
+ * will set up ib_sge and scatterlist structures that stash the kernel
1339
+ * virtual address into the address field.
1340
+ */
1341
+ WARN_ON(dma_device && !dma_device->dma_parms);
1342
+ device->dma_device = dma_device;
4971343
498
- if (strchr(device->name, '%')) {
499
- ret = alloc_name(device->name);
500
- if (ret)
501
- goto out;
502
- }
503
-
504
- if (ib_device_check_mandatory(device)) {
505
- ret = -EINVAL;
506
- goto out;
507
- }
508
-
509
- ret = read_port_immutable(device);
510
- if (ret) {
511
- pr_warn("Couldn't create per port immutable data %s\n",
512
- device->name);
513
- goto out;
514
- }
515
-
516
- ret = setup_port_pkey_list(device);
517
- if (ret) {
518
- pr_warn("Couldn't create per port_pkey_list\n");
519
- goto out;
520
- }
1344
+ ret = setup_device(device);
1345
+ if (ret)
1346
+ return ret;
5211347
5221348 ret = ib_cache_setup_one(device);
5231349 if (ret) {
524
- pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
525
- goto port_cleanup;
1350
+ dev_warn(&device->dev,
1351
+ "Couldn't set up InfiniBand P_Key/GID cache\n");
1352
+ return ret;
5261353 }
5271354
528
- ret = ib_device_register_rdmacg(device);
529
- if (ret) {
530
- pr_warn("Couldn't register device with rdma cgroup\n");
531
- goto cache_cleanup;
532
- }
1355
+ ib_device_register_rdmacg(device);
5331356
534
- memset(&device->attrs, 0, sizeof(device->attrs));
535
- ret = device->query_device(device, &device->attrs, &uhw);
536
- if (ret) {
537
- pr_warn("Couldn't query the device attributes\n");
1357
+ rdma_counter_init(device);
1358
+
1359
+ /*
1360
+ * Ensure that ADD uevent is not fired because it
1361
+ * is too early amd device is not initialized yet.
1362
+ */
1363
+ dev_set_uevent_suppress(&device->dev, true);
1364
+ ret = device_add(&device->dev);
1365
+ if (ret)
5381366 goto cg_cleanup;
539
- }
5401367
541
- ret = ib_device_register_sysfs(device, port_callback);
1368
+ ret = ib_device_register_sysfs(device);
5421369 if (ret) {
543
- pr_warn("Couldn't register device %s with driver model\n",
544
- device->name);
545
- goto cg_cleanup;
1370
+ dev_warn(&device->dev,
1371
+ "Couldn't register device with driver model\n");
1372
+ goto dev_cleanup;
5461373 }
5471374
548
- device->reg_state = IB_DEV_REGISTERED;
1375
+ ret = enable_device_and_get(device);
1376
+ if (ret) {
1377
+ void (*dealloc_fn)(struct ib_device *);
5491378
550
- list_for_each_entry(client, &client_list, list)
551
- if (!add_client_context(device, client) && client->add)
552
- client->add(device);
1379
+ /*
1380
+ * If we hit this error flow then we don't want to
1381
+ * automatically dealloc the device since the caller is
1382
+ * expected to call ib_dealloc_device() after
1383
+ * ib_register_device() fails. This is tricky due to the
1384
+ * possibility for a parallel unregistration along with this
1385
+ * error flow. Since we have a refcount here we know any
1386
+ * parallel flow is stopped in disable_device and will see the
1387
+ * special dealloc_driver pointer, causing the responsibility to
1388
+ * ib_dealloc_device() to revert back to this thread.
1389
+ */
1390
+ dealloc_fn = device->ops.dealloc_driver;
1391
+ device->ops.dealloc_driver = prevent_dealloc_device;
1392
+ ib_device_put(device);
1393
+ __ib_unregister_device(device);
1394
+ device->ops.dealloc_driver = dealloc_fn;
1395
+ dev_set_uevent_suppress(&device->dev, false);
1396
+ return ret;
1397
+ }
1398
+ dev_set_uevent_suppress(&device->dev, false);
1399
+ /* Mark for userspace that device is ready */
1400
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1401
+ ib_device_put(device);
5531402
554
- device->index = __dev_new_index();
555
- down_write(&lists_rwsem);
556
- list_add_tail(&device->core_list, &device_list);
557
- up_write(&lists_rwsem);
558
- mutex_unlock(&device_mutex);
5591403 return 0;
5601404
1405
+dev_cleanup:
1406
+ device_del(&device->dev);
5611407 cg_cleanup:
1408
+ dev_set_uevent_suppress(&device->dev, false);
5621409 ib_device_unregister_rdmacg(device);
563
-cache_cleanup:
5641410 ib_cache_cleanup_one(device);
565
- ib_cache_release_one(device);
566
-port_cleanup:
567
- kfree(device->port_immutable);
568
-out:
569
- mutex_unlock(&device_mutex);
5701411 return ret;
5711412 }
5721413 EXPORT_SYMBOL(ib_register_device);
5731414
1415
+/* Callers must hold a get on the device. */
1416
+static void __ib_unregister_device(struct ib_device *ib_dev)
1417
+{
1418
+ /*
1419
+ * We have a registration lock so that all the calls to unregister are
1420
+ * fully fenced, once any unregister returns the device is truely
1421
+ * unregistered even if multiple callers are unregistering it at the
1422
+ * same time. This also interacts with the registration flow and
1423
+ * provides sane semantics if register and unregister are racing.
1424
+ */
1425
+ mutex_lock(&ib_dev->unregistration_lock);
1426
+ if (!refcount_read(&ib_dev->refcount))
1427
+ goto out;
1428
+
1429
+ disable_device(ib_dev);
1430
+
1431
+ /* Expedite removing unregistered pointers from the hash table */
1432
+ free_netdevs(ib_dev);
1433
+
1434
+ ib_device_unregister_sysfs(ib_dev);
1435
+ device_del(&ib_dev->dev);
1436
+ ib_device_unregister_rdmacg(ib_dev);
1437
+ ib_cache_cleanup_one(ib_dev);
1438
+
1439
+ /*
1440
+ * Drivers using the new flow may not call ib_dealloc_device except
1441
+ * in error unwind prior to registration success.
1442
+ */
1443
+ if (ib_dev->ops.dealloc_driver &&
1444
+ ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
1445
+ WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
1446
+ ib_dealloc_device(ib_dev);
1447
+ }
1448
+out:
1449
+ mutex_unlock(&ib_dev->unregistration_lock);
1450
+}
1451
+
5741452 /**
5751453 * ib_unregister_device - Unregister an IB device
576
- * @device:Device to unregister
1454
+ * @ib_dev: The device to unregister
5771455 *
5781456 * Unregister an IB device. All clients will receive a remove callback.
1457
+ *
1458
+ * Callers should call this routine only once, and protect against races with
1459
+ * registration. Typically it should only be called as part of a remove
1460
+ * callback in an implementation of driver core's struct device_driver and
1461
+ * related.
1462
+ *
1463
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
1464
+ * this function.
5791465 */
580
-void ib_unregister_device(struct ib_device *device)
1466
+void ib_unregister_device(struct ib_device *ib_dev)
5811467 {
582
- struct ib_client_data *context, *tmp;
583
- unsigned long flags;
584
-
585
- mutex_lock(&device_mutex);
586
-
587
- down_write(&lists_rwsem);
588
- list_del(&device->core_list);
589
- spin_lock_irqsave(&device->client_data_lock, flags);
590
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
591
- context->going_down = true;
592
- spin_unlock_irqrestore(&device->client_data_lock, flags);
593
- downgrade_write(&lists_rwsem);
594
-
595
- list_for_each_entry_safe(context, tmp, &device->client_data_list,
596
- list) {
597
- if (context->client->remove)
598
- context->client->remove(device, context->data);
599
- }
600
- up_read(&lists_rwsem);
601
-
602
- ib_device_unregister_sysfs(device);
603
- ib_device_unregister_rdmacg(device);
604
-
605
- mutex_unlock(&device_mutex);
606
-
607
- ib_cache_cleanup_one(device);
608
-
609
- ib_security_destroy_port_pkey_list(device);
610
- kfree(device->port_pkey_list);
611
-
612
- down_write(&lists_rwsem);
613
- spin_lock_irqsave(&device->client_data_lock, flags);
614
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
615
- kfree(context);
616
- spin_unlock_irqrestore(&device->client_data_lock, flags);
617
- up_write(&lists_rwsem);
618
-
619
- device->reg_state = IB_DEV_UNREGISTERED;
1468
+ get_device(&ib_dev->dev);
1469
+ __ib_unregister_device(ib_dev);
1470
+ put_device(&ib_dev->dev);
6201471 }
6211472 EXPORT_SYMBOL(ib_unregister_device);
1473
+
1474
+/**
1475
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
1476
+ * @ib_dev: The device to unregister
1477
+ *
1478
+ * This is the same as ib_unregister_device(), except it includes an internal
1479
+ * ib_device_put() that should match a 'get' obtained by the caller.
1480
+ *
1481
+ * It is safe to call this routine concurrently from multiple threads while
1482
+ * holding the 'get'. When the function returns the device is fully
1483
+ * unregistered.
1484
+ *
1485
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
1486
+ * their resources associated with the device and dealloc it.
1487
+ */
1488
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
1489
+{
1490
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1491
+ get_device(&ib_dev->dev);
1492
+ ib_device_put(ib_dev);
1493
+ __ib_unregister_device(ib_dev);
1494
+ put_device(&ib_dev->dev);
1495
+}
1496
+EXPORT_SYMBOL(ib_unregister_device_and_put);
1497
+
1498
+/**
1499
+ * ib_unregister_driver - Unregister all IB devices for a driver
1500
+ * @driver_id: The driver to unregister
1501
+ *
1502
+ * This implements a fence for device unregistration. It only returns once all
1503
+ * devices associated with the driver_id have fully completed their
1504
+ * unregistration and returned from ib_unregister_device*().
1505
+ *
1506
+ * If device's are not yet unregistered it goes ahead and starts unregistering
1507
+ * them.
1508
+ *
1509
+ * This does not block creation of new devices with the given driver_id, that
1510
+ * is the responsibility of the caller.
1511
+ */
1512
+void ib_unregister_driver(enum rdma_driver_id driver_id)
1513
+{
1514
+ struct ib_device *ib_dev;
1515
+ unsigned long index;
1516
+
1517
+ down_read(&devices_rwsem);
1518
+ xa_for_each (&devices, index, ib_dev) {
1519
+ if (ib_dev->ops.driver_id != driver_id)
1520
+ continue;
1521
+
1522
+ get_device(&ib_dev->dev);
1523
+ up_read(&devices_rwsem);
1524
+
1525
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1526
+ __ib_unregister_device(ib_dev);
1527
+
1528
+ put_device(&ib_dev->dev);
1529
+ down_read(&devices_rwsem);
1530
+ }
1531
+ up_read(&devices_rwsem);
1532
+}
1533
+EXPORT_SYMBOL(ib_unregister_driver);
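A sketch of the usual caller, a driver's module exit; RDMA_DRIVER_RXE stands in here for the caller's own enum rdma_driver_id value:

static void __exit my_driver_exit(void)
{
	/*
	 * Acts as a fence: returns only after every device with this
	 * driver_id has completed ib_unregister_device*().
	 */
	ib_unregister_driver(RDMA_DRIVER_RXE);
}
module_exit(my_driver_exit);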
1534
+
1535
+static void ib_unregister_work(struct work_struct *work)
1536
+{
1537
+ struct ib_device *ib_dev =
1538
+ container_of(work, struct ib_device, unregistration_work);
1539
+
1540
+ __ib_unregister_device(ib_dev);
1541
+ put_device(&ib_dev->dev);
1542
+}
1543
+
1544
+/**
1545
+ * ib_unregister_device_queued - Unregister a device using a work queue
1546
+ * @ib_dev: The device to unregister
1547
+ *
1548
+ * This schedules an asynchronous unregistration using a WQ for the device. A
1549
+ * driver should use this to avoid holding locks while doing unregistration,
1550
+ * such as holding the RTNL lock.
1551
+ *
1552
+ * Drivers using this API must use ib_unregister_driver before module unload
1553
+ * to ensure that all scheduled unregistrations have completed.
1554
+ */
1555
+void ib_unregister_device_queued(struct ib_device *ib_dev)
1556
+{
1557
+ WARN_ON(!refcount_read(&ib_dev->refcount));
1558
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1559
+ get_device(&ib_dev->dev);
1560
+ if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
1561
+ put_device(&ib_dev->dev);
1562
+}
1563
+EXPORT_SYMBOL(ib_unregister_device_queued);
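For illustration, a hypothetical sketch of the locked-context case described above, deferring unregistration from a netdev notifier that runs under the RTNL lock (my_dev_from_notifier() is an assumed helper):

static int my_netdev_event(struct notifier_block *nb, unsigned long event,
			   void *ptr)
{
	struct my_dev *mdev = my_dev_from_notifier(nb);

	if (event == NETDEV_UNREGISTER) {
		/* cannot unregister synchronously under RTNL; defer it */
		ib_unregister_device_queued(&mdev->ibdev);
	}
	return NOTIFY_DONE;
}

/* module exit must still call ib_unregister_driver() to fence the queued work */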
1564
+
1565
+/*
1566
+ * The caller must pass in a device that has the kref held and the refcount
1567
+ * released. If the device is in cur_net and still registered then it is moved
1568
+ * into net.
1569
+ */
1570
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
1571
+ struct net *net)
1572
+{
1573
+ int ret2 = -EINVAL;
1574
+ int ret;
1575
+
1576
+ mutex_lock(&device->unregistration_lock);
1577
+
1578
+ /*
1579
+ * If a device is not under ib_device_get() or if the unregistration_lock
1580
+ * is not held, the namespace can be changed or the device can be unregistered.
1581
+ * Check again under the lock.
1582
+ */
1583
+ if (refcount_read(&device->refcount) == 0 ||
1584
+ !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
1585
+ ret = -ENODEV;
1586
+ goto out;
1587
+ }
1588
+
1589
+ kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
1590
+ disable_device(device);
1591
+
1592
+ /*
1593
+ * At this point no one can be using the device, so it is safe to
1594
+ * change the namespace.
1595
+ */
1596
+ write_pnet(&device->coredev.rdma_net, net);
1597
+
1598
+ down_read(&devices_rwsem);
1599
+ /*
1600
+ * Currently rdma devices are system-wide unique, so the device name
1601
+ * is guaranteed to be free in the new namespace. Publish the new namespace
1602
+ * at the sysfs level.
1603
+ */
1604
+ ret = device_rename(&device->dev, dev_name(&device->dev));
1605
+ up_read(&devices_rwsem);
1606
+ if (ret) {
1607
+ dev_warn(&device->dev,
1608
+ "%s: Couldn't rename device after namespace change\n",
1609
+ __func__);
1610
+ /* Try and put things back and re-enable the device */
1611
+ write_pnet(&device->coredev.rdma_net, cur_net);
1612
+ }
1613
+
1614
+ ret2 = enable_device_and_get(device);
1615
+ if (ret2) {
1616
+ /*
1617
+ * This shouldn't really happen, but if it does, let the user
1618
+ * retry at a later point. So don't disable the device.
1619
+ */
1620
+ dev_warn(&device->dev,
1621
+ "%s: Couldn't re-enable device after namespace change\n",
1622
+ __func__);
1623
+ }
1624
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1625
+
1626
+ ib_device_put(device);
1627
+out:
1628
+ mutex_unlock(&device->unregistration_lock);
1629
+ if (ret)
1630
+ return ret;
1631
+ return ret2;
1632
+}
1633
+
1634
+int ib_device_set_netns_put(struct sk_buff *skb,
1635
+ struct ib_device *dev, u32 ns_fd)
1636
+{
1637
+ struct net *net;
1638
+ int ret;
1639
+
1640
+ net = get_net_ns_by_fd(ns_fd);
1641
+ if (IS_ERR(net)) {
1642
+ ret = PTR_ERR(net);
1643
+ goto net_err;
1644
+ }
1645
+
1646
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
1647
+ ret = -EPERM;
1648
+ goto ns_err;
1649
+ }
1650
+
1651
+ /*
1652
+ * Currently supported only for those providers which support
1653
+ * disassociation and don't do port specific sysfs init. Once a
1654
+ * port_cleanup infrastructure is implemented, this limitation will be
1655
+ * removed.
1656
+ */
1657
+ if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
1658
+ ib_devices_shared_netns) {
1659
+ ret = -EOPNOTSUPP;
1660
+ goto ns_err;
1661
+ }
1662
+
1663
+ get_device(&dev->dev);
1664
+ ib_device_put(dev);
1665
+ ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
1666
+ put_device(&dev->dev);
1667
+
1668
+ put_net(net);
1669
+ return ret;
1670
+
1671
+ns_err:
1672
+ put_net(net);
1673
+net_err:
1674
+ ib_device_put(dev);
1675
+ return ret;
1676
+}
1677
+
1678
+static struct pernet_operations rdma_dev_net_ops = {
1679
+ .init = rdma_dev_init_net,
1680
+ .exit = rdma_dev_exit_net,
1681
+ .id = &rdma_dev_net_id,
1682
+ .size = sizeof(struct rdma_dev_net),
1683
+};
1684
+
1685
+static int assign_client_id(struct ib_client *client)
1686
+{
1687
+ int ret;
1688
+
1689
+ down_write(&clients_rwsem);
1690
+ /*
1691
+ * The add/remove callbacks must be called in FIFO/LIFO order. To
1692
+ * achieve this we assign client_ids so they are sorted in
1693
+ * registration order.
1694
+ */
1695
+ client->client_id = highest_client_id;
1696
+ ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
1697
+ if (ret)
1698
+ goto out;
1699
+
1700
+ highest_client_id++;
1701
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
1702
+
1703
+out:
1704
+ up_write(&clients_rwsem);
1705
+ return ret;
1706
+}
1707
+
1708
+static void remove_client_id(struct ib_client *client)
1709
+{
1710
+ down_write(&clients_rwsem);
1711
+ xa_erase(&clients, client->client_id);
1712
+ for (; highest_client_id; highest_client_id--)
1713
+ if (xa_load(&clients, highest_client_id - 1))
1714
+ break;
1715
+ up_write(&clients_rwsem);
1716
+}
6221717
6231718 /**
6241719 * ib_register_client - Register an IB client
....@@ -636,19 +1731,25 @@
6361731 int ib_register_client(struct ib_client *client)
6371732 {
6381733 struct ib_device *device;
1734
+ unsigned long index;
1735
+ int ret;
6391736
640
- mutex_lock(&device_mutex);
1737
+ refcount_set(&client->uses, 1);
1738
+ init_completion(&client->uses_zero);
1739
+ ret = assign_client_id(client);
1740
+ if (ret)
1741
+ return ret;
6411742
642
- list_for_each_entry(device, &device_list, core_list)
643
- if (!add_client_context(device, client) && client->add)
644
- client->add(device);
645
-
646
- down_write(&lists_rwsem);
647
- list_add_tail(&client->list, &client_list);
648
- up_write(&lists_rwsem);
649
-
650
- mutex_unlock(&device_mutex);
651
-
1743
+ down_read(&devices_rwsem);
1744
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
1745
+ ret = add_client_context(device, client);
1746
+ if (ret) {
1747
+ up_read(&devices_rwsem);
1748
+ ib_unregister_client(client);
1749
+ return ret;
1750
+ }
1751
+ }
1752
+ up_read(&devices_rwsem);
6521753 return 0;
6531754 }
6541755 EXPORT_SYMBOL(ib_register_client);
....@@ -660,80 +1761,140 @@
6601761 * Upper level users use ib_unregister_client() to remove their client
6611762 * registration. When ib_unregister_client() is called, the client
6621763 * will receive a remove callback for each IB device still registered.
1764
+ *
1765
+ * This is a full fence; once it returns, no client callbacks will be called,
1766
+ * nor will any still be running in another thread.
6631767 */
6641768 void ib_unregister_client(struct ib_client *client)
6651769 {
666
- struct ib_client_data *context, *tmp;
6671770 struct ib_device *device;
668
- unsigned long flags;
1771
+ unsigned long index;
6691772
670
- mutex_lock(&device_mutex);
1773
+ down_write(&clients_rwsem);
1774
+ ib_client_put(client);
1775
+ xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
1776
+ up_write(&clients_rwsem);
6711777
672
- down_write(&lists_rwsem);
673
- list_del(&client->list);
674
- up_write(&lists_rwsem);
675
-
676
- list_for_each_entry(device, &device_list, core_list) {
677
- struct ib_client_data *found_context = NULL;
678
-
679
- down_write(&lists_rwsem);
680
- spin_lock_irqsave(&device->client_data_lock, flags);
681
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
682
- if (context->client == client) {
683
- context->going_down = true;
684
- found_context = context;
685
- break;
686
- }
687
- spin_unlock_irqrestore(&device->client_data_lock, flags);
688
- up_write(&lists_rwsem);
689
-
690
- if (client->remove)
691
- client->remove(device, found_context ?
692
- found_context->data : NULL);
693
-
694
- if (!found_context) {
695
- pr_warn("No client context found for %s/%s\n",
696
- device->name, client->name);
1778
+ /* We do not want to hold locks while calling client->remove() */
1779
+ rcu_read_lock();
1780
+ xa_for_each (&devices, index, device) {
1781
+ if (!ib_device_try_get(device))
6971782 continue;
698
- }
1783
+ rcu_read_unlock();
6991784
700
- down_write(&lists_rwsem);
701
- spin_lock_irqsave(&device->client_data_lock, flags);
702
- list_del(&found_context->list);
703
- kfree(found_context);
704
- spin_unlock_irqrestore(&device->client_data_lock, flags);
705
- up_write(&lists_rwsem);
1785
+ remove_client_context(device, client->client_id);
1786
+
1787
+ ib_device_put(device);
1788
+ rcu_read_lock();
7061789 }
1790
+ rcu_read_unlock();
7071791
708
- mutex_unlock(&device_mutex);
1792
+ /*
1793
+ * remove_client_context() is not a fence; it can return even though a
1794
+ * removal is ongoing. Wait until all removals are completed.
1795
+ */
1796
+ wait_for_completion(&client->uses_zero);
1797
+ remove_client_id(client);
7091798 }
7101799 EXPORT_SYMBOL(ib_unregister_client);
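To make the add/remove contract concrete, a minimal hypothetical client (assuming, as add_client_context() above expects in this tree, that the add() callback returns int):

static int my_add_one(struct ib_device *device);
static void my_remove_one(struct ib_device *device, void *client_data);

static struct ib_client my_client = {
	.name   = "my_client",
	.add    = my_add_one,
	.remove = my_remove_one,
};

struct my_client_ctx {
	struct ib_device *device;
};

static int my_add_one(struct ib_device *device)
{
	struct my_client_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;
	ctx->device = device;
	ib_set_client_data(device, &my_client, ctx);
	return 0;
}

static void my_remove_one(struct ib_device *device, void *client_data)
{
	kfree(client_data);	/* the context stored by my_add_one() */
}

/* module init: ib_register_client(&my_client);
 * module exit: ib_unregister_client(&my_client);  -- full fence, see above */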
7111800
712
-/**
713
- * ib_get_client_data - Get IB client context
714
- * @device:Device to get context for
715
- * @client:Client to get context for
716
- *
717
- * ib_get_client_data() returns client context set with
718
- * ib_set_client_data().
719
- */
720
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
1801
+static int __ib_get_global_client_nl_info(const char *client_name,
1802
+ struct ib_client_nl_info *res)
7211803 {
722
- struct ib_client_data *context;
723
- void *ret = NULL;
724
- unsigned long flags;
1804
+ struct ib_client *client;
1805
+ unsigned long index;
1806
+ int ret = -ENOENT;
7251807
726
- spin_lock_irqsave(&device->client_data_lock, flags);
727
- list_for_each_entry(context, &device->client_data_list, list)
728
- if (context->client == client) {
729
- ret = context->data;
1808
+ down_read(&clients_rwsem);
1809
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1810
+ if (strcmp(client->name, client_name) != 0)
1811
+ continue;
1812
+ if (!client->get_global_nl_info) {
1813
+ ret = -EOPNOTSUPP;
7301814 break;
7311815 }
732
- spin_unlock_irqrestore(&device->client_data_lock, flags);
1816
+ ret = client->get_global_nl_info(res);
1817
+ if (WARN_ON(ret == -ENOENT))
1818
+ ret = -EINVAL;
1819
+ if (!ret && res->cdev)
1820
+ get_device(res->cdev);
1821
+ break;
1822
+ }
1823
+ up_read(&clients_rwsem);
1824
+ return ret;
1825
+}
1826
+
1827
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
1828
+ const char *client_name,
1829
+ struct ib_client_nl_info *res)
1830
+{
1831
+ unsigned long index;
1832
+ void *client_data;
1833
+ int ret = -ENOENT;
1834
+
1835
+ down_read(&ibdev->client_data_rwsem);
1836
+ xan_for_each_marked (&ibdev->client_data, index, client_data,
1837
+ CLIENT_DATA_REGISTERED) {
1838
+ struct ib_client *client = xa_load(&clients, index);
1839
+
1840
+ if (!client || strcmp(client->name, client_name) != 0)
1841
+ continue;
1842
+ if (!client->get_nl_info) {
1843
+ ret = -EOPNOTSUPP;
1844
+ break;
1845
+ }
1846
+ ret = client->get_nl_info(ibdev, client_data, res);
1847
+ if (WARN_ON(ret == -ENOENT))
1848
+ ret = -EINVAL;
1849
+
1850
+ /*
1851
+ * The cdev is guaranteed valid as long as we are inside the
1852
+ * client_data_rwsem as remove_one can't be called. Keep it
1853
+ * valid for the caller.
1854
+ */
1855
+ if (!ret && res->cdev)
1856
+ get_device(res->cdev);
1857
+ break;
1858
+ }
1859
+ up_read(&ibdev->client_data_rwsem);
7331860
7341861 return ret;
7351862 }
736
-EXPORT_SYMBOL(ib_get_client_data);
1863
+
1864
+/**
1865
+ * ib_get_client_nl_info - Fetch the nl_info from a client
1866
+ * @ibdev: IB device
1867
+ * @client_name: Name of the client
1868
+ * @res: Result of the query
1869
+ */
1870
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
1871
+ struct ib_client_nl_info *res)
1872
+{
1873
+ int ret;
1874
+
1875
+ if (ibdev)
1876
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
1877
+ else
1878
+ ret = __ib_get_global_client_nl_info(client_name, res);
1879
+#ifdef CONFIG_MODULES
1880
+ if (ret == -ENOENT) {
1881
+ request_module("rdma-client-%s", client_name);
1882
+ if (ibdev)
1883
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
1884
+ else
1885
+ ret = __ib_get_global_client_nl_info(client_name, res);
1886
+ }
1887
+#endif
1888
+ if (ret) {
1889
+ if (ret == -ENOENT)
1890
+ return -EOPNOTSUPP;
1891
+ return ret;
1892
+ }
1893
+
1894
+ if (WARN_ON(!res->cdev))
1895
+ return -EINVAL;
1896
+ return 0;
1897
+}
7371898
7381899 /**
7391900 * ib_set_client_data - Set IB client context
....@@ -741,27 +1902,22 @@
7411902 * @client:Client to set context for
7421903 * @data:Context to set
7431904 *
744
- * ib_set_client_data() sets client context that can be retrieved with
745
- * ib_get_client_data().
1905
+ * ib_set_client_data() sets client context data that can be retrieved with
1906
+ * ib_get_client_data(). This can only be called while the client is
1907
+ * registered to the device, once the ib_client remove() callback returns this
1908
+ * cannot be called.
7461909 */
7471910 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
7481911 void *data)
7491912 {
750
- struct ib_client_data *context;
751
- unsigned long flags;
1913
+ void *rc;
7521914
753
- spin_lock_irqsave(&device->client_data_lock, flags);
754
- list_for_each_entry(context, &device->client_data_list, list)
755
- if (context->client == client) {
756
- context->data = data;
757
- goto out;
758
- }
1915
+ if (WARN_ON(IS_ERR(data)))
1916
+ data = NULL;
7591917
760
- pr_warn("No client context found for %s/%s\n",
761
- device->name, client->name);
762
-
763
-out:
764
- spin_unlock_irqrestore(&device->client_data_lock, flags);
1918
+ rc = xa_store(&device->client_data, client->client_id, data,
1919
+ GFP_KERNEL);
1920
+ WARN_ON(xa_is_err(rc));
7651921 }
7661922 EXPORT_SYMBOL(ib_set_client_data);
7671923
....@@ -771,17 +1927,15 @@
7711927 *
7721928 * ib_register_event_handler() registers an event handler that will be
7731929 * called back when asynchronous IB events occur (as defined in
774
- * chapter 11 of the InfiniBand Architecture Specification). This
775
- * callback may occur in interrupt context.
1930
+ * chapter 11 of the InfiniBand Architecture Specification). This
1931
+ * callback occurs in workqueue context.
7761932 */
7771933 void ib_register_event_handler(struct ib_event_handler *event_handler)
7781934 {
779
- unsigned long flags;
780
-
781
- spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1935
+ down_write(&event_handler->device->event_handler_rwsem);
7821936 list_add_tail(&event_handler->list,
7831937 &event_handler->device->event_handler_list);
784
- spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1938
+ up_write(&event_handler->device->event_handler_rwsem);
7851939 }
7861940 EXPORT_SYMBOL(ib_register_event_handler);
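A hypothetical consumer registering for async events (which, per the updated comment above, are now delivered in workqueue context); INIT_IB_EVENT_HANDLER is the real initializer from <rdma/ib_verbs.h>, the my_* names are assumptions:

struct my_priv {
	struct ib_event_handler event_handler;
};

static void my_async_event(struct ib_event_handler *handler,
			   struct ib_event *event)
{
	if (event->event == IB_EVENT_PORT_ACTIVE ||
	    event->event == IB_EVENT_PORT_ERR)
		pr_info("port %u changed state\n", event->element.port_num);
}

static void my_setup_events(struct my_priv *priv, struct ib_device *device)
{
	INIT_IB_EVENT_HANDLER(&priv->event_handler, device, my_async_event);
	ib_register_event_handler(&priv->event_handler);
	/* ib_unregister_event_handler(&priv->event_handler) on teardown */
}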
7871941
....@@ -794,35 +1948,87 @@
7941948 */
7951949 void ib_unregister_event_handler(struct ib_event_handler *event_handler)
7961950 {
797
- unsigned long flags;
798
-
799
- spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1951
+ down_write(&event_handler->device->event_handler_rwsem);
8001952 list_del(&event_handler->list);
801
- spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1953
+ up_write(&event_handler->device->event_handler_rwsem);
8021954 }
8031955 EXPORT_SYMBOL(ib_unregister_event_handler);
8041956
805
-/**
806
- * ib_dispatch_event - Dispatch an asynchronous event
807
- * @event:Event to dispatch
808
- *
809
- * Low-level drivers must call ib_dispatch_event() to dispatch the
810
- * event to all registered event handlers when an asynchronous event
811
- * occurs.
812
- */
813
-void ib_dispatch_event(struct ib_event *event)
1957
+void ib_dispatch_event_clients(struct ib_event *event)
8141958 {
815
- unsigned long flags;
8161959 struct ib_event_handler *handler;
8171960
818
- spin_lock_irqsave(&event->device->event_handler_lock, flags);
1961
+ down_read(&event->device->event_handler_rwsem);
8191962
8201963 list_for_each_entry(handler, &event->device->event_handler_list, list)
8211964 handler->handler(handler, event);
8221965
823
- spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
1966
+ up_read(&event->device->event_handler_rwsem);
8241967 }
825
-EXPORT_SYMBOL(ib_dispatch_event);
1968
+
1969
+static int iw_query_port(struct ib_device *device,
1970
+ u8 port_num,
1971
+ struct ib_port_attr *port_attr)
1972
+{
1973
+ struct in_device *inetdev;
1974
+ struct net_device *netdev;
1975
+
1976
+ memset(port_attr, 0, sizeof(*port_attr));
1977
+
1978
+ netdev = ib_device_get_netdev(device, port_num);
1979
+ if (!netdev)
1980
+ return -ENODEV;
1981
+
1982
+ port_attr->max_mtu = IB_MTU_4096;
1983
+ port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
1984
+
1985
+ if (!netif_carrier_ok(netdev)) {
1986
+ port_attr->state = IB_PORT_DOWN;
1987
+ port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
1988
+ } else {
1989
+ rcu_read_lock();
1990
+ inetdev = __in_dev_get_rcu(netdev);
1991
+
1992
+ if (inetdev && inetdev->ifa_list) {
1993
+ port_attr->state = IB_PORT_ACTIVE;
1994
+ port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
1995
+ } else {
1996
+ port_attr->state = IB_PORT_INIT;
1997
+ port_attr->phys_state =
1998
+ IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
1999
+ }
2000
+
2001
+ rcu_read_unlock();
2002
+ }
2003
+
2004
+ dev_put(netdev);
2005
+ return device->ops.query_port(device, port_num, port_attr);
2006
+}
2007
+
2008
+static int __ib_query_port(struct ib_device *device,
2009
+ u8 port_num,
2010
+ struct ib_port_attr *port_attr)
2011
+{
2012
+ union ib_gid gid = {};
2013
+ int err;
2014
+
2015
+ memset(port_attr, 0, sizeof(*port_attr));
2016
+
2017
+ err = device->ops.query_port(device, port_num, port_attr);
2018
+ if (err || port_attr->subnet_prefix)
2019
+ return err;
2020
+
2021
+ if (rdma_port_get_link_layer(device, port_num) !=
2022
+ IB_LINK_LAYER_INFINIBAND)
2023
+ return 0;
2024
+
2025
+ err = device->ops.query_gid(device, port_num, 0, &gid);
2026
+ if (err)
2027
+ return err;
2028
+
2029
+ port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
2030
+ return 0;
2031
+}
8262032
8272033 /**
8282034 * ib_query_port - Query IB port attributes
....@@ -837,28 +2043,197 @@
8372043 u8 port_num,
8382044 struct ib_port_attr *port_attr)
8392045 {
840
- union ib_gid gid;
841
- int err;
842
-
8432046 if (!rdma_is_port_valid(device, port_num))
8442047 return -EINVAL;
8452048
846
- memset(port_attr, 0, sizeof(*port_attr));
847
- err = device->query_port(device, port_num, port_attr);
848
- if (err || port_attr->subnet_prefix)
849
- return err;
850
-
851
- if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
852
- return 0;
853
-
854
- err = device->query_gid(device, port_num, 0, &gid);
855
- if (err)
856
- return err;
857
-
858
- port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
859
- return 0;
2049
+ if (rdma_protocol_iwarp(device, port_num))
2050
+ return iw_query_port(device, port_num, port_attr);
2051
+ else
2052
+ return __ib_query_port(device, port_num, port_attr);
8602053 }
8612054 EXPORT_SYMBOL(ib_query_port);
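A small hypothetical helper showing typical use; note that the iWarp path above synthesizes state and MTU from the bound netdev, while the IB path fills the subnet prefix from GID 0:

static bool my_port_is_active(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr attr;

	if (ib_query_port(device, port_num, &attr))
		return false;

	return attr.state == IB_PORT_ACTIVE;
}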
2055
+
2056
+static void add_ndev_hash(struct ib_port_data *pdata)
2057
+{
2058
+ unsigned long flags;
2059
+
2060
+ might_sleep();
2061
+
2062
+ spin_lock_irqsave(&ndev_hash_lock, flags);
2063
+ if (hash_hashed(&pdata->ndev_hash_link)) {
2064
+ hash_del_rcu(&pdata->ndev_hash_link);
2065
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
2066
+ /*
2067
+ * We cannot do hash_add_rcu after a hash_del_rcu until the
2068
+ * grace period has elapsed.
2069
+ */
2070
+ synchronize_rcu();
2071
+ spin_lock_irqsave(&ndev_hash_lock, flags);
2072
+ }
2073
+ if (pdata->netdev)
2074
+ hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
2075
+ (uintptr_t)pdata->netdev);
2076
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
2077
+}
2078
+
2079
+/**
2080
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
2081
+ * @ib_dev: Device to modify
2082
+ * @ndev: net_device to affiliate, may be NULL
2083
+ * @port: IB port the net_device is connected to
2084
+ *
2085
+ * Drivers should use this to link the ib_device to a netdev so the netdev
2086
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
2087
+ * affiliated with any port.
2088
+ *
2089
+ * The caller must ensure that the given ndev is not unregistered or
2090
+ * unregistering, and that either the ib_device is unregistered or
2091
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
2092
+ * NETDEV_UNREGISTER event.
2093
+ */
2094
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
2095
+ unsigned int port)
2096
+{
2097
+ struct net_device *old_ndev;
2098
+ struct ib_port_data *pdata;
2099
+ unsigned long flags;
2100
+ int ret;
2101
+
2102
+ /*
2103
+ * Drivers may wish to call this before ib_register_device(), so we have to
2104
+ * set up the port data early.
2105
+ */
2106
+ ret = alloc_port_data(ib_dev);
2107
+ if (ret)
2108
+ return ret;
2109
+
2110
+ if (!rdma_is_port_valid(ib_dev, port))
2111
+ return -EINVAL;
2112
+
2113
+ pdata = &ib_dev->port_data[port];
2114
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
2115
+ old_ndev = rcu_dereference_protected(
2116
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2117
+ if (old_ndev == ndev) {
2118
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2119
+ return 0;
2120
+ }
2121
+
2122
+ if (ndev)
2123
+ dev_hold(ndev);
2124
+ rcu_assign_pointer(pdata->netdev, ndev);
2125
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2126
+
2127
+ add_ndev_hash(pdata);
2128
+ if (old_ndev)
2129
+ dev_put(old_ndev);
2130
+
2131
+ return 0;
2132
+}
2133
+EXPORT_SYMBOL(ib_device_set_netdev);
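A hypothetical RoCE provider associating its Ethernet netdev with port 1 during probe, and dropping the association again as the comment above requires (mdev and mdev->ndev are assumed driver fields):

static int my_bind_port_netdev(struct my_dev *mdev)
{
	/*
	 * Makes the netdev visible through ib_device_get_netdev() and the
	 * ndev hash used by ib_device_get_by_netdev().
	 */
	return ib_device_set_netdev(&mdev->ibdev, mdev->ndev, 1);
}

static void my_unbind_port_netdev(struct my_dev *mdev)
{
	/* must happen no later than the NETDEV_UNREGISTER event */
	ib_device_set_netdev(&mdev->ibdev, NULL, 1);
}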
2134
+
2135
+static void free_netdevs(struct ib_device *ib_dev)
2136
+{
2137
+ unsigned long flags;
2138
+ unsigned int port;
2139
+
2140
+ if (!ib_dev->port_data)
2141
+ return;
2142
+
2143
+ rdma_for_each_port (ib_dev, port) {
2144
+ struct ib_port_data *pdata = &ib_dev->port_data[port];
2145
+ struct net_device *ndev;
2146
+
2147
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
2148
+ ndev = rcu_dereference_protected(
2149
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2150
+ if (ndev) {
2151
+ spin_lock(&ndev_hash_lock);
2152
+ hash_del_rcu(&pdata->ndev_hash_link);
2153
+ spin_unlock(&ndev_hash_lock);
2154
+
2155
+ /*
2156
+ * If this is the last dev_put there is still a
2157
+ * synchronize_rcu before the netdev is kfreed, so we
2158
+ * can continue to rely on unlocked pointer
2159
+ * comparisons after the put
2160
+ */
2161
+ rcu_assign_pointer(pdata->netdev, NULL);
2162
+ dev_put(ndev);
2163
+ }
2164
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2165
+ }
2166
+}
2167
+
2168
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
2169
+ unsigned int port)
2170
+{
2171
+ struct ib_port_data *pdata;
2172
+ struct net_device *res;
2173
+
2174
+ if (!rdma_is_port_valid(ib_dev, port))
2175
+ return NULL;
2176
+
2177
+ pdata = &ib_dev->port_data[port];
2178
+
2179
+ /*
2180
+ * New drivers should use ib_device_set_netdev() not the legacy
2181
+ * get_netdev().
2182
+ */
2183
+ if (ib_dev->ops.get_netdev)
2184
+ res = ib_dev->ops.get_netdev(ib_dev, port);
2185
+ else {
2186
+ spin_lock(&pdata->netdev_lock);
2187
+ res = rcu_dereference_protected(
2188
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2189
+ if (res)
2190
+ dev_hold(res);
2191
+ spin_unlock(&pdata->netdev_lock);
2192
+ }
2193
+
2194
+ /*
2195
+ * If we are starting to unregister, expedite things by preventing
2196
+ * propagation of an unregistering netdev.
2197
+ */
2198
+ if (res && res->reg_state != NETREG_REGISTERED) {
2199
+ dev_put(res);
2200
+ return NULL;
2201
+ }
2202
+
2203
+ return res;
2204
+}
2205
+
2206
+/**
2207
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
2208
+ * @ndev: netdev to locate
2209
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
2210
+ *
2211
+ * Find and hold an ib_device that is associated with a netdev via
2212
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
2213
+ * returned pointer.
2214
+ */
2215
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
2216
+ enum rdma_driver_id driver_id)
2217
+{
2218
+ struct ib_device *res = NULL;
2219
+ struct ib_port_data *cur;
2220
+
2221
+ rcu_read_lock();
2222
+ hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
2223
+ (uintptr_t)ndev) {
2224
+ if (rcu_access_pointer(cur->netdev) == ndev &&
2225
+ (driver_id == RDMA_DRIVER_UNKNOWN ||
2226
+ cur->ib_dev->ops.driver_id == driver_id) &&
2227
+ ib_device_try_get(cur->ib_dev)) {
2228
+ res = cur->ib_dev;
2229
+ break;
2230
+ }
2231
+ }
2232
+ rcu_read_unlock();
2233
+
2234
+ return res;
2235
+}
2236
+EXPORT_SYMBOL(ib_device_get_by_netdev);
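A hypothetical lookup, e.g. from a netlink or netevent handler; the reference taken by ib_device_try_get() inside must be released with ib_device_put():

static void my_report_bound_device(struct net_device *ndev)
{
	struct ib_device *ibdev;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;

	dev_info(&ibdev->dev, "bound to netdev %s\n", ndev->name);
	ib_device_put(ibdev);
}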
8622237
8632238 /**
8642239 * ib_enum_roce_netdev - enumerate all RoCE ports
....@@ -878,21 +2253,12 @@
8782253 roce_netdev_callback cb,
8792254 void *cookie)
8802255 {
881
- u8 port;
2256
+ unsigned int port;
8822257
883
- for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
884
- port++)
2258
+ rdma_for_each_port (ib_dev, port)
8852259 if (rdma_protocol_roce(ib_dev, port)) {
886
- struct net_device *idev = NULL;
887
-
888
- if (ib_dev->get_netdev)
889
- idev = ib_dev->get_netdev(ib_dev, port);
890
-
891
- if (idev &&
892
- idev->reg_state >= NETREG_UNREGISTERED) {
893
- dev_put(idev);
894
- idev = NULL;
895
- }
2260
+ struct net_device *idev =
2261
+ ib_device_get_netdev(ib_dev, port);
8962262
8972263 if (filter(ib_dev, port, idev, filter_cookie))
8982264 cb(ib_dev, port, idev, cookie);
....@@ -919,11 +2285,12 @@
9192285 void *cookie)
9202286 {
9212287 struct ib_device *dev;
2288
+ unsigned long index;
9222289
923
- down_read(&lists_rwsem);
924
- list_for_each_entry(dev, &device_list, core_list)
2290
+ down_read(&devices_rwsem);
2291
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
9252292 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
926
- up_read(&lists_rwsem);
2293
+ up_read(&devices_rwsem);
9272294 }
9282295
9292296 /**
....@@ -935,19 +2302,22 @@
9352302 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
9362303 struct netlink_callback *cb)
9372304 {
2305
+ unsigned long index;
9382306 struct ib_device *dev;
9392307 unsigned int idx = 0;
9402308 int ret = 0;
9412309
942
- down_read(&lists_rwsem);
943
- list_for_each_entry(dev, &device_list, core_list) {
2310
+ down_read(&devices_rwsem);
2311
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
2312
+ if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
2313
+ continue;
2314
+
9442315 ret = nldev_cb(dev, skb, cb, idx);
9452316 if (ret)
9462317 break;
9472318 idx++;
9482319 }
949
-
950
- up_read(&lists_rwsem);
2320
+ up_read(&devices_rwsem);
9512321 return ret;
9522322 }
9532323
....@@ -963,7 +2333,13 @@
9632333 int ib_query_pkey(struct ib_device *device,
9642334 u8 port_num, u16 index, u16 *pkey)
9652335 {
966
- return device->query_pkey(device, port_num, index, pkey);
2336
+ if (!rdma_is_port_valid(device, port_num))
2337
+ return -EINVAL;
2338
+
2339
+ if (!device->ops.query_pkey)
2340
+ return -EOPNOTSUPP;
2341
+
2342
+ return device->ops.query_pkey(device, port_num, index, pkey);
9672343 }
9682344 EXPORT_SYMBOL(ib_query_pkey);
9692345
....@@ -980,11 +2356,11 @@
9802356 int device_modify_mask,
9812357 struct ib_device_modify *device_modify)
9822358 {
983
- if (!device->modify_device)
984
- return -ENOSYS;
2359
+ if (!device->ops.modify_device)
2360
+ return -EOPNOTSUPP;
9852361
986
- return device->modify_device(device, device_modify_mask,
987
- device_modify);
2362
+ return device->ops.modify_device(device, device_modify_mask,
2363
+ device_modify);
9882364 }
9892365 EXPORT_SYMBOL(ib_modify_device);
9902366
....@@ -1008,11 +2384,16 @@
10082384 if (!rdma_is_port_valid(device, port_num))
10092385 return -EINVAL;
10102386
1011
- if (device->modify_port)
1012
- rc = device->modify_port(device, port_num, port_modify_mask,
1013
- port_modify);
2387
+ if (device->ops.modify_port)
2388
+ rc = device->ops.modify_port(device, port_num,
2389
+ port_modify_mask,
2390
+ port_modify);
2391
+ else if (rdma_protocol_roce(device, port_num) &&
2392
+ ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
2393
+ (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
2394
+ rc = 0;
10142395 else
1015
- rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
2396
+ rc = -EOPNOTSUPP;
10162397 return rc;
10172398 }
10182399 EXPORT_SYMBOL(ib_modify_port);
....@@ -1030,13 +2411,15 @@
10302411 u8 *port_num, u16 *index)
10312412 {
10322413 union ib_gid tmp_gid;
1033
- int ret, port, i;
2414
+ unsigned int port;
2415
+ int ret, i;
10342416
1035
- for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
2417
+ rdma_for_each_port (device, port) {
10362418 if (!rdma_protocol_ib(device, port))
10372419 continue;
10382420
1039
- for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
2421
+ for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
2422
+ ++i) {
10402423 ret = rdma_query_gid(device, port, i, &tmp_gid);
10412424 if (ret)
10422425 continue;
....@@ -1069,7 +2452,8 @@
10692452 u16 tmp_pkey;
10702453 int partial_ix = -1;
10712454
1072
- for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
2455
+ for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
2456
+ ++i) {
10732457 ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
10742458 if (ret)
10752459 return ret;
....@@ -1102,6 +2486,7 @@
11022486 * @gid: A GID that the net_dev uses to communicate.
11032487 * @addr: Contains the IP address that the request specified as its
11042488 * destination.
2489
+ *
11052490 */
11062491 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
11072492 u8 port,
....@@ -1110,33 +2495,201 @@
11102495 const struct sockaddr *addr)
11112496 {
11122497 struct net_device *net_dev = NULL;
1113
- struct ib_client_data *context;
2498
+ unsigned long index;
2499
+ void *client_data;
11142500
11152501 if (!rdma_protocol_ib(dev, port))
11162502 return NULL;
11172503
1118
- down_read(&lists_rwsem);
2504
+ /*
2505
+ * Holding the read side guarantees that the client will not become
2506
+ * unregistered while we are calling get_net_dev_by_params()
2507
+ */
2508
+ down_read(&dev->client_data_rwsem);
2509
+ xan_for_each_marked (&dev->client_data, index, client_data,
2510
+ CLIENT_DATA_REGISTERED) {
2511
+ struct ib_client *client = xa_load(&clients, index);
11192512
1120
- list_for_each_entry(context, &dev->client_data_list, list) {
1121
- struct ib_client *client = context->client;
1122
-
1123
- if (context->going_down)
2513
+ if (!client || !client->get_net_dev_by_params)
11242514 continue;
11252515
1126
- if (client->get_net_dev_by_params) {
1127
- net_dev = client->get_net_dev_by_params(dev, port, pkey,
1128
- gid, addr,
1129
- context->data);
1130
- if (net_dev)
1131
- break;
1132
- }
2516
+ net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
2517
+ addr, client_data);
2518
+ if (net_dev)
2519
+ break;
11332520 }
1134
-
1135
- up_read(&lists_rwsem);
2521
+ up_read(&dev->client_data_rwsem);
11362522
11372523 return net_dev;
11382524 }
11392525 EXPORT_SYMBOL(ib_get_net_dev_by_params);
2526
+
2527
+void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
2528
+{
2529
+ struct ib_device_ops *dev_ops = &dev->ops;
2530
+#define SET_DEVICE_OP(ptr, name) \
2531
+ do { \
2532
+ if (ops->name) \
2533
+ if (!((ptr)->name)) \
2534
+ (ptr)->name = ops->name; \
2535
+ } while (0)
2536
+
2537
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
2538
+
2539
+ if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
2540
+ WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
2541
+ dev_ops->driver_id != ops->driver_id);
2542
+ dev_ops->driver_id = ops->driver_id;
2543
+ }
2544
+ if (ops->owner) {
2545
+ WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
2546
+ dev_ops->owner = ops->owner;
2547
+ }
2548
+ if (ops->uverbs_abi_ver)
2549
+ dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
2550
+
2551
+ dev_ops->uverbs_no_driver_id_binding |=
2552
+ ops->uverbs_no_driver_id_binding;
2553
+
2554
+ SET_DEVICE_OP(dev_ops, add_gid);
2555
+ SET_DEVICE_OP(dev_ops, advise_mr);
2556
+ SET_DEVICE_OP(dev_ops, alloc_dm);
2557
+ SET_DEVICE_OP(dev_ops, alloc_hw_stats);
2558
+ SET_DEVICE_OP(dev_ops, alloc_mr);
2559
+ SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
2560
+ SET_DEVICE_OP(dev_ops, alloc_mw);
2561
+ SET_DEVICE_OP(dev_ops, alloc_pd);
2562
+ SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
2563
+ SET_DEVICE_OP(dev_ops, alloc_ucontext);
2564
+ SET_DEVICE_OP(dev_ops, alloc_xrcd);
2565
+ SET_DEVICE_OP(dev_ops, attach_mcast);
2566
+ SET_DEVICE_OP(dev_ops, check_mr_status);
2567
+ SET_DEVICE_OP(dev_ops, counter_alloc_stats);
2568
+ SET_DEVICE_OP(dev_ops, counter_bind_qp);
2569
+ SET_DEVICE_OP(dev_ops, counter_dealloc);
2570
+ SET_DEVICE_OP(dev_ops, counter_unbind_qp);
2571
+ SET_DEVICE_OP(dev_ops, counter_update_stats);
2572
+ SET_DEVICE_OP(dev_ops, create_ah);
2573
+ SET_DEVICE_OP(dev_ops, create_counters);
2574
+ SET_DEVICE_OP(dev_ops, create_cq);
2575
+ SET_DEVICE_OP(dev_ops, create_flow);
2576
+ SET_DEVICE_OP(dev_ops, create_flow_action_esp);
2577
+ SET_DEVICE_OP(dev_ops, create_qp);
2578
+ SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
2579
+ SET_DEVICE_OP(dev_ops, create_srq);
2580
+ SET_DEVICE_OP(dev_ops, create_wq);
2581
+ SET_DEVICE_OP(dev_ops, dealloc_dm);
2582
+ SET_DEVICE_OP(dev_ops, dealloc_driver);
2583
+ SET_DEVICE_OP(dev_ops, dealloc_mw);
2584
+ SET_DEVICE_OP(dev_ops, dealloc_pd);
2585
+ SET_DEVICE_OP(dev_ops, dealloc_ucontext);
2586
+ SET_DEVICE_OP(dev_ops, dealloc_xrcd);
2587
+ SET_DEVICE_OP(dev_ops, del_gid);
2588
+ SET_DEVICE_OP(dev_ops, dereg_mr);
2589
+ SET_DEVICE_OP(dev_ops, destroy_ah);
2590
+ SET_DEVICE_OP(dev_ops, destroy_counters);
2591
+ SET_DEVICE_OP(dev_ops, destroy_cq);
2592
+ SET_DEVICE_OP(dev_ops, destroy_flow);
2593
+ SET_DEVICE_OP(dev_ops, destroy_flow_action);
2594
+ SET_DEVICE_OP(dev_ops, destroy_qp);
2595
+ SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
2596
+ SET_DEVICE_OP(dev_ops, destroy_srq);
2597
+ SET_DEVICE_OP(dev_ops, destroy_wq);
2598
+ SET_DEVICE_OP(dev_ops, detach_mcast);
2599
+ SET_DEVICE_OP(dev_ops, disassociate_ucontext);
2600
+ SET_DEVICE_OP(dev_ops, drain_rq);
2601
+ SET_DEVICE_OP(dev_ops, drain_sq);
2602
+ SET_DEVICE_OP(dev_ops, enable_driver);
2603
+ SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
2604
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
2605
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
2606
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
2607
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
2608
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
2609
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
2610
+ SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
2611
+ SET_DEVICE_OP(dev_ops, get_dev_fw_str);
2612
+ SET_DEVICE_OP(dev_ops, get_dma_mr);
2613
+ SET_DEVICE_OP(dev_ops, get_hw_stats);
2614
+ SET_DEVICE_OP(dev_ops, get_link_layer);
2615
+ SET_DEVICE_OP(dev_ops, get_netdev);
2616
+ SET_DEVICE_OP(dev_ops, get_port_immutable);
2617
+ SET_DEVICE_OP(dev_ops, get_vector_affinity);
2618
+ SET_DEVICE_OP(dev_ops, get_vf_config);
2619
+ SET_DEVICE_OP(dev_ops, get_vf_guid);
2620
+ SET_DEVICE_OP(dev_ops, get_vf_stats);
2621
+ SET_DEVICE_OP(dev_ops, init_port);
2622
+ SET_DEVICE_OP(dev_ops, iw_accept);
2623
+ SET_DEVICE_OP(dev_ops, iw_add_ref);
2624
+ SET_DEVICE_OP(dev_ops, iw_connect);
2625
+ SET_DEVICE_OP(dev_ops, iw_create_listen);
2626
+ SET_DEVICE_OP(dev_ops, iw_destroy_listen);
2627
+ SET_DEVICE_OP(dev_ops, iw_get_qp);
2628
+ SET_DEVICE_OP(dev_ops, iw_reject);
2629
+ SET_DEVICE_OP(dev_ops, iw_rem_ref);
2630
+ SET_DEVICE_OP(dev_ops, map_mr_sg);
2631
+ SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
2632
+ SET_DEVICE_OP(dev_ops, mmap);
2633
+ SET_DEVICE_OP(dev_ops, mmap_free);
2634
+ SET_DEVICE_OP(dev_ops, modify_ah);
2635
+ SET_DEVICE_OP(dev_ops, modify_cq);
2636
+ SET_DEVICE_OP(dev_ops, modify_device);
2637
+ SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
2638
+ SET_DEVICE_OP(dev_ops, modify_port);
2639
+ SET_DEVICE_OP(dev_ops, modify_qp);
2640
+ SET_DEVICE_OP(dev_ops, modify_srq);
2641
+ SET_DEVICE_OP(dev_ops, modify_wq);
2642
+ SET_DEVICE_OP(dev_ops, peek_cq);
2643
+ SET_DEVICE_OP(dev_ops, poll_cq);
2644
+ SET_DEVICE_OP(dev_ops, post_recv);
2645
+ SET_DEVICE_OP(dev_ops, post_send);
2646
+ SET_DEVICE_OP(dev_ops, post_srq_recv);
2647
+ SET_DEVICE_OP(dev_ops, process_mad);
2648
+ SET_DEVICE_OP(dev_ops, query_ah);
2649
+ SET_DEVICE_OP(dev_ops, query_device);
2650
+ SET_DEVICE_OP(dev_ops, query_gid);
2651
+ SET_DEVICE_OP(dev_ops, query_pkey);
2652
+ SET_DEVICE_OP(dev_ops, query_port);
2653
+ SET_DEVICE_OP(dev_ops, query_qp);
2654
+ SET_DEVICE_OP(dev_ops, query_srq);
2655
+ SET_DEVICE_OP(dev_ops, query_ucontext);
2656
+ SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
2657
+ SET_DEVICE_OP(dev_ops, read_counters);
2658
+ SET_DEVICE_OP(dev_ops, reg_dm_mr);
2659
+ SET_DEVICE_OP(dev_ops, reg_user_mr);
2660
+ SET_DEVICE_OP(dev_ops, req_ncomp_notif);
2661
+ SET_DEVICE_OP(dev_ops, req_notify_cq);
2662
+ SET_DEVICE_OP(dev_ops, rereg_user_mr);
2663
+ SET_DEVICE_OP(dev_ops, resize_cq);
2664
+ SET_DEVICE_OP(dev_ops, set_vf_guid);
2665
+ SET_DEVICE_OP(dev_ops, set_vf_link_state);
2666
+
2667
+ SET_OBJ_SIZE(dev_ops, ib_ah);
2668
+ SET_OBJ_SIZE(dev_ops, ib_counters);
2669
+ SET_OBJ_SIZE(dev_ops, ib_cq);
2670
+ SET_OBJ_SIZE(dev_ops, ib_mw);
2671
+ SET_OBJ_SIZE(dev_ops, ib_pd);
2672
+ SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
2673
+ SET_OBJ_SIZE(dev_ops, ib_srq);
2674
+ SET_OBJ_SIZE(dev_ops, ib_ucontext);
2675
+ SET_OBJ_SIZE(dev_ops, ib_xrcd);
2676
+}
2677
+EXPORT_SYMBOL(ib_set_device_ops);
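For orientation, a hypothetical provider filling in a const ops table before ib_register_device(); only the ops the driver implements are set, and INIT_RDMA_OBJ_SIZE supplies the size_ib_* fields consumed by SET_OBJ_SIZE above. RDMA_DRIVER_RXE is a placeholder driver_id and the my_* types and handlers are assumptions:

struct my_pd {
	struct ib_pd ibpd;	/* core object embedded in the driver object */
};

static int my_query_device(struct ib_device *ibdev,
			   struct ib_device_attr *attr,
			   struct ib_udata *udata)
{
	memset(attr, 0, sizeof(*attr));
	return 0;
}

static const struct ib_device_ops my_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_RXE,
	.uverbs_abi_ver = 1,

	.query_device = my_query_device,
	/* ... remaining mandatory and optional ops ... */

	/* tells the core how large the driver's PD wrapper is */
	INIT_RDMA_OBJ_SIZE(ib_pd, my_pd, ibpd),
};

/* during probe, before ib_register_device():
 *	ib_set_device_ops(&mdev->ibdev, &my_dev_ops);
 */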
2678
+
2679
+#ifdef CONFIG_INFINIBAND_VIRT_DMA
2680
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
2681
+{
2682
+ struct scatterlist *s;
2683
+ int i;
2684
+
2685
+ for_each_sg(sg, s, nents, i) {
2686
+ sg_dma_address(s) = (uintptr_t)sg_virt(s);
2687
+ sg_dma_len(s) = s->length;
2688
+ }
2689
+ return nents;
2690
+}
2691
+EXPORT_SYMBOL(ib_dma_virt_map_sg);
2692
+#endif /* CONFIG_INFINIBAND_VIRT_DMA */
11402693
11412694 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
11422695 [RDMA_NL_LS_OP_RESOLVE] = {
....@@ -1183,15 +2736,11 @@
11832736 goto err_comp_unbound;
11842737 }
11852738
1186
- ret = rdma_nl_init();
1187
- if (ret) {
1188
- pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
1189
- goto err_sysfs;
1190
- }
2739
+ rdma_nl_init();
11912740
11922741 ret = addr_init();
11932742 if (ret) {
1194
- pr_warn("Could't init IB address resolution\n");
2743
+ pr_warn("Couldn't init IB address resolution\n");
11952744 goto err_ibnl;
11962745 }
11972746
....@@ -1207,18 +2756,34 @@
12072756 goto err_mad;
12082757 }
12092758
1210
- ret = register_lsm_notifier(&ibdev_lsm_nb);
2759
+ ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
12112760 if (ret) {
12122761 pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
12132762 goto err_sa;
12142763 }
12152764
2765
+ ret = register_pernet_device(&rdma_dev_net_ops);
2766
+ if (ret) {
2767
+ pr_warn("Couldn't init compat dev. ret %d\n", ret);
2768
+ goto err_compat;
2769
+ }
2770
+
12162771 nldev_init();
12172772 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
1218
- roce_gid_mgmt_init();
2773
+ ret = roce_gid_mgmt_init();
2774
+ if (ret) {
2775
+ pr_warn("Couldn't init RoCE GID management\n");
2776
+ goto err_parent;
2777
+ }
12192778
12202779 return 0;
12212780
2781
+err_parent:
2782
+ rdma_nl_unregister(RDMA_NL_LS);
2783
+ nldev_exit();
2784
+ unregister_pernet_device(&rdma_dev_net_ops);
2785
+err_compat:
2786
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
12222787 err_sa:
12232788 ib_sa_cleanup();
12242789 err_mad:
....@@ -1226,8 +2791,6 @@
12262791 err_addr:
12272792 addr_cleanup();
12282793 err_ibnl:
1229
- rdma_nl_exit();
1230
-err_sysfs:
12312794 class_unregister(&ib_class);
12322795 err_comp_unbound:
12332796 destroy_workqueue(ib_comp_unbound_wq);
....@@ -1241,9 +2804,10 @@
12412804 static void __exit ib_core_cleanup(void)
12422805 {
12432806 roce_gid_mgmt_cleanup();
1244
- nldev_exit();
12452807 rdma_nl_unregister(RDMA_NL_LS);
1246
- unregister_lsm_notifier(&ibdev_lsm_nb);
2808
+ nldev_exit();
2809
+ unregister_pernet_device(&rdma_dev_net_ops);
2810
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
12472811 ib_sa_cleanup();
12482812 ib_mad_cleanup();
12492813 addr_cleanup();
....@@ -1253,9 +2817,15 @@
12532817 destroy_workqueue(ib_comp_wq);
12542818 /* Make sure that any pending umem accounting work is done. */
12552819 destroy_workqueue(ib_wq);
2820
+ flush_workqueue(system_unbound_wq);
2821
+ WARN_ON(!xa_empty(&clients));
2822
+ WARN_ON(!xa_empty(&devices));
12562823 }
12572824
12582825 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
12592826
1260
-subsys_initcall(ib_core_init);
2827
+/* The ib core relies on the netdev stack registering the net_ns_type_operations
2828
+ * ns kobject type before ib_core initialization.
2829
+ */
2830
+fs_initcall(ib_core_init);
12612831 module_exit(ib_core_cleanup);