forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/infiniband/core/device.c
@@ -37,73 +37,241 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/mutex.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
+#include <linux/hashtable.h>
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 #include "core_priv.h"
+#include "restrack.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("core kernel InfiniBand API");
 MODULE_LICENSE("Dual BSD/GPL");
-
-struct ib_client_data {
-	struct list_head list;
-	struct ib_client *client;
-	void *            data;
-	/* The device or client is going down. Do not call client or device
-	 * callbacks other than remove(). */
-	bool		  going_down;
-};
 
 struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
-/* The device_list and client_list contain devices and clients after their
- * registration has completed, and the devices and clients are removed
- * during unregistration. */
-static LIST_HEAD(device_list);
-static LIST_HEAD(client_list);
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list, any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
 
 /*
- * device_mutex and lists_rwsem protect access to both device_list and
- * client_list. device_mutex protects writer access by device and client
- * registration / de-registration. lists_rwsem protects reader access to
- * these lists. Iterators of these lists must lock it for read, while updates
- * to the lists must be done with a write lock. A special case is when the
- * device_mutex is locked. In this case locking the lists for read access is
- * not necessary as the device_mutex implies it.
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
 *
- * lists_rwsem also protects access to the client data list.
 */
-static DEFINE_MUTEX(device_mutex);
-static DECLARE_RWSEM(lists_rwsem);
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(devices_rwsem);
+#define DEVICE_REGISTERED XA_MARK_1
 
+static u32 highest_client_id;
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
+
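A minimal sketch of the read-side pattern the locking comment above describes, assuming a caller inside ib_core; the helper name walk_registered_devices() is hypothetical and not part of this patch:

	static void walk_registered_devices(void)
	{
		struct ib_device *dev;
		unsigned long index;

		/* Read side: DEVICE_REGISTERED cannot be set or cleared while held */
		down_read(&devices_rwsem);
		xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
			/* dev is a fully registered ib_device for the whole body */
		}
		up_read(&devices_rwsem);
	}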
101
+static void ib_client_put(struct ib_client *client)
102
+{
103
+ if (refcount_dec_and_test(&client->uses))
104
+ complete(&client->uses_zero);
105
+}
106
+
107
+/*
108
+ * If client_data is registered then the corresponding client must also still
109
+ * be registered.
110
+ */
111
+#define CLIENT_DATA_REGISTERED XA_MARK_1
112
+
113
+unsigned int rdma_dev_net_id;
114
+
115
+/*
116
+ * A list of net namespaces is maintained in an xarray. This is necessary
117
+ * because we can't get the locking right using the existing net ns list. We
118
+ * would require a init_net callback after the list is updated.
119
+ */
120
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
121
+/*
122
+ * rwsem to protect accessing the rdma_nets xarray entries.
123
+ */
124
+static DECLARE_RWSEM(rdma_nets_rwsem);
125
+
126
+bool ib_devices_shared_netns = true;
127
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
128
+MODULE_PARM_DESC(netns_mode,
129
+ "Share device among net namespaces; default=1 (shared)");
130
+/**
+ * rdma_dev_access_netns() - Return whether an rdma device can be accessed
+ * from a specified net namespace or not.
+ * @dev: Pointer to rdma device which needs to be checked
+ * @net: Pointer to net namespace for which access is to be checked
+ *
+ * When the rdma device is in shared mode, it ignores the net namespace.
+ * When the rdma device is exclusive to a net namespace, rdma device net
+ * namespace is checked against the specified one.
+ */
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
+{
+	return (ib_devices_shared_netns ||
+		net_eq(read_pnet(&dev->coredev.rdma_net), net));
+}
+EXPORT_SYMBOL(rdma_dev_access_netns);
146
+
147
+/*
148
+ * xarray has this behavior where it won't iterate over NULL values stored in
149
+ * allocated arrays. So we need our own iterator to see all values stored in
150
+ * the array. This does the same thing as xa_for_each except that it also
151
+ * returns NULL valued entries if the array is allocating. Simplified to only
152
+ * work on simple xarrays.
153
+ */
154
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
155
+ xa_mark_t filter)
156
+{
157
+ XA_STATE(xas, xa, *indexp);
158
+ void *entry;
159
+
160
+ rcu_read_lock();
161
+ do {
162
+ entry = xas_find_marked(&xas, ULONG_MAX, filter);
163
+ if (xa_is_zero(entry))
164
+ break;
165
+ } while (xas_retry(&xas, entry));
166
+ rcu_read_unlock();
167
+
168
+ if (entry) {
169
+ *indexp = xas.xa_index;
170
+ if (xa_is_zero(entry))
171
+ return NULL;
172
+ return entry;
173
+ }
174
+ return XA_ERROR(-ENOENT);
175
+}
176
+#define xan_for_each_marked(xa, index, entry, filter) \
177
+ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
178
+ !xa_is_err(entry); \
179
+ (index)++, entry = xan_find_marked(xa, &(index), filter))
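A usage sketch for the helper above, mirroring how the rename path later in this hunk walks a device's client data; the surrounding function name is hypothetical:

	static void walk_client_data(struct ib_device *ibdev)
	{
		void *client_data;
		unsigned long index;

		down_read(&ibdev->client_data_rwsem);
		xan_for_each_marked (&ibdev->client_data, index, client_data,
				     CLIENT_DATA_REGISTERED) {
			/* unlike xa_for_each, client_data may be NULL here */
		}
		up_read(&ibdev->client_data_rwsem);
	}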
180
+
181
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
182
+static DEFINE_SPINLOCK(ndev_hash_lock);
183
+static DECLARE_HASHTABLE(ndev_hash, 5);
184
+
185
+static void free_netdevs(struct ib_device *ib_dev);
186
+static void ib_unregister_work(struct work_struct *work);
187
+static void __ib_unregister_device(struct ib_device *device);
88188 static int ib_security_change(struct notifier_block *nb, unsigned long event,
89189 void *lsm_data);
90190 static void ib_policy_change_task(struct work_struct *work);
91191 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
92192
193
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
194
+ struct va_format *vaf)
195
+{
196
+ if (ibdev && ibdev->dev.parent)
197
+ dev_printk_emit(level[1] - '0',
198
+ ibdev->dev.parent,
199
+ "%s %s %s: %pV",
200
+ dev_driver_string(ibdev->dev.parent),
201
+ dev_name(ibdev->dev.parent),
202
+ dev_name(&ibdev->dev),
203
+ vaf);
204
+ else if (ibdev)
205
+ printk("%s%s: %pV",
206
+ level, dev_name(&ibdev->dev), vaf);
207
+ else
208
+ printk("%s(NULL ib_device): %pV", level, vaf);
209
+}
210
+
211
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
212
+ const char *format, ...)
213
+{
214
+ struct va_format vaf;
215
+ va_list args;
216
+
217
+ va_start(args, format);
218
+
219
+ vaf.fmt = format;
220
+ vaf.va = &args;
221
+
222
+ __ibdev_printk(level, ibdev, &vaf);
223
+
224
+ va_end(args);
225
+}
226
+EXPORT_SYMBOL(ibdev_printk);
227
+
228
+#define define_ibdev_printk_level(func, level) \
229
+void func(const struct ib_device *ibdev, const char *fmt, ...) \
230
+{ \
231
+ struct va_format vaf; \
232
+ va_list args; \
233
+ \
234
+ va_start(args, fmt); \
235
+ \
236
+ vaf.fmt = fmt; \
237
+ vaf.va = &args; \
238
+ \
239
+ __ibdev_printk(level, ibdev, &vaf); \
240
+ \
241
+ va_end(args); \
242
+} \
243
+EXPORT_SYMBOL(func);
244
+
245
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
246
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
247
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
248
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
249
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
250
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
251
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
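These wrappers behave like dev_err()/dev_warn() but prefix the parent device and the IB device name; a short illustrative use (the condition, port variable and message are made up):

	if (ret)
		ibdev_warn(ibdev, "failed to modify port %u: %d\n", port, ret);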
252
+
93253 static struct notifier_block ibdev_lsm_nb = {
94254 .notifier_call = ib_security_change,
95255 };
96256
97
-static int ib_device_check_mandatory(struct ib_device *device)
257
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
258
+ struct net *net);
259
+
260
+/* Pointer to the RCU head at the start of the ib_port_data array */
261
+struct ib_port_data_rcu {
262
+ struct rcu_head rcu_head;
263
+ struct ib_port_data pdata[];
264
+};
265
+
266
+static void ib_device_check_mandatory(struct ib_device *device)
98267 {
99
-#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
268
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
100269 static const struct {
101270 size_t offset;
102271 char *name;
103272 } mandatory_table[] = {
104273 IB_MANDATORY_FUNC(query_device),
105274 IB_MANDATORY_FUNC(query_port),
106
- IB_MANDATORY_FUNC(query_pkey),
107275 IB_MANDATORY_FUNC(alloc_pd),
108276 IB_MANDATORY_FUNC(dealloc_pd),
109277 IB_MANDATORY_FUNC(create_qp),
@@ -121,110 +289,228 @@
121289 };
122290 int i;
123291
292
+ device->kverbs_provider = true;
124293 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
125
- if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
126
- pr_warn("Device %s is missing mandatory function %s\n",
127
- device->name, mandatory_table[i].name);
128
- return -EINVAL;
294
+ if (!*(void **) ((void *) &device->ops +
295
+ mandatory_table[i].offset)) {
296
+ device->kverbs_provider = false;
297
+ break;
129298 }
130299 }
131
-
132
- return 0;
133
-}
134
-
135
-static struct ib_device *__ib_device_get_by_index(u32 index)
136
-{
137
- struct ib_device *device;
138
-
139
- list_for_each_entry(device, &device_list, core_list)
140
- if (device->index == index)
141
- return device;
142
-
143
- return NULL;
144300 }
145301
146302 /*
147
- * Caller is responsible to return refrerence count by calling put_device()
303
+ * Caller must perform ib_device_put() to return the device reference count
304
+ * when ib_device_get_by_index() returns valid device pointer.
148305 */
149
-struct ib_device *ib_device_get_by_index(u32 index)
306
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
150307 {
151308 struct ib_device *device;
152309
153
- down_read(&lists_rwsem);
154
- device = __ib_device_get_by_index(index);
155
- if (device)
156
- get_device(&device->dev);
310
+ down_read(&devices_rwsem);
311
+ device = xa_load(&devices, index);
312
+ if (device) {
313
+ if (!rdma_dev_access_netns(device, net)) {
314
+ device = NULL;
315
+ goto out;
316
+ }
157317
158
- up_read(&lists_rwsem);
318
+ if (!ib_device_try_get(device))
319
+ device = NULL;
320
+ }
321
+out:
322
+ up_read(&devices_rwsem);
159323 return device;
160324 }
325
+
326
+/**
327
+ * ib_device_put - Release IB device reference
328
+ * @device: device whose reference to be released
329
+ *
330
+ * ib_device_put() releases reference to the IB device to allow it to be
331
+ * unregistered and eventually free.
332
+ */
333
+void ib_device_put(struct ib_device *device)
334
+{
335
+ if (refcount_dec_and_test(&device->refcount))
336
+ complete(&device->unreg_completion);
337
+}
338
+EXPORT_SYMBOL(ib_device_put);
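A sketch of the get/put pairing these helpers expect, assuming a hypothetical core-internal caller that already knows the namespace and device index:

	static int demo_use_device(const struct net *net, u32 index)
	{
		struct ib_device *dev;

		dev = ib_device_get_by_index(net, index);
		if (!dev)
			return -ENODEV;

		/* unregistration cannot complete while this reference is held */

		ib_device_put(dev);
		return 0;
	}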
161339
162340 static struct ib_device *__ib_device_get_by_name(const char *name)
163341 {
164342 struct ib_device *device;
343
+ unsigned long index;
165344
166
- list_for_each_entry(device, &device_list, core_list)
167
- if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
345
+ xa_for_each (&devices, index, device)
346
+ if (!strcmp(name, dev_name(&device->dev)))
168347 return device;
169348
170349 return NULL;
171350 }
172351
173
-static int alloc_name(char *name)
352
+/**
353
+ * ib_device_get_by_name - Find an IB device by name
354
+ * @name: The name to look for
355
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
356
+ *
357
+ * Find and hold an ib_device by its name. The caller must call
358
+ * ib_device_put() on the returned pointer.
359
+ */
360
+struct ib_device *ib_device_get_by_name(const char *name,
361
+ enum rdma_driver_id driver_id)
174362 {
175
- unsigned long *inuse;
176
- char buf[IB_DEVICE_NAME_MAX];
177363 struct ib_device *device;
178
- int i;
179364
180
- inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
181
- if (!inuse)
182
- return -ENOMEM;
365
+ down_read(&devices_rwsem);
366
+ device = __ib_device_get_by_name(name);
367
+ if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
368
+ device->ops.driver_id != driver_id)
369
+ device = NULL;
183370
184
- list_for_each_entry(device, &device_list, core_list) {
185
- if (!sscanf(device->name, name, &i))
186
- continue;
187
- if (i < 0 || i >= PAGE_SIZE * 8)
188
- continue;
189
- snprintf(buf, sizeof buf, name, i);
190
- if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
191
- set_bit(i, inuse);
371
+ if (device) {
372
+ if (!ib_device_try_get(device))
373
+ device = NULL;
374
+ }
375
+ up_read(&devices_rwsem);
376
+ return device;
377
+}
378
+EXPORT_SYMBOL(ib_device_get_by_name);
379
+
380
+static int rename_compat_devs(struct ib_device *device)
381
+{
382
+ struct ib_core_device *cdev;
383
+ unsigned long index;
384
+ int ret = 0;
385
+
386
+ mutex_lock(&device->compat_devs_mutex);
387
+ xa_for_each (&device->compat_devs, index, cdev) {
388
+ ret = device_rename(&cdev->dev, dev_name(&device->dev));
389
+ if (ret) {
390
+ dev_warn(&cdev->dev,
391
+ "Fail to rename compatdev to new name %s\n",
392
+ dev_name(&device->dev));
393
+ break;
394
+ }
395
+ }
396
+ mutex_unlock(&device->compat_devs_mutex);
397
+ return ret;
398
+}
399
+
400
+int ib_device_rename(struct ib_device *ibdev, const char *name)
401
+{
402
+ unsigned long index;
403
+ void *client_data;
404
+ int ret;
405
+
406
+ down_write(&devices_rwsem);
407
+ if (!strcmp(name, dev_name(&ibdev->dev))) {
408
+ up_write(&devices_rwsem);
409
+ return 0;
192410 }
193411
194
- i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
195
- free_page((unsigned long) inuse);
196
- snprintf(buf, sizeof buf, name, i);
412
+ if (__ib_device_get_by_name(name)) {
413
+ up_write(&devices_rwsem);
414
+ return -EEXIST;
415
+ }
197416
198
- if (__ib_device_get_by_name(buf))
199
- return -ENFILE;
417
+ ret = device_rename(&ibdev->dev, name);
418
+ if (ret) {
419
+ up_write(&devices_rwsem);
420
+ return ret;
421
+ }
200422
201
- strlcpy(name, buf, IB_DEVICE_NAME_MAX);
423
+ strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
424
+ ret = rename_compat_devs(ibdev);
425
+
426
+ downgrade_write(&devices_rwsem);
427
+ down_read(&ibdev->client_data_rwsem);
428
+ xan_for_each_marked(&ibdev->client_data, index, client_data,
429
+ CLIENT_DATA_REGISTERED) {
430
+ struct ib_client *client = xa_load(&clients, index);
431
+
432
+ if (!client || !client->rename)
433
+ continue;
434
+
435
+ client->rename(ibdev, client_data);
436
+ }
437
+ up_read(&ibdev->client_data_rwsem);
438
+ up_read(&devices_rwsem);
202439 return 0;
440
+}
441
+
442
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
443
+{
444
+ if (use_dim > 1)
445
+ return -EINVAL;
446
+ ibdev->use_cq_dim = use_dim;
447
+
448
+ return 0;
449
+}
450
+
451
+static int alloc_name(struct ib_device *ibdev, const char *name)
452
+{
453
+ struct ib_device *device;
454
+ unsigned long index;
455
+ struct ida inuse;
456
+ int rc;
457
+ int i;
458
+
459
+ lockdep_assert_held_write(&devices_rwsem);
460
+ ida_init(&inuse);
461
+ xa_for_each (&devices, index, device) {
462
+ char buf[IB_DEVICE_NAME_MAX];
463
+
464
+ if (sscanf(dev_name(&device->dev), name, &i) != 1)
465
+ continue;
466
+ if (i < 0 || i >= INT_MAX)
467
+ continue;
468
+ snprintf(buf, sizeof buf, name, i);
469
+ if (strcmp(buf, dev_name(&device->dev)) != 0)
470
+ continue;
471
+
472
+ rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
473
+ if (rc < 0)
474
+ goto out;
475
+ }
476
+
477
+ rc = ida_alloc(&inuse, GFP_KERNEL);
478
+ if (rc < 0)
479
+ goto out;
480
+
481
+ rc = dev_set_name(&ibdev->dev, name, rc);
482
+out:
483
+ ida_destroy(&inuse);
484
+ return rc;
203485 }
204486
205487 static void ib_device_release(struct device *device)
206488 {
207489 struct ib_device *dev = container_of(device, struct ib_device, dev);
208490
209
- WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
210
- if (dev->reg_state == IB_DEV_UNREGISTERED) {
211
- /*
212
- * In IB_DEV_UNINITIALIZED state, cache or port table
213
- * is not even created. Free cache and port table only when
214
- * device reaches UNREGISTERED state.
215
- */
491
+ free_netdevs(dev);
492
+ WARN_ON(refcount_read(&dev->refcount));
493
+ if (dev->port_data) {
216494 ib_cache_release_one(dev);
217
- kfree(dev->port_immutable);
495
+ ib_security_release_port_pkey_list(dev);
496
+ rdma_counter_release(dev);
497
+ kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
498
+ pdata[0]),
499
+ rcu_head);
218500 }
219
- kfree(dev);
501
+
502
+ mutex_destroy(&dev->unregistration_lock);
503
+ mutex_destroy(&dev->compat_devs_mutex);
504
+
505
+ xa_destroy(&dev->compat_devs);
506
+ xa_destroy(&dev->client_data);
507
+ kfree_rcu(dev, rcu_head);
220508 }
221509
222510 static int ib_device_uevent(struct device *device,
223511 struct kobj_uevent_env *env)
224512 {
225
- struct ib_device *dev = container_of(device, struct ib_device, dev);
226
-
227
- if (add_uevent_var(env, "NAME=%s", dev->name))
513
+ if (add_uevent_var(env, "NAME=%s", dev_name(device)))
228514 return -ENOMEM;
229515
230516 /*
@@ -234,14 +520,44 @@
234520 return 0;
235521 }
236522
523
+static const void *net_namespace(struct device *d)
524
+{
525
+ struct ib_core_device *coredev =
526
+ container_of(d, struct ib_core_device, dev);
527
+
528
+ return read_pnet(&coredev->rdma_net);
529
+}
530
+
237531 static struct class ib_class = {
238532 .name = "infiniband",
239533 .dev_release = ib_device_release,
240534 .dev_uevent = ib_device_uevent,
535
+ .ns_type = &net_ns_type_operations,
536
+ .namespace = net_namespace,
241537 };
242538
539
+static void rdma_init_coredev(struct ib_core_device *coredev,
540
+ struct ib_device *dev, struct net *net)
541
+{
542
+ /* This BUILD_BUG_ON is intended to catch layout change
543
+ * of union of ib_core_device and device.
544
+ * dev must be the first element as ib_core and providers
545
+ * driver uses it. Adding anything in ib_core_device before
546
+ * device will break this assumption.
547
+ */
548
+ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
549
+ offsetof(struct ib_device, dev));
550
+
551
+ coredev->dev.class = &ib_class;
552
+ coredev->dev.groups = dev->groups;
553
+ device_initialize(&coredev->dev);
554
+ coredev->owner = dev;
555
+ INIT_LIST_HEAD(&coredev->port_list);
556
+ write_pnet(&coredev->rdma_net, net);
557
+}
558
+
243559 /**
244
- * ib_alloc_device - allocate an IB device struct
560
+ * _ib_alloc_device - allocate an IB device struct
245561 * @size:size of structure to allocate
246562 *
247563 * Low-level drivers should use ib_alloc_device() to allocate &struct
@@ -250,7 +566,7 @@
250566 * ib_dealloc_device() must be used to free structures allocated with
251567 * ib_alloc_device().
252568 */
253
-struct ib_device *ib_alloc_device(size_t size)
569
+struct ib_device *_ib_alloc_device(size_t size)
254570 {
255571 struct ib_device *device;
256572
@@ -261,22 +577,32 @@
261577 if (!device)
262578 return NULL;
263579
264
- rdma_restrack_init(&device->res);
580
+ if (rdma_restrack_init(device)) {
581
+ kfree(device);
582
+ return NULL;
583
+ }
265584
266
- device->dev.class = &ib_class;
267
- device_initialize(&device->dev);
268
-
269
- dev_set_drvdata(&device->dev, device);
585
+ device->groups[0] = &ib_dev_attr_group;
586
+ rdma_init_coredev(&device->coredev, device, &init_net);
270587
271588 INIT_LIST_HEAD(&device->event_handler_list);
272
- spin_lock_init(&device->event_handler_lock);
273
- spin_lock_init(&device->client_data_lock);
274
- INIT_LIST_HEAD(&device->client_data_list);
275
- INIT_LIST_HEAD(&device->port_list);
589
+ spin_lock_init(&device->qp_open_list_lock);
590
+ init_rwsem(&device->event_handler_rwsem);
591
+ mutex_init(&device->unregistration_lock);
592
+ /*
593
+ * client_data needs to be alloc because we don't want our mark to be
594
+ * destroyed if the user stores NULL in the client data.
595
+ */
596
+ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
597
+ init_rwsem(&device->client_data_rwsem);
598
+ xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
599
+ mutex_init(&device->compat_devs_mutex);
600
+ init_completion(&device->unreg_completion);
601
+ INIT_WORK(&device->unregistration_work, ib_unregister_work);
276602
277603 return device;
278604 }
279
-EXPORT_SYMBOL(ib_alloc_device);
605
+EXPORT_SYMBOL(_ib_alloc_device);
280606
281607 /**
282608 * ib_dealloc_device - free an IB device struct
@@ -286,32 +612,173 @@
286612 */
287613 void ib_dealloc_device(struct ib_device *device)
288614 {
289
- WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
290
- device->reg_state != IB_DEV_UNINITIALIZED);
291
- rdma_restrack_clean(&device->res);
615
+ if (device->ops.dealloc_driver)
616
+ device->ops.dealloc_driver(device);
617
+
618
+ /*
619
+ * ib_unregister_driver() requires all devices to remain in the xarray
620
+ * while their ops are callable. The last op we call is dealloc_driver
621
+ * above. This is needed to create a fence on op callbacks prior to
622
+ * allowing the driver module to unload.
623
+ */
624
+ down_write(&devices_rwsem);
625
+ if (xa_load(&devices, device->index) == device)
626
+ xa_erase(&devices, device->index);
627
+ up_write(&devices_rwsem);
628
+
629
+ /* Expedite releasing netdev references */
630
+ free_netdevs(device);
631
+
632
+ WARN_ON(!xa_empty(&device->compat_devs));
633
+ WARN_ON(!xa_empty(&device->client_data));
634
+ WARN_ON(refcount_read(&device->refcount));
635
+ rdma_restrack_clean(device);
636
+ /* Balances with device_initialize */
292637 put_device(&device->dev);
293638 }
294639 EXPORT_SYMBOL(ib_dealloc_device);
295640
296
-static int add_client_context(struct ib_device *device, struct ib_client *client)
641
+/*
642
+ * add_client_context() and remove_client_context() must be safe against
643
+ * parallel calls on the same device - registration/unregistration of both the
644
+ * device and client can be occurring in parallel.
645
+ *
646
+ * The routines need to be a fence, any caller must not return until the add
647
+ * or remove is fully completed.
648
+ */
649
+static int add_client_context(struct ib_device *device,
650
+ struct ib_client *client)
297651 {
298
- struct ib_client_data *context;
299
- unsigned long flags;
652
+ int ret = 0;
300653
301
- context = kmalloc(sizeof *context, GFP_KERNEL);
302
- if (!context)
654
+ if (!device->kverbs_provider && !client->no_kverbs_req)
655
+ return 0;
656
+
657
+ down_write(&device->client_data_rwsem);
658
+ /*
659
+ * So long as the client is registered hold both the client and device
660
+ * unregistration locks.
661
+ */
662
+ if (!refcount_inc_not_zero(&client->uses))
663
+ goto out_unlock;
664
+ refcount_inc(&device->refcount);
665
+
666
+ /*
667
+ * Another caller to add_client_context got here first and has already
668
+ * completely initialized context.
669
+ */
670
+ if (xa_get_mark(&device->client_data, client->client_id,
671
+ CLIENT_DATA_REGISTERED))
672
+ goto out;
673
+
674
+ ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
675
+ GFP_KERNEL));
676
+ if (ret)
677
+ goto out;
678
+ downgrade_write(&device->client_data_rwsem);
679
+ if (client->add) {
680
+ if (client->add(device)) {
681
+ /*
682
+ * If a client fails to add then the error code is
683
+ * ignored, but we won't call any more ops on this
684
+ * client.
685
+ */
686
+ xa_erase(&device->client_data, client->client_id);
687
+ up_read(&device->client_data_rwsem);
688
+ ib_device_put(device);
689
+ ib_client_put(client);
690
+ return 0;
691
+ }
692
+ }
693
+
694
+ /* Readers shall not see a client until add has been completed */
695
+ xa_set_mark(&device->client_data, client->client_id,
696
+ CLIENT_DATA_REGISTERED);
697
+ up_read(&device->client_data_rwsem);
698
+ return 0;
699
+
700
+out:
701
+ ib_device_put(device);
702
+ ib_client_put(client);
703
+out_unlock:
704
+ up_write(&device->client_data_rwsem);
705
+ return ret;
706
+}
707
+
708
+static void remove_client_context(struct ib_device *device,
709
+ unsigned int client_id)
710
+{
711
+ struct ib_client *client;
712
+ void *client_data;
713
+
714
+ down_write(&device->client_data_rwsem);
715
+ if (!xa_get_mark(&device->client_data, client_id,
716
+ CLIENT_DATA_REGISTERED)) {
717
+ up_write(&device->client_data_rwsem);
718
+ return;
719
+ }
720
+ client_data = xa_load(&device->client_data, client_id);
721
+ xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
722
+ client = xa_load(&clients, client_id);
723
+ up_write(&device->client_data_rwsem);
724
+
725
+ /*
726
+ * Notice we cannot be holding any exclusive locks when calling the
727
+ * remove callback as the remove callback can recurse back into any
728
+ * public functions in this module and thus try for any locks those
729
+ * functions take.
730
+ *
731
+ * For this reason clients and drivers should not call the
732
+	 * unregistration functions while holding any locks.
733
+ */
734
+ if (client->remove)
735
+ client->remove(device, client_data);
736
+
737
+ xa_erase(&device->client_data, client_id);
738
+ ib_device_put(device);
739
+ ib_client_put(client);
740
+}
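For context, a minimal sketch of the client side these two routines serve; the names are hypothetical, and note that in this tree the add() callback returns int (a nonzero return simply skips the device, as handled above). Registration itself goes through ib_register_client(), defined later in this file.

	static int demo_client_add(struct ib_device *device)
	{
		/* allocate per-device state, stash it with ib_set_client_data() */
		return 0;
	}

	static void demo_client_remove(struct ib_device *device, void *client_data)
	{
		/* undo whatever demo_client_add() created */
	}

	static struct ib_client demo_client = {
		.name   = "demo",
		.add    = demo_client_add,
		.remove = demo_client_remove,
	};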
741
+
742
+static int alloc_port_data(struct ib_device *device)
743
+{
744
+ struct ib_port_data_rcu *pdata_rcu;
745
+ unsigned int port;
746
+
747
+ if (device->port_data)
748
+ return 0;
749
+
750
+ /* This can only be called once the physical port range is defined */
751
+ if (WARN_ON(!device->phys_port_cnt))
752
+ return -EINVAL;
753
+
754
+ /*
755
+ * device->port_data is indexed directly by the port number to make
756
+ * access to this data as efficient as possible.
757
+ *
758
+ * Therefore port_data is declared as a 1 based array with potential
759
+ * empty slots at the beginning.
760
+ */
761
+ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
762
+ rdma_end_port(device) + 1),
763
+ GFP_KERNEL);
764
+ if (!pdata_rcu)
303765 return -ENOMEM;
766
+ /*
767
+ * The rcu_head is put in front of the port data array and the stored
768
+ * pointer is adjusted since we never need to see that member until
769
+ * kfree_rcu.
770
+ */
771
+ device->port_data = pdata_rcu->pdata;
304772
305
- context->client = client;
306
- context->data = NULL;
307
- context->going_down = false;
773
+ rdma_for_each_port (device, port) {
774
+ struct ib_port_data *pdata = &device->port_data[port];
308775
309
- down_write(&lists_rwsem);
310
- spin_lock_irqsave(&device->client_data_lock, flags);
311
- list_add(&context->list, &device->client_data_list);
312
- spin_unlock_irqrestore(&device->client_data_lock, flags);
313
- up_write(&lists_rwsem);
314
-
776
+ pdata->ib_dev = device;
777
+ spin_lock_init(&pdata->pkey_list_lock);
778
+ INIT_LIST_HEAD(&pdata->pkey_list);
779
+ spin_lock_init(&pdata->netdev_lock);
780
+ INIT_HLIST_NODE(&pdata->ndev_hash_link);
781
+ }
315782 return 0;
316783 }
317784
@@ -321,29 +788,20 @@
321788 rdma_max_mad_size(dev, port) != 0);
322789 }
323790
324
-static int read_port_immutable(struct ib_device *device)
791
+static int setup_port_data(struct ib_device *device)
325792 {
793
+ unsigned int port;
326794 int ret;
327
- u8 start_port = rdma_start_port(device);
328
- u8 end_port = rdma_end_port(device);
329
- u8 port;
330795
331
- /**
332
- * device->port_immutable is indexed directly by the port number to make
333
- * access to this data as efficient as possible.
334
- *
335
- * Therefore port_immutable is declared as a 1 based array with
336
- * potential empty slots at the beginning.
337
- */
338
- device->port_immutable = kcalloc(end_port + 1,
339
- sizeof(*device->port_immutable),
340
- GFP_KERNEL);
341
- if (!device->port_immutable)
342
- return -ENOMEM;
796
+ ret = alloc_port_data(device);
797
+ if (ret)
798
+ return ret;
343799
344
- for (port = start_port; port <= end_port; ++port) {
345
- ret = device->get_port_immutable(device, port,
346
- &device->port_immutable[port]);
800
+ rdma_for_each_port (device, port) {
801
+ struct ib_port_data *pdata = &device->port_data[port];
802
+
803
+ ret = device->ops.get_port_immutable(device, port,
804
+ &pdata->immutable);
347805 if (ret)
348806 return ret;
349807
@@ -355,46 +813,23 @@
355813
356814 void ib_get_device_fw_str(struct ib_device *dev, char *str)
357815 {
358
- if (dev->get_dev_fw_str)
359
- dev->get_dev_fw_str(dev, str);
816
+ if (dev->ops.get_dev_fw_str)
817
+ dev->ops.get_dev_fw_str(dev, str);
360818 else
361819 str[0] = '\0';
362820 }
363821 EXPORT_SYMBOL(ib_get_device_fw_str);
364822
365
-static int setup_port_pkey_list(struct ib_device *device)
366
-{
367
- int i;
368
-
369
- /**
370
- * device->port_pkey_list is indexed directly by the port number,
371
- * Therefore it is declared as a 1 based array with potential empty
372
- * slots at the beginning.
373
- */
374
- device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
375
- sizeof(*device->port_pkey_list),
376
- GFP_KERNEL);
377
-
378
- if (!device->port_pkey_list)
379
- return -ENOMEM;
380
-
381
- for (i = 0; i < (rdma_end_port(device) + 1); i++) {
382
- spin_lock_init(&device->port_pkey_list[i].list_lock);
383
- INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
384
- }
385
-
386
- return 0;
387
-}
388
-
389823 static void ib_policy_change_task(struct work_struct *work)
390824 {
391825 struct ib_device *dev;
826
+ unsigned long index;
392827
393
- down_read(&lists_rwsem);
394
- list_for_each_entry(dev, &device_list, core_list) {
395
- int i;
828
+ down_read(&devices_rwsem);
829
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
830
+ unsigned int i;
396831
397
- for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
832
+ rdma_for_each_port (dev, i) {
398833 u64 sp;
399834 int ret = ib_get_cached_subnet_prefix(dev,
400835 i,
@@ -407,7 +842,7 @@
407842 ib_security_cache_change(dev, i, sp);
408843 }
409844 }
410
- up_read(&lists_rwsem);
845
+ up_read(&devices_rwsem);
411846 }
412847
413848 static int ib_security_change(struct notifier_block *nb, unsigned long event,
@@ -417,208 +852,857 @@
417852 return NOTIFY_DONE;
418853
419854 schedule_work(&ib_policy_change_work);
855
+ ib_mad_agent_security_change();
420856
421857 return NOTIFY_OK;
422858 }
423859
424
-/**
425
- * __dev_new_index - allocate an device index
426
- *
427
- * Returns a suitable unique value for a new device interface
428
- * number. It assumes that there are less than 2^32-1 ib devices
429
- * will be present in the system.
430
- */
431
-static u32 __dev_new_index(void)
860
+static void compatdev_release(struct device *dev)
432861 {
862
+ struct ib_core_device *cdev =
863
+ container_of(dev, struct ib_core_device, dev);
864
+
865
+ kfree(cdev);
866
+}
867
+
868
+static int add_one_compat_dev(struct ib_device *device,
869
+ struct rdma_dev_net *rnet)
870
+{
871
+ struct ib_core_device *cdev;
872
+ int ret;
873
+
874
+ lockdep_assert_held(&rdma_nets_rwsem);
875
+ if (!ib_devices_shared_netns)
876
+ return 0;
877
+
433878 /*
434
- * The device index to allow stable naming.
435
- * Similar to struct net -> ifindex.
879
+ * Create and add compat device in all namespaces other than where it
880
+ * is currently bound to.
436881 */
437
- static u32 index;
882
+ if (net_eq(read_pnet(&rnet->net),
883
+ read_pnet(&device->coredev.rdma_net)))
884
+ return 0;
438885
439
- for (;;) {
440
- if (!(++index))
441
- index = 1;
442
-
443
- if (!__ib_device_get_by_index(index))
444
- return index;
886
+ /*
887
+ * The first of init_net() or ib_register_device() to take the
888
+ * compat_devs_mutex wins and gets to add the device. Others will wait
889
+ * for completion here.
890
+ */
891
+ mutex_lock(&device->compat_devs_mutex);
892
+ cdev = xa_load(&device->compat_devs, rnet->id);
893
+ if (cdev) {
894
+ ret = 0;
895
+ goto done;
445896 }
897
+ ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
898
+ if (ret)
899
+ goto done;
900
+
901
+ cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
902
+ if (!cdev) {
903
+ ret = -ENOMEM;
904
+ goto cdev_err;
905
+ }
906
+
907
+ cdev->dev.parent = device->dev.parent;
908
+ rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
909
+ cdev->dev.release = compatdev_release;
910
+ ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
911
+ if (ret)
912
+ goto add_err;
913
+
914
+ ret = device_add(&cdev->dev);
915
+ if (ret)
916
+ goto add_err;
917
+ ret = ib_setup_port_attrs(cdev);
918
+ if (ret)
919
+ goto port_err;
920
+
921
+ ret = xa_err(xa_store(&device->compat_devs, rnet->id,
922
+ cdev, GFP_KERNEL));
923
+ if (ret)
924
+ goto insert_err;
925
+
926
+ mutex_unlock(&device->compat_devs_mutex);
927
+ return 0;
928
+
929
+insert_err:
930
+ ib_free_port_attrs(cdev);
931
+port_err:
932
+ device_del(&cdev->dev);
933
+add_err:
934
+ put_device(&cdev->dev);
935
+cdev_err:
936
+ xa_release(&device->compat_devs, rnet->id);
937
+done:
938
+ mutex_unlock(&device->compat_devs_mutex);
939
+ return ret;
940
+}
941
+
942
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
943
+{
944
+ struct ib_core_device *cdev;
945
+
946
+ mutex_lock(&device->compat_devs_mutex);
947
+ cdev = xa_erase(&device->compat_devs, id);
948
+ mutex_unlock(&device->compat_devs_mutex);
949
+ if (cdev) {
950
+ ib_free_port_attrs(cdev);
951
+ device_del(&cdev->dev);
952
+ put_device(&cdev->dev);
953
+ }
954
+}
955
+
956
+static void remove_compat_devs(struct ib_device *device)
957
+{
958
+ struct ib_core_device *cdev;
959
+ unsigned long index;
960
+
961
+ xa_for_each (&device->compat_devs, index, cdev)
962
+ remove_one_compat_dev(device, index);
963
+}
964
+
965
+static int add_compat_devs(struct ib_device *device)
966
+{
967
+ struct rdma_dev_net *rnet;
968
+ unsigned long index;
969
+ int ret = 0;
970
+
971
+ lockdep_assert_held(&devices_rwsem);
972
+
973
+ down_read(&rdma_nets_rwsem);
974
+ xa_for_each (&rdma_nets, index, rnet) {
975
+ ret = add_one_compat_dev(device, rnet);
976
+ if (ret)
977
+ break;
978
+ }
979
+ up_read(&rdma_nets_rwsem);
980
+ return ret;
981
+}
982
+
983
+static void remove_all_compat_devs(void)
984
+{
985
+ struct ib_compat_device *cdev;
986
+ struct ib_device *dev;
987
+ unsigned long index;
988
+
989
+ down_read(&devices_rwsem);
990
+ xa_for_each (&devices, index, dev) {
991
+ unsigned long c_index = 0;
992
+
993
+ /* Hold nets_rwsem so that any other thread modifying this
994
+ * system param can sync with this thread.
995
+ */
996
+ down_read(&rdma_nets_rwsem);
997
+ xa_for_each (&dev->compat_devs, c_index, cdev)
998
+ remove_one_compat_dev(dev, c_index);
999
+ up_read(&rdma_nets_rwsem);
1000
+ }
1001
+ up_read(&devices_rwsem);
1002
+}
1003
+
1004
+static int add_all_compat_devs(void)
1005
+{
1006
+ struct rdma_dev_net *rnet;
1007
+ struct ib_device *dev;
1008
+ unsigned long index;
1009
+ int ret = 0;
1010
+
1011
+ down_read(&devices_rwsem);
1012
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1013
+ unsigned long net_index = 0;
1014
+
1015
+ /* Hold nets_rwsem so that any other thread modifying this
1016
+ * system param can sync with this thread.
1017
+ */
1018
+ down_read(&rdma_nets_rwsem);
1019
+ xa_for_each (&rdma_nets, net_index, rnet) {
1020
+ ret = add_one_compat_dev(dev, rnet);
1021
+ if (ret)
1022
+ break;
1023
+ }
1024
+ up_read(&rdma_nets_rwsem);
1025
+ }
1026
+ up_read(&devices_rwsem);
1027
+ if (ret)
1028
+ remove_all_compat_devs();
1029
+ return ret;
1030
+}
1031
+
1032
+int rdma_compatdev_set(u8 enable)
1033
+{
1034
+ struct rdma_dev_net *rnet;
1035
+ unsigned long index;
1036
+ int ret = 0;
1037
+
1038
+ down_write(&rdma_nets_rwsem);
1039
+ if (ib_devices_shared_netns == enable) {
1040
+ up_write(&rdma_nets_rwsem);
1041
+ return 0;
1042
+ }
1043
+
1044
+ /* enable/disable of compat devices is not supported
1045
+ * when more than default init_net exists.
1046
+ */
1047
+ xa_for_each (&rdma_nets, index, rnet) {
1048
+ ret++;
1049
+ break;
1050
+ }
1051
+ if (!ret)
1052
+ ib_devices_shared_netns = enable;
1053
+ up_write(&rdma_nets_rwsem);
1054
+ if (ret)
1055
+ return -EBUSY;
1056
+
1057
+ if (enable)
1058
+ ret = add_all_compat_devs();
1059
+ else
1060
+ remove_all_compat_devs();
1061
+ return ret;
1062
+}
1063
+
1064
+static void rdma_dev_exit_net(struct net *net)
1065
+{
1066
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1067
+ struct ib_device *dev;
1068
+ unsigned long index;
1069
+ int ret;
1070
+
1071
+ down_write(&rdma_nets_rwsem);
1072
+ /*
1073
+ * Prevent the ID from being re-used and hide the id from xa_for_each.
1074
+ */
1075
+ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
1076
+ WARN_ON(ret);
1077
+ up_write(&rdma_nets_rwsem);
1078
+
1079
+ down_read(&devices_rwsem);
1080
+ xa_for_each (&devices, index, dev) {
1081
+ get_device(&dev->dev);
1082
+ /*
1083
+		 * Release the devices_rwsem so that the potentially blocking
1084
+ * device_del, doesn't hold the devices_rwsem for too long.
1085
+ */
1086
+ up_read(&devices_rwsem);
1087
+
1088
+ remove_one_compat_dev(dev, rnet->id);
1089
+
1090
+ /*
1091
+ * If the real device is in the NS then move it back to init.
1092
+ */
1093
+ rdma_dev_change_netns(dev, net, &init_net);
1094
+
1095
+ put_device(&dev->dev);
1096
+ down_read(&devices_rwsem);
1097
+ }
1098
+ up_read(&devices_rwsem);
1099
+
1100
+ rdma_nl_net_exit(rnet);
1101
+ xa_erase(&rdma_nets, rnet->id);
1102
+}
1103
+
1104
+static __net_init int rdma_dev_init_net(struct net *net)
1105
+{
1106
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1107
+ unsigned long index;
1108
+ struct ib_device *dev;
1109
+ int ret;
1110
+
1111
+ write_pnet(&rnet->net, net);
1112
+
1113
+ ret = rdma_nl_net_init(rnet);
1114
+ if (ret)
1115
+ return ret;
1116
+
1117
+ /* No need to create any compat devices in default init_net. */
1118
+ if (net_eq(net, &init_net))
1119
+ return 0;
1120
+
1121
+ ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
1122
+ if (ret) {
1123
+ rdma_nl_net_exit(rnet);
1124
+ return ret;
1125
+ }
1126
+
1127
+ down_read(&devices_rwsem);
1128
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1129
+ /* Hold nets_rwsem so that netlink command cannot change
1130
+ * system configuration for device sharing mode.
1131
+ */
1132
+ down_read(&rdma_nets_rwsem);
1133
+ ret = add_one_compat_dev(dev, rnet);
1134
+ up_read(&rdma_nets_rwsem);
1135
+ if (ret)
1136
+ break;
1137
+ }
1138
+ up_read(&devices_rwsem);
1139
+
1140
+ if (ret)
1141
+ rdma_dev_exit_net(net);
1142
+
1143
+ return ret;
1144
+}
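For context, these two per-namespace callbacks are hooked into the networking core with register_pernet_device(); in the upstream tree the wiring is a pernet_operations table roughly along these lines (it sits outside the range of this hunk):

	static struct pernet_operations rdma_dev_net_ops = {
		.init = rdma_dev_init_net,
		.exit = rdma_dev_exit_net,
		.id = &rdma_dev_net_id,
		.size = sizeof(struct rdma_dev_net),
	};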
1145
+
1146
+/*
1147
+ * Assign the unique string device name and the unique device index. This is
1148
+ * undone by ib_dealloc_device.
1149
+ */
1150
+static int assign_name(struct ib_device *device, const char *name)
1151
+{
1152
+ static u32 last_id;
1153
+ int ret;
1154
+
1155
+ down_write(&devices_rwsem);
1156
+ /* Assign a unique name to the device */
1157
+ if (strchr(name, '%'))
1158
+ ret = alloc_name(device, name);
1159
+ else
1160
+ ret = dev_set_name(&device->dev, name);
1161
+ if (ret)
1162
+ goto out;
1163
+
1164
+ if (__ib_device_get_by_name(dev_name(&device->dev))) {
1165
+ ret = -ENFILE;
1166
+ goto out;
1167
+ }
1168
+ strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
1169
+
1170
+ ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
1171
+ &last_id, GFP_KERNEL);
1172
+ if (ret > 0)
1173
+ ret = 0;
1174
+
1175
+out:
1176
+ up_write(&devices_rwsem);
1177
+ return ret;
1178
+}
1179
+
1180
+/*
1181
+ * setup_device() allocates memory and sets up data that requires calling the
1182
+ * device ops, this is the only reason these actions are not done during
1183
+ * ib_alloc_device. It is undone by ib_dealloc_device().
1184
+ */
1185
+static int setup_device(struct ib_device *device)
1186
+{
1187
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
1188
+ int ret;
1189
+
1190
+ ib_device_check_mandatory(device);
1191
+
1192
+ ret = setup_port_data(device);
1193
+ if (ret) {
1194
+ dev_warn(&device->dev, "Couldn't create per-port data\n");
1195
+ return ret;
1196
+ }
1197
+
1198
+ memset(&device->attrs, 0, sizeof(device->attrs));
1199
+ ret = device->ops.query_device(device, &device->attrs, &uhw);
1200
+ if (ret) {
1201
+ dev_warn(&device->dev,
1202
+ "Couldn't query the device attributes\n");
1203
+ return ret;
1204
+ }
1205
+
1206
+ return 0;
1207
+}
1208
+
1209
+static void disable_device(struct ib_device *device)
1210
+{
1211
+ u32 cid;
1212
+
1213
+ WARN_ON(!refcount_read(&device->refcount));
1214
+
1215
+ down_write(&devices_rwsem);
1216
+ xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
1217
+ up_write(&devices_rwsem);
1218
+
1219
+ /*
1220
+ * Remove clients in LIFO order, see assign_client_id. This could be
1221
+ * more efficient if xarray learns to reverse iterate. Since no new
1222
+ * clients can be added to this ib_device past this point we only need
1223
+ * the maximum possible client_id value here.
1224
+ */
1225
+ down_read(&clients_rwsem);
1226
+ cid = highest_client_id;
1227
+ up_read(&clients_rwsem);
1228
+ while (cid) {
1229
+ cid--;
1230
+ remove_client_context(device, cid);
1231
+ }
1232
+
1233
+ ib_cq_pool_destroy(device);
1234
+
1235
+ /* Pairs with refcount_set in enable_device */
1236
+ ib_device_put(device);
1237
+ wait_for_completion(&device->unreg_completion);
1238
+
1239
+ /*
1240
+ * compat devices must be removed after device refcount drops to zero.
1241
+ * Otherwise init_net() may add more compatdevs after removing compat
1242
+ * devices and before device is disabled.
1243
+ */
1244
+ remove_compat_devs(device);
1245
+}
1246
+
1247
+/*
1248
+ * An enabled device is visible to all clients and to all the public facing
1249
+ * APIs that return a device pointer. This always returns with a new get, even
1250
+ * if it fails.
1251
+ */
1252
+static int enable_device_and_get(struct ib_device *device)
1253
+{
1254
+ struct ib_client *client;
1255
+ unsigned long index;
1256
+ int ret = 0;
1257
+
1258
+ /*
1259
+ * One ref belongs to the xa and the other belongs to this
1260
+ * thread. This is needed to guard against parallel unregistration.
1261
+ */
1262
+ refcount_set(&device->refcount, 2);
1263
+ down_write(&devices_rwsem);
1264
+ xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
1265
+
1266
+ /*
1267
+ * By using downgrade_write() we ensure that no other thread can clear
1268
+ * DEVICE_REGISTERED while we are completing the client setup.
1269
+ */
1270
+ downgrade_write(&devices_rwsem);
1271
+
1272
+ if (device->ops.enable_driver) {
1273
+ ret = device->ops.enable_driver(device);
1274
+ if (ret)
1275
+ goto out;
1276
+ }
1277
+
1278
+ ib_cq_pool_init(device);
1279
+
1280
+ down_read(&clients_rwsem);
1281
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1282
+ ret = add_client_context(device, client);
1283
+ if (ret)
1284
+ break;
1285
+ }
1286
+ up_read(&clients_rwsem);
1287
+ if (!ret)
1288
+ ret = add_compat_devs(device);
1289
+out:
1290
+ up_read(&devices_rwsem);
1291
+ return ret;
1292
+}
1293
+
1294
+static void prevent_dealloc_device(struct ib_device *ib_dev)
1295
+{
4461296 }
4471297
4481298 /**
4491299 * ib_register_device - Register an IB device with IB core
450
- * @device:Device to register
1300
+ * @device: Device to register
1301
+ * @name: unique string device name. This may include a '%' which will
1302
+ * cause a unique index to be added to the passed device name.
1303
+ * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
1304
+ * device will be used. In this case the caller should fully
1305
+ * setup the ibdev for DMA. This usually means using dma_virt_ops.
4511306 *
4521307 * Low-level drivers use ib_register_device() to register their
4531308 * devices with the IB core. All registered clients will receive a
4541309 * callback for each device that is added. @device must be allocated
4551310 * with ib_alloc_device().
1311
+ *
1312
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
1313
+ * asynchronously then the device pointer may become freed as soon as this
1314
+ * function returns.
4561315 */
457
-int ib_register_device(struct ib_device *device,
458
- int (*port_callback)(struct ib_device *,
459
- u8, struct kobject *))
1316
+int ib_register_device(struct ib_device *device, const char *name,
1317
+ struct device *dma_device)
4601318 {
4611319 int ret;
462
- struct ib_client *client;
463
- struct ib_udata uhw = {.outlen = 0, .inlen = 0};
464
- struct device *parent = device->dev.parent;
4651320
466
- WARN_ON_ONCE(device->dma_device);
467
- if (device->dev.dma_ops) {
468
- /*
469
- * The caller provided custom DMA operations. Copy the
470
- * DMA-related fields that are used by e.g. dma_alloc_coherent()
471
- * into device->dev.
472
- */
473
- device->dma_device = &device->dev;
474
- if (!device->dev.dma_mask) {
475
- if (parent)
476
- device->dev.dma_mask = parent->dma_mask;
477
- else
478
- WARN_ON_ONCE(true);
479
- }
480
- if (!device->dev.coherent_dma_mask) {
481
- if (parent)
482
- device->dev.coherent_dma_mask =
483
- parent->coherent_dma_mask;
484
- else
485
- WARN_ON_ONCE(true);
486
- }
487
- } else {
488
- /*
489
- * The caller did not provide custom DMA operations. Use the
490
- * DMA mapping operations of the parent device.
491
- */
492
- WARN_ON_ONCE(!parent);
493
- device->dma_device = parent;
494
- }
1321
+ ret = assign_name(device, name);
1322
+ if (ret)
1323
+ return ret;
4951324
496
- mutex_lock(&device_mutex);
1325
+ /*
1326
+ * If the caller does not provide a DMA capable device then the IB core
1327
+ * will set up ib_sge and scatterlist structures that stash the kernel
1328
+ * virtual address into the address field.
1329
+ */
1330
+ WARN_ON(dma_device && !dma_device->dma_parms);
1331
+ device->dma_device = dma_device;
4971332
498
- if (strchr(device->name, '%')) {
499
- ret = alloc_name(device->name);
500
- if (ret)
501
- goto out;
502
- }
503
-
504
- if (ib_device_check_mandatory(device)) {
505
- ret = -EINVAL;
506
- goto out;
507
- }
508
-
509
- ret = read_port_immutable(device);
510
- if (ret) {
511
- pr_warn("Couldn't create per port immutable data %s\n",
512
- device->name);
513
- goto out;
514
- }
515
-
516
- ret = setup_port_pkey_list(device);
517
- if (ret) {
518
- pr_warn("Couldn't create per port_pkey_list\n");
519
- goto out;
520
- }
1333
+ ret = setup_device(device);
1334
+ if (ret)
1335
+ return ret;
5211336
5221337 ret = ib_cache_setup_one(device);
5231338 if (ret) {
524
- pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
525
- goto port_cleanup;
1339
+ dev_warn(&device->dev,
1340
+ "Couldn't set up InfiniBand P_Key/GID cache\n");
1341
+ return ret;
5261342 }
5271343
528
- ret = ib_device_register_rdmacg(device);
529
- if (ret) {
530
- pr_warn("Couldn't register device with rdma cgroup\n");
531
- goto cache_cleanup;
532
- }
1344
+ ib_device_register_rdmacg(device);
5331345
534
- memset(&device->attrs, 0, sizeof(device->attrs));
535
- ret = device->query_device(device, &device->attrs, &uhw);
536
- if (ret) {
537
- pr_warn("Couldn't query the device attributes\n");
1346
+ rdma_counter_init(device);
1347
+
1348
+ /*
1349
+ * Ensure that ADD uevent is not fired because it
1350
+	 * is too early and the device is not initialized yet.
1351
+ */
1352
+ dev_set_uevent_suppress(&device->dev, true);
1353
+ ret = device_add(&device->dev);
1354
+ if (ret)
5381355 goto cg_cleanup;
539
- }
5401356
541
- ret = ib_device_register_sysfs(device, port_callback);
1357
+ ret = ib_device_register_sysfs(device);
5421358 if (ret) {
543
- pr_warn("Couldn't register device %s with driver model\n",
544
- device->name);
545
- goto cg_cleanup;
1359
+ dev_warn(&device->dev,
1360
+ "Couldn't register device with driver model\n");
1361
+ goto dev_cleanup;
5461362 }
5471363
548
- device->reg_state = IB_DEV_REGISTERED;
1364
+ ret = enable_device_and_get(device);
1365
+ if (ret) {
1366
+ void (*dealloc_fn)(struct ib_device *);
5491367
550
- list_for_each_entry(client, &client_list, list)
551
- if (!add_client_context(device, client) && client->add)
552
- client->add(device);
1368
+ /*
1369
+ * If we hit this error flow then we don't want to
1370
+ * automatically dealloc the device since the caller is
1371
+ * expected to call ib_dealloc_device() after
1372
+ * ib_register_device() fails. This is tricky due to the
1373
+ * possibility for a parallel unregistration along with this
1374
+ * error flow. Since we have a refcount here we know any
1375
+ * parallel flow is stopped in disable_device and will see the
1376
+ * special dealloc_driver pointer, causing the responsibility to
1377
+ * ib_dealloc_device() to revert back to this thread.
1378
+ */
1379
+ dealloc_fn = device->ops.dealloc_driver;
1380
+ device->ops.dealloc_driver = prevent_dealloc_device;
1381
+ ib_device_put(device);
1382
+ __ib_unregister_device(device);
1383
+ device->ops.dealloc_driver = dealloc_fn;
1384
+ dev_set_uevent_suppress(&device->dev, false);
1385
+ return ret;
1386
+ }
1387
+ dev_set_uevent_suppress(&device->dev, false);
1388
+ /* Mark for userspace that device is ready */
1389
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1390
+ ib_device_put(device);
5531391
554
- device->index = __dev_new_index();
555
- down_write(&lists_rwsem);
556
- list_add_tail(&device->core_list, &device_list);
557
- up_write(&lists_rwsem);
558
- mutex_unlock(&device_mutex);
5591392 return 0;
5601393
1394
+dev_cleanup:
1395
+ device_del(&device->dev);
5611396 cg_cleanup:
1397
+ dev_set_uevent_suppress(&device->dev, false);
5621398 ib_device_unregister_rdmacg(device);
563
-cache_cleanup:
5641399 ib_cache_cleanup_one(device);
565
- ib_cache_release_one(device);
566
-port_cleanup:
567
- kfree(device->port_immutable);
568
-out:
569
- mutex_unlock(&device_mutex);
5701400 return ret;
5711401 }
5721402 EXPORT_SYMBOL(ib_register_device);
5731403
1404
+/* Callers must hold a get on the device. */
1405
+static void __ib_unregister_device(struct ib_device *ib_dev)
1406
+{
1407
+ /*
1408
+ * We have a registration lock so that all the calls to unregister are
1409
+ * fully fenced, once any unregister returns the device is truely
1410
+ * unregistered even if multiple callers are unregistering it at the
1411
+ * same time. This also interacts with the registration flow and
1412
+ * provides sane semantics if register and unregister are racing.
1413
+ */
1414
+ mutex_lock(&ib_dev->unregistration_lock);
1415
+ if (!refcount_read(&ib_dev->refcount))
1416
+ goto out;
1417
+
1418
+ disable_device(ib_dev);
1419
+
1420
+ /* Expedite removing unregistered pointers from the hash table */
1421
+ free_netdevs(ib_dev);
1422
+
1423
+ ib_device_unregister_sysfs(ib_dev);
1424
+ device_del(&ib_dev->dev);
1425
+ ib_device_unregister_rdmacg(ib_dev);
1426
+ ib_cache_cleanup_one(ib_dev);
1427
+
1428
+ /*
1429
+ * Drivers using the new flow may not call ib_dealloc_device except
1430
+ * in error unwind prior to registration success.
1431
+ */
1432
+ if (ib_dev->ops.dealloc_driver &&
1433
+ ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
1434
+ WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
1435
+ ib_dealloc_device(ib_dev);
1436
+ }
1437
+out:
1438
+ mutex_unlock(&ib_dev->unregistration_lock);
1439
+}
1440
+
5741441 /**
5751442 * ib_unregister_device - Unregister an IB device
576
- * @device:Device to unregister
1443
+ * @ib_dev: The device to unregister
5771444 *
5781445 * Unregister an IB device. All clients will receive a remove callback.
1446
+ *
1447
+ * Callers should call this routine only once, and protect against races with
1448
+ * registration. Typically it should only be called as part of a remove
1449
+ * callback in an implementation of driver core's struct device_driver and
1450
+ * related.
1451
+ *
1452
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
1453
+ * this function.
5791454 */
580
-void ib_unregister_device(struct ib_device *device)
1455
+void ib_unregister_device(struct ib_device *ib_dev)
5811456 {
582
- struct ib_client_data *context, *tmp;
583
- unsigned long flags;
584
-
585
- mutex_lock(&device_mutex);
586
-
587
- down_write(&lists_rwsem);
588
- list_del(&device->core_list);
589
- spin_lock_irqsave(&device->client_data_lock, flags);
590
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
591
- context->going_down = true;
592
- spin_unlock_irqrestore(&device->client_data_lock, flags);
593
- downgrade_write(&lists_rwsem);
594
-
595
- list_for_each_entry_safe(context, tmp, &device->client_data_list,
596
- list) {
597
- if (context->client->remove)
598
- context->client->remove(device, context->data);
599
- }
600
- up_read(&lists_rwsem);
601
-
602
- ib_device_unregister_sysfs(device);
603
- ib_device_unregister_rdmacg(device);
604
-
605
- mutex_unlock(&device_mutex);
606
-
607
- ib_cache_cleanup_one(device);
608
-
609
- ib_security_destroy_port_pkey_list(device);
610
- kfree(device->port_pkey_list);
611
-
612
- down_write(&lists_rwsem);
613
- spin_lock_irqsave(&device->client_data_lock, flags);
614
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
615
- kfree(context);
616
- spin_unlock_irqrestore(&device->client_data_lock, flags);
617
- up_write(&lists_rwsem);
618
-
619
- device->reg_state = IB_DEV_UNREGISTERED;
1457
+ get_device(&ib_dev->dev);
1458
+ __ib_unregister_device(ib_dev);
1459
+ put_device(&ib_dev->dev);
6201460 }
6211461 EXPORT_SYMBOL(ib_unregister_device);
1462
+
1463
+/**
1464
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
1465
+ * @ib_dev: The device to unregister
1466
+ *
1467
+ * This is the same as ib_unregister_device(), except it includes an internal
1468
+ * ib_device_put() that should match a 'get' obtained by the caller.
1469
+ *
1470
+ * It is safe to call this routine concurrently from multiple threads while
1471
+ * holding the 'get'. When the function returns the device is fully
1472
+ * unregistered.
1473
+ *
1474
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
1475
+ * their resources associated with the device and dealloc it.
1476
+ */
1477
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
1478
+{
1479
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1480
+ get_device(&ib_dev->dev);
1481
+ ib_device_put(ib_dev);
1482
+ __ib_unregister_device(ib_dev);
1483
+ put_device(&ib_dev->dev);
1484
+}
1485
+EXPORT_SYMBOL(ib_unregister_device_and_put);
1486
+
1487
+/**
1488
+ * ib_unregister_driver - Unregister all IB devices for a driver
1489
+ * @driver_id: The driver to unregister
1490
+ *
1491
+ * This implements a fence for device unregistration. It only returns once all
1492
+ * devices associated with the driver_id have fully completed their
1493
+ * unregistration and returned from ib_unregister_device*().
1494
+ *
1495
+ * If devices are not yet unregistered, it goes ahead and starts unregistering
1496
+ * them.
1497
+ *
1498
+ * This does not block creation of new devices with the given driver_id, that
1499
+ * is the responsibility of the caller.
1500
+ */
1501
+void ib_unregister_driver(enum rdma_driver_id driver_id)
1502
+{
1503
+ struct ib_device *ib_dev;
1504
+ unsigned long index;
1505
+
1506
+ down_read(&devices_rwsem);
1507
+ xa_for_each (&devices, index, ib_dev) {
1508
+ if (ib_dev->ops.driver_id != driver_id)
1509
+ continue;
1510
+
1511
+ get_device(&ib_dev->dev);
1512
+ up_read(&devices_rwsem);
1513
+
1514
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1515
+ __ib_unregister_device(ib_dev);
1516
+
1517
+ put_device(&ib_dev->dev);
1518
+ down_read(&devices_rwsem);
1519
+ }
1520
+ up_read(&devices_rwsem);
1521
+}
1522
+EXPORT_SYMBOL(ib_unregister_driver);
1523
+
1524
+static void ib_unregister_work(struct work_struct *work)
1525
+{
1526
+ struct ib_device *ib_dev =
1527
+ container_of(work, struct ib_device, unregistration_work);
1528
+
1529
+ __ib_unregister_device(ib_dev);
1530
+ put_device(&ib_dev->dev);
1531
+}
1532
+
1533
+/**
1534
+ * ib_unregister_device_queued - Unregister a device using a work queue
1535
+ * @ib_dev: The device to unregister
1536
+ *
1537
+ * This schedules an asynchronous unregistration using a WQ for the device. A
1538
+ * driver should use this to avoid holding locks while doing unregistration,
1539
+ * such as holding the RTNL lock.
1540
+ *
1541
+ * Drivers using this API must use ib_unregister_driver before module unload
1542
+ * to ensure that all scheduled unregistrations have completed.
1543
+ */
1544
+void ib_unregister_device_queued(struct ib_device *ib_dev)
1545
+{
1546
+ WARN_ON(!refcount_read(&ib_dev->refcount));
1547
+ WARN_ON(!ib_dev->ops.dealloc_driver);
1548
+ get_device(&ib_dev->dev);
1549
+ if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
1550
+ put_device(&ib_dev->dev);
1551
+}
1552
+EXPORT_SYMBOL(ib_unregister_device_queued);
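/*
 * Illustrative sketch, not part of this patch: queued unregistration from a
 * context that holds the RTNL lock, paired with the ib_unregister_driver()
 * fence at module unload as the comment above requires. The example_* names
 * are hypothetical and RDMA_DRIVER_UNKNOWN only stands in for the driver's
 * real driver_id.
 */
static void example_handle_netdev_unregister(struct ib_device *ib_dev)
{
	/* Cannot unregister synchronously under RTNL; defer to a workqueue. */
	ib_unregister_device_queued(ib_dev);
}

static void example_driver_exit(void)
{
	/* Fence: returns only after every queued unregistration has finished. */
	ib_unregister_driver(RDMA_DRIVER_UNKNOWN);
}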
1553
+
1554
+/*
1555
+ * The caller must pass in a device that has the kref held and the refcount
1556
+ * released. If the device is in cur_net and still registered then it is moved
1557
+ * into net.
1558
+ */
1559
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
1560
+ struct net *net)
1561
+{
1562
+ int ret2 = -EINVAL;
1563
+ int ret;
1564
+
1565
+ mutex_lock(&device->unregistration_lock);
1566
+
1567
+ /*
1568
+ * If a device is not under ib_device_get() or if the unregistration_lock
1569
+ * is not held, the namespace can be changed, or it can be unregistered.
1570
+ * Check again under the lock.
1571
+ */
1572
+ if (refcount_read(&device->refcount) == 0 ||
1573
+ !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
1574
+ ret = -ENODEV;
1575
+ goto out;
1576
+ }
1577
+
1578
+ kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
1579
+ disable_device(device);
1580
+
1581
+ /*
1582
+ * At this point no one can be using the device, so it is safe to
1583
+ * change the namespace.
1584
+ */
1585
+ write_pnet(&device->coredev.rdma_net, net);
1586
+
1587
+ down_read(&devices_rwsem);
1588
+ /*
1589
+ * Currently rdma devices are system wide unique. So the device name
1590
+ * is guaranteed free in the new namespace. Publish the new namespace
1591
+ * at the sysfs level.
1592
+ */
1593
+ ret = device_rename(&device->dev, dev_name(&device->dev));
1594
+ up_read(&devices_rwsem);
1595
+ if (ret) {
1596
+ dev_warn(&device->dev,
1597
+ "%s: Couldn't rename device after namespace change\n",
1598
+ __func__);
1599
+ /* Try and put things back and re-enable the device */
1600
+ write_pnet(&device->coredev.rdma_net, cur_net);
1601
+ }
1602
+
1603
+ ret2 = enable_device_and_get(device);
1604
+ if (ret2) {
1605
+ /*
1606
+ * This shouldn't really happen, but if it does, let the user
1607
+ * retry at a later point, so don't disable the device.
1608
+ */
1609
+ dev_warn(&device->dev,
1610
+ "%s: Couldn't re-enable device after namespace change\n",
1611
+ __func__);
1612
+ }
1613
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1614
+
1615
+ ib_device_put(device);
1616
+out:
1617
+ mutex_unlock(&device->unregistration_lock);
1618
+ if (ret)
1619
+ return ret;
1620
+ return ret2;
1621
+}
1622
+
1623
+int ib_device_set_netns_put(struct sk_buff *skb,
1624
+ struct ib_device *dev, u32 ns_fd)
1625
+{
1626
+ struct net *net;
1627
+ int ret;
1628
+
1629
+ net = get_net_ns_by_fd(ns_fd);
1630
+ if (IS_ERR(net)) {
1631
+ ret = PTR_ERR(net);
1632
+ goto net_err;
1633
+ }
1634
+
1635
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
1636
+ ret = -EPERM;
1637
+ goto ns_err;
1638
+ }
1639
+
1640
+ /*
1641
+ * Currently supported only for those providers which support
1642
+ * disassociation and don't do port specific sysfs init. Once a
1643
+ * port_cleanup infrastructure is implemented, this limitation will be
1644
+ * removed.
1645
+ */
1646
+ if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
1647
+ ib_devices_shared_netns) {
1648
+ ret = -EOPNOTSUPP;
1649
+ goto ns_err;
1650
+ }
1651
+
1652
+ get_device(&dev->dev);
1653
+ ib_device_put(dev);
1654
+ ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
1655
+ put_device(&dev->dev);
1656
+
1657
+ put_net(net);
1658
+ return ret;
1659
+
1660
+ns_err:
1661
+ put_net(net);
1662
+net_err:
1663
+ ib_device_put(dev);
1664
+ return ret;
1665
+}
1666
+
1667
+static struct pernet_operations rdma_dev_net_ops = {
1668
+ .init = rdma_dev_init_net,
1669
+ .exit = rdma_dev_exit_net,
1670
+ .id = &rdma_dev_net_id,
1671
+ .size = sizeof(struct rdma_dev_net),
1672
+};
1673
+
1674
+static int assign_client_id(struct ib_client *client)
1675
+{
1676
+ int ret;
1677
+
1678
+ down_write(&clients_rwsem);
1679
+ /*
1680
+ * The add/remove callbacks must be called in FIFO/LIFO order. To
1681
+ * achieve this we assign client_ids so they are sorted in
1682
+ * registration order.
1683
+ */
1684
+ client->client_id = highest_client_id;
1685
+ ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
1686
+ if (ret)
1687
+ goto out;
1688
+
1689
+ highest_client_id++;
1690
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
1691
+
1692
+out:
1693
+ up_write(&clients_rwsem);
1694
+ return ret;
1695
+}
1696
+
1697
+static void remove_client_id(struct ib_client *client)
1698
+{
1699
+ down_write(&clients_rwsem);
1700
+ xa_erase(&clients, client->client_id);
1701
+ for (; highest_client_id; highest_client_id--)
1702
+ if (xa_load(&clients, highest_client_id - 1))
1703
+ break;
1704
+ up_write(&clients_rwsem);
1705
+}
6221706
6231707 /**
6241708 * ib_register_client - Register an IB client
....@@ -636,19 +1720,25 @@
6361720 int ib_register_client(struct ib_client *client)
6371721 {
6381722 struct ib_device *device;
1723
+ unsigned long index;
1724
+ int ret;
6391725
640
- mutex_lock(&device_mutex);
1726
+ refcount_set(&client->uses, 1);
1727
+ init_completion(&client->uses_zero);
1728
+ ret = assign_client_id(client);
1729
+ if (ret)
1730
+ return ret;
6411731
642
- list_for_each_entry(device, &device_list, core_list)
643
- if (!add_client_context(device, client) && client->add)
644
- client->add(device);
645
-
646
- down_write(&lists_rwsem);
647
- list_add_tail(&client->list, &client_list);
648
- up_write(&lists_rwsem);
649
-
650
- mutex_unlock(&device_mutex);
651
-
1732
+ down_read(&devices_rwsem);
1733
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
1734
+ ret = add_client_context(device, client);
1735
+ if (ret) {
1736
+ up_read(&devices_rwsem);
1737
+ ib_unregister_client(client);
1738
+ return ret;
1739
+ }
1740
+ }
1741
+ up_read(&devices_rwsem);
6521742 return 0;
6531743 }
6541744 EXPORT_SYMBOL(ib_register_client);
....@@ -660,80 +1750,140 @@
6601750 * Upper level users use ib_unregister_client() to remove their client
6611751 * registration. When ib_unregister_client() is called, the client
6621752 * will receive a remove callback for each IB device still registered.
1753
+ *
1754
+ * This is a full fence: once it returns, no client callbacks will be called
1755
+ * or still be running in another thread.
6631756 */
6641757 void ib_unregister_client(struct ib_client *client)
6651758 {
666
- struct ib_client_data *context, *tmp;
6671759 struct ib_device *device;
668
- unsigned long flags;
1760
+ unsigned long index;
6691761
670
- mutex_lock(&device_mutex);
1762
+ down_write(&clients_rwsem);
1763
+ ib_client_put(client);
1764
+ xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
1765
+ up_write(&clients_rwsem);
6711766
672
- down_write(&lists_rwsem);
673
- list_del(&client->list);
674
- up_write(&lists_rwsem);
675
-
676
- list_for_each_entry(device, &device_list, core_list) {
677
- struct ib_client_data *found_context = NULL;
678
-
679
- down_write(&lists_rwsem);
680
- spin_lock_irqsave(&device->client_data_lock, flags);
681
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
682
- if (context->client == client) {
683
- context->going_down = true;
684
- found_context = context;
685
- break;
686
- }
687
- spin_unlock_irqrestore(&device->client_data_lock, flags);
688
- up_write(&lists_rwsem);
689
-
690
- if (client->remove)
691
- client->remove(device, found_context ?
692
- found_context->data : NULL);
693
-
694
- if (!found_context) {
695
- pr_warn("No client context found for %s/%s\n",
696
- device->name, client->name);
1767
+ /* We do not want to have locks while calling client->remove() */
1768
+ rcu_read_lock();
1769
+ xa_for_each (&devices, index, device) {
1770
+ if (!ib_device_try_get(device))
6971771 continue;
698
- }
1772
+ rcu_read_unlock();
6991773
700
- down_write(&lists_rwsem);
701
- spin_lock_irqsave(&device->client_data_lock, flags);
702
- list_del(&found_context->list);
703
- kfree(found_context);
704
- spin_unlock_irqrestore(&device->client_data_lock, flags);
705
- up_write(&lists_rwsem);
1774
+ remove_client_context(device, client->client_id);
1775
+
1776
+ ib_device_put(device);
1777
+ rcu_read_lock();
7061778 }
1779
+ rcu_read_unlock();
7071780
708
- mutex_unlock(&device_mutex);
1781
+ /*
1782
+ * remove_client_context() is not a fence, it can return even though a
1783
+ * removal is ongoing. Wait until all removals are completed.
1784
+ */
1785
+ wait_for_completion(&client->uses_zero);
1786
+ remove_client_id(client);
7091787 }
7101788 EXPORT_SYMBOL(ib_unregister_client);
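/*
 * Illustrative sketch, not part of this patch: a minimal ib_client that
 * attaches per-device state. The add()/remove() callback signatures are
 * assumed from this kernel's <rdma/ib_verbs.h> (add() returning an error
 * code, matching how ib_register_client() above propagates failures from
 * add_client_context()); all example_* names are hypothetical.
 */
struct example_state {
	int placeholder;	/* per-device client state would live here */
};

static struct ib_client example_client;

static int example_client_add(struct ib_device *device)
{
	struct example_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return -ENOMEM;
	ib_set_client_data(device, &example_client, st);
	return 0;
}

static void example_client_remove(struct ib_device *device, void *client_data)
{
	kfree(client_data);
}

static struct ib_client example_client = {
	.name	= "example",
	.add	= example_client_add,
	.remove	= example_client_remove,
};

/*
 * Module init/exit would then call ib_register_client(&example_client) and
 * ib_unregister_client(&example_client) respectively.
 */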
7111789
712
-/**
713
- * ib_get_client_data - Get IB client context
714
- * @device:Device to get context for
715
- * @client:Client to get context for
716
- *
717
- * ib_get_client_data() returns client context set with
718
- * ib_set_client_data().
719
- */
720
-void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
1790
+static int __ib_get_global_client_nl_info(const char *client_name,
1791
+ struct ib_client_nl_info *res)
7211792 {
722
- struct ib_client_data *context;
723
- void *ret = NULL;
724
- unsigned long flags;
1793
+ struct ib_client *client;
1794
+ unsigned long index;
1795
+ int ret = -ENOENT;
7251796
726
- spin_lock_irqsave(&device->client_data_lock, flags);
727
- list_for_each_entry(context, &device->client_data_list, list)
728
- if (context->client == client) {
729
- ret = context->data;
1797
+ down_read(&clients_rwsem);
1798
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1799
+ if (strcmp(client->name, client_name) != 0)
1800
+ continue;
1801
+ if (!client->get_global_nl_info) {
1802
+ ret = -EOPNOTSUPP;
7301803 break;
7311804 }
732
- spin_unlock_irqrestore(&device->client_data_lock, flags);
1805
+ ret = client->get_global_nl_info(res);
1806
+ if (WARN_ON(ret == -ENOENT))
1807
+ ret = -EINVAL;
1808
+ if (!ret && res->cdev)
1809
+ get_device(res->cdev);
1810
+ break;
1811
+ }
1812
+ up_read(&clients_rwsem);
1813
+ return ret;
1814
+}
1815
+
1816
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
1817
+ const char *client_name,
1818
+ struct ib_client_nl_info *res)
1819
+{
1820
+ unsigned long index;
1821
+ void *client_data;
1822
+ int ret = -ENOENT;
1823
+
1824
+ down_read(&ibdev->client_data_rwsem);
1825
+ xan_for_each_marked (&ibdev->client_data, index, client_data,
1826
+ CLIENT_DATA_REGISTERED) {
1827
+ struct ib_client *client = xa_load(&clients, index);
1828
+
1829
+ if (!client || strcmp(client->name, client_name) != 0)
1830
+ continue;
1831
+ if (!client->get_nl_info) {
1832
+ ret = -EOPNOTSUPP;
1833
+ break;
1834
+ }
1835
+ ret = client->get_nl_info(ibdev, client_data, res);
1836
+ if (WARN_ON(ret == -ENOENT))
1837
+ ret = -EINVAL;
1838
+
1839
+ /*
1840
+ * The cdev is guaranteed valid as long as we are inside the
1841
+ * client_data_rwsem, since remove_one can't be called. Keep it
1842
+ * valid for the caller.
1843
+ */
1844
+ if (!ret && res->cdev)
1845
+ get_device(res->cdev);
1846
+ break;
1847
+ }
1848
+ up_read(&ibdev->client_data_rwsem);
7331849
7341850 return ret;
7351851 }
736
-EXPORT_SYMBOL(ib_get_client_data);
1852
+
1853
+/**
1854
+ * ib_get_client_nl_info - Fetch the nl_info from a client
1855
+ * @ibdev: IB device to query, or NULL to query the client's global info
1856
+ * @client_name: Name of the client
1857
+ * @res: Result of the query
1858
+ */
1859
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
1860
+ struct ib_client_nl_info *res)
1861
+{
1862
+ int ret;
1863
+
1864
+ if (ibdev)
1865
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
1866
+ else
1867
+ ret = __ib_get_global_client_nl_info(client_name, res);
1868
+#ifdef CONFIG_MODULES
1869
+ if (ret == -ENOENT) {
1870
+ request_module("rdma-client-%s", client_name);
1871
+ if (ibdev)
1872
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
1873
+ else
1874
+ ret = __ib_get_global_client_nl_info(client_name, res);
1875
+ }
1876
+#endif
1877
+ if (ret) {
1878
+ if (ret == -ENOENT)
1879
+ return -EOPNOTSUPP;
1880
+ return ret;
1881
+ }
1882
+
1883
+ if (WARN_ON(!res->cdev))
1884
+ return -EINVAL;
1885
+ return 0;
1886
+}
7371887
7381888 /**
7391889 * ib_set_client_data - Set IB client context
....@@ -741,27 +1891,22 @@
7411891 * @client:Client to set context for
7421892 * @data:Context to set
7431893 *
744
- * ib_set_client_data() sets client context that can be retrieved with
745
- * ib_get_client_data().
1894
+ * ib_set_client_data() sets client context data that can be retrieved with
1895
+ * ib_get_client_data(). This can only be called while the client is
1896
+ * registered to the device, once the ib_client remove() callback returns this
1897
+ * cannot be called.
7461898 */
7471899 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
7481900 void *data)
7491901 {
750
- struct ib_client_data *context;
751
- unsigned long flags;
1902
+ void *rc;
7521903
753
- spin_lock_irqsave(&device->client_data_lock, flags);
754
- list_for_each_entry(context, &device->client_data_list, list)
755
- if (context->client == client) {
756
- context->data = data;
757
- goto out;
758
- }
1904
+ if (WARN_ON(IS_ERR(data)))
1905
+ data = NULL;
7591906
760
- pr_warn("No client context found for %s/%s\n",
761
- device->name, client->name);
762
-
763
-out:
764
- spin_unlock_irqrestore(&device->client_data_lock, flags);
1907
+ rc = xa_store(&device->client_data, client->client_id, data,
1908
+ GFP_KERNEL);
1909
+ WARN_ON(xa_is_err(rc));
7651910 }
7661911 EXPORT_SYMBOL(ib_set_client_data);
7671912
....@@ -771,17 +1916,15 @@
7711916 *
7721917 * ib_register_event_handler() registers an event handler that will be
7731918 * called back when asynchronous IB events occur (as defined in
774
- * chapter 11 of the InfiniBand Architecture Specification). This
775
- * callback may occur in interrupt context.
1919
+ * chapter 11 of the InfiniBand Architecture Specification). This
1920
+ * callback occurs in workqueue context.
7761921 */
7771922 void ib_register_event_handler(struct ib_event_handler *event_handler)
7781923 {
779
- unsigned long flags;
780
-
781
- spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1924
+ down_write(&event_handler->device->event_handler_rwsem);
7821925 list_add_tail(&event_handler->list,
7831926 &event_handler->device->event_handler_list);
784
- spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1927
+ up_write(&event_handler->device->event_handler_rwsem);
7851928 }
7861929 EXPORT_SYMBOL(ib_register_event_handler);
7871930
....@@ -794,35 +1937,87 @@
7941937 */
7951938 void ib_unregister_event_handler(struct ib_event_handler *event_handler)
7961939 {
797
- unsigned long flags;
798
-
799
- spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1940
+ down_write(&event_handler->device->event_handler_rwsem);
8001941 list_del(&event_handler->list);
801
- spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1942
+ up_write(&event_handler->device->event_handler_rwsem);
8021943 }
8031944 EXPORT_SYMBOL(ib_unregister_event_handler);
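/*
 * Illustrative sketch, not part of this patch: registering an asynchronous
 * event handler, typically once per device from a client's add() callback.
 * INIT_IB_EVENT_HANDLER() is assumed to be the initializer provided by
 * <rdma/ib_verbs.h>; the example_* names are hypothetical. Per the updated
 * comment above, the handler is invoked from workqueue context.
 */
static void example_event_handler(struct ib_event_handler *handler,
				  struct ib_event *event)
{
	pr_debug("async event %d on %s\n", event->event,
		 dev_name(&event->device->dev));
}

static struct ib_event_handler example_handler;	/* one per device in real code */

static void example_setup_events(struct ib_device *device)
{
	INIT_IB_EVENT_HANDLER(&example_handler, device, example_event_handler);
	ib_register_event_handler(&example_handler);
	/* ...and later: ib_unregister_event_handler(&example_handler); */
}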
8041945
805
-/**
806
- * ib_dispatch_event - Dispatch an asynchronous event
807
- * @event:Event to dispatch
808
- *
809
- * Low-level drivers must call ib_dispatch_event() to dispatch the
810
- * event to all registered event handlers when an asynchronous event
811
- * occurs.
812
- */
813
-void ib_dispatch_event(struct ib_event *event)
1946
+void ib_dispatch_event_clients(struct ib_event *event)
8141947 {
815
- unsigned long flags;
8161948 struct ib_event_handler *handler;
8171949
818
- spin_lock_irqsave(&event->device->event_handler_lock, flags);
1950
+ down_read(&event->device->event_handler_rwsem);
8191951
8201952 list_for_each_entry(handler, &event->device->event_handler_list, list)
8211953 handler->handler(handler, event);
8221954
823
- spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
1955
+ up_read(&event->device->event_handler_rwsem);
8241956 }
825
-EXPORT_SYMBOL(ib_dispatch_event);
1957
+
1958
+static int iw_query_port(struct ib_device *device,
1959
+ u8 port_num,
1960
+ struct ib_port_attr *port_attr)
1961
+{
1962
+ struct in_device *inetdev;
1963
+ struct net_device *netdev;
1964
+
1965
+ memset(port_attr, 0, sizeof(*port_attr));
1966
+
1967
+ netdev = ib_device_get_netdev(device, port_num);
1968
+ if (!netdev)
1969
+ return -ENODEV;
1970
+
1971
+ port_attr->max_mtu = IB_MTU_4096;
1972
+ port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
1973
+
1974
+ if (!netif_carrier_ok(netdev)) {
1975
+ port_attr->state = IB_PORT_DOWN;
1976
+ port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
1977
+ } else {
1978
+ rcu_read_lock();
1979
+ inetdev = __in_dev_get_rcu(netdev);
1980
+
1981
+ if (inetdev && inetdev->ifa_list) {
1982
+ port_attr->state = IB_PORT_ACTIVE;
1983
+ port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
1984
+ } else {
1985
+ port_attr->state = IB_PORT_INIT;
1986
+ port_attr->phys_state =
1987
+ IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
1988
+ }
1989
+
1990
+ rcu_read_unlock();
1991
+ }
1992
+
1993
+ dev_put(netdev);
1994
+ return device->ops.query_port(device, port_num, port_attr);
1995
+}
1996
+
1997
+static int __ib_query_port(struct ib_device *device,
1998
+ u8 port_num,
1999
+ struct ib_port_attr *port_attr)
2000
+{
2001
+ union ib_gid gid = {};
2002
+ int err;
2003
+
2004
+ memset(port_attr, 0, sizeof(*port_attr));
2005
+
2006
+ err = device->ops.query_port(device, port_num, port_attr);
2007
+ if (err || port_attr->subnet_prefix)
2008
+ return err;
2009
+
2010
+ if (rdma_port_get_link_layer(device, port_num) !=
2011
+ IB_LINK_LAYER_INFINIBAND)
2012
+ return 0;
2013
+
2014
+ err = device->ops.query_gid(device, port_num, 0, &gid);
2015
+ if (err)
2016
+ return err;
2017
+
2018
+ port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
2019
+ return 0;
2020
+}
8262021
8272022 /**
8282023 * ib_query_port - Query IB port attributes
....@@ -837,28 +2032,197 @@
8372032 u8 port_num,
8382033 struct ib_port_attr *port_attr)
8392034 {
840
- union ib_gid gid;
841
- int err;
842
-
8432035 if (!rdma_is_port_valid(device, port_num))
8442036 return -EINVAL;
8452037
846
- memset(port_attr, 0, sizeof(*port_attr));
847
- err = device->query_port(device, port_num, port_attr);
848
- if (err || port_attr->subnet_prefix)
849
- return err;
850
-
851
- if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
852
- return 0;
853
-
854
- err = device->query_gid(device, port_num, 0, &gid);
855
- if (err)
856
- return err;
857
-
858
- port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
859
- return 0;
2038
+ if (rdma_protocol_iwarp(device, port_num))
2039
+ return iw_query_port(device, port_num, port_attr);
2040
+ else
2041
+ return __ib_query_port(device, port_num, port_attr);
8602042 }
8612043 EXPORT_SYMBOL(ib_query_port);
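/*
 * Illustrative sketch, not part of this patch: a simple consumer of
 * ib_query_port(). example_port_is_active() is hypothetical; the attribute
 * and state names come from the code above.
 */
static bool example_port_is_active(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr attr;

	if (ib_query_port(device, port_num, &attr))
		return false;

	return attr.state == IB_PORT_ACTIVE;
}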
2044
+
2045
+static void add_ndev_hash(struct ib_port_data *pdata)
2046
+{
2047
+ unsigned long flags;
2048
+
2049
+ might_sleep();
2050
+
2051
+ spin_lock_irqsave(&ndev_hash_lock, flags);
2052
+ if (hash_hashed(&pdata->ndev_hash_link)) {
2053
+ hash_del_rcu(&pdata->ndev_hash_link);
2054
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
2055
+ /*
2056
+ * We cannot do hash_add_rcu after a hash_del_rcu until the
2057
+ * grace period
2058
+ */
2059
+ synchronize_rcu();
2060
+ spin_lock_irqsave(&ndev_hash_lock, flags);
2061
+ }
2062
+ if (pdata->netdev)
2063
+ hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
2064
+ (uintptr_t)pdata->netdev);
2065
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
2066
+}
2067
+
2068
+/**
2069
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
2070
+ * @ib_dev: Device to modify
2071
+ * @ndev: net_device to affiliate, may be NULL
2072
+ * @port: IB port the net_device is connected to
2073
+ *
2074
+ * Drivers should use this to link the ib_device to a netdev so the netdev
2075
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
2076
+ * affiliated with any port.
2077
+ *
2078
+ * The caller must ensure that the given ndev is not unregistered or
2079
+ * unregistering, and that either the ib_device is unregistered or
2080
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
2081
+ * NETDEV_UNREGISTER event.
2082
+ */
2083
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
2084
+ unsigned int port)
2085
+{
2086
+ struct net_device *old_ndev;
2087
+ struct ib_port_data *pdata;
2088
+ unsigned long flags;
2089
+ int ret;
2090
+
2091
+ /*
2092
+ * Drivers wish to call this before ib_register_driver, so we have to
2093
+ * setup the port data early.
2094
+ */
2095
+ ret = alloc_port_data(ib_dev);
2096
+ if (ret)
2097
+ return ret;
2098
+
2099
+ if (!rdma_is_port_valid(ib_dev, port))
2100
+ return -EINVAL;
2101
+
2102
+ pdata = &ib_dev->port_data[port];
2103
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
2104
+ old_ndev = rcu_dereference_protected(
2105
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2106
+ if (old_ndev == ndev) {
2107
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2108
+ return 0;
2109
+ }
2110
+
2111
+ if (ndev)
2112
+ dev_hold(ndev);
2113
+ rcu_assign_pointer(pdata->netdev, ndev);
2114
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2115
+
2116
+ add_ndev_hash(pdata);
2117
+ if (old_ndev)
2118
+ dev_put(old_ndev);
2119
+
2120
+ return 0;
2121
+}
2122
+EXPORT_SYMBOL(ib_device_set_netdev);
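/*
 * Illustrative sketch, not part of this patch: how a RoCE-style driver might
 * keep the affiliation in sync with its netdev, clearing it with NULL on
 * NETDEV_UNREGISTER as the kernel-doc above requires. The example_* names are
 * hypothetical; port 1 is assumed to be the only port.
 */
static int example_bind_netdev(struct ib_device *ib_dev, struct net_device *ndev)
{
	return ib_device_set_netdev(ib_dev, ndev, 1);
}

static void example_netdev_going_away(struct ib_device *ib_dev)
{
	/* Drop the affiliation before the netdev disappears. */
	ib_device_set_netdev(ib_dev, NULL, 1);
}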
2123
+
2124
+static void free_netdevs(struct ib_device *ib_dev)
2125
+{
2126
+ unsigned long flags;
2127
+ unsigned int port;
2128
+
2129
+ if (!ib_dev->port_data)
2130
+ return;
2131
+
2132
+ rdma_for_each_port (ib_dev, port) {
2133
+ struct ib_port_data *pdata = &ib_dev->port_data[port];
2134
+ struct net_device *ndev;
2135
+
2136
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
2137
+ ndev = rcu_dereference_protected(
2138
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2139
+ if (ndev) {
2140
+ spin_lock(&ndev_hash_lock);
2141
+ hash_del_rcu(&pdata->ndev_hash_link);
2142
+ spin_unlock(&ndev_hash_lock);
2143
+
2144
+ /*
2145
+ * If this is the last dev_put there is still a
2146
+ * synchronize_rcu before the netdev is kfreed, so we
2147
+ * can continue to rely on unlocked pointer
2148
+ * comparisons after the put
2149
+ */
2150
+ rcu_assign_pointer(pdata->netdev, NULL);
2151
+ dev_put(ndev);
2152
+ }
2153
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2154
+ }
2155
+}
2156
+
2157
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
2158
+ unsigned int port)
2159
+{
2160
+ struct ib_port_data *pdata;
2161
+ struct net_device *res;
2162
+
2163
+ if (!rdma_is_port_valid(ib_dev, port))
2164
+ return NULL;
2165
+
2166
+ pdata = &ib_dev->port_data[port];
2167
+
2168
+ /*
2169
+ * New drivers should use ib_device_set_netdev() not the legacy
2170
+ * get_netdev().
2171
+ */
2172
+ if (ib_dev->ops.get_netdev)
2173
+ res = ib_dev->ops.get_netdev(ib_dev, port);
2174
+ else {
2175
+ spin_lock(&pdata->netdev_lock);
2176
+ res = rcu_dereference_protected(
2177
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2178
+ if (res)
2179
+ dev_hold(res);
2180
+ spin_unlock(&pdata->netdev_lock);
2181
+ }
2182
+
2183
+ /*
2184
+ * If we are starting to unregister, expedite things by preventing
2185
+ * propagation of an unregistering netdev.
2186
+ */
2187
+ if (res && res->reg_state != NETREG_REGISTERED) {
2188
+ dev_put(res);
2189
+ return NULL;
2190
+ }
2191
+
2192
+ return res;
2193
+}
2194
+
2195
+/**
2196
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
2197
+ * @ndev: netdev to locate
2198
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
2199
+ *
2200
+ * Find and hold an ib_device that is associated with a netdev via
2201
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
2202
+ * returned pointer.
2203
+ */
2204
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
2205
+ enum rdma_driver_id driver_id)
2206
+{
2207
+ struct ib_device *res = NULL;
2208
+ struct ib_port_data *cur;
2209
+
2210
+ rcu_read_lock();
2211
+ hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
2212
+ (uintptr_t)ndev) {
2213
+ if (rcu_access_pointer(cur->netdev) == ndev &&
2214
+ (driver_id == RDMA_DRIVER_UNKNOWN ||
2215
+ cur->ib_dev->ops.driver_id == driver_id) &&
2216
+ ib_device_try_get(cur->ib_dev)) {
2217
+ res = cur->ib_dev;
2218
+ break;
2219
+ }
2220
+ }
2221
+ rcu_read_unlock();
2222
+
2223
+ return res;
2224
+}
2225
+EXPORT_SYMBOL(ib_device_get_by_netdev);
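/*
 * Illustrative sketch, not part of this patch: resolving a netdev back to its
 * ib_device and dropping the reference with ib_device_put(), as the
 * kernel-doc above requires. example_log_owner() is hypothetical.
 */
static void example_log_owner(struct net_device *ndev)
{
	struct ib_device *ib_dev;

	ib_dev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ib_dev)
		return;

	pr_debug("%s is affiliated with %s\n", ndev->name,
		 dev_name(&ib_dev->dev));
	ib_device_put(ib_dev);
}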
8622226
8632227 /**
8642228 * ib_enum_roce_netdev - enumerate all RoCE ports
....@@ -878,21 +2242,12 @@
8782242 roce_netdev_callback cb,
8792243 void *cookie)
8802244 {
881
- u8 port;
2245
+ unsigned int port;
8822246
883
- for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
884
- port++)
2247
+ rdma_for_each_port (ib_dev, port)
8852248 if (rdma_protocol_roce(ib_dev, port)) {
886
- struct net_device *idev = NULL;
887
-
888
- if (ib_dev->get_netdev)
889
- idev = ib_dev->get_netdev(ib_dev, port);
890
-
891
- if (idev &&
892
- idev->reg_state >= NETREG_UNREGISTERED) {
893
- dev_put(idev);
894
- idev = NULL;
895
- }
2249
+ struct net_device *idev =
2250
+ ib_device_get_netdev(ib_dev, port);
8962251
8972252 if (filter(ib_dev, port, idev, filter_cookie))
8982253 cb(ib_dev, port, idev, cookie);
....@@ -919,11 +2274,12 @@
9192274 void *cookie)
9202275 {
9212276 struct ib_device *dev;
2277
+ unsigned long index;
9222278
923
- down_read(&lists_rwsem);
924
- list_for_each_entry(dev, &device_list, core_list)
2279
+ down_read(&devices_rwsem);
2280
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
9252281 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
926
- up_read(&lists_rwsem);
2282
+ up_read(&devices_rwsem);
9272283 }
9282284
9292285 /**
....@@ -935,19 +2291,22 @@
9352291 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
9362292 struct netlink_callback *cb)
9372293 {
2294
+ unsigned long index;
9382295 struct ib_device *dev;
9392296 unsigned int idx = 0;
9402297 int ret = 0;
9412298
942
- down_read(&lists_rwsem);
943
- list_for_each_entry(dev, &device_list, core_list) {
2299
+ down_read(&devices_rwsem);
2300
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
2301
+ if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
2302
+ continue;
2303
+
9442304 ret = nldev_cb(dev, skb, cb, idx);
9452305 if (ret)
9462306 break;
9472307 idx++;
9482308 }
949
-
950
- up_read(&lists_rwsem);
2309
+ up_read(&devices_rwsem);
9512310 return ret;
9522311 }
9532312
....@@ -963,7 +2322,13 @@
9632322 int ib_query_pkey(struct ib_device *device,
9642323 u8 port_num, u16 index, u16 *pkey)
9652324 {
966
- return device->query_pkey(device, port_num, index, pkey);
2325
+ if (!rdma_is_port_valid(device, port_num))
2326
+ return -EINVAL;
2327
+
2328
+ if (!device->ops.query_pkey)
2329
+ return -EOPNOTSUPP;
2330
+
2331
+ return device->ops.query_pkey(device, port_num, index, pkey);
9672332 }
9682333 EXPORT_SYMBOL(ib_query_pkey);
9692334
....@@ -980,11 +2345,11 @@
9802345 int device_modify_mask,
9812346 struct ib_device_modify *device_modify)
9822347 {
983
- if (!device->modify_device)
984
- return -ENOSYS;
2348
+ if (!device->ops.modify_device)
2349
+ return -EOPNOTSUPP;
9852350
986
- return device->modify_device(device, device_modify_mask,
987
- device_modify);
2351
+ return device->ops.modify_device(device, device_modify_mask,
2352
+ device_modify);
9882353 }
9892354 EXPORT_SYMBOL(ib_modify_device);
9902355
....@@ -1008,11 +2373,16 @@
10082373 if (!rdma_is_port_valid(device, port_num))
10092374 return -EINVAL;
10102375
1011
- if (device->modify_port)
1012
- rc = device->modify_port(device, port_num, port_modify_mask,
1013
- port_modify);
2376
+ if (device->ops.modify_port)
2377
+ rc = device->ops.modify_port(device, port_num,
2378
+ port_modify_mask,
2379
+ port_modify);
2380
+ else if (rdma_protocol_roce(device, port_num) &&
2381
+ ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
2382
+ (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
2383
+ rc = 0;
10142384 else
1015
- rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
2385
+ rc = -EOPNOTSUPP;
10162386 return rc;
10172387 }
10182388 EXPORT_SYMBOL(ib_modify_port);
....@@ -1030,13 +2400,15 @@
10302400 u8 *port_num, u16 *index)
10312401 {
10322402 union ib_gid tmp_gid;
1033
- int ret, port, i;
2403
+ unsigned int port;
2404
+ int ret, i;
10342405
1035
- for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
2406
+ rdma_for_each_port (device, port) {
10362407 if (!rdma_protocol_ib(device, port))
10372408 continue;
10382409
1039
- for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
2410
+ for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
2411
+ ++i) {
10402412 ret = rdma_query_gid(device, port, i, &tmp_gid);
10412413 if (ret)
10422414 continue;
....@@ -1069,7 +2441,8 @@
10692441 u16 tmp_pkey;
10702442 int partial_ix = -1;
10712443
1072
- for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
2444
+ for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
2445
+ ++i) {
10732446 ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
10742447 if (ret)
10752448 return ret;
....@@ -1102,6 +2475,7 @@
11022475 * @gid: A GID that the net_dev uses to communicate.
11032476 * @addr: Contains the IP address that the request specified as its
11042477 * destination.
2478
+ *
11052479 */
11062480 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
11072481 u8 port,
....@@ -1110,33 +2484,201 @@
11102484 const struct sockaddr *addr)
11112485 {
11122486 struct net_device *net_dev = NULL;
1113
- struct ib_client_data *context;
2487
+ unsigned long index;
2488
+ void *client_data;
11142489
11152490 if (!rdma_protocol_ib(dev, port))
11162491 return NULL;
11172492
1118
- down_read(&lists_rwsem);
2493
+ /*
2494
+ * Holding the read side guarantees that the client will not become
2495
+ * unregistered while we are calling get_net_dev_by_params()
2496
+ */
2497
+ down_read(&dev->client_data_rwsem);
2498
+ xan_for_each_marked (&dev->client_data, index, client_data,
2499
+ CLIENT_DATA_REGISTERED) {
2500
+ struct ib_client *client = xa_load(&clients, index);
11192501
1120
- list_for_each_entry(context, &dev->client_data_list, list) {
1121
- struct ib_client *client = context->client;
1122
-
1123
- if (context->going_down)
2502
+ if (!client || !client->get_net_dev_by_params)
11242503 continue;
11252504
1126
- if (client->get_net_dev_by_params) {
1127
- net_dev = client->get_net_dev_by_params(dev, port, pkey,
1128
- gid, addr,
1129
- context->data);
1130
- if (net_dev)
1131
- break;
1132
- }
2505
+ net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
2506
+ addr, client_data);
2507
+ if (net_dev)
2508
+ break;
11332509 }
1134
-
1135
- up_read(&lists_rwsem);
2510
+ up_read(&dev->client_data_rwsem);
11362511
11372512 return net_dev;
11382513 }
11392514 EXPORT_SYMBOL(ib_get_net_dev_by_params);
2515
+
2516
+void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
2517
+{
2518
+ struct ib_device_ops *dev_ops = &dev->ops;
2519
+#define SET_DEVICE_OP(ptr, name) \
2520
+ do { \
2521
+ if (ops->name) \
2522
+ if (!((ptr)->name)) \
2523
+ (ptr)->name = ops->name; \
2524
+ } while (0)
2525
+
2526
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
2527
+
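	/*
	 * Note on the helpers above: SET_DEVICE_OP() copies ops->name into
	 * dev->ops only when the driver actually supplies it and the core has
	 * not already set it, so an op installed by an earlier
	 * ib_set_device_ops() call is never overwritten. SET_OBJ_SIZE() does
	 * the same for the size_* fields used to allocate driver-private
	 * object structures.
	 */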
2528
+ if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
2529
+ WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
2530
+ dev_ops->driver_id != ops->driver_id);
2531
+ dev_ops->driver_id = ops->driver_id;
2532
+ }
2533
+ if (ops->owner) {
2534
+ WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
2535
+ dev_ops->owner = ops->owner;
2536
+ }
2537
+ if (ops->uverbs_abi_ver)
2538
+ dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
2539
+
2540
+ dev_ops->uverbs_no_driver_id_binding |=
2541
+ ops->uverbs_no_driver_id_binding;
2542
+
2543
+ SET_DEVICE_OP(dev_ops, add_gid);
2544
+ SET_DEVICE_OP(dev_ops, advise_mr);
2545
+ SET_DEVICE_OP(dev_ops, alloc_dm);
2546
+ SET_DEVICE_OP(dev_ops, alloc_hw_stats);
2547
+ SET_DEVICE_OP(dev_ops, alloc_mr);
2548
+ SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
2549
+ SET_DEVICE_OP(dev_ops, alloc_mw);
2550
+ SET_DEVICE_OP(dev_ops, alloc_pd);
2551
+ SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
2552
+ SET_DEVICE_OP(dev_ops, alloc_ucontext);
2553
+ SET_DEVICE_OP(dev_ops, alloc_xrcd);
2554
+ SET_DEVICE_OP(dev_ops, attach_mcast);
2555
+ SET_DEVICE_OP(dev_ops, check_mr_status);
2556
+ SET_DEVICE_OP(dev_ops, counter_alloc_stats);
2557
+ SET_DEVICE_OP(dev_ops, counter_bind_qp);
2558
+ SET_DEVICE_OP(dev_ops, counter_dealloc);
2559
+ SET_DEVICE_OP(dev_ops, counter_unbind_qp);
2560
+ SET_DEVICE_OP(dev_ops, counter_update_stats);
2561
+ SET_DEVICE_OP(dev_ops, create_ah);
2562
+ SET_DEVICE_OP(dev_ops, create_counters);
2563
+ SET_DEVICE_OP(dev_ops, create_cq);
2564
+ SET_DEVICE_OP(dev_ops, create_flow);
2565
+ SET_DEVICE_OP(dev_ops, create_flow_action_esp);
2566
+ SET_DEVICE_OP(dev_ops, create_qp);
2567
+ SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
2568
+ SET_DEVICE_OP(dev_ops, create_srq);
2569
+ SET_DEVICE_OP(dev_ops, create_wq);
2570
+ SET_DEVICE_OP(dev_ops, dealloc_dm);
2571
+ SET_DEVICE_OP(dev_ops, dealloc_driver);
2572
+ SET_DEVICE_OP(dev_ops, dealloc_mw);
2573
+ SET_DEVICE_OP(dev_ops, dealloc_pd);
2574
+ SET_DEVICE_OP(dev_ops, dealloc_ucontext);
2575
+ SET_DEVICE_OP(dev_ops, dealloc_xrcd);
2576
+ SET_DEVICE_OP(dev_ops, del_gid);
2577
+ SET_DEVICE_OP(dev_ops, dereg_mr);
2578
+ SET_DEVICE_OP(dev_ops, destroy_ah);
2579
+ SET_DEVICE_OP(dev_ops, destroy_counters);
2580
+ SET_DEVICE_OP(dev_ops, destroy_cq);
2581
+ SET_DEVICE_OP(dev_ops, destroy_flow);
2582
+ SET_DEVICE_OP(dev_ops, destroy_flow_action);
2583
+ SET_DEVICE_OP(dev_ops, destroy_qp);
2584
+ SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
2585
+ SET_DEVICE_OP(dev_ops, destroy_srq);
2586
+ SET_DEVICE_OP(dev_ops, destroy_wq);
2587
+ SET_DEVICE_OP(dev_ops, detach_mcast);
2588
+ SET_DEVICE_OP(dev_ops, disassociate_ucontext);
2589
+ SET_DEVICE_OP(dev_ops, drain_rq);
2590
+ SET_DEVICE_OP(dev_ops, drain_sq);
2591
+ SET_DEVICE_OP(dev_ops, enable_driver);
2592
+ SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
2593
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
2594
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
2595
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
2596
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
2597
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
2598
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
2599
+ SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
2600
+ SET_DEVICE_OP(dev_ops, get_dev_fw_str);
2601
+ SET_DEVICE_OP(dev_ops, get_dma_mr);
2602
+ SET_DEVICE_OP(dev_ops, get_hw_stats);
2603
+ SET_DEVICE_OP(dev_ops, get_link_layer);
2604
+ SET_DEVICE_OP(dev_ops, get_netdev);
2605
+ SET_DEVICE_OP(dev_ops, get_port_immutable);
2606
+ SET_DEVICE_OP(dev_ops, get_vector_affinity);
2607
+ SET_DEVICE_OP(dev_ops, get_vf_config);
2608
+ SET_DEVICE_OP(dev_ops, get_vf_guid);
2609
+ SET_DEVICE_OP(dev_ops, get_vf_stats);
2610
+ SET_DEVICE_OP(dev_ops, init_port);
2611
+ SET_DEVICE_OP(dev_ops, iw_accept);
2612
+ SET_DEVICE_OP(dev_ops, iw_add_ref);
2613
+ SET_DEVICE_OP(dev_ops, iw_connect);
2614
+ SET_DEVICE_OP(dev_ops, iw_create_listen);
2615
+ SET_DEVICE_OP(dev_ops, iw_destroy_listen);
2616
+ SET_DEVICE_OP(dev_ops, iw_get_qp);
2617
+ SET_DEVICE_OP(dev_ops, iw_reject);
2618
+ SET_DEVICE_OP(dev_ops, iw_rem_ref);
2619
+ SET_DEVICE_OP(dev_ops, map_mr_sg);
2620
+ SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
2621
+ SET_DEVICE_OP(dev_ops, mmap);
2622
+ SET_DEVICE_OP(dev_ops, mmap_free);
2623
+ SET_DEVICE_OP(dev_ops, modify_ah);
2624
+ SET_DEVICE_OP(dev_ops, modify_cq);
2625
+ SET_DEVICE_OP(dev_ops, modify_device);
2626
+ SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
2627
+ SET_DEVICE_OP(dev_ops, modify_port);
2628
+ SET_DEVICE_OP(dev_ops, modify_qp);
2629
+ SET_DEVICE_OP(dev_ops, modify_srq);
2630
+ SET_DEVICE_OP(dev_ops, modify_wq);
2631
+ SET_DEVICE_OP(dev_ops, peek_cq);
2632
+ SET_DEVICE_OP(dev_ops, poll_cq);
2633
+ SET_DEVICE_OP(dev_ops, post_recv);
2634
+ SET_DEVICE_OP(dev_ops, post_send);
2635
+ SET_DEVICE_OP(dev_ops, post_srq_recv);
2636
+ SET_DEVICE_OP(dev_ops, process_mad);
2637
+ SET_DEVICE_OP(dev_ops, query_ah);
2638
+ SET_DEVICE_OP(dev_ops, query_device);
2639
+ SET_DEVICE_OP(dev_ops, query_gid);
2640
+ SET_DEVICE_OP(dev_ops, query_pkey);
2641
+ SET_DEVICE_OP(dev_ops, query_port);
2642
+ SET_DEVICE_OP(dev_ops, query_qp);
2643
+ SET_DEVICE_OP(dev_ops, query_srq);
2644
+ SET_DEVICE_OP(dev_ops, query_ucontext);
2645
+ SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
2646
+ SET_DEVICE_OP(dev_ops, read_counters);
2647
+ SET_DEVICE_OP(dev_ops, reg_dm_mr);
2648
+ SET_DEVICE_OP(dev_ops, reg_user_mr);
2649
+ SET_DEVICE_OP(dev_ops, req_ncomp_notif);
2650
+ SET_DEVICE_OP(dev_ops, req_notify_cq);
2651
+ SET_DEVICE_OP(dev_ops, rereg_user_mr);
2652
+ SET_DEVICE_OP(dev_ops, resize_cq);
2653
+ SET_DEVICE_OP(dev_ops, set_vf_guid);
2654
+ SET_DEVICE_OP(dev_ops, set_vf_link_state);
2655
+
2656
+ SET_OBJ_SIZE(dev_ops, ib_ah);
2657
+ SET_OBJ_SIZE(dev_ops, ib_counters);
2658
+ SET_OBJ_SIZE(dev_ops, ib_cq);
2659
+ SET_OBJ_SIZE(dev_ops, ib_mw);
2660
+ SET_OBJ_SIZE(dev_ops, ib_pd);
2661
+ SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
2662
+ SET_OBJ_SIZE(dev_ops, ib_srq);
2663
+ SET_OBJ_SIZE(dev_ops, ib_ucontext);
2664
+ SET_OBJ_SIZE(dev_ops, ib_xrcd);
2665
+}
2666
+EXPORT_SYMBOL(ib_set_device_ops);
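/*
 * Illustrative sketch, not part of this patch: a driver describes its ops in
 * a static const table and installs it with ib_set_device_ops(), typically
 * right after allocating the ib_device. The example_* names are hypothetical,
 * RDMA_DRIVER_UNKNOWN only stands in for the driver's real driver_id, and
 * only members handled by the SET_DEVICE_OP() list above are used.
 */
static int example_query_port(struct ib_device *ibdev, u8 port_num,
			      struct ib_port_attr *attr)
{
	/* A real driver fills *attr from its hardware state. */
	memset(attr, 0, sizeof(*attr));
	attr->state = IB_PORT_ACTIVE;
	return 0;
}

static void example_dealloc_driver(struct ib_device *ibdev)
{
	/* Release driver-private state hung off ibdev here. */
}

static const struct ib_device_ops example_dev_ops = {
	.owner		= THIS_MODULE,
	.driver_id	= RDMA_DRIVER_UNKNOWN,
	.dealloc_driver	= example_dealloc_driver,
	.query_port	= example_query_port,
};

static void example_init_device(struct ib_device *ib_dev)
{
	ib_set_device_ops(ib_dev, &example_dev_ops);
}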
2667
+
2668
+#ifdef CONFIG_INFINIBAND_VIRT_DMA
2669
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
2670
+{
2671
+ struct scatterlist *s;
2672
+ int i;
2673
+
2674
+ for_each_sg(sg, s, nents, i) {
2675
+ sg_dma_address(s) = (uintptr_t)sg_virt(s);
2676
+ sg_dma_len(s) = s->length;
2677
+ }
2678
+ return nents;
2679
+}
2680
+EXPORT_SYMBOL(ib_dma_virt_map_sg);
2681
+#endif /* CONFIG_INFINIBAND_VIRT_DMA */
11402682
11412683 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
11422684 [RDMA_NL_LS_OP_RESOLVE] = {
....@@ -1183,15 +2725,11 @@
11832725 goto err_comp_unbound;
11842726 }
11852727
1186
- ret = rdma_nl_init();
1187
- if (ret) {
1188
- pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
1189
- goto err_sysfs;
1190
- }
2728
+ rdma_nl_init();
11912729
11922730 ret = addr_init();
11932731 if (ret) {
1194
- pr_warn("Could't init IB address resolution\n");
2732
+ pr_warn("Couldn't init IB address resolution\n");
11952733 goto err_ibnl;
11962734 }
11972735
....@@ -1207,18 +2745,34 @@
12072745 goto err_mad;
12082746 }
12092747
1210
- ret = register_lsm_notifier(&ibdev_lsm_nb);
2748
+ ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
12112749 if (ret) {
12122750 pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
12132751 goto err_sa;
12142752 }
12152753
2754
+ ret = register_pernet_device(&rdma_dev_net_ops);
2755
+ if (ret) {
2756
+ pr_warn("Couldn't init compat dev. ret %d\n", ret);
2757
+ goto err_compat;
2758
+ }
2759
+
12162760 nldev_init();
12172761 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
1218
- roce_gid_mgmt_init();
2762
+ ret = roce_gid_mgmt_init();
2763
+ if (ret) {
2764
+ pr_warn("Couldn't init RoCE GID management\n");
2765
+ goto err_parent;
2766
+ }
12192767
12202768 return 0;
12212769
2770
+err_parent:
2771
+ rdma_nl_unregister(RDMA_NL_LS);
2772
+ nldev_exit();
2773
+ unregister_pernet_device(&rdma_dev_net_ops);
2774
+err_compat:
2775
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
12222776 err_sa:
12232777 ib_sa_cleanup();
12242778 err_mad:
....@@ -1226,8 +2780,6 @@
12262780 err_addr:
12272781 addr_cleanup();
12282782 err_ibnl:
1229
- rdma_nl_exit();
1230
-err_sysfs:
12312783 class_unregister(&ib_class);
12322784 err_comp_unbound:
12332785 destroy_workqueue(ib_comp_unbound_wq);
....@@ -1243,7 +2795,8 @@
12432795 roce_gid_mgmt_cleanup();
12442796 nldev_exit();
12452797 rdma_nl_unregister(RDMA_NL_LS);
1246
- unregister_lsm_notifier(&ibdev_lsm_nb);
2798
+ unregister_pernet_device(&rdma_dev_net_ops);
2799
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
12472800 ib_sa_cleanup();
12482801 ib_mad_cleanup();
12492802 addr_cleanup();
....@@ -1253,9 +2806,15 @@
12532806 destroy_workqueue(ib_comp_wq);
12542807 /* Make sure that any pending umem accounting work is done. */
12552808 destroy_workqueue(ib_wq);
2809
+ flush_workqueue(system_unbound_wq);
2810
+ WARN_ON(!xa_empty(&clients));
2811
+ WARN_ON(!xa_empty(&devices));
12562812 }
12572813
12582814 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
12592815
1260
-subsys_initcall(ib_core_init);
2816
+/* ib core relies on netdev stack to first register net_ns_type_operations
2817
+ * ns kobject type before ib_core initialization.
2818
+ */
2819
+fs_initcall(ib_core_init);
12612820 module_exit(ib_core_cleanup);