hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
....@@ -52,10 +52,6 @@
5252 #include <linux/inetdevice.h>
5353 #include <rdma/ib_cache.h>
5454
55
-#define DRV_VERSION "1.0.0"
56
-
57
-const char ipoib_driver_version[] = DRV_VERSION;
58
-
5955 MODULE_AUTHOR("Roland Dreier");
6056 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
6157 MODULE_LICENSE("Dual BSD/GPL");
....@@ -90,7 +86,7 @@
9086
9187 struct ib_sa_client ipoib_sa_client;
9288
93
-static void ipoib_add_one(struct ib_device *device);
89
+static int ipoib_add_one(struct ib_device *device);
9490 static void ipoib_remove_one(struct ib_device *device, void *client_data);
9591 static void ipoib_neigh_reclaim(struct rcu_head *rp);
9692 static struct net_device *ipoib_get_net_dev_by_params(
....@@ -167,7 +163,7 @@
167163 if (flags & IFF_UP)
168164 continue;
169165
170
- dev_change_flags(cpriv->dev, flags | IFF_UP);
166
+ dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
171167 }
172168 up_read(&priv->vlan_rwsem);
173169 }
....@@ -207,7 +203,7 @@
207203 if (!(flags & IFF_UP))
208204 continue;
209205
210
- dev_change_flags(cpriv->dev, flags & ~IFF_UP);
206
+ dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL);
211207 }
212208 up_read(&priv->vlan_rwsem);
213209 }
....@@ -346,9 +342,10 @@
346342 struct net_device *result;
347343 };
348344
349
-static int ipoib_upper_walk(struct net_device *upper, void *_data)
345
+static int ipoib_upper_walk(struct net_device *upper,
346
+ struct netdev_nested_priv *priv)
350347 {
351
- struct ipoib_walk_data *data = _data;
348
+ struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data;
352349 int ret = 0;
353350
354351 if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
....@@ -372,10 +369,12 @@
372369 static struct net_device *ipoib_get_net_dev_match_addr(
373370 const struct sockaddr *addr, struct net_device *dev)
374371 {
372
+ struct netdev_nested_priv priv;
375373 struct ipoib_walk_data data = {
376374 .addr = addr,
377375 };
378376
377
+ priv.data = (void *)&data;
379378 rcu_read_lock();
380379 if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
381380 dev_hold(dev);
....@@ -383,7 +382,7 @@
383382 goto out;
384383 }
385384
386
- netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data);
385
+ netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv);
387386 out:
388387 rcu_read_unlock();
389388 return data.result;
....@@ -483,9 +482,6 @@
483482 if (ret)
484483 return NULL;
485484
486
- if (!dev_list)
487
- return NULL;
488
-
489485 /* See if we can find a unique device matching the L2 parameters */
490486 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
491487 gid, NULL, &net_dev);
....@@ -509,7 +505,7 @@
509505 default:
510506 dev_warn_ratelimited(&dev->dev,
511507 "duplicate IP address detected\n");
512
- /* Fall through */
508
+ fallthrough;
513509 case 1:
514510 return net_dev;
515511 }
....@@ -533,6 +529,7 @@
533529 "will cause multicast packet drops\n");
534530 netdev_update_features(dev);
535531 dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
532
+ netif_set_real_num_tx_queues(dev, 1);
536533 rtnl_unlock();
537534 priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
538535
....@@ -544,6 +541,7 @@
544541 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
545542 netdev_update_features(dev);
546543 dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
544
+ netif_set_real_num_tx_queues(dev, dev->num_tx_queues);
547545 rtnl_unlock();
548546 ipoib_flush_paths(dev);
549547 return (!rtnl_trylock()) ? -EBUSY : 0;
....@@ -613,7 +611,7 @@
613611 while ((skb = __skb_dequeue(&path->queue)))
614612 dev_kfree_skb_irq(skb);
615613
616
- ipoib_dbg(ipoib_priv(dev), "path_free\n");
614
+ ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);
617615
618616 /* remove all neigh connected to this path */
619617 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
....@@ -1182,7 +1180,7 @@
11821180 return NETDEV_TX_OK;
11831181 }
11841182
1185
-static void ipoib_timeout(struct net_device *dev)
1183
+static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
11861184 {
11871185 struct ipoib_dev_priv *priv = ipoib_priv(dev);
11881186
....@@ -1643,7 +1641,7 @@
16431641 {
16441642 struct ipoib_dev_priv *priv = ipoib_priv(dev);
16451643
1646
- ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1644
+ ipoib_dbg(priv, "%s\n", __func__);
16471645 init_completion(&priv->ntbl.deleted);
16481646
16491647 cancel_delayed_work_sync(&priv->neigh_reap_task);
....@@ -1825,7 +1823,7 @@
18251823 * running ensures that it will not add more work.
18261824 */
18271825 rtnl_lock();
1828
- dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1826
+ dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
18291827 rtnl_unlock();
18301828
18311829 /* ipoib_event() cannot be running once this returns */
....@@ -1864,7 +1862,7 @@
18641862 priv->port);
18651863 return result;
18661864 }
1867
- priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1865
+ priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);
18681866
18691867 result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
18701868 if (result) {
....@@ -1897,14 +1895,22 @@
18971895
18981896 priv->max_ib_mtu = ppriv->max_ib_mtu;
18991897 set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
1900
- memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
1901
- memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
1898
+ if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
1899
+ memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
1900
+ sizeof(priv->local_gid));
1901
+ else {
1902
+ memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr,
1903
+ INFINIBAND_ALEN);
1904
+ memcpy(&priv->local_gid, &ppriv->local_gid,
1905
+ sizeof(priv->local_gid));
1906
+ }
19021907 }
19031908
19041909 static int ipoib_ndo_init(struct net_device *ndev)
19051910 {
19061911 struct ipoib_dev_priv *priv = ipoib_priv(ndev);
19071912 int rc;
1913
+ struct rdma_netdev *rn = netdev_priv(ndev);
19081914
19091915 if (priv->parent) {
19101916 ipoib_child_init(ndev);
....@@ -1917,6 +1923,7 @@
19171923 /* MTU will be reset when mcast join happens */
19181924 ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
19191925 priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
1926
+ rn->mtu = priv->mcast_mtu;
19201927 ndev->max_mtu = IPOIB_CM_MTU;
19211928
19221929 ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
....@@ -2023,6 +2030,15 @@
20232030 return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
20242031 }
20252032
2033
+static int ipoib_get_vf_guid(struct net_device *dev, int vf,
2034
+ struct ifla_vf_guid *node_guid,
2035
+ struct ifla_vf_guid *port_guid)
2036
+{
2037
+ struct ipoib_dev_priv *priv = ipoib_priv(dev);
2038
+
2039
+ return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid);
2040
+}
2041
+
20262042 static int ipoib_get_vf_stats(struct net_device *dev, int vf,
20272043 struct ifla_vf_stats *vf_stats)
20282044 {
....@@ -2049,6 +2065,7 @@
20492065 .ndo_set_vf_link_state = ipoib_set_vf_link_state,
20502066 .ndo_get_vf_config = ipoib_get_vf_config,
20512067 .ndo_get_vf_stats = ipoib_get_vf_stats,
2068
+ .ndo_get_vf_guid = ipoib_get_vf_guid,
20522069 .ndo_set_vf_guid = ipoib_set_vf_guid,
20532070 .ndo_set_mac_address = ipoib_set_mac,
20542071 .ndo_get_stats64 = ipoib_get_stats,
....@@ -2070,9 +2087,17 @@
20702087 .ndo_do_ioctl = ipoib_ioctl,
20712088 };
20722089
2090
+static const struct net_device_ops ipoib_netdev_default_pf = {
2091
+ .ndo_init = ipoib_dev_init_default,
2092
+ .ndo_uninit = ipoib_dev_uninit_default,
2093
+ .ndo_open = ipoib_ib_dev_open_default,
2094
+ .ndo_stop = ipoib_ib_dev_stop_default,
2095
+};
2096
+
20732097 void ipoib_setup_common(struct net_device *dev)
20742098 {
20752099 dev->header_ops = &ipoib_header_ops;
2100
+ dev->netdev_ops = &ipoib_netdev_default_pf;
20762101
20772102 ipoib_set_ethtool_ops(dev);
20782103
....@@ -2122,89 +2147,57 @@
21222147 INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
21232148 }
21242149
2125
-static const struct net_device_ops ipoib_netdev_default_pf = {
2126
- .ndo_init = ipoib_dev_init_default,
2127
- .ndo_uninit = ipoib_dev_uninit_default,
2128
- .ndo_open = ipoib_ib_dev_open_default,
2129
- .ndo_stop = ipoib_ib_dev_stop_default,
2130
-};
2131
-
2132
-static struct net_device
2133
-*ipoib_create_netdev_default(struct ib_device *hca,
2134
- const char *name,
2135
- unsigned char name_assign_type,
2136
- void (*setup)(struct net_device *))
2150
+static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
2151
+ const char *name)
21372152 {
21382153 struct net_device *dev;
2139
- struct rdma_netdev *rn;
21402154
2141
- dev = alloc_netdev((int)sizeof(struct rdma_netdev),
2142
- name,
2143
- name_assign_type, setup);
2155
+ dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
2156
+ NET_NAME_UNKNOWN, ipoib_setup_common);
2157
+ if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
2158
+ return dev;
2159
+
2160
+ dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
2161
+ ipoib_setup_common);
21442162 if (!dev)
2145
- return NULL;
2146
-
2147
- rn = netdev_priv(dev);
2148
-
2149
- rn->send = ipoib_send;
2150
- rn->attach_mcast = ipoib_mcast_attach;
2151
- rn->detach_mcast = ipoib_mcast_detach;
2152
- rn->hca = hca;
2153
- dev->netdev_ops = &ipoib_netdev_default_pf;
2154
-
2163
+ return ERR_PTR(-ENOMEM);
21552164 return dev;
21562165 }
21572166
2158
-static struct net_device *ipoib_get_netdev(struct ib_device *hca, u8 port,
2159
- const char *name)
2167
+int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
2168
+ struct net_device *dev)
21602169 {
2161
- struct net_device *dev;
2162
-
2163
- if (hca->alloc_rdma_netdev) {
2164
- dev = hca->alloc_rdma_netdev(hca, port,
2165
- RDMA_NETDEV_IPOIB, name,
2166
- NET_NAME_UNKNOWN,
2167
- ipoib_setup_common);
2168
- if (IS_ERR_OR_NULL(dev) && PTR_ERR(dev) != -EOPNOTSUPP)
2169
- return NULL;
2170
- }
2171
-
2172
- if (!hca->alloc_rdma_netdev || PTR_ERR(dev) == -EOPNOTSUPP)
2173
- dev = ipoib_create_netdev_default(hca, name, NET_NAME_UNKNOWN,
2174
- ipoib_setup_common);
2175
-
2176
- return dev;
2177
-}
2178
-
2179
-struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port,
2180
- const char *name)
2181
-{
2182
- struct net_device *dev;
2170
+ struct rdma_netdev *rn = netdev_priv(dev);
21832171 struct ipoib_dev_priv *priv;
2184
- struct rdma_netdev *rn;
2172
+ int rc;
21852173
21862174 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
21872175 if (!priv)
2188
- return NULL;
2176
+ return -ENOMEM;
21892177
21902178 priv->ca = hca;
21912179 priv->port = port;
21922180
2193
- dev = ipoib_get_netdev(hca, port, name);
2194
- if (!dev)
2195
- goto free_priv;
2181
+ rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
2182
+ NET_NAME_UNKNOWN, ipoib_setup_common, dev);
2183
+ if (rc) {
2184
+ if (rc != -EOPNOTSUPP)
2185
+ goto out;
2186
+
2187
+ rn->send = ipoib_send;
2188
+ rn->attach_mcast = ipoib_mcast_attach;
2189
+ rn->detach_mcast = ipoib_mcast_detach;
2190
+ rn->hca = hca;
2191
+ }
21962192
21972193 priv->rn_ops = dev->netdev_ops;
21982194
2199
- /* fixme : should be after the query_cap */
2200
- if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
2195
+ if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION)
22012196 dev->netdev_ops = &ipoib_netdev_ops_vf;
22022197 else
22032198 dev->netdev_ops = &ipoib_netdev_ops_pf;
22042199
2205
- rn = netdev_priv(dev);
22062200 rn->clnt_priv = priv;
2207
-
22082201 /*
22092202 * Only the child register_netdev flows can handle priv_destructor
22102203 * being set, so we force it to NULL here and handle manually until it
....@@ -2215,10 +2208,35 @@
22152208
22162209 ipoib_build_priv(dev);
22172210
2218
- return priv;
2219
-free_priv:
2211
+ return 0;
2212
+
2213
+out:
22202214 kfree(priv);
2221
- return NULL;
2215
+ return rc;
2216
+}
2217
+
2218
+struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
2219
+ const char *name)
2220
+{
2221
+ struct net_device *dev;
2222
+ int rc;
2223
+
2224
+ dev = ipoib_alloc_netdev(hca, port, name);
2225
+ if (IS_ERR(dev))
2226
+ return dev;
2227
+
2228
+ rc = ipoib_intf_init(hca, port, name, dev);
2229
+ if (rc) {
2230
+ free_netdev(dev);
2231
+ return ERR_PTR(rc);
2232
+ }
2233
+
2234
+ /*
2235
+ * Upon success the caller must ensure ipoib_intf_free is called or
2236
+ * register_netdevice succeeded and priv_destructor is set to
2237
+ * ipoib_intf_free.
2238
+ */
2239
+ return dev;
22222240 }
22232241
22242242 void ipoib_intf_free(struct net_device *dev)
....@@ -2398,19 +2416,62 @@
23982416 return device_create_file(&dev->dev, &dev_attr_pkey);
23992417 }
24002418
2419
+/*
2420
+ * We erroneously exposed the iface's port number in the dev_id
2421
+ * sysfs field long after dev_port was introduced for that purpose[1],
2422
+ * and we need to stop everyone from relying on that.
2423
+ * Let's overload the shower routine for the dev_id file here
2424
+ * to gently bring the issue up.
2425
+ *
2426
+ * [1] https://www.spinics.net/lists/netdev/msg272123.html
2427
+ */
2428
+static ssize_t dev_id_show(struct device *dev,
2429
+ struct device_attribute *attr, char *buf)
2430
+{
2431
+ struct net_device *ndev = to_net_dev(dev);
2432
+
2433
+ /*
2434
+ * ndev->dev_port will be equal to 0 in old kernel prior to commit
2435
+ * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
2436
+ * port numbers"). Zero was chosen as a special case for user space
2437
+ * applications to fall back and query dev_id to check if it has
2438
+ * different value or not.
2439
+ *
2440
+ * Don't print warning in such scenario.
2441
+ *
2442
+ * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
2443
+ */
2444
+ if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
2445
+ netdev_info_once(ndev,
2446
+ "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
2447
+ current->comm);
2448
+
2449
+ return sprintf(buf, "%#x\n", ndev->dev_id);
2450
+}
2451
+static DEVICE_ATTR_RO(dev_id);
2452
+
2453
+static int ipoib_intercept_dev_id_attr(struct net_device *dev)
2454
+{
2455
+ device_remove_file(&dev->dev, &dev_attr_dev_id);
2456
+ return device_create_file(&dev->dev, &dev_attr_dev_id);
2457
+}
2458
+
24012459 static struct net_device *ipoib_add_port(const char *format,
24022460 struct ib_device *hca, u8 port)
24032461 {
2462
+ struct rtnl_link_ops *ops = ipoib_get_link_ops();
2463
+ struct rdma_netdev_alloc_params params;
24042464 struct ipoib_dev_priv *priv;
24052465 struct net_device *ndev;
24062466 int result;
24072467
2408
- priv = ipoib_intf_alloc(hca, port, format);
2409
- if (!priv) {
2410
- pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port);
2411
- return ERR_PTR(-ENOMEM);
2468
+ ndev = ipoib_intf_alloc(hca, port, format);
2469
+ if (IS_ERR(ndev)) {
2470
+ pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
2471
+ PTR_ERR(ndev));
2472
+ return ndev;
24122473 }
2413
- ndev = priv->dev;
2474
+ priv = ipoib_priv(ndev);
24142475
24152476 INIT_IB_EVENT_HANDLER(&priv->event_handler,
24162477 priv->ca, ipoib_event);
....@@ -2418,6 +2479,8 @@
24182479
24192480 /* call event handler to ensure pkey in sync */
24202481 queue_work(ipoib_workqueue, &priv->flush_heavy);
2482
+
2483
+ ndev->rtnl_link_ops = ipoib_get_link_ops();
24212484
24222485 result = register_netdev(ndev);
24232486 if (result) {
....@@ -2431,6 +2494,14 @@
24312494 return ERR_PTR(result);
24322495 }
24332496
2497
+ if (hca->ops.rdma_netdev_get_params) {
2498
+ int rc = hca->ops.rdma_netdev_get_params(hca, port,
2499
+ RDMA_NETDEV_IPOIB,
2500
+ &params);
2501
+
2502
+ if (!rc && ops->priv_size < params.sizeof_priv)
2503
+ ops->priv_size = params.sizeof_priv;
2504
+ }
24342505 /*
24352506 * We cannot set priv_destructor before register_netdev because we
24362507 * need priv to be always valid during the error flow to execute
....@@ -2439,6 +2510,8 @@
24392510 */
24402511 ndev->priv_destructor = ipoib_intf_free;
24412512
2513
+ if (ipoib_intercept_dev_id_attr(ndev))
2514
+ goto sysfs_failed;
24422515 if (ipoib_cm_add_mode_attr(ndev))
24432516 goto sysfs_failed;
24442517 if (ipoib_add_pkey_attr(ndev))
....@@ -2458,21 +2531,21 @@
24582531 return ERR_PTR(-ENOMEM);
24592532 }
24602533
2461
-static void ipoib_add_one(struct ib_device *device)
2534
+static int ipoib_add_one(struct ib_device *device)
24622535 {
24632536 struct list_head *dev_list;
24642537 struct net_device *dev;
24652538 struct ipoib_dev_priv *priv;
2466
- int p;
2539
+ unsigned int p;
24672540 int count = 0;
24682541
24692542 dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
24702543 if (!dev_list)
2471
- return;
2544
+ return -ENOMEM;
24722545
24732546 INIT_LIST_HEAD(dev_list);
24742547
2475
- for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
2548
+ rdma_for_each_port (device, p) {
24762549 if (!rdma_protocol_ib(device, p))
24772550 continue;
24782551 dev = ipoib_add_port("ib%d", device, p);
....@@ -2485,19 +2558,17 @@
24852558
24862559 if (!count) {
24872560 kfree(dev_list);
2488
- return;
2561
+ return -EOPNOTSUPP;
24892562 }
24902563
24912564 ib_set_client_data(device, &ipoib_client, dev_list);
2565
+ return 0;
24922566 }
24932567
24942568 static void ipoib_remove_one(struct ib_device *device, void *client_data)
24952569 {
24962570 struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
24972571 struct list_head *dev_list = client_data;
2498
-
2499
- if (!dev_list)
2500
- return;
25012572
25022573 list_for_each_entry_safe(priv, tmp, dev_list, list) {
25032574 LIST_HEAD(head);
....@@ -2545,9 +2616,7 @@
25452616 */
25462617 BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
25472618
2548
- ret = ipoib_register_debugfs();
2549
- if (ret)
2550
- return ret;
2619
+ ipoib_register_debugfs();
25512620
25522621 /*
25532622 * We create a global workqueue here that is used for all flush