2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/infiniband/hw/hfi1/verbs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license. When using or
  * redistributing this file, you may do so under either license.
@@ -66,6 +66,7 @@
 #include "vnic.h"
 #include "fault.h"
 #include "affinity.h"
+#include "ipoib.h"
 
 static unsigned int hfi1_lkey_table_size = 16;
 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
@@ -130,8 +131,6 @@
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE  2
 static unsigned int sge_copy_mode;
 module_param(sge_copy_mode, uint, S_IRUGO);
 MODULE_PARM_DESC(sge_copy_mode,
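The two defines removed above follow the SGE-copy machinery out of this file and into rdmavt (the module parameters keep their meaning and are handed over via rdi.dparms near the end of this diff). A minimal user-space sketch of the dispatch these mode values drive, with the decision logic taken from the hfi1_copy_sge() body removed further down; the helper name and page-size constant are illustrative, not the kernel's:

#include <stdbool.h>
#include <stddef.h>

#define COPY_CACHELESS 1
#define COPY_ADAPTIVE  2
#define SKETCH_PAGE_SIZE 4096UL

/* wss_hot stands in for the working-set check
 * (wss_exceeds_threshold() in the removed code below). */
static bool use_cacheless_copy(unsigned int mode, size_t length, bool wss_hot)
{
	if (mode == COPY_CACHELESS)
		return length >= SKETCH_PAGE_SIZE;
	if (mode == COPY_ADAPTIVE)
		return length >= SKETCH_PAGE_SIZE && wss_hot;
	return false;	/* mode 0: ordinary cached memcpy */
}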
@@ -149,168 +148,24 @@
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
 
-static uint wss_threshold;
+static uint wss_threshold = 80;
 module_param(wss_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 static uint wss_clean_period = 256;
 module_param(wss_clean_period, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
 
-/* memory working set size */
-struct hfi1_wss {
-	unsigned long *entries;
-	atomic_t total_count;
-	atomic_t clean_counter;
-	atomic_t clean_entry;
-
-	int threshold;
-	int num_entries;
-	long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-	long llc_size;
-	long llc_bits;
-	long table_size;
-	long table_bits;
-
-	/* check for a valid percent range - default to 80 if none or invalid */
-	if (wss_threshold < 1 || wss_threshold > 100)
-		wss_threshold = 80;
-	/* reject a wildly large period */
-	if (wss_clean_period > 1000000)
-		wss_clean_period = 256;
-	/* reject a zero period */
-	if (wss_clean_period == 0)
-		wss_clean_period = 1;
-
-	/*
-	 * Calculate the table size - the next power of 2 larger than the
-	 * LLC size. LLC size is in KiB.
-	 */
-	llc_size = wss_llc_size() * 1024;
-	table_size = roundup_pow_of_two(llc_size);
-
-	/* one bit per page in rounded up table */
-	llc_bits = llc_size / PAGE_SIZE;
-	table_bits = table_size / PAGE_SIZE;
-	wss.pages_mask = table_bits - 1;
-	wss.num_entries = table_bits / BITS_PER_LONG;
-
-	wss.threshold = (llc_bits * wss_threshold) / 100;
-	if (wss.threshold == 0)
-		wss.threshold = 1;
-
-	atomic_set(&wss.clean_counter, wss_clean_period);
-
-	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-			      GFP_KERNEL);
-	if (!wss.entries) {
-		hfi1_wss_exit();
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void hfi1_wss_exit(void)
-{
-	/* coded to handle partially initialized and repeat callers */
-	kfree(wss.entries);
-	wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter. When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking. Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information. Since this is only a heuristic, this is
- * OK. Any innaccuracies will clean themselves out as the counter
- * advances. That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero. When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-	int entry;
-	int weight;
-	unsigned long bits;
-
-	/* become the cleaner if we decrement the counter to zero */
-	if (atomic_dec_and_test(&wss.clean_counter)) {
-		/*
-		 * Set, not add, the clean period. This avoids an issue
-		 * where the counter could decrement below the clean period.
-		 * Doing a set can result in lost decrements, slowing the
-		 * clean advance. Since this a heuristic, this possible
-		 * slowdown is OK.
-		 *
-		 * An alternative is to loop, advancing the counter by a
-		 * clean period until the result is > 0. However, this could
-		 * lead to several threads keeping another in the clean loop.
-		 * This could be mitigated by limiting the number of times
-		 * we stay in the loop.
-		 */
-		atomic_set(&wss.clean_counter, wss_clean_period);
-
-		/*
-		 * Uniquely grab the entry to clean and move to next.
-		 * The current entry is always the lower bits of
-		 * wss.clean_entry. The table size, wss.num_entries,
-		 * is always a power-of-2.
-		 */
-		entry = (atomic_inc_return(&wss.clean_entry) - 1)
-			& (wss.num_entries - 1);
-
-		/* clear the entry and count the bits */
-		bits = xchg(&wss.entries[entry], 0);
-		weight = hweight64((u64)bits);
-		/* only adjust the contended total count if needed */
-		if (weight)
-			atomic_sub(weight, &wss.total_count);
-	}
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-	u32 nr = page & (BITS_PER_LONG - 1);
-
-	if (!test_and_set_bit(nr, &wss.entries[entry]))
-		atomic_inc(&wss.total_count);
-
-	wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
-	return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
 	[IB_WR_SEND] = IB_WC_SEND,
 	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
 	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
 	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
 	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
 	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
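For reference, the estimator removed above tracks an approximate memory working set with one bit per page in a power-of-two table sized to the last-level cache, decaying it lazily by clearing one word of the bitmap every wss_clean_period insertions. A single-threaded user-space sketch of the same bookkeeping (the kernel code does all of this with atomics so it can run locklessly from many contexts; GCC's __builtin_popcountl stands in for hweight64()):

#include <stdbool.h>
#include <stdlib.h>

#define SKETCH_PAGE_SHIFT 12
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

struct wss {
	unsigned long *entries;	/* one bit per page */
	long total_count;	/* bits currently set */
	long clean_counter;	/* countdown to the next lazy clean */
	long clean_entry;	/* next word to clear */
	long threshold;		/* ~wss_threshold% of the LLC, in pages */
	long num_entries;	/* words in entries[], power of 2 */
	long pages_mask;
	long clean_period;
};

static void wss_insert(struct wss *w, const void *addr)
{
	unsigned long page = ((unsigned long)addr >> SKETCH_PAGE_SHIFT) & w->pages_mask;
	unsigned long entry = page / BITS_PER_LONG;
	unsigned long bit = 1UL << (page % BITS_PER_LONG);

	if (!(w->entries[entry] & bit)) {	/* test_and_set_bit() */
		w->entries[entry] |= bit;
		w->total_count++;
	}
	if (--w->clean_counter == 0) {		/* atomic_dec_and_test() */
		w->clean_counter = w->clean_period;
		entry = w->clean_entry++ & (w->num_entries - 1);
		w->total_count -= __builtin_popcountl(w->entries[entry]);
		w->entries[entry] = 0;		/* xchg() in the kernel */
	}
}

static bool wss_exceeds_threshold(const struct wss *w)
{
	return w->total_count >= w->threshold;
}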
@@ -346,6 +201,14 @@
 	[IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
+	[IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36,
+	[IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36,
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
 	[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -389,6 +252,17 @@
 	[IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv,
 	[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
+
+	/* TID RDMA has separate handlers for different opcodes.*/
+	[IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req,
+	[IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data,
+	[IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
+	[IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
+	[IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+	[IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync,
+	[IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack,
+
 	/* UC */
 	[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
 	[IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
@@ -436,79 +310,6 @@
  */
 __be64 ib_hfi1_sys_image_guid;
 
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-	struct rvt_sge_state *ss,
-	void *data, u32 length,
-	bool release,
-	bool copy_last)
-{
-	struct rvt_sge *sge = &ss->sge;
-	int i;
-	bool in_last = false;
-	bool cacheless_copy = false;
-
-	if (sge_copy_mode == COPY_CACHELESS) {
-		cacheless_copy = length >= PAGE_SIZE;
-	} else if (sge_copy_mode == COPY_ADAPTIVE) {
-		if (length >= PAGE_SIZE) {
-			/*
-			 * NOTE: this *assumes*:
-			 * o The first vaddr is the dest.
-			 * o If multiple pages, then vaddr is sequential.
-			 */
-			wss_insert(sge->vaddr);
-			if (length >= (2 * PAGE_SIZE))
-				wss_insert(sge->vaddr + PAGE_SIZE);
-
-			cacheless_copy = wss_exceeds_threshold();
-		} else {
-			wss_advance_clean_counter();
-		}
-	}
-	if (copy_last) {
-		if (length > 8) {
-			length -= 8;
-		} else {
-			copy_last = false;
-			in_last = true;
-		}
-	}
-
-again:
-	while (length) {
-		u32 len = rvt_get_sge_length(sge, length);
-
-		WARN_ON_ONCE(len == 0);
-		if (unlikely(in_last)) {
-			/* enforce byte transfer ordering */
-			for (i = 0; i < len; i++)
-				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-		} else if (cacheless_copy) {
-			cacheless_memcpy(sge->vaddr, data, len);
-		} else {
-			memcpy(sge->vaddr, data, len);
-		}
-		rvt_update_sge(ss, len, release);
-		data += len;
-		length -= len;
-	}
-
-	if (copy_last) {
-		copy_last = false;
-		in_last = true;
-		length = 8;
-		goto again;
-	}
-}
-
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  */
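One subtlety in the body removed above is worth a note: when copy_last is set, the final 8 bytes are copied one byte at a time so that the last qword lands in order; buffers of 8 bytes or less are copied entirely bytewise. Stripped of the SGE walk and cache handling, the tail split reduces to roughly this sketch:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Bulk memcpy(), then a bytewise trailing qword to enforce byte
 * transfer ordering, mirroring the copy_last/in_last handling. */
static void copy_with_ordered_tail(uint8_t *dst, const uint8_t *src, size_t len)
{
	size_t tail = len > 8 ? 8 : len;
	size_t bulk = len - tail;
	size_t i;

	memcpy(dst, src, bulk);
	for (i = bulk; i < len; i++)
		dst[i] = src[i];
}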
@@ -527,7 +328,7 @@
 static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 {
 #ifdef CONFIG_FAULT_INJECTION
-	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+	if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
 		/*
 		 * In order to drop non-IB traffic we
 		 * set PbcInsertHrc to NONE (0x2).
@@ -538,8 +339,9 @@
 		 * packet will not be delivered to the
 		 * correct context.
 		 */
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
 		pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
-	else
+	} else {
 		/*
 		 * In order to drop regular verbs
 		 * traffic we set the PbcTestEbp
@@ -549,8 +351,127 @@
 		 * triggered and will be dropped.
 		 */
 		pbc |= PBC_TEST_EBP;
+	}
 #endif
 	return pbc;
+}
+
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+	if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+	    !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+		return NULL;
+	if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+		return opcode_handler_tbl[opcode];
+	return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
+drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+	rcu_read_unlock();
+drop:
+	ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct ib_header *hdr = packet->hdr;
+	u32 tlen = packet->tlen;
+	struct hfi1_pportdata *ppd = rcd->ppd;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
+	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+	opcode_handler opcode_handler;
+	unsigned long flags;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+
+	/* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+	if (unlikely(tlen < 15 * sizeof(u32)))
+		goto drop;
+
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh != HFI1_LRH_BTH)
+		goto drop;
+
+	packet->ohdr = &hdr->u.oth;
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+	opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+	/* verbs_qp can be picked up from any tid_rdma header struct */
+	qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+		RVT_QPN_MASK;
+
+	rcu_read_lock();
+	packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+	if (!packet->qp)
+		goto drop_rcu;
+	spin_lock_irqsave(&packet->qp->r_lock, flags);
+	opcode_handler = tid_qp_ok(opcode, packet);
+	if (likely(opcode_handler))
+		opcode_handler(packet);
+	else
+		goto drop_unlock;
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+	rcu_read_unlock();
+
+	return;
drop_unlock:
+	spin_unlock_irqrestore(&packet->qp->r_lock, flags);
drop_rcu:
+	rcu_read_unlock();
drop:
+	ibp->rvp.n_pkt_drops++;
 }
 
 static int hfi1_do_pkey_check(struct hfi1_packet *packet)
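Both new receive paths open with the same sanity check: a KDETH packet must hold at least LRH (2 DW) + BTH (3 DW) + KDETH (9 DW) + ICRC (1 DW), which is where the 15 * sizeof(u32) bound comes from. Restated as a compile-time check:

#include <assert.h>
#include <stdint.h>

enum {
	LRH_DWORDS = 2,		/* local route header */
	BTH_DWORDS = 3,		/* base transport header */
	KDETH_DWORDS = 9,	/* KDETH header */
	CRC_DWORDS = 1,		/* ICRC */
};

static_assert((LRH_DWORDS + BTH_DWORDS + KDETH_DWORDS + CRC_DWORDS) *
	      sizeof(uint32_t) == 60,
	      "minimum KDETH packet is 15 dwords (60 bytes)");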
@@ -713,11 +634,13 @@
 
 	spin_lock(&qp->s_lock);
 	if (tx->wqe) {
-		hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+		rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		struct hfi1_opa_header *hdr;
 
 		hdr = &tx->phdr.hdr;
+		if (unlikely(status == SDMA_TXREQ_S_ABORTED))
+			hfi1_rc_verbs_aborted(qp, hdr);
 		hfi1_rc_send_complete(qp, hdr);
 	}
 	spin_unlock(&qp->s_lock);
@@ -725,11 +648,28 @@
 	hfi1_put_txreq(tx);
 }
 
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	struct ib_qp *ibqp = &qp->ibqp;
+	struct ib_device *ibdev = ibqp->device;
+	struct hfi1_ibdev *dev = to_idev(ibdev);
+
+	if (list_empty(&priv->s_iowait.list)) {
+		if (list_empty(&dev->memwait))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+		qp->s_flags |= RVT_S_WAIT_KMEM;
+		list_add_tail(&priv->s_iowait.list, &dev->memwait);
+		priv->s_iowait.lock = &dev->iowait_lock;
+		trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+		rvt_get_qp(qp);
+	}
+}
+
 static int wait_kmem(struct hfi1_ibdev *dev,
 		     struct rvt_qp *qp,
 		     struct hfi1_pkt_state *ps)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
 	unsigned long flags;
 	int ret = 0;
 
@@ -737,18 +677,10 @@
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
-		if (list_empty(&priv->s_iowait.list)) {
-			if (list_empty(&dev->memwait))
-				mod_timer(&dev->mem_timer, jiffies + 1);
-			qp->s_flags |= RVT_S_WAIT_KMEM;
-			list_add_tail(&priv->s_iowait.list, &dev->memwait);
-			priv->s_iowait.lock = &dev->iowait_lock;
-			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
-			rvt_get_qp(qp);
-		}
+			      &ps->wait->tx_head);
+		hfi1_wait_kmem(qp);
 		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -774,11 +706,7 @@
 	int ret = 0;
 
 	while (length) {
-		len = ss->sge.length;
-		if (len > length)
-			len = length;
-		if (len > ss->sge.sge_length)
-			len = ss->sge.sge_length;
+		len = rvt_get_sge_length(&ss->sge, length);
 		WARN_ON_ONCE(len == 0);
 		ret = sdma_txadd_kvaddr(
 			sde->dd,
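The removed min() chain in the hunk above (and its twin in the PIO path further down) collapses into rvt_get_sge_length(), which clamps the requested length to both the current segment and the overall SGE. The equivalent logic, as a sketch with a simplified stand-in for struct rvt_sge:

#include <stdint.h>

struct sge {
	uint32_t length;	/* bytes left in the current segment */
	uint32_t sge_length;	/* bytes left in the whole SGE */
};

/* Same clamping the removed open-coded version performed. */
static uint32_t get_sge_length(const struct sge *sge, uint32_t want)
{
	uint32_t len = want;

	if (len > sge->length)
		len = sge->length;
	if (len > sge->sge_length)
		len = sge->sge_length;
	return len;
}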
@@ -899,6 +827,15 @@
 	return ret;
 }
 
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+	if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+		pbc &= ~PBC_INSERT_HCRC_SMASK;
+		pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+	}
+	return pbc;
+}
+
 int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc)
 {
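update_hcrc() and the hfi1_fault_tx() change earlier in this diff apply the same read-modify-write rule for packed descriptor words: clear the field with its mask before OR-ing in the new code, otherwise bits of a previously inserted value survive. A generic sketch with made-up shift/mask values, not the real PBC layout:

#include <stdint.h>

/* Hypothetical 2-bit field at bit 26 of a 64-bit descriptor word. */
#define FIELD_SHIFT 26
#define FIELD_SMASK (0x3ULL << FIELD_SHIFT)

static uint64_t descr_set_field(uint64_t word, uint64_t code)
{
	word &= ~FIELD_SMASK;	/* drop the old value first */
	word |= (code << FIELD_SHIFT) & FIELD_SMASK;
	return word;
}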
@@ -937,21 +874,24 @@
 			else
 				pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 
-			if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
-				pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
 			pbc = create_pbc(ppd,
 					 pbc,
 					 qp->srate_mbps,
 					 vl,
 					 plen);
+
+			if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
+				pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
+			else
+				/* Update HCRC based on packet opcode */
+				pbc = update_hcrc(ps->opcode, pbc);
 		}
 		tx->wqe = qp->s_wqe;
 		ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
 		if (unlikely(ret))
 			goto bail_build;
 	}
-	ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
-			      ps->pkts_sent);
+	ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
 	if (unlikely(ret < 0)) {
 		if (ret == -ECOMM)
 			goto bail_ecomm;
@@ -987,7 +927,6 @@
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_devdata *dd = sc->dd;
-	struct hfi1_ibdev *dev = &dd->verbs_dev;
 	unsigned long flags;
 	int ret = 0;
 
@@ -999,9 +938,9 @@
 	 */
 	spin_lock_irqsave(&qp->s_lock, flags);
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
-		write_seqlock(&dev->iowait_lock);
+		write_seqlock(&sc->waitlock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
+			      &ps->wait->tx_head);
 		if (list_empty(&priv->s_iowait.list)) {
 			struct hfi1_ibdev *dev = &dd->verbs_dev;
 			int was_empty;
@@ -1010,17 +949,18 @@
 			dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
 			qp->s_flags |= flag;
 			was_empty = list_empty(&sc->piowait);
+			iowait_get_priority(&priv->s_iowait);
 			iowait_queue(ps->pkts_sent, &priv->s_iowait,
 				     &sc->piowait);
-			priv->s_iowait.lock = &dev->iowait_lock;
+			priv->s_iowait.lock = &sc->waitlock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 			rvt_get_qp(qp);
 			/* counting: only call wantpiobuf_intr if first user */
 			if (was_empty)
 				hfi1_sc_wantpiobuf_intr(sc, 1);
 		}
-		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		write_sequnlock(&sc->waitlock);
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1091,14 +1031,17 @@
 		else
 			pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 
+		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
 		if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
 			pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
-		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+		else
+			/* Update HCRC based on packet opcode */
+			pbc = update_hcrc(ps->opcode, pbc);
 	}
 	if (cb)
 		iowait_pio_inc(&priv->s_iowait);
 	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-	if (unlikely(IS_ERR_OR_NULL(pbuf))) {
+	if (IS_ERR_OR_NULL(pbuf)) {
 		if (cb)
 			verbs_pio_complete(qp, 0);
 		if (IS_ERR(pbuf)) {
@@ -1137,12 +1080,8 @@
 	if (ss) {
 		while (len) {
 			void *addr = ss->sge.vaddr;
-			u32 slen = ss->sge.length;
+			u32 slen = rvt_get_sge_length(&ss->sge, len);
 
-			if (slen > len)
-				slen = len;
-			if (slen > ss->sge.sge_length)
-				slen = ss->sge.sge_length;
 			rvt_update_sge(ss, slen, false);
 			seg_pio_copy_mid(pbuf, addr, slen);
 			len -= slen;
@@ -1161,15 +1100,15 @@
 			       &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 
 pio_bail:
+	spin_lock_irqsave(&qp->s_lock, flags);
 	if (qp->s_wqe) {
-		spin_lock_irqsave(&qp->s_lock, flags);
-		hfi1_send_complete(qp, qp->s_wqe, wc_status);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
+		rvt_send_complete(qp, qp->s_wqe, wc_status);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
-		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(wc_status == IB_WC_GENERAL_ERR))
+			hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
 		hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
 
 	ret = 0;
 
@@ -1289,15 +1228,16 @@
 	case IB_QPT_UD:
 		break;
 	case IB_QPT_UC:
-	case IB_QPT_RC: {
+	case IB_QPT_RC:
+		priv->s_running_pkt_size =
+			(tx->s_cur_size + priv->s_running_pkt_size) / 2;
 		if (piothreshold &&
-		    tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
+		    priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
 		    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
 		    iowait_sdma_pending(&priv->s_iowait) == 0 &&
 		    !sdma_txreq_built(&tx->txreq))
 			return dd->process_pio_send;
 		break;
-	}
 	default:
 		break;
 	}
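The PIO/SDMA cut-over above now keys off a smoothed size rather than the instantaneous one: each send folds the current packet size in as new = (cur + prev) / 2, an exponential moving average with weight 1/2, so a single small packet in a stream of large ones no longer flips the QP onto the PIO path. As arithmetic:

#include <stdint.h>

/* Each call halves the influence of all earlier samples; after a run of
 * equal-sized packets the average converges to that size geometrically. */
static uint32_t fold_pkt_size(uint32_t running, uint32_t cur)
{
	return (cur + running) / 2;
}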
@@ -1370,7 +1310,7 @@
 		hfi1_cdbg(PIO, "%s() Failed. Completing with err",
 			  __func__);
 		spin_lock_irqsave(&qp->s_lock, flags);
-		hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+		rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
 	return -EINVAL;
@@ -1403,7 +1343,7 @@
 		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
 		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
 		IB_DEVICE_MEM_MGT_EXTENSIONS |
-		IB_DEVICE_RDMA_NETDEV_OPA_VNIC;
+		IB_DEVICE_RDMA_NETDEV_OPA;
 	rdi->dparms.props.page_size_cap = PAGE_SIZE;
 	rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
 	rdi->dparms.props.vendor_part_id = dd->pcidev->device;
@@ -1412,14 +1352,15 @@
 	rdi->dparms.props.max_mr_size = U64_MAX;
 	rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
 	rdi->dparms.props.max_qp = hfi1_max_qps;
-	rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+	rdi->dparms.props.max_qp_wr =
+		(hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+		 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
 	rdi->dparms.props.max_send_sge = hfi1_max_sges;
 	rdi->dparms.props.max_recv_sge = hfi1_max_sges;
 	rdi->dparms.props.max_sge_rd = hfi1_max_sges;
 	rdi->dparms.props.max_cq = hfi1_max_cqs;
 	rdi->dparms.props.max_ah = hfi1_max_ahs;
 	rdi->dparms.props.max_cqe = hfi1_max_cqes;
-	rdi->dparms.props.max_map_per_fmr = 32767;
 	rdi->dparms.props.max_pd = hfi1_max_pds;
 	rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
 	rdi->dparms.props.max_qp_init_rd_atom = 255;
@@ -1483,7 +1424,7 @@
 	props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
 	props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
 	/* see rate_show() in ib core/sysfs.c */
-	props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+	props->active_speed = opa_speed_to_ib(ppd->link_speed_active);
 	props->max_vl_num = ppd->vls_supported;
 
 	/* Once we are a "first class" citizen and have added the OPA MTUs to
@@ -1498,6 +1439,7 @@
 			 4096 : hfi1_max_mtu), IB_MTU_4096);
 	props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
 		mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
+	props->phys_mtu = hfi1_max_mtu;
 
 	return 0;
 }
@@ -1802,15 +1744,15 @@
 
 static u64 hfi1_sps_ints(void)
 {
-	unsigned long flags;
+	unsigned long index, flags;
 	struct hfi1_devdata *dd;
 	u64 sps_ints = 0;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
-	list_for_each_entry(dd, &hfi1_dev_list, list) {
+	xa_lock_irqsave(&hfi1_dev_table, flags);
+	xa_for_each(&hfi1_dev_table, index, dd) {
 		sps_ints += get_all_cpu_total(dd->int_counter);
 	}
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irqrestore(&hfi1_dev_table, flags);
 	return sps_ints;
 }
@@ -1839,6 +1781,21 @@
 	memcpy(stats->value, values, count * sizeof(u64));
 	return count;
 }
+
+static const struct ib_device_ops hfi1_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_HFI1,
+
+	.alloc_hw_stats = alloc_hw_stats,
+	.alloc_rdma_netdev = hfi1_vnic_alloc_rn,
+	.get_dev_fw_str = hfi1_get_dev_fw_str,
+	.get_hw_stats = get_hw_stats,
+	.init_port = hfi1_create_port_files,
+	.modify_device = modify_device,
+	/* keep process mad in the driver */
+	.process_mad = hfi1_process_mad,
+	.rdma_netdev_get_params = hfi1_ipoib_rn_get_params,
+};
 
 /**
  * hfi1_register_ib_device - register our device with the infiniband core
@@ -1880,17 +1837,10 @@
 	 */
 	if (!ib_hfi1_sys_image_guid)
 		ib_hfi1_sys_image_guid = ibdev->node_guid;
-	ibdev->owner = THIS_MODULE;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
-	ibdev->modify_device = modify_device;
-	ibdev->alloc_hw_stats = alloc_hw_stats;
-	ibdev->get_hw_stats = get_hw_stats;
-	ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn;
 
-	/* keep process mad in the driver */
-	ibdev->process_mad = hfi1_process_mad;
-	ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
+	ib_set_device_ops(ibdev, &hfi1_dev_ops);
 
 	strlcpy(ibdev->node_desc, init_utsname()->nodename,
 		sizeof(ibdev->node_desc));
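The hfi1_dev_ops table plus the single ib_set_device_ops() call replace the member-by-member assignments removed above; the core copies each callback the table provides into the device. A user-space sketch of the pattern, with hypothetical types and names rather than the ib_core API:

#include <stddef.h>

struct dev_ops {
	int (*query)(void);
	void (*reset)(void);
};

struct device {
	struct dev_ops ops;
};

/* Install only the callbacks the driver provides, leaving the rest
 * untouched - the per-member rule ib_set_device_ops() applies. */
static void set_device_ops(struct device *dev, const struct dev_ops *src)
{
	if (src->query)
		dev->ops.query = src->query;
	if (src->reset)
		dev->ops.reset = src->reset;
}

static int my_query(void)
{
	return 0;
}

static const struct dev_ops my_ops = {
	.query = my_query,	/* .reset deliberately left unset */
};

A side benefit of collecting the callbacks in one const table is that the function pointers live in read-only memory.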
@@ -1898,7 +1848,6 @@
 	/*
 	 * Fill in rvt info object.
 	 */
-	dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
@@ -1916,9 +1865,8 @@
 	dd->verbs_dev.rdi.dparms.qpn_start = 0;
 	dd->verbs_dev.rdi.dparms.qpn_inc = 1;
 	dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
-	dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
-	dd->verbs_dev.rdi.dparms.qpn_res_end =
-		dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
+	dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE;
+	dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX;
 	dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
 	dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
 	dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
@@ -1928,6 +1876,7 @@
 	dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
 
 	dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+	dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
 	dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
 	dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
 	dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
@@ -1945,7 +1894,7 @@
 	dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
 	dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
 	dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
 	dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
 		hfi1_comp_vect_mappings_lookup;
 
@@ -1958,9 +1907,17 @@
 	dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
 	dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
 	dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+	dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+	dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+	dd->verbs_dev.rdi.dparms.reserved_operations = 1;
+	dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+
+	/* opcode translation table */
+	dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
 
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++)
@@ -1969,7 +1926,10 @@
 			  i,
 			  ppd->pkeys);
 
-	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
+	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
+				    &ib_hfi1_attr_group);
+
+	ret = rvt_register_device(&dd->verbs_dev.rdi);
 	if (ret)
 		goto err_verbs_txreq;