2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/drivers/infiniband/hw/hfi1/rc.c
....@@ -51,24 +51,48 @@
5151
5252 #include "hfi.h"
5353 #include "qp.h"
54
+#include "rc.h"
5455 #include "verbs_txreq.h"
5556 #include "trace.h"
5657
57
-/* cut down ridiculously long IB macro names */
58
-#define OP(x) RC_OP(x)
59
-
60
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
61
- u32 psn, u32 pmtu)
58
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
59
+ u8 *prev_ack, bool *scheduled)
60
+ __must_hold(&qp->s_lock)
6261 {
63
- u32 len;
62
+ struct rvt_ack_entry *e = NULL;
63
+ u8 i, p;
64
+ bool s = true;
6465
65
- len = delta_psn(psn, wqe->psn) * pmtu;
66
- ss->sge = wqe->sg_list[0];
67
- ss->sg_list = wqe->sg_list + 1;
68
- ss->num_sge = wqe->wr.num_sge;
69
- ss->total_len = wqe->length;
70
- rvt_skip_sge(ss, len, false);
71
- return wqe->length - len;
66
+ for (i = qp->r_head_ack_queue; ; i = p) {
67
+ if (i == qp->s_tail_ack_queue)
68
+ s = false;
69
+ if (i)
70
+ p = i - 1;
71
+ else
72
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
73
+ if (p == qp->r_head_ack_queue) {
74
+ e = NULL;
75
+ break;
76
+ }
77
+ e = &qp->s_ack_queue[p];
78
+ if (!e->opcode) {
79
+ e = NULL;
80
+ break;
81
+ }
82
+ if (cmp_psn(psn, e->psn) >= 0) {
83
+ if (p == qp->s_tail_ack_queue &&
84
+ cmp_psn(psn, e->lpsn) <= 0)
85
+ s = false;
86
+ break;
87
+ }
88
+ }
89
+ if (prev)
90
+ *prev = p;
91
+ if (prev_ack)
92
+ *prev_ack = i;
93
+ if (scheduled)
94
+ *scheduled = s;
95
+ return e;
7296 }
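Note: find_prev_entry() above walks the s_ack_queue ring backwards from r_head_ack_queue until it reaches the entry whose PSN range covers the given psn, returning the slot in *prev, the most recent ACK slot in *prev_ack, and a flag in *scheduled that rc_rcv_error() later uses as old_req. A minimal sketch of the wrap-around step it relies on (helper name invented for this note, not part of the commit):

        /* Step one slot backwards; slot 0 wraps to the extra atomic slot. */
        static inline u8 prev_ack_slot(struct rvt_qp *qp, u8 i)
        {
                return i ? i - 1 : rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
        }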
7397
7498 /**
....@@ -87,20 +111,25 @@
87111 struct hfi1_pkt_state *ps)
88112 {
89113 struct rvt_ack_entry *e;
90
- u32 hwords;
91
- u32 len;
92
- u32 bth0;
93
- u32 bth2;
114
+ u32 hwords, hdrlen;
115
+ u32 len = 0;
116
+ u32 bth0 = 0, bth2 = 0;
117
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
94118 int middle = 0;
95119 u32 pmtu = qp->pmtu;
96
- struct hfi1_qp_priv *priv = qp->priv;
120
+ struct hfi1_qp_priv *qpriv = qp->priv;
121
+ bool last_pkt;
122
+ u32 delta;
123
+ u8 next = qp->s_tail_ack_queue;
124
+ struct tid_rdma_request *req;
97125
126
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
98127 lockdep_assert_held(&qp->s_lock);
99128 /* Don't send an ACK if we aren't supposed to. */
100129 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
101130 goto bail;
102131
103
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
132
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
104133 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
105134 hwords = 5;
106135 else
....@@ -111,20 +140,27 @@
111140 case OP(RDMA_READ_RESPONSE_LAST):
112141 case OP(RDMA_READ_RESPONSE_ONLY):
113142 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
114
- if (e->rdma_sge.mr) {
115
- rvt_put_mr(e->rdma_sge.mr);
116
- e->rdma_sge.mr = NULL;
117
- }
118
- /* FALLTHROUGH */
143
+ release_rdma_sge_mr(e);
144
+ fallthrough;
119145 case OP(ATOMIC_ACKNOWLEDGE):
120146 /*
121147 * We can increment the tail pointer now that the last
122148 * response has been sent instead of only being
123149 * constructed.
124150 */
125
- if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
126
- qp->s_tail_ack_queue = 0;
127
- /* FALLTHROUGH */
151
+ if (++next > rvt_size_atomic(&dev->rdi))
152
+ next = 0;
153
+ /*
154
+ * Only advance the s_acked_ack_queue pointer if there
155
+ * have been no TID RDMA requests.
156
+ */
157
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
158
+ if (e->opcode != TID_OP(WRITE_REQ) &&
159
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
160
+ qp->s_acked_ack_queue = next;
161
+ qp->s_tail_ack_queue = next;
162
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
163
+ fallthrough;
128164 case OP(SEND_ONLY):
129165 case OP(ACKNOWLEDGE):
130166 /* Check for no next entry in the queue. */
....@@ -135,6 +171,12 @@
135171 }
136172
137173 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
174
+ /* Check for tid write fence */
175
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
176
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
177
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
178
+ goto bail;
179
+ }
138180 if (e->opcode == OP(RDMA_READ_REQUEST)) {
139181 /*
140182 * If a RDMA read response is being resent and
....@@ -144,6 +186,10 @@
144186 */
145187 len = e->rdma_sge.sge_length;
146188 if (len && !e->rdma_sge.mr) {
189
+ if (qp->s_acked_ack_queue ==
190
+ qp->s_tail_ack_queue)
191
+ qp->s_acked_ack_queue =
192
+ qp->r_head_ack_queue;
147193 qp->s_tail_ack_queue = qp->r_head_ack_queue;
148194 goto bail;
149195 }
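Note: this commit adds a third ack-queue index, s_acked_ack_queue, next to s_tail_ack_queue and r_head_ack_queue. When the tail is pulled back to the head (as above, because a response has to be regenerated), the acked index is pulled back with it if the two were equal. All three indices move through the same ring; an illustrative helper (name invented here) for the forward wrap used elsewhere in the patch:

        static inline u8 next_ack_slot(struct rvt_qp *qp, u8 n)
        {
                /* the ring has rvt_size_atomic() + 1 slots, indices 0..size */
                if (++n > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
                        n = 0;
                return n;
        }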
....@@ -165,6 +211,45 @@
165211 hwords++;
166212 qp->s_ack_rdma_psn = e->psn;
167213 bth2 = mask_psn(qp->s_ack_rdma_psn++);
214
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
215
+ /*
216
+ * If a TID RDMA WRITE RESP is being resent, we have to
217
+ * wait for the actual request. All requests that are to
218
+ * be resent will have their state set to
219
+ * TID_REQUEST_RESEND. When the new request arrives, the
220
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
221
+ */
222
+ req = ack_to_tid_req(e);
223
+ if (req->state == TID_REQUEST_RESEND ||
224
+ req->state == TID_REQUEST_INIT_RESEND)
225
+ goto bail;
226
+ qp->s_ack_state = TID_OP(WRITE_RESP);
227
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
228
+ goto write_resp;
229
+ } else if (e->opcode == TID_OP(READ_REQ)) {
230
+ /*
231
+ * If a TID RDMA read response is being resent and
232
+ * we haven't seen the duplicate request yet,
233
+ * then stop sending the remaining responses the
234
+ * responder has seen until the requester re-sends it.
235
+ */
236
+ len = e->rdma_sge.sge_length;
237
+ if (len && !e->rdma_sge.mr) {
238
+ if (qp->s_acked_ack_queue ==
239
+ qp->s_tail_ack_queue)
240
+ qp->s_acked_ack_queue =
241
+ qp->r_head_ack_queue;
242
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
243
+ goto bail;
244
+ }
245
+ /* Copy SGE state in case we need to resend */
246
+ ps->s_txreq->mr = e->rdma_sge.mr;
247
+ if (ps->s_txreq->mr)
248
+ rvt_get_mr(ps->s_txreq->mr);
249
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
250
+ qp->s_ack_rdma_sge.num_sge = 1;
251
+ qp->s_ack_state = TID_OP(READ_RESP);
252
+ goto read_resp;
168253 } else {
169254 /* COMPARE_SWAP or FETCH_ADD */
170255 ps->s_txreq->ss = NULL;
....@@ -176,12 +261,13 @@
176261 bth2 = mask_psn(e->psn);
177262 e->sent = 1;
178263 }
264
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
179265 bth0 = qp->s_ack_state << 24;
180266 break;
181267
182268 case OP(RDMA_READ_RESPONSE_FIRST):
183269 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
184
- /* FALLTHROUGH */
270
+ fallthrough;
185271 case OP(RDMA_READ_RESPONSE_MIDDLE):
186272 ps->s_txreq->ss = &qp->s_ack_rdma_sge;
187273 ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
....@@ -202,6 +288,84 @@
202288 bth2 = mask_psn(qp->s_ack_rdma_psn++);
203289 break;
204290
291
+ case TID_OP(WRITE_RESP):
292
+write_resp:
293
+ /*
294
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
295
+ * goto normal.
296
+ * 2. Attempt to allocate TID resources.
297
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
298
+ * 4. If resources not available:
299
+ * 4.1 Set RVT_S_WAIT_TID_SPACE
300
+ * 4.2 Queue QP on RCD TID queue
301
+ * 4.3 Put QP on iowait list.
302
+ * 4.4 Build IB RNR NAK with appropriate timeout value
303
+ * 4.5 Return indication progress made.
304
+ * 5. If resources are available:
305
+ * 5.1 Program HW flow CSRs
306
+ * 5.2 Build TID RDMA WRITE RESP packet
307
+ * 5.3 If more resources needed, do 2.1 - 2.3.
308
+ * 5.4 Wake up next QP on RCD TID queue.
309
+ * 5.5 Return indication progress made.
310
+ */
311
+
312
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
313
+ req = ack_to_tid_req(e);
314
+
315
+ /*
316
+ * Send scheduled RNR NAK's. RNR NAK's need to be sent at
317
+ * segment boundaries, not at request boundaries. Don't change
318
+ * s_ack_state because we are still in the middle of a request
319
+ */
320
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
321
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
322
+ req->cur_seg == req->alloc_seg) {
323
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
324
+ goto normal_no_state;
325
+ }
326
+
327
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
328
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
329
+ bth2, &len,
330
+ &ps->s_txreq->ss);
331
+ if (!hdrlen)
332
+ return 0;
333
+
334
+ hwords += hdrlen;
335
+ bth0 = qp->s_ack_state << 24;
336
+ qp->s_ack_rdma_psn++;
337
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
338
+ e->lpsn, req);
339
+ if (req->cur_seg != req->total_segs)
340
+ break;
341
+
342
+ e->sent = 1;
343
+ /* Do not free e->rdma_sge until all data are received */
344
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
345
+ break;
346
+
347
+ case TID_OP(READ_RESP):
348
+read_resp:
349
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
350
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
351
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
352
+ &bth1, &bth2, &len,
353
+ &last_pkt);
354
+ if (delta == 0)
355
+ goto error_qp;
356
+ hwords += delta;
357
+ if (last_pkt) {
358
+ e->sent = 1;
359
+ /*
360
+ * Increment qp->s_tail_ack_queue through s_ack_state
361
+ * transition.
362
+ */
363
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
364
+ }
365
+ break;
366
+ case TID_OP(READ_REQ):
367
+ goto bail;
368
+
205369 default:
206370 normal:
207371 /*
....@@ -211,8 +375,7 @@
211375 * (see above).
212376 */
213377 qp->s_ack_state = OP(SEND_ONLY);
214
- qp->s_flags &= ~RVT_S_ACK_PENDING;
215
- ps->s_txreq->ss = NULL;
378
+normal_no_state:
216379 if (qp->s_nak_state)
217380 ohdr->u.aeth =
218381 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
....@@ -224,14 +387,24 @@
224387 len = 0;
225388 bth0 = OP(ACKNOWLEDGE) << 24;
226389 bth2 = mask_psn(qp->s_ack_psn);
390
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
391
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
392
+ ps->s_txreq->ss = NULL;
227393 }
228394 qp->s_rdma_ack_cnt++;
229
- ps->s_txreq->sde = priv->s_sde;
395
+ ps->s_txreq->sde = qpriv->s_sde;
230396 ps->s_txreq->s_cur_size = len;
231397 ps->s_txreq->hdr_dwords = hwords;
232
- hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
398
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
233399 return 1;
234
-
400
+error_qp:
401
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
402
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
403
+ spin_lock(&qp->s_lock);
404
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
405
+ spin_unlock(&qp->s_lock);
406
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
407
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
235408 bail:
236409 qp->s_ack_state = OP(ACKNOWLEDGE);
237410 /*
....@@ -258,17 +431,23 @@
258431 struct hfi1_qp_priv *priv = qp->priv;
259432 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
260433 struct ib_other_headers *ohdr;
261
- struct rvt_sge_state *ss;
434
+ struct rvt_sge_state *ss = NULL;
262435 struct rvt_swqe *wqe;
263
- u32 hwords;
264
- u32 len;
265
- u32 bth0 = 0;
266
- u32 bth2;
436
+ struct hfi1_swqe_priv *wpriv;
437
+ struct tid_rdma_request *req = NULL;
438
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
439
+ u32 hwords = 5;
440
+ u32 len = 0;
441
+ u32 bth0 = 0, bth2 = 0;
442
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
267443 u32 pmtu = qp->pmtu;
268444 char newreq;
269445 int middle = 0;
270446 int delta;
447
+ struct tid_rdma_flow *flow = NULL;
448
+ struct tid_rdma_params *remote;
271449
450
+ trace_hfi1_sender_make_rc_req(qp);
272451 lockdep_assert_held(&qp->s_lock);
273452 ps->s_txreq = get_txreq(ps->dev, qp);
274453 if (!ps->s_txreq)
....@@ -309,13 +488,13 @@
309488 }
310489 clear_ahg(qp);
311490 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
312
- hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
313
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
491
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
492
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
314493 /* will get called again */
315494 goto done_free_tx;
316495 }
317496
318
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
497
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
319498 goto bail;
320499
321500 if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
....@@ -329,6 +508,7 @@
329508
330509 /* Send a request. */
331510 wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
511
+check_s_state:
332512 switch (qp->s_state) {
333513 default:
334514 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
....@@ -350,9 +530,13 @@
350530 /*
351531 * If a fence is requested, wait for previous
352532 * RDMA read and atomic operations to finish.
533
+ * However, there is no need to guard against
534
+ * TID RDMA READ after TID RDMA READ.
353535 */
354536 if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
355
- qp->s_num_rd_atomic) {
537
+ qp->s_num_rd_atomic &&
538
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
539
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
356540 qp->s_flags |= RVT_S_WAIT_FENCE;
357541 goto bail;
358542 }
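Note: the fence test above is relaxed so that a fenced TID RDMA READ does not wait on outstanding TID RDMA READ segments; it still waits when some of the outstanding read/atomic operations are not TID READ segments, i.e. when pending_tid_r_segs < s_num_rd_atomic. The same condition restated as a standalone predicate (sketch only; the helper name is invented):

        static bool rc_fence_must_wait(struct rvt_qp *qp, struct rvt_swqe *wqe,
                                       struct hfi1_qp_priv *priv)
        {
                if (!(wqe->wr.send_flags & IB_SEND_FENCE) || !qp->s_num_rd_atomic)
                        return false;
                /* a TID RDMA READ need not fence against earlier TID RDMA READs */
                return wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
                       priv->pending_tid_r_segs < qp->s_num_rd_atomic;
        }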
....@@ -378,9 +562,9 @@
378562 wqe->wr.ex.invalidate_rkey);
379563 local_ops = 1;
380564 }
381
- hfi1_send_complete(qp, wqe,
382
- err ? IB_WC_LOC_PROT_ERR
383
- : IB_WC_SUCCESS);
565
+ rvt_send_complete(qp, wqe,
566
+ err ? IB_WC_LOC_PROT_ERR
567
+ : IB_WC_SUCCESS);
384568 if (local_ops)
385569 atomic_dec(&qp->local_ops_pending);
386570 goto done_free_tx;
....@@ -397,16 +581,22 @@
397581 len = wqe->length;
398582 ss = &qp->s_sge;
399583 bth2 = mask_psn(qp->s_psn);
584
+
585
+ /*
586
+ * Interlock between various IB requests and TID RDMA
587
+ * if necessary.
588
+ */
589
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
590
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
591
+ goto bail;
592
+
400593 switch (wqe->wr.opcode) {
401594 case IB_WR_SEND:
402595 case IB_WR_SEND_WITH_IMM:
403596 case IB_WR_SEND_WITH_INV:
404597 /* If no credit, return. */
405
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
406
- rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
407
- qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
598
+ if (!rvt_rc_credit_avail(qp, wqe))
408599 goto bail;
409
- }
410600 if (len > pmtu) {
411601 qp->s_state = OP(SEND_FIRST);
412602 len = pmtu;
....@@ -439,11 +629,8 @@
439629 goto no_flow_control;
440630 case IB_WR_RDMA_WRITE_WITH_IMM:
441631 /* If no credit, return. */
442
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
443
- rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
444
- qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
632
+ if (!rvt_rc_credit_avail(qp, wqe))
445633 goto bail;
446
- }
447634 no_flow_control:
448635 put_ib_reth_vaddr(
449636 wqe->rdma_wr.remote_addr,
....@@ -473,21 +660,126 @@
473660 qp->s_cur = 0;
474661 break;
475662
663
+ case IB_WR_TID_RDMA_WRITE:
664
+ if (newreq) {
665
+ /*
666
+ * Limit the number of TID RDMA WRITE requests.
667
+ */
668
+ if (atomic_read(&priv->n_tid_requests) >=
669
+ HFI1_TID_RDMA_WRITE_CNT)
670
+ goto bail;
671
+
672
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
673
+ qp->s_lsn++;
674
+ }
675
+
676
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
677
+ &bth1, &bth2,
678
+ &len);
679
+ ss = NULL;
680
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
681
+ priv->s_tid_cur = qp->s_cur;
682
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
683
+ priv->s_tid_tail = qp->s_cur;
684
+ priv->s_state = TID_OP(WRITE_RESP);
685
+ }
686
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
687
+ struct rvt_swqe *__w;
688
+ struct tid_rdma_request *__r;
689
+
690
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
691
+ __r = wqe_to_tid_req(__w);
692
+
693
+ /*
694
+ * The s_tid_cur pointer is advanced to s_cur if
695
+ * any of the following conditions about the WQE
696
+ * to which s_tid_cur currently points are
697
+ * satisfied:
698
+ * 1. The request is not a TID RDMA WRITE
699
+ * request,
700
+ * 2. The request is in the INACTIVE or
701
+ * COMPLETE states (TID RDMA READ requests
702
+ * stay at INACTIVE and TID RDMA WRITE
703
+ * transition to COMPLETE when done),
704
+ * 3. The request is in the ACTIVE or SYNC
705
+ * state and the number of completed
706
+ * segments is equal to the total segment
707
+ * count.
708
+ * (If ACTIVE, the request is waiting for
709
+ * ACKs. If SYNC, the request has not
710
+ * received any responses because it's
711
+ * waiting on a sync point.)
712
+ */
713
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
714
+ __r->state == TID_REQUEST_INACTIVE ||
715
+ __r->state == TID_REQUEST_COMPLETE ||
716
+ ((__r->state == TID_REQUEST_ACTIVE ||
717
+ __r->state == TID_REQUEST_SYNC) &&
718
+ __r->comp_seg == __r->total_segs)) {
719
+ if (priv->s_tid_tail ==
720
+ priv->s_tid_cur &&
721
+ priv->s_state ==
722
+ TID_OP(WRITE_DATA_LAST)) {
723
+ priv->s_tid_tail = qp->s_cur;
724
+ priv->s_state =
725
+ TID_OP(WRITE_RESP);
726
+ }
727
+ priv->s_tid_cur = qp->s_cur;
728
+ }
729
+ /*
730
+ * A corner case: when the last TID RDMA WRITE
731
+ * request was completed, s_tid_head,
732
+ * s_tid_cur, and s_tid_tail all point to the
733
+ * same location. Other requests are posted and
734
+ * s_cur wraps around to the same location,
735
+ * where a new TID RDMA WRITE is posted. In
736
+ * this case, none of the indices need to be
737
+ * updated. However, the priv->s_state should.
738
+ */
739
+ if (priv->s_tid_tail == qp->s_cur &&
740
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
741
+ priv->s_state = TID_OP(WRITE_RESP);
742
+ }
743
+ req = wqe_to_tid_req(wqe);
744
+ if (newreq) {
745
+ priv->s_tid_head = qp->s_cur;
746
+ priv->pending_tid_w_resp += req->total_segs;
747
+ atomic_inc(&priv->n_tid_requests);
748
+ atomic_dec(&priv->n_requests);
749
+ } else {
750
+ req->state = TID_REQUEST_RESEND;
751
+ req->comp_seg = delta_psn(bth2, wqe->psn);
752
+ /*
753
+ * Pull back any segments since we are going
754
+ * to re-receive them.
755
+ */
756
+ req->setup_head = req->clear_tail;
757
+ priv->pending_tid_w_resp +=
758
+ delta_psn(wqe->lpsn, bth2) + 1;
759
+ }
760
+
761
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
762
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
763
+ wqe->wr.opcode,
764
+ wqe->psn, wqe->lpsn,
765
+ req);
766
+ if (++qp->s_cur == qp->s_size)
767
+ qp->s_cur = 0;
768
+ break;
769
+
476770 case IB_WR_RDMA_READ:
477771 /*
478772 * Don't allow more operations to be started
479773 * than the QP limits allow.
480774 */
481
- if (newreq) {
482
- if (qp->s_num_rd_atomic >=
483
- qp->s_max_rd_atomic) {
484
- qp->s_flags |= RVT_S_WAIT_RDMAR;
485
- goto bail;
486
- }
487
- qp->s_num_rd_atomic++;
488
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
489
- qp->s_lsn++;
775
+ if (qp->s_num_rd_atomic >=
776
+ qp->s_max_rd_atomic) {
777
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
778
+ goto bail;
490779 }
780
+ qp->s_num_rd_atomic++;
781
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
782
+ qp->s_lsn++;
491783 put_ib_reth_vaddr(
492784 wqe->rdma_wr.remote_addr,
493785 &ohdr->u.rc.reth);
....@@ -503,23 +795,98 @@
503795 qp->s_cur = 0;
504796 break;
505797
798
+ case IB_WR_TID_RDMA_READ:
799
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
800
+ wpriv = wqe->priv;
801
+ req = wqe_to_tid_req(wqe);
802
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
803
+ wqe->wr.opcode,
804
+ wqe->psn, wqe->lpsn,
805
+ req);
806
+ delta = cmp_psn(qp->s_psn, wqe->psn);
807
+
808
+ /*
809
+ * Don't allow more operations to be started
810
+ * than the QP limits allow. We could get here under
811
+ * three conditions; (1) It's a new request; (2) We are
812
+ * sending the second or later segment of a request,
813
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
814
+ * when the last segment of a previous request is
815
+ * received just before this; (3) We are re-sending a
816
+ * request.
817
+ */
818
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
819
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
820
+ goto bail;
821
+ }
822
+ if (newreq) {
823
+ struct tid_rdma_flow *flow =
824
+ &req->flows[req->setup_head];
825
+
826
+ /*
827
+ * Set up s_sge as it is needed for TID
828
+ * allocation. However, if the pages have been
829
+ * walked and mapped, skip it. An earlier try
830
+ * has failed to allocate the TID entries.
831
+ */
832
+ if (!flow->npagesets) {
833
+ qp->s_sge.sge = wqe->sg_list[0];
834
+ qp->s_sge.sg_list = wqe->sg_list + 1;
835
+ qp->s_sge.num_sge = wqe->wr.num_sge;
836
+ qp->s_sge.total_len = wqe->length;
837
+ qp->s_len = wqe->length;
838
+ req->isge = 0;
839
+ req->clear_tail = req->setup_head;
840
+ req->flow_idx = req->setup_head;
841
+ req->state = TID_REQUEST_ACTIVE;
842
+ }
843
+ } else if (delta == 0) {
844
+ /* Re-send a request */
845
+ req->cur_seg = 0;
846
+ req->comp_seg = 0;
847
+ req->ack_pending = 0;
848
+ req->flow_idx = req->clear_tail;
849
+ req->state = TID_REQUEST_RESEND;
850
+ }
851
+ req->s_next_psn = qp->s_psn;
852
+ /* Read one segment at a time */
853
+ len = min_t(u32, req->seg_len,
854
+ wqe->length - req->seg_len * req->cur_seg);
855
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
856
+ &bth1, &bth2,
857
+ &len);
858
+ if (delta <= 0) {
859
+ /* Wait for TID space */
860
+ goto bail;
861
+ }
862
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
863
+ qp->s_lsn++;
864
+ hwords += delta;
865
+ ss = &wpriv->ss;
866
+ /* Check if this is the last segment */
867
+ if (req->cur_seg >= req->total_segs &&
868
+ ++qp->s_cur == qp->s_size)
869
+ qp->s_cur = 0;
870
+ break;
871
+
506872 case IB_WR_ATOMIC_CMP_AND_SWP:
507873 case IB_WR_ATOMIC_FETCH_AND_ADD:
508874 /*
509875 * Don't allow more operations to be started
510876 * than the QP limits allow.
511877 */
512
- if (newreq) {
513
- if (qp->s_num_rd_atomic >=
514
- qp->s_max_rd_atomic) {
515
- qp->s_flags |= RVT_S_WAIT_RDMAR;
516
- goto bail;
517
- }
518
- qp->s_num_rd_atomic++;
519
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
520
- qp->s_lsn++;
878
+ if (qp->s_num_rd_atomic >=
879
+ qp->s_max_rd_atomic) {
880
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
881
+ goto bail;
521882 }
522
- if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
883
+ qp->s_num_rd_atomic++;
884
+ fallthrough;
885
+ case IB_WR_OPFN:
886
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
887
+ qp->s_lsn++;
888
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
889
+ wqe->wr.opcode == IB_WR_OPFN) {
523890 qp->s_state = OP(COMPARE_SWAP);
524891 put_ib_ateth_swap(wqe->atomic_wr.swap,
525892 &ohdr->u.atomic_eth);
....@@ -546,18 +913,23 @@
546913 default:
547914 goto bail;
548915 }
549
- qp->s_sge.sge = wqe->sg_list[0];
550
- qp->s_sge.sg_list = wqe->sg_list + 1;
551
- qp->s_sge.num_sge = wqe->wr.num_sge;
552
- qp->s_sge.total_len = wqe->length;
553
- qp->s_len = wqe->length;
916
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
917
+ qp->s_sge.sge = wqe->sg_list[0];
918
+ qp->s_sge.sg_list = wqe->sg_list + 1;
919
+ qp->s_sge.num_sge = wqe->wr.num_sge;
920
+ qp->s_sge.total_len = wqe->length;
921
+ qp->s_len = wqe->length;
922
+ }
554923 if (newreq) {
555924 qp->s_tail++;
556925 if (qp->s_tail >= qp->s_size)
557926 qp->s_tail = 0;
558927 }
559
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
928
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
929
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
560930 qp->s_psn = wqe->lpsn + 1;
931
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
932
+ qp->s_psn = req->s_next_psn;
561933 else
562934 qp->s_psn++;
563935 break;
....@@ -573,10 +945,10 @@
573945 * See restart_rc().
574946 */
575947 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
576
- /* FALLTHROUGH */
948
+ fallthrough;
577949 case OP(SEND_FIRST):
578950 qp->s_state = OP(SEND_MIDDLE);
579
- /* FALLTHROUGH */
951
+ fallthrough;
580952 case OP(SEND_MIDDLE):
581953 bth2 = mask_psn(qp->s_psn++);
582954 ss = &qp->s_sge;
....@@ -618,10 +990,10 @@
618990 * See restart_rc().
619991 */
620992 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
621
- /* FALLTHROUGH */
993
+ fallthrough;
622994 case OP(RDMA_WRITE_FIRST):
623995 qp->s_state = OP(RDMA_WRITE_MIDDLE);
624
- /* FALLTHROUGH */
996
+ fallthrough;
625997 case OP(RDMA_WRITE_MIDDLE):
626998 bth2 = mask_psn(qp->s_psn++);
627999 ss = &qp->s_sge;
....@@ -674,10 +1046,137 @@
6741046 if (qp->s_cur == qp->s_size)
6751047 qp->s_cur = 0;
6761048 break;
1049
+
1050
+ case TID_OP(WRITE_RESP):
1051
+ /*
1052
+ * This value for s_state is used for restarting a TID RDMA
1053
+ * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE)
1054
+ * for more.
1055
+ */
1056
+ req = wqe_to_tid_req(wqe);
1057
+ req->state = TID_REQUEST_RESEND;
1058
+ rcu_read_lock();
1059
+ remote = rcu_dereference(priv->tid_rdma.remote);
1060
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
1061
+ len = wqe->length - (req->comp_seg * remote->max_len);
1062
+ rcu_read_unlock();
1063
+
1064
+ bth2 = mask_psn(qp->s_psn);
1065
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
1066
+ &bth2, &len);
1067
+ qp->s_psn = wqe->lpsn + 1;
1068
+ ss = NULL;
1069
+ qp->s_state = TID_OP(WRITE_REQ);
1070
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
1071
+ priv->s_tid_cur = qp->s_cur;
1072
+ if (++qp->s_cur == qp->s_size)
1073
+ qp->s_cur = 0;
1074
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
1075
+ wqe->psn, wqe->lpsn, req);
1076
+ break;
1077
+
1078
+ case TID_OP(READ_RESP):
1079
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
1080
+ goto bail;
1081
+ /* This is used to restart a TID read request */
1082
+ req = wqe_to_tid_req(wqe);
1083
+ wpriv = wqe->priv;
1084
+ /*
1085
+ * Back down. The field qp->s_psn has been set to the psn with
1086
+ * which the request should be restarted. It's OK to use division
1087
+ * as this is on the retry path.
1088
+ */
1089
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
1090
+
1091
+ /*
1092
+ * The following function needs to be redefined to return the
1093
+ * status to make sure that we find the flow. At the same
1094
+ * time, we can use the req->state change to check if the
1095
+ * call succeeds or not.
1096
+ */
1097
+ req->state = TID_REQUEST_RESEND;
1098
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
1099
+ if (req->state != TID_REQUEST_ACTIVE) {
1100
+ /*
1101
+ * Failed to find the flow. Release all allocated tid
1102
+ * resources.
1103
+ */
1104
+ hfi1_kern_exp_rcv_clear_all(req);
1105
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
1106
+
1107
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
1108
+ goto bail;
1109
+ }
1110
+ req->state = TID_REQUEST_RESEND;
1111
+ len = min_t(u32, req->seg_len,
1112
+ wqe->length - req->seg_len * req->cur_seg);
1113
+ flow = &req->flows[req->flow_idx];
1114
+ len -= flow->sent;
1115
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
1116
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
1117
+ &bth2, &len);
1118
+ if (delta <= 0) {
1119
+ /* Wait for TID space */
1120
+ goto bail;
1121
+ }
1122
+ hwords += delta;
1123
+ ss = &wpriv->ss;
1124
+ /* Check if this is the last segment */
1125
+ if (req->cur_seg >= req->total_segs &&
1126
+ ++qp->s_cur == qp->s_size)
1127
+ qp->s_cur = 0;
1128
+ qp->s_psn = req->s_next_psn;
1129
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1130
+ wqe->psn, wqe->lpsn, req);
1131
+ break;
1132
+ case TID_OP(READ_REQ):
1133
+ req = wqe_to_tid_req(wqe);
1134
+ delta = cmp_psn(qp->s_psn, wqe->psn);
1135
+ /*
1136
+ * If the current WR is not TID RDMA READ, or this is the start
1137
+ * of a new request, we need to change the qp->s_state so that
1138
+ * the request can be set up properly.
1139
+ */
1140
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
1141
+ qp->s_cur == qp->s_tail) {
1142
+ qp->s_state = OP(RDMA_READ_REQUEST);
1143
+ if (delta == 0 || qp->s_cur == qp->s_tail)
1144
+ goto check_s_state;
1145
+ else
1146
+ goto bail;
1147
+ }
1148
+
1149
+ /* Rate limiting */
1150
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
1151
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
1152
+ goto bail;
1153
+ }
1154
+
1155
+ wpriv = wqe->priv;
1156
+ /* Read one segment at a time */
1157
+ len = min_t(u32, req->seg_len,
1158
+ wqe->length - req->seg_len * req->cur_seg);
1159
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
1160
+ &bth2, &len);
1161
+ if (delta <= 0) {
1162
+ /* Wait for TID space */
1163
+ goto bail;
1164
+ }
1165
+ hwords += delta;
1166
+ ss = &wpriv->ss;
1167
+ /* Check if this is the last segment */
1168
+ if (req->cur_seg >= req->total_segs &&
1169
+ ++qp->s_cur == qp->s_size)
1170
+ qp->s_cur = 0;
1171
+ qp->s_psn = req->s_next_psn;
1172
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1173
+ wqe->psn, wqe->lpsn, req);
1174
+ break;
6771175 }
6781176 qp->s_sending_hpsn = bth2;
6791177 delta = delta_psn(bth2, wqe->psn);
680
- if (delta && delta % HFI1_PSN_CREDIT == 0)
1178
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
1179
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
6811180 bth2 |= IB_BTH_REQ_ACK;
6821181 if (qp->s_flags & RVT_S_SEND_ONE) {
6831182 qp->s_flags &= ~RVT_S_SEND_ONE;
....@@ -693,6 +1192,7 @@
6931192 qp,
6941193 ohdr,
6951194 bth0 | (qp->s_state << 24),
1195
+ bth1,
6961196 bth2,
6971197 middle,
6981198 ps);
....@@ -709,6 +1209,12 @@
7091209 bail_no_tx:
7101210 ps->s_txreq = NULL;
7111211 qp->s_flags &= ~RVT_S_BUSY;
1212
+ /*
1213
+ * If we didn't get a txreq, the QP will be woken up later to try
1214
+ * again. Set the flags to indicate which work item to wake
1215
+ * up.
1216
+ */
1217
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
7121218 return 0;
7131219 }
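Note: throughout hfi1_make_rc_req() the bth2 word is built from mask_psn() values and checked with cmp_psn()/delta_psn(), which treat PSNs as a circular space so that comparisons survive wrap-around during resends. A sketch of that style of comparison, assuming the 24-bit PSN width of the IB BTH (the driver's actual mask width may differ):

        static inline int psn_cmp24(u32 a, u32 b)
        {
                /* subtract, then sign-extend the low 24 bits */
                return (((int)a - (int)b) << 8) >> 8;   /* < 0: a is older than b */
        }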
7141220
....@@ -796,6 +1302,11 @@
7961302 if (qp->s_mig_state == IB_MIG_MIGRATED)
7971303 bth0 |= IB_BTH_MIG_REQ;
7981304 bth1 = (!!is_fecn) << IB_BECN_SHIFT;
1305
+ /*
1306
+ * Inline ACKs go out without the use of the Verbs send engine, so
1307
+ * we need to set the STL Verbs Extended bit here
1308
+ */
1309
+ bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
7991310 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
8001311 }
8011312
....@@ -936,6 +1447,48 @@
9361447 }
9371448
9381449 /**
1450
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
1451
+ * @qp: the QP
1452
+ * @psn: the packet sequence number to restart at
1453
+ * @wqe: the wqe
1454
+ *
1455
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
1456
+ * for the current wqe.
1457
+ * Called at interrupt level with the QP s_lock held.
1458
+ */
1459
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
1460
+ struct rvt_swqe *wqe)
1461
+{
1462
+ u32 opcode = wqe->wr.opcode;
1463
+
1464
+ if (opcode == IB_WR_RDMA_READ ||
1465
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1466
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1467
+ qp->s_num_rd_atomic++;
1468
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
1469
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1470
+ struct hfi1_qp_priv *priv = qp->priv;
1471
+
1472
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
1473
+ u32 cur_seg;
1474
+
1475
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
1476
+ req->ack_pending = cur_seg - req->comp_seg;
1477
+ priv->pending_tid_r_segs += req->ack_pending;
1478
+ qp->s_num_rd_atomic += req->ack_pending;
1479
+ trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
1480
+ wqe->wr.opcode,
1481
+ wqe->psn,
1482
+ wqe->lpsn,
1483
+ req);
1484
+ } else {
1485
+ priv->pending_tid_r_segs += req->total_segs;
1486
+ qp->s_num_rd_atomic += req->total_segs;
1487
+ }
1488
+ }
1489
+}
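Note: for a TID RDMA READ, update_num_rd_atomic() above converts the restart PSN into a segment count using priv->pkts_ps packets per segment. A worked example with made-up numbers:

        /* hypothetical values, applying the formula above */
        u32 wqe_psn = 100, restart_psn = 110, pkts_ps = 4, comp_seg = 1;
        u32 cur_seg = (restart_psn - wqe_psn) / pkts_ps;  /* = 2 */
        u32 ack_pending = cur_seg - comp_seg;             /* = 1, still counted
                                                             in s_num_rd_atomic */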
1490
+
1491
+/**
9391492 * reset_psn - reset the QP state to send starting from PSN
9401493 * @qp: the QP
9411494 * @psn: the packet sequence number to restart at
....@@ -949,9 +1502,13 @@
9491502 u32 n = qp->s_acked;
9501503 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
9511504 u32 opcode;
1505
+ struct hfi1_qp_priv *priv = qp->priv;
9521506
9531507 lockdep_assert_held(&qp->s_lock);
9541508 qp->s_cur = n;
1509
+ priv->pending_tid_r_segs = 0;
1510
+ priv->pending_tid_w_resp = 0;
1511
+ qp->s_num_rd_atomic = 0;
9551512
9561513 /*
9571514 * If we are starting the request from the beginning,
....@@ -961,9 +1518,9 @@
9611518 qp->s_state = OP(SEND_LAST);
9621519 goto done;
9631520 }
1521
+ update_num_rd_atomic(qp, psn, wqe);
9641522
9651523 /* Find the work request opcode corresponding to the given PSN. */
966
- opcode = wqe->wr.opcode;
9671524 for (;;) {
9681525 int diff;
9691526
....@@ -973,8 +1530,11 @@
9731530 break;
9741531 wqe = rvt_get_swqe_ptr(qp, n);
9751532 diff = cmp_psn(psn, wqe->psn);
976
- if (diff < 0)
1533
+ if (diff < 0) {
1534
+ /* Point wqe back to the previous one*/
1535
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
9771536 break;
1537
+ }
9781538 qp->s_cur = n;
9791539 /*
9801540 * If we are starting the request from the beginning,
....@@ -984,8 +1544,10 @@
9841544 qp->s_state = OP(SEND_LAST);
9851545 goto done;
9861546 }
987
- opcode = wqe->wr.opcode;
1547
+
1548
+ update_num_rd_atomic(qp, psn, wqe);
9881549 }
1550
+ opcode = wqe->wr.opcode;
9891551
9901552 /*
9911553 * Set the state to restart in the middle of a request.
....@@ -1003,8 +1565,16 @@
10031565 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
10041566 break;
10051567
1568
+ case IB_WR_TID_RDMA_WRITE:
1569
+ qp->s_state = TID_OP(WRITE_RESP);
1570
+ break;
1571
+
10061572 case IB_WR_RDMA_READ:
10071573 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1574
+ break;
1575
+
1576
+ case IB_WR_TID_RDMA_READ:
1577
+ qp->s_state = TID_OP(READ_RESP);
10081578 break;
10091579
10101580 default:
....@@ -1015,6 +1585,7 @@
10151585 qp->s_state = OP(SEND_LAST);
10161586 }
10171587 done:
1588
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
10181589 qp->s_psn = psn;
10191590 /*
10201591 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
....@@ -1025,6 +1596,7 @@
10251596 (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
10261597 qp->s_flags |= RVT_S_WAIT_PSN;
10271598 qp->s_flags &= ~HFI1_S_AHG_VALID;
1599
+ trace_hfi1_sender_reset_psn(qp);
10281600 }
10291601
10301602 /*
....@@ -1033,18 +1605,47 @@
10331605 */
10341606 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
10351607 {
1608
+ struct hfi1_qp_priv *priv = qp->priv;
10361609 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
10371610 struct hfi1_ibport *ibp;
10381611
10391612 lockdep_assert_held(&qp->r_lock);
10401613 lockdep_assert_held(&qp->s_lock);
1614
+ trace_hfi1_sender_restart_rc(qp);
10411615 if (qp->s_retry == 0) {
10421616 if (qp->s_mig_state == IB_MIG_ARMED) {
10431617 hfi1_migrate_qp(qp);
10441618 qp->s_retry = qp->s_retry_cnt;
10451619 } else if (qp->s_last == qp->s_acked) {
1046
- hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
1047
- rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1620
+ /*
1621
+ * We need special handling for the OPFN request WQEs as
1622
+ * they are not allowed to generate real user errors
1623
+ */
1624
+ if (wqe->wr.opcode == IB_WR_OPFN) {
1625
+ struct hfi1_ibport *ibp =
1626
+ to_iport(qp->ibqp.device, qp->port_num);
1627
+ /*
1628
+ * Call opfn_conn_reply() with capcode and
1629
+ * remaining data as 0 to close out the
1630
+ * current request
1631
+ */
1632
+ opfn_conn_reply(qp, priv->opfn.curr);
1633
+ wqe = do_rc_completion(qp, wqe, ibp);
1634
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
1635
+ } else {
1636
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
1637
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
1638
+ struct tid_rdma_request *req;
1639
+
1640
+ req = wqe_to_tid_req(wqe);
1641
+ hfi1_kern_exp_rcv_clear_all(req);
1642
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
1643
+ }
1644
+
1645
+ hfi1_trdma_send_complete(qp, wqe,
1646
+ IB_WC_RETRY_EXC_ERR);
1647
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1648
+ }
10481649 return;
10491650 } else { /* need to handle delayed completion */
10501651 return;
....@@ -1054,14 +1655,15 @@
10541655 }
10551656
10561657 ibp = to_iport(qp->ibqp.device, qp->port_num);
1057
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
1658
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1659
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
10581660 ibp->rvp.n_rc_resends++;
10591661 else
10601662 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
10611663
10621664 qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
10631665 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1064
- RVT_S_WAIT_ACK);
1666
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
10651667 if (wait)
10661668 qp->s_flags |= RVT_S_SEND_ONE;
10671669 reset_psn(qp, psn);
....@@ -1069,7 +1671,8 @@
10691671
10701672 /*
10711673 * Set qp->s_sending_psn to the next PSN after the given one.
1072
- * This would be psn+1 except when RDMA reads are present.
1674
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
1675
+ * are present.
10731676 */
10741677 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
10751678 {
....@@ -1081,7 +1684,9 @@
10811684 for (;;) {
10821685 wqe = rvt_get_swqe_ptr(qp, n);
10831686 if (cmp_psn(psn, wqe->lpsn) <= 0) {
1084
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
1687
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1688
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
1689
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
10851690 qp->s_sending_psn = wqe->lpsn + 1;
10861691 else
10871692 qp->s_sending_psn = psn + 1;
....@@ -1094,6 +1699,36 @@
10941699 }
10951700 }
10961701
1702
+/**
1703
+ * hfi1_rc_verbs_aborted - handle abort status
1704
+ * @qp: the QP
1705
+ * @opah: the opa header
1706
+ *
1707
+ * This code modifies both the ACK bit in BTH[2]
1708
+ * and the s_flags to go into send one mode.
1709
+ *
1710
+ * This serves to throttle the send engine to only
1711
+ * send a single packet in the likely case that
1712
+ * a link has gone down.
1713
+ */
1714
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1715
+{
1716
+ struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
1717
+ u8 opcode = ib_bth_get_opcode(ohdr);
1718
+ u32 psn;
1719
+
1720
+ /* ignore responses */
1721
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1722
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1723
+ opcode == TID_OP(READ_RESP) ||
1724
+ opcode == TID_OP(WRITE_RESP))
1725
+ return;
1726
+
1727
+ psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
1728
+ ohdr->bth[2] = cpu_to_be32(psn);
1729
+ qp->s_flags |= RVT_S_SEND_ONE;
1730
+}
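Note: hfi1_rc_verbs_aborted() above patches the already-built header and puts the QP into send-one mode: RVT_S_SEND_ONE makes the next hfi1_make_rc_req() pass emit a single packet and then wait for an ACK (see the RVT_S_SEND_ONE handling earlier in this file). The bth[2] rewrite in isolation (illustrative, same calls as above):

        u32 psn = ib_bth_get_psn(ohdr);                     /* PSN bits of bth[2] */
        ohdr->bth[2] = cpu_to_be32(psn | IB_BTH_REQ_ACK);   /* set the AckReq bit */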
1731
+
10971732 /*
10981733 * This should be called with the QP s_lock held and interrupts disabled.
10991734 */
....@@ -1102,71 +1737,104 @@
11021737 struct ib_other_headers *ohdr;
11031738 struct hfi1_qp_priv *priv = qp->priv;
11041739 struct rvt_swqe *wqe;
1105
- struct ib_header *hdr = NULL;
1106
- struct hfi1_16b_header *hdr_16b = NULL;
1107
- u32 opcode;
1740
+ u32 opcode, head, tail;
11081741 u32 psn;
1742
+ struct tid_rdma_request *req;
11091743
11101744 lockdep_assert_held(&qp->s_lock);
11111745 if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
11121746 return;
11131747
1114
- /* Find out where the BTH is */
1115
- if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
1116
- hdr = &opah->ibh;
1117
- if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
1118
- ohdr = &hdr->u.oth;
1119
- else
1120
- ohdr = &hdr->u.l.oth;
1121
- } else {
1122
- u8 l4;
1123
-
1124
- hdr_16b = &opah->opah;
1125
- l4 = hfi1_16B_get_l4(hdr_16b);
1126
- if (l4 == OPA_16B_L4_IB_LOCAL)
1127
- ohdr = &hdr_16b->u.oth;
1128
- else
1129
- ohdr = &hdr_16b->u.l.oth;
1130
- }
1131
-
1748
+ ohdr = hfi1_get_rc_ohdr(opah);
11321749 opcode = ib_bth_get_opcode(ohdr);
1133
- if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1134
- opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1750
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1751
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1752
+ opcode == TID_OP(READ_RESP) ||
1753
+ opcode == TID_OP(WRITE_RESP)) {
11351754 WARN_ON(!qp->s_rdma_ack_cnt);
11361755 qp->s_rdma_ack_cnt--;
11371756 return;
11381757 }
11391758
11401759 psn = ib_bth_get_psn(ohdr);
1141
- reset_sending_psn(qp, psn);
1760
+ /*
1761
+ * Don't attempt to reset the sending PSN for packets in the
1762
+ * KDETH PSN space since the PSN does not match anything.
1763
+ */
1764
+ if (opcode != TID_OP(WRITE_DATA) &&
1765
+ opcode != TID_OP(WRITE_DATA_LAST) &&
1766
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
1767
+ reset_sending_psn(qp, psn);
1768
+
1769
+ /* Handle TID RDMA WRITE packets differently */
1770
+ if (opcode >= TID_OP(WRITE_REQ) &&
1771
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
1772
+ head = priv->s_tid_head;
1773
+ tail = priv->s_tid_cur;
1774
+ /*
1775
+ * s_tid_cur is set to s_tid_head in the case where
1776
+ * a new TID RDMA request is being started and all
1777
+ * previous ones have been completed.
1778
+ * Therefore, we need to do a secondary check in order
1779
+ * to properly determine whether we should start the
1780
+ * RC timer.
1781
+ */
1782
+ wqe = rvt_get_swqe_ptr(qp, tail);
1783
+ req = wqe_to_tid_req(wqe);
1784
+ if (head == tail && req->comp_seg < req->total_segs) {
1785
+ if (tail == 0)
1786
+ tail = qp->s_size - 1;
1787
+ else
1788
+ tail -= 1;
1789
+ }
1790
+ } else {
1791
+ head = qp->s_tail;
1792
+ tail = qp->s_acked;
1793
+ }
11421794
11431795 /*
11441796 * Start timer after a packet requesting an ACK has been sent and
11451797 * there are still requests that haven't been acked.
11461798 */
1147
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1799
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
1800
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
1801
+ opcode != TID_OP(RESYNC) &&
11481802 !(qp->s_flags &
1149
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1150
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1151
- rvt_add_retry_timer(qp);
1803
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1804
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1805
+ if (opcode == TID_OP(READ_REQ))
1806
+ rvt_add_retry_timer_ext(qp, priv->timeout_shift);
1807
+ else
1808
+ rvt_add_retry_timer(qp);
1809
+ }
1810
+
1811
+ /* Start TID RDMA ACK timer */
1812
+ if ((opcode == TID_OP(WRITE_DATA) ||
1813
+ opcode == TID_OP(WRITE_DATA_LAST) ||
1814
+ opcode == TID_OP(RESYNC)) &&
1815
+ (psn & IB_BTH_REQ_ACK) &&
1816
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
1817
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1818
+ /*
1819
+ * The TID RDMA ACK packet could be received before this
1820
+ * function is called. Therefore, add the timer only if TID
1821
+ * RDMA ACK packets are actually pending.
1822
+ */
1823
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1824
+ req = wqe_to_tid_req(wqe);
1825
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
1826
+ req->ack_seg < req->cur_seg)
1827
+ hfi1_add_tid_retry_timer(qp);
1828
+ }
11521829
11531830 while (qp->s_last != qp->s_acked) {
1154
- u32 s_last;
1155
-
11561831 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
11571832 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
11581833 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
11591834 break;
1160
- rvt_qp_wqe_unreserve(qp, wqe);
1161
- s_last = qp->s_last;
1162
- trace_hfi1_qp_send_completion(qp, wqe, s_last);
1163
- if (++s_last >= qp->s_size)
1164
- s_last = 0;
1165
- qp->s_last = s_last;
1166
- /* see post_send() */
1167
- barrier();
1168
- rvt_put_swqe(wqe);
1169
- rvt_qp_swqe_complete(qp,
1835
+ trdma_clean_swqe(qp, wqe);
1836
+ trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1837
+ rvt_qp_complete_swqe(qp,
11701838 wqe,
11711839 ib_hfi1_wc_opcode[wqe->wr.opcode],
11721840 IB_WC_SUCCESS);
....@@ -1195,30 +1863,24 @@
11951863 * This is similar to hfi1_send_complete but has to check to be sure
11961864 * that the SGEs are not being referenced if the SWQE is being resent.
11971865 */
1198
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1199
- struct rvt_swqe *wqe,
1200
- struct hfi1_ibport *ibp)
1866
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1867
+ struct rvt_swqe *wqe,
1868
+ struct hfi1_ibport *ibp)
12011869 {
1870
+ struct hfi1_qp_priv *priv = qp->priv;
1871
+
12021872 lockdep_assert_held(&qp->s_lock);
12031873 /*
12041874 * Don't decrement refcount and don't generate a
12051875 * completion if the SWQE is being resent until the send
12061876 * is finished.
12071877 */
1878
+ trace_hfi1_rc_completion(qp, wqe->lpsn);
12081879 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
12091880 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1210
- u32 s_last;
1211
-
1212
- rvt_put_swqe(wqe);
1213
- rvt_qp_wqe_unreserve(qp, wqe);
1214
- s_last = qp->s_last;
1215
- trace_hfi1_qp_send_completion(qp, wqe, s_last);
1216
- if (++s_last >= qp->s_size)
1217
- s_last = 0;
1218
- qp->s_last = s_last;
1219
- /* see post_send() */
1220
- barrier();
1221
- rvt_qp_swqe_complete(qp,
1881
+ trdma_clean_swqe(qp, wqe);
1882
+ trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1883
+ rvt_qp_complete_swqe(qp,
12221884 wqe,
12231885 ib_hfi1_wc_opcode[wqe->wr.opcode],
12241886 IB_WC_SUCCESS);
....@@ -1243,7 +1905,16 @@
12431905 }
12441906
12451907 qp->s_retry = qp->s_retry_cnt;
1246
- update_last_psn(qp, wqe->lpsn);
1908
+ /*
1909
+ * Don't update the last PSN if the request being completed is
1910
+ * a TID RDMA WRITE request.
1911
+ * Completion of the TID RDMA WRITE requests are done by the
1912
+ * TID RDMA ACKs and as such could be for a request that has
1913
+ * already been ACKed as far as the IB state machine is
1914
+ * concerned.
1915
+ */
1916
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
1917
+ update_last_psn(qp, wqe->lpsn);
12471918
12481919 /*
12491920 * If we are completing a request which is in the process of
....@@ -1266,7 +1937,59 @@
12661937 qp->s_draining = 0;
12671938 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
12681939 }
1940
+ if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
1941
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
1942
+ hfi1_schedule_send(qp);
1943
+ }
12691944 return wqe;
1945
+}
1946
+
1947
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
1948
+{
1949
+ /* Retry this request. */
1950
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1951
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
1952
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1953
+ if (list_empty(&qp->rspwait)) {
1954
+ qp->r_flags |= RVT_R_RSP_SEND;
1955
+ rvt_get_qp(qp);
1956
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1957
+ }
1958
+ }
1959
+}
1960
+
1961
+/**
1962
+ * update_qp_retry_state - Update qp retry state.
1963
+ * @qp: the QP
1964
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
1965
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
1966
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
1967
+ *
1968
+ * This function is called to update the qp retry state upon
1969
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
1970
+ * a request.
1971
+ */
1972
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
1973
+ u32 lpsn)
1974
+{
1975
+ struct hfi1_qp_priv *qpriv = qp->priv;
1976
+
1977
+ qp->s_psn = psn + 1;
1978
+ /*
1979
+ * If this is the first TID RDMA WRITE RESP packet for the current
1980
+ * request, change the s_state so that the retry will be processed
1981
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
1982
+ * packet, change the s_state and advance the s_cur.
1983
+ */
1984
+ if (cmp_psn(psn, lpsn) >= 0) {
1985
+ qp->s_cur = qpriv->s_tid_cur + 1;
1986
+ if (qp->s_cur >= qp->s_size)
1987
+ qp->s_cur = 0;
1988
+ qp->s_state = TID_OP(WRITE_REQ);
1989
+ } else if (!cmp_psn(psn, spsn)) {
1990
+ qp->s_cur = qpriv->s_tid_cur;
1991
+ qp->s_state = TID_OP(WRITE_RESP);
1992
+ }
12701993 }
12711994
12721995 /**
....@@ -1280,15 +2003,17 @@
12802003 * May be called at interrupt level, with the QP s_lock held.
12812004 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
12822005 */
1283
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1284
- u64 val, struct hfi1_ctxtdata *rcd)
2006
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
2007
+ u64 val, struct hfi1_ctxtdata *rcd)
12852008 {
12862009 struct hfi1_ibport *ibp;
12872010 enum ib_wc_status status;
2011
+ struct hfi1_qp_priv *qpriv = qp->priv;
12882012 struct rvt_swqe *wqe;
12892013 int ret = 0;
12902014 u32 ack_psn;
12912015 int diff;
2016
+ struct rvt_dev_info *rdi;
12922017
12932018 lockdep_assert_held(&qp->s_lock);
12942019 /*
....@@ -1331,20 +2056,14 @@
13312056 */
13322057 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
13332058 (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
2059
+ (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
2060
+ (opcode != TID_OP(READ_RESP) || diff != 0)) ||
13342061 ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
13352062 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1336
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1337
- /* Retry this request. */
1338
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1339
- qp->r_flags |= RVT_R_RDMAR_SEQ;
1340
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1341
- if (list_empty(&qp->rspwait)) {
1342
- qp->r_flags |= RVT_R_RSP_SEND;
1343
- rvt_get_qp(qp);
1344
- list_add_tail(&qp->rspwait,
1345
- &rcd->qp_wait_list);
1346
- }
1347
- }
2063
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
2064
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2065
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
2066
+ set_restart_qp(qp, rcd);
13482067 /*
13492068 * No need to process the ACK/NAK since we are
13502069 * restarting an earlier request.
....@@ -1356,6 +2075,9 @@
13562075 u64 *vaddr = wqe->sg_list[0].vaddr;
13572076 *vaddr = val;
13582077 }
2078
+ if (wqe->wr.opcode == IB_WR_OPFN)
2079
+ opfn_conn_reply(qp, val);
2080
+
13592081 if (qp->s_num_rd_atomic &&
13602082 (wqe->wr.opcode == IB_WR_RDMA_READ ||
13612083 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
....@@ -1373,26 +2095,85 @@
13732095 hfi1_schedule_send(qp);
13742096 }
13752097 }
2098
+
2099
+ /*
2100
+ * TID RDMA WRITE requests will be completed by the TID RDMA
2101
+ * ACK packet handler (see tid_rdma.c).
2102
+ */
2103
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
2104
+ break;
2105
+
13762106 wqe = do_rc_completion(qp, wqe, ibp);
13772107 if (qp->s_acked == qp->s_tail)
13782108 break;
13792109 }
13802110
2111
+ trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
2112
+ trace_hfi1_sender_do_rc_ack(qp);
13812113 switch (aeth >> IB_AETH_NAK_SHIFT) {
13822114 case 0: /* ACK */
13832115 this_cpu_inc(*ibp->rvp.rc_acks);
1384
- if (qp->s_acked != qp->s_tail) {
2116
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2117
+ if (wqe_to_tid_req(wqe)->ack_pending)
2118
+ rvt_mod_retry_timer_ext(qp,
2119
+ qpriv->timeout_shift);
2120
+ else
2121
+ rvt_stop_rc_timers(qp);
2122
+ } else if (qp->s_acked != qp->s_tail) {
2123
+ struct rvt_swqe *__w = NULL;
2124
+
2125
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
2126
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
2127
+
13852128 /*
1386
- * We are expecting more ACKs so
1387
- * mod the retry timer.
2129
+ * Stop timers if we've received all of the TID RDMA
2130
+ * WRITE responses.
13882131 */
1389
- rvt_mod_retry_timer(qp);
1390
- /*
1391
- * We can stop re-sending the earlier packets and
1392
- * continue with the next packet the receiver wants.
1393
- */
1394
- if (cmp_psn(qp->s_psn, psn) <= 0)
1395
- reset_psn(qp, psn + 1);
2132
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2133
+ opcode == TID_OP(WRITE_RESP)) {
2134
+ /*
2135
+ * Normally, the loop above would correctly
2136
+ * process all WQEs from s_acked onward and
2137
+ * either complete them or check for correct
2138
+ * PSN sequencing.
2139
+ * However, for TID RDMA, due to pipelining,
2140
+ * the response may not be for the request at
2141
+ * s_acked so the above loop would just be
2142
+ * skipped. This does not allow for checking
2143
+ * the PSN sequencing. It has to be done
2144
+ * separately.
2145
+ */
2146
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
2147
+ set_restart_qp(qp, rcd);
2148
+ goto bail_stop;
2149
+ }
2150
+ /*
2151
+ * If the psn is being resent, stop the
2152
+ * resending.
2153
+ */
2154
+ if (qp->s_cur != qp->s_tail &&
2155
+ cmp_psn(qp->s_psn, psn) <= 0)
2156
+ update_qp_retry_state(qp, psn,
2157
+ __w->psn,
2158
+ __w->lpsn);
2159
+ else if (--qpriv->pending_tid_w_resp)
2160
+ rvt_mod_retry_timer(qp);
2161
+ else
2162
+ rvt_stop_rc_timers(qp);
2163
+ } else {
2164
+ /*
2165
+ * We are expecting more ACKs so
2166
+ * mod the retry timer.
2167
+ */
2168
+ rvt_mod_retry_timer(qp);
2169
+ /*
2170
+ * We can stop re-sending the earlier packets
2171
+ * and continue with the next packet the
2172
+ * receiver wants.
2173
+ */
2174
+ if (cmp_psn(qp->s_psn, psn) <= 0)
2175
+ reset_psn(qp, psn + 1);
2176
+ }
13962177 } else {
13972178 /* No more acks - kill all timers */
13982179 rvt_stop_rc_timers(qp);
....@@ -1408,6 +2189,15 @@
14082189 rvt_get_credit(qp, aeth);
14092190 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
14102191 qp->s_retry = qp->s_retry_cnt;
2192
+ /*
2193
+ * If the current request is a TID RDMA WRITE request and the
2194
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
2195
+ * can't be advanced.
2196
+ */
2197
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
2198
+ opcode != TID_OP(WRITE_RESP) &&
2199
+ cmp_psn(psn, wqe->psn) >= 0)
2200
+ return 1;
14112201 update_last_psn(qp, psn);
14122202 return 1;
14132203
....@@ -1417,20 +2207,31 @@
14172207 goto bail_stop;
14182208 if (qp->s_flags & RVT_S_WAIT_RNR)
14192209 goto bail_stop;
1420
- if (qp->s_rnr_retry == 0) {
1421
- status = IB_WC_RNR_RETRY_EXC_ERR;
1422
- goto class_b;
2210
+ rdi = ib_to_rvt(qp->ibqp.device);
2211
+ if (!(rdi->post_parms[wqe->wr.opcode].flags &
2212
+ RVT_OPERATION_IGN_RNR_CNT)) {
2213
+ if (qp->s_rnr_retry == 0) {
2214
+ status = IB_WC_RNR_RETRY_EXC_ERR;
2215
+ goto class_b;
2216
+ }
2217
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
2218
+ qp->s_rnr_retry--;
14232219 }
1424
- if (qp->s_rnr_retry_cnt < 7)
1425
- qp->s_rnr_retry--;
14262220
1427
- /* The last valid PSN is the previous PSN. */
1428
- update_last_psn(qp, psn - 1);
2221
+ /*
2222
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
2223
+ * request, s_last_psn should be incremented only when a TID
2224
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
2225
+ * WRITE RESP packets.
2226
+ */
2227
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
2228
+ reset_psn(qp, qp->s_last_psn + 1);
2229
+ } else {
2230
+ update_last_psn(qp, psn - 1);
2231
+ reset_psn(qp, psn);
2232
+ }
14292233
14302234 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1431
-
1432
- reset_psn(qp, psn);
1433
-
14342235 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
14352236 rvt_stop_rc_timers(qp);
14362237 rvt_add_rnr_timer(qp, aeth);
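Note: in the RNR NAK arm above, operations whose post_parms carry RVT_OPERATION_IGN_RNR_CNT are exempt from RNR retry accounting, and a configured s_rnr_retry_cnt of 7 is the IBTA encoding for "retry indefinitely", which is why only finite, non-zero counts are decremented. The accounting rule as a standalone sketch (helper name invented here):

        static void rnr_account_retry(struct rvt_qp *qp, struct rvt_dev_info *rdi,
                                      struct rvt_swqe *wqe)
        {
                if (rdi->post_parms[wqe->wr.opcode].flags &
                    RVT_OPERATION_IGN_RNR_CNT)
                        return;                 /* opted out of RNR counting */
                if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
                        qp->s_rnr_retry--;      /* 7 means retry forever */
        }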
....@@ -1470,7 +2271,10 @@
14702271 ibp->rvp.n_other_naks++;
14712272 class_b:
14722273 if (qp->s_last == qp->s_acked) {
1473
- hfi1_send_complete(qp, wqe, status);
2274
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
2275
+ hfi1_kern_read_tid_flow_free(qp);
2276
+
2277
+ hfi1_trdma_send_complete(qp, wqe, status);
14742278 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
14752279 }
14762280 break;
....@@ -1511,6 +2315,8 @@
15112315
15122316 while (cmp_psn(psn, wqe->lpsn) > 0) {
15132317 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2318
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2319
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
15142320 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
15152321 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
15162322 break;
....@@ -1646,7 +2452,8 @@
16462452 qp->s_rdma_read_len -= pmtu;
16472453 update_last_psn(qp, psn);
16482454 spin_unlock_irqrestore(&qp->s_lock, flags);
1649
- hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
2455
+ rvt_copy_sge(qp, &qp->s_rdma_read_sge,
2456
+ data, pmtu, false, false);
16502457 goto bail;
16512458
16522459 case OP(RDMA_READ_RESPONSE_ONLY):
....@@ -1686,7 +2493,8 @@
16862493 if (unlikely(tlen != qp->s_rdma_read_len))
16872494 goto ack_len_err;
16882495 aeth = be32_to_cpu(ohdr->u.aeth);
1689
- hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
2496
+ rvt_copy_sge(qp, &qp->s_rdma_read_sge,
2497
+ data, tlen, false, false);
16902498 WARN_ON(qp->s_rdma_read_sge.num_sge);
16912499 (void)do_rc_ack(qp, aeth, psn,
16922500 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
....@@ -1706,23 +2514,13 @@
17062514 status = IB_WC_LOC_LEN_ERR;
17072515 ack_err:
17082516 if (qp->s_last == qp->s_acked) {
1709
- hfi1_send_complete(qp, wqe, status);
2517
+ rvt_send_complete(qp, wqe, status);
17102518 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
17112519 }
17122520 ack_done:
17132521 spin_unlock_irqrestore(&qp->s_lock, flags);
17142522 bail:
17152523 return;
1716
-}
1717
-
1718
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
1719
- struct rvt_qp *qp)
1720
-{
1721
- if (list_empty(&qp->rspwait)) {
1722
- qp->r_flags |= RVT_R_RSP_NAK;
1723
- rvt_get_qp(qp);
1724
- list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1725
- }
17262524 }
17272525
17282526 static inline void rc_cancel_ack(struct rvt_qp *qp)
....@@ -1757,8 +2555,9 @@
17572555 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
17582556 struct rvt_ack_entry *e;
17592557 unsigned long flags;
1760
- u8 i, prev;
1761
- int old_req;
2558
+ u8 prev;
2559
+ u8 mra; /* most recent ACK */
2560
+ bool old_req;
17622561
17632562 trace_hfi1_rcv_error(qp, psn);
17642563 if (diff > 0) {
....@@ -1799,34 +2598,13 @@
17992598 * to be sent before sending this one.
18002599 */
18012600 e = NULL;
1802
- old_req = 1;
2601
+ old_req = true;
18032602 ibp->rvp.n_rc_dupreq++;
18042603
18052604 spin_lock_irqsave(&qp->s_lock, flags);
18062605
1807
- for (i = qp->r_head_ack_queue; ; i = prev) {
1808
- if (i == qp->s_tail_ack_queue)
1809
- old_req = 0;
1810
- if (i)
1811
- prev = i - 1;
1812
- else
1813
- prev = HFI1_MAX_RDMA_ATOMIC;
1814
- if (prev == qp->r_head_ack_queue) {
1815
- e = NULL;
1816
- break;
1817
- }
1818
- e = &qp->s_ack_queue[prev];
1819
- if (!e->opcode) {
1820
- e = NULL;
1821
- break;
1822
- }
1823
- if (cmp_psn(psn, e->psn) >= 0) {
1824
- if (prev == qp->s_tail_ack_queue &&
1825
- cmp_psn(psn, e->lpsn) <= 0)
1826
- old_req = 0;
1827
- break;
1828
- }
1829
- }
2606
+ e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
2607
+
18302608 switch (opcode) {
18312609 case OP(RDMA_READ_REQUEST): {
18322610 struct ib_reth *reth;
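The open-coded loop removed above (old lines 1807-1829) walked the ack queue backwards from r_head_ack_queue to find the entry whose PSN range covers a duplicate request, and to decide whether that request is older than the one currently being answered; the patch folds this logic into find_prev_entry(). A rough standalone model of the same backward walk over a small ring follows; ack_entry_model, psn_cmp() and find_covering_entry() are illustrative names, and the fixed RING_SLOTS merely stands in for rvt_size_atomic() + 1.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define RING_SLOTS 4	/* stand-in for rvt_size_atomic() + 1 entries */

struct ack_entry_model { uint8_t opcode; uint32_t psn; uint32_t lpsn; };

/* Sign of the 24-bit circular difference, same idea as cmp_psn(). */
static int psn_cmp(uint32_t a, uint32_t b)
{
	return ((int32_t)((a - b) << 8)) >> 8;
}

/*
 * Walk backwards from 'head' until an entry covering 'psn' is found.
 * Returns that entry (or NULL) and reports via *old_req whether the
 * duplicate is older than the entry at 'tail' currently being answered.
 */
static struct ack_entry_model *
find_covering_entry(struct ack_entry_model *q, uint8_t head, uint8_t tail,
		    uint32_t psn, bool *old_req)
{
	uint8_t i, prev;

	*old_req = true;
	for (i = head; ; i = prev) {
		if (i == tail)
			*old_req = false;
		prev = i ? i - 1 : RING_SLOTS - 1;
		if (prev == head || !q[prev].opcode)
			return NULL;
		if (psn_cmp(psn, q[prev].psn) >= 0) {
			if (prev == tail && psn_cmp(psn, q[prev].lpsn) <= 0)
				*old_req = false;
			return &q[prev];
		}
	}
}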
....@@ -1852,10 +2630,7 @@
18522630 len = be32_to_cpu(reth->length);
18532631 if (unlikely(offset + len != e->rdma_sge.sge_length))
18542632 goto unlock_done;
1855
- if (e->rdma_sge.mr) {
1856
- rvt_put_mr(e->rdma_sge.mr);
1857
- e->rdma_sge.mr = NULL;
1858
- }
2633
+ release_rdma_sge_mr(e);
18592634 if (len != 0) {
18602635 u32 rkey = be32_to_cpu(reth->rkey);
18612636 u64 vaddr = get_ib_reth_vaddr(reth);
....@@ -1873,6 +2648,8 @@
18732648 e->psn = psn;
18742649 if (old_req)
18752650 goto unlock_done;
2651
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2652
+ qp->s_acked_ack_queue = prev;
18762653 qp->s_tail_ack_queue = prev;
18772654 break;
18782655 }
....@@ -1886,6 +2663,8 @@
18862663 */
18872664 if (!e || e->opcode != (u8)opcode || old_req)
18882665 goto unlock_done;
2666
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
2667
+ qp->s_acked_ack_queue = prev;
18892668 qp->s_tail_ack_queue = prev;
18902669 break;
18912670 }
....@@ -1901,7 +2680,7 @@
19012680 * Resend the most recent ACK if this request is
19022681 * after all the previous RDMA reads and atomics.
19032682 */
1904
- if (i == qp->r_head_ack_queue) {
2683
+ if (mra == qp->r_head_ack_queue) {
19052684 spin_unlock_irqrestore(&qp->s_lock, flags);
19062685 qp->r_nak_state = 0;
19072686 qp->r_ack_psn = qp->r_psn - 1;
....@@ -1912,7 +2691,9 @@
19122691 * Resend the RDMA read or atomic op which
19132692 * ACKs this duplicate request.
19142693 */
1915
- qp->s_tail_ack_queue = i;
2694
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
2695
+ qp->s_acked_ack_queue = mra;
2696
+ qp->s_tail_ack_queue = mra;
19162697 break;
19172698 }
19182699 qp->s_ack_state = OP(ACKNOWLEDGE);
....@@ -1927,17 +2708,6 @@
19272708
19282709 send_ack:
19292710 return 0;
1930
-}
1931
-
1932
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
1933
-{
1934
- unsigned next;
1935
-
1936
- next = n + 1;
1937
- if (next > HFI1_MAX_RDMA_ATOMIC)
1938
- next = 0;
1939
- qp->s_tail_ack_queue = next;
1940
- qp->s_ack_state = OP(ACKNOWLEDGE);
19412711 }
19422712
19432713 static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
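The update_ack_queue() helper removed above advances s_tail_ack_queue by one slot with wrap-around; elsewhere this patch sizes the queue with rvt_size_atomic() instead of the fixed HFI1_MAX_RDMA_ATOMIC. A minimal model of the advance over a ring of N + 1 slots; N_ATOMIC and ring_advance() are illustrative names, with the constant standing in for rvt_size_atomic().

#include <stdint.h>

#define N_ATOMIC 16	/* stand-in for rvt_size_atomic(); the ring has N_ATOMIC + 1 slots */

/* Advance an ack-queue index by one slot, wrapping past the last slot. */
static uint8_t ring_advance(uint8_t n)
{
	uint8_t next = n + 1;

	if (next > N_ATOMIC)	/* the queue holds N_ATOMIC + 1 entries, so > not >= */
		next = 0;
	return next;
}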
....@@ -2037,6 +2807,7 @@
20372807 void *data = packet->payload;
20382808 u32 tlen = packet->tlen;
20392809 struct rvt_qp *qp = packet->qp;
2810
+ struct hfi1_qp_priv *qpriv = qp->priv;
20402811 struct hfi1_ibport *ibp = rcd_to_iport(rcd);
20412812 struct ib_other_headers *ohdr = packet->ohdr;
20422813 u32 opcode = packet->opcode;
....@@ -2059,6 +2830,7 @@
20592830 return;
20602831
20612832 fecn = process_ecn(qp, packet);
2833
+ opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
20622834
20632835 /*
20642836 * Process responses (ACKs) before anything else. Note that the
....@@ -2128,7 +2900,7 @@
21282900 if (!ret)
21292901 goto rnr_nak;
21302902 qp->r_rcv_len = 0;
2131
- /* FALLTHROUGH */
2903
+ fallthrough;
21322904 case OP(SEND_MIDDLE):
21332905 case OP(RDMA_WRITE_MIDDLE):
21342906 send_middle:
....@@ -2143,7 +2915,7 @@
21432915 qp->r_rcv_len += pmtu;
21442916 if (unlikely(qp->r_rcv_len > qp->r_len))
21452917 goto nack_inv;
2146
- hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
2918
+ rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
21472919 break;
21482920
21492921 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
....@@ -2168,7 +2940,7 @@
21682940 goto no_immediate_data;
21692941 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
21702942 goto send_last_inv;
2171
- /* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */
2943
+ fallthrough; /* for SEND_ONLY_WITH_IMMEDIATE */
21722944 case OP(SEND_LAST_WITH_IMMEDIATE):
21732945 send_last_imm:
21742946 wc.ex.imm_data = ohdr->u.imm_data;
....@@ -2184,7 +2956,7 @@
21842956 goto send_last;
21852957 case OP(RDMA_WRITE_LAST):
21862958 copy_last = rvt_is_user_qp(qp);
2187
- /* fall through */
2959
+ fallthrough;
21882960 case OP(SEND_LAST):
21892961 no_immediate_data:
21902962 wc.wc_flags = 0;
....@@ -2199,7 +2971,7 @@
21992971 wc.byte_len = tlen + qp->r_rcv_len;
22002972 if (unlikely(wc.byte_len > qp->r_len))
22012973 goto nack_inv;
2202
- hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
2974
+ rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
22032975 rvt_put_ss(&qp->r_sge);
22042976 qp->r_msn++;
22052977 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
....@@ -2232,13 +3004,12 @@
22323004 wc.dlid_path_bits = 0;
22333005 wc.port_num = 0;
22343006 /* Signal completion event if the solicited bit is set. */
2235
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
2236
- ib_bth_is_solicited(ohdr));
3007
+ rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
22373008 break;
22383009
22393010 case OP(RDMA_WRITE_ONLY):
22403011 copy_last = rvt_is_user_qp(qp);
2241
- /* fall through */
3012
+ fallthrough;
22423013 case OP(RDMA_WRITE_FIRST):
22433014 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
22443015 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
....@@ -2290,20 +3061,17 @@
22903061 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
22913062 goto nack_inv;
22923063 next = qp->r_head_ack_queue + 1;
2293
- /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2294
- if (next > HFI1_MAX_RDMA_ATOMIC)
3064
+ /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
3065
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
22953066 next = 0;
22963067 spin_lock_irqsave(&qp->s_lock, flags);
2297
- if (unlikely(next == qp->s_tail_ack_queue)) {
3068
+ if (unlikely(next == qp->s_acked_ack_queue)) {
22983069 if (!qp->s_ack_queue[next].sent)
22993070 goto nack_inv_unlck;
23003071 update_ack_queue(qp, next);
23013072 }
23023073 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2303
- if (e->rdma_sge.mr) {
2304
- rvt_put_mr(e->rdma_sge.mr);
2305
- e->rdma_sge.mr = NULL;
2306
- }
3074
+ release_rdma_sge_mr(e);
23073075 reth = &ohdr->u.rc.reth;
23083076 len = be32_to_cpu(reth->length);
23093077 if (len) {
....@@ -2341,6 +3109,7 @@
23413109 qp->r_state = opcode;
23423110 qp->r_nak_state = 0;
23433111 qp->r_head_ack_queue = next;
3112
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
23443113
23453114 /* Schedule the send engine. */
23463115 qp->s_flags |= RVT_S_RESP_PENDING;
....@@ -2354,32 +3123,35 @@
23543123
23553124 case OP(COMPARE_SWAP):
23563125 case OP(FETCH_ADD): {
2357
- struct ib_atomic_eth *ateth;
3126
+ struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
3127
+ u64 vaddr = get_ib_ateth_vaddr(ateth);
3128
+ bool opfn = opcode == OP(COMPARE_SWAP) &&
3129
+ vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
23583130 struct rvt_ack_entry *e;
2359
- u64 vaddr;
23603131 atomic64_t *maddr;
23613132 u64 sdata;
23623133 u32 rkey;
23633134 u8 next;
23643135
2365
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
3136
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
3137
+ !opfn))
23663138 goto nack_inv;
23673139 next = qp->r_head_ack_queue + 1;
2368
- if (next > HFI1_MAX_RDMA_ATOMIC)
3140
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
23693141 next = 0;
23703142 spin_lock_irqsave(&qp->s_lock, flags);
2371
- if (unlikely(next == qp->s_tail_ack_queue)) {
3143
+ if (unlikely(next == qp->s_acked_ack_queue)) {
23723144 if (!qp->s_ack_queue[next].sent)
23733145 goto nack_inv_unlck;
23743146 update_ack_queue(qp, next);
23753147 }
23763148 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2377
- if (e->rdma_sge.mr) {
2378
- rvt_put_mr(e->rdma_sge.mr);
2379
- e->rdma_sge.mr = NULL;
3149
+ release_rdma_sge_mr(e);
3150
+ /* Process OPFN special virtual address */
3151
+ if (opfn) {
3152
+ opfn_conn_response(qp, e, ateth);
3153
+ goto ack;
23803154 }
2381
- ateth = &ohdr->u.atomic_eth;
2382
- vaddr = get_ib_ateth_vaddr(ateth);
23833155 if (unlikely(vaddr & (sizeof(u64) - 1)))
23843156 goto nack_inv_unlck;
23853157 rkey = be32_to_cpu(ateth->rkey);
....@@ -2398,6 +3170,7 @@
23983170 sdata);
23993171 rvt_put_mr(qp->r_sge.sge.mr);
24003172 qp->r_sge.num_sge = 0;
3173
+ack:
24013174 e->opcode = opcode;
24023175 e->sent = 0;
24033176 e->psn = psn;
....@@ -2407,6 +3180,7 @@
24073180 qp->r_state = opcode;
24083181 qp->r_nak_state = 0;
24093182 qp->r_head_ack_queue = next;
3183
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
24103184
24113185 /* Schedule the send engine. */
24123186 qp->s_flags |= RVT_S_RESP_PENDING;
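The atomic path above treats an incoming COMPARE_SWAP whose virtual address equals HFI1_VERBS_E_ATOMIC_VADDR as an OPFN exchange, skips the rkey/MR work, and answers it through opfn_conn_response(). A sketch of just that classification step; OPFN_VADDR is a placeholder value (the real constant lives in the hfi1 headers and is not reproduced here) and is_opfn_request() is an illustrative name.

#include <stdbool.h>
#include <stdint.h>

/* Placeholder for HFI1_VERBS_E_ATOMIC_VADDR; illustrative value only. */
static const uint64_t OPFN_VADDR = 0;

enum atomic_op { CMP_SWAP, FETCH_ADD };

/*
 * An incoming atomic is an OPFN exchange iff it is a COMPARE_SWAP aimed
 * at the special OPFN virtual address; such a request never touches an MR.
 */
static bool is_opfn_request(enum atomic_op op, uint64_t vaddr)
{
	return op == CMP_SWAP && vaddr == OPFN_VADDR;
}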