forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/drivers/infiniband/sw/rdmavt/qp.c
....@@ -1,5 +1,5 @@
11 /*
2
- * Copyright(c) 2016, 2017 Intel Corporation.
2
+ * Copyright(c) 2016 - 2020 Intel Corporation.
33 *
44 * This file is provided under a dual BSD/GPLv2 license. When using or
55 * redistributing this file, you may do so under either license.
....@@ -53,9 +53,12 @@
5353 #include <rdma/ib_verbs.h>
5454 #include <rdma/ib_hdrs.h>
5555 #include <rdma/opa_addr.h>
56
+#include <rdma/uverbs_ioctl.h>
5657 #include "qp.h"
5758 #include "vt.h"
5859 #include "trace.h"
60
+
61
+#define RVT_RWQ_COUNT_THRESHOLD 16
5962
6063 static void rvt_rc_timeout(struct timer_list *t);
6164 static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
....@@ -119,6 +122,187 @@
119122 RVT_POST_SEND_OK | RVT_FLUSH_SEND,
120123 };
121124 EXPORT_SYMBOL(ib_rvt_state_ops);
125
+
126
+/* platform specific: return the last level cache (llc) size, in KiB */
127
+static int rvt_wss_llc_size(void)
128
+{
129
+ /* assume that the boot CPU value is universal for all CPUs */
130
+ return boot_cpu_data.x86_cache_size;
131
+}
132
+
133
+/* platform specific: cacheless copy */
134
+static void cacheless_memcpy(void *dst, void *src, size_t n)
135
+{
136
+ /*
137
+ * Use the only available X64 cacheless copy. Add a __user cast
138
+ * to quiet sparse. The src argument is already in the kernel so
139
+ * there are no security issues. The extra fault recovery machinery
140
+ * is not invoked.
141
+ */
142
+ __copy_user_nocache(dst, (void __user *)src, n, 0);
143
+}
144
+
145
+void rvt_wss_exit(struct rvt_dev_info *rdi)
146
+{
147
+ struct rvt_wss *wss = rdi->wss;
148
+
149
+ if (!wss)
150
+ return;
151
+
152
+ /* coded to handle partially initialized and repeat callers */
153
+ kfree(wss->entries);
154
+ wss->entries = NULL;
155
+ kfree(rdi->wss);
156
+ rdi->wss = NULL;
157
+}
158
+
159
+/**
160
+ * rvt_wss_init - Init wss data structures
161
+ *
162
+ * Return: 0 on success
163
+ */
164
+int rvt_wss_init(struct rvt_dev_info *rdi)
165
+{
166
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
167
+ unsigned int wss_threshold = rdi->dparms.wss_threshold;
168
+ unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
169
+ long llc_size;
170
+ long llc_bits;
171
+ long table_size;
172
+ long table_bits;
173
+ struct rvt_wss *wss;
174
+ int node = rdi->dparms.node;
175
+
176
+ if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
177
+ rdi->wss = NULL;
178
+ return 0;
179
+ }
180
+
181
+ rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
182
+ if (!rdi->wss)
183
+ return -ENOMEM;
184
+ wss = rdi->wss;
185
+
186
+ /* check for a valid percent range - default to 80 if none or invalid */
187
+ if (wss_threshold < 1 || wss_threshold > 100)
188
+ wss_threshold = 80;
189
+
190
+ /* reject a wildly large period */
191
+ if (wss_clean_period > 1000000)
192
+ wss_clean_period = 256;
193
+
194
+ /* reject a zero period */
195
+ if (wss_clean_period == 0)
196
+ wss_clean_period = 1;
197
+
198
+ /*
199
+ * Calculate the table size - the next power of 2 larger than the
200
+ * LLC size. LLC size is in KiB.
201
+ */
202
+ llc_size = rvt_wss_llc_size() * 1024;
203
+ table_size = roundup_pow_of_two(llc_size);
204
+
205
+ /* one bit per page in rounded up table */
206
+ llc_bits = llc_size / PAGE_SIZE;
207
+ table_bits = table_size / PAGE_SIZE;
208
+ wss->pages_mask = table_bits - 1;
209
+ wss->num_entries = table_bits / BITS_PER_LONG;
210
+
211
+ wss->threshold = (llc_bits * wss_threshold) / 100;
212
+ if (wss->threshold == 0)
213
+ wss->threshold = 1;
214
+
215
+ wss->clean_period = wss_clean_period;
216
+ atomic_set(&wss->clean_counter, wss_clean_period);
217
+
218
+ wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
219
+ GFP_KERNEL, node);
220
+ if (!wss->entries) {
221
+ rvt_wss_exit(rdi);
222
+ return -ENOMEM;
223
+ }
224
+
225
+ return 0;
226
+}
227
+
228
+/*
229
+ * Advance the clean counter. When the clean period has expired,
230
+ * clean an entry.
231
+ *
232
+ * This is implemented in atomics to avoid locking. Because multiple
233
+ * variables are involved, it can be racy which can lead to slightly
234
+ * inaccurate information. Since this is only a heuristic, this is
235
+ * OK. Any inaccuracies will clean themselves out as the counter
236
+ * advances. That said, it is unlikely the entry clean operation will
237
+ * race - the next possible racer will not start until the next clean
238
+ * period.
239
+ *
240
+ * The clean counter is implemented as a decrement to zero. When zero
241
+ * is reached an entry is cleaned.
242
+ */
243
+static void wss_advance_clean_counter(struct rvt_wss *wss)
244
+{
245
+ int entry;
246
+ int weight;
247
+ unsigned long bits;
248
+
249
+ /* become the cleaner if we decrement the counter to zero */
250
+ if (atomic_dec_and_test(&wss->clean_counter)) {
251
+ /*
252
+ * Set, not add, the clean period. This avoids an issue
253
+ * where the counter could decrement below the clean period.
254
+ * Doing a set can result in lost decrements, slowing the
255
+ * clean advance. Since this is a heuristic, this possible
256
+ * slowdown is OK.
257
+ *
258
+ * An alternative is to loop, advancing the counter by a
259
+ * clean period until the result is > 0. However, this could
260
+ * lead to several threads keeping another in the clean loop.
261
+ * This could be mitigated by limiting the number of times
262
+ * we stay in the loop.
263
+ */
264
+ atomic_set(&wss->clean_counter, wss->clean_period);
265
+
266
+ /*
267
+ * Uniquely grab the entry to clean and move to next.
268
+ * The current entry is always the lower bits of
269
+ * wss.clean_entry. The table size, wss.num_entries,
270
+ * is always a power-of-2.
271
+ */
272
+ entry = (atomic_inc_return(&wss->clean_entry) - 1)
273
+ & (wss->num_entries - 1);
274
+
275
+ /* clear the entry and count the bits */
276
+ bits = xchg(&wss->entries[entry], 0);
277
+ weight = hweight64((u64)bits);
278
+ /* only adjust the contended total count if needed */
279
+ if (weight)
280
+ atomic_sub(weight, &wss->total_count);
281
+ }
282
+}
283
+
284
+/*
285
+ * Insert the given address into the working set array.
286
+ */
287
+static void wss_insert(struct rvt_wss *wss, void *address)
288
+{
289
+ u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
290
+ u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
291
+ u32 nr = page & (BITS_PER_LONG - 1);
292
+
293
+ if (!test_and_set_bit(nr, &wss->entries[entry]))
294
+ atomic_inc(&wss->total_count);
295
+
296
+ wss_advance_clean_counter(wss);
297
+}
298
+
299
+/*
300
+ * Is the working set larger than the threshold?
301
+ */
302
+static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
303
+{
304
+ return atomic_read(&wss->total_count) >= wss->threshold;
305
+}
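
Reviewer aside on the working-set (wss) code above: the table is one bit per LLC page, rounded up to a power of two, and a copy is treated as "hot" once the set bits reach wss_threshold percent of the LLC. Below is a minimal user-space sketch of that arithmetic, with invented wss_demo_* names, plain integers in place of atomic_t/xchg(), and __builtin_popcountll() standing in for hweight64(); it illustrates only the bookkeeping, not the concurrency.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT  12
#define DEMO_TABLE_PAGES 1024                /* power of two, one bit per page */
#define DEMO_WORDS       (DEMO_TABLE_PAGES / 64)

struct wss_demo {
    uint64_t entries[DEMO_WORDS];
    unsigned long pages_mask;                /* DEMO_TABLE_PAGES - 1 */
    int total_count;                         /* bits currently set */
    int threshold;                           /* e.g. 80% of the LLC in pages */
    int clean_period, clean_counter, clean_entry;
};

/* mirrors wss_advance_clean_counter(): every clean_period insertions,
 * wipe one 64-bit word of the table and forget those pages */
static void wss_demo_clean(struct wss_demo *w)
{
    if (--w->clean_counter > 0)
        return;
    w->clean_counter = w->clean_period;
    int e = w->clean_entry++ & (DEMO_WORDS - 1);
    w->total_count -= __builtin_popcountll(w->entries[e]);
    w->entries[e] = 0;
}

/* mirrors wss_insert(): map the address to a page bit and set it */
static void wss_demo_insert(struct wss_demo *w, const void *addr)
{
    unsigned long page = ((unsigned long)addr >> DEMO_PAGE_SHIFT) & w->pages_mask;
    uint64_t bit = 1ULL << (page & 63);

    if (!(w->entries[page / 64] & bit)) {
        w->entries[page / 64] |= bit;
        w->total_count++;
    }
    wss_demo_clean(w);
}

/* mirrors wss_exceeds_threshold(): working set no longer fits, go cacheless */
static bool wss_demo_hot(const struct wss_demo *w)
{
    return w->total_count >= w->threshold;
}

int main(void)
{
    static char buf[1 << 22];                /* 4 MiB of pretend payload */
    struct wss_demo w = {
        .pages_mask = DEMO_TABLE_PAGES - 1,
        .threshold = 256, .clean_period = 128, .clean_counter = 128,
    };

    for (size_t off = 0; off < sizeof(buf); off += 4096)
        wss_demo_insert(&w, buf + off);
    printf("tracked pages=%d exceeds threshold=%d\n",
           w.total_count, wss_demo_hot(&w));
    return 0;
}

With RVT_SGE_COPY_ADAPTIVE, rvt_copy_sge() below feeds destination pages into this bitmap and switches to the cacheless copy once the threshold is exceeded.
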
122306
123307 static void get_map_page(struct rvt_qpn_table *qpt,
124308 struct rvt_qpn_map *map)
....@@ -341,15 +525,18 @@
341525 * @rdi: rvt device info structure
342526 * @qpt: queue pair number table pointer
343527 * @port_num: IB port number, 1 based, comes from core
528
+ * @exclude_prefix: prefix of special queue pair number being allocated
344529 *
345530 * Return: The queue pair number
346531 */
347532 static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
348
- enum ib_qp_type type, u8 port_num)
533
+ enum ib_qp_type type, u8 port_num, u8 exclude_prefix)
349534 {
350535 u32 i, offset, max_scan, qpn;
351536 struct rvt_qpn_map *map;
352537 u32 ret;
538
+ u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ?
539
+ RVT_AIP_QPN_MAX : RVT_QPN_MAX;
353540
354541 if (rdi->driver_f.alloc_qpn)
355542 return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);
....@@ -369,7 +556,7 @@
369556 }
370557
371558 qpn = qpt->last + qpt->incr;
372
- if (qpn >= RVT_QPN_MAX)
559
+ if (qpn >= max_qpn)
373560 qpn = qpt->incr | ((qpt->last & 1) ^ 1);
374561 /* offset carries bit 0 */
375562 offset = qpn & RVT_BITS_PER_PAGE_MASK;
....@@ -445,13 +632,7 @@
445632 while (qp->s_last != qp->s_head) {
446633 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
447634
448
- rvt_put_swqe(wqe);
449
-
450
- if (qp->ibqp.qp_type == IB_QPT_UD ||
451
- qp->ibqp.qp_type == IB_QPT_SMI ||
452
- qp->ibqp.qp_type == IB_QPT_GSI)
453
- atomic_dec(&ibah_to_rvtah(
454
- wqe->ud_wr.ah)->refcount);
635
+ rvt_put_qp_swqe(qp, wqe);
455636 if (++qp->s_last >= qp->s_size)
456637 qp->s_last = 0;
457638 smp_wmb(); /* see qp_set_savail */
....@@ -630,6 +811,47 @@
630811 }
631812
632813 /**
814
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
815
+ * @rq: receive queue data structure
816
+ * @size: number of request queue entries
817
+ * @node: The NUMA node
818
+ * @udata: true if user data is available, false otherwise
819
+ *
820
+ * Return: If memory allocation fails, return -ENOMEM
821
+ * This function is used by both shared receive
822
+ * queues and non-shared receive queues to allocate
823
+ * memory.
824
+ */
825
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
826
+ struct ib_udata *udata)
827
+{
828
+ if (udata) {
829
+ rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
830
+ if (!rq->wq)
831
+ goto bail;
832
+ /* need kwq with no buffers */
833
+ rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
834
+ if (!rq->kwq)
835
+ goto bail;
836
+ rq->kwq->curr_wq = rq->wq->wq;
837
+ } else {
838
+ /* need kwq with buffers */
839
+ rq->kwq =
840
+ vzalloc_node(sizeof(struct rvt_krwq) + size, node);
841
+ if (!rq->kwq)
842
+ goto bail;
843
+ rq->kwq->curr_wq = rq->kwq->wq;
844
+ }
845
+
846
+ spin_lock_init(&rq->kwq->p_lock);
847
+ spin_lock_init(&rq->kwq->c_lock);
848
+ return 0;
849
+bail:
850
+ rvt_free_rq(rq);
851
+ return -ENOMEM;
852
+}
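
Aside on rvt_alloc_rq(): the point of the split is that the kernel-private rvt_krwq header always exists, and only the ring storage moves between the user-mmapable rvt_rwq and the kwq's own trailing storage. A rough user-space analogue of that allocation pattern, with invented demo_* names and calloc() standing in for vmalloc_user()/kzalloc_node()/vzalloc_node() (error unwinding is omitted; the real code goes through rvt_free_rq()):

#include <stdlib.h>

struct demo_krwq {                  /* kernel-private ring header */
    unsigned int head, tail, count;
    char *curr_wq;                  /* whichever ring storage is in use */
    char wq[];                      /* inline storage for the kernel-only case */
};

struct demo_rq {
    char *user_wq;                  /* stands in for the mmap-able rvt_rwq */
    struct demo_krwq *kwq;
};

static int demo_alloc_rq(struct demo_rq *rq, size_t size, int user)
{
    if (user) {
        /* user-visible ring plus a header-only kwq, as in the udata case */
        rq->user_wq = calloc(1, size);
        rq->kwq = calloc(1, sizeof(*rq->kwq));
        if (!rq->user_wq || !rq->kwq)
            return -1;
        rq->kwq->curr_wq = rq->user_wq;
    } else {
        /* one allocation carrying both the header and the ring */
        rq->kwq = calloc(1, sizeof(*rq->kwq) + size);
        if (!rq->kwq)
            return -1;
        rq->kwq->curr_wq = rq->kwq->wq;
    }
    return 0;
}

int main(void)
{
    struct demo_rq rq = { 0 };
    int err = demo_alloc_rq(&rq, 4096, 1);

    free(rq.user_wq);
    free(rq.kwq);
    return err ? 1 : 0;
}
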
853
+
854
+/**
633855 * rvt_init_qp - initialize the QP state to the reset state
634856 * @qp: the QP to init or reinit
635857 * @type: the QP type
....@@ -677,11 +899,8 @@
677899 qp->s_mig_state = IB_MIG_MIGRATED;
678900 qp->r_head_ack_queue = 0;
679901 qp->s_tail_ack_queue = 0;
902
+ qp->s_acked_ack_queue = 0;
680903 qp->s_num_rd_atomic = 0;
681
- if (qp->r_rq.wq) {
682
- qp->r_rq.wq->head = 0;
683
- qp->r_rq.wq->tail = 0;
684
- }
685904 qp->r_sge.num_sge = 0;
686905 atomic_set(&qp->s_reserved_used, 0);
687906 }
....@@ -769,9 +988,67 @@
769988 {
770989 struct rvt_qpn_map *map;
771990
991
+ if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE)
992
+ qpn &= RVT_AIP_QP_SUFFIX;
993
+
772994 map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
773995 if (map->page)
774996 clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
997
+}
998
+
999
+/**
1000
+ * get_allowed_ops - Given a QP type return the appropriate allowed OP
1001
+ * @type: valid, supported, QP type
1002
+ */
1003
+static u8 get_allowed_ops(enum ib_qp_type type)
1004
+{
1005
+ return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
1006
+ IB_OPCODE_UC : IB_OPCODE_UD;
1007
+}
1008
+
1009
+/**
1010
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
1011
+ * @qp: Valid QP with allowed_ops set
1012
+ *
1013
+ * The rvt_swqe data structure being used is a union, so this is
1014
+ * only valid for UD QPs.
1015
+ */
1016
+static void free_ud_wq_attr(struct rvt_qp *qp)
1017
+{
1018
+ struct rvt_swqe *wqe;
1019
+ int i;
1020
+
1021
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
1022
+ wqe = rvt_get_swqe_ptr(qp, i);
1023
+ kfree(wqe->ud_wr.attr);
1024
+ wqe->ud_wr.attr = NULL;
1025
+ }
1026
+}
1027
+
1028
+/**
1029
+ * alloc_ud_wq_attr - Allocate AH attribute cache for UD QPs
1030
+ * @qp: Valid QP with allowed_ops set
1031
+ * @node: Numa node for allocation
1032
+ *
1033
+ * The rvt_swqe data structure being used is a union, so this is
1034
+ * only valid for UD QPs.
1035
+ */
1036
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
1037
+{
1038
+ struct rvt_swqe *wqe;
1039
+ int i;
1040
+
1041
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
1042
+ wqe = rvt_get_swqe_ptr(qp, i);
1043
+ wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
1044
+ GFP_KERNEL, node);
1045
+ if (!wqe->ud_wr.attr) {
1046
+ free_ud_wq_attr(qp);
1047
+ return -ENOMEM;
1048
+ }
1049
+ }
1050
+
1051
+ return 0;
7751052 }
7761053
7771054 /**
....@@ -801,13 +1078,15 @@
8011078 struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
8021079 void *priv = NULL;
8031080 size_t sqsize;
1081
+ u8 exclude_prefix = 0;
8041082
8051083 if (!rdi)
8061084 return ERR_PTR(-EINVAL);
8071085
8081086 if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge ||
8091087 init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
810
- init_attr->create_flags)
1088
+ (init_attr->create_flags &&
1089
+ init_attr->create_flags != IB_QP_CREATE_NETDEV_USE))
8111090 return ERR_PTR(-EINVAL);
8121091
8131092 /* Check receive queue parameters if no SRQ is specified. */
....@@ -832,13 +1111,11 @@
8321111 if (init_attr->port_num == 0 ||
8331112 init_attr->port_num > ibpd->device->phys_port_cnt)
8341113 return ERR_PTR(-EINVAL);
835
- /* fall through */
1114
+ fallthrough;
8361115 case IB_QPT_UC:
8371116 case IB_QPT_RC:
8381117 case IB_QPT_UD:
839
- sz = sizeof(struct rvt_sge) *
840
- init_attr->cap.max_send_sge +
841
- sizeof(struct rvt_swqe);
1118
+ sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
8421119 swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
8431120 if (!swq)
8441121 return ERR_PTR(-ENOMEM);
....@@ -858,6 +1135,7 @@
8581135 rdi->dparms.node);
8591136 if (!qp)
8601137 goto bail_swq;
1138
+ qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
8611139
8621140 RCU_INIT_POINTER(qp->next, NULL);
8631141 if (init_attr->qp_type == IB_QPT_RC) {
....@@ -895,17 +1173,12 @@
8951173 qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
8961174 sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
8971175 sizeof(struct rvt_rwqe);
898
- if (udata)
899
- qp->r_rq.wq = vmalloc_user(
900
- sizeof(struct rvt_rwq) +
901
- qp->r_rq.size * sz);
902
- else
903
- qp->r_rq.wq = vzalloc_node(
904
- sizeof(struct rvt_rwq) +
905
- qp->r_rq.size * sz,
906
- rdi->dparms.node);
907
- if (!qp->r_rq.wq)
1176
+ err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
1177
+ rdi->dparms.node, udata);
1178
+ if (err) {
1179
+ ret = ERR_PTR(err);
9081180 goto bail_driver_priv;
1181
+ }
9091182 }
9101183
9111184 /*
....@@ -915,7 +1188,6 @@
9151188 spin_lock_init(&qp->r_lock);
9161189 spin_lock_init(&qp->s_hlock);
9171190 spin_lock_init(&qp->s_lock);
918
- spin_lock_init(&qp->r_rq.lock);
9191191 atomic_set(&qp->refcount, 0);
9201192 atomic_set(&qp->local_ops_pending, 0);
9211193 init_waitqueue_head(&qp->wait);
....@@ -927,22 +1199,40 @@
9271199 qp->s_max_sge = init_attr->cap.max_send_sge;
9281200 if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
9291201 qp->s_flags = RVT_S_SIGNAL_REQ_WR;
1202
+ err = alloc_ud_wq_attr(qp, rdi->dparms.node);
1203
+ if (err) {
1204
+ ret = ERR_PTR(err);
1205
+ goto bail_rq_rvt;
1206
+ }
1207
+
1208
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
1209
+ exclude_prefix = RVT_AIP_QP_PREFIX;
9301210
9311211 err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
9321212 init_attr->qp_type,
933
- init_attr->port_num);
1213
+ init_attr->port_num,
1214
+ exclude_prefix);
9341215 if (err < 0) {
9351216 ret = ERR_PTR(err);
9361217 goto bail_rq_wq;
9371218 }
9381219 qp->ibqp.qp_num = err;
1220
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
1221
+ qp->ibqp.qp_num |= RVT_AIP_QP_BASE;
9391222 qp->port_num = init_attr->port_num;
9401223 rvt_init_qp(rdi, qp, init_attr->qp_type);
1224
+ if (rdi->driver_f.qp_priv_init) {
1225
+ err = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
1226
+ if (err) {
1227
+ ret = ERR_PTR(err);
1228
+ goto bail_rq_wq;
1229
+ }
1230
+ }
9411231 break;
9421232
9431233 default:
9441234 /* Don't support raw QPs */
945
- return ERR_PTR(-EINVAL);
1235
+ return ERR_PTR(-EOPNOTSUPP);
9461236 }
9471237
9481238 init_attr->cap.max_inline_data = 0;
....@@ -964,11 +1254,10 @@
9641254 } else {
9651255 u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
9661256
967
- qp->ip = rvt_create_mmap_info(rdi, s,
968
- ibpd->uobject->context,
1257
+ qp->ip = rvt_create_mmap_info(rdi, s, udata,
9691258 qp->r_rq.wq);
970
- if (!qp->ip) {
971
- ret = ERR_PTR(-ENOMEM);
1259
+ if (IS_ERR(qp->ip)) {
1260
+ ret = ERR_CAST(qp->ip);
9721261 goto bail_qpn;
9731262 }
9741263
....@@ -1013,28 +1302,6 @@
10131302
10141303 ret = &qp->ibqp;
10151304
1016
- /*
1017
- * We have our QP and its good, now keep track of what types of opcodes
1018
- * can be processed on this QP. We do this by keeping track of what the
1019
- * 3 high order bits of the opcode are.
1020
- */
1021
- switch (init_attr->qp_type) {
1022
- case IB_QPT_SMI:
1023
- case IB_QPT_GSI:
1024
- case IB_QPT_UD:
1025
- qp->allowed_ops = IB_OPCODE_UD;
1026
- break;
1027
- case IB_QPT_RC:
1028
- qp->allowed_ops = IB_OPCODE_RC;
1029
- break;
1030
- case IB_QPT_UC:
1031
- qp->allowed_ops = IB_OPCODE_UC;
1032
- break;
1033
- default:
1034
- ret = ERR_PTR(-EINVAL);
1035
- goto bail_ip;
1036
- }
1037
-
10381305 return ret;
10391306
10401307 bail_ip:
....@@ -1045,8 +1312,10 @@
10451312 rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
10461313
10471314 bail_rq_wq:
1048
- if (!qp->ip)
1049
- vfree(qp->r_rq.wq);
1315
+ free_ud_wq_attr(qp);
1316
+
1317
+bail_rq_rvt:
1318
+ rvt_free_rq(&qp->r_rq);
10501319
10511320 bail_driver_priv:
10521321 rdi->driver_f.qp_priv_free(rdi, qp);
....@@ -1112,19 +1381,26 @@
11121381 }
11131382 wc.status = IB_WC_WR_FLUSH_ERR;
11141383
1115
- if (qp->r_rq.wq) {
1116
- struct rvt_rwq *wq;
1384
+ if (qp->r_rq.kwq) {
11171385 u32 head;
11181386 u32 tail;
1387
+ struct rvt_rwq *wq = NULL;
1388
+ struct rvt_krwq *kwq = NULL;
11191389
1120
- spin_lock(&qp->r_rq.lock);
1121
-
1390
+ spin_lock(&qp->r_rq.kwq->c_lock);
1391
+ /* qp->ip used to validate if there is a user buffer mmaped */
1392
+ if (qp->ip) {
1393
+ wq = qp->r_rq.wq;
1394
+ head = RDMA_READ_UAPI_ATOMIC(wq->head);
1395
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
1396
+ } else {
1397
+ kwq = qp->r_rq.kwq;
1398
+ head = kwq->head;
1399
+ tail = kwq->tail;
1400
+ }
11221401 /* sanity check pointers before trusting them */
1123
- wq = qp->r_rq.wq;
1124
- head = wq->head;
11251402 if (head >= qp->r_rq.size)
11261403 head = 0;
1127
- tail = wq->tail;
11281404 if (tail >= qp->r_rq.size)
11291405 tail = 0;
11301406 while (tail != head) {
....@@ -1133,9 +1409,11 @@
11331409 tail = 0;
11341410 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
11351411 }
1136
- wq->tail = tail;
1137
-
1138
- spin_unlock(&qp->r_rq.lock);
1412
+ if (qp->ip)
1413
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
1414
+ else
1415
+ kwq->tail = tail;
1416
+ spin_unlock(&qp->r_rq.kwq->c_lock);
11391417 } else if (qp->ibqp.event_handler) {
11401418 ret = 1;
11411419 }
....@@ -1189,10 +1467,7 @@
11891467 int lastwqe = 0;
11901468 int mig = 0;
11911469 int pmtu = 0; /* for gcc warning only */
1192
- enum rdma_link_layer link;
11931470 int opa_ah;
1194
-
1195
- link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
11961471
11971472 spin_lock_irq(&qp->r_lock);
11981473 spin_lock(&qp->s_hlock);
....@@ -1204,7 +1479,7 @@
12041479 opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
12051480
12061481 if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1207
- attr_mask, link))
1482
+ attr_mask))
12081483 goto inval;
12091484
12101485 if (rdi->driver_f.check_modify_qp &&
....@@ -1453,7 +1728,7 @@
14531728 *
14541729 * Return: 0 on success.
14551730 */
1456
-int rvt_destroy_qp(struct ib_qp *ibqp)
1731
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
14571732 {
14581733 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
14591734 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
....@@ -1474,13 +1749,13 @@
14741749
14751750 if (qp->ip)
14761751 kref_put(&qp->ip->ref, rvt_release_mmap_info);
1477
- else
1478
- vfree(qp->r_rq.wq);
1479
- vfree(qp->s_wq);
1752
+ kvfree(qp->r_rq.kwq);
14801753 rdi->driver_f.qp_priv_free(rdi, qp);
14811754 kfree(qp->s_ack_queue);
14821755 rdma_destroy_ah_attr(&qp->remote_ah_attr);
14831756 rdma_destroy_ah_attr(&qp->alt_ah_attr);
1757
+ free_ud_wq_attr(qp);
1758
+ vfree(qp->s_wq);
14841759 kfree(qp);
14851760 return 0;
14861761 }
....@@ -1561,7 +1836,7 @@
15611836 const struct ib_recv_wr **bad_wr)
15621837 {
15631838 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1564
- struct rvt_rwq *wq = qp->r_rq.wq;
1839
+ struct rvt_krwq *wq = qp->r_rq.kwq;
15651840 unsigned long flags;
15661841 int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
15671842 !qp->ibqp.srq;
....@@ -1582,12 +1857,12 @@
15821857 return -EINVAL;
15831858 }
15841859
1585
- spin_lock_irqsave(&qp->r_rq.lock, flags);
1860
+ spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
15861861 next = wq->head + 1;
15871862 if (next >= qp->r_rq.size)
15881863 next = 0;
1589
- if (next == wq->tail) {
1590
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1864
+ if (next == READ_ONCE(wq->tail)) {
1865
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
15911866 *bad_wr = wr;
15921867 return -ENOMEM;
15931868 }
....@@ -1604,16 +1879,18 @@
16041879 wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
16051880 wqe->wr_id = wr->wr_id;
16061881 wqe->num_sge = wr->num_sge;
1607
- for (i = 0; i < wr->num_sge; i++)
1608
- wqe->sg_list[i] = wr->sg_list[i];
1882
+ for (i = 0; i < wr->num_sge; i++) {
1883
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
1884
+ wqe->sg_list[i].length = wr->sg_list[i].length;
1885
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
1886
+ }
16091887 /*
16101888 * Make sure queue entry is written
16111889 * before the head index.
16121890 */
1613
- smp_wmb();
1614
- wq->head = next;
1891
+ smp_store_release(&wq->head, next);
16151892 }
1616
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1893
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
16171894 }
16181895 return 0;
16191896 }
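
Aside on the new head/tail handling in rvt_post_recv() above: the producer publishes a completed entry with smp_store_release(&wq->head, next) and only reads the consumer-owned tail (READ_ONCE) to detect a full ring, while the consumer side in rvt_get_rwqe() does the mirror image under c_lock. A compact user-space sketch of that single-producer/single-consumer discipline, using C11 atomics in place of the kernel barriers (demo_* names are invented):

#include <stdatomic.h>
#include <stdbool.h>

#define RING_SIZE 8                 /* "full" leaves one slot unused */

struct demo_ring {
    _Atomic unsigned int head;      /* written only by the producer */
    _Atomic unsigned int tail;      /* written only by the consumer */
    int slot[RING_SIZE];
};

/* producer (think rvt_post_recv): returns false when the ring is full */
static bool demo_post(struct demo_ring *r, int val)
{
    unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
    unsigned int next = head + 1 == RING_SIZE ? 0 : head + 1;

    if (next == atomic_load_explicit(&r->tail, memory_order_acquire))
        return false;                           /* next == tail -> full */
    r->slot[head] = val;                        /* write the entry ... */
    atomic_store_explicit(&r->head, next,       /* ... then publish head */
                          memory_order_release);
    return true;
}

/* consumer (think rvt_get_rwqe): returns false when the ring is empty */
static bool demo_poll(struct demo_ring *r, int *val)
{
    unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

    if (tail == atomic_load_explicit(&r->head, memory_order_acquire))
        return false;                           /* head == tail -> empty */
    *val = r->slot[tail];
    atomic_store_explicit(&r->tail,
                          tail + 1 == RING_SIZE ? 0 : tail + 1,
                          memory_order_release);
    return true;
}

int main(void)
{
    struct demo_ring r = { 0 };
    int v = 0;

    demo_post(&r, 42);
    return demo_poll(&r, &v) && v == 42 ? 0 : 1;
}

The "next == tail means full" convention leaves one slot unused, matching the checks in rvt_post_recv() and rvt_post_srq_recv().
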
....@@ -1694,10 +1971,9 @@
16941971
16951972 /* see rvt_qp_wqe_unreserve() */
16961973 smp_mb__before_atomic();
1697
- reserved_used = atomic_read(&qp->s_reserved_used);
16981974 if (unlikely(reserved_op)) {
16991975 /* see rvt_qp_wqe_unreserve() */
1700
- smp_mb__before_atomic();
1976
+ reserved_used = atomic_read(&qp->s_reserved_used);
17011977 if (reserved_used >= rdi->dparms.reserved_operations)
17021978 return -ENOMEM;
17031979 return 0;
....@@ -1705,14 +1981,13 @@
17051981 /* non-reserved operations */
17061982 if (likely(qp->s_avail))
17071983 return 0;
1708
- slast = READ_ONCE(qp->s_last);
1984
+ /* See rvt_qp_complete_swqe() */
1985
+ slast = smp_load_acquire(&qp->s_last);
17091986 if (qp->s_head >= slast)
17101987 avail = qp->s_size - (qp->s_head - slast);
17111988 else
17121989 avail = slast - qp->s_head;
17131990
1714
- /* see rvt_qp_wqe_unreserve() */
1715
- smp_mb__before_atomic();
17161991 reserved_used = atomic_read(&qp->s_reserved_used);
17171992 avail = avail - 1 -
17181993 (rdi->dparms.reserved_operations - reserved_used);
....@@ -1737,7 +2012,7 @@
17372012 */
17382013 static int rvt_post_one_wr(struct rvt_qp *qp,
17392014 const struct ib_send_wr *wr,
1740
- int *call_send)
2015
+ bool *call_send)
17412016 {
17422017 struct rvt_swqe *wqe;
17432018 u32 next;
....@@ -1842,22 +2117,17 @@
18422117 wqe->wr.num_sge = j;
18432118 }
18442119
1845
- /* general part of wqe valid - allow for driver checks */
1846
- if (rdi->driver_f.check_send_wqe) {
1847
- ret = rdi->driver_f.check_send_wqe(qp, wqe);
1848
- if (ret < 0)
1849
- goto bail_inval_free;
1850
- if (ret)
1851
- *call_send = ret;
1852
- }
1853
-
2120
+ /*
2121
+ * Calculate and set SWQE PSN values prior to handing it off
2122
+ * to the driver's check routine. This gives the driver the
2123
+ * opportunity to adjust PSN values based on internal checks.
2124
+ */
18542125 log_pmtu = qp->log_pmtu;
1855
- if (qp->ibqp.qp_type != IB_QPT_UC &&
1856
- qp->ibqp.qp_type != IB_QPT_RC) {
1857
- struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
2126
+ if (qp->allowed_ops == IB_OPCODE_UD) {
2127
+ struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
18582128
18592129 log_pmtu = ah->log_pmtu;
1860
- atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
2130
+ rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
18612131 }
18622132
18632133 if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
....@@ -1875,8 +2145,18 @@
18752145 (wqe->length ?
18762146 ((wqe->length - 1) >> log_pmtu) :
18772147 0);
1878
- qp->s_next_psn = wqe->lpsn + 1;
18792148 }
2149
+
2150
+ /* general part of wqe valid - allow for driver checks */
2151
+ if (rdi->driver_f.setup_wqe) {
2152
+ ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
2153
+ if (ret < 0)
2154
+ goto bail_inval_free_ref;
2155
+ }
2156
+
2157
+ if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
2158
+ qp->s_next_psn = wqe->lpsn + 1;
2159
+
18802160 if (unlikely(reserved_op)) {
18812161 wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
18822162 rvt_qp_wqe_reserve(qp, wqe);
....@@ -1890,6 +2170,9 @@
18902170
18912171 return 0;
18922172
2173
+bail_inval_free_ref:
2174
+ if (qp->allowed_ops == IB_OPCODE_UD)
2175
+ rdma_destroy_ah_attr(wqe->ud_wr.attr);
18932176 bail_inval_free:
18942177 /* release mr holds */
18952178 while (j) {
....@@ -1916,7 +2199,7 @@
19162199 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
19172200 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
19182201 unsigned long flags = 0;
1919
- int call_send;
2202
+ bool call_send;
19202203 unsigned nreq = 0;
19212204 int err = 0;
19222205
....@@ -1949,7 +2232,11 @@
19492232 bail:
19502233 spin_unlock_irqrestore(&qp->s_hlock, flags);
19512234 if (nreq) {
1952
- if (call_send)
2235
+ /*
2236
+ * Only call do_send if there is exactly one packet, and the
2237
+ * driver said it was ok.
2238
+ */
2239
+ if (nreq == 1 && call_send)
19532240 rdi->driver_f.do_send(qp);
19542241 else
19552242 rdi->driver_f.schedule_send_no_lock(qp);
....@@ -1971,7 +2258,7 @@
19712258 const struct ib_recv_wr **bad_wr)
19722259 {
19732260 struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
1974
- struct rvt_rwq *wq;
2261
+ struct rvt_krwq *wq;
19752262 unsigned long flags;
19762263
19772264 for (; wr; wr = wr->next) {
....@@ -1984,13 +2271,13 @@
19842271 return -EINVAL;
19852272 }
19862273
1987
- spin_lock_irqsave(&srq->rq.lock, flags);
1988
- wq = srq->rq.wq;
2274
+ spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
2275
+ wq = srq->rq.kwq;
19892276 next = wq->head + 1;
19902277 if (next >= srq->rq.size)
19912278 next = 0;
1992
- if (next == wq->tail) {
1993
- spin_unlock_irqrestore(&srq->rq.lock, flags);
2279
+ if (next == READ_ONCE(wq->tail)) {
2280
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
19942281 *bad_wr = wr;
19952282 return -ENOMEM;
19962283 }
....@@ -1998,14 +2285,32 @@
19982285 wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
19992286 wqe->wr_id = wr->wr_id;
20002287 wqe->num_sge = wr->num_sge;
2001
- for (i = 0; i < wr->num_sge; i++)
2002
- wqe->sg_list[i] = wr->sg_list[i];
2288
+ for (i = 0; i < wr->num_sge; i++) {
2289
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
2290
+ wqe->sg_list[i].length = wr->sg_list[i].length;
2291
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
2292
+ }
20032293 /* Make sure queue entry is written before the head index. */
2004
- smp_wmb();
2005
- wq->head = next;
2006
- spin_unlock_irqrestore(&srq->rq.lock, flags);
2294
+ smp_store_release(&wq->head, next);
2295
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
20072296 }
20082297 return 0;
2298
+}
2299
+
2300
+/*
2301
+ * rvt uses the internal kernel struct as part of its ABI; for now, make sure
2302
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
2303
+ * user struct to a kernel struct.
2304
+ */
2305
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
2306
+{
2307
+ BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
2308
+ offsetof(struct rvt_wqe_sge, addr));
2309
+ BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
2310
+ offsetof(struct rvt_wqe_sge, length));
2311
+ BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
2312
+ offsetof(struct rvt_wqe_sge, lkey));
2313
+ return (struct ib_sge *)sge;
20092314 }
20102315
20112316 /*
....@@ -2031,7 +2336,7 @@
20312336 continue;
20322337 /* Check LKEY */
20332338 ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
2034
- NULL, &wqe->sg_list[i],
2339
+ NULL, rvt_cast_sge(&wqe->sg_list[i]),
20352340 IB_ACCESS_LOCAL_WRITE);
20362341 if (unlikely(ret <= 0))
20372342 goto bad_lkey;
....@@ -2060,6 +2365,25 @@
20602365 }
20612366
20622367 /**
2368
+ * get_rvt_head - get the head index of the circular buffer
2369
+ * @rq: data structure for request queue entry
2370
+ * @ip: the mmap info; non-NULL when the ring has a user-mapped buffer
2371
+ *
2372
+ * Return: the head index value
2373
+ */
2374
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
2375
+{
2376
+ u32 head;
2377
+
2378
+ if (ip)
2379
+ head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
2380
+ else
2381
+ head = rq->kwq->head;
2382
+
2383
+ return head;
2384
+}
2385
+
2386
+/**
20632387 * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
20642388 * @qp: the QP
20652389 * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
....@@ -2073,39 +2397,54 @@
20732397 {
20742398 unsigned long flags;
20752399 struct rvt_rq *rq;
2400
+ struct rvt_krwq *kwq = NULL;
20762401 struct rvt_rwq *wq;
20772402 struct rvt_srq *srq;
20782403 struct rvt_rwqe *wqe;
20792404 void (*handler)(struct ib_event *, void *);
20802405 u32 tail;
2406
+ u32 head;
20812407 int ret;
2408
+ void *ip = NULL;
20822409
20832410 if (qp->ibqp.srq) {
20842411 srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
20852412 handler = srq->ibsrq.event_handler;
20862413 rq = &srq->rq;
2414
+ ip = srq->ip;
20872415 } else {
20882416 srq = NULL;
20892417 handler = NULL;
20902418 rq = &qp->r_rq;
2419
+ ip = qp->ip;
20912420 }
20922421
2093
- spin_lock_irqsave(&rq->lock, flags);
2422
+ spin_lock_irqsave(&rq->kwq->c_lock, flags);
20942423 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
20952424 ret = 0;
20962425 goto unlock;
20972426 }
2427
+ kwq = rq->kwq;
2428
+ if (ip) {
2429
+ wq = rq->wq;
2430
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
2431
+ } else {
2432
+ tail = kwq->tail;
2433
+ }
20982434
2099
- wq = rq->wq;
2100
- tail = wq->tail;
21012435 /* Validate tail before using it since it is user writable. */
21022436 if (tail >= rq->size)
21032437 tail = 0;
2104
- if (unlikely(tail == wq->head)) {
2438
+
2439
+ if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
2440
+ head = get_rvt_head(rq, ip);
2441
+ kwq->count = rvt_get_rq_count(rq, head, tail);
2442
+ }
2443
+ if (unlikely(kwq->count == 0)) {
21052444 ret = 0;
21062445 goto unlock;
21072446 }
2108
- /* Make sure entry is read after head index is read. */
2447
+ /* Make sure entry is read after the count is read. */
21092448 smp_rmb();
21102449 wqe = rvt_get_rwqe_ptr(rq, tail);
21112450 /*
....@@ -2115,43 +2454,43 @@
21152454 */
21162455 if (++tail >= rq->size)
21172456 tail = 0;
2118
- wq->tail = tail;
2457
+ if (ip)
2458
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
2459
+ else
2460
+ kwq->tail = tail;
21192461 if (!wr_id_only && !init_sge(qp, wqe)) {
21202462 ret = -1;
21212463 goto unlock;
21222464 }
21232465 qp->r_wr_id = wqe->wr_id;
21242466
2467
+ kwq->count--;
21252468 ret = 1;
21262469 set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
21272470 if (handler) {
2128
- u32 n;
2129
-
21302471 /*
21312472 * Validate head pointer value and compute
21322473 * the number of remaining WQEs.
21332474 */
2134
- n = wq->head;
2135
- if (n >= rq->size)
2136
- n = 0;
2137
- if (n < tail)
2138
- n += rq->size - tail;
2139
- else
2140
- n -= tail;
2141
- if (n < srq->limit) {
2142
- struct ib_event ev;
2475
+ if (kwq->count < srq->limit) {
2476
+ kwq->count =
2477
+ rvt_get_rq_count(rq,
2478
+ get_rvt_head(rq, ip), tail);
2479
+ if (kwq->count < srq->limit) {
2480
+ struct ib_event ev;
21432481
2144
- srq->limit = 0;
2145
- spin_unlock_irqrestore(&rq->lock, flags);
2146
- ev.device = qp->ibqp.device;
2147
- ev.element.srq = qp->ibqp.srq;
2148
- ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
2149
- handler(&ev, srq->ibsrq.srq_context);
2150
- goto bail;
2482
+ srq->limit = 0;
2483
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
2484
+ ev.device = qp->ibqp.device;
2485
+ ev.element.srq = qp->ibqp.srq;
2486
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
2487
+ handler(&ev, srq->ibsrq.srq_context);
2488
+ goto bail;
2489
+ }
21512490 }
21522491 }
21532492 unlock:
2154
- spin_unlock_irqrestore(&rq->lock, flags);
2493
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
21552494 bail:
21562495 return ret;
21572496 }
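
On the kwq->count caching above: rvt_get_rq_count() just measures how many entries sit between tail and head on the ring, and the consumer only recomputes it from the (possibly user-mapped) head when the cached count drops below RVT_RWQ_COUNT_THRESHOLD, so the shared head is not touched on every dequeue. The distance calculation, restated as plain C with invented demo names:

#include <assert.h>

/* entries currently posted between tail and head on a ring of 'size'
 * slots; head == tail means empty (same convention as the rvt rings) */
static unsigned int demo_rq_count(unsigned int size, unsigned int head,
                                  unsigned int tail)
{
    return head >= tail ? head - tail : head + size - tail;
}

int main(void)
{
    assert(demo_rq_count(16, 5, 5) == 0);    /* empty */
    assert(demo_rq_count(16, 9, 5) == 4);    /* no wrap */
    assert(demo_rq_count(16, 2, 14) == 4);   /* wrapped: 14, 15, 0, 1 */
    return 0;
}
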
....@@ -2213,11 +2552,12 @@
22132552 }
22142553
22152554 /*
2216
- * rvt_add_retry_timer - add/start a retry timer
2555
+ * rvt_add_retry_timer_ext - add/start a retry timer
22172556 * @qp - the QP
2557
+ * @shift - timeout shift to wait for multiple packets
22182558 * add a retry timer on the QP
22192559 */
2220
-void rvt_add_retry_timer(struct rvt_qp *qp)
2560
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
22212561 {
22222562 struct ib_qp *ibqp = &qp->ibqp;
22232563 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
....@@ -2225,17 +2565,16 @@
22252565 lockdep_assert_held(&qp->s_lock);
22262566 qp->s_flags |= RVT_S_TIMER;
22272567 /* 4.096 usec. * (1 << qp->timeout) */
2228
- qp->s_timer.expires = jiffies + qp->timeout_jiffies +
2229
- rdi->busy_jiffies;
2568
+ qp->s_timer.expires = jiffies + rdi->busy_jiffies +
2569
+ (qp->timeout_jiffies << shift);
22302570 add_timer(&qp->s_timer);
22312571 }
2232
-EXPORT_SYMBOL(rvt_add_retry_timer);
2572
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
22332573
22342574 /**
2235
- * rvt_add_rnr_timer - add/start an rnr timer
2236
- * @qp - the QP
2237
- * @aeth - aeth of RNR timeout, simulated aeth for loopback
2238
- * add an rnr timer on the QP
2575
+ * rvt_add_rnr_timer - add/start an rnr timer on the QP
2576
+ * @qp: the QP
2577
+ * @aeth: aeth of RNR timeout, simulated aeth for loopback
22392578 */
22402579 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
22412580 {
....@@ -2252,7 +2591,7 @@
22522591
22532592 /**
22542593 * rvt_stop_rc_timers - stop all timers
2255
- * @qp - the QP
2594
+ * @qp: the QP
22562595 * stop any pending timers
22572596 */
22582597 void rvt_stop_rc_timers(struct rvt_qp *qp)
....@@ -2286,7 +2625,7 @@
22862625
22872626 /**
22882627 * rvt_del_timers_sync - wait for any timeout routines to exit
2289
- * @qp - the QP
2628
+ * @qp: the QP
22902629 */
22912630 void rvt_del_timers_sync(struct rvt_qp *qp)
22922631 {
....@@ -2295,7 +2634,7 @@
22952634 }
22962635 EXPORT_SYMBOL(rvt_del_timers_sync);
22972636
2298
-/**
2637
+/*
22992638 * This is called from s_timer for missing responses.
23002639 */
23012640 static void rvt_rc_timeout(struct timer_list *t)
....@@ -2345,12 +2684,13 @@
23452684 * rvt_qp_iter_init - initial for QP iteration
23462685 * @rdi: rvt devinfo
23472686 * @v: u64 value
2687
+ * @cb: user-defined callback
23482688 *
23492689 * This returns an iterator suitable for iterating QPs
23502690 * in the system.
23512691 *
2352
- * The @cb is a user defined callback and @v is a 64
2353
- * bit value passed to and relevant for processing in the
2692
+ * The @cb is a user-defined callback and @v is a 64-bit
2693
+ * value passed to and relevant for processing in the
23542694 * @cb. An example use case would be to alter QP processing
23552695 * based on criteria not part of the rvt_qp.
23562696 *
....@@ -2381,7 +2721,7 @@
23812721
23822722 /**
23832723 * rvt_qp_iter_next - return the next QP in iter
2384
- * @iter - the iterator
2724
+ * @iter: the iterator
23852725 *
23862726 * Fine grained QP iterator suitable for use
23872727 * with debugfs seq_file mechanisms.
....@@ -2444,14 +2784,14 @@
24442784
24452785 /**
24462786 * rvt_qp_iter - iterate all QPs
2447
- * @rdi - rvt devinfo
2448
- * @v - a 64 bit value
2449
- * @cb - a callback
2787
+ * @rdi: rvt devinfo
2788
+ * @v: a 64-bit value
2789
+ * @cb: a callback
24502790 *
24512791 * This provides a way for iterating all QPs.
24522792 *
2453
- * The @cb is a user defined callback and @v is a 64
2454
- * bit value passed to and relevant for processing in the
2793
+ * The @cb is a user-defined callback and @v is a 64-bit
2794
+ * value passed to and relevant for processing in the
24552795 * cb. An example use case would be to alter QP processing
24562796 * based on criteria not part of the rvt_qp.
24572797 *
....@@ -2484,3 +2824,450 @@
24842824 rcu_read_unlock();
24852825 }
24862826 EXPORT_SYMBOL(rvt_qp_iter);
2827
+
2828
+/*
2829
+ * This should be called with s_lock and r_lock held.
2830
+ */
2831
+void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
2832
+ enum ib_wc_status status)
2833
+{
2834
+ u32 old_last, last;
2835
+ struct rvt_dev_info *rdi;
2836
+
2837
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
2838
+ return;
2839
+ rdi = ib_to_rvt(qp->ibqp.device);
2840
+
2841
+ old_last = qp->s_last;
2842
+ trace_rvt_qp_send_completion(qp, wqe, old_last);
2843
+ last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
2844
+ status);
2845
+ if (qp->s_acked == old_last)
2846
+ qp->s_acked = last;
2847
+ if (qp->s_cur == old_last)
2848
+ qp->s_cur = last;
2849
+ if (qp->s_tail == old_last)
2850
+ qp->s_tail = last;
2851
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
2852
+ qp->s_draining = 0;
2853
+}
2854
+EXPORT_SYMBOL(rvt_send_complete);
2855
+
2856
+/**
2857
+ * rvt_copy_sge - copy data to SGE memory
2858
+ * @qp: associated QP
2859
+ * @ss: the SGE state
2860
+ * @data: the data to copy
2861
+ * @length: the length of the data
2862
+ * @release: boolean to release MR
2863
+ * @copy_last: do a separate copy of the last 8 bytes
2864
+ */
2865
+void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
2866
+ void *data, u32 length,
2867
+ bool release, bool copy_last)
2868
+{
2869
+ struct rvt_sge *sge = &ss->sge;
2870
+ int i;
2871
+ bool in_last = false;
2872
+ bool cacheless_copy = false;
2873
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2874
+ struct rvt_wss *wss = rdi->wss;
2875
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
2876
+
2877
+ if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
2878
+ cacheless_copy = length >= PAGE_SIZE;
2879
+ } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
2880
+ if (length >= PAGE_SIZE) {
2881
+ /*
2882
+ * NOTE: this *assumes*:
2883
+ * o The first vaddr is the dest.
2884
+ * o If multiple pages, then vaddr is sequential.
2885
+ */
2886
+ wss_insert(wss, sge->vaddr);
2887
+ if (length >= (2 * PAGE_SIZE))
2888
+ wss_insert(wss, (sge->vaddr + PAGE_SIZE));
2889
+
2890
+ cacheless_copy = wss_exceeds_threshold(wss);
2891
+ } else {
2892
+ wss_advance_clean_counter(wss);
2893
+ }
2894
+ }
2895
+
2896
+ if (copy_last) {
2897
+ if (length > 8) {
2898
+ length -= 8;
2899
+ } else {
2900
+ copy_last = false;
2901
+ in_last = true;
2902
+ }
2903
+ }
2904
+
2905
+again:
2906
+ while (length) {
2907
+ u32 len = rvt_get_sge_length(sge, length);
2908
+
2909
+ WARN_ON_ONCE(len == 0);
2910
+ if (unlikely(in_last)) {
2911
+ /* enforce byte transfer ordering */
2912
+ for (i = 0; i < len; i++)
2913
+ ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
2914
+ } else if (cacheless_copy) {
2915
+ cacheless_memcpy(sge->vaddr, data, len);
2916
+ } else {
2917
+ memcpy(sge->vaddr, data, len);
2918
+ }
2919
+ rvt_update_sge(ss, len, release);
2920
+ data += len;
2921
+ length -= len;
2922
+ }
2923
+
2924
+ if (copy_last) {
2925
+ copy_last = false;
2926
+ in_last = true;
2927
+ length = 8;
2928
+ goto again;
2929
+ }
2930
+}
2931
+EXPORT_SYMBOL(rvt_copy_sge);
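
On the copy_last handling in rvt_copy_sge() above: when requested, the bulk of the payload is copied first and the final 8 bytes are written last, byte by byte, the usual trick when the receiver may be polling the tail of the buffer for arrival. A small user-space sketch of just that split, with an invented demo_copy_last() (ordering in real use still depends on the kernel's barriers and the cacheless-copy path, which this does not model):

#include <stdint.h>
#include <string.h>

/* copy 'len' bytes, but hold the last (up to) 8 bytes back and write
 * them strictly last, one byte at a time */
static void demo_copy_last(uint8_t *dst, const uint8_t *src, size_t len)
{
    size_t bulk = len > 8 ? len - 8 : 0;

    memcpy(dst, src, bulk);                     /* bulk of the payload first */
    for (size_t i = bulk; i < len; i++)         /* ordered tail copy */
        dst[i] = src[i];
}

int main(void)
{
    uint8_t src[32], dst[32] = { 0 };

    for (int i = 0; i < 32; i++)
        src[i] = (uint8_t)i;
    demo_copy_last(dst, src, sizeof(src));
    return memcmp(dst, src, sizeof(dst)) ? 1 : 0;
}
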
2932
+
2933
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
2934
+ struct rvt_qp *sqp)
2935
+{
2936
+ rvp->n_pkt_drops++;
2937
+ /*
2938
+ * For RC, the requester would time out and retry so
2939
+ * shortcut the timeouts and just signal too many retries.
2940
+ */
2941
+ return sqp->ibqp.qp_type == IB_QPT_RC ?
2942
+ IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
2943
+}
2944
+
2945
+/**
2946
+ * rvt_ruc_loopback - handle UC and RC loopback requests
2947
+ * @sqp: the sending QP
2948
+ *
2949
+ * This is called from rvt_do_send() to forward a WQE addressed to the same HFI.
2950
+ * Note that although we are single threaded due to the send engine, we still
2951
+ * have to protect against post_send(). We don't have to worry about
2952
+ * receive interrupts since this is a connected protocol and all packets
2953
+ * will pass through here.
2954
+ */
2955
+void rvt_ruc_loopback(struct rvt_qp *sqp)
2956
+{
2957
+ struct rvt_ibport *rvp = NULL;
2958
+ struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
2959
+ struct rvt_qp *qp;
2960
+ struct rvt_swqe *wqe;
2961
+ struct rvt_sge *sge;
2962
+ unsigned long flags;
2963
+ struct ib_wc wc;
2964
+ u64 sdata;
2965
+ atomic64_t *maddr;
2966
+ enum ib_wc_status send_status;
2967
+ bool release;
2968
+ int ret;
2969
+ bool copy_last = false;
2970
+ int local_ops = 0;
2971
+
2972
+ rcu_read_lock();
2973
+ rvp = rdi->ports[sqp->port_num - 1];
2974
+
2975
+ /*
2976
+ * Note that we check the responder QP state after
2977
+ * checking the requester's state.
2978
+ */
2979
+
2980
+ qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
2981
+ sqp->remote_qpn);
2982
+
2983
+ spin_lock_irqsave(&sqp->s_lock, flags);
2984
+
2985
+ /* Return if we are already busy processing a work request. */
2986
+ if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
2987
+ !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
2988
+ goto unlock;
2989
+
2990
+ sqp->s_flags |= RVT_S_BUSY;
2991
+
2992
+again:
2993
+ if (sqp->s_last == READ_ONCE(sqp->s_head))
2994
+ goto clr_busy;
2995
+ wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
2996
+
2997
+ /* Return if it is not OK to start a new work request. */
2998
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
2999
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
3000
+ goto clr_busy;
3001
+ /* We are in the error state, flush the work request. */
3002
+ send_status = IB_WC_WR_FLUSH_ERR;
3003
+ goto flush_send;
3004
+ }
3005
+
3006
+ /*
3007
+ * We can rely on the entry not changing without the s_lock
3008
+ * being held until we update s_last.
3009
+ * We increment s_cur to indicate s_last is in progress.
3010
+ */
3011
+ if (sqp->s_last == sqp->s_cur) {
3012
+ if (++sqp->s_cur >= sqp->s_size)
3013
+ sqp->s_cur = 0;
3014
+ }
3015
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3016
+
3017
+ if (!qp) {
3018
+ send_status = loopback_qp_drop(rvp, sqp);
3019
+ goto serr_no_r_lock;
3020
+ }
3021
+ spin_lock_irqsave(&qp->r_lock, flags);
3022
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
3023
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
3024
+ send_status = loopback_qp_drop(rvp, sqp);
3025
+ goto serr;
3026
+ }
3027
+
3028
+ memset(&wc, 0, sizeof(wc));
3029
+ send_status = IB_WC_SUCCESS;
3030
+
3031
+ release = true;
3032
+ sqp->s_sge.sge = wqe->sg_list[0];
3033
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
3034
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
3035
+ sqp->s_len = wqe->length;
3036
+ switch (wqe->wr.opcode) {
3037
+ case IB_WR_REG_MR:
3038
+ goto send_comp;
3039
+
3040
+ case IB_WR_LOCAL_INV:
3041
+ if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
3042
+ if (rvt_invalidate_rkey(sqp,
3043
+ wqe->wr.ex.invalidate_rkey))
3044
+ send_status = IB_WC_LOC_PROT_ERR;
3045
+ local_ops = 1;
3046
+ }
3047
+ goto send_comp;
3048
+
3049
+ case IB_WR_SEND_WITH_INV:
3050
+ case IB_WR_SEND_WITH_IMM:
3051
+ case IB_WR_SEND:
3052
+ ret = rvt_get_rwqe(qp, false);
3053
+ if (ret < 0)
3054
+ goto op_err;
3055
+ if (!ret)
3056
+ goto rnr_nak;
3057
+ if (wqe->length > qp->r_len)
3058
+ goto inv_err;
3059
+ switch (wqe->wr.opcode) {
3060
+ case IB_WR_SEND_WITH_INV:
3061
+ if (!rvt_invalidate_rkey(qp,
3062
+ wqe->wr.ex.invalidate_rkey)) {
3063
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
3064
+ wc.ex.invalidate_rkey =
3065
+ wqe->wr.ex.invalidate_rkey;
3066
+ }
3067
+ break;
3068
+ case IB_WR_SEND_WITH_IMM:
3069
+ wc.wc_flags = IB_WC_WITH_IMM;
3070
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
3071
+ break;
3072
+ default:
3073
+ break;
3074
+ }
3075
+ break;
3076
+
3077
+ case IB_WR_RDMA_WRITE_WITH_IMM:
3078
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3079
+ goto inv_err;
3080
+ wc.wc_flags = IB_WC_WITH_IMM;
3081
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
3082
+ ret = rvt_get_rwqe(qp, true);
3083
+ if (ret < 0)
3084
+ goto op_err;
3085
+ if (!ret)
3086
+ goto rnr_nak;
3087
+ /* skip copy_last set and qp_access_flags recheck */
3088
+ goto do_write;
3089
+ case IB_WR_RDMA_WRITE:
3090
+ copy_last = rvt_is_user_qp(qp);
3091
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3092
+ goto inv_err;
3093
+do_write:
3094
+ if (wqe->length == 0)
3095
+ break;
3096
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
3097
+ wqe->rdma_wr.remote_addr,
3098
+ wqe->rdma_wr.rkey,
3099
+ IB_ACCESS_REMOTE_WRITE)))
3100
+ goto acc_err;
3101
+ qp->r_sge.sg_list = NULL;
3102
+ qp->r_sge.num_sge = 1;
3103
+ qp->r_sge.total_len = wqe->length;
3104
+ break;
3105
+
3106
+ case IB_WR_RDMA_READ:
3107
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
3108
+ goto inv_err;
3109
+ if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
3110
+ wqe->rdma_wr.remote_addr,
3111
+ wqe->rdma_wr.rkey,
3112
+ IB_ACCESS_REMOTE_READ)))
3113
+ goto acc_err;
3114
+ release = false;
3115
+ sqp->s_sge.sg_list = NULL;
3116
+ sqp->s_sge.num_sge = 1;
3117
+ qp->r_sge.sge = wqe->sg_list[0];
3118
+ qp->r_sge.sg_list = wqe->sg_list + 1;
3119
+ qp->r_sge.num_sge = wqe->wr.num_sge;
3120
+ qp->r_sge.total_len = wqe->length;
3121
+ break;
3122
+
3123
+ case IB_WR_ATOMIC_CMP_AND_SWP:
3124
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
3125
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
3126
+ goto inv_err;
3127
+ if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
3128
+ goto inv_err;
3129
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
3130
+ wqe->atomic_wr.remote_addr,
3131
+ wqe->atomic_wr.rkey,
3132
+ IB_ACCESS_REMOTE_ATOMIC)))
3133
+ goto acc_err;
3134
+ /* Perform atomic OP and save result. */
3135
+ maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
3136
+ sdata = wqe->atomic_wr.compare_add;
3137
+ *(u64 *)sqp->s_sge.sge.vaddr =
3138
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
3139
+ (u64)atomic64_add_return(sdata, maddr) - sdata :
3140
+ (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
3141
+ sdata, wqe->atomic_wr.swap);
3142
+ rvt_put_mr(qp->r_sge.sge.mr);
3143
+ qp->r_sge.num_sge = 0;
3144
+ goto send_comp;
3145
+
3146
+ default:
3147
+ send_status = IB_WC_LOC_QP_OP_ERR;
3148
+ goto serr;
3149
+ }
3150
+
3151
+ sge = &sqp->s_sge.sge;
3152
+ while (sqp->s_len) {
3153
+ u32 len = rvt_get_sge_length(sge, sqp->s_len);
3154
+
3155
+ WARN_ON_ONCE(len == 0);
3156
+ rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
3157
+ len, release, copy_last);
3158
+ rvt_update_sge(&sqp->s_sge, len, !release);
3159
+ sqp->s_len -= len;
3160
+ }
3161
+ if (release)
3162
+ rvt_put_ss(&qp->r_sge);
3163
+
3164
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
3165
+ goto send_comp;
3166
+
3167
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
3168
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
3169
+ else
3170
+ wc.opcode = IB_WC_RECV;
3171
+ wc.wr_id = qp->r_wr_id;
3172
+ wc.status = IB_WC_SUCCESS;
3173
+ wc.byte_len = wqe->length;
3174
+ wc.qp = &qp->ibqp;
3175
+ wc.src_qp = qp->remote_qpn;
3176
+ wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
3177
+ wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
3178
+ wc.port_num = 1;
3179
+ /* Signal completion event if the solicited bit is set. */
3180
+ rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
3181
+
3182
+send_comp:
3183
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3184
+ spin_lock_irqsave(&sqp->s_lock, flags);
3185
+ rvp->n_loop_pkts++;
3186
+flush_send:
3187
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
3188
+ spin_lock(&sqp->r_lock);
3189
+ rvt_send_complete(sqp, wqe, send_status);
3190
+ spin_unlock(&sqp->r_lock);
3191
+ if (local_ops) {
3192
+ atomic_dec(&sqp->local_ops_pending);
3193
+ local_ops = 0;
3194
+ }
3195
+ goto again;
3196
+
3197
+rnr_nak:
3198
+ /* Handle RNR NAK */
3199
+ if (qp->ibqp.qp_type == IB_QPT_UC)
3200
+ goto send_comp;
3201
+ rvp->n_rnr_naks++;
3202
+ /*
3203
+ * Note: we don't need the s_lock held since the BUSY flag
3204
+ * makes this single threaded.
3205
+ */
3206
+ if (sqp->s_rnr_retry == 0) {
3207
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
3208
+ goto serr;
3209
+ }
3210
+ if (sqp->s_rnr_retry_cnt < 7)
3211
+ sqp->s_rnr_retry--;
3212
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3213
+ spin_lock_irqsave(&sqp->s_lock, flags);
3214
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
3215
+ goto clr_busy;
3216
+ rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
3217
+ IB_AETH_CREDIT_SHIFT);
3218
+ goto clr_busy;
3219
+
3220
+op_err:
3221
+ send_status = IB_WC_REM_OP_ERR;
3222
+ wc.status = IB_WC_LOC_QP_OP_ERR;
3223
+ goto err;
3224
+
3225
+inv_err:
3226
+ send_status =
3227
+ sqp->ibqp.qp_type == IB_QPT_RC ?
3228
+ IB_WC_REM_INV_REQ_ERR :
3229
+ IB_WC_SUCCESS;
3230
+ wc.status = IB_WC_LOC_QP_OP_ERR;
3231
+ goto err;
3232
+
3233
+acc_err:
3234
+ send_status = IB_WC_REM_ACCESS_ERR;
3235
+ wc.status = IB_WC_LOC_PROT_ERR;
3236
+err:
3237
+ /* responder goes to error state */
3238
+ rvt_rc_error(qp, wc.status);
3239
+
3240
+serr:
3241
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3242
+serr_no_r_lock:
3243
+ spin_lock_irqsave(&sqp->s_lock, flags);
3244
+ spin_lock(&sqp->r_lock);
3245
+ rvt_send_complete(sqp, wqe, send_status);
3246
+ spin_unlock(&sqp->r_lock);
3247
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
3248
+ int lastwqe;
3249
+
3250
+ spin_lock(&sqp->r_lock);
3251
+ lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
3252
+ spin_unlock(&sqp->r_lock);
3253
+
3254
+ sqp->s_flags &= ~RVT_S_BUSY;
3255
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3256
+ if (lastwqe) {
3257
+ struct ib_event ev;
3258
+
3259
+ ev.device = sqp->ibqp.device;
3260
+ ev.element.qp = &sqp->ibqp;
3261
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
3262
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
3263
+ }
3264
+ goto done;
3265
+ }
3266
+clr_busy:
3267
+ sqp->s_flags &= ~RVT_S_BUSY;
3268
+unlock:
3269
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3270
+done:
3271
+ rcu_read_unlock();
3272
+}
3273
+EXPORT_SYMBOL(rvt_ruc_loopback);
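
One more aside, on the atomic branch of rvt_ruc_loopback(): both IB atomics hand back the value that was in memory before the operation, which is why the code subtracts sdata from atomic64_add_return() and uses the return of cmpxchg() directly. A user-space restatement of those two return-value conventions with C11 atomics (demo_* names invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* IB_WR_ATOMIC_FETCH_AND_ADD: add 'add' to *target, return the old value */
static uint64_t demo_fetch_add(_Atomic uint64_t *target, uint64_t add)
{
    return atomic_fetch_add(target, add);
}

/* IB_WR_ATOMIC_CMP_AND_SWP: if *target == compare, store 'swap';
 * either way, return the value that was there before */
static uint64_t demo_cmp_swap(_Atomic uint64_t *target,
                              uint64_t compare, uint64_t swap)
{
    uint64_t old = compare;

    atomic_compare_exchange_strong(target, &old, swap);
    return old;
}

int main(void)
{
    _Atomic uint64_t mem = 5;

    printf("fetch_add(3) returned %llu, mem is now %llu\n",
           (unsigned long long)demo_fetch_add(&mem, 3),
           (unsigned long long)atomic_load(&mem));
    printf("cmp_swap(8, 100) returned %llu, mem is now %llu\n",
           (unsigned long long)demo_cmp_swap(&mem, 8, 100),
           (unsigned long long)atomic_load(&mem));
    return 0;
}
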