hc
2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/drivers/infiniband/sw/rdmavt/qp.c
....@@ -1,5 +1,5 @@
11 /*
2
- * Copyright(c) 2016, 2017 Intel Corporation.
2
+ * Copyright(c) 2016 - 2020 Intel Corporation.
33 *
44 * This file is provided under a dual BSD/GPLv2 license. When using or
55 * redistributing this file, you may do so under either license.
....@@ -53,9 +53,12 @@
5353 #include <rdma/ib_verbs.h>
5454 #include <rdma/ib_hdrs.h>
5555 #include <rdma/opa_addr.h>
56
+#include <rdma/uverbs_ioctl.h>
5657 #include "qp.h"
5758 #include "vt.h"
5859 #include "trace.h"
60
+
61
+#define RVT_RWQ_COUNT_THRESHOLD 16
5962
6063 static void rvt_rc_timeout(struct timer_list *t);
6164 static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
....@@ -119,6 +122,187 @@
119122 RVT_POST_SEND_OK | RVT_FLUSH_SEND,
120123 };
121124 EXPORT_SYMBOL(ib_rvt_state_ops);
125
+
126
+/* platform specific: return the last level cache (llc) size, in KiB */
127
+static int rvt_wss_llc_size(void)
128
+{
129
+ /* assume that the boot CPU value is universal for all CPUs */
130
+ return boot_cpu_data.x86_cache_size;
131
+}
132
+
133
+/* platform specific: cacheless copy */
134
+static void cacheless_memcpy(void *dst, void *src, size_t n)
135
+{
136
+ /*
137
+ * Use the only available X64 cacheless copy. Add a __user cast
138
+ * to quiet sparse. The src argument is already in the kernel so
139
+ * there are no security issues. The extra fault recovery machinery
140
+ * is not invoked.
141
+ */
142
+ __copy_user_nocache(dst, (void __user *)src, n, 0);
143
+}
144
+
145
+void rvt_wss_exit(struct rvt_dev_info *rdi)
146
+{
147
+ struct rvt_wss *wss = rdi->wss;
148
+
149
+ if (!wss)
150
+ return;
151
+
152
+ /* coded to handle partially initialized and repeat callers */
153
+ kfree(wss->entries);
154
+ wss->entries = NULL;
155
+ kfree(rdi->wss);
156
+ rdi->wss = NULL;
157
+}
158
+
159
+/**
160
+ * rvt_wss_init - Init wss data structures
161
+ *
162
+ * Return: 0 on success
163
+ */
164
+int rvt_wss_init(struct rvt_dev_info *rdi)
165
+{
166
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
167
+ unsigned int wss_threshold = rdi->dparms.wss_threshold;
168
+ unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
169
+ long llc_size;
170
+ long llc_bits;
171
+ long table_size;
172
+ long table_bits;
173
+ struct rvt_wss *wss;
174
+ int node = rdi->dparms.node;
175
+
176
+ if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
177
+ rdi->wss = NULL;
178
+ return 0;
179
+ }
180
+
181
+ rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
182
+ if (!rdi->wss)
183
+ return -ENOMEM;
184
+ wss = rdi->wss;
185
+
186
+ /* check for a valid percent range - default to 80 if none or invalid */
187
+ if (wss_threshold < 1 || wss_threshold > 100)
188
+ wss_threshold = 80;
189
+
190
+ /* reject a wildly large period */
191
+ if (wss_clean_period > 1000000)
192
+ wss_clean_period = 256;
193
+
194
+ /* reject a zero period */
195
+ if (wss_clean_period == 0)
196
+ wss_clean_period = 1;
197
+
198
+ /*
199
+ * Calculate the table size - the next power of 2 larger than the
200
+ * LLC size. LLC size is in KiB.
201
+ */
202
+ llc_size = rvt_wss_llc_size() * 1024;
203
+ table_size = roundup_pow_of_two(llc_size);
204
+
205
+ /* one bit per page in rounded up table */
206
+ llc_bits = llc_size / PAGE_SIZE;
207
+ table_bits = table_size / PAGE_SIZE;
208
+ wss->pages_mask = table_bits - 1;
209
+ wss->num_entries = table_bits / BITS_PER_LONG;
210
+
211
+ wss->threshold = (llc_bits * wss_threshold) / 100;
212
+ if (wss->threshold == 0)
213
+ wss->threshold = 1;
214
+
215
+ wss->clean_period = wss_clean_period;
216
+ atomic_set(&wss->clean_counter, wss_clean_period);
217
+
218
+ wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
219
+ GFP_KERNEL, node);
220
+ if (!wss->entries) {
221
+ rvt_wss_exit(rdi);
222
+ return -ENOMEM;
223
+ }
224
+
225
+ return 0;
226
+}
227
+
228
+/*
229
+ * Advance the clean counter. When the clean period has expired,
230
+ * clean an entry.
231
+ *
232
+ * This is implemented in atomics to avoid locking. Because multiple
233
+ * variables are involved, it can be racy, which can lead to slightly
234
+ * inaccurate information. Since this is only a heuristic, this is
235
+ * OK. Any inaccuracies will clean themselves out as the counter
236
+ * advances. That said, it is unlikely the entry clean operation will
237
+ * race - the next possible racer will not start until the next clean
238
+ * period.
239
+ *
240
+ * The clean counter is implemented as a decrement to zero. When zero
241
+ * is reached an entry is cleaned.
242
+ */
243
+static void wss_advance_clean_counter(struct rvt_wss *wss)
244
+{
245
+ int entry;
246
+ int weight;
247
+ unsigned long bits;
248
+
249
+ /* become the cleaner if we decrement the counter to zero */
250
+ if (atomic_dec_and_test(&wss->clean_counter)) {
251
+ /*
252
+ * Set, not add, the clean period. This avoids an issue
253
+ * where the counter could decrement below the clean period.
254
+ * Doing a set can result in lost decrements, slowing the
255
+ * clean advance. Since this is a heuristic, this possible
256
+ * slowdown is OK.
257
+ *
258
+ * An alternative is to loop, advancing the counter by a
259
+ * clean period until the result is > 0. However, this could
260
+ * lead to several threads keeping another in the clean loop.
261
+ * This could be mitigated by limiting the number of times
262
+ * we stay in the loop.
263
+ */
264
+ atomic_set(&wss->clean_counter, wss->clean_period);
265
+
266
+ /*
267
+ * Uniquely grab the entry to clean and move to next.
268
+ * The current entry is always the lower bits of
269
+ * wss.clean_entry. The table size, wss.num_entries,
270
+ * is always a power-of-2.
271
+ */
272
+ entry = (atomic_inc_return(&wss->clean_entry) - 1)
273
+ & (wss->num_entries - 1);
274
+
275
+ /* clear the entry and count the bits */
276
+ bits = xchg(&wss->entries[entry], 0);
277
+ weight = hweight64((u64)bits);
278
+ /* only adjust the contended total count if needed */
279
+ if (weight)
280
+ atomic_sub(weight, &wss->total_count);
281
+ }
282
+}
283
+
284
+/*
285
+ * Insert the given address into the working set array.
286
+ */
287
+static void wss_insert(struct rvt_wss *wss, void *address)
288
+{
289
+ u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
290
+ u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
291
+ u32 nr = page & (BITS_PER_LONG - 1);
292
+
293
+ if (!test_and_set_bit(nr, &wss->entries[entry]))
294
+ atomic_inc(&wss->total_count);
295
+
296
+ wss_advance_clean_counter(wss);
297
+}
298
+
299
+/*
300
+ * Is the working set larger than the threshold?
301
+ */
302
+static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
303
+{
304
+ return atomic_read(&wss->total_count) >= wss->threshold;
305
+}
122306
123307 static void get_map_page(struct rvt_qpn_table *qpt,
124308 struct rvt_qpn_map *map)
....@@ -321,8 +505,6 @@
321505 if (qps_inuse)
322506 rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
323507 qps_inuse);
324
- if (!rdi->qp_dev)
325
- return;
326508
327509 kfree(rdi->qp_dev->qp_table);
328510 free_qpn_table(&rdi->qp_dev->qpn_table);
....@@ -341,15 +523,18 @@
341523 * @rdi: rvt device info structure
342524 * @qpt: queue pair number table pointer
343525 * @port_num: IB port number, 1 based, comes from core
526
+ * @exclude_prefix: prefix of special queue pair number being allocated
344527 *
345528 * Return: The queue pair number
346529 */
347530 static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
348
- enum ib_qp_type type, u8 port_num)
531
+ enum ib_qp_type type, u8 port_num, u8 exclude_prefix)
349532 {
350533 u32 i, offset, max_scan, qpn;
351534 struct rvt_qpn_map *map;
352535 u32 ret;
536
+ u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ?
537
+ RVT_AIP_QPN_MAX : RVT_QPN_MAX;
353538
354539 if (rdi->driver_f.alloc_qpn)
355540 return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);
....@@ -369,7 +554,7 @@
369554 }
370555
371556 qpn = qpt->last + qpt->incr;
372
- if (qpn >= RVT_QPN_MAX)
557
+ if (qpn >= max_qpn)
373558 qpn = qpt->incr | ((qpt->last & 1) ^ 1);
374559 /* offset carries bit 0 */
375560 offset = qpn & RVT_BITS_PER_PAGE_MASK;
....@@ -445,13 +630,7 @@
445630 while (qp->s_last != qp->s_head) {
446631 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
447632
448
- rvt_put_swqe(wqe);
449
-
450
- if (qp->ibqp.qp_type == IB_QPT_UD ||
451
- qp->ibqp.qp_type == IB_QPT_SMI ||
452
- qp->ibqp.qp_type == IB_QPT_GSI)
453
- atomic_dec(&ibah_to_rvtah(
454
- wqe->ud_wr.ah)->refcount);
633
+ rvt_put_qp_swqe(qp, wqe);
455634 if (++qp->s_last >= qp->s_size)
456635 qp->s_last = 0;
457636 smp_wmb(); /* see qp_set_savail */
....@@ -630,6 +809,47 @@
630809 }
631810
632811 /**
812
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
813
+ * @rq: receive queue data structure
814
+ * @size: total size of the request queue entries, in bytes
815
+ * @node: The NUMA node
816
+ * @udata: non-NULL if a user-mapped buffer is required, NULL for a kernel buffer
817
+ *
818
+ * Return: 0 on success, -ENOMEM if memory allocation failed.
819
+ * This function is used by both shared receive
820
+ * queues and non-shared receive queues to allocate
821
+ * memory.
822
+ */
823
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
824
+ struct ib_udata *udata)
825
+{
826
+ if (udata) {
827
+ rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
828
+ if (!rq->wq)
829
+ goto bail;
830
+ /* need kwq with no buffers */
831
+ rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
832
+ if (!rq->kwq)
833
+ goto bail;
834
+ rq->kwq->curr_wq = rq->wq->wq;
835
+ } else {
836
+ /* need kwq with buffers */
837
+ rq->kwq =
838
+ vzalloc_node(sizeof(struct rvt_krwq) + size, node);
839
+ if (!rq->kwq)
840
+ goto bail;
841
+ rq->kwq->curr_wq = rq->kwq->wq;
842
+ }
843
+
844
+ spin_lock_init(&rq->kwq->p_lock);
845
+ spin_lock_init(&rq->kwq->c_lock);
846
+ return 0;
847
+bail:
848
+ rvt_free_rq(rq);
849
+ return -ENOMEM;
850
+}
851
+
852
+/**
633853 * rvt_init_qp - initialize the QP state to the reset state
634854 * @qp: the QP to init or reinit
635855 * @type: the QP type
....@@ -677,11 +897,8 @@
677897 qp->s_mig_state = IB_MIG_MIGRATED;
678898 qp->r_head_ack_queue = 0;
679899 qp->s_tail_ack_queue = 0;
900
+ qp->s_acked_ack_queue = 0;
680901 qp->s_num_rd_atomic = 0;
681
- if (qp->r_rq.wq) {
682
- qp->r_rq.wq->head = 0;
683
- qp->r_rq.wq->tail = 0;
684
- }
685902 qp->r_sge.num_sge = 0;
686903 atomic_set(&qp->s_reserved_used, 0);
687904 }
....@@ -769,9 +986,67 @@
769986 {
770987 struct rvt_qpn_map *map;
771988
989
+ if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE)
990
+ qpn &= RVT_AIP_QP_SUFFIX;
991
+
772992 map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
773993 if (map->page)
774994 clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
995
+}
996
+
997
+/**
998
+ * get_allowed_ops - Given a QP type, return the appropriate allowed OP
999
+ * @type: valid, supported, QP type
1000
+ */
1001
+static u8 get_allowed_ops(enum ib_qp_type type)
1002
+{
1003
+ return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
1004
+ IB_OPCODE_UC : IB_OPCODE_UD;
1005
+}
1006
+
1007
+/**
1008
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
1009
+ * @qp: Valid QP with allowed_ops set
1010
+ *
1011
+ * The rvt_swqe data structure being used is a union, so this is
1012
+ * only valid for UD QPs.
1013
+ */
1014
+static void free_ud_wq_attr(struct rvt_qp *qp)
1015
+{
1016
+ struct rvt_swqe *wqe;
1017
+ int i;
1018
+
1019
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
1020
+ wqe = rvt_get_swqe_ptr(qp, i);
1021
+ kfree(wqe->ud_wr.attr);
1022
+ wqe->ud_wr.attr = NULL;
1023
+ }
1024
+}
1025
+
1026
+/**
1027
+ * alloc_ud_wq_attr - Allocate AH attribute cache for UD QPs
1028
+ * @qp: Valid QP with allowed_ops set
1029
+ * @node: NUMA node for allocation
1030
+ *
1031
+ * The rvt_swqe data structure being used is a union, so this is
1032
+ * only valid for UD QPs.
1033
+ */
1034
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
1035
+{
1036
+ struct rvt_swqe *wqe;
1037
+ int i;
1038
+
1039
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
1040
+ wqe = rvt_get_swqe_ptr(qp, i);
1041
+ wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
1042
+ GFP_KERNEL, node);
1043
+ if (!wqe->ud_wr.attr) {
1044
+ free_ud_wq_attr(qp);
1045
+ return -ENOMEM;
1046
+ }
1047
+ }
1048
+
1049
+ return 0;
7751050 }
7761051
7771052 /**
....@@ -801,13 +1076,15 @@
8011076 struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
8021077 void *priv = NULL;
8031078 size_t sqsize;
1079
+ u8 exclude_prefix = 0;
8041080
8051081 if (!rdi)
8061082 return ERR_PTR(-EINVAL);
8071083
8081084 if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge ||
8091085 init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
810
- init_attr->create_flags)
1086
+ (init_attr->create_flags &&
1087
+ init_attr->create_flags != IB_QP_CREATE_NETDEV_USE))
8111088 return ERR_PTR(-EINVAL);
8121089
8131090 /* Check receive queue parameters if no SRQ is specified. */
....@@ -832,13 +1109,11 @@
8321109 if (init_attr->port_num == 0 ||
8331110 init_attr->port_num > ibpd->device->phys_port_cnt)
8341111 return ERR_PTR(-EINVAL);
835
- /* fall through */
1112
+ fallthrough;
8361113 case IB_QPT_UC:
8371114 case IB_QPT_RC:
8381115 case IB_QPT_UD:
839
- sz = sizeof(struct rvt_sge) *
840
- init_attr->cap.max_send_sge +
841
- sizeof(struct rvt_swqe);
1116
+ sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
8421117 swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
8431118 if (!swq)
8441119 return ERR_PTR(-ENOMEM);
....@@ -858,6 +1133,7 @@
8581133 rdi->dparms.node);
8591134 if (!qp)
8601135 goto bail_swq;
1136
+ qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
8611137
8621138 RCU_INIT_POINTER(qp->next, NULL);
8631139 if (init_attr->qp_type == IB_QPT_RC) {
....@@ -895,17 +1171,12 @@
8951171 qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
8961172 sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
8971173 sizeof(struct rvt_rwqe);
898
- if (udata)
899
- qp->r_rq.wq = vmalloc_user(
900
- sizeof(struct rvt_rwq) +
901
- qp->r_rq.size * sz);
902
- else
903
- qp->r_rq.wq = vzalloc_node(
904
- sizeof(struct rvt_rwq) +
905
- qp->r_rq.size * sz,
906
- rdi->dparms.node);
907
- if (!qp->r_rq.wq)
1174
+ err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
1175
+ rdi->dparms.node, udata);
1176
+ if (err) {
1177
+ ret = ERR_PTR(err);
9081178 goto bail_driver_priv;
1179
+ }
9091180 }
9101181
9111182 /*
....@@ -915,7 +1186,6 @@
9151186 spin_lock_init(&qp->r_lock);
9161187 spin_lock_init(&qp->s_hlock);
9171188 spin_lock_init(&qp->s_lock);
918
- spin_lock_init(&qp->r_rq.lock);
9191189 atomic_set(&qp->refcount, 0);
9201190 atomic_set(&qp->local_ops_pending, 0);
9211191 init_waitqueue_head(&qp->wait);
....@@ -927,22 +1197,40 @@
9271197 qp->s_max_sge = init_attr->cap.max_send_sge;
9281198 if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
9291199 qp->s_flags = RVT_S_SIGNAL_REQ_WR;
1200
+ err = alloc_ud_wq_attr(qp, rdi->dparms.node);
1201
+ if (err) {
1202
+ ret = (ERR_PTR(err));
1203
+ goto bail_rq_rvt;
1204
+ }
1205
+
1206
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
1207
+ exclude_prefix = RVT_AIP_QP_PREFIX;
9301208
9311209 err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
9321210 init_attr->qp_type,
933
- init_attr->port_num);
1211
+ init_attr->port_num,
1212
+ exclude_prefix);
9341213 if (err < 0) {
9351214 ret = ERR_PTR(err);
9361215 goto bail_rq_wq;
9371216 }
9381217 qp->ibqp.qp_num = err;
1218
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
1219
+ qp->ibqp.qp_num |= RVT_AIP_QP_BASE;
9391220 qp->port_num = init_attr->port_num;
9401221 rvt_init_qp(rdi, qp, init_attr->qp_type);
1222
+ if (rdi->driver_f.qp_priv_init) {
1223
+ err = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
1224
+ if (err) {
1225
+ ret = ERR_PTR(err);
1226
+ goto bail_rq_wq;
1227
+ }
1228
+ }
9411229 break;
9421230
9431231 default:
9441232 /* Don't support raw QPs */
945
- return ERR_PTR(-EINVAL);
1233
+ return ERR_PTR(-EOPNOTSUPP);
9461234 }
9471235
9481236 init_attr->cap.max_inline_data = 0;
....@@ -964,11 +1252,10 @@
9641252 } else {
9651253 u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
9661254
967
- qp->ip = rvt_create_mmap_info(rdi, s,
968
- ibpd->uobject->context,
1255
+ qp->ip = rvt_create_mmap_info(rdi, s, udata,
9691256 qp->r_rq.wq);
970
- if (!qp->ip) {
971
- ret = ERR_PTR(-ENOMEM);
1257
+ if (IS_ERR(qp->ip)) {
1258
+ ret = ERR_CAST(qp->ip);
9721259 goto bail_qpn;
9731260 }
9741261
....@@ -1013,28 +1300,6 @@
10131300
10141301 ret = &qp->ibqp;
10151302
1016
- /*
1017
- * We have our QP and its good, now keep track of what types of opcodes
1018
- * can be processed on this QP. We do this by keeping track of what the
1019
- * 3 high order bits of the opcode are.
1020
- */
1021
- switch (init_attr->qp_type) {
1022
- case IB_QPT_SMI:
1023
- case IB_QPT_GSI:
1024
- case IB_QPT_UD:
1025
- qp->allowed_ops = IB_OPCODE_UD;
1026
- break;
1027
- case IB_QPT_RC:
1028
- qp->allowed_ops = IB_OPCODE_RC;
1029
- break;
1030
- case IB_QPT_UC:
1031
- qp->allowed_ops = IB_OPCODE_UC;
1032
- break;
1033
- default:
1034
- ret = ERR_PTR(-EINVAL);
1035
- goto bail_ip;
1036
- }
1037
-
10381303 return ret;
10391304
10401305 bail_ip:
....@@ -1045,8 +1310,10 @@
10451310 rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
10461311
10471312 bail_rq_wq:
1048
- if (!qp->ip)
1049
- vfree(qp->r_rq.wq);
1313
+ free_ud_wq_attr(qp);
1314
+
1315
+bail_rq_rvt:
1316
+ rvt_free_rq(&qp->r_rq);
10501317
10511318 bail_driver_priv:
10521319 rdi->driver_f.qp_priv_free(rdi, qp);
....@@ -1112,19 +1379,26 @@
11121379 }
11131380 wc.status = IB_WC_WR_FLUSH_ERR;
11141381
1115
- if (qp->r_rq.wq) {
1116
- struct rvt_rwq *wq;
1382
+ if (qp->r_rq.kwq) {
11171383 u32 head;
11181384 u32 tail;
1385
+ struct rvt_rwq *wq = NULL;
1386
+ struct rvt_krwq *kwq = NULL;
11191387
1120
- spin_lock(&qp->r_rq.lock);
1121
-
1388
+ spin_lock(&qp->r_rq.kwq->c_lock);
1389
+ /* qp->ip used to validate if there is a user buffer mmaped */
1390
+ if (qp->ip) {
1391
+ wq = qp->r_rq.wq;
1392
+ head = RDMA_READ_UAPI_ATOMIC(wq->head);
1393
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
1394
+ } else {
1395
+ kwq = qp->r_rq.kwq;
1396
+ head = kwq->head;
1397
+ tail = kwq->tail;
1398
+ }
11221399 /* sanity check pointers before trusting them */
1123
- wq = qp->r_rq.wq;
1124
- head = wq->head;
11251400 if (head >= qp->r_rq.size)
11261401 head = 0;
1127
- tail = wq->tail;
11281402 if (tail >= qp->r_rq.size)
11291403 tail = 0;
11301404 while (tail != head) {
....@@ -1133,9 +1407,11 @@
11331407 tail = 0;
11341408 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
11351409 }
1136
- wq->tail = tail;
1137
-
1138
- spin_unlock(&qp->r_rq.lock);
1410
+ if (qp->ip)
1411
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
1412
+ else
1413
+ kwq->tail = tail;
1414
+ spin_unlock(&qp->r_rq.kwq->c_lock);
11391415 } else if (qp->ibqp.event_handler) {
11401416 ret = 1;
11411417 }
....@@ -1189,10 +1465,7 @@
11891465 int lastwqe = 0;
11901466 int mig = 0;
11911467 int pmtu = 0; /* for gcc warning only */
1192
- enum rdma_link_layer link;
11931468 int opa_ah;
1194
-
1195
- link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
11961469
11971470 spin_lock_irq(&qp->r_lock);
11981471 spin_lock(&qp->s_hlock);
....@@ -1204,7 +1477,7 @@
12041477 opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
12051478
12061479 if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1207
- attr_mask, link))
1480
+ attr_mask))
12081481 goto inval;
12091482
12101483 if (rdi->driver_f.check_modify_qp &&
....@@ -1453,7 +1726,7 @@
14531726 *
14541727 * Return: 0 on success.
14551728 */
1456
-int rvt_destroy_qp(struct ib_qp *ibqp)
1729
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
14571730 {
14581731 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
14591732 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
....@@ -1474,13 +1747,13 @@
14741747
14751748 if (qp->ip)
14761749 kref_put(&qp->ip->ref, rvt_release_mmap_info);
1477
- else
1478
- vfree(qp->r_rq.wq);
1479
- vfree(qp->s_wq);
1750
+ kvfree(qp->r_rq.kwq);
14801751 rdi->driver_f.qp_priv_free(rdi, qp);
14811752 kfree(qp->s_ack_queue);
14821753 rdma_destroy_ah_attr(&qp->remote_ah_attr);
14831754 rdma_destroy_ah_attr(&qp->alt_ah_attr);
1755
+ free_ud_wq_attr(qp);
1756
+ vfree(qp->s_wq);
14841757 kfree(qp);
14851758 return 0;
14861759 }
....@@ -1561,7 +1834,7 @@
15611834 const struct ib_recv_wr **bad_wr)
15621835 {
15631836 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1564
- struct rvt_rwq *wq = qp->r_rq.wq;
1837
+ struct rvt_krwq *wq = qp->r_rq.kwq;
15651838 unsigned long flags;
15661839 int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
15671840 !qp->ibqp.srq;
....@@ -1582,12 +1855,12 @@
15821855 return -EINVAL;
15831856 }
15841857
1585
- spin_lock_irqsave(&qp->r_rq.lock, flags);
1858
+ spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
15861859 next = wq->head + 1;
15871860 if (next >= qp->r_rq.size)
15881861 next = 0;
1589
- if (next == wq->tail) {
1590
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1862
+ if (next == READ_ONCE(wq->tail)) {
1863
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
15911864 *bad_wr = wr;
15921865 return -ENOMEM;
15931866 }
....@@ -1604,16 +1877,18 @@
16041877 wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
16051878 wqe->wr_id = wr->wr_id;
16061879 wqe->num_sge = wr->num_sge;
1607
- for (i = 0; i < wr->num_sge; i++)
1608
- wqe->sg_list[i] = wr->sg_list[i];
1880
+ for (i = 0; i < wr->num_sge; i++) {
1881
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
1882
+ wqe->sg_list[i].length = wr->sg_list[i].length;
1883
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
1884
+ }
16091885 /*
16101886 * Make sure queue entry is written
16111887 * before the head index.
16121888 */
1613
- smp_wmb();
1614
- wq->head = next;
1889
+ smp_store_release(&wq->head, next);
16151890 }
1616
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1891
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
16171892 }
16181893 return 0;
16191894 }
....@@ -1694,10 +1969,9 @@
16941969
16951970 /* see rvt_qp_wqe_unreserve() */
16961971 smp_mb__before_atomic();
1697
- reserved_used = atomic_read(&qp->s_reserved_used);
16981972 if (unlikely(reserved_op)) {
16991973 /* see rvt_qp_wqe_unreserve() */
1700
- smp_mb__before_atomic();
1974
+ reserved_used = atomic_read(&qp->s_reserved_used);
17011975 if (reserved_used >= rdi->dparms.reserved_operations)
17021976 return -ENOMEM;
17031977 return 0;
....@@ -1705,14 +1979,13 @@
17051979 /* non-reserved operations */
17061980 if (likely(qp->s_avail))
17071981 return 0;
1708
- slast = READ_ONCE(qp->s_last);
1982
+ /* See rvt_qp_complete_swqe() */
1983
+ slast = smp_load_acquire(&qp->s_last);
17091984 if (qp->s_head >= slast)
17101985 avail = qp->s_size - (qp->s_head - slast);
17111986 else
17121987 avail = slast - qp->s_head;
17131988
1714
- /* see rvt_qp_wqe_unreserve() */
1715
- smp_mb__before_atomic();
17161989 reserved_used = atomic_read(&qp->s_reserved_used);
17171990 avail = avail - 1 -
17181991 (rdi->dparms.reserved_operations - reserved_used);
....@@ -1737,7 +2010,7 @@
17372010 */
17382011 static int rvt_post_one_wr(struct rvt_qp *qp,
17392012 const struct ib_send_wr *wr,
1740
- int *call_send)
2013
+ bool *call_send)
17412014 {
17422015 struct rvt_swqe *wqe;
17432016 u32 next;
....@@ -1842,22 +2115,17 @@
18422115 wqe->wr.num_sge = j;
18432116 }
18442117
1845
- /* general part of wqe valid - allow for driver checks */
1846
- if (rdi->driver_f.check_send_wqe) {
1847
- ret = rdi->driver_f.check_send_wqe(qp, wqe);
1848
- if (ret < 0)
1849
- goto bail_inval_free;
1850
- if (ret)
1851
- *call_send = ret;
1852
- }
1853
-
2118
+ /*
2119
+ * Calculate and set SWQE PSN values prior to handing it off
2120
+ * to the driver's check routine. This gives the driver the
2121
+ * opportunity to adjust PSN values based on internal checks.
2122
+ */
18542123 log_pmtu = qp->log_pmtu;
1855
- if (qp->ibqp.qp_type != IB_QPT_UC &&
1856
- qp->ibqp.qp_type != IB_QPT_RC) {
1857
- struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
2124
+ if (qp->allowed_ops == IB_OPCODE_UD) {
2125
+ struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
18582126
18592127 log_pmtu = ah->log_pmtu;
1860
- atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
2128
+ rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
18612129 }
18622130
18632131 if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
....@@ -1875,8 +2143,18 @@
18752143 (wqe->length ?
18762144 ((wqe->length - 1) >> log_pmtu) :
18772145 0);
1878
- qp->s_next_psn = wqe->lpsn + 1;
18792146 }
2147
+
2148
+ /* general part of wqe valid - allow for driver checks */
2149
+ if (rdi->driver_f.setup_wqe) {
2150
+ ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
2151
+ if (ret < 0)
2152
+ goto bail_inval_free_ref;
2153
+ }
2154
+
2155
+ if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
2156
+ qp->s_next_psn = wqe->lpsn + 1;
2157
+
18802158 if (unlikely(reserved_op)) {
18812159 wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
18822160 rvt_qp_wqe_reserve(qp, wqe);
....@@ -1890,6 +2168,9 @@
18902168
18912169 return 0;
18922170
2171
+bail_inval_free_ref:
2172
+ if (qp->allowed_ops == IB_OPCODE_UD)
2173
+ rdma_destroy_ah_attr(wqe->ud_wr.attr);
18932174 bail_inval_free:
18942175 /* release mr holds */
18952176 while (j) {
....@@ -1916,7 +2197,7 @@
19162197 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
19172198 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
19182199 unsigned long flags = 0;
1919
- int call_send;
2200
+ bool call_send;
19202201 unsigned nreq = 0;
19212202 int err = 0;
19222203
....@@ -1949,7 +2230,11 @@
19492230 bail:
19502231 spin_unlock_irqrestore(&qp->s_hlock, flags);
19512232 if (nreq) {
1952
- if (call_send)
2233
+ /*
2234
+ * Only call do_send if there is exactly one packet, and the
2235
+ * driver said it was ok.
2236
+ */
2237
+ if (nreq == 1 && call_send)
19532238 rdi->driver_f.do_send(qp);
19542239 else
19552240 rdi->driver_f.schedule_send_no_lock(qp);
....@@ -1971,7 +2256,7 @@
19712256 const struct ib_recv_wr **bad_wr)
19722257 {
19732258 struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
1974
- struct rvt_rwq *wq;
2259
+ struct rvt_krwq *wq;
19752260 unsigned long flags;
19762261
19772262 for (; wr; wr = wr->next) {
....@@ -1984,13 +2269,13 @@
19842269 return -EINVAL;
19852270 }
19862271
1987
- spin_lock_irqsave(&srq->rq.lock, flags);
1988
- wq = srq->rq.wq;
2272
+ spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
2273
+ wq = srq->rq.kwq;
19892274 next = wq->head + 1;
19902275 if (next >= srq->rq.size)
19912276 next = 0;
1992
- if (next == wq->tail) {
1993
- spin_unlock_irqrestore(&srq->rq.lock, flags);
2277
+ if (next == READ_ONCE(wq->tail)) {
2278
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
19942279 *bad_wr = wr;
19952280 return -ENOMEM;
19962281 }
....@@ -1998,14 +2283,32 @@
19982283 wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
19992284 wqe->wr_id = wr->wr_id;
20002285 wqe->num_sge = wr->num_sge;
2001
- for (i = 0; i < wr->num_sge; i++)
2002
- wqe->sg_list[i] = wr->sg_list[i];
2286
+ for (i = 0; i < wr->num_sge; i++) {
2287
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
2288
+ wqe->sg_list[i].length = wr->sg_list[i].length;
2289
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
2290
+ }
20032291 /* Make sure queue entry is written before the head index. */
2004
- smp_wmb();
2005
- wq->head = next;
2006
- spin_unlock_irqrestore(&srq->rq.lock, flags);
2292
+ smp_store_release(&wq->head, next);
2293
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
20072294 }
20082295 return 0;
2296
+}
2297
+
2298
+/*
2299
+ * rvt used the internal kernel struct as part of its ABI; for now, make sure
2300
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
2301
+ * user struct to a kernel struct.
2302
+ */
2303
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
2304
+{
2305
+ BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
2306
+ offsetof(struct rvt_wqe_sge, addr));
2307
+ BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
2308
+ offsetof(struct rvt_wqe_sge, length));
2309
+ BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
2310
+ offsetof(struct rvt_wqe_sge, lkey));
2311
+ return (struct ib_sge *)sge;
20092312 }
20102313
20112314 /*
....@@ -2031,7 +2334,7 @@
20312334 continue;
20322335 /* Check LKEY */
20332336 ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
2034
- NULL, &wqe->sg_list[i],
2337
+ NULL, rvt_cast_sge(&wqe->sg_list[i]),
20352338 IB_ACCESS_LOCAL_WRITE);
20362339 if (unlikely(ret <= 0))
20372340 goto bad_lkey;
....@@ -2060,6 +2363,25 @@
20602363 }
20612364
20622365 /**
2366
+ * get_rvt_head - get the head index of the circular buffer
2367
+ * @rq: data structure for request queue entry
2368
+ * @ip: the mmap info pointer, non-NULL for a user-mapped queue
2369
+ *
2370
+ * Return: the head index value
2371
+ */
2372
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
2373
+{
2374
+ u32 head;
2375
+
2376
+ if (ip)
2377
+ head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
2378
+ else
2379
+ head = rq->kwq->head;
2380
+
2381
+ return head;
2382
+}
2383
+
2384
+/**
20632385 * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
20642386 * @qp: the QP
20652387 * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
....@@ -2073,39 +2395,54 @@
20732395 {
20742396 unsigned long flags;
20752397 struct rvt_rq *rq;
2398
+ struct rvt_krwq *kwq = NULL;
20762399 struct rvt_rwq *wq;
20772400 struct rvt_srq *srq;
20782401 struct rvt_rwqe *wqe;
20792402 void (*handler)(struct ib_event *, void *);
20802403 u32 tail;
2404
+ u32 head;
20812405 int ret;
2406
+ void *ip = NULL;
20822407
20832408 if (qp->ibqp.srq) {
20842409 srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
20852410 handler = srq->ibsrq.event_handler;
20862411 rq = &srq->rq;
2412
+ ip = srq->ip;
20872413 } else {
20882414 srq = NULL;
20892415 handler = NULL;
20902416 rq = &qp->r_rq;
2417
+ ip = qp->ip;
20912418 }
20922419
2093
- spin_lock_irqsave(&rq->lock, flags);
2420
+ spin_lock_irqsave(&rq->kwq->c_lock, flags);
20942421 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
20952422 ret = 0;
20962423 goto unlock;
20972424 }
2425
+ kwq = rq->kwq;
2426
+ if (ip) {
2427
+ wq = rq->wq;
2428
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
2429
+ } else {
2430
+ tail = kwq->tail;
2431
+ }
20982432
2099
- wq = rq->wq;
2100
- tail = wq->tail;
21012433 /* Validate tail before using it since it is user writable. */
21022434 if (tail >= rq->size)
21032435 tail = 0;
2104
- if (unlikely(tail == wq->head)) {
2436
+
2437
+ if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
2438
+ head = get_rvt_head(rq, ip);
2439
+ kwq->count = rvt_get_rq_count(rq, head, tail);
2440
+ }
2441
+ if (unlikely(kwq->count == 0)) {
21052442 ret = 0;
21062443 goto unlock;
21072444 }
2108
- /* Make sure entry is read after head index is read. */
2445
+ /* Make sure entry is read after the count is read. */
21092446 smp_rmb();
21102447 wqe = rvt_get_rwqe_ptr(rq, tail);
21112448 /*
....@@ -2115,43 +2452,43 @@
21152452 */
21162453 if (++tail >= rq->size)
21172454 tail = 0;
2118
- wq->tail = tail;
2455
+ if (ip)
2456
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
2457
+ else
2458
+ kwq->tail = tail;
21192459 if (!wr_id_only && !init_sge(qp, wqe)) {
21202460 ret = -1;
21212461 goto unlock;
21222462 }
21232463 qp->r_wr_id = wqe->wr_id;
21242464
2465
+ kwq->count--;
21252466 ret = 1;
21262467 set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
21272468 if (handler) {
2128
- u32 n;
2129
-
21302469 /*
21312470 * Validate head pointer value and compute
21322471 * the number of remaining WQEs.
21332472 */
2134
- n = wq->head;
2135
- if (n >= rq->size)
2136
- n = 0;
2137
- if (n < tail)
2138
- n += rq->size - tail;
2139
- else
2140
- n -= tail;
2141
- if (n < srq->limit) {
2142
- struct ib_event ev;
2473
+ if (kwq->count < srq->limit) {
2474
+ kwq->count =
2475
+ rvt_get_rq_count(rq,
2476
+ get_rvt_head(rq, ip), tail);
2477
+ if (kwq->count < srq->limit) {
2478
+ struct ib_event ev;
21432479
2144
- srq->limit = 0;
2145
- spin_unlock_irqrestore(&rq->lock, flags);
2146
- ev.device = qp->ibqp.device;
2147
- ev.element.srq = qp->ibqp.srq;
2148
- ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
2149
- handler(&ev, srq->ibsrq.srq_context);
2150
- goto bail;
2480
+ srq->limit = 0;
2481
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
2482
+ ev.device = qp->ibqp.device;
2483
+ ev.element.srq = qp->ibqp.srq;
2484
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
2485
+ handler(&ev, srq->ibsrq.srq_context);
2486
+ goto bail;
2487
+ }
21512488 }
21522489 }
21532490 unlock:
2154
- spin_unlock_irqrestore(&rq->lock, flags);
2491
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
21552492 bail:
21562493 return ret;
21572494 }
....@@ -2213,11 +2550,12 @@
22132550 }
22142551
22152552 /*
2216
- * rvt_add_retry_timer - add/start a retry timer
2553
+ * rvt_add_retry_timer_ext - add/start a retry timer
22172554 * @qp - the QP
2555
+ * @shift - timeout shift to wait for multiple packets
22182556 * add a retry timer on the QP
22192557 */
2220
-void rvt_add_retry_timer(struct rvt_qp *qp)
2558
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
22212559 {
22222560 struct ib_qp *ibqp = &qp->ibqp;
22232561 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
....@@ -2225,17 +2563,16 @@
22252563 lockdep_assert_held(&qp->s_lock);
22262564 qp->s_flags |= RVT_S_TIMER;
22272565 /* 4.096 usec. * (1 << qp->timeout) */
2228
- qp->s_timer.expires = jiffies + qp->timeout_jiffies +
2229
- rdi->busy_jiffies;
2566
+ qp->s_timer.expires = jiffies + rdi->busy_jiffies +
2567
+ (qp->timeout_jiffies << shift);
22302568 add_timer(&qp->s_timer);
22312569 }
2232
-EXPORT_SYMBOL(rvt_add_retry_timer);
2570
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
22332571
22342572 /**
2235
- * rvt_add_rnr_timer - add/start an rnr timer
2236
- * @qp - the QP
2237
- * @aeth - aeth of RNR timeout, simulated aeth for loopback
2238
- * add an rnr timer on the QP
2573
+ * rvt_add_rnr_timer - add/start an rnr timer on the QP
2574
+ * @qp: the QP
2575
+ * @aeth: aeth of RNR timeout, simulated aeth for loopback
22392576 */
22402577 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
22412578 {
....@@ -2252,7 +2589,7 @@
22522589
22532590 /**
22542591 * rvt_stop_rc_timers - stop all timers
2255
- * @qp - the QP
2592
+ * @qp: the QP
22562593 * stop any pending timers
22572594 */
22582595 void rvt_stop_rc_timers(struct rvt_qp *qp)
....@@ -2286,7 +2623,7 @@
22862623
22872624 /**
22882625 * rvt_del_timers_sync - wait for any timeout routines to exit
2289
- * @qp - the QP
2626
+ * @qp: the QP
22902627 */
22912628 void rvt_del_timers_sync(struct rvt_qp *qp)
22922629 {
....@@ -2295,7 +2632,7 @@
22952632 }
22962633 EXPORT_SYMBOL(rvt_del_timers_sync);
22972634
2298
-/**
2635
+/*
22992636 * This is called from s_timer for missing responses.
23002637 */
23012638 static void rvt_rc_timeout(struct timer_list *t)
....@@ -2345,12 +2682,13 @@
23452682 * rvt_qp_iter_init - initial for QP iteration
23462683 * @rdi: rvt devinfo
23472684 * @v: u64 value
2685
+ * @cb: user-defined callback
23482686 *
23492687 * This returns an iterator suitable for iterating QPs
23502688 * in the system.
23512689 *
2352
- * The @cb is a user defined callback and @v is a 64
2353
- * bit value passed to and relevant for processing in the
2690
+ * The @cb is a user-defined callback and @v is a 64-bit
2691
+ * value passed to and relevant for processing in the
23542692 * @cb. An example use case would be to alter QP processing
23552693 * based on criteria not part of the rvt_qp.
23562694 *
....@@ -2381,7 +2719,7 @@
23812719
23822720 /**
23832721 * rvt_qp_iter_next - return the next QP in iter
2384
- * @iter - the iterator
2722
+ * @iter: the iterator
23852723 *
23862724 * Fine grained QP iterator suitable for use
23872725 * with debugfs seq_file mechanisms.
....@@ -2444,14 +2782,14 @@
24442782
24452783 /**
24462784 * rvt_qp_iter - iterate all QPs
2447
- * @rdi - rvt devinfo
2448
- * @v - a 64 bit value
2449
- * @cb - a callback
2785
+ * @rdi: rvt devinfo
2786
+ * @v: a 64-bit value
2787
+ * @cb: a callback
24502788 *
24512789 * This provides a way for iterating all QPs.
24522790 *
2453
- * The @cb is a user defined callback and @v is a 64
2454
- * bit value passed to and relevant for processing in the
2791
+ * The @cb is a user-defined callback and @v is a 64-bit
2792
+ * value passed to and relevant for processing in the
24552793 * cb. An example use case would be to alter QP processing
24562794 * based on criteria not part of the rvt_qp.
24572795 *
....@@ -2484,3 +2822,450 @@
24842822 rcu_read_unlock();
24852823 }
24862824 EXPORT_SYMBOL(rvt_qp_iter);
2825
+
2826
+/*
2827
+ * This should be called with s_lock and r_lock held.
2828
+ */
2829
+void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
2830
+ enum ib_wc_status status)
2831
+{
2832
+ u32 old_last, last;
2833
+ struct rvt_dev_info *rdi;
2834
+
2835
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
2836
+ return;
2837
+ rdi = ib_to_rvt(qp->ibqp.device);
2838
+
2839
+ old_last = qp->s_last;
2840
+ trace_rvt_qp_send_completion(qp, wqe, old_last);
2841
+ last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
2842
+ status);
2843
+ if (qp->s_acked == old_last)
2844
+ qp->s_acked = last;
2845
+ if (qp->s_cur == old_last)
2846
+ qp->s_cur = last;
2847
+ if (qp->s_tail == old_last)
2848
+ qp->s_tail = last;
2849
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
2850
+ qp->s_draining = 0;
2851
+}
2852
+EXPORT_SYMBOL(rvt_send_complete);
2853
+
2854
+/**
2855
+ * rvt_copy_sge - copy data to SGE memory
2856
+ * @qp: associated QP
2857
+ * @ss: the SGE state
2858
+ * @data: the data to copy
2859
+ * @length: the length of the data
2860
+ * @release: boolean to release MR
2861
+ * @copy_last: do a separate copy of the last 8 bytes
2862
+ */
2863
+void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
2864
+ void *data, u32 length,
2865
+ bool release, bool copy_last)
2866
+{
2867
+ struct rvt_sge *sge = &ss->sge;
2868
+ int i;
2869
+ bool in_last = false;
2870
+ bool cacheless_copy = false;
2871
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2872
+ struct rvt_wss *wss = rdi->wss;
2873
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
2874
+
2875
+ if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
2876
+ cacheless_copy = length >= PAGE_SIZE;
2877
+ } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
2878
+ if (length >= PAGE_SIZE) {
2879
+ /*
2880
+ * NOTE: this *assumes*:
2881
+ * o The first vaddr is the dest.
2882
+ * o If multiple pages, then vaddr is sequential.
2883
+ */
2884
+ wss_insert(wss, sge->vaddr);
2885
+ if (length >= (2 * PAGE_SIZE))
2886
+ wss_insert(wss, (sge->vaddr + PAGE_SIZE));
2887
+
2888
+ cacheless_copy = wss_exceeds_threshold(wss);
2889
+ } else {
2890
+ wss_advance_clean_counter(wss);
2891
+ }
2892
+ }
2893
+
2894
+ if (copy_last) {
2895
+ if (length > 8) {
2896
+ length -= 8;
2897
+ } else {
2898
+ copy_last = false;
2899
+ in_last = true;
2900
+ }
2901
+ }
2902
+
2903
+again:
2904
+ while (length) {
2905
+ u32 len = rvt_get_sge_length(sge, length);
2906
+
2907
+ WARN_ON_ONCE(len == 0);
2908
+ if (unlikely(in_last)) {
2909
+ /* enforce byte transfer ordering */
2910
+ for (i = 0; i < len; i++)
2911
+ ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
2912
+ } else if (cacheless_copy) {
2913
+ cacheless_memcpy(sge->vaddr, data, len);
2914
+ } else {
2915
+ memcpy(sge->vaddr, data, len);
2916
+ }
2917
+ rvt_update_sge(ss, len, release);
2918
+ data += len;
2919
+ length -= len;
2920
+ }
2921
+
2922
+ if (copy_last) {
2923
+ copy_last = false;
2924
+ in_last = true;
2925
+ length = 8;
2926
+ goto again;
2927
+ }
2928
+}
2929
+EXPORT_SYMBOL(rvt_copy_sge);
2930
+
2931
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
2932
+ struct rvt_qp *sqp)
2933
+{
2934
+ rvp->n_pkt_drops++;
2935
+ /*
2936
+ * For RC, the requester would time out and retry, so
2937
+ * shortcut the timeouts and just signal too many retries.
2938
+ */
2939
+ return sqp->ibqp.qp_type == IB_QPT_RC ?
2940
+ IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
2941
+}
2942
+
2943
+/**
2944
+ * rvt_ruc_loopback - handle UC and RC loopback requests
2945
+ * @sqp: the sending QP
2946
+ *
2947
+ * This is called from rvt_do_send() to forward a WQE addressed to the same HFI.
2948
+ * Note that although we are single threaded due to the send engine, we still
2949
+ * have to protect against post_send(). We don't have to worry about
2950
+ * receive interrupts since this is a connected protocol and all packets
2951
+ * will pass through here.
2952
+ */
2953
+void rvt_ruc_loopback(struct rvt_qp *sqp)
2954
+{
2955
+ struct rvt_ibport *rvp = NULL;
2956
+ struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
2957
+ struct rvt_qp *qp;
2958
+ struct rvt_swqe *wqe;
2959
+ struct rvt_sge *sge;
2960
+ unsigned long flags;
2961
+ struct ib_wc wc;
2962
+ u64 sdata;
2963
+ atomic64_t *maddr;
2964
+ enum ib_wc_status send_status;
2965
+ bool release;
2966
+ int ret;
2967
+ bool copy_last = false;
2968
+ int local_ops = 0;
2969
+
2970
+ rcu_read_lock();
2971
+ rvp = rdi->ports[sqp->port_num - 1];
2972
+
2973
+ /*
2974
+ * Note that we check the responder QP state after
2975
+ * checking the requester's state.
2976
+ */
2977
+
2978
+ qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
2979
+ sqp->remote_qpn);
2980
+
2981
+ spin_lock_irqsave(&sqp->s_lock, flags);
2982
+
2983
+ /* Return if we are already busy processing a work request. */
2984
+ if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
2985
+ !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
2986
+ goto unlock;
2987
+
2988
+ sqp->s_flags |= RVT_S_BUSY;
2989
+
2990
+again:
2991
+ if (sqp->s_last == READ_ONCE(sqp->s_head))
2992
+ goto clr_busy;
2993
+ wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
2994
+
2995
+ /* Return if it is not OK to start a new work request. */
2996
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
2997
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
2998
+ goto clr_busy;
2999
+ /* We are in the error state, flush the work request. */
3000
+ send_status = IB_WC_WR_FLUSH_ERR;
3001
+ goto flush_send;
3002
+ }
3003
+
3004
+ /*
3005
+ * We can rely on the entry not changing without the s_lock
3006
+ * being held until we update s_last.
3007
+ * We increment s_cur to indicate s_last is in progress.
3008
+ */
3009
+ if (sqp->s_last == sqp->s_cur) {
3010
+ if (++sqp->s_cur >= sqp->s_size)
3011
+ sqp->s_cur = 0;
3012
+ }
3013
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3014
+
3015
+ if (!qp) {
3016
+ send_status = loopback_qp_drop(rvp, sqp);
3017
+ goto serr_no_r_lock;
3018
+ }
3019
+ spin_lock_irqsave(&qp->r_lock, flags);
3020
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
3021
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
3022
+ send_status = loopback_qp_drop(rvp, sqp);
3023
+ goto serr;
3024
+ }
3025
+
3026
+ memset(&wc, 0, sizeof(wc));
3027
+ send_status = IB_WC_SUCCESS;
3028
+
3029
+ release = true;
3030
+ sqp->s_sge.sge = wqe->sg_list[0];
3031
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
3032
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
3033
+ sqp->s_len = wqe->length;
3034
+ switch (wqe->wr.opcode) {
3035
+ case IB_WR_REG_MR:
3036
+ goto send_comp;
3037
+
3038
+ case IB_WR_LOCAL_INV:
3039
+ if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
3040
+ if (rvt_invalidate_rkey(sqp,
3041
+ wqe->wr.ex.invalidate_rkey))
3042
+ send_status = IB_WC_LOC_PROT_ERR;
3043
+ local_ops = 1;
3044
+ }
3045
+ goto send_comp;
3046
+
3047
+ case IB_WR_SEND_WITH_INV:
3048
+ case IB_WR_SEND_WITH_IMM:
3049
+ case IB_WR_SEND:
3050
+ ret = rvt_get_rwqe(qp, false);
3051
+ if (ret < 0)
3052
+ goto op_err;
3053
+ if (!ret)
3054
+ goto rnr_nak;
3055
+ if (wqe->length > qp->r_len)
3056
+ goto inv_err;
3057
+ switch (wqe->wr.opcode) {
3058
+ case IB_WR_SEND_WITH_INV:
3059
+ if (!rvt_invalidate_rkey(qp,
3060
+ wqe->wr.ex.invalidate_rkey)) {
3061
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
3062
+ wc.ex.invalidate_rkey =
3063
+ wqe->wr.ex.invalidate_rkey;
3064
+ }
3065
+ break;
3066
+ case IB_WR_SEND_WITH_IMM:
3067
+ wc.wc_flags = IB_WC_WITH_IMM;
3068
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
3069
+ break;
3070
+ default:
3071
+ break;
3072
+ }
3073
+ break;
3074
+
3075
+ case IB_WR_RDMA_WRITE_WITH_IMM:
3076
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3077
+ goto inv_err;
3078
+ wc.wc_flags = IB_WC_WITH_IMM;
3079
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
3080
+ ret = rvt_get_rwqe(qp, true);
3081
+ if (ret < 0)
3082
+ goto op_err;
3083
+ if (!ret)
3084
+ goto rnr_nak;
3085
+ /* skip copy_last set and qp_access_flags recheck */
3086
+ goto do_write;
3087
+ case IB_WR_RDMA_WRITE:
3088
+ copy_last = rvt_is_user_qp(qp);
3089
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3090
+ goto inv_err;
3091
+do_write:
3092
+ if (wqe->length == 0)
3093
+ break;
3094
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
3095
+ wqe->rdma_wr.remote_addr,
3096
+ wqe->rdma_wr.rkey,
3097
+ IB_ACCESS_REMOTE_WRITE)))
3098
+ goto acc_err;
3099
+ qp->r_sge.sg_list = NULL;
3100
+ qp->r_sge.num_sge = 1;
3101
+ qp->r_sge.total_len = wqe->length;
3102
+ break;
3103
+
3104
+ case IB_WR_RDMA_READ:
3105
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
3106
+ goto inv_err;
3107
+ if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
3108
+ wqe->rdma_wr.remote_addr,
3109
+ wqe->rdma_wr.rkey,
3110
+ IB_ACCESS_REMOTE_READ)))
3111
+ goto acc_err;
3112
+ release = false;
3113
+ sqp->s_sge.sg_list = NULL;
3114
+ sqp->s_sge.num_sge = 1;
3115
+ qp->r_sge.sge = wqe->sg_list[0];
3116
+ qp->r_sge.sg_list = wqe->sg_list + 1;
3117
+ qp->r_sge.num_sge = wqe->wr.num_sge;
3118
+ qp->r_sge.total_len = wqe->length;
3119
+ break;
3120
+
3121
+ case IB_WR_ATOMIC_CMP_AND_SWP:
3122
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
3123
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
3124
+ goto inv_err;
3125
+ if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
3126
+ goto inv_err;
3127
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
3128
+ wqe->atomic_wr.remote_addr,
3129
+ wqe->atomic_wr.rkey,
3130
+ IB_ACCESS_REMOTE_ATOMIC)))
3131
+ goto acc_err;
3132
+ /* Perform atomic OP and save result. */
3133
+ maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
3134
+ sdata = wqe->atomic_wr.compare_add;
3135
+ *(u64 *)sqp->s_sge.sge.vaddr =
3136
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
3137
+ (u64)atomic64_add_return(sdata, maddr) - sdata :
3138
+ (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
3139
+ sdata, wqe->atomic_wr.swap);
3140
+ rvt_put_mr(qp->r_sge.sge.mr);
3141
+ qp->r_sge.num_sge = 0;
3142
+ goto send_comp;
3143
+
3144
+ default:
3145
+ send_status = IB_WC_LOC_QP_OP_ERR;
3146
+ goto serr;
3147
+ }
3148
+
3149
+ sge = &sqp->s_sge.sge;
3150
+ while (sqp->s_len) {
3151
+ u32 len = rvt_get_sge_length(sge, sqp->s_len);
3152
+
3153
+ WARN_ON_ONCE(len == 0);
3154
+ rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
3155
+ len, release, copy_last);
3156
+ rvt_update_sge(&sqp->s_sge, len, !release);
3157
+ sqp->s_len -= len;
3158
+ }
3159
+ if (release)
3160
+ rvt_put_ss(&qp->r_sge);
3161
+
3162
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
3163
+ goto send_comp;
3164
+
3165
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
3166
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
3167
+ else
3168
+ wc.opcode = IB_WC_RECV;
3169
+ wc.wr_id = qp->r_wr_id;
3170
+ wc.status = IB_WC_SUCCESS;
3171
+ wc.byte_len = wqe->length;
3172
+ wc.qp = &qp->ibqp;
3173
+ wc.src_qp = qp->remote_qpn;
3174
+ wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
3175
+ wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
3176
+ wc.port_num = 1;
3177
+ /* Signal completion event if the solicited bit is set. */
3178
+ rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
3179
+
3180
+send_comp:
3181
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3182
+ spin_lock_irqsave(&sqp->s_lock, flags);
3183
+ rvp->n_loop_pkts++;
3184
+flush_send:
3185
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
3186
+ spin_lock(&sqp->r_lock);
3187
+ rvt_send_complete(sqp, wqe, send_status);
3188
+ spin_unlock(&sqp->r_lock);
3189
+ if (local_ops) {
3190
+ atomic_dec(&sqp->local_ops_pending);
3191
+ local_ops = 0;
3192
+ }
3193
+ goto again;
3194
+
3195
+rnr_nak:
3196
+ /* Handle RNR NAK */
3197
+ if (qp->ibqp.qp_type == IB_QPT_UC)
3198
+ goto send_comp;
3199
+ rvp->n_rnr_naks++;
3200
+ /*
3201
+ * Note: we don't need the s_lock held since the BUSY flag
3202
+ * makes this single threaded.
3203
+ */
3204
+ if (sqp->s_rnr_retry == 0) {
3205
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
3206
+ goto serr;
3207
+ }
3208
+ if (sqp->s_rnr_retry_cnt < 7)
3209
+ sqp->s_rnr_retry--;
3210
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3211
+ spin_lock_irqsave(&sqp->s_lock, flags);
3212
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
3213
+ goto clr_busy;
3214
+ rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
3215
+ IB_AETH_CREDIT_SHIFT);
3216
+ goto clr_busy;
3217
+
3218
+op_err:
3219
+ send_status = IB_WC_REM_OP_ERR;
3220
+ wc.status = IB_WC_LOC_QP_OP_ERR;
3221
+ goto err;
3222
+
3223
+inv_err:
3224
+ send_status =
3225
+ sqp->ibqp.qp_type == IB_QPT_RC ?
3226
+ IB_WC_REM_INV_REQ_ERR :
3227
+ IB_WC_SUCCESS;
3228
+ wc.status = IB_WC_LOC_QP_OP_ERR;
3229
+ goto err;
3230
+
3231
+acc_err:
3232
+ send_status = IB_WC_REM_ACCESS_ERR;
3233
+ wc.status = IB_WC_LOC_PROT_ERR;
3234
+err:
3235
+ /* responder goes to error state */
3236
+ rvt_rc_error(qp, wc.status);
3237
+
3238
+serr:
3239
+ spin_unlock_irqrestore(&qp->r_lock, flags);
3240
+serr_no_r_lock:
3241
+ spin_lock_irqsave(&sqp->s_lock, flags);
3242
+ spin_lock(&sqp->r_lock);
3243
+ rvt_send_complete(sqp, wqe, send_status);
3244
+ spin_unlock(&sqp->r_lock);
3245
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
3246
+ int lastwqe;
3247
+
3248
+ spin_lock(&sqp->r_lock);
3249
+ lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
3250
+ spin_unlock(&sqp->r_lock);
3251
+
3252
+ sqp->s_flags &= ~RVT_S_BUSY;
3253
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3254
+ if (lastwqe) {
3255
+ struct ib_event ev;
3256
+
3257
+ ev.device = sqp->ibqp.device;
3258
+ ev.element.qp = &sqp->ibqp;
3259
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
3260
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
3261
+ }
3262
+ goto done;
3263
+ }
3264
+clr_busy:
3265
+ sqp->s_flags &= ~RVT_S_BUSY;
3266
+unlock:
3267
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
3268
+done:
3269
+ rcu_read_unlock();
3270
+}
3271
+EXPORT_SYMBOL(rvt_ruc_loopback);