.. | .. |
---|
7 | 7 | /* Lightweight memory registration using Fast Registration Work |
---|
8 | 8 | * Requests (FRWR). |
---|
9 | 9 | * |
---|
10 | | - * FRWR features ordered asynchronous registration and deregistration |
---|
11 | | - * of arbitrarily sized memory regions. This is the fastest and safest |
---|
| 10 | + * FRWR features ordered asynchronous registration and invalidation |
---|
| 11 | + * of arbitrarily-sized memory regions. This is the fastest and safest |
---|
12 | 12 | * but most complex memory registration mode. |
---|
13 | 13 | */ |
---|
14 | 14 | |
---|
15 | 15 | /* Normal operation |
---|
16 | 16 | * |
---|
17 | | - * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG |
---|
18 | | - * Work Request (frwr_op_map). When the RDMA operation is finished, this |
---|
| 17 | + * A Memory Region is prepared for RDMA Read or Write using a FAST_REG |
---|
| 18 | + * Work Request (frwr_map). When the RDMA operation is finished, this |
---|
19 | 19 | * Memory Region is invalidated using a LOCAL_INV Work Request |
---|
20 | | - * (frwr_op_unmap_sync). |
---|
| 20 | + * (frwr_unmap_async and frwr_unmap_sync). |
---|
21 | 21 | * |
---|
22 | | - * Typically these Work Requests are not signaled, and neither are RDMA |
---|
23 | | - * SEND Work Requests (with the exception of signaling occasionally to |
---|
24 | | - * prevent provider work queue overflows). This greatly reduces HCA |
---|
| 22 | + * Typically FAST_REG Work Requests are not signaled, and neither are |
---|
| 23 | + * RDMA Send Work Requests (with the exception of signaling occasionally |
---|
| 24 | + * to prevent provider work queue overflows). This greatly reduces HCA |
---|
25 | 25 | * interrupt workload. |
---|
26 | | - * |
---|
27 | | - * As an optimization, frwr_op_unmap marks MRs INVALID before the |
---|
28 | | - * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on |
---|
29 | | - * rb_mrs immediately so that no work (like managing a linked list |
---|
30 | | - * under a spinlock) is needed in the completion upcall. |
---|
31 | | - * |
---|
32 | | - * But this means that frwr_op_map() can occasionally encounter an MR |
---|
33 | | - * that is INVALID but the LOCAL_INV WR has not completed. Work Queue |
---|
34 | | - * ordering prevents a subsequent FAST_REG WR from executing against |
---|
35 | | - * that MR while it is still being invalidated. |
---|
36 | 26 | */ |
---|
37 | 27 | |
---|
38 | 28 | /* Transport recovery |
---|
39 | 29 | * |
---|
40 | | - * ->op_map and the transport connect worker cannot run at the same |
---|
41 | | - * time, but ->op_unmap can fire while the transport connect worker |
---|
42 | | - * is running. Thus MR recovery is handled in ->op_map, to guarantee |
---|
43 | | - * that recovered MRs are owned by a sending RPC, and not one where |
---|
44 | | - * ->op_unmap could fire at the same time transport reconnect is |
---|
45 | | - * being done. |
---|
| 30 | + * frwr_map and frwr_unmap_* cannot run at the same time the transport |
---|
| 31 | + * connect worker is running. The connect worker holds the transport |
---|
| 32 | + * send lock, just as ->send_request does. This prevents frwr_map and |
---|
| 33 | + * the connect worker from running concurrently. When a connection is |
---|
| 34 | + * closed, the Receive completion queue is drained before allowing |
---|
| 35 | + * the connect worker to get control. This prevents frwr_unmap and the |
---|
| 36 | + * connect worker from running concurrently. |
---|
46 | 37 | * |
---|
47 | | - * When the underlying transport disconnects, MRs are left in one of |
---|
48 | | - * four states: |
---|
49 | | - * |
---|
50 | | - * INVALID: The MR was not in use before the QP entered ERROR state. |
---|
51 | | - * |
---|
52 | | - * VALID: The MR was registered before the QP entered ERROR state. |
---|
53 | | - * |
---|
54 | | - * FLUSHED_FR: The MR was being registered when the QP entered ERROR |
---|
55 | | - * state, and the pending WR was flushed. |
---|
56 | | - * |
---|
57 | | - * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR |
---|
58 | | - * state, and the pending WR was flushed. |
---|
59 | | - * |
---|
60 | | - * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered |
---|
61 | | - * with ib_dereg_mr and then are re-initialized. Because MR recovery |
---|
62 | | - * allocates fresh resources, it is deferred to a workqueue, and the |
---|
63 | | - * recovered MRs are placed back on the rb_mrs list when recovery is |
---|
64 | | - * complete. frwr_op_map allocates another MR for the current RPC while |
---|
65 | | - * the broken MR is reset. |
---|
66 | | - * |
---|
67 | | - * To ensure that frwr_op_map doesn't encounter an MR that is marked |
---|
68 | | - * INVALID but that is about to be flushed due to a previous transport |
---|
69 | | - * disconnect, the transport connect worker attempts to drain all |
---|
70 | | - * pending send queue WRs before the transport is reconnected. |
---|
| 38 | + * When the underlying transport disconnects, MRs that are in flight |
---|
| 39 | + * are flushed and are likely unusable. Thus all MRs are destroyed. |
---|
| 40 | + * New MRs are created on demand. |
---|
71 | 41 | */ |
---|
72 | 42 | |
---|
73 | | -#include <linux/sunrpc/rpc_rdma.h> |
---|
74 | 43 | #include <linux/sunrpc/svc_rdma.h> |
---|
75 | 44 | |
---|
76 | 45 | #include "xprt_rdma.h" |
---|
.. | .. |
---|
80 | 49 | # define RPCDBG_FACILITY RPCDBG_TRANS |
---|
81 | 50 | #endif |
---|
82 | 51 | |
---|
83 | | -bool |
---|
84 | | -frwr_is_supported(struct rpcrdma_ia *ia) |
---|
85 | | -{ |
---|
86 | | - struct ib_device_attr *attrs = &ia->ri_device->attrs; |
---|
87 | | - |
---|
88 | | - if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) |
---|
89 | | - goto out_not_supported; |
---|
90 | | - if (attrs->max_fast_reg_page_list_len == 0) |
---|
91 | | - goto out_not_supported; |
---|
92 | | - return true; |
---|
93 | | - |
---|
94 | | -out_not_supported: |
---|
95 | | - pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", |
---|
96 | | - ia->ri_device->name); |
---|
97 | | - return false; |
---|
98 | | -} |
---|
99 | | - |
---|
100 | | -static int |
---|
101 | | -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) |
---|
102 | | -{ |
---|
103 | | - unsigned int depth = ia->ri_max_frwr_depth; |
---|
104 | | - struct rpcrdma_frwr *frwr = &mr->frwr; |
---|
105 | | - int rc; |
---|
106 | | - |
---|
107 | | - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); |
---|
108 | | - if (IS_ERR(frwr->fr_mr)) |
---|
109 | | - goto out_mr_err; |
---|
110 | | - |
---|
111 | | - mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); |
---|
112 | | - if (!mr->mr_sg) |
---|
113 | | - goto out_list_err; |
---|
114 | | - |
---|
115 | | - INIT_LIST_HEAD(&mr->mr_list); |
---|
116 | | - sg_init_table(mr->mr_sg, depth); |
---|
117 | | - init_completion(&frwr->fr_linv_done); |
---|
118 | | - return 0; |
---|
119 | | - |
---|
120 | | -out_mr_err: |
---|
121 | | - rc = PTR_ERR(frwr->fr_mr); |
---|
122 | | - dprintk("RPC: %s: ib_alloc_mr status %i\n", |
---|
123 | | - __func__, rc); |
---|
124 | | - return rc; |
---|
125 | | - |
---|
126 | | -out_list_err: |
---|
127 | | - rc = -ENOMEM; |
---|
128 | | - dprintk("RPC: %s: sg allocation failure\n", |
---|
129 | | - __func__); |
---|
130 | | - ib_dereg_mr(frwr->fr_mr); |
---|
131 | | - return rc; |
---|
132 | | -} |
---|
133 | | - |
---|
134 | | -static void |
---|
135 | | -frwr_op_release_mr(struct rpcrdma_mr *mr) |
---|
| 52 | +/** |
---|
| 53 | + * frwr_release_mr - Destroy one MR |
---|
| 54 | + * @mr: MR allocated by frwr_mr_init |
---|
| 55 | + * |
---|
| 56 | + */ |
---|
| 57 | +void frwr_release_mr(struct rpcrdma_mr *mr) |
---|
136 | 58 | { |
---|
137 | 59 | int rc; |
---|
138 | 60 | |
---|
139 | 61 | rc = ib_dereg_mr(mr->frwr.fr_mr); |
---|
140 | 62 | if (rc) |
---|
141 | | - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", |
---|
142 | | - mr, rc); |
---|
| 63 | + trace_xprtrdma_frwr_dereg(mr, rc); |
---|
143 | 64 | kfree(mr->mr_sg); |
---|
144 | 65 | kfree(mr); |
---|
145 | 66 | } |
---|
146 | 67 | |
---|
147 | | -static int |
---|
148 | | -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) |
---|
| 68 | +static void frwr_mr_recycle(struct rpcrdma_mr *mr) |
---|
149 | 69 | { |
---|
150 | | - struct rpcrdma_frwr *frwr = &mr->frwr; |
---|
151 | | - int rc; |
---|
152 | | - |
---|
153 | | - rc = ib_dereg_mr(frwr->fr_mr); |
---|
154 | | - if (rc) { |
---|
155 | | - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", |
---|
156 | | - rc, mr); |
---|
157 | | - return rc; |
---|
158 | | - } |
---|
159 | | - |
---|
160 | | - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, |
---|
161 | | - ia->ri_max_frwr_depth); |
---|
162 | | - if (IS_ERR(frwr->fr_mr)) { |
---|
163 | | - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", |
---|
164 | | - PTR_ERR(frwr->fr_mr), mr); |
---|
165 | | - return PTR_ERR(frwr->fr_mr); |
---|
166 | | - } |
---|
167 | | - |
---|
168 | | - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); |
---|
169 | | - frwr->fr_state = FRWR_IS_INVALID; |
---|
170 | | - return 0; |
---|
171 | | -} |
---|
172 | | - |
---|
173 | | -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. |
---|
174 | | - */ |
---|
175 | | -static void |
---|
176 | | -frwr_op_recover_mr(struct rpcrdma_mr *mr) |
---|
177 | | -{ |
---|
178 | | - enum rpcrdma_frwr_state state = mr->frwr.fr_state; |
---|
179 | 70 | struct rpcrdma_xprt *r_xprt = mr->mr_xprt; |
---|
180 | | - struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
---|
181 | | - int rc; |
---|
182 | 71 | |
---|
183 | | - rc = __frwr_mr_reset(ia, mr); |
---|
184 | | - if (state != FRWR_FLUSHED_LI) { |
---|
185 | | - trace_xprtrdma_dma_unmap(mr); |
---|
186 | | - ib_dma_unmap_sg(ia->ri_device, |
---|
| 72 | + trace_xprtrdma_mr_recycle(mr); |
---|
| 73 | + |
---|
| 74 | + if (mr->mr_dir != DMA_NONE) { |
---|
| 75 | + trace_xprtrdma_mr_unmap(mr); |
---|
| 76 | + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, |
---|
187 | 77 | mr->mr_sg, mr->mr_nents, mr->mr_dir); |
---|
| 78 | + mr->mr_dir = DMA_NONE; |
---|
188 | 79 | } |
---|
189 | | - if (rc) |
---|
190 | | - goto out_release; |
---|
191 | 80 | |
---|
192 | | - rpcrdma_mr_put(mr); |
---|
193 | | - r_xprt->rx_stats.mrs_recovered++; |
---|
194 | | - return; |
---|
195 | | - |
---|
196 | | -out_release: |
---|
197 | | - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); |
---|
198 | | - r_xprt->rx_stats.mrs_orphaned++; |
---|
199 | | - |
---|
200 | | - spin_lock(&r_xprt->rx_buf.rb_mrlock); |
---|
| 81 | + spin_lock(&r_xprt->rx_buf.rb_lock); |
---|
201 | 82 | list_del(&mr->mr_all); |
---|
202 | | - spin_unlock(&r_xprt->rx_buf.rb_mrlock); |
---|
| 83 | + r_xprt->rx_stats.mrs_recycled++; |
---|
| 84 | + spin_unlock(&r_xprt->rx_buf.rb_lock); |
---|
203 | 85 | |
---|
204 | | - frwr_op_release_mr(mr); |
---|
| 86 | + frwr_release_mr(mr); |
---|
205 | 87 | } |
---|
206 | 88 | |
---|
207 | | -/* On success, sets: |
---|
208 | | - * ep->rep_attr.cap.max_send_wr |
---|
209 | | - * ep->rep_attr.cap.max_recv_wr |
---|
210 | | - * cdata->max_requests |
---|
211 | | - * ia->ri_max_segs |
---|
| 89 | +/* frwr_reset - Place MRs back on the free list |
---|
| 90 | + * @req: request to reset |
---|
212 | 91 | * |
---|
213 | | - * And these FRWR-related fields: |
---|
214 | | - * ia->ri_max_frwr_depth |
---|
215 | | - * ia->ri_mrtype |
---|
| 92 | + * Used after a failed marshal. For FRWR, this means the MRs |
---|
| 93 | + * don't have to be fully released and recreated. |
---|
| 94 | + * |
---|
| 95 | + * NB: This is safe only as long as none of @req's MRs are |
---|
| 96 | + * involved with an ongoing asynchronous FAST_REG or LOCAL_INV |
---|
| 97 | + * Work Request. |
---|
216 | 98 | */ |
---|
217 | | -static int |
---|
218 | | -frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, |
---|
219 | | - struct rpcrdma_create_data_internal *cdata) |
---|
| 99 | +void frwr_reset(struct rpcrdma_req *req) |
---|
220 | 100 | { |
---|
221 | | - struct ib_device_attr *attrs = &ia->ri_device->attrs; |
---|
| 101 | + struct rpcrdma_mr *mr; |
---|
| 102 | + |
---|
| 103 | + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) |
---|
| 104 | + rpcrdma_mr_put(mr); |
---|
| 105 | +} |
---|
| 106 | + |
---|
| 107 | +/** |
---|
| 108 | + * frwr_mr_init - Initialize one MR |
---|
| 109 | + * @r_xprt: controlling transport instance |
---|
| 110 | + * @mr: generic MR to prepare for FRWR |
---|
| 111 | + * |
---|
| 112 | + * Returns zero if successful. Otherwise a negative errno |
---|
| 113 | + * is returned. |
---|
| 114 | + */ |
---|
| 115 | +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) |
---|
| 116 | +{ |
---|
| 117 | + struct rpcrdma_ep *ep = r_xprt->rx_ep; |
---|
| 118 | + unsigned int depth = ep->re_max_fr_depth; |
---|
| 119 | + struct scatterlist *sg; |
---|
| 120 | + struct ib_mr *frmr; |
---|
| 121 | + int rc; |
---|
| 122 | + |
---|
| 123 | + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); |
---|
| 124 | + if (IS_ERR(frmr)) |
---|
| 125 | + goto out_mr_err; |
---|
| 126 | + |
---|
| 127 | + sg = kmalloc_array(depth, sizeof(*sg), GFP_NOFS); |
---|
| 128 | + if (!sg) |
---|
| 129 | + goto out_list_err; |
---|
| 130 | + |
---|
| 131 | + mr->mr_xprt = r_xprt; |
---|
| 132 | + mr->frwr.fr_mr = frmr; |
---|
| 133 | + mr->mr_dir = DMA_NONE; |
---|
| 134 | + INIT_LIST_HEAD(&mr->mr_list); |
---|
| 135 | + init_completion(&mr->frwr.fr_linv_done); |
---|
| 136 | + |
---|
| 137 | + sg_init_table(sg, depth); |
---|
| 138 | + mr->mr_sg = sg; |
---|
| 139 | + return 0; |
---|
| 140 | + |
---|
| 141 | +out_mr_err: |
---|
| 142 | + rc = PTR_ERR(frmr); |
---|
| 143 | + trace_xprtrdma_frwr_alloc(mr, rc); |
---|
| 144 | + return rc; |
---|
| 145 | + |
---|
| 146 | +out_list_err: |
---|
| 147 | + ib_dereg_mr(frmr); |
---|
| 148 | + return -ENOMEM; |
---|
| 149 | +} |
---|
| 150 | + |
---|
| 151 | +/** |
---|
| 152 | + * frwr_query_device - Prepare a transport for use with FRWR |
---|
| 153 | + * @ep: endpoint to fill in |
---|
| 154 | + * @device: RDMA device to query |
---|
| 155 | + * |
---|
| 156 | + * On success, sets: |
---|
| 157 | + * ep->re_attr |
---|
| 158 | + * ep->re_max_requests |
---|
| 159 | + * ep->re_max_rdma_segs |
---|
| 160 | + * ep->re_max_fr_depth |
---|
| 161 | + * ep->re_mrtype |
---|
| 162 | + * |
---|
| 163 | + * Return values: |
---|
| 164 | + * On success, returns zero. |
---|
| 165 | + * %-EINVAL - the device does not support FRWR memory registration |
---|
| 166 | + * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA |
---|
| 167 | + */ |
---|
| 168 | +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) |
---|
| 169 | +{ |
---|
| 170 | + const struct ib_device_attr *attrs = &device->attrs; |
---|
222 | 171 | int max_qp_wr, depth, delta; |
---|
| 172 | + unsigned int max_sge; |
---|
223 | 173 | |
---|
224 | | - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; |
---|
| 174 | + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || |
---|
| 175 | + attrs->max_fast_reg_page_list_len == 0) { |
---|
| 176 | + pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n", |
---|
| 177 | + device->name); |
---|
| 178 | + return -EINVAL; |
---|
| 179 | + } |
---|
| 180 | + |
---|
| 181 | + max_sge = min_t(unsigned int, attrs->max_send_sge, |
---|
| 182 | + RPCRDMA_MAX_SEND_SGES); |
---|
| 183 | + if (max_sge < RPCRDMA_MIN_SEND_SGES) { |
---|
| 184 | + pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); |
---|
| 185 | + return -ENOMEM; |
---|
| 186 | + } |
---|
| 187 | + ep->re_attr.cap.max_send_sge = max_sge; |
---|
| 188 | + ep->re_attr.cap.max_recv_sge = 1; |
---|
| 189 | + |
---|
| 190 | + ep->re_mrtype = IB_MR_TYPE_MEM_REG; |
---|
225 | 191 | if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) |
---|
226 | | - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; |
---|
| 192 | + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; |
---|
227 | 193 | |
---|
228 | | - ia->ri_max_frwr_depth = |
---|
229 | | - min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, |
---|
230 | | - attrs->max_fast_reg_page_list_len); |
---|
231 | | - dprintk("RPC: %s: device's max FR page list len = %u\n", |
---|
232 | | - __func__, ia->ri_max_frwr_depth); |
---|
| 194 | + /* Quirk: Some devices advertise a large max_fast_reg_page_list_len |
---|
| 195 | + * capability, but perform optimally when the MRs are not larger |
---|
| 196 | + * than a page. |
---|
| 197 | + */ |
---|
| 198 | + if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) |
---|
| 199 | + ep->re_max_fr_depth = attrs->max_sge_rd; |
---|
| 200 | + else |
---|
| 201 | + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; |
---|
| 202 | + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) |
---|
| 203 | + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; |
---|
233 | 204 | |
---|
234 | 205 | /* Add room for frwr register and invalidate WRs. |
---|
235 | 206 | * 1. FRWR reg WR for head |
---|
.. | .. |
---|
245 | 216 | /* Calculate N if the device max FRWR depth is smaller than |
---|
246 | 217 | * RPCRDMA_MAX_DATA_SEGS. |
---|
247 | 218 | */ |
---|
248 | | - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { |
---|
249 | | - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; |
---|
| 219 | + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { |
---|
| 220 | + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; |
---|
250 | 221 | do { |
---|
251 | 222 | depth += 2; /* FRWR reg + invalidate */ |
---|
252 | | - delta -= ia->ri_max_frwr_depth; |
---|
| 223 | + delta -= ep->re_max_fr_depth; |
---|
253 | 224 | } while (delta > 0); |
---|
254 | 225 | } |
---|
255 | 226 | |
---|
256 | | - max_qp_wr = ia->ri_device->attrs.max_qp_wr; |
---|
| 227 | + max_qp_wr = attrs->max_qp_wr; |
---|
257 | 228 | max_qp_wr -= RPCRDMA_BACKWARD_WRS; |
---|
258 | 229 | max_qp_wr -= 1; |
---|
259 | 230 | if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) |
---|
260 | 231 | return -ENOMEM; |
---|
261 | | - if (cdata->max_requests > max_qp_wr) |
---|
262 | | - cdata->max_requests = max_qp_wr; |
---|
263 | | - ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; |
---|
264 | | - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { |
---|
265 | | - cdata->max_requests = max_qp_wr / depth; |
---|
266 | | - if (!cdata->max_requests) |
---|
267 | | - return -EINVAL; |
---|
268 | | - ep->rep_attr.cap.max_send_wr = cdata->max_requests * |
---|
269 | | - depth; |
---|
| 232 | + if (ep->re_max_requests > max_qp_wr) |
---|
| 233 | + ep->re_max_requests = max_qp_wr; |
---|
| 234 | + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; |
---|
| 235 | + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { |
---|
| 236 | + ep->re_max_requests = max_qp_wr / depth; |
---|
| 237 | + if (!ep->re_max_requests) |
---|
| 238 | + return -ENOMEM; |
---|
| 239 | + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; |
---|
270 | 240 | } |
---|
271 | | - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; |
---|
272 | | - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ |
---|
273 | | - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; |
---|
274 | | - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; |
---|
275 | | - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ |
---|
| 241 | + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; |
---|
| 242 | + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ |
---|
| 243 | + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; |
---|
| 244 | + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; |
---|
| 245 | + ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; |
---|
| 246 | + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ |
---|
276 | 247 | |
---|
277 | | - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / |
---|
278 | | - ia->ri_max_frwr_depth); |
---|
| 248 | + ep->re_max_rdma_segs = |
---|
| 249 | + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); |
---|
| 250 | + /* Reply chunks require segments for head and tail buffers */ |
---|
| 251 | + ep->re_max_rdma_segs += 2; |
---|
| 252 | + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) |
---|
| 253 | + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; |
---|
| 254 | + |
---|
| 255 | + /* Ensure the underlying device is capable of conveying the |
---|
| 256 | + * largest r/wsize NFS will ask for. This guarantees that |
---|
| 257 | + * failing over from one RDMA device to another will not |
---|
| 258 | + * break NFS I/O. |
---|
| 259 | + */ |
---|
| 260 | + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) |
---|
| 261 | + return -ENOMEM; |
---|
| 262 | + |
---|
279 | 263 | return 0; |
---|
280 | 264 | } |
---|
281 | 265 | |
---|
282 | | -/* FRWR mode conveys a list of pages per chunk segment. The |
---|
283 | | - * maximum length of that list is the FRWR page list depth. |
---|
284 | | - */ |
---|
285 | | -static size_t |
---|
286 | | -frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) |
---|
287 | | -{ |
---|
288 | | - struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
---|
289 | | - |
---|
290 | | - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, |
---|
291 | | - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); |
---|
292 | | -} |
---|
293 | | - |
---|
294 | | -static void |
---|
295 | | -__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) |
---|
296 | | -{ |
---|
297 | | - if (wc->status != IB_WC_WR_FLUSH_ERR) |
---|
298 | | - pr_err("rpcrdma: %s: %s (%u/0x%x)\n", |
---|
299 | | - wr, ib_wc_status_msg(wc->status), |
---|
300 | | - wc->status, wc->vendor_err); |
---|
301 | | -} |
---|
302 | | - |
---|
303 | 266 | /** |
---|
304 | | - * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC |
---|
305 | | - * @cq: completion queue (ignored) |
---|
306 | | - * @wc: completed WR |
---|
| 267 | + * frwr_map - Register a memory region |
---|
| 268 | + * @r_xprt: controlling transport |
---|
| 269 | + * @seg: memory region co-ordinates |
---|
| 270 | + * @nsegs: number of segments remaining |
---|
| 271 | + * @writing: true when RDMA Write will be used |
---|
| 272 | + * @xid: XID of RPC using the registered memory |
---|
| 273 | + * @mr: MR to fill in |
---|
307 | 274 | * |
---|
308 | | - */ |
---|
309 | | -static void |
---|
310 | | -frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) |
---|
311 | | -{ |
---|
312 | | - struct ib_cqe *cqe = wc->wr_cqe; |
---|
313 | | - struct rpcrdma_frwr *frwr = |
---|
314 | | - container_of(cqe, struct rpcrdma_frwr, fr_cqe); |
---|
315 | | - |
---|
316 | | - /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
317 | | - if (wc->status != IB_WC_SUCCESS) { |
---|
318 | | - frwr->fr_state = FRWR_FLUSHED_FR; |
---|
319 | | - __frwr_sendcompletion_flush(wc, "fastreg"); |
---|
320 | | - } |
---|
321 | | - trace_xprtrdma_wc_fastreg(wc, frwr); |
---|
322 | | -} |
---|
323 | | - |
---|
324 | | -/** |
---|
325 | | - * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC |
---|
326 | | - * @cq: completion queue (ignored) |
---|
327 | | - * @wc: completed WR |
---|
328 | | - * |
---|
329 | | - */ |
---|
330 | | -static void |
---|
331 | | -frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) |
---|
332 | | -{ |
---|
333 | | - struct ib_cqe *cqe = wc->wr_cqe; |
---|
334 | | - struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, |
---|
335 | | - fr_cqe); |
---|
336 | | - |
---|
337 | | - /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
338 | | - if (wc->status != IB_WC_SUCCESS) { |
---|
339 | | - frwr->fr_state = FRWR_FLUSHED_LI; |
---|
340 | | - __frwr_sendcompletion_flush(wc, "localinv"); |
---|
341 | | - } |
---|
342 | | - trace_xprtrdma_wc_li(wc, frwr); |
---|
343 | | -} |
---|
344 | | - |
---|
345 | | -/** |
---|
346 | | - * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC |
---|
347 | | - * @cq: completion queue (ignored) |
---|
348 | | - * @wc: completed WR |
---|
349 | | - * |
---|
350 | | - * Awaken anyone waiting for an MR to finish being fenced. |
---|
351 | | - */ |
---|
352 | | -static void |
---|
353 | | -frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) |
---|
354 | | -{ |
---|
355 | | - struct ib_cqe *cqe = wc->wr_cqe; |
---|
356 | | - struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, |
---|
357 | | - fr_cqe); |
---|
358 | | - |
---|
359 | | - /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
360 | | - if (wc->status != IB_WC_SUCCESS) { |
---|
361 | | - frwr->fr_state = FRWR_FLUSHED_LI; |
---|
362 | | - __frwr_sendcompletion_flush(wc, "localinv"); |
---|
363 | | - } |
---|
364 | | - complete(&frwr->fr_linv_done); |
---|
365 | | - trace_xprtrdma_wc_li_wake(wc, frwr); |
---|
366 | | -} |
---|
367 | | - |
---|
368 | | -/* Post a REG_MR Work Request to register a memory region |
---|
| 275 | + * Prepare a REG_MR Work Request to register a memory region |
---|
369 | 276 | * for remote access via RDMA READ or RDMA WRITE. |
---|
| 277 | + * |
---|
| 278 | + * Returns the next segment or a negative errno pointer. |
---|
| 279 | + * On success, @mr is filled in. |
---|
370 | 280 | */ |
---|
371 | | -static struct rpcrdma_mr_seg * |
---|
372 | | -frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, |
---|
373 | | - int nsegs, bool writing, struct rpcrdma_mr **out) |
---|
| 281 | +struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, |
---|
| 282 | + struct rpcrdma_mr_seg *seg, |
---|
| 283 | + int nsegs, bool writing, __be32 xid, |
---|
| 284 | + struct rpcrdma_mr *mr) |
---|
374 | 285 | { |
---|
375 | | - struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
---|
376 | | - bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; |
---|
377 | | - struct rpcrdma_frwr *frwr; |
---|
378 | | - struct rpcrdma_mr *mr; |
---|
379 | | - struct ib_mr *ibmr; |
---|
| 286 | + struct rpcrdma_ep *ep = r_xprt->rx_ep; |
---|
380 | 287 | struct ib_reg_wr *reg_wr; |
---|
381 | | - int i, n; |
---|
| 288 | + int i, n, dma_nents; |
---|
| 289 | + struct ib_mr *ibmr; |
---|
382 | 290 | u8 key; |
---|
383 | 291 | |
---|
384 | | - mr = NULL; |
---|
385 | | - do { |
---|
386 | | - if (mr) |
---|
387 | | - rpcrdma_mr_defer_recovery(mr); |
---|
388 | | - mr = rpcrdma_mr_get(r_xprt); |
---|
389 | | - if (!mr) |
---|
390 | | - return ERR_PTR(-EAGAIN); |
---|
391 | | - } while (mr->frwr.fr_state != FRWR_IS_INVALID); |
---|
392 | | - frwr = &mr->frwr; |
---|
393 | | - frwr->fr_state = FRWR_IS_VALID; |
---|
394 | | - |
---|
395 | | - if (nsegs > ia->ri_max_frwr_depth) |
---|
396 | | - nsegs = ia->ri_max_frwr_depth; |
---|
| 292 | + if (nsegs > ep->re_max_fr_depth) |
---|
| 293 | + nsegs = ep->re_max_fr_depth; |
---|
397 | 294 | for (i = 0; i < nsegs;) { |
---|
398 | 295 | if (seg->mr_page) |
---|
399 | 296 | sg_set_page(&mr->mr_sg[i], |
---|
.. | .. |
---|
406 | 303 | |
---|
407 | 304 | ++seg; |
---|
408 | 305 | ++i; |
---|
409 | | - if (holes_ok) |
---|
| 306 | + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) |
---|
410 | 307 | continue; |
---|
411 | 308 | if ((i < nsegs && offset_in_page(seg->mr_offset)) || |
---|
412 | 309 | offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) |
---|
413 | 310 | break; |
---|
414 | 311 | } |
---|
415 | 312 | mr->mr_dir = rpcrdma_data_dir(writing); |
---|
| 313 | + mr->mr_nents = i; |
---|
416 | 314 | |
---|
417 | | - mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); |
---|
418 | | - if (!mr->mr_nents) |
---|
| 315 | + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, |
---|
| 316 | + mr->mr_dir); |
---|
| 317 | + if (!dma_nents) |
---|
419 | 318 | goto out_dmamap_err; |
---|
420 | | - trace_xprtrdma_dma_map(mr); |
---|
421 | 319 | |
---|
422 | | - ibmr = frwr->fr_mr; |
---|
423 | | - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); |
---|
424 | | - if (unlikely(n != mr->mr_nents)) |
---|
| 320 | + ibmr = mr->frwr.fr_mr; |
---|
| 321 | + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); |
---|
| 322 | + if (n != dma_nents) |
---|
425 | 323 | goto out_mapmr_err; |
---|
426 | 324 | |
---|
| 325 | + ibmr->iova &= 0x00000000ffffffff; |
---|
| 326 | + ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32; |
---|
427 | 327 | key = (u8)(ibmr->rkey & 0x000000FF); |
---|
428 | 328 | ib_update_fast_reg_key(ibmr, ++key); |
---|
429 | 329 | |
---|
430 | | - reg_wr = &frwr->fr_regwr; |
---|
| 330 | + reg_wr = &mr->frwr.fr_regwr; |
---|
431 | 331 | reg_wr->mr = ibmr; |
---|
432 | 332 | reg_wr->key = ibmr->rkey; |
---|
433 | 333 | reg_wr->access = writing ? |
---|
.. | .. |
---|
437 | 337 | mr->mr_handle = ibmr->rkey; |
---|
438 | 338 | mr->mr_length = ibmr->length; |
---|
439 | 339 | mr->mr_offset = ibmr->iova; |
---|
| 340 | + trace_xprtrdma_mr_map(mr); |
---|
440 | 341 | |
---|
441 | | - *out = mr; |
---|
442 | 342 | return seg; |
---|
443 | 343 | |
---|
444 | 344 | out_dmamap_err: |
---|
445 | | - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", |
---|
446 | | - mr->mr_sg, i); |
---|
447 | | - frwr->fr_state = FRWR_IS_INVALID; |
---|
448 | | - rpcrdma_mr_put(mr); |
---|
| 345 | + mr->mr_dir = DMA_NONE; |
---|
| 346 | + trace_xprtrdma_frwr_sgerr(mr, i); |
---|
449 | 347 | return ERR_PTR(-EIO); |
---|
450 | 348 | |
---|
451 | 349 | out_mapmr_err: |
---|
452 | | - pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", |
---|
453 | | - frwr->fr_mr, n, mr->mr_nents); |
---|
454 | | - rpcrdma_mr_defer_recovery(mr); |
---|
| 350 | + trace_xprtrdma_frwr_maperr(mr, n); |
---|
455 | 351 | return ERR_PTR(-EIO); |
---|
456 | 352 | } |
---|
457 | 353 | |
---|
458 | | -/* Post Send WR containing the RPC Call message. |
---|
| 354 | +/** |
---|
| 355 | + * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC |
---|
| 356 | + * @cq: completion queue |
---|
| 357 | + * @wc: WCE for a completed FastReg WR |
---|
459 | 358 | * |
---|
460 | | - * For FRMR, chain any FastReg WRs to the Send WR. Only a |
---|
| 359 | + */ |
---|
| 360 | +static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) |
---|
| 361 | +{ |
---|
| 362 | + struct ib_cqe *cqe = wc->wr_cqe; |
---|
| 363 | + struct rpcrdma_frwr *frwr = |
---|
| 364 | + container_of(cqe, struct rpcrdma_frwr, fr_cqe); |
---|
| 365 | + |
---|
| 366 | + /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
| 367 | + trace_xprtrdma_wc_fastreg(wc, frwr); |
---|
| 368 | + /* The MR will get recycled when the associated req is retransmitted */ |
---|
| 369 | + |
---|
| 370 | + rpcrdma_flush_disconnect(cq->cq_context, wc); |
---|
| 371 | +} |
---|
| 372 | + |
---|
| 373 | +/** |
---|
| 374 | + * frwr_send - post Send WRs containing the RPC Call message |
---|
| 375 | + * @r_xprt: controlling transport instance |
---|
| 376 | + * @req: prepared RPC Call |
---|
| 377 | + * |
---|
| 378 | + * For FRWR, chain any FastReg WRs to the Send WR. Only a |
---|
461 | 379 | * single ib_post_send call is needed to register memory |
---|
462 | 380 | * and then post the Send WR. |
---|
| 381 | + * |
---|
| 382 | + * Returns the return code from ib_post_send. |
---|
| 383 | + * |
---|
| 384 | + * Caller must hold the transport send lock to ensure that the |
---|
| 385 | + * pointers to the transport's rdma_cm_id and QP are stable. |
---|
463 | 386 | */ |
---|
464 | | -static int |
---|
465 | | -frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) |
---|
| 387 | +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
---|
466 | 388 | { |
---|
467 | 389 | struct ib_send_wr *post_wr; |
---|
468 | 390 | struct rpcrdma_mr *mr; |
---|
469 | 391 | |
---|
470 | | - post_wr = &req->rl_sendctx->sc_wr; |
---|
| 392 | + post_wr = &req->rl_wr; |
---|
471 | 393 | list_for_each_entry(mr, &req->rl_registered, mr_list) { |
---|
472 | 394 | struct rpcrdma_frwr *frwr; |
---|
473 | 395 | |
---|
.. | .. |
---|
483 | 405 | post_wr = &frwr->fr_regwr.wr; |
---|
484 | 406 | } |
---|
485 | 407 | |
---|
486 | | - /* If ib_post_send fails, the next ->send_request for |
---|
487 | | - * @req will queue these MWs for recovery. |
---|
488 | | - */ |
---|
489 | | - return ib_post_send(ia->ri_id->qp, post_wr, NULL); |
---|
| 408 | + return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); |
---|
490 | 409 | } |
---|
491 | 410 | |
---|
492 | | -/* Handle a remotely invalidated mr on the @mrs list |
---|
| 411 | +/** |
---|
| 412 | + * frwr_reminv - handle a remotely invalidated MR on the @mrs list
---|
| 413 | + * @rep: Received reply |
---|
| 414 | + * @mrs: list of MRs to check |
---|
| 415 | + * |
---|
493 | 416 | */ |
---|
494 | | -static void |
---|
495 | | -frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) |
---|
| 417 | +void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) |
---|
496 | 418 | { |
---|
497 | 419 | struct rpcrdma_mr *mr; |
---|
498 | 420 | |
---|
499 | 421 | list_for_each_entry(mr, mrs, mr_list) |
---|
500 | 422 | if (mr->mr_handle == rep->rr_inv_rkey) { |
---|
501 | 423 | list_del_init(&mr->mr_list); |
---|
502 | | - trace_xprtrdma_remoteinv(mr); |
---|
503 | | - mr->frwr.fr_state = FRWR_IS_INVALID; |
---|
504 | | - rpcrdma_mr_unmap_and_put(mr); |
---|
| 424 | + trace_xprtrdma_mr_reminv(mr); |
---|
| 425 | + rpcrdma_mr_put(mr); |
---|
505 | 426 | break; /* only one invalidated MR per RPC */ |
---|
506 | 427 | } |
---|
507 | 428 | } |
---|
508 | 429 | |
---|
509 | | -/* Invalidate all memory regions that were registered for "req". |
---|
| 430 | +static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) |
---|
| 431 | +{ |
---|
| 432 | + if (wc->status != IB_WC_SUCCESS) |
---|
| 433 | + frwr_mr_recycle(mr); |
---|
| 434 | + else |
---|
| 435 | + rpcrdma_mr_put(mr); |
---|
| 436 | +} |
---|
| 437 | + |
---|
| 438 | +/** |
---|
| 439 | + * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC |
---|
| 440 | + * @cq: completion queue |
---|
| 441 | + * @wc: WCE for a completed LocalInv WR |
---|
510 | 442 | * |
---|
511 | | - * Sleeps until it is safe for the host CPU to access the |
---|
512 | | - * previously mapped memory regions. |
---|
513 | | - * |
---|
514 | | - * Caller ensures that @mrs is not empty before the call. This |
---|
515 | | - * function empties the list. |
---|
516 | 443 | */ |
---|
517 | | -static void |
---|
518 | | -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) |
---|
| 444 | +static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) |
---|
| 445 | +{ |
---|
| 446 | + struct ib_cqe *cqe = wc->wr_cqe; |
---|
| 447 | + struct rpcrdma_frwr *frwr = |
---|
| 448 | + container_of(cqe, struct rpcrdma_frwr, fr_cqe); |
---|
| 449 | + struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); |
---|
| 450 | + |
---|
| 451 | + /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
| 452 | + trace_xprtrdma_wc_li(wc, frwr); |
---|
| 453 | + __frwr_release_mr(wc, mr); |
---|
| 454 | + |
---|
| 455 | + rpcrdma_flush_disconnect(cq->cq_context, wc); |
---|
| 456 | +} |
---|
| 457 | + |
---|
| 458 | +/** |
---|
| 459 | + * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC |
---|
| 460 | + * @cq: completion queue |
---|
| 461 | + * @wc: WCE for a completed LocalInv WR |
---|
| 462 | + * |
---|
| 463 | + * Awaken anyone waiting for an MR to finish being fenced. |
---|
| 464 | + */ |
---|
| 465 | +static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) |
---|
| 466 | +{ |
---|
| 467 | + struct ib_cqe *cqe = wc->wr_cqe; |
---|
| 468 | + struct rpcrdma_frwr *frwr = |
---|
| 469 | + container_of(cqe, struct rpcrdma_frwr, fr_cqe); |
---|
| 470 | + struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); |
---|
| 471 | + |
---|
| 472 | + /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
| 473 | + trace_xprtrdma_wc_li_wake(wc, frwr); |
---|
| 474 | + __frwr_release_mr(wc, mr); |
---|
| 475 | + complete(&frwr->fr_linv_done); |
---|
| 476 | + |
---|
| 477 | + rpcrdma_flush_disconnect(cq->cq_context, wc); |
---|
| 478 | +} |
---|
| 479 | + |
---|
| 480 | +/** |
---|
| 481 | + * frwr_unmap_sync - invalidate memory regions that were registered for @req |
---|
| 482 | + * @r_xprt: controlling transport instance |
---|
| 483 | + * @req: rpcrdma_req with a non-empty list of MRs to process |
---|
| 484 | + * |
---|
| 485 | + * Sleeps until it is safe for the host CPU to access the previously mapped |
---|
| 486 | + * memory regions. This guarantees that registered MRs are properly fenced |
---|
| 487 | + * from the server before the RPC consumer accesses the data in them. It |
---|
| 488 | + * also ensures proper Send flow control: waking the next RPC waits until |
---|
| 489 | + * this RPC has relinquished all its Send Queue entries. |
---|
| 490 | + */ |
---|
| 491 | +void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
---|
519 | 492 | { |
---|
520 | 493 | struct ib_send_wr *first, **prev, *last; |
---|
521 | 494 | const struct ib_send_wr *bad_wr; |
---|
522 | | - struct rpcrdma_ia *ia = &r_xprt->rx_ia; |
---|
523 | 495 | struct rpcrdma_frwr *frwr; |
---|
524 | 496 | struct rpcrdma_mr *mr; |
---|
525 | | - int count, rc; |
---|
| 497 | + int rc; |
---|
526 | 498 | |
---|
527 | 499 | /* ORDER: Invalidate all of the MRs first |
---|
528 | 500 | * |
---|
.. | .. |
---|
530 | 502 | * a single ib_post_send() call. |
---|
531 | 503 | */ |
---|
532 | 504 | frwr = NULL; |
---|
533 | | - count = 0; |
---|
534 | 505 | prev = &first; |
---|
535 | | - list_for_each_entry(mr, mrs, mr_list) { |
---|
536 | | - mr->frwr.fr_state = FRWR_IS_INVALID; |
---|
| 506 | + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { |
---|
| 507 | + |
---|
| 508 | + trace_xprtrdma_mr_localinv(mr); |
---|
| 509 | + r_xprt->rx_stats.local_inv_needed++; |
---|
537 | 510 | |
---|
538 | 511 | frwr = &mr->frwr; |
---|
539 | | - trace_xprtrdma_localinv(mr); |
---|
540 | | - |
---|
541 | 512 | frwr->fr_cqe.done = frwr_wc_localinv; |
---|
542 | 513 | last = &frwr->fr_invwr; |
---|
543 | | - memset(last, 0, sizeof(*last)); |
---|
| 514 | + last->next = NULL; |
---|
544 | 515 | last->wr_cqe = &frwr->fr_cqe; |
---|
| 516 | + last->sg_list = NULL; |
---|
| 517 | + last->num_sge = 0; |
---|
545 | 518 | last->opcode = IB_WR_LOCAL_INV; |
---|
| 519 | + last->send_flags = IB_SEND_SIGNALED; |
---|
546 | 520 | last->ex.invalidate_rkey = mr->mr_handle; |
---|
547 | | - count++; |
---|
548 | 521 | |
---|
549 | 522 | *prev = last; |
---|
550 | 523 | prev = &last->next; |
---|
551 | 524 | } |
---|
552 | | - if (!frwr) |
---|
553 | | - goto unmap; |
---|
554 | 525 | |
---|
555 | 526 | /* Strong send queue ordering guarantees that when the |
---|
556 | 527 | * last WR in the chain completes, all WRs in the chain |
---|
557 | 528 | * are complete. |
---|
558 | 529 | */ |
---|
559 | | - last->send_flags = IB_SEND_SIGNALED; |
---|
560 | 530 | frwr->fr_cqe.done = frwr_wc_localinv_wake; |
---|
561 | 531 | reinit_completion(&frwr->fr_linv_done); |
---|
562 | 532 | |
---|
563 | 533 | /* Transport disconnect drains the receive CQ before it |
---|
564 | 534 | * replaces the QP. The RPC reply handler won't call us |
---|
565 | | - * unless ri_id->qp is a valid pointer. |
---|
| 535 | + * unless re_id->qp is a valid pointer. |
---|
566 | 536 | */ |
---|
567 | | - r_xprt->rx_stats.local_inv_needed++; |
---|
568 | 537 | bad_wr = NULL; |
---|
569 | | - rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); |
---|
| 538 | + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); |
---|
| 539 | + |
---|
| 540 | + /* The final LOCAL_INV WR in the chain is supposed to |
---|
| 541 | + * do the wake. If it was never posted, the wake will |
---|
| 542 | + * not happen, so don't wait in that case. |
---|
| 543 | + */ |
---|
570 | 544 | if (bad_wr != first) |
---|
571 | 545 | wait_for_completion(&frwr->fr_linv_done); |
---|
572 | | - if (rc) |
---|
573 | | - goto reset_mrs; |
---|
| 546 | + if (!rc) |
---|
| 547 | + return; |
---|
574 | 548 | |
---|
575 | | - /* ORDER: Now DMA unmap all of the MRs, and return |
---|
576 | | - * them to the free MR list. |
---|
| 549 | + /* Recycle MRs in the LOCAL_INV chain that did not get posted. |
---|
577 | 550 | */ |
---|
578 | | -unmap: |
---|
579 | | - while (!list_empty(mrs)) { |
---|
580 | | - mr = rpcrdma_mr_pop(mrs); |
---|
581 | | - rpcrdma_mr_unmap_and_put(mr); |
---|
582 | | - } |
---|
583 | | - return; |
---|
584 | | - |
---|
585 | | -reset_mrs: |
---|
586 | | - pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); |
---|
587 | | - |
---|
588 | | - /* Find and reset the MRs in the LOCAL_INV WRs that did not |
---|
589 | | - * get posted. |
---|
590 | | - */ |
---|
| 551 | + trace_xprtrdma_post_linv(req, rc); |
---|
591 | 552 | while (bad_wr) { |
---|
592 | 553 | frwr = container_of(bad_wr, struct rpcrdma_frwr, |
---|
593 | 554 | fr_invwr); |
---|
594 | 555 | mr = container_of(frwr, struct rpcrdma_mr, frwr); |
---|
595 | | - |
---|
596 | | - __frwr_mr_reset(ia, mr); |
---|
597 | | - |
---|
598 | 556 | bad_wr = bad_wr->next; |
---|
| 557 | + |
---|
| 558 | + frwr_mr_recycle(mr); |
---|
599 | 559 | } |
---|
600 | | - goto unmap; |
---|
601 | 560 | } |
---|
602 | 561 | |
---|
603 | | -const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { |
---|
604 | | - .ro_map = frwr_op_map, |
---|
605 | | - .ro_send = frwr_op_send, |
---|
606 | | - .ro_reminv = frwr_op_reminv, |
---|
607 | | - .ro_unmap_sync = frwr_op_unmap_sync, |
---|
608 | | - .ro_recover_mr = frwr_op_recover_mr, |
---|
609 | | - .ro_open = frwr_op_open, |
---|
610 | | - .ro_maxpages = frwr_op_maxpages, |
---|
611 | | - .ro_init_mr = frwr_op_init_mr, |
---|
612 | | - .ro_release_mr = frwr_op_release_mr, |
---|
613 | | - .ro_displayname = "frwr", |
---|
614 | | - .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, |
---|
615 | | -}; |
---|
| 562 | +/** |
---|
| 563 | + * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC |
---|
| 564 | + * @cq: completion queue |
---|
| 565 | + * @wc: WCE for a completed LocalInv WR |
---|
| 566 | + * |
---|
| 567 | + */ |
---|
| 568 | +static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) |
---|
| 569 | +{ |
---|
| 570 | + struct ib_cqe *cqe = wc->wr_cqe; |
---|
| 571 | + struct rpcrdma_frwr *frwr = |
---|
| 572 | + container_of(cqe, struct rpcrdma_frwr, fr_cqe); |
---|
| 573 | + struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); |
---|
| 574 | + struct rpcrdma_rep *rep = mr->mr_req->rl_reply; |
---|
| 575 | + |
---|
| 576 | + /* WARNING: Only wr_cqe and status are reliable at this point */ |
---|
| 577 | + trace_xprtrdma_wc_li_done(wc, frwr); |
---|
| 578 | + __frwr_release_mr(wc, mr); |
---|
| 579 | + |
---|
| 580 | + /* Ensure @rep is read from @mr before __frwr_release_mr releases it */
---|
| 581 | + smp_rmb(); |
---|
| 582 | + rpcrdma_complete_rqst(rep); |
---|
| 583 | + |
---|
| 584 | + rpcrdma_flush_disconnect(cq->cq_context, wc); |
---|
| 585 | +} |
---|
| 586 | + |
---|
| 587 | +/** |
---|
| 588 | + * frwr_unmap_async - invalidate memory regions that were registered for @req |
---|
| 589 | + * @r_xprt: controlling transport instance |
---|
| 590 | + * @req: rpcrdma_req with a non-empty list of MRs to process |
---|
| 591 | + * |
---|
| 592 | + * This guarantees that registered MRs are properly fenced from the |
---|
| 593 | + * server before the RPC consumer accesses the data in them. It also |
---|
| 594 | + * ensures proper Send flow control: waking the next RPC waits until |
---|
| 595 | + * this RPC has relinquished all its Send Queue entries. |
---|
| 596 | + */ |
---|
| 597 | +void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) |
---|
| 598 | +{ |
---|
| 599 | + struct ib_send_wr *first, *last, **prev; |
---|
| 600 | + const struct ib_send_wr *bad_wr; |
---|
| 601 | + struct rpcrdma_frwr *frwr; |
---|
| 602 | + struct rpcrdma_mr *mr; |
---|
| 603 | + int rc; |
---|
| 604 | + |
---|
| 605 | + /* Chain the LOCAL_INV Work Requests and post them with |
---|
| 606 | + * a single ib_post_send() call. |
---|
| 607 | + */ |
---|
| 608 | + frwr = NULL; |
---|
| 609 | + prev = &first; |
---|
| 610 | + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { |
---|
| 611 | + |
---|
| 612 | + trace_xprtrdma_mr_localinv(mr); |
---|
| 613 | + r_xprt->rx_stats.local_inv_needed++; |
---|
| 614 | + |
---|
| 615 | + frwr = &mr->frwr; |
---|
| 616 | + frwr->fr_cqe.done = frwr_wc_localinv; |
---|
| 617 | + last = &frwr->fr_invwr; |
---|
| 618 | + last->next = NULL; |
---|
| 619 | + last->wr_cqe = &frwr->fr_cqe; |
---|
| 620 | + last->sg_list = NULL; |
---|
| 621 | + last->num_sge = 0; |
---|
| 622 | + last->opcode = IB_WR_LOCAL_INV; |
---|
| 623 | + last->send_flags = IB_SEND_SIGNALED; |
---|
| 624 | + last->ex.invalidate_rkey = mr->mr_handle; |
---|
| 625 | + |
---|
| 626 | + *prev = last; |
---|
| 627 | + prev = &last->next; |
---|
| 628 | + } |
---|
| 629 | + |
---|
| 630 | + /* Strong send queue ordering guarantees that when the |
---|
| 631 | + * last WR in the chain completes, all WRs in the chain |
---|
| 632 | + * are complete. The last completion will wake up the |
---|
| 633 | + * RPC waiter. |
---|
| 634 | + */ |
---|
| 635 | + frwr->fr_cqe.done = frwr_wc_localinv_done; |
---|
| 636 | + |
---|
| 637 | + /* Transport disconnect drains the receive CQ before it |
---|
| 638 | + * replaces the QP. The RPC reply handler won't call us |
---|
| 639 | + * unless re_id->qp is a valid pointer. |
---|
| 640 | + */ |
---|
| 641 | + bad_wr = NULL; |
---|
| 642 | + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); |
---|
| 643 | + if (!rc) |
---|
| 644 | + return; |
---|
| 645 | + |
---|
| 646 | + /* Recycle MRs in the LOCAL_INV chain that did not get posted. |
---|
| 647 | + */ |
---|
| 648 | + trace_xprtrdma_post_linv(req, rc); |
---|
| 649 | + while (bad_wr) { |
---|
| 650 | + frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); |
---|
| 651 | + mr = container_of(frwr, struct rpcrdma_mr, frwr); |
---|
| 652 | + bad_wr = bad_wr->next; |
---|
| 653 | + |
---|
| 654 | + frwr_mr_recycle(mr); |
---|
| 655 | + } |
---|
| 656 | + |
---|
| 657 | + /* The final LOCAL_INV WR in the chain is supposed to |
---|
| 658 | + * do the wake. If it was never posted, the wake will |
---|
| 659 | + * not happen, so wake here in that case. |
---|
| 660 | + */ |
---|
| 661 | + rpcrdma_complete_rqst(req->rl_reply); |
---|
| 662 | +} |
---|