.. | .. |
---|
36 | 36 | |
---|
37 | 37 | #include "mlx5_ib.h" |
---|
38 | 38 | #include "cmd.h" |
---|
| 39 | +#include "qp.h" |
---|
| 40 | + |
---|
| 41 | +#include <linux/mlx5/eq.h> |
---|
| 42 | + |
---|
| 43 | +/* Contains the details of a pagefault. */ |
---|
| 44 | +struct mlx5_pagefault { |
---|
| 45 | + u32 bytes_committed; |
---|
| 46 | + u32 token; |
---|
| 47 | + u8 event_subtype; |
---|
| 48 | + u8 type; |
---|
| 49 | + union { |
---|
| 50 | + /* Initiator or send message responder pagefault details. */ |
---|
| 51 | + struct { |
---|
| 52 | + /* Received packet size, only valid for responders. */ |
---|
| 53 | + u32 packet_size; |
---|
| 54 | + /* |
---|
| 55 | + * Number of resource holding WQE, depends on type. |
---|
| 56 | + */ |
---|
| 57 | + u32 wq_num; |
---|
| 58 | + /* |
---|
| 59 | + * WQE index. Refers to either the send queue or |
---|
| 60 | + * receive queue, according to event_subtype. |
---|
| 61 | + */ |
---|
| 62 | + u16 wqe_index; |
---|
| 63 | + } wqe; |
---|
| 64 | + /* RDMA responder pagefault details */ |
---|
| 65 | + struct { |
---|
| 66 | + u32 r_key; |
---|
| 67 | + /* |
---|
| 68 | + * Received packet size, minimal size page fault |
---|
| 69 | + * resolution required for forward progress. |
---|
| 70 | + */ |
---|
| 71 | + u32 packet_size; |
---|
| 72 | + u32 rdma_op_len; |
---|
| 73 | + u64 rdma_va; |
---|
| 74 | + } rdma; |
---|
| 75 | + }; |
---|
| 76 | + |
---|
| 77 | + struct mlx5_ib_pf_eq *eq; |
---|
| 78 | + struct work_struct work; |
---|
| 79 | +}; |
---|
39 | 80 | |
---|
40 | 81 | #define MAX_PREFETCH_LEN (4*1024*1024U) |
---|
41 | 82 | |
---|
.. | .. |
---|
53 | 94 | |
---|
54 | 95 | static u64 mlx5_imr_ksm_entries; |
---|
55 | 96 | |
---|
56 | | -static int check_parent(struct ib_umem_odp *odp, |
---|
57 | | - struct mlx5_ib_mr *parent) |
---|
| 97 | +static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, |
---|
| 98 | + struct mlx5_ib_mr *imr, int flags) |
---|
58 | 99 | { |
---|
59 | | - struct mlx5_ib_mr *mr = odp->private; |
---|
60 | | - |
---|
61 | | - return mr && mr->parent == parent && !odp->dying; |
---|
62 | | -} |
---|
63 | | - |
---|
64 | | -static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) |
---|
65 | | -{ |
---|
66 | | - struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; |
---|
67 | | - struct ib_ucontext *ctx = odp->umem->context; |
---|
68 | | - struct rb_node *rb; |
---|
69 | | - |
---|
70 | | - down_read(&ctx->umem_rwsem); |
---|
71 | | - while (1) { |
---|
72 | | - rb = rb_next(&odp->interval_tree.rb); |
---|
73 | | - if (!rb) |
---|
74 | | - goto not_found; |
---|
75 | | - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); |
---|
76 | | - if (check_parent(odp, parent)) |
---|
77 | | - goto end; |
---|
78 | | - } |
---|
79 | | -not_found: |
---|
80 | | - odp = NULL; |
---|
81 | | -end: |
---|
82 | | - up_read(&ctx->umem_rwsem); |
---|
83 | | - return odp; |
---|
84 | | -} |
---|
85 | | - |
---|
86 | | -static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, |
---|
87 | | - u64 start, u64 length, |
---|
88 | | - struct mlx5_ib_mr *parent) |
---|
89 | | -{ |
---|
90 | | - struct ib_umem_odp *odp; |
---|
91 | | - struct rb_node *rb; |
---|
92 | | - |
---|
93 | | - down_read(&ctx->umem_rwsem); |
---|
94 | | - odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); |
---|
95 | | - if (!odp) |
---|
96 | | - goto end; |
---|
97 | | - |
---|
98 | | - while (1) { |
---|
99 | | - if (check_parent(odp, parent)) |
---|
100 | | - goto end; |
---|
101 | | - rb = rb_next(&odp->interval_tree.rb); |
---|
102 | | - if (!rb) |
---|
103 | | - goto not_found; |
---|
104 | | - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); |
---|
105 | | - if (ib_umem_start(odp->umem) > start + length) |
---|
106 | | - goto not_found; |
---|
107 | | - } |
---|
108 | | -not_found: |
---|
109 | | - odp = NULL; |
---|
110 | | -end: |
---|
111 | | - up_read(&ctx->umem_rwsem); |
---|
112 | | - return odp; |
---|
113 | | -} |
---|
114 | | - |
---|
115 | | -void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, |
---|
116 | | - size_t nentries, struct mlx5_ib_mr *mr, int flags) |
---|
117 | | -{ |
---|
118 | | - struct ib_pd *pd = mr->ibmr.pd; |
---|
119 | | - struct ib_ucontext *ctx = pd->uobject->context; |
---|
120 | | - struct mlx5_ib_dev *dev = to_mdev(pd->device); |
---|
121 | | - struct ib_umem_odp *odp; |
---|
122 | | - unsigned long va; |
---|
123 | | - int i; |
---|
| 100 | + struct mlx5_klm *end = pklm + nentries; |
---|
124 | 101 | |
---|
125 | 102 | if (flags & MLX5_IB_UPD_XLT_ZAP) { |
---|
126 | | - for (i = 0; i < nentries; i++, pklm++) { |
---|
| 103 | + for (; pklm != end; pklm++, idx++) { |
---|
127 | 104 | pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); |
---|
128 | | - pklm->key = cpu_to_be32(dev->null_mkey); |
---|
| 105 | + pklm->key = cpu_to_be32(imr->dev->null_mkey); |
---|
129 | 106 | pklm->va = 0; |
---|
130 | 107 | } |
---|
131 | 108 | return; |
---|
132 | 109 | } |
---|
133 | 110 | |
---|
134 | | - odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, |
---|
135 | | - nentries * MLX5_IMR_MTT_SIZE, mr); |
---|
| 111 | + /* |
---|
| 112 | + * The locking here is pretty subtle. Ideally the implicit_children |
---|
| 113 | + * xarray would be protected by the umem_mutex, however that is not |
---|
| 114 | + * possible. Instead this uses a weaker update-then-lock pattern: |
---|
| 115 | + * |
---|
| 116 | + * srcu_read_lock() |
---|
| 117 | + * xa_store() |
---|
| 118 | + * mutex_lock(umem_mutex) |
---|
| 119 | + * mlx5_ib_update_xlt() |
---|
| 120 | + * mutex_unlock(umem_mutex) |
---|
| 121 | + * destroy lkey |
---|
| 122 | + * |
---|
| 123 | + * i.e. any change to the xarray must be followed by the locked update_xlt |
---|
| 124 | + * before destroying. |
---|
| 125 | + * |
---|
| 126 | + * The umem_mutex provides the acquire/release semantic needed to make |
---|
| 127 | + * the xa_store() visible to a racing thread. While SRCU is not |
---|
| 128 | + * technically required, using it gives consistent use of the SRCU |
---|
| 129 | + * locking around the xarray. |
---|
| 130 | + */ |
---|
| 131 | + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); |
---|
| 132 | + lockdep_assert_held(&imr->dev->odp_srcu); |
---|
136 | 133 | |
---|
137 | | - for (i = 0; i < nentries; i++, pklm++) { |
---|
| 134 | + for (; pklm != end; pklm++, idx++) { |
---|
| 135 | + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); |
---|
| 136 | + |
---|
138 | 137 | pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); |
---|
139 | | - va = (offset + i) * MLX5_IMR_MTT_SIZE; |
---|
140 | | - if (odp && odp->umem->address == va) { |
---|
141 | | - struct mlx5_ib_mr *mtt = odp->private; |
---|
142 | | - |
---|
| 138 | + if (mtt) { |
---|
143 | 139 | pklm->key = cpu_to_be32(mtt->ibmr.lkey); |
---|
144 | | - odp = odp_next(odp); |
---|
| 140 | + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); |
---|
145 | 141 | } else { |
---|
146 | | - pklm->key = cpu_to_be32(dev->null_mkey); |
---|
| 142 | + pklm->key = cpu_to_be32(imr->dev->null_mkey); |
---|
| 143 | + pklm->va = 0; |
---|
147 | 144 | } |
---|
148 | | - mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", |
---|
149 | | - i, va, be32_to_cpu(pklm->key)); |
---|
150 | 145 | } |
---|
151 | 146 | } |
---|
152 | 147 | |
---|
153 | | -static void mr_leaf_free_action(struct work_struct *work) |
---|
| 148 | +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) |
---|
154 | 149 | { |
---|
155 | | - struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); |
---|
156 | | - int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; |
---|
157 | | - struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; |
---|
| 150 | + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; |
---|
158 | 151 | |
---|
159 | | - mr->parent = NULL; |
---|
160 | | - synchronize_srcu(&mr->dev->mr_srcu); |
---|
| 152 | + if (umem_dma & ODP_READ_ALLOWED_BIT) |
---|
| 153 | + mtt_entry |= MLX5_IB_MTT_READ; |
---|
| 154 | + if (umem_dma & ODP_WRITE_ALLOWED_BIT) |
---|
| 155 | + mtt_entry |= MLX5_IB_MTT_WRITE; |
---|
161 | 156 | |
---|
162 | | - ib_umem_release(odp->umem); |
---|
163 | | - if (imr->live) |
---|
164 | | - mlx5_ib_update_xlt(imr, idx, 1, 0, |
---|
165 | | - MLX5_IB_UPD_XLT_INDIRECT | |
---|
166 | | - MLX5_IB_UPD_XLT_ATOMIC); |
---|
167 | | - mlx5_mr_cache_free(mr->dev, mr); |
---|
168 | | - |
---|
169 | | - if (atomic_dec_and_test(&imr->num_leaf_free)) |
---|
170 | | - wake_up(&imr->q_leaf_free); |
---|
| 157 | + return mtt_entry; |
---|
171 | 158 | } |
---|
172 | 159 | |
---|
173 | | -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, |
---|
174 | | - unsigned long end) |
---|
| 160 | +static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, |
---|
| 161 | + struct mlx5_ib_mr *mr, int flags) |
---|
175 | 162 | { |
---|
| 163 | + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); |
---|
| 164 | + dma_addr_t pa; |
---|
| 165 | + size_t i; |
---|
| 166 | + |
---|
| 167 | + if (flags & MLX5_IB_UPD_XLT_ZAP) |
---|
| 168 | + return; |
---|
| 169 | + |
---|
| 170 | + for (i = 0; i < nentries; i++) { |
---|
| 171 | + pa = odp->dma_list[idx + i]; |
---|
| 172 | + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); |
---|
| 173 | + } |
---|
| 174 | +} |
---|
| 175 | + |
---|
| 176 | +void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, |
---|
| 177 | + struct mlx5_ib_mr *mr, int flags) |
---|
| 178 | +{ |
---|
| 179 | + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { |
---|
| 180 | + populate_klm(xlt, idx, nentries, mr, flags); |
---|
| 181 | + } else { |
---|
| 182 | + populate_mtt(xlt, idx, nentries, mr, flags); |
---|
| 183 | + } |
---|
| 184 | +} |
---|
| 185 | + |
---|
| 186 | +static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) |
---|
| 187 | +{ |
---|
| 188 | + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); |
---|
| 189 | + |
---|
| 190 | + /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */ |
---|
| 191 | + mutex_lock(&odp->umem_mutex); |
---|
| 192 | + if (odp->npages) { |
---|
| 193 | + mlx5_mr_cache_invalidate(mr); |
---|
| 194 | + ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp), |
---|
| 195 | + ib_umem_end(odp)); |
---|
| 196 | + WARN_ON(odp->npages); |
---|
| 197 | + } |
---|
| 198 | + odp->private = NULL; |
---|
| 199 | + mutex_unlock(&odp->umem_mutex); |
---|
| 200 | + |
---|
| 201 | + if (!mr->cache_ent) { |
---|
| 202 | + mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); |
---|
| 203 | + WARN_ON(mr->descs); |
---|
| 204 | + } |
---|
| 205 | +} |
---|
| 206 | + |
---|
| 207 | +/* |
---|
| 208 | + * This must be called after the mr has been removed from implicit_children |
---|
| 209 | + * and the SRCU synchronized. NOTE: The MR does not necessarily have to be |
---|
| 210 | + * empty here, parallel page faults could have raced with the free process and |
---|
| 211 | + * added pages to it. |
---|
| 212 | + */ |
---|
| 213 | +static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) |
---|
| 214 | +{ |
---|
| 215 | + struct mlx5_ib_mr *imr = mr->parent; |
---|
| 216 | + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); |
---|
| 217 | + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); |
---|
| 218 | + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; |
---|
| 219 | + int srcu_key; |
---|
| 220 | + |
---|
| 221 | + /* implicit_child_mr's are not allowed to have deferred work */ |
---|
| 222 | + WARN_ON(atomic_read(&mr->num_deferred_work)); |
---|
| 223 | + |
---|
| 224 | + if (need_imr_xlt) { |
---|
| 225 | + srcu_key = srcu_read_lock(&mr->dev->odp_srcu); |
---|
| 226 | + mutex_lock(&odp_imr->umem_mutex); |
---|
| 227 | + mlx5_ib_update_xlt(mr->parent, idx, 1, 0, |
---|
| 228 | + MLX5_IB_UPD_XLT_INDIRECT | |
---|
| 229 | + MLX5_IB_UPD_XLT_ATOMIC); |
---|
| 230 | + mutex_unlock(&odp_imr->umem_mutex); |
---|
| 231 | + srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); |
---|
| 232 | + } |
---|
| 233 | + |
---|
| 234 | + dma_fence_odp_mr(mr); |
---|
| 235 | + |
---|
| 236 | + mr->parent = NULL; |
---|
| 237 | + mlx5_mr_cache_free(mr->dev, mr); |
---|
| 238 | + ib_umem_odp_release(odp); |
---|
| 239 | + if (atomic_dec_and_test(&imr->num_deferred_work)) |
---|
| 240 | + wake_up(&imr->q_deferred_work); |
---|
| 241 | +} |
---|
| 242 | + |
---|
| 243 | +static void free_implicit_child_mr_work(struct work_struct *work) |
---|
| 244 | +{ |
---|
| 245 | + struct mlx5_ib_mr *mr = |
---|
| 246 | + container_of(work, struct mlx5_ib_mr, odp_destroy.work); |
---|
| 247 | + |
---|
| 248 | + free_implicit_child_mr(mr, true); |
---|
| 249 | +} |
---|
| 250 | + |
---|
| 251 | +static void free_implicit_child_mr_rcu(struct rcu_head *head) |
---|
| 252 | +{ |
---|
| 253 | + struct mlx5_ib_mr *mr = |
---|
| 254 | + container_of(head, struct mlx5_ib_mr, odp_destroy.rcu); |
---|
| 255 | + |
---|
| 256 | + /* Freeing a MR is a sleeping operation, so bounce to a work queue */ |
---|
| 257 | + INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); |
---|
| 258 | + queue_work(system_unbound_wq, &mr->odp_destroy.work); |
---|
| 259 | +} |
---|
| 260 | + |
---|
| 261 | +static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) |
---|
| 262 | +{ |
---|
| 263 | + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); |
---|
| 264 | + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; |
---|
| 265 | + struct mlx5_ib_mr *imr = mr->parent; |
---|
| 266 | + |
---|
| 267 | + xa_lock(&imr->implicit_children); |
---|
| 268 | + /* |
---|
| 269 | + * This can race with mlx5_ib_free_implicit_mr(), the first one to |
---|
| 270 | + * reach the xa lock wins the race and destroys the MR. |
---|
| 271 | + */ |
---|
| 272 | + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) != |
---|
| 273 | + mr) |
---|
| 274 | + goto out_unlock; |
---|
| 275 | + |
---|
| 276 | + atomic_inc(&imr->num_deferred_work); |
---|
| 277 | + call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, |
---|
| 278 | + free_implicit_child_mr_rcu); |
---|
| 279 | + |
---|
| 280 | +out_unlock: |
---|
| 281 | + xa_unlock(&imr->implicit_children); |
---|
| 282 | +} |
---|
| 283 | + |
---|
| 284 | +static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, |
---|
| 285 | + const struct mmu_notifier_range *range, |
---|
| 286 | + unsigned long cur_seq) |
---|
| 287 | +{ |
---|
| 288 | + struct ib_umem_odp *umem_odp = |
---|
| 289 | + container_of(mni, struct ib_umem_odp, notifier); |
---|
176 | 290 | struct mlx5_ib_mr *mr; |
---|
177 | 291 | const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / |
---|
178 | 292 | sizeof(struct mlx5_mtt)) - 1; |
---|
179 | 293 | u64 idx = 0, blk_start_idx = 0; |
---|
| 294 | + u64 invalidations = 0; |
---|
| 295 | + unsigned long start; |
---|
| 296 | + unsigned long end; |
---|
180 | 297 | int in_block = 0; |
---|
181 | 298 | u64 addr; |
---|
182 | 299 | |
---|
183 | | - if (!umem || !umem->odp_data) { |
---|
184 | | - pr_err("invalidation called on NULL umem or non-ODP umem\n"); |
---|
185 | | - return; |
---|
186 | | - } |
---|
| 300 | + if (!mmu_notifier_range_blockable(range)) |
---|
| 301 | + return false; |
---|
187 | 302 | |
---|
188 | | - mr = umem->odp_data->private; |
---|
| 303 | + mutex_lock(&umem_odp->umem_mutex); |
---|
| 304 | + mmu_interval_set_seq(mni, cur_seq); |
---|
| 305 | + /* |
---|
| 306 | + * If npages is zero then umem_odp->private may not be set up yet. This |
---|
| 307 | + * does not complete until after the first page is mapped for DMA. |
---|
| 308 | + */ |
---|
| 309 | + if (!umem_odp->npages) |
---|
| 310 | + goto out; |
---|
| 311 | + mr = umem_odp->private; |
---|
189 | 312 | |
---|
190 | | - if (!mr || !mr->ibmr.pd) |
---|
191 | | - return; |
---|
192 | | - |
---|
193 | | - start = max_t(u64, ib_umem_start(umem), start); |
---|
194 | | - end = min_t(u64, ib_umem_end(umem), end); |
---|
| 313 | + start = max_t(u64, ib_umem_start(umem_odp), range->start); |
---|
| 314 | + end = min_t(u64, ib_umem_end(umem_odp), range->end); |
---|
195 | 315 | |
---|
196 | 316 | /* |
---|
197 | 317 | * Iteration one - zap the HW's MTTs. The notifiers_count ensures that |
---|
.. | .. |
---|
199 | 319 | * overwrite the same MTTs. Concurrent invalidations might race us, |
---|
200 | 320 | * but they will write 0s as well, so no difference in the end result. |
---|
201 | 321 | */ |
---|
202 | | - |
---|
203 | | - for (addr = start; addr < end; addr += BIT(umem->page_shift)) { |
---|
204 | | - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; |
---|
| 322 | + for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { |
---|
| 323 | + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; |
---|
205 | 324 | /* |
---|
206 | 325 | * Strive to write the MTTs in chunks, but avoid overwriting |
---|
207 | 327 | * non-existing MTTs. The heuristic here can be improved to |
---|
208 | 327 | * estimate the cost of another UMR vs. the cost of bigger |
---|
209 | 328 | * UMR. |
---|
210 | 329 | */ |
---|
211 | | - if (umem->odp_data->dma_list[idx] & |
---|
| 330 | + if (umem_odp->dma_list[idx] & |
---|
212 | 331 | (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { |
---|
213 | 332 | if (!in_block) { |
---|
214 | 333 | blk_start_idx = idx; |
---|
215 | 334 | in_block = 1; |
---|
216 | 335 | } |
---|
| 336 | + |
---|
| 337 | + /* Count page invalidations */ |
---|
| 338 | + invalidations += idx - blk_start_idx + 1; |
---|
217 | 339 | } else { |
---|
218 | 340 | u64 umr_offset = idx & umr_block_mask; |
---|
219 | 341 | |
---|
.. | .. |
---|
231 | 353 | idx - blk_start_idx + 1, 0, |
---|
232 | 354 | MLX5_IB_UPD_XLT_ZAP | |
---|
233 | 355 | MLX5_IB_UPD_XLT_ATOMIC); |
---|
| 356 | + |
---|
| 357 | + mlx5_update_odp_stats(mr, invalidations, invalidations); |
---|
| 358 | + |
---|
234 | 359 | /* |
---|
235 | 360 | * We are now sure that the device will not access the |
---|
236 | 361 | * memory. We can safely unmap it, and mark it as dirty if |
---|
237 | 362 | * needed. |
---|
238 | 363 | */ |
---|
239 | 364 | |
---|
240 | | - ib_umem_odp_unmap_dma_pages(umem, start, end); |
---|
| 365 | + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); |
---|
241 | 366 | |
---|
242 | | - if (unlikely(!umem->npages && mr->parent && |
---|
243 | | - !umem->odp_data->dying)) { |
---|
244 | | - WRITE_ONCE(umem->odp_data->dying, 1); |
---|
245 | | - atomic_inc(&mr->parent->num_leaf_free); |
---|
246 | | - schedule_work(&umem->odp_data->work); |
---|
247 | | - } |
---|
| 367 | + if (unlikely(!umem_odp->npages && mr->parent)) |
---|
| 368 | + destroy_unused_implicit_child_mr(mr); |
---|
| 369 | +out: |
---|
| 370 | + mutex_unlock(&umem_odp->umem_mutex); |
---|
| 371 | + return true; |
---|
248 | 372 | } |
---|
| 373 | + |
---|
| 374 | +const struct mmu_interval_notifier_ops mlx5_mn_ops = { |
---|
| 375 | + .invalidate = mlx5_ib_invalidate_range, |
---|
| 376 | +}; |
---|
249 | 377 | |
---|
250 | 378 | void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) |
---|
251 | 379 | { |
---|
.. | .. |
---|
253 | 381 | |
---|
254 | 382 | memset(caps, 0, sizeof(*caps)); |
---|
255 | 383 | |
---|
256 | | - if (!MLX5_CAP_GEN(dev->mdev, pg)) |
---|
| 384 | + if (!MLX5_CAP_GEN(dev->mdev, pg) || |
---|
| 385 | + !mlx5_ib_can_load_pas_with_umr(dev, 0)) |
---|
257 | 386 | return; |
---|
258 | 387 | |
---|
259 | 388 | caps->general_caps = IB_ODP_SUPPORT; |
---|
.. | .. |
---|
265 | 394 | |
---|
266 | 395 | if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) |
---|
267 | 396 | caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; |
---|
| 397 | + |
---|
| 398 | + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) |
---|
| 399 | + caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; |
---|
268 | 400 | |
---|
269 | 401 | if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) |
---|
270 | 402 | caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; |
---|
.. | .. |
---|
281 | 413 | if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) |
---|
282 | 414 | caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; |
---|
283 | 415 | |
---|
| 416 | + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) |
---|
| 417 | + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; |
---|
| 418 | + |
---|
| 419 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) |
---|
| 420 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; |
---|
| 421 | + |
---|
| 422 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) |
---|
| 423 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; |
---|
| 424 | + |
---|
| 425 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) |
---|
| 426 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; |
---|
| 427 | + |
---|
| 428 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) |
---|
| 429 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; |
---|
| 430 | + |
---|
| 431 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) |
---|
| 432 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; |
---|
| 433 | + |
---|
| 434 | + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) |
---|
| 435 | + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; |
---|
| 436 | + |
---|
284 | 437 | if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && |
---|
285 | 438 | MLX5_CAP_GEN(dev->mdev, null_mkey) && |
---|
286 | | - MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) |
---|
| 439 | + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && |
---|
| 440 | + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) |
---|
287 | 441 | caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; |
---|
288 | | - |
---|
289 | | - return; |
---|
290 | 442 | } |
---|
291 | 443 | |
---|
292 | 444 | static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, |
---|
.. | .. |
---|
295 | 447 | { |
---|
296 | 448 | int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? |
---|
297 | 449 | pfault->wqe.wq_num : pfault->token; |
---|
298 | | - int ret = mlx5_core_page_fault_resume(dev->mdev, |
---|
299 | | - pfault->token, |
---|
300 | | - wq_num, |
---|
301 | | - pfault->type, |
---|
302 | | - error); |
---|
303 | | - if (ret) |
---|
304 | | - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", |
---|
305 | | - wq_num); |
---|
306 | | -} |
---|
307 | | - |
---|
308 | | -static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, |
---|
309 | | - struct ib_umem *umem, |
---|
310 | | - bool ksm, int access_flags) |
---|
311 | | -{ |
---|
312 | | - struct mlx5_ib_dev *dev = to_mdev(pd->device); |
---|
313 | | - struct mlx5_ib_mr *mr; |
---|
| 450 | + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; |
---|
314 | 451 | int err; |
---|
315 | 452 | |
---|
316 | | - mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : |
---|
317 | | - MLX5_IMR_MTT_CACHE_ENTRY); |
---|
| 453 | + MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); |
---|
| 454 | + MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); |
---|
| 455 | + MLX5_SET(page_fault_resume_in, in, token, pfault->token); |
---|
| 456 | + MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); |
---|
| 457 | + MLX5_SET(page_fault_resume_in, in, error, !!error); |
---|
318 | 458 | |
---|
319 | | - if (IS_ERR(mr)) |
---|
320 | | - return mr; |
---|
321 | | - |
---|
322 | | - mr->ibmr.pd = pd; |
---|
323 | | - |
---|
324 | | - mr->dev = dev; |
---|
325 | | - mr->access_flags = access_flags; |
---|
326 | | - mr->mmkey.iova = 0; |
---|
327 | | - mr->umem = umem; |
---|
328 | | - |
---|
329 | | - if (ksm) { |
---|
330 | | - err = mlx5_ib_update_xlt(mr, 0, |
---|
331 | | - mlx5_imr_ksm_entries, |
---|
332 | | - MLX5_KSM_PAGE_SHIFT, |
---|
333 | | - MLX5_IB_UPD_XLT_INDIRECT | |
---|
334 | | - MLX5_IB_UPD_XLT_ZAP | |
---|
335 | | - MLX5_IB_UPD_XLT_ENABLE); |
---|
336 | | - |
---|
337 | | - } else { |
---|
338 | | - err = mlx5_ib_update_xlt(mr, 0, |
---|
339 | | - MLX5_IMR_MTT_ENTRIES, |
---|
340 | | - PAGE_SHIFT, |
---|
341 | | - MLX5_IB_UPD_XLT_ZAP | |
---|
342 | | - MLX5_IB_UPD_XLT_ENABLE | |
---|
343 | | - MLX5_IB_UPD_XLT_ATOMIC); |
---|
344 | | - } |
---|
345 | | - |
---|
| 459 | + err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); |
---|
346 | 460 | if (err) |
---|
347 | | - goto fail; |
---|
348 | | - |
---|
349 | | - mr->ibmr.lkey = mr->mmkey.key; |
---|
350 | | - mr->ibmr.rkey = mr->mmkey.key; |
---|
351 | | - |
---|
352 | | - mr->live = 1; |
---|
353 | | - |
---|
354 | | - mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", |
---|
355 | | - mr->mmkey.key, dev->mdev, mr); |
---|
356 | | - |
---|
357 | | - return mr; |
---|
358 | | - |
---|
359 | | -fail: |
---|
360 | | - mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); |
---|
361 | | - mlx5_mr_cache_free(dev, mr); |
---|
362 | | - |
---|
363 | | - return ERR_PTR(err); |
---|
| 461 | + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n", |
---|
| 462 | + wq_num, err); |
---|
364 | 463 | } |
---|
365 | 464 | |
---|
366 | | -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, |
---|
367 | | - u64 io_virt, size_t bcnt) |
---|
| 465 | +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, |
---|
| 466 | + unsigned long idx) |
---|
368 | 467 | { |
---|
369 | | - struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; |
---|
370 | | - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); |
---|
371 | | - struct ib_umem_odp *odp, *result = NULL; |
---|
372 | | - u64 addr = io_virt & MLX5_IMR_MTT_MASK; |
---|
373 | | - int nentries = 0, start_idx = 0, ret; |
---|
374 | | - struct mlx5_ib_mr *mtt; |
---|
375 | | - struct ib_umem *umem; |
---|
| 468 | + struct ib_umem_odp *odp; |
---|
| 469 | + struct mlx5_ib_mr *mr; |
---|
| 470 | + struct mlx5_ib_mr *ret; |
---|
| 471 | + int err; |
---|
376 | 472 | |
---|
377 | | - mutex_lock(&mr->umem->odp_data->umem_mutex); |
---|
378 | | - odp = odp_lookup(ctx, addr, 1, mr); |
---|
| 473 | + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), |
---|
| 474 | + idx * MLX5_IMR_MTT_SIZE, |
---|
| 475 | + MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); |
---|
| 476 | + if (IS_ERR(odp)) |
---|
| 477 | + return ERR_CAST(odp); |
---|
379 | 478 | |
---|
380 | | - mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", |
---|
381 | | - io_virt, bcnt, addr, odp); |
---|
| 479 | + ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY, |
---|
| 480 | + imr->access_flags); |
---|
| 481 | + if (IS_ERR(mr)) |
---|
| 482 | + goto out_umem; |
---|
382 | 483 | |
---|
383 | | -next_mr: |
---|
384 | | - if (likely(odp)) { |
---|
385 | | - if (nentries) |
---|
386 | | - nentries++; |
---|
387 | | - } else { |
---|
388 | | - umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); |
---|
389 | | - if (IS_ERR(umem)) { |
---|
390 | | - mutex_unlock(&mr->umem->odp_data->umem_mutex); |
---|
391 | | - return ERR_CAST(umem); |
---|
392 | | - } |
---|
| 484 | + mr->ibmr.pd = imr->ibmr.pd; |
---|
| 485 | + mr->umem = &odp->umem; |
---|
| 486 | + mr->ibmr.lkey = mr->mmkey.key; |
---|
| 487 | + mr->ibmr.rkey = mr->mmkey.key; |
---|
| 488 | + mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; |
---|
| 489 | + mr->parent = imr; |
---|
| 490 | + odp->private = mr; |
---|
393 | 491 | |
---|
394 | | - mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); |
---|
395 | | - if (IS_ERR(mtt)) { |
---|
396 | | - mutex_unlock(&mr->umem->odp_data->umem_mutex); |
---|
397 | | - ib_umem_release(umem); |
---|
398 | | - return ERR_CAST(mtt); |
---|
399 | | - } |
---|
400 | | - |
---|
401 | | - odp = umem->odp_data; |
---|
402 | | - odp->private = mtt; |
---|
403 | | - mtt->umem = umem; |
---|
404 | | - mtt->mmkey.iova = addr; |
---|
405 | | - mtt->parent = mr; |
---|
406 | | - INIT_WORK(&odp->work, mr_leaf_free_action); |
---|
407 | | - |
---|
408 | | - if (!nentries) |
---|
409 | | - start_idx = addr >> MLX5_IMR_MTT_SHIFT; |
---|
410 | | - nentries++; |
---|
| 492 | + err = mlx5_ib_update_xlt(mr, 0, |
---|
| 493 | + MLX5_IMR_MTT_ENTRIES, |
---|
| 494 | + PAGE_SHIFT, |
---|
| 495 | + MLX5_IB_UPD_XLT_ZAP | |
---|
| 496 | + MLX5_IB_UPD_XLT_ENABLE); |
---|
| 497 | + if (err) { |
---|
| 498 | + ret = ERR_PTR(err); |
---|
| 499 | + goto out_mr; |
---|
411 | 500 | } |
---|
412 | 501 | |
---|
413 | | - /* Return first odp if region not covered by single one */ |
---|
414 | | - if (likely(!result)) |
---|
415 | | - result = odp; |
---|
416 | | - |
---|
417 | | - addr += MLX5_IMR_MTT_SIZE; |
---|
418 | | - if (unlikely(addr < io_virt + bcnt)) { |
---|
419 | | - odp = odp_next(odp); |
---|
420 | | - if (odp && odp->umem->address != addr) |
---|
421 | | - odp = NULL; |
---|
422 | | - goto next_mr; |
---|
423 | | - } |
---|
424 | | - |
---|
425 | | - if (unlikely(nentries)) { |
---|
426 | | - ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, |
---|
427 | | - MLX5_IB_UPD_XLT_INDIRECT | |
---|
428 | | - MLX5_IB_UPD_XLT_ATOMIC); |
---|
429 | | - if (ret) { |
---|
430 | | - mlx5_ib_err(dev, "Failed to update PAS\n"); |
---|
431 | | - result = ERR_PTR(ret); |
---|
| 502 | + /* |
---|
| 503 | + * Once the store to either xarray completes any error unwind has to |
---|
| 504 | + * use synchronize_srcu(). Avoid this with xa_reserve() |
---|
| 505 | + */ |
---|
| 506 | + ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, |
---|
| 507 | + GFP_KERNEL); |
---|
| 508 | + if (unlikely(ret)) { |
---|
| 509 | + if (xa_is_err(ret)) { |
---|
| 510 | + ret = ERR_PTR(xa_err(ret)); |
---|
| 511 | + goto out_mr; |
---|
432 | 512 | } |
---|
| 513 | + /* |
---|
| 514 | + * Another thread beat us to creating the child mr, use |
---|
| 515 | + * theirs. |
---|
| 516 | + */ |
---|
| 517 | + goto out_mr; |
---|
433 | 518 | } |
---|
434 | 519 | |
---|
435 | | - mutex_unlock(&mr->umem->odp_data->umem_mutex); |
---|
436 | | - return result; |
---|
| 520 | + mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); |
---|
| 521 | + return mr; |
---|
| 522 | + |
---|
| 523 | +out_mr: |
---|
| 524 | + mlx5_mr_cache_free(imr->dev, mr); |
---|
| 525 | +out_umem: |
---|
| 526 | + ib_umem_odp_release(odp); |
---|
| 527 | + return ret; |
---|
437 | 528 | } |
---|
438 | 529 | |
---|
439 | 530 | struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, |
---|
| 531 | + struct ib_udata *udata, |
---|
440 | 532 | int access_flags) |
---|
441 | 533 | { |
---|
442 | | - struct ib_ucontext *ctx = pd->ibpd.uobject->context; |
---|
| 534 | + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); |
---|
| 535 | + struct ib_umem_odp *umem_odp; |
---|
443 | 536 | struct mlx5_ib_mr *imr; |
---|
444 | | - struct ib_umem *umem; |
---|
| 537 | + int err; |
---|
445 | 538 | |
---|
446 | | - umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); |
---|
447 | | - if (IS_ERR(umem)) |
---|
448 | | - return ERR_CAST(umem); |
---|
| 539 | + umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); |
---|
| 540 | + if (IS_ERR(umem_odp)) |
---|
| 541 | + return ERR_CAST(umem_odp); |
---|
449 | 542 | |
---|
450 | | - imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); |
---|
| 543 | + imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags); |
---|
451 | 544 | if (IS_ERR(imr)) { |
---|
452 | | - ib_umem_release(umem); |
---|
453 | | - return ERR_CAST(imr); |
---|
| 545 | + err = PTR_ERR(imr); |
---|
| 546 | + goto out_umem; |
---|
454 | 547 | } |
---|
455 | 548 | |
---|
456 | | - imr->umem = umem; |
---|
457 | | - init_waitqueue_head(&imr->q_leaf_free); |
---|
458 | | - atomic_set(&imr->num_leaf_free, 0); |
---|
| 549 | + imr->ibmr.pd = &pd->ibpd; |
---|
| 550 | + imr->mmkey.iova = 0; |
---|
| 551 | + imr->umem = &umem_odp->umem; |
---|
| 552 | + imr->ibmr.lkey = imr->mmkey.key; |
---|
| 553 | + imr->ibmr.rkey = imr->mmkey.key; |
---|
| 554 | + imr->umem = &umem_odp->umem; |
---|
| 555 | + imr->is_odp_implicit = true; |
---|
| 556 | + atomic_set(&imr->num_deferred_work, 0); |
---|
| 557 | + init_waitqueue_head(&imr->q_deferred_work); |
---|
| 558 | + xa_init(&imr->implicit_children); |
---|
459 | 559 | |
---|
| 560 | + err = mlx5_ib_update_xlt(imr, 0, |
---|
| 561 | + mlx5_imr_ksm_entries, |
---|
| 562 | + MLX5_KSM_PAGE_SHIFT, |
---|
| 563 | + MLX5_IB_UPD_XLT_INDIRECT | |
---|
| 564 | + MLX5_IB_UPD_XLT_ZAP | |
---|
| 565 | + MLX5_IB_UPD_XLT_ENABLE); |
---|
| 566 | + if (err) |
---|
| 567 | + goto out_mr; |
---|
| 568 | + |
---|
| 569 | + err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), |
---|
| 570 | + &imr->mmkey, GFP_KERNEL)); |
---|
| 571 | + if (err) |
---|
| 572 | + goto out_mr; |
---|
| 573 | + |
---|
| 574 | + mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); |
---|
460 | 575 | return imr; |
---|
461 | | -} |
---|
462 | | - |
---|
463 | | -static int mr_leaf_free(struct ib_umem *umem, u64 start, |
---|
464 | | - u64 end, void *cookie) |
---|
465 | | -{ |
---|
466 | | - struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; |
---|
467 | | - |
---|
468 | | - if (mr->parent != imr) |
---|
469 | | - return 0; |
---|
470 | | - |
---|
471 | | - ib_umem_odp_unmap_dma_pages(umem, |
---|
472 | | - ib_umem_start(umem), |
---|
473 | | - ib_umem_end(umem)); |
---|
474 | | - |
---|
475 | | - if (umem->odp_data->dying) |
---|
476 | | - return 0; |
---|
477 | | - |
---|
478 | | - WRITE_ONCE(umem->odp_data->dying, 1); |
---|
479 | | - atomic_inc(&imr->num_leaf_free); |
---|
480 | | - schedule_work(&umem->odp_data->work); |
---|
481 | | - |
---|
482 | | - return 0; |
---|
| 576 | +out_mr: |
---|
| 577 | + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); |
---|
| 578 | + mlx5_mr_cache_free(dev, imr); |
---|
| 579 | +out_umem: |
---|
| 580 | + ib_umem_odp_release(umem_odp); |
---|
| 581 | + return ERR_PTR(err); |
---|
483 | 582 | } |
---|
484 | 583 | |
---|
485 | 584 | void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) |
---|
486 | 585 | { |
---|
487 | | - struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; |
---|
| 586 | + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); |
---|
| 587 | + struct mlx5_ib_dev *dev = imr->dev; |
---|
| 588 | + struct list_head destroy_list; |
---|
| 589 | + struct mlx5_ib_mr *mtt; |
---|
| 590 | + struct mlx5_ib_mr *tmp; |
---|
| 591 | + unsigned long idx; |
---|
488 | 592 | |
---|
489 | | - down_read(&ctx->umem_rwsem); |
---|
490 | | - rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, |
---|
491 | | - mr_leaf_free, true, imr); |
---|
492 | | - up_read(&ctx->umem_rwsem); |
---|
| 593 | + INIT_LIST_HEAD(&destroy_list); |
---|
493 | 594 | |
---|
494 | | - wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); |
---|
| 595 | + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); |
---|
| 596 | + /* |
---|
| 597 | + * This stops the SRCU protected page fault path from touching either |
---|
| 598 | + * the imr or any children. The page fault path can only reach the |
---|
| 599 | + * children xarray via the imr. |
---|
| 600 | + */ |
---|
| 601 | + synchronize_srcu(&dev->odp_srcu); |
---|
| 602 | + |
---|
| 603 | + /* |
---|
| 604 | + * All work on the prefetch list must be completed, xa_erase() prevented |
---|
| 605 | + * new work from being created. |
---|
| 606 | + */ |
---|
| 607 | + wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work)); |
---|
| 608 | + |
---|
| 609 | + /* |
---|
| 610 | + * At this point it is forbidden for any other thread to enter |
---|
| 611 | + * pagefault_mr() on this imr. It is already forbidden to call |
---|
| 612 | + * pagefault_mr() on an implicit child. Due to this additions to |
---|
| 613 | + * implicit_children are prevented. |
---|
| 614 | + */ |
---|
| 615 | + |
---|
| 616 | + /* |
---|
| 617 | + * Block destroy_unused_implicit_child_mr() from incrementing |
---|
| 618 | + * num_deferred_work. |
---|
| 619 | + */ |
---|
| 620 | + xa_lock(&imr->implicit_children); |
---|
| 621 | + xa_for_each (&imr->implicit_children, idx, mtt) { |
---|
| 622 | + __xa_erase(&imr->implicit_children, idx); |
---|
| 623 | + list_add(&mtt->odp_destroy.elm, &destroy_list); |
---|
| 624 | + } |
---|
| 625 | + xa_unlock(&imr->implicit_children); |
---|
| 626 | + |
---|
| 627 | + /* |
---|
| 628 | + * Wait for any concurrent destroy_unused_implicit_child_mr() to |
---|
| 629 | + * complete. |
---|
| 630 | + */ |
---|
| 631 | + wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work)); |
---|
| 632 | + |
---|
| 633 | + /* |
---|
| 634 | + * Fence the imr before we destroy the children. This allows us to |
---|
| 635 | + * skip updating the XLT of the imr during destroy of the child mkey |
---|
| 636 | + * the imr points to. |
---|
| 637 | + */ |
---|
| 638 | + mlx5_mr_cache_invalidate(imr); |
---|
| 639 | + |
---|
| 640 | + list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) |
---|
| 641 | + free_implicit_child_mr(mtt, false); |
---|
| 642 | + |
---|
| 643 | + mlx5_mr_cache_free(dev, imr); |
---|
| 644 | + ib_umem_odp_release(odp_imr); |
---|
495 | 645 | } |
---|
496 | 646 | |
---|
497 | | -static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, |
---|
498 | | - u64 io_virt, size_t bcnt, u32 *bytes_mapped) |
---|
| 647 | +/** |
---|
| 648 | + * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR |
---|
| 649 | + * @mr: to fence |
---|
| 650 | + * |
---|
| 651 | + * On return no parallel threads will be touching this MR and no DMA will be |
---|
| 652 | + * active. |
---|
| 653 | + */ |
---|
| 654 | +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) |
---|
499 | 655 | { |
---|
| 656 | + /* Prevent new page faults and prefetch requests from succeeding */ |
---|
| 657 | + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); |
---|
| 658 | + |
---|
| 659 | + /* Wait for all running page-fault handlers to finish. */ |
---|
| 660 | + synchronize_srcu(&mr->dev->odp_srcu); |
---|
| 661 | + |
---|
| 662 | + wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work)); |
---|
| 663 | + |
---|
| 664 | + dma_fence_odp_mr(mr); |
---|
| 665 | +} |
---|
| 666 | + |
---|
| 667 | +#define MLX5_PF_FLAGS_DOWNGRADE BIT(1) |
---|
| 668 | +#define MLX5_PF_FLAGS_SNAPSHOT BIT(2) |
---|
| 669 | +#define MLX5_PF_FLAGS_ENABLE BIT(3) |
---|
| 670 | +static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, |
---|
| 671 | + u64 user_va, size_t bcnt, u32 *bytes_mapped, |
---|
| 672 | + u32 flags) |
---|
| 673 | +{ |
---|
| 674 | + int page_shift, ret, np; |
---|
| 675 | + bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; |
---|
500 | 676 | u64 access_mask; |
---|
501 | | - int npages = 0, page_shift, np; |
---|
502 | | - u64 start_idx, page_mask; |
---|
503 | | - struct ib_umem_odp *odp; |
---|
504 | | - int current_seq; |
---|
505 | | - size_t size; |
---|
506 | | - int ret; |
---|
| 677 | + u64 start_idx; |
---|
| 678 | + bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); |
---|
| 679 | + u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; |
---|
507 | 680 | |
---|
508 | | - if (!mr->umem->odp_data->page_list) { |
---|
509 | | - odp = implicit_mr_get_data(mr, io_virt, bcnt); |
---|
| 681 | + if (flags & MLX5_PF_FLAGS_ENABLE) |
---|
| 682 | + xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; |
---|
510 | 683 | |
---|
511 | | - if (IS_ERR(odp)) |
---|
512 | | - return PTR_ERR(odp); |
---|
513 | | - mr = odp->private; |
---|
514 | | - |
---|
515 | | - } else { |
---|
516 | | - odp = mr->umem->odp_data; |
---|
517 | | - } |
---|
518 | | - |
---|
519 | | -next_mr: |
---|
520 | | - size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); |
---|
521 | | - |
---|
522 | | - page_shift = mr->umem->page_shift; |
---|
523 | | - page_mask = ~(BIT(page_shift) - 1); |
---|
524 | | - start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; |
---|
| 684 | + page_shift = odp->page_shift; |
---|
| 685 | + start_idx = (user_va - ib_umem_start(odp)) >> page_shift; |
---|
525 | 686 | access_mask = ODP_READ_ALLOWED_BIT; |
---|
526 | 687 | |
---|
527 | | - if (mr->umem->writable) |
---|
| 688 | + if (odp->umem.writable && !downgrade) |
---|
528 | 689 | access_mask |= ODP_WRITE_ALLOWED_BIT; |
---|
529 | 690 | |
---|
530 | | - current_seq = READ_ONCE(odp->notifiers_seq); |
---|
| 691 | + np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); |
---|
| 692 | + if (np < 0) |
---|
| 693 | + return np; |
---|
| 694 | + |
---|
531 | 695 | /* |
---|
532 | | - * Ensure the sequence number is valid for some time before we call |
---|
533 | | - * gup. |
---|
| 696 | + * No need to check whether the MTTs really belong to this MR, since |
---|
| 697 | + * ib_umem_odp_map_dma_and_lock already checks this. |
---|
534 | 698 | */ |
---|
535 | | - smp_rmb(); |
---|
536 | | - |
---|
537 | | - ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, |
---|
538 | | - access_mask, current_seq); |
---|
539 | | - |
---|
540 | | - if (ret < 0) |
---|
541 | | - goto out; |
---|
542 | | - |
---|
543 | | - np = ret; |
---|
544 | | - |
---|
545 | | - mutex_lock(&odp->umem_mutex); |
---|
546 | | - if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { |
---|
547 | | - /* |
---|
548 | | - * No need to check whether the MTTs really belong to |
---|
549 | | - * this MR, since ib_umem_odp_map_dma_pages already |
---|
550 | | - * checks this. |
---|
551 | | - */ |
---|
552 | | - ret = mlx5_ib_update_xlt(mr, start_idx, np, |
---|
553 | | - page_shift, MLX5_IB_UPD_XLT_ATOMIC); |
---|
554 | | - } else { |
---|
555 | | - ret = -EAGAIN; |
---|
556 | | - } |
---|
| 699 | + ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags); |
---|
557 | 700 | mutex_unlock(&odp->umem_mutex); |
---|
558 | 701 | |
---|
559 | 702 | if (ret < 0) { |
---|
560 | 703 | if (ret != -EAGAIN) |
---|
561 | | - mlx5_ib_err(dev, "Failed to update mkey page tables\n"); |
---|
| 704 | + mlx5_ib_err(mr->dev, |
---|
| 705 | + "Failed to update mkey page tables\n"); |
---|
562 | 706 | goto out; |
---|
563 | 707 | } |
---|
564 | 708 | |
---|
565 | 709 | if (bytes_mapped) { |
---|
566 | 710 | u32 new_mappings = (np << page_shift) - |
---|
567 | | - (io_virt - round_down(io_virt, 1 << page_shift)); |
---|
568 | | - *bytes_mapped += min_t(u32, new_mappings, size); |
---|
| 711 | + (user_va - round_down(user_va, 1 << page_shift)); |
---|
| 712 | + |
---|
| 713 | + *bytes_mapped += min_t(u32, new_mappings, bcnt); |
---|
569 | 714 | } |
---|
570 | 715 | |
---|
571 | | - npages += np << (page_shift - PAGE_SHIFT); |
---|
572 | | - bcnt -= size; |
---|
573 | | - |
---|
574 | | - if (unlikely(bcnt)) { |
---|
575 | | - struct ib_umem_odp *next; |
---|
576 | | - |
---|
577 | | - io_virt += size; |
---|
578 | | - next = odp_next(odp); |
---|
579 | | - if (unlikely(!next || next->umem->address != io_virt)) { |
---|
580 | | - mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", |
---|
581 | | - io_virt, next); |
---|
582 | | - return -EAGAIN; |
---|
583 | | - } |
---|
584 | | - odp = next; |
---|
585 | | - mr = odp->private; |
---|
586 | | - goto next_mr; |
---|
587 | | - } |
---|
588 | | - |
---|
589 | | - return npages; |
---|
| 716 | + return np << (page_shift - PAGE_SHIFT); |
---|
590 | 717 | |
---|
591 | 718 | out: |
---|
592 | | - if (ret == -EAGAIN) { |
---|
593 | | - if (mr->parent || !odp->dying) { |
---|
594 | | - unsigned long timeout = |
---|
595 | | - msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); |
---|
| 719 | + return ret; |
---|
| 720 | +} |
---|
596 | 721 | |
---|
597 | | - if (!wait_for_completion_timeout( |
---|
598 | | - &odp->notifier_completion, |
---|
599 | | - timeout)) { |
---|
600 | | - mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", |
---|
601 | | - current_seq, odp->notifiers_seq); |
---|
| 722 | +static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, |
---|
| 723 | + struct ib_umem_odp *odp_imr, u64 user_va, |
---|
| 724 | + size_t bcnt, u32 *bytes_mapped, u32 flags) |
---|
| 725 | +{ |
---|
| 726 | + unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; |
---|
| 727 | + unsigned long upd_start_idx = end_idx + 1; |
---|
| 728 | + unsigned long upd_len = 0; |
---|
| 729 | + unsigned long npages = 0; |
---|
| 730 | + int err; |
---|
| 731 | + int ret; |
---|
| 732 | + |
---|
| 733 | + if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || |
---|
| 734 | + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) |
---|
| 735 | + return -EFAULT; |
---|
| 736 | + |
---|
| 737 | + /* Fault each child mr that intersects with our interval. */ |
---|
| 738 | + while (bcnt) { |
---|
| 739 | + unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; |
---|
| 740 | + struct ib_umem_odp *umem_odp; |
---|
| 741 | + struct mlx5_ib_mr *mtt; |
---|
| 742 | + u64 len; |
---|
| 743 | + |
---|
| 744 | + mtt = xa_load(&imr->implicit_children, idx); |
---|
| 745 | + if (unlikely(!mtt)) { |
---|
| 746 | + mtt = implicit_get_child_mr(imr, idx); |
---|
| 747 | + if (IS_ERR(mtt)) { |
---|
| 748 | + ret = PTR_ERR(mtt); |
---|
| 749 | + goto out; |
---|
602 | 750 | } |
---|
603 | | - } else { |
---|
604 | | - /* The MR is being killed, kill the QP as well. */ |
---|
605 | | - ret = -EFAULT; |
---|
| 751 | + upd_start_idx = min(upd_start_idx, idx); |
---|
| 752 | + upd_len = idx - upd_start_idx + 1; |
---|
606 | 753 | } |
---|
| 754 | + |
---|
| 755 | + umem_odp = to_ib_umem_odp(mtt->umem); |
---|
| 756 | + len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - |
---|
| 757 | + user_va; |
---|
| 758 | + |
---|
| 759 | + ret = pagefault_real_mr(mtt, umem_odp, user_va, len, |
---|
| 760 | + bytes_mapped, flags); |
---|
| 761 | + if (ret < 0) |
---|
| 762 | + goto out; |
---|
| 763 | + user_va += len; |
---|
| 764 | + bcnt -= len; |
---|
| 765 | + npages += ret; |
---|
607 | 766 | } |
---|
608 | 767 | |
---|
| 768 | + ret = npages; |
---|
| 769 | + |
---|
| 770 | + /* |
---|
| 771 | + * Any time the implicit_children are changed we must perform an |
---|
| 772 | + * update of the xlt before exiting to ensure the HW and the |
---|
| 773 | + * implicit_children remains synchronized. |
---|
| 774 | + */ |
---|
| 775 | +out: |
---|
| 776 | + if (likely(!upd_len)) |
---|
| 777 | + return ret; |
---|
| 778 | + |
---|
| 779 | + /* |
---|
| 780 | + * Notice this is not strictly ordered right, the KSM is updated after |
---|
| 781 | + * the implicit_children is updated, so a parallel page fault could |
---|
| 782 | + * see a MR that is not yet visible in the KSM. This is similar to a |
---|
| 783 | + * parallel page fault seeing a MR that is being concurrently removed |
---|
| 784 | + * from the KSM. Both of these improbable situations are resolved |
---|
| 785 | + * safely by resuming the HW and then taking another page fault. The |
---|
| 786 | + * next pagefault handler will see the new information. |
---|
| 787 | + */ |
---|
| 788 | + mutex_lock(&odp_imr->umem_mutex); |
---|
| 789 | + err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, |
---|
| 790 | + MLX5_IB_UPD_XLT_INDIRECT | |
---|
| 791 | + MLX5_IB_UPD_XLT_ATOMIC); |
---|
| 792 | + mutex_unlock(&odp_imr->umem_mutex); |
---|
| 793 | + if (err) { |
---|
| 794 | + mlx5_ib_err(imr->dev, "Failed to update PAS\n"); |
---|
| 795 | + return err; |
---|
| 796 | + } |
---|
609 | 797 | return ret; |
---|
| 798 | +} |
---|
| 799 | + |
---|
| 800 | +/* |
---|
| 801 | + * Returns: |
---|
| 802 | + * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are |
---|
| 803 | + * not accessible, or the MR is no longer valid. |
---|
| 804 | + * -EAGAIN/-ENOMEM: The operation should be retried |
---|
| 805 | + * |
---|
| 806 | + * -EINVAL/others: General internal malfunction |
---|
| 807 | + * >0: Number of pages mapped |
---|
| 808 | + */ |
---|
| 809 | +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, |
---|
| 810 | + u32 *bytes_mapped, u32 flags) |
---|
| 811 | +{ |
---|
| 812 | + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); |
---|
| 813 | + |
---|
| 814 | + lockdep_assert_held(&mr->dev->odp_srcu); |
---|
| 815 | + if (unlikely(io_virt < mr->mmkey.iova)) |
---|
| 816 | + return -EFAULT; |
---|
| 817 | + |
---|
| 818 | + if (!odp->is_implicit_odp) { |
---|
| 819 | + u64 user_va; |
---|
| 820 | + |
---|
| 821 | + if (check_add_overflow(io_virt - mr->mmkey.iova, |
---|
| 822 | + (u64)odp->umem.address, &user_va)) |
---|
| 823 | + return -EFAULT; |
---|
| 824 | + if (unlikely(user_va >= ib_umem_end(odp) || |
---|
| 825 | + ib_umem_end(odp) - user_va < bcnt)) |
---|
| 826 | + return -EFAULT; |
---|
| 827 | + return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, |
---|
| 828 | + flags); |
---|
| 829 | + } |
---|
| 830 | + return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, |
---|
| 831 | + flags); |
---|
| 832 | +} |
---|
| 833 | + |
---|
| 834 | +int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable) |
---|
| 835 | +{ |
---|
| 836 | + u32 flags = MLX5_PF_FLAGS_SNAPSHOT; |
---|
| 837 | + int ret; |
---|
| 838 | + |
---|
| 839 | + if (enable) |
---|
| 840 | + flags |= MLX5_PF_FLAGS_ENABLE; |
---|
| 841 | + |
---|
| 842 | + ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), |
---|
| 843 | + mr->umem->address, mr->umem->length, NULL, |
---|
| 844 | + flags); |
---|
| 845 | + return ret >= 0 ? 0 : ret; |
---|
610 | 846 | } |
---|
611 | 847 | |
---|
612 | 848 | struct pf_frame { |
---|
.. | .. |
---|
616 | 852 | size_t bcnt; |
---|
617 | 853 | int depth; |
---|
618 | 854 | }; |
---|
| 855 | + |
---|
| 856 | +static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key) |
---|
| 857 | +{ |
---|
| 858 | + if (!mmkey) |
---|
| 859 | + return false; |
---|
| 860 | + if (mmkey->type == MLX5_MKEY_MW) |
---|
| 861 | + return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key); |
---|
| 862 | + return mmkey->key == key; |
---|
| 863 | +} |
---|
| 864 | + |
---|
| 865 | +static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey) |
---|
| 866 | +{ |
---|
| 867 | + struct mlx5_ib_mw *mw; |
---|
| 868 | + struct mlx5_ib_devx_mr *devx_mr; |
---|
| 869 | + |
---|
| 870 | + if (mmkey->type == MLX5_MKEY_MW) { |
---|
| 871 | + mw = container_of(mmkey, struct mlx5_ib_mw, mmkey); |
---|
| 872 | + return mw->ndescs; |
---|
| 873 | + } |
---|
| 874 | + |
---|
| 875 | + devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr, |
---|
| 876 | + mmkey); |
---|
| 877 | + return devx_mr->ndescs; |
---|
| 878 | +} |
---|
619 | 879 | |
---|
620 | 880 | /* |
---|
621 | 881 | * Handle a single data segment in a page-fault WQE or RDMA region. |
---|
.. | .. |
---|
629 | 889 | * abort the page fault handling. |
---|
630 | 890 | */ |
---|
631 | 891 | static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, |
---|
632 | | - u32 key, u64 io_virt, size_t bcnt, |
---|
| 892 | + struct ib_pd *pd, u32 key, |
---|
| 893 | + u64 io_virt, size_t bcnt, |
---|
633 | 894 | u32 *bytes_committed, |
---|
634 | 895 | u32 *bytes_mapped) |
---|
635 | 896 | { |
---|
636 | 897 | int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; |
---|
637 | 898 | struct pf_frame *head = NULL, *frame; |
---|
638 | 899 | struct mlx5_core_mkey *mmkey; |
---|
639 | | - struct mlx5_ib_mw *mw; |
---|
640 | 900 | struct mlx5_ib_mr *mr; |
---|
641 | 901 | struct mlx5_klm *pklm; |
---|
642 | 902 | u32 *out = NULL; |
---|
643 | 903 | size_t offset; |
---|
| 904 | + int ndescs; |
---|
644 | 905 | |
---|
645 | | - srcu_key = srcu_read_lock(&dev->mr_srcu); |
---|
| 906 | + srcu_key = srcu_read_lock(&dev->odp_srcu); |
---|
646 | 907 | |
---|
647 | 908 | io_virt += *bytes_committed; |
---|
648 | 909 | bcnt -= *bytes_committed; |
---|
649 | 910 | |
---|
650 | 911 | next_mr: |
---|
651 | | - mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key)); |
---|
652 | | - if (!mmkey || mmkey->key != key) { |
---|
| 912 | + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); |
---|
| 913 | + if (!mmkey) { |
---|
| 914 | + mlx5_ib_dbg( |
---|
| 915 | + dev, |
---|
| 916 | + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", |
---|
| 917 | + key); |
---|
| 918 | + if (bytes_mapped) |
---|
| 919 | + *bytes_mapped += bcnt; |
---|
| 920 | + /* |
---|
| 921 | + * The user could specify a SGL with multiple lkeys and only |
---|
| 922 | + * some of them are ODP. Treat the non-ODP ones as fully |
---|
| 923 | + * faulted. |
---|
| 924 | + */ |
---|
| 925 | + ret = 0; |
---|
| 926 | + goto srcu_unlock; |
---|
| 927 | + } |
---|
| 928 | + if (!mkey_is_eq(mmkey, key)) { |
---|
653 | 929 | mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); |
---|
654 | 930 | ret = -EFAULT; |
---|
655 | 931 | goto srcu_unlock; |
---|
.. | .. |
---|
658 | 934 | switch (mmkey->type) { |
---|
659 | 935 | case MLX5_MKEY_MR: |
---|
660 | 936 | mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); |
---|
661 | | - if (!mr->live || !mr->ibmr.pd) { |
---|
662 | | - mlx5_ib_dbg(dev, "got dead MR\n"); |
---|
663 | | - ret = -EFAULT; |
---|
664 | | - goto srcu_unlock; |
---|
665 | | - } |
---|
666 | 937 | |
---|
667 | | - ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped); |
---|
| 938 | + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); |
---|
668 | 939 | if (ret < 0) |
---|
669 | 940 | goto srcu_unlock; |
---|
| 941 | + |
---|
| 942 | + mlx5_update_odp_stats(mr, faults, ret); |
---|
670 | 943 | |
---|
671 | 944 | npages += ret; |
---|
672 | 945 | ret = 0; |
---|
673 | 946 | break; |
---|
674 | 947 | |
---|
675 | 948 | case MLX5_MKEY_MW: |
---|
676 | | - mw = container_of(mmkey, struct mlx5_ib_mw, mmkey); |
---|
| 949 | + case MLX5_MKEY_INDIRECT_DEVX: |
---|
| 950 | + ndescs = get_indirect_num_descs(mmkey); |
---|
677 | 951 | |
---|
678 | 952 | if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) { |
---|
679 | 953 | mlx5_ib_dbg(dev, "indirection level exceeded\n"); |
---|
.. | .. |
---|
682 | 956 | } |
---|
683 | 957 | |
---|
684 | 958 | outlen = MLX5_ST_SZ_BYTES(query_mkey_out) + |
---|
685 | | - sizeof(*pklm) * (mw->ndescs - 2); |
---|
| 959 | + sizeof(*pklm) * (ndescs - 2); |
---|
686 | 960 | |
---|
687 | 961 | if (outlen > cur_outlen) { |
---|
688 | 962 | kfree(out); |
---|
.. | .. |
---|
697 | 971 | pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out, |
---|
698 | 972 | bsf0_klm0_pas_mtt0_1); |
---|
699 | 973 | |
---|
700 | | - ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen); |
---|
| 974 | + ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen); |
---|
701 | 975 | if (ret) |
---|
702 | 976 | goto srcu_unlock; |
---|
703 | 977 | |
---|
704 | 978 | offset = io_virt - MLX5_GET64(query_mkey_out, out, |
---|
705 | 979 | memory_key_mkey_entry.start_addr); |
---|
706 | 980 | |
---|
707 | | - for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) { |
---|
| 981 | + for (i = 0; bcnt && i < ndescs; i++, pklm++) { |
---|
708 | 982 | if (offset >= be32_to_cpu(pklm->bcount)) { |
---|
709 | 983 | offset -= be32_to_cpu(pklm->bcount); |
---|
710 | 984 | continue; |
---|
.. | .. |
---|
756 | 1030 | } |
---|
757 | 1031 | kfree(out); |
---|
758 | 1032 | |
---|
759 | | - srcu_read_unlock(&dev->mr_srcu, srcu_key); |
---|
| 1033 | + srcu_read_unlock(&dev->odp_srcu, srcu_key); |
---|
760 | 1034 | *bytes_committed = 0; |
---|
761 | 1035 | return ret ? ret : npages; |
---|
762 | 1036 | } |
---|
.. | .. |
---|
764 | 1038 | /** |
---|
765 | 1039 | * Parse a series of data segments for page fault handling. |
---|
766 | 1040 | * |
---|
767 | | - * @qp the QP on which the fault occurred. |
---|
768 | 1041 | * @pfault contains page fault information. |
---|
769 | 1042 | * @wqe points at the first data segment in the WQE. |
---|
770 | 1043 | * @wqe_end points after the end of the WQE. |
---|
.. | .. |
---|
781 | 1054 | */ |
---|
782 | 1055 | static int pagefault_data_segments(struct mlx5_ib_dev *dev, |
---|
783 | 1056 | struct mlx5_pagefault *pfault, |
---|
784 | | - struct mlx5_ib_qp *qp, void *wqe, |
---|
| 1057 | + void *wqe, |
---|
785 | 1058 | void *wqe_end, u32 *bytes_mapped, |
---|
786 | | - u32 *total_wqe_bytes, int receive_queue) |
---|
| 1059 | + u32 *total_wqe_bytes, bool receive_queue) |
---|
787 | 1060 | { |
---|
788 | 1061 | int ret = 0, npages = 0; |
---|
789 | 1062 | u64 io_virt; |
---|
.. | .. |
---|
791 | 1064 | u32 byte_count; |
---|
792 | 1065 | size_t bcnt; |
---|
793 | 1066 | int inline_segment; |
---|
794 | | - |
---|
795 | | - /* Skip SRQ next-WQE segment. */ |
---|
796 | | - if (receive_queue && qp->ibqp.srq) |
---|
797 | | - wqe += sizeof(struct mlx5_wqe_srq_next_seg); |
---|
798 | 1067 | |
---|
799 | 1068 | if (bytes_mapped) |
---|
800 | 1069 | *bytes_mapped = 0; |
---|
.. | .. |
---|
839 | 1108 | continue; |
---|
840 | 1109 | } |
---|
841 | 1110 | |
---|
842 | | - ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, |
---|
| 1111 | + ret = pagefault_single_data_segment(dev, NULL, key, |
---|
| 1112 | + io_virt, bcnt, |
---|
843 | 1113 | &pfault->bytes_committed, |
---|
844 | 1114 | bytes_mapped); |
---|
845 | 1115 | if (ret < 0) |
---|
.. | .. |
---|
849 | 1119 | |
---|
850 | 1120 | return ret < 0 ? ret : npages; |
---|
851 | 1121 | } |
---|
852 | | - |
---|
853 | | -static const u32 mlx5_ib_odp_opcode_cap[] = { |
---|
854 | | - [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, |
---|
855 | | - [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, |
---|
856 | | - [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, |
---|
857 | | - [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, |
---|
858 | | - [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, |
---|
859 | | - [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, |
---|
860 | | - [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, |
---|
861 | | - [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, |
---|
862 | | -}; |
---|
863 | 1122 | |
---|
864 | 1123 | /* |
---|
865 | 1124 | * Parse initiator WQE. Advances the wqe pointer to point at the |
---|
.. | .. |
---|
871 | 1130 | { |
---|
872 | 1131 | struct mlx5_wqe_ctrl_seg *ctrl = *wqe; |
---|
873 | 1132 | u16 wqe_index = pfault->wqe.wqe_index; |
---|
874 | | - u32 transport_caps; |
---|
875 | 1133 | struct mlx5_base_av *av; |
---|
876 | 1134 | unsigned ds, opcode; |
---|
877 | | -#if defined(DEBUG) |
---|
878 | | - u32 ctrl_wqe_index, ctrl_qpn; |
---|
879 | | -#endif |
---|
880 | 1135 | u32 qpn = qp->trans_qp.base.mqp.qpn; |
---|
881 | 1136 | |
---|
882 | 1137 | ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; |
---|
.. | .. |
---|
892 | 1147 | return -EFAULT; |
---|
893 | 1148 | } |
---|
894 | 1149 | |
---|
895 | | -#if defined(DEBUG) |
---|
896 | | - ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & |
---|
897 | | - MLX5_WQE_CTRL_WQE_INDEX_MASK) >> |
---|
898 | | - MLX5_WQE_CTRL_WQE_INDEX_SHIFT; |
---|
899 | | - if (wqe_index != ctrl_wqe_index) { |
---|
900 | | - mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", |
---|
901 | | - wqe_index, qpn, |
---|
902 | | - ctrl_wqe_index); |
---|
903 | | - return -EFAULT; |
---|
904 | | - } |
---|
905 | | - |
---|
906 | | - ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> |
---|
907 | | - MLX5_WQE_CTRL_QPN_SHIFT; |
---|
908 | | - if (qpn != ctrl_qpn) { |
---|
909 | | - mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", |
---|
910 | | - wqe_index, qpn, |
---|
911 | | - ctrl_qpn); |
---|
912 | | - return -EFAULT; |
---|
913 | | - } |
---|
914 | | -#endif /* DEBUG */ |
---|
915 | | - |
---|
916 | 1150 | *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; |
---|
917 | 1151 | *wqe += sizeof(*ctrl); |
---|
918 | 1152 | |
---|
919 | 1153 | opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & |
---|
920 | 1154 | MLX5_WQE_CTRL_OPCODE_MASK; |
---|
921 | 1155 | |
---|
922 | | - switch (qp->ibqp.qp_type) { |
---|
923 | | - case IB_QPT_RC: |
---|
924 | | - transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; |
---|
925 | | - break; |
---|
926 | | - case IB_QPT_UD: |
---|
927 | | - transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; |
---|
928 | | - break; |
---|
929 | | - default: |
---|
930 | | - mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", |
---|
931 | | - qp->ibqp.qp_type); |
---|
932 | | - return -EFAULT; |
---|
933 | | - } |
---|
| 1156 | + if (qp->ibqp.qp_type == IB_QPT_XRC_INI) |
---|
| 1157 | + *wqe += sizeof(struct mlx5_wqe_xrc_seg); |
---|
934 | 1158 | |
---|
935 | | - if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) || |
---|
936 | | - !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { |
---|
937 | | - mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", |
---|
938 | | - opcode); |
---|
939 | | - return -EFAULT; |
---|
940 | | - } |
---|
941 | | - |
---|
942 | | - if (qp->ibqp.qp_type != IB_QPT_RC) { |
---|
| 1159 | + if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) { |
---|
943 | 1160 | av = *wqe; |
---|
944 | 1161 | if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) |
---|
945 | 1162 | *wqe += sizeof(struct mlx5_av); |
---|
.. | .. |
---|
964 | 1181 | } |
---|
965 | 1182 | |
---|
966 | 1183 | /* |
---|
967 | | - * Parse responder WQE. Advances the wqe pointer to point at the |
---|
968 | | - * scatter-gather list, and set wqe_end to the end of the WQE. |
---|
| 1184 | + * Parse responder WQE and set wqe_end to the end of the WQE. |
---|
969 | 1185 | */ |
---|
970 | | -static int mlx5_ib_mr_responder_pfault_handler( |
---|
971 | | - struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, |
---|
972 | | - struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) |
---|
| 1186 | +static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev, |
---|
| 1187 | + struct mlx5_ib_srq *srq, |
---|
| 1188 | + void **wqe, void **wqe_end, |
---|
| 1189 | + int wqe_length) |
---|
| 1190 | +{ |
---|
| 1191 | + int wqe_size = 1 << srq->msrq.wqe_shift; |
---|
| 1192 | + |
---|
| 1193 | + if (wqe_size > wqe_length) { |
---|
| 1194 | + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); |
---|
| 1195 | + return -EFAULT; |
---|
| 1196 | + } |
---|
| 1197 | + |
---|
| 1198 | + *wqe_end = *wqe + wqe_size; |
---|
| 1199 | + *wqe += sizeof(struct mlx5_wqe_srq_next_seg); |
---|
| 1200 | + |
---|
| 1201 | + return 0; |
---|
| 1202 | +} |
---|
| 1203 | + |
---|
| 1204 | +static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev, |
---|
| 1205 | + struct mlx5_ib_qp *qp, |
---|
| 1206 | + void *wqe, void **wqe_end, |
---|
| 1207 | + int wqe_length) |
---|
973 | 1208 | { |
---|
974 | 1209 | struct mlx5_ib_wq *wq = &qp->rq; |
---|
975 | 1210 | int wqe_size = 1 << wq->wqe_shift; |
---|
976 | 1211 | |
---|
977 | | - if (qp->ibqp.srq) { |
---|
978 | | - mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); |
---|
979 | | - return -EFAULT; |
---|
980 | | - } |
---|
981 | | - |
---|
982 | | - if (qp->wq_sig) { |
---|
| 1212 | + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { |
---|
983 | 1213 | mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); |
---|
984 | 1214 | return -EFAULT; |
---|
985 | 1215 | } |
---|
.. | .. |
---|
989 | 1219 | return -EFAULT; |
---|
990 | 1220 | } |
---|
991 | 1221 | |
---|
992 | | - switch (qp->ibqp.qp_type) { |
---|
993 | | - case IB_QPT_RC: |
---|
994 | | - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & |
---|
995 | | - IB_ODP_SUPPORT_RECV)) |
---|
996 | | - goto invalid_transport_or_opcode; |
---|
997 | | - break; |
---|
998 | | - default: |
---|
999 | | -invalid_transport_or_opcode: |
---|
1000 | | - mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", |
---|
1001 | | - qp->ibqp.qp_type); |
---|
1002 | | - return -EFAULT; |
---|
1003 | | - } |
---|
1004 | | - |
---|
1005 | | - *wqe_end = *wqe + wqe_size; |
---|
| 1222 | + *wqe_end = wqe + wqe_size; |
---|
1006 | 1223 | |
---|
1007 | 1224 | return 0; |
---|
1008 | 1225 | } |
---|
1009 | 1226 | |
---|
1010 | | -static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, |
---|
1011 | | - u32 wq_num) |
---|
| 1227 | +static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, |
---|
| 1228 | + u32 wq_num, int pf_type) |
---|
1012 | 1229 | { |
---|
1013 | | - struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); |
---|
| 1230 | + struct mlx5_core_rsc_common *common = NULL; |
---|
| 1231 | + struct mlx5_core_srq *srq; |
---|
1014 | 1232 | |
---|
1015 | | - if (!mqp) { |
---|
1016 | | - mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); |
---|
1017 | | - return NULL; |
---|
| 1233 | + switch (pf_type) { |
---|
| 1234 | + case MLX5_WQE_PF_TYPE_RMP: |
---|
| 1235 | + srq = mlx5_cmd_get_srq(dev, wq_num); |
---|
| 1236 | + if (srq) |
---|
| 1237 | + common = &srq->common; |
---|
| 1238 | + break; |
---|
| 1239 | + case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: |
---|
| 1240 | + case MLX5_WQE_PF_TYPE_RESP: |
---|
| 1241 | + case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: |
---|
| 1242 | + common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP); |
---|
| 1243 | + break; |
---|
| 1244 | + default: |
---|
| 1245 | + break; |
---|
1018 | 1246 | } |
---|
1019 | 1247 | |
---|
| 1248 | + return common; |
---|
| 1249 | +} |
---|
| 1250 | + |
---|
| 1251 | +static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res) |
---|
| 1252 | +{ |
---|
| 1253 | + struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res; |
---|
| 1254 | + |
---|
1020 | 1255 | return to_mibqp(mqp); |
---|
| 1256 | +} |
---|
| 1257 | + |
---|
| 1258 | +static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res) |
---|
| 1259 | +{ |
---|
| 1260 | + struct mlx5_core_srq *msrq = |
---|
| 1261 | + container_of(res, struct mlx5_core_srq, common); |
---|
| 1262 | + |
---|
| 1263 | + return to_mibsrq(msrq); |
---|
1021 | 1264 | } |
---|
1022 | 1265 | |
---|
1023 | 1266 | static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, |
---|
1024 | 1267 | struct mlx5_pagefault *pfault) |
---|
1025 | 1268 | { |
---|
1026 | | - int ret; |
---|
1027 | | - void *wqe, *wqe_end; |
---|
1028 | | - u32 bytes_mapped, total_wqe_bytes; |
---|
1029 | | - char *buffer = NULL; |
---|
1030 | | - int resume_with_error = 1; |
---|
| 1269 | + bool sq = pfault->type & MLX5_PFAULT_REQUESTOR; |
---|
1031 | 1270 | u16 wqe_index = pfault->wqe.wqe_index; |
---|
1032 | | - int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; |
---|
| 1271 | + void *wqe, *wqe_start = NULL, *wqe_end = NULL; |
---|
| 1272 | + u32 bytes_mapped, total_wqe_bytes; |
---|
| 1273 | + struct mlx5_core_rsc_common *res; |
---|
| 1274 | + int resume_with_error = 1; |
---|
1033 | 1275 | struct mlx5_ib_qp *qp; |
---|
| 1276 | + size_t bytes_copied; |
---|
| 1277 | + int ret = 0; |
---|
1034 | 1278 | |
---|
1035 | | - buffer = (char *)__get_free_page(GFP_KERNEL); |
---|
1036 | | - if (!buffer) { |
---|
| 1279 | + res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type); |
---|
| 1280 | + if (!res) { |
---|
| 1281 | + mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num); |
---|
| 1282 | + return; |
---|
| 1283 | + } |
---|
| 1284 | + |
---|
| 1285 | + if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ && |
---|
| 1286 | + res->res != MLX5_RES_XSRQ) { |
---|
| 1287 | + mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", |
---|
| 1288 | + pfault->type); |
---|
| 1289 | + goto resolve_page_fault; |
---|
| 1290 | + } |
---|
| 1291 | + |
---|
| 1292 | + wqe_start = (void *)__get_free_page(GFP_KERNEL); |
---|
| 1293 | + if (!wqe_start) { |
---|
1037 | 1294 | mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); |
---|
1038 | 1295 | goto resolve_page_fault; |
---|
1039 | 1296 | } |
---|
1040 | 1297 | |
---|
1041 | | - qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); |
---|
1042 | | - if (!qp) |
---|
1043 | | - goto resolve_page_fault; |
---|
| 1298 | + wqe = wqe_start; |
---|
| 1299 | + qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; |
---|
| 1300 | + if (qp && sq) { |
---|
| 1301 | + ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, |
---|
| 1302 | + &bytes_copied); |
---|
| 1303 | + if (ret) |
---|
| 1304 | + goto read_user; |
---|
| 1305 | + ret = mlx5_ib_mr_initiator_pfault_handler( |
---|
| 1306 | + dev, pfault, qp, &wqe, &wqe_end, bytes_copied); |
---|
| 1307 | + } else if (qp && !sq) { |
---|
| 1308 | + ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, |
---|
| 1309 | + &bytes_copied); |
---|
| 1310 | + if (ret) |
---|
| 1311 | + goto read_user; |
---|
| 1312 | + ret = mlx5_ib_mr_responder_pfault_handler_rq( |
---|
| 1313 | + dev, qp, wqe, &wqe_end, bytes_copied); |
---|
| 1314 | + } else if (!qp) { |
---|
| 1315 | + struct mlx5_ib_srq *srq = res_to_srq(res); |
---|
1044 | 1316 | |
---|
1045 | | - ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, |
---|
1046 | | - PAGE_SIZE, &qp->trans_qp.base); |
---|
1047 | | - if (ret < 0) { |
---|
1048 | | - mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", |
---|
1049 | | - ret, wqe_index, pfault->token); |
---|
1050 | | - goto resolve_page_fault; |
---|
| 1317 | + ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, |
---|
| 1318 | + &bytes_copied); |
---|
| 1319 | + if (ret) |
---|
| 1320 | + goto read_user; |
---|
| 1321 | + ret = mlx5_ib_mr_responder_pfault_handler_srq( |
---|
| 1322 | + dev, srq, &wqe, &wqe_end, bytes_copied); |
---|
1051 | 1323 | } |
---|
1052 | 1324 | |
---|
1053 | | - wqe = buffer; |
---|
1054 | | - if (requestor) |
---|
1055 | | - ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, |
---|
1056 | | - &wqe_end, ret); |
---|
1057 | | - else |
---|
1058 | | - ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, |
---|
1059 | | - &wqe_end, ret); |
---|
1060 | | - if (ret < 0) |
---|
| 1325 | + if (ret < 0 || wqe >= wqe_end) |
---|
1061 | 1326 | goto resolve_page_fault; |
---|
1062 | 1327 | |
---|
1063 | | - if (wqe >= wqe_end) { |
---|
1064 | | - mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); |
---|
1065 | | - goto resolve_page_fault; |
---|
1066 | | - } |
---|
| 1328 | + ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped, |
---|
| 1329 | + &total_wqe_bytes, !sq); |
---|
| 1330 | + if (ret == -EAGAIN) |
---|
| 1331 | + goto out; |
---|
1067 | 1332 | |
---|
1068 | | - ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, |
---|
1069 | | - &bytes_mapped, &total_wqe_bytes, |
---|
1070 | | - !requestor); |
---|
1071 | | - if (ret == -EAGAIN) { |
---|
1072 | | - resume_with_error = 0; |
---|
| 1333 | + if (ret < 0 || total_wqe_bytes > bytes_mapped) |
---|
1073 | 1334 | goto resolve_page_fault; |
---|
1074 | | - } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { |
---|
1075 | | - goto resolve_page_fault; |
---|
1076 | | - } |
---|
1077 | 1335 | |
---|
| 1336 | +out: |
---|
| 1337 | + ret = 0; |
---|
1078 | 1338 | resume_with_error = 0; |
---|
| 1339 | + |
---|
| 1340 | +read_user: |
---|
| 1341 | + if (ret) |
---|
| 1342 | + mlx5_ib_err( |
---|
| 1343 | + dev, |
---|
| 1344 | + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", |
---|
| 1345 | + ret, wqe_index, pfault->token); |
---|
| 1346 | + |
---|
1079 | 1347 | resolve_page_fault: |
---|
1080 | 1348 | mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); |
---|
1081 | 1349 | mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", |
---|
1082 | 1350 | pfault->wqe.wq_num, resume_with_error, |
---|
1083 | 1351 | pfault->type); |
---|
1084 | | - free_page((unsigned long)buffer); |
---|
| 1352 | + mlx5_core_res_put(res); |
---|
| 1353 | + free_page((unsigned long)wqe_start); |
---|
1085 | 1354 | } |
---|
1086 | 1355 | |
---|
1087 | 1356 | static int pages_in_range(u64 address, u32 length) |
---|
.. | .. |
---|
1123 | 1392 | prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); |
---|
1124 | 1393 | } |
---|
1125 | 1394 | |
---|
1126 | | - ret = pagefault_single_data_segment(dev, rkey, address, length, |
---|
| 1395 | + ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, |
---|
1127 | 1396 | &pfault->bytes_committed, NULL); |
---|
1128 | 1397 | if (ret == -EAGAIN) { |
---|
1129 | 1398 | /* We're racing with an invalidation, don't prefetch */ |
---|
.. | .. |
---|
1149 | 1418 | if (prefetch_activated) { |
---|
1150 | 1419 | u32 bytes_committed = 0; |
---|
1151 | 1420 | |
---|
1152 | | - ret = pagefault_single_data_segment(dev, rkey, address, |
---|
| 1421 | + ret = pagefault_single_data_segment(dev, NULL, rkey, address, |
---|
1153 | 1422 | prefetch_len, |
---|
1154 | 1423 | &bytes_committed, NULL); |
---|
1155 | 1424 | if (ret < 0 && ret != -EAGAIN) { |
---|
.. | .. |
---|
1159 | 1428 | } |
---|
1160 | 1429 | } |
---|
1161 | 1430 | |
---|
1162 | | -void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, |
---|
1163 | | - struct mlx5_pagefault *pfault) |
---|
| 1431 | +static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) |
---|
1164 | 1432 | { |
---|
1165 | | - struct mlx5_ib_dev *dev = context; |
---|
1166 | 1433 | u8 event_subtype = pfault->event_subtype; |
---|
1167 | 1434 | |
---|
1168 | 1435 | switch (event_subtype) { |
---|
.. | .. |
---|
1177 | 1444 | event_subtype); |
---|
1178 | 1445 | mlx5_ib_page_fault_resume(dev, pfault, 1); |
---|
1179 | 1446 | } |
---|
| 1447 | +} |
---|
| 1448 | + |
---|
| 1449 | +static void mlx5_ib_eqe_pf_action(struct work_struct *work) |
---|
| 1450 | +{ |
---|
| 1451 | + struct mlx5_pagefault *pfault = container_of(work, |
---|
| 1452 | + struct mlx5_pagefault, |
---|
| 1453 | + work); |
---|
| 1454 | + struct mlx5_ib_pf_eq *eq = pfault->eq; |
---|
| 1455 | + |
---|
| 1456 | + mlx5_ib_pfault(eq->dev, pfault); |
---|
| 1457 | + mempool_free(pfault, eq->pool); |
---|
| 1458 | +} |
---|
| 1459 | + |
---|
| 1460 | +static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) |
---|
| 1461 | +{ |
---|
| 1462 | + struct mlx5_eqe_page_fault *pf_eqe; |
---|
| 1463 | + struct mlx5_pagefault *pfault; |
---|
| 1464 | + struct mlx5_eqe *eqe; |
---|
| 1465 | + int cc = 0; |
---|
| 1466 | + |
---|
| 1467 | + while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) { |
---|
| 1468 | + pfault = mempool_alloc(eq->pool, GFP_ATOMIC); |
---|
| 1469 | + if (!pfault) { |
---|
| 1470 | + schedule_work(&eq->work); |
---|
| 1471 | + break; |
---|
| 1472 | + } |
---|
| 1473 | + |
---|
| 1474 | + pf_eqe = &eqe->data.page_fault; |
---|
| 1475 | + pfault->event_subtype = eqe->sub_type; |
---|
| 1476 | + pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); |
---|
| 1477 | + |
---|
| 1478 | + mlx5_ib_dbg(eq->dev, |
---|
| 1479 | + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", |
---|
| 1480 | + eqe->sub_type, pfault->bytes_committed); |
---|
| 1481 | + |
---|
| 1482 | + switch (eqe->sub_type) { |
---|
| 1483 | + case MLX5_PFAULT_SUBTYPE_RDMA: |
---|
| 1484 | + /* RDMA based event */ |
---|
| 1485 | + pfault->type = |
---|
| 1486 | + be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; |
---|
| 1487 | + pfault->token = |
---|
| 1488 | + be32_to_cpu(pf_eqe->rdma.pftype_token) & |
---|
| 1489 | + MLX5_24BIT_MASK; |
---|
| 1490 | + pfault->rdma.r_key = |
---|
| 1491 | + be32_to_cpu(pf_eqe->rdma.r_key); |
---|
| 1492 | + pfault->rdma.packet_size = |
---|
| 1493 | + be16_to_cpu(pf_eqe->rdma.packet_length); |
---|
| 1494 | + pfault->rdma.rdma_op_len = |
---|
| 1495 | + be32_to_cpu(pf_eqe->rdma.rdma_op_len); |
---|
| 1496 | + pfault->rdma.rdma_va = |
---|
| 1497 | + be64_to_cpu(pf_eqe->rdma.rdma_va); |
---|
| 1498 | + mlx5_ib_dbg(eq->dev, |
---|
| 1499 | + "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", |
---|
| 1500 | + pfault->type, pfault->token, |
---|
| 1501 | + pfault->rdma.r_key); |
---|
| 1502 | + mlx5_ib_dbg(eq->dev, |
---|
| 1503 | + "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", |
---|
| 1504 | + pfault->rdma.rdma_op_len, |
---|
| 1505 | + pfault->rdma.rdma_va); |
---|
| 1506 | + break; |
---|
| 1507 | + |
---|
| 1508 | + case MLX5_PFAULT_SUBTYPE_WQE: |
---|
| 1509 | + /* WQE based event */ |
---|
| 1510 | + pfault->type = |
---|
| 1511 | + (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; |
---|
| 1512 | + pfault->token = |
---|
| 1513 | + be32_to_cpu(pf_eqe->wqe.token); |
---|
| 1514 | + pfault->wqe.wq_num = |
---|
| 1515 | + be32_to_cpu(pf_eqe->wqe.pftype_wq) & |
---|
| 1516 | + MLX5_24BIT_MASK; |
---|
| 1517 | + pfault->wqe.wqe_index = |
---|
| 1518 | + be16_to_cpu(pf_eqe->wqe.wqe_index); |
---|
| 1519 | + pfault->wqe.packet_size = |
---|
| 1520 | + be16_to_cpu(pf_eqe->wqe.packet_length); |
---|
| 1521 | + mlx5_ib_dbg(eq->dev, |
---|
| 1522 | + "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", |
---|
| 1523 | + pfault->type, pfault->token, |
---|
| 1524 | + pfault->wqe.wq_num, |
---|
| 1525 | + pfault->wqe.wqe_index); |
---|
| 1526 | + break; |
---|
| 1527 | + |
---|
| 1528 | + default: |
---|
| 1529 | + mlx5_ib_warn(eq->dev, |
---|
| 1530 | + "Unsupported page fault event sub-type: 0x%02hhx\n", |
---|
| 1531 | + eqe->sub_type); |
---|
| 1532 | + /* Unsupported page faults should still be |
---|
| 1533 | + * resolved by the page fault handler |
---|
| 1534 | + */ |
---|
| 1535 | + } |
---|
| 1536 | + |
---|
| 1537 | + pfault->eq = eq; |
---|
| 1538 | + INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action); |
---|
| 1539 | + queue_work(eq->wq, &pfault->work); |
---|
| 1540 | + |
---|
| 1541 | + cc = mlx5_eq_update_cc(eq->core, ++cc); |
---|
| 1542 | + } |
---|
| 1543 | + |
---|
| 1544 | + mlx5_eq_update_ci(eq->core, cc, 1); |
---|
| 1545 | +} |
---|
| 1546 | + |
---|
| 1547 | +static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type, |
---|
| 1548 | + void *data) |
---|
| 1549 | +{ |
---|
| 1550 | + struct mlx5_ib_pf_eq *eq = |
---|
| 1551 | + container_of(nb, struct mlx5_ib_pf_eq, irq_nb); |
---|
| 1552 | + unsigned long flags; |
---|
| 1553 | + |
---|
| 1554 | + if (spin_trylock_irqsave(&eq->lock, flags)) { |
---|
| 1555 | + mlx5_ib_eq_pf_process(eq); |
---|
| 1556 | + spin_unlock_irqrestore(&eq->lock, flags); |
---|
| 1557 | + } else { |
---|
| 1558 | + schedule_work(&eq->work); |
---|
| 1559 | + } |
---|
| 1560 | + |
---|
| 1561 | + return IRQ_HANDLED; |
---|
| 1562 | +} |
---|
| 1563 | + |
---|
| 1564 | +/* mempool_refill() was proposed but unfortunately wasn't accepted |
---|
| 1565 | + * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html |
---|
| 1566 | + * Cheap workaround. |
---|
| 1567 | + */ |
---|
| 1568 | +static void mempool_refill(mempool_t *pool) |
---|
| 1569 | +{ |
---|
| 1570 | + while (pool->curr_nr < pool->min_nr) |
---|
| 1571 | + mempool_free(mempool_alloc(pool, GFP_KERNEL), pool); |
---|
| 1572 | +} |
---|
| 1573 | + |
---|
| 1574 | +static void mlx5_ib_eq_pf_action(struct work_struct *work) |
---|
| 1575 | +{ |
---|
| 1576 | + struct mlx5_ib_pf_eq *eq = |
---|
| 1577 | + container_of(work, struct mlx5_ib_pf_eq, work); |
---|
| 1578 | + |
---|
| 1579 | + mempool_refill(eq->pool); |
---|
| 1580 | + |
---|
| 1581 | + spin_lock_irq(&eq->lock); |
---|
| 1582 | + mlx5_ib_eq_pf_process(eq); |
---|
| 1583 | + spin_unlock_irq(&eq->lock); |
---|
| 1584 | +} |
---|
| 1585 | + |
---|
| 1586 | +enum { |
---|
| 1587 | + MLX5_IB_NUM_PF_EQE = 0x1000, |
---|
| 1588 | + MLX5_IB_NUM_PF_DRAIN = 64, |
---|
| 1589 | +}; |
---|
| 1590 | + |
---|
| 1591 | +static int |
---|
| 1592 | +mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) |
---|
| 1593 | +{ |
---|
| 1594 | + struct mlx5_eq_param param = {}; |
---|
| 1595 | + int err; |
---|
| 1596 | + |
---|
| 1597 | + INIT_WORK(&eq->work, mlx5_ib_eq_pf_action); |
---|
| 1598 | + spin_lock_init(&eq->lock); |
---|
| 1599 | + eq->dev = dev; |
---|
| 1600 | + |
---|
| 1601 | + eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN, |
---|
| 1602 | + sizeof(struct mlx5_pagefault)); |
---|
| 1603 | + if (!eq->pool) |
---|
| 1604 | + return -ENOMEM; |
---|
| 1605 | + |
---|
| 1606 | + eq->wq = alloc_workqueue("mlx5_ib_page_fault", |
---|
| 1607 | + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, |
---|
| 1608 | + MLX5_NUM_CMD_EQE); |
---|
| 1609 | + if (!eq->wq) { |
---|
| 1610 | + err = -ENOMEM; |
---|
| 1611 | + goto err_mempool; |
---|
| 1612 | + } |
---|
| 1613 | + |
---|
| 1614 | + eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int; |
---|
| 1615 | + param = (struct mlx5_eq_param) { |
---|
| 1616 | + .irq_index = 0, |
---|
| 1617 | + .nent = MLX5_IB_NUM_PF_EQE, |
---|
| 1618 | + }; |
---|
| 1619 | + param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT; |
---|
| 1620 | + eq->core = mlx5_eq_create_generic(dev->mdev, ¶m); |
---|
| 1621 | + if (IS_ERR(eq->core)) { |
---|
| 1622 | + err = PTR_ERR(eq->core); |
---|
| 1623 | + goto err_wq; |
---|
| 1624 | + } |
---|
| 1625 | + err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb); |
---|
| 1626 | + if (err) { |
---|
| 1627 | + mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err); |
---|
| 1628 | + goto err_eq; |
---|
| 1629 | + } |
---|
| 1630 | + |
---|
| 1631 | + return 0; |
---|
| 1632 | +err_eq: |
---|
| 1633 | + mlx5_eq_destroy_generic(dev->mdev, eq->core); |
---|
| 1634 | +err_wq: |
---|
| 1635 | + destroy_workqueue(eq->wq); |
---|
| 1636 | +err_mempool: |
---|
| 1637 | + mempool_destroy(eq->pool); |
---|
| 1638 | + return err; |
---|
| 1639 | +} |
---|
| 1640 | + |
---|
| 1641 | +static int |
---|
| 1642 | +mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) |
---|
| 1643 | +{ |
---|
| 1644 | + int err; |
---|
| 1645 | + |
---|
| 1646 | + mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb); |
---|
| 1647 | + err = mlx5_eq_destroy_generic(dev->mdev, eq->core); |
---|
| 1648 | + cancel_work_sync(&eq->work); |
---|
| 1649 | + destroy_workqueue(eq->wq); |
---|
| 1650 | + mempool_destroy(eq->pool); |
---|
| 1651 | + |
---|
| 1652 | + return err; |
---|
1180 | 1653 | } |
---|
1181 | 1654 | |
---|
1182 | 1655 | void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) |
---|
.. | .. |
---|
1205 | 1678 | } |
---|
1206 | 1679 | } |
---|
1207 | 1680 | |
---|
| 1681 | +static const struct ib_device_ops mlx5_ib_dev_odp_ops = { |
---|
| 1682 | + .advise_mr = mlx5_ib_advise_mr, |
---|
| 1683 | +}; |
---|
| 1684 | + |
---|
1208 | 1685 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) |
---|
1209 | 1686 | { |
---|
1210 | | - int ret; |
---|
| 1687 | + int ret = 0; |
---|
| 1688 | + |
---|
| 1689 | + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) |
---|
| 1690 | + return ret; |
---|
| 1691 | + |
---|
| 1692 | + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); |
---|
1211 | 1693 | |
---|
1212 | 1694 | if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { |
---|
1213 | 1695 | ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); |
---|
.. | .. |
---|
1217 | 1699 | } |
---|
1218 | 1700 | } |
---|
1219 | 1701 | |
---|
1220 | | - return 0; |
---|
| 1702 | + ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq); |
---|
| 1703 | + |
---|
| 1704 | + return ret; |
---|
| 1705 | +} |
---|
| 1706 | + |
---|
| 1707 | +void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) |
---|
| 1708 | +{ |
---|
| 1709 | + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) |
---|
| 1710 | + return; |
---|
| 1711 | + |
---|
| 1712 | + mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq); |
---|
1221 | 1713 | } |
---|
1222 | 1714 | |
---|
1223 | 1715 | int mlx5_ib_odp_init(void) |
---|
.. | .. |
---|
1228 | 1720 | return 0; |
---|
1229 | 1721 | } |
---|
1230 | 1722 | |
---|
| 1723 | +struct prefetch_mr_work { |
---|
| 1724 | + struct work_struct work; |
---|
| 1725 | + u32 pf_flags; |
---|
| 1726 | + u32 num_sge; |
---|
| 1727 | + struct { |
---|
| 1728 | + u64 io_virt; |
---|
| 1729 | + struct mlx5_ib_mr *mr; |
---|
| 1730 | + size_t length; |
---|
| 1731 | + } frags[]; |
---|
| 1732 | +}; |
---|
| 1733 | + |
---|
| 1734 | +static void destroy_prefetch_work(struct prefetch_mr_work *work) |
---|
| 1735 | +{ |
---|
| 1736 | + u32 i; |
---|
| 1737 | + |
---|
| 1738 | + for (i = 0; i < work->num_sge; ++i) |
---|
| 1739 | + if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work)) |
---|
| 1740 | + wake_up(&work->frags[i].mr->q_deferred_work); |
---|
| 1741 | + kvfree(work); |
---|
| 1742 | +} |
---|
| 1743 | + |
---|
| 1744 | +static struct mlx5_ib_mr * |
---|
| 1745 | +get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, |
---|
| 1746 | + u32 lkey) |
---|
| 1747 | +{ |
---|
| 1748 | + struct mlx5_ib_dev *dev = to_mdev(pd->device); |
---|
| 1749 | + struct mlx5_core_mkey *mmkey; |
---|
| 1750 | + struct ib_umem_odp *odp; |
---|
| 1751 | + struct mlx5_ib_mr *mr; |
---|
| 1752 | + |
---|
| 1753 | + lockdep_assert_held(&dev->odp_srcu); |
---|
| 1754 | + |
---|
| 1755 | + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); |
---|
| 1756 | + if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR) |
---|
| 1757 | + return NULL; |
---|
| 1758 | + |
---|
| 1759 | + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); |
---|
| 1760 | + |
---|
| 1761 | + if (mr->ibmr.pd != pd) |
---|
| 1762 | + return NULL; |
---|
| 1763 | + |
---|
| 1764 | + odp = to_ib_umem_odp(mr->umem); |
---|
| 1765 | + |
---|
| 1766 | + /* prefetch with write-access must be supported by the MR */ |
---|
| 1767 | + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && |
---|
| 1768 | + !odp->umem.writable) |
---|
| 1769 | + return NULL; |
---|
| 1770 | + |
---|
| 1771 | + return mr; |
---|
| 1772 | +} |
---|
| 1773 | + |
---|
| 1774 | +static void mlx5_ib_prefetch_mr_work(struct work_struct *w) |
---|
| 1775 | +{ |
---|
| 1776 | + struct prefetch_mr_work *work = |
---|
| 1777 | + container_of(w, struct prefetch_mr_work, work); |
---|
| 1778 | + struct mlx5_ib_dev *dev; |
---|
| 1779 | + u32 bytes_mapped = 0; |
---|
| 1780 | + int srcu_key; |
---|
| 1781 | + int ret; |
---|
| 1782 | + u32 i; |
---|
| 1783 | + |
---|
| 1784 | + /* We rely on IB/core that work is executed if we have num_sge != 0 only. */ |
---|
| 1785 | + WARN_ON(!work->num_sge); |
---|
| 1786 | + dev = work->frags[0].mr->dev; |
---|
| 1787 | + /* SRCU should be held when calling to mlx5_odp_populate_xlt() */ |
---|
| 1788 | + srcu_key = srcu_read_lock(&dev->odp_srcu); |
---|
| 1789 | + for (i = 0; i < work->num_sge; ++i) { |
---|
| 1790 | + ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, |
---|
| 1791 | + work->frags[i].length, &bytes_mapped, |
---|
| 1792 | + work->pf_flags); |
---|
| 1793 | + if (ret <= 0) |
---|
| 1794 | + continue; |
---|
| 1795 | + mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); |
---|
| 1796 | + } |
---|
| 1797 | + srcu_read_unlock(&dev->odp_srcu, srcu_key); |
---|
| 1798 | + |
---|
| 1799 | + destroy_prefetch_work(work); |
---|
| 1800 | +} |
---|
| 1801 | + |
---|
| 1802 | +static bool init_prefetch_work(struct ib_pd *pd, |
---|
| 1803 | + enum ib_uverbs_advise_mr_advice advice, |
---|
| 1804 | + u32 pf_flags, struct prefetch_mr_work *work, |
---|
| 1805 | + struct ib_sge *sg_list, u32 num_sge) |
---|
| 1806 | +{ |
---|
| 1807 | + u32 i; |
---|
| 1808 | + |
---|
| 1809 | + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); |
---|
| 1810 | + work->pf_flags = pf_flags; |
---|
| 1811 | + |
---|
| 1812 | + for (i = 0; i < num_sge; ++i) { |
---|
| 1813 | + work->frags[i].io_virt = sg_list[i].addr; |
---|
| 1814 | + work->frags[i].length = sg_list[i].length; |
---|
| 1815 | + work->frags[i].mr = |
---|
| 1816 | + get_prefetchable_mr(pd, advice, sg_list[i].lkey); |
---|
| 1817 | + if (!work->frags[i].mr) { |
---|
| 1818 | + work->num_sge = i; |
---|
| 1819 | + return false; |
---|
| 1820 | + } |
---|
| 1821 | + |
---|
| 1822 | + /* Keep the MR pointer will valid outside the SRCU */ |
---|
| 1823 | + atomic_inc(&work->frags[i].mr->num_deferred_work); |
---|
| 1824 | + } |
---|
| 1825 | + work->num_sge = num_sge; |
---|
| 1826 | + return true; |
---|
| 1827 | +} |
---|
| 1828 | + |
---|
| 1829 | +static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, |
---|
| 1830 | + enum ib_uverbs_advise_mr_advice advice, |
---|
| 1831 | + u32 pf_flags, struct ib_sge *sg_list, |
---|
| 1832 | + u32 num_sge) |
---|
| 1833 | +{ |
---|
| 1834 | + struct mlx5_ib_dev *dev = to_mdev(pd->device); |
---|
| 1835 | + u32 bytes_mapped = 0; |
---|
| 1836 | + int srcu_key; |
---|
| 1837 | + int ret = 0; |
---|
| 1838 | + u32 i; |
---|
| 1839 | + |
---|
| 1840 | + srcu_key = srcu_read_lock(&dev->odp_srcu); |
---|
| 1841 | + for (i = 0; i < num_sge; ++i) { |
---|
| 1842 | + struct mlx5_ib_mr *mr; |
---|
| 1843 | + |
---|
| 1844 | + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); |
---|
| 1845 | + if (!mr) { |
---|
| 1846 | + ret = -ENOENT; |
---|
| 1847 | + goto out; |
---|
| 1848 | + } |
---|
| 1849 | + ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, |
---|
| 1850 | + &bytes_mapped, pf_flags); |
---|
| 1851 | + if (ret < 0) |
---|
| 1852 | + goto out; |
---|
| 1853 | + mlx5_update_odp_stats(mr, prefetch, ret); |
---|
| 1854 | + } |
---|
| 1855 | + ret = 0; |
---|
| 1856 | + |
---|
| 1857 | +out: |
---|
| 1858 | + srcu_read_unlock(&dev->odp_srcu, srcu_key); |
---|
| 1859 | + return ret; |
---|
| 1860 | +} |
---|
| 1861 | + |
---|
| 1862 | +int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, |
---|
| 1863 | + enum ib_uverbs_advise_mr_advice advice, |
---|
| 1864 | + u32 flags, struct ib_sge *sg_list, u32 num_sge) |
---|
| 1865 | +{ |
---|
| 1866 | + struct mlx5_ib_dev *dev = to_mdev(pd->device); |
---|
| 1867 | + u32 pf_flags = 0; |
---|
| 1868 | + struct prefetch_mr_work *work; |
---|
| 1869 | + int srcu_key; |
---|
| 1870 | + |
---|
| 1871 | + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) |
---|
| 1872 | + pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; |
---|
| 1873 | + |
---|
| 1874 | + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) |
---|
| 1875 | + pf_flags |= MLX5_PF_FLAGS_SNAPSHOT; |
---|
| 1876 | + |
---|
| 1877 | + if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) |
---|
| 1878 | + return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, |
---|
| 1879 | + num_sge); |
---|
| 1880 | + |
---|
| 1881 | + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); |
---|
| 1882 | + if (!work) |
---|
| 1883 | + return -ENOMEM; |
---|
| 1884 | + |
---|
| 1885 | + srcu_key = srcu_read_lock(&dev->odp_srcu); |
---|
| 1886 | + if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) { |
---|
| 1887 | + srcu_read_unlock(&dev->odp_srcu, srcu_key); |
---|
| 1888 | + destroy_prefetch_work(work); |
---|
| 1889 | + return -EINVAL; |
---|
| 1890 | + } |
---|
| 1891 | + queue_work(system_unbound_wq, &work->work); |
---|
| 1892 | + srcu_read_unlock(&dev->odp_srcu, srcu_key); |
---|
| 1893 | + return 0; |
---|
| 1894 | +} |
---|