forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-10-09 244b2c5ca8b14627e4a17755e5922221e121c771
kernel/drivers/infiniband/core/umem_odp.c
@@ -39,598 +39,307 @@
3939 #include <linux/export.h>
4040 #include <linux/vmalloc.h>
4141 #include <linux/hugetlb.h>
42
-#include <linux/interval_tree_generic.h>
42
+#include <linux/interval_tree.h>
43
+#include <linux/hmm.h>
44
+#include <linux/pagemap.h>
4345
4446 #include <rdma/ib_verbs.h>
4547 #include <rdma/ib_umem.h>
4648 #include <rdma/ib_umem_odp.h>
4749
48
-/*
49
- * The ib_umem list keeps track of memory regions for which the HW
50
- * device request to receive notification when the related memory
51
- * mapping is changed.
52
- *
53
- * ib_umem_lock protects the list.
54
- */
50
+#include "uverbs.h"
5551
56
-static u64 node_start(struct umem_odp_node *n)
52
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
53
+ const struct mmu_interval_notifier_ops *ops)
5754 {
58
- struct ib_umem_odp *umem_odp =
59
- container_of(n, struct ib_umem_odp, interval_tree);
60
-
61
- return ib_umem_start(umem_odp->umem);
62
-}
63
-
64
-/* Note that the representation of the intervals in the interval tree
65
- * considers the ending point as contained in the interval, while the
66
- * function ib_umem_end returns the first address which is not contained
67
- * in the umem.
68
- */
69
-static u64 node_last(struct umem_odp_node *n)
70
-{
71
- struct ib_umem_odp *umem_odp =
72
- container_of(n, struct ib_umem_odp, interval_tree);
73
-
74
- return ib_umem_end(umem_odp->umem) - 1;
75
-}
76
-
77
-INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
78
- node_start, node_last, static, rbt_ib_umem)
79
-
80
-static void ib_umem_notifier_start_account(struct ib_umem *item)
81
-{
82
- mutex_lock(&item->odp_data->umem_mutex);
83
-
84
- /* Only update private counters for this umem if it has them.
85
- * Otherwise skip it. All page faults will be delayed for this umem. */
86
- if (item->odp_data->mn_counters_active) {
87
- int notifiers_count = item->odp_data->notifiers_count++;
88
-
89
- if (notifiers_count == 0)
90
- /* Initialize the completion object for waiting on
91
- * notifiers. Since notifier_count is zero, no one
92
- * should be waiting right now. */
93
- reinit_completion(&item->odp_data->notifier_completion);
94
- }
95
- mutex_unlock(&item->odp_data->umem_mutex);
96
-}
97
-
98
-static void ib_umem_notifier_end_account(struct ib_umem *item)
99
-{
100
- mutex_lock(&item->odp_data->umem_mutex);
101
-
102
- /* Only update private counters for this umem if it has them.
103
- * Otherwise skip it. All page faults will be delayed for this umem. */
104
- if (item->odp_data->mn_counters_active) {
105
- /*
106
- * This sequence increase will notify the QP page fault that
107
- * the page that is going to be mapped in the spte could have
108
- * been freed.
109
- */
110
- ++item->odp_data->notifiers_seq;
111
- if (--item->odp_data->notifiers_count == 0)
112
- complete_all(&item->odp_data->notifier_completion);
113
- }
114
- mutex_unlock(&item->odp_data->umem_mutex);
115
-}
116
-
117
-/* Account for a new mmu notifier in an ib_ucontext. */
118
-static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
119
-{
120
- atomic_inc(&context->notifier_count);
121
-}
122
-
123
-/* Account for a terminating mmu notifier in an ib_ucontext.
124
- *
125
- * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
126
- * the function takes the semaphore itself. */
127
-static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
128
-{
129
- int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
130
-
131
- if (zero_notifiers &&
132
- !list_empty(&context->no_private_counters)) {
133
- /* No currently running mmu notifiers. Now is the chance to
134
- * add private accounting to all previously added umems. */
135
- struct ib_umem_odp *odp_data, *next;
136
-
137
- /* Prevent concurrent mmu notifiers from working on the
138
- * no_private_counters list. */
139
- down_write(&context->umem_rwsem);
140
-
141
- /* Read the notifier_count again, with the umem_rwsem
142
- * semaphore taken for write. */
143
- if (!atomic_read(&context->notifier_count)) {
144
- list_for_each_entry_safe(odp_data, next,
145
- &context->no_private_counters,
146
- no_private_counters) {
147
- mutex_lock(&odp_data->umem_mutex);
148
- odp_data->mn_counters_active = true;
149
- list_del(&odp_data->no_private_counters);
150
- complete_all(&odp_data->notifier_completion);
151
- mutex_unlock(&odp_data->umem_mutex);
152
- }
153
- }
154
-
155
- up_write(&context->umem_rwsem);
156
- }
157
-}
158
-
159
-static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
160
- u64 end, void *cookie) {
161
- /*
162
- * Increase the number of notifiers running, to
163
- * prevent any further fault handling on this MR.
164
- */
165
- ib_umem_notifier_start_account(item);
166
- item->odp_data->dying = 1;
167
- /* Make sure that the fact the umem is dying is out before we release
168
- * all pending page faults. */
169
- smp_wmb();
170
- complete_all(&item->odp_data->notifier_completion);
171
- item->context->invalidate_range(item, ib_umem_start(item),
172
- ib_umem_end(item));
173
- return 0;
174
-}
175
-
176
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
177
- struct mm_struct *mm)
178
-{
179
- struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
180
-
181
- if (!context->invalidate_range)
182
- return;
183
-
184
- ib_ucontext_notifier_start_account(context);
185
- down_read(&context->umem_rwsem);
186
- rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
187
- ULLONG_MAX,
188
- ib_umem_notifier_release_trampoline,
189
- true,
190
- NULL);
191
- up_read(&context->umem_rwsem);
192
-}
193
-
194
-static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
195
- u64 end, void *cookie)
196
-{
197
- ib_umem_notifier_start_account(item);
198
- item->context->invalidate_range(item, start, start + PAGE_SIZE);
199
- ib_umem_notifier_end_account(item);
200
- return 0;
201
-}
202
-
203
-static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
204
- u64 end, void *cookie)
205
-{
206
- ib_umem_notifier_start_account(item);
207
- item->context->invalidate_range(item, start, end);
208
- return 0;
209
-}
210
-
211
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
212
- struct mm_struct *mm,
213
- unsigned long start,
214
- unsigned long end,
215
- bool blockable)
216
-{
217
- struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
21855 int ret;
21956
220
- if (!context->invalidate_range)
221
- return 0;
57
+ umem_odp->umem.is_odp = 1;
58
+ mutex_init(&umem_odp->umem_mutex);
22259
223
- if (blockable)
224
- down_read(&context->umem_rwsem);
225
- else if (!down_read_trylock(&context->umem_rwsem))
226
- return -EAGAIN;
60
+ if (!umem_odp->is_implicit_odp) {
61
+ size_t page_size = 1UL << umem_odp->page_shift;
62
+ unsigned long start;
63
+ unsigned long end;
64
+ size_t ndmas, npfns;
22765
228
- ib_ucontext_notifier_start_account(context);
229
- ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
230
- end,
231
- invalidate_range_start_trampoline,
232
- blockable, NULL);
233
- up_read(&context->umem_rwsem);
66
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
67
+ if (check_add_overflow(umem_odp->umem.address,
68
+ (unsigned long)umem_odp->umem.length,
69
+ &end))
70
+ return -EOVERFLOW;
71
+ end = ALIGN(end, page_size);
72
+ if (unlikely(end < page_size))
73
+ return -EOVERFLOW;
23474
75
+ ndmas = (end - start) >> umem_odp->page_shift;
76
+ if (!ndmas)
77
+ return -EINVAL;
78
+
79
+ npfns = (end - start) >> PAGE_SHIFT;
80
+ umem_odp->pfn_list = kvcalloc(
81
+ npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
82
+ if (!umem_odp->pfn_list)
83
+ return -ENOMEM;
84
+
85
+ umem_odp->dma_list = kvcalloc(
86
+ ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
87
+ if (!umem_odp->dma_list) {
88
+ ret = -ENOMEM;
89
+ goto out_pfn_list;
90
+ }
91
+
92
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
93
+ umem_odp->umem.owning_mm,
94
+ start, end - start, ops);
95
+ if (ret)
96
+ goto out_dma_list;
97
+ }
98
+
99
+ return 0;
100
+
101
+out_dma_list:
102
+ kvfree(umem_odp->dma_list);
103
+out_pfn_list:
104
+ kvfree(umem_odp->pfn_list);
235105 return ret;
236106 }
237107
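As a side note on the sizing logic above: pfn_list is sized per CPU page (PAGE_SHIFT) while dma_list is sized per device page (umem_odp->page_shift), so one dma_list entry can cover many pfn_list entries when ODP uses larger pages. A minimal, self-contained sketch of that arithmetic (the 2 MiB page_shift and the addresses are assumed example values, not taken from this patch):

#include <stdio.h>

int main(void)
{
	unsigned long address = 0x7f0000001000UL;   /* assumed user VA          */
	unsigned long length  = 6UL << 20;          /* assumed 6 MiB MR         */
	unsigned int  page_shift = 21;              /* assumed 2 MiB ODP pages  */
	unsigned long page_size  = 1UL << page_shift;
	const unsigned int CPU_PAGE_SHIFT = 12;     /* PAGE_SHIFT on most arches */

	unsigned long start = address & ~(page_size - 1);                          /* ALIGN_DOWN */
	unsigned long end = (address + length + page_size - 1) & ~(page_size - 1); /* ALIGN      */

	unsigned long ndmas = (end - start) >> page_shift;      /* dma_list entries */
	unsigned long npfns = (end - start) >> CPU_PAGE_SHIFT;  /* pfn_list entries */

	printf("ndmas=%lu npfns=%lu pfns-per-dma=%lu\n", ndmas, npfns, npfns / ndmas);
	return 0;
}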
238
-static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
239
- u64 end, void *cookie)
240
-{
241
- ib_umem_notifier_end_account(item);
242
- return 0;
243
-}
244
-
245
-static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
246
- struct mm_struct *mm,
247
- unsigned long start,
248
- unsigned long end)
249
-{
250
- struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
251
-
252
- if (!context->invalidate_range)
253
- return;
254
-
255
- /*
256
- * TODO: we currently bail out if there is any sleepable work to be done
257
- * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
258
- * here. But this is ugly and fragile.
259
- */
260
- down_read(&context->umem_rwsem);
261
- rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
262
- end,
263
- invalidate_range_end_trampoline, true, NULL);
264
- up_read(&context->umem_rwsem);
265
- ib_ucontext_notifier_end_account(context);
266
-}
267
-
268
-static const struct mmu_notifier_ops ib_umem_notifiers = {
269
- .release = ib_umem_notifier_release,
270
- .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
271
- .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
272
-};
273
-
274
-struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
275
- unsigned long addr,
276
- size_t size)
108
+/**
109
+ * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
110
+ *
111
+ * Implicit ODP umems do not have a VA range and do not have any page lists.
112
+ * They exist only to hold the per_mm reference to help the driver create
113
+ * children umems.
114
+ *
115
+ * @device: IB device to create UMEM
116
+ * @access: ib_reg_mr access flags
117
+ */
118
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
119
+ int access)
277120 {
278121 struct ib_umem *umem;
279
- struct ib_umem_odp *odp_data;
280
- int pages = size >> PAGE_SHIFT;
122
+ struct ib_umem_odp *umem_odp;
281123 int ret;
282124
283
- umem = kzalloc(sizeof(*umem), GFP_KERNEL);
284
- if (!umem)
285
- return ERR_PTR(-ENOMEM);
125
+ if (access & IB_ACCESS_HUGETLB)
126
+ return ERR_PTR(-EINVAL);
286127
287
- umem->context = context;
288
- umem->length = size;
289
- umem->address = addr;
290
- umem->page_shift = PAGE_SHIFT;
291
- umem->writable = 1;
128
+ umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
129
+ if (!umem_odp)
130
+ return ERR_PTR(-ENOMEM);
131
+ umem = &umem_odp->umem;
132
+ umem->ibdev = device;
133
+ umem->writable = ib_access_writable(access);
134
+ umem->owning_mm = current->mm;
135
+ umem_odp->is_implicit_odp = 1;
136
+ umem_odp->page_shift = PAGE_SHIFT;
137
+
138
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
139
+ ret = ib_init_umem_odp(umem_odp, NULL);
140
+ if (ret) {
141
+ put_pid(umem_odp->tgid);
142
+ kfree(umem_odp);
143
+ return ERR_PTR(ret);
144
+ }
145
+ return umem_odp;
146
+}
147
+EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
148
+
149
+/**
150
+ * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
151
+ * parent ODP umem
152
+ *
153
+ * @root: The parent umem enclosing the child. This must be allocated using
154
+ * ib_umem_odp_alloc_implicit()
155
+ * @addr: The starting userspace VA
156
+ * @size: The length of the userspace VA
157
+ * @ops: MMU interval ops, currently only @invalidate
158
+ */
159
+struct ib_umem_odp *
160
+ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
161
+ size_t size,
162
+ const struct mmu_interval_notifier_ops *ops)
163
+{
164
+ /*
165
+ * Caller must ensure that root cannot be freed during the call to
166
+ * ib_umem_odp_alloc_child().
167
+ */
168
+ struct ib_umem_odp *odp_data;
169
+ struct ib_umem *umem;
170
+ int ret;
171
+
172
+ if (WARN_ON(!root->is_implicit_odp))
173
+ return ERR_PTR(-EINVAL);
292174
293175 odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
294
- if (!odp_data) {
295
- ret = -ENOMEM;
296
- goto out_umem;
297
- }
298
- odp_data->umem = umem;
176
+ if (!odp_data)
177
+ return ERR_PTR(-ENOMEM);
178
+ umem = &odp_data->umem;
179
+ umem->ibdev = root->umem.ibdev;
180
+ umem->length = size;
181
+ umem->address = addr;
182
+ umem->writable = root->umem.writable;
183
+ umem->owning_mm = root->umem.owning_mm;
184
+ odp_data->page_shift = PAGE_SHIFT;
185
+ odp_data->notifier.ops = ops;
299186
300
- mutex_init(&odp_data->umem_mutex);
301
- init_completion(&odp_data->notifier_completion);
302
-
303
- odp_data->page_list =
304
- vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
305
- if (!odp_data->page_list) {
306
- ret = -ENOMEM;
307
- goto out_odp_data;
187
+ /*
188
+ * An mmget must be held when registering a notifier; the owning_mm only
189
+ * has a mm_grab at this point.
190
+ */
191
+ if (!mmget_not_zero(umem->owning_mm)) {
192
+ ret = -EFAULT;
193
+ goto out_free;
308194 }
309195
310
- odp_data->dma_list =
311
- vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
312
- if (!odp_data->dma_list) {
313
- ret = -ENOMEM;
314
- goto out_page_list;
315
- }
196
+ odp_data->tgid = get_pid(root->tgid);
197
+ ret = ib_init_umem_odp(odp_data, ops);
198
+ if (ret)
199
+ goto out_tgid;
200
+ mmput(umem->owning_mm);
201
+ return odp_data;
316202
317
- down_write(&context->umem_rwsem);
318
- context->odp_mrs_count++;
319
- rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree);
320
- if (likely(!atomic_read(&context->notifier_count)))
321
- odp_data->mn_counters_active = true;
322
- else
323
- list_add(&odp_data->no_private_counters,
324
- &context->no_private_counters);
325
- up_write(&context->umem_rwsem);
326
-
327
- umem->odp_data = odp_data;
328
-
329
- return umem;
330
-
331
-out_page_list:
332
- vfree(odp_data->page_list);
333
-out_odp_data:
203
+out_tgid:
204
+ put_pid(odp_data->tgid);
205
+ mmput(umem->owning_mm);
206
+out_free:
334207 kfree(odp_data);
335
-out_umem:
336
- kfree(umem);
337208 return ERR_PTR(ret);
338209 }
339
-EXPORT_SYMBOL(ib_alloc_odp_umem);
210
+EXPORT_SYMBOL(ib_umem_odp_alloc_child);
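For orientation, a rough driver-side sketch of how the implicit parent and its children might be used together; my_drv_get_child, the 1 GiB slice size and my_umem_ops are assumptions for illustration (my_umem_ops, with its .invalidate callback, is sketched after ib_umem_odp_unmap_dma_pages() near the end of this hunk):

static struct ib_umem_odp *my_drv_get_child(struct ib_device *ibdev,
					    struct ib_umem_odp **rootp,
					    unsigned long fault_addr,
					    int access)
{
	const unsigned long slice = 1UL << 30;	/* assumed per-child VA slice */

	/* Lazily create the implicit parent; it has no VA range of its own. */
	if (!*rootp) {
		*rootp = ib_umem_odp_alloc_implicit(ibdev, access);
		if (IS_ERR(*rootp))
			return *rootp;
	}

	/* The child inherits ibdev, writability and owning_mm from the root. */
	return ib_umem_odp_alloc_child(*rootp, fault_addr & ~(slice - 1),
				       slice, &my_umem_ops);
}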
340211
341
-int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
342
- int access)
212
+/**
213
+ * ib_umem_odp_get - Create a umem_odp for a userspace va
214
+ *
215
+ * @device: IB device struct to get UMEM
216
+ * @addr: userspace virtual address to start at
217
+ * @size: length of region to pin
218
+ * @access: IB_ACCESS_xxx flags for memory being pinned
219
+ * @ops: MMU interval ops, currently only @invalidate
220
+ *
221
+ * The driver should use this when the access flags indicate ODP memory. It avoids
222
+ * pinning; instead, it stores the mm for future page fault handling in
223
+ * conjunction with MMU notifiers.
224
+ */
225
+struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
226
+ unsigned long addr, size_t size, int access,
227
+ const struct mmu_interval_notifier_ops *ops)
343228 {
344
- int ret_val;
345
- struct pid *our_pid;
346
- struct mm_struct *mm = get_task_mm(current);
229
+ struct ib_umem_odp *umem_odp;
230
+ struct mm_struct *mm;
231
+ int ret;
347232
348
- if (!mm)
349
- return -EINVAL;
233
+ if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
234
+ return ERR_PTR(-EINVAL);
350235
351
- if (access & IB_ACCESS_HUGETLB) {
352
- struct vm_area_struct *vma;
353
- struct hstate *h;
236
+ umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
237
+ if (!umem_odp)
238
+ return ERR_PTR(-ENOMEM);
354239
355
- down_read(&mm->mmap_sem);
356
- vma = find_vma(mm, ib_umem_start(umem));
357
- if (!vma || !is_vm_hugetlb_page(vma)) {
358
- up_read(&mm->mmap_sem);
359
- ret_val = -EINVAL;
360
- goto out_mm;
361
- }
362
- h = hstate_vma(vma);
363
- umem->page_shift = huge_page_shift(h);
364
- up_read(&mm->mmap_sem);
365
- umem->hugetlb = 1;
366
- } else {
367
- umem->hugetlb = 0;
368
- }
240
+ umem_odp->umem.ibdev = device;
241
+ umem_odp->umem.length = size;
242
+ umem_odp->umem.address = addr;
243
+ umem_odp->umem.writable = ib_access_writable(access);
244
+ umem_odp->umem.owning_mm = mm = current->mm;
245
+ umem_odp->notifier.ops = ops;
369246
370
- /* Prevent creating ODP MRs in child processes */
371
- rcu_read_lock();
372
- our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
373
- rcu_read_unlock();
374
- put_pid(our_pid);
375
- if (context->tgid != our_pid) {
376
- ret_val = -EINVAL;
377
- goto out_mm;
378
- }
247
+ umem_odp->page_shift = PAGE_SHIFT;
248
+#ifdef CONFIG_HUGETLB_PAGE
249
+ if (access & IB_ACCESS_HUGETLB)
250
+ umem_odp->page_shift = HPAGE_SHIFT;
251
+#endif
379252
380
- umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
381
- if (!umem->odp_data) {
382
- ret_val = -ENOMEM;
383
- goto out_mm;
384
- }
385
- umem->odp_data->umem = umem;
253
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
254
+ ret = ib_init_umem_odp(umem_odp, ops);
255
+ if (ret)
256
+ goto err_put_pid;
257
+ return umem_odp;
386258
387
- mutex_init(&umem->odp_data->umem_mutex);
388
-
389
- init_completion(&umem->odp_data->notifier_completion);
390
-
391
- if (ib_umem_num_pages(umem)) {
392
- umem->odp_data->page_list =
393
- vzalloc(array_size(sizeof(*umem->odp_data->page_list),
394
- ib_umem_num_pages(umem)));
395
- if (!umem->odp_data->page_list) {
396
- ret_val = -ENOMEM;
397
- goto out_odp_data;
398
- }
399
-
400
- umem->odp_data->dma_list =
401
- vzalloc(array_size(sizeof(*umem->odp_data->dma_list),
402
- ib_umem_num_pages(umem)));
403
- if (!umem->odp_data->dma_list) {
404
- ret_val = -ENOMEM;
405
- goto out_page_list;
406
- }
407
- }
408
-
409
- /*
410
- * When using MMU notifiers, we will get a
411
- * notification before the "current" task (and MM) is
412
- * destroyed. We use the umem_rwsem semaphore to synchronize.
413
- */
414
- down_write(&context->umem_rwsem);
415
- context->odp_mrs_count++;
416
- if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
417
- rbt_ib_umem_insert(&umem->odp_data->interval_tree,
418
- &context->umem_tree);
419
- if (likely(!atomic_read(&context->notifier_count)) ||
420
- context->odp_mrs_count == 1)
421
- umem->odp_data->mn_counters_active = true;
422
- else
423
- list_add(&umem->odp_data->no_private_counters,
424
- &context->no_private_counters);
425
- downgrade_write(&context->umem_rwsem);
426
-
427
- if (context->odp_mrs_count == 1) {
428
- /*
429
- * Note that at this point, no MMU notifier is running
430
- * for this context!
431
- */
432
- atomic_set(&context->notifier_count, 0);
433
- INIT_HLIST_NODE(&context->mn.hlist);
434
- context->mn.ops = &ib_umem_notifiers;
435
- /*
436
- * Lock-dep detects a false positive for mmap_sem vs.
437
- * umem_rwsem, due to not grasping downgrade_write correctly.
438
- */
439
- lockdep_off();
440
- ret_val = mmu_notifier_register(&context->mn, mm);
441
- lockdep_on();
442
- if (ret_val) {
443
- pr_err("Failed to register mmu_notifier %d\n", ret_val);
444
- ret_val = -EBUSY;
445
- goto out_mutex;
446
- }
447
- }
448
-
449
- up_read(&context->umem_rwsem);
450
-
451
- /*
452
- * Note that doing an mmput can cause a notifier for the relevant mm.
453
- * If the notifier is called while we hold the umem_rwsem, this will
454
- * cause a deadlock. Therefore, we release the reference only after we
455
- * released the semaphore.
456
- */
457
- mmput(mm);
458
- return 0;
459
-
460
-out_mutex:
461
- up_read(&context->umem_rwsem);
462
- vfree(umem->odp_data->dma_list);
463
-out_page_list:
464
- vfree(umem->odp_data->page_list);
465
-out_odp_data:
466
- kfree(umem->odp_data);
467
-out_mm:
468
- mmput(mm);
469
- return ret_val;
259
+err_put_pid:
260
+ put_pid(umem_odp->tgid);
261
+ kfree(umem_odp);
262
+ return ERR_PTR(ret);
470263 }
264
+EXPORT_SYMBOL(ib_umem_odp_get);
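A hedged sketch of the registration side this function is meant for; struct my_drv_mr, my_drv_reg_odp_mr and my_umem_ops are hypothetical driver names, not part of this patch (the ops struct is sketched after ib_umem_odp_unmap_dma_pages() below):

struct my_drv_mr {
	struct ib_umem_odp *umem_odp;
	/* ... device-specific translation state ... */
};

static int my_drv_reg_odp_mr(struct ib_device *ibdev, struct my_drv_mr *mr,
			     unsigned long start, size_t length, int access)
{
	struct ib_umem_odp *odp;

	if (!(access & IB_ACCESS_ON_DEMAND))
		return -EINVAL;

	/* Nothing is pinned here; pages are brought in later via
	 * ib_umem_odp_map_dma_and_lock() from the page-fault path. */
	odp = ib_umem_odp_get(ibdev, start, length, access, &my_umem_ops);
	if (IS_ERR(odp))
		return PTR_ERR(odp);

	mr->umem_odp = odp;
	return 0;
}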
471265
472
-void ib_umem_odp_release(struct ib_umem *umem)
266
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
473267 {
474
- struct ib_ucontext *context = umem->context;
475
-
476268 /*
477269 * Ensure that no more pages are mapped in the umem.
478270 *
479271 * It is the driver's responsibility to ensure, before calling us,
480272 * that the hardware will not attempt to access the MR any more.
481273 */
482
- ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
483
- ib_umem_end(umem));
484
-
485
- down_write(&context->umem_rwsem);
486
- if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
487
- rbt_ib_umem_remove(&umem->odp_data->interval_tree,
488
- &context->umem_tree);
489
- context->odp_mrs_count--;
490
- if (!umem->odp_data->mn_counters_active) {
491
- list_del(&umem->odp_data->no_private_counters);
492
- complete_all(&umem->odp_data->notifier_completion);
274
+ if (!umem_odp->is_implicit_odp) {
275
+ mutex_lock(&umem_odp->umem_mutex);
276
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
277
+ ib_umem_end(umem_odp));
278
+ mutex_unlock(&umem_odp->umem_mutex);
279
+ mmu_interval_notifier_remove(&umem_odp->notifier);
280
+ kvfree(umem_odp->dma_list);
281
+ kvfree(umem_odp->pfn_list);
493282 }
494
-
495
- /*
496
- * Downgrade the lock to a read lock. This ensures that the notifiers
497
- * (who lock the mutex for reading) will be able to finish, and we
498
- * will be able to enventually obtain the mmu notifiers SRCU. Note
499
- * that since we are doing it atomically, no other user could register
500
- * and unregister while we do the check.
501
- */
502
- downgrade_write(&context->umem_rwsem);
503
- if (!context->odp_mrs_count) {
504
- struct task_struct *owning_process = NULL;
505
- struct mm_struct *owning_mm = NULL;
506
-
507
- owning_process = get_pid_task(context->tgid,
508
- PIDTYPE_PID);
509
- if (owning_process == NULL)
510
- /*
511
- * The process is already dead, notifier were removed
512
- * already.
513
- */
514
- goto out;
515
-
516
- owning_mm = get_task_mm(owning_process);
517
- if (owning_mm == NULL)
518
- /*
519
- * The process' mm is already dead, notifier were
520
- * removed already.
521
- */
522
- goto out_put_task;
523
- mmu_notifier_unregister(&context->mn, owning_mm);
524
-
525
- mmput(owning_mm);
526
-
527
-out_put_task:
528
- put_task_struct(owning_process);
529
- }
530
-out:
531
- up_read(&context->umem_rwsem);
532
-
533
- vfree(umem->odp_data->dma_list);
534
- vfree(umem->odp_data->page_list);
535
- kfree(umem->odp_data);
536
- kfree(umem);
283
+ put_pid(umem_odp->tgid);
284
+ kfree(umem_odp);
537285 }
286
+EXPORT_SYMBOL(ib_umem_odp_release);
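To make the "driver's responsibility" comment above concrete, a minimal teardown-order sketch; my_drv_destroy_mkey and struct my_drv_mr are hypothetical names from the earlier registration sketch:

static void my_drv_dereg_odp_mr(struct my_drv_mr *mr)
{
	/* 1. Fence the HW first so the device can no longer DMA into the MR. */
	my_drv_destroy_mkey(mr);

	/* 2. Only then release the umem: it unmaps any remaining DMA pages
	 *    and removes the mmu interval notifier. */
	ib_umem_odp_release(mr->umem_odp);
}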
538287
539288 /*
540289 * Map for DMA and insert a single page into the on-demand paging page tables.
541290 *
542291 * @umem: the umem to insert the page to.
543
- * @page_index: index in the umem to add the page to.
292
+ * @dma_index: index in the umem to add the dma to.
544293 * @page: the page struct to map and add.
545294 * @access_mask: access permissions needed for this page.
546295 * @current_seq: sequence number for synchronization with invalidations.
547296 * the sequence number is taken from
548
- * umem->odp_data->notifiers_seq.
297
+ * umem_odp->notifiers_seq.
549298 *
550
- * The function returns -EFAULT if the DMA mapping operation fails. It returns
551
- * -EAGAIN if a concurrent invalidation prevents us from updating the page.
299
+ * The function returns -EFAULT if the DMA mapping operation fails.
552300 *
553
- * The page is released via put_page even if the operation failed. For
554
- * on-demand pinning, the page is released whenever it isn't stored in the
555
- * umem.
556301 */
557302 static int ib_umem_odp_map_dma_single_page(
558
- struct ib_umem *umem,
559
- int page_index,
303
+ struct ib_umem_odp *umem_odp,
304
+ unsigned int dma_index,
560305 struct page *page,
561
- u64 access_mask,
562
- unsigned long current_seq)
306
+ u64 access_mask)
563307 {
564
- struct ib_device *dev = umem->context->device;
565
- dma_addr_t dma_addr;
566
- int stored_page = 0;
567
- int remove_existing_mapping = 0;
568
- int ret = 0;
308
+ struct ib_device *dev = umem_odp->umem.ibdev;
309
+ dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
569310
570
- /*
571
- * Note: we avoid writing if seq is different from the initial seq, to
572
- * handle case of a racing notifier. This check also allows us to bail
573
- * early if we have a notifier running in parallel with us.
574
- */
575
- if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
576
- ret = -EAGAIN;
577
- goto out;
578
- }
579
- if (!(umem->odp_data->dma_list[page_index])) {
580
- dma_addr = ib_dma_map_page(dev,
581
- page,
582
- 0, BIT(umem->page_shift),
583
- DMA_BIDIRECTIONAL);
584
- if (ib_dma_mapping_error(dev, dma_addr)) {
585
- ret = -EFAULT;
586
- goto out;
587
- }
588
- umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
589
- umem->odp_data->page_list[page_index] = page;
590
- umem->npages++;
591
- stored_page = 1;
592
- } else if (umem->odp_data->page_list[page_index] == page) {
593
- umem->odp_data->dma_list[page_index] |= access_mask;
594
- } else {
595
- pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
596
- umem->odp_data->page_list[page_index], page);
597
- /* Better remove the mapping now, to prevent any further
598
- * damage. */
599
- remove_existing_mapping = 1;
311
+ if (*dma_addr) {
312
+ /*
313
+ * If the page is already dma mapped it means it went through
314
+ * a non-invalidating transition, like read-only to writable.
315
+ * Resync the flags.
316
+ */
317
+ *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
318
+ return 0;
600319 }
601320
602
-out:
603
- /* On Demand Paging - avoid pinning the page */
604
- if (umem->context->invalidate_range || !stored_page)
605
- put_page(page);
606
-
607
- if (remove_existing_mapping && umem->context->invalidate_range) {
608
- invalidate_page_trampoline(
609
- umem,
610
- ib_umem_start(umem) + (page_index >> umem->page_shift),
611
- ib_umem_start(umem) + ((page_index + 1) >>
612
- umem->page_shift),
613
- NULL);
614
- ret = -EAGAIN;
321
+ *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
322
+ DMA_BIDIRECTIONAL);
323
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
324
+ *dma_addr = 0;
325
+ return -EFAULT;
615326 }
616
-
617
- return ret;
327
+ umem_odp->npages++;
328
+ *dma_addr |= access_mask;
329
+ return 0;
618330 }
619331
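The dma_list entries manipulated above pack the ODP access flags into the low bits of the page-aligned DMA address. A small standalone illustration of that encoding (the mask and bit values are meant to mirror rdma/ib_umem_odp.h to the best of my reading; the sample address is made up):

#include <stdint.h>
#include <stdio.h>

#define ODP_READ_ALLOWED_BIT   (1ULL << 0)
#define ODP_WRITE_ALLOWED_BIT  (1ULL << 1)
#define ODP_DMA_ADDR_MASK      (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))

int main(void)
{
	uint64_t dma   = 0x123400000ULL;	/* assumed page-aligned mapping */
	uint64_t entry = dma | ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT;

	printf("addr=0x%llx read=%d write=%d\n",
	       (unsigned long long)(entry & ODP_DMA_ADDR_MASK),
	       !!(entry & ODP_READ_ALLOWED_BIT),
	       !!(entry & ODP_WRITE_ALLOWED_BIT));
	return 0;
}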
620332 /**
621
- * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
333
+ * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
622334 *
623
- * Pins the range of pages passed in the argument, and maps them to
624
- * DMA addresses. The DMA addresses of the mapped pages is updated in
625
- * umem->odp_data->dma_list.
335
+ * Maps the range passed in the argument to DMA addresses.
336
+ * The DMA addresses of the mapped pages are updated in umem_odp->dma_list.
337
+ * Upon success the ODP MR will be locked to let the caller complete its device
338
+ * page table update.
626339 *
627340 * Returns the number of pages mapped on success, negative error code
628341 * for failure.
629
- * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
630
- * the function from completing its task.
631
- * An -ENOENT error code indicates that userspace process is being terminated
632
- * and mm was already destroyed.
633
- * @umem: the umem to map and pin
342
+ * @umem_odp: the umem to map and pin
634343 * @user_virt: the address from which we need to map.
635344 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
636345 * bigger due to alignment, and may also be smaller in case of an error
@@ -638,150 +347,158 @@
638347 * the return value.
639348 * @access_mask: bit mask of the requested access permissions for the given
640349 * range.
641
- * @current_seq: the MMU notifiers sequence value for synchronization with
642
- * invalidations. The sequence number is read from
643
- * umem->odp_data->notifiers_seq before calling this function
350
+ * @fault: is faulting required for the given range
644351 */
645
-int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
646
- u64 access_mask, unsigned long current_seq)
352
+int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
353
+ u64 bcnt, u64 access_mask, bool fault)
354
+ __acquires(&umem_odp->umem_mutex)
647355 {
648356 struct task_struct *owning_process = NULL;
649
- struct mm_struct *owning_mm = NULL;
650
- struct page **local_page_list = NULL;
651
- u64 page_mask, off;
652
- int j, k, ret = 0, start_idx, npages = 0, page_shift;
653
- unsigned int flags = 0;
654
- phys_addr_t p = 0;
357
+ struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
358
+ int pfn_index, dma_index, ret = 0, start_idx;
359
+ unsigned int page_shift, hmm_order, pfn_start_idx;
360
+ unsigned long num_pfns, current_seq;
361
+ struct hmm_range range = {};
362
+ unsigned long timeout;
655363
656364 if (access_mask == 0)
657365 return -EINVAL;
658366
659
- if (user_virt < ib_umem_start(umem) ||
660
- user_virt + bcnt > ib_umem_end(umem))
367
+ if (user_virt < ib_umem_start(umem_odp) ||
368
+ user_virt + bcnt > ib_umem_end(umem_odp))
661369 return -EFAULT;
662370
663
- local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
664
- if (!local_page_list)
665
- return -ENOMEM;
371
+ page_shift = umem_odp->page_shift;
666372
667
- page_shift = umem->page_shift;
668
- page_mask = ~(BIT(page_shift) - 1);
669
- off = user_virt & (~page_mask);
670
- user_virt = user_virt & page_mask;
671
- bcnt += off; /* Charge for the first page offset as well. */
672
-
673
- owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
674
- if (owning_process == NULL) {
373
+ /*
374
+ * owning_process is allowed to be NULL; this means somehow the mm is
375
+ * existing beyond the lifetime of the originating process. Presumably
376
+ * mmget_not_zero will fail in this case.
377
+ */
378
+ owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
379
+ if (!owning_process || !mmget_not_zero(owning_mm)) {
675380 ret = -EINVAL;
676
- goto out_no_task;
677
- }
678
-
679
- owning_mm = get_task_mm(owning_process);
680
- if (owning_mm == NULL) {
681
- ret = -ENOENT;
682381 goto out_put_task;
683382 }
684383
685
- if (access_mask & ODP_WRITE_ALLOWED_BIT)
686
- flags |= FOLL_WRITE;
384
+ range.notifier = &umem_odp->notifier;
385
+ range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
386
+ range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
387
+ pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
388
+ num_pfns = (range.end - range.start) >> PAGE_SHIFT;
389
+ if (fault) {
390
+ range.default_flags = HMM_PFN_REQ_FAULT;
687391
688
- start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
689
- k = start_idx;
392
+ if (access_mask & ODP_WRITE_ALLOWED_BIT)
393
+ range.default_flags |= HMM_PFN_REQ_WRITE;
394
+ }
690395
691
- while (bcnt > 0) {
692
- const size_t gup_num_pages = min_t(size_t,
693
- ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
694
- PAGE_SIZE / sizeof(struct page *));
396
+ range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
397
+ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
695398
696
- down_read(&owning_mm->mmap_sem);
697
- /*
698
- * Note: this might result in redundant page getting. We can
699
- * avoid this by checking dma_list to be 0 before calling
700
- * get_user_pages. However, this make the code much more
701
- * complex (and doesn't gain us much performance in most use
702
- * cases).
703
- */
704
- npages = get_user_pages_remote(owning_process, owning_mm,
705
- user_virt, gup_num_pages,
706
- flags, local_page_list, NULL, NULL);
707
- up_read(&owning_mm->mmap_sem);
399
+retry:
400
+ current_seq = range.notifier_seq =
401
+ mmu_interval_read_begin(&umem_odp->notifier);
708402
709
- if (npages < 0)
710
- break;
403
+ mmap_read_lock(owning_mm);
404
+ ret = hmm_range_fault(&range);
405
+ mmap_read_unlock(owning_mm);
406
+ if (unlikely(ret)) {
407
+ if (ret == -EBUSY && !time_after(jiffies, timeout))
408
+ goto retry;
409
+ goto out_put_mm;
410
+ }
711411
712
- bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
713
- mutex_lock(&umem->odp_data->umem_mutex);
714
- for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
715
- if (user_virt & ~page_mask) {
716
- p += PAGE_SIZE;
717
- if (page_to_phys(local_page_list[j]) != p) {
718
- ret = -EFAULT;
719
- break;
720
- }
721
- put_page(local_page_list[j]);
412
+ start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
413
+ dma_index = start_idx;
414
+
415
+ mutex_lock(&umem_odp->umem_mutex);
416
+ if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
417
+ mutex_unlock(&umem_odp->umem_mutex);
418
+ goto retry;
419
+ }
420
+
421
+ for (pfn_index = 0; pfn_index < num_pfns;
422
+ pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
423
+
424
+ if (fault) {
425
+ /*
426
+ * Since we asked for hmm_range_fault() to populate
427
+ * pages it shouldn't return an error entry on success.
428
+ */
429
+ WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
430
+ WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
431
+ } else {
432
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
433
+ WARN_ON(umem_odp->dma_list[dma_index]);
722434 continue;
723435 }
724
-
725
- ret = ib_umem_odp_map_dma_single_page(
726
- umem, k, local_page_list[j],
727
- access_mask, current_seq);
728
- if (ret < 0)
729
- break;
730
-
731
- p = page_to_phys(local_page_list[j]);
732
- k++;
436
+ access_mask = ODP_READ_ALLOWED_BIT;
437
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
438
+ access_mask |= ODP_WRITE_ALLOWED_BIT;
733439 }
734
- mutex_unlock(&umem->odp_data->umem_mutex);
735440
441
+ hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
442
+ /* If a hugepage was detected and ODP wasn't set for it, the umem
443
+ * page_shift will be used; the opposite case is an error.
444
+ */
445
+ if (hmm_order + PAGE_SHIFT < page_shift) {
446
+ ret = -EINVAL;
447
+ ibdev_dbg(umem_odp->umem.ibdev,
448
+ "%s: un-expected hmm_order %d, page_shift %d\n",
449
+ __func__, hmm_order, page_shift);
450
+ break;
451
+ }
452
+
453
+ ret = ib_umem_odp_map_dma_single_page(
454
+ umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
455
+ access_mask);
736456 if (ret < 0) {
737
- /* Release left over pages when handling errors. */
738
- for (++j; j < npages; ++j)
739
- put_page(local_page_list[j]);
457
+ ibdev_dbg(umem_odp->umem.ibdev,
458
+ "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
740459 break;
741460 }
742461 }
462
+ /* Upon success the lock should stay held for the caller */
463
+ if (!ret)
464
+ ret = dma_index - start_idx;
465
+ else
466
+ mutex_unlock(&umem_odp->umem_mutex);
743467
744
- if (ret >= 0) {
745
- if (npages < 0 && k == start_idx)
746
- ret = npages;
747
- else
748
- ret = k - start_idx;
749
- }
750
-
751
- mmput(owning_mm);
468
+out_put_mm:
469
+ mmput_async(owning_mm);
752470 out_put_task:
753
- put_task_struct(owning_process);
754
-out_no_task:
755
- free_page((unsigned long)local_page_list);
471
+ if (owning_process)
472
+ put_task_struct(owning_process);
756473 return ret;
757474 }
758
-EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
475
+EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
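A hedged sketch of the page-fault path this function serves; my_drv_program_device_ptes is a hypothetical helper standing in for the driver's device page-table update. The key point is that on success the umem_mutex is returned held, so the update is ordered against concurrent invalidations:

static int my_drv_fault_range(struct ib_umem_odp *odp, u64 io_virt, size_t bcnt)
{
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages;

	if (odp->umem.writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	npages = ib_umem_odp_map_dma_and_lock(odp, io_virt, bcnt, access_mask,
					      true /* fault pages in */);
	if (npages < 0)
		return npages;

	/* dma_list[] now covers the faulted range; mirror it into the device
	 * page tables while umem_mutex is still held. */
	my_drv_program_device_ptes(odp, io_virt, npages);

	mutex_unlock(&odp->umem_mutex);
	return npages;
}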
759476
760
-void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
477
+void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
761478 u64 bound)
762479 {
480
+ dma_addr_t dma_addr;
481
+ dma_addr_t dma;
763482 int idx;
764483 u64 addr;
765
- struct ib_device *dev = umem->context->device;
484
+ struct ib_device *dev = umem_odp->umem.ibdev;
766485
767
- virt = max_t(u64, virt, ib_umem_start(umem));
768
- bound = min_t(u64, bound, ib_umem_end(umem));
769
- /* Note that during the run of this function, the
770
- * notifiers_count of the MR is > 0, preventing any racing
771
- * faults from completion. We might be racing with other
772
- * invalidations, so we must make sure we free each page only
773
- * once. */
774
- mutex_lock(&umem->odp_data->umem_mutex);
775
- for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
776
- idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
777
- if (umem->odp_data->page_list[idx]) {
778
- struct page *page = umem->odp_data->page_list[idx];
779
- dma_addr_t dma = umem->odp_data->dma_list[idx];
780
- dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
486
+ lockdep_assert_held(&umem_odp->umem_mutex);
781487
782
- WARN_ON(!dma_addr);
488
+ virt = max_t(u64, virt, ib_umem_start(umem_odp));
489
+ bound = min_t(u64, bound, ib_umem_end(umem_odp));
490
+ for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
491
+ idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
492
+ dma = umem_odp->dma_list[idx];
783493
784
- ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
494
+ /* The access flags guaranteed a valid DMA address in case was NULL */
495
+ if (dma) {
496
+ unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
497
+ struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
498
+
499
+ dma_addr = dma & ODP_DMA_ADDR_MASK;
500
+ ib_dma_unmap_page(dev, dma_addr,
501
+ BIT(umem_odp->page_shift),
785502 DMA_BIDIRECTIONAL);
786503 if (dma & ODP_WRITE_ALLOWED_BIT) {
787504 struct page *head_page = compound_head(page);
@@ -796,57 +513,9 @@
796513 */
797514 set_page_dirty(head_page);
798515 }
799
- /* on demand pinning support */
800
- if (!umem->context->invalidate_range)
801
- put_page(page);
802
- umem->odp_data->page_list[idx] = NULL;
803
- umem->odp_data->dma_list[idx] = 0;
804
- umem->npages--;
516
+ umem_odp->dma_list[idx] = 0;
517
+ umem_odp->npages--;
805518 }
806519 }
807
- mutex_unlock(&umem->odp_data->umem_mutex);
808520 }
809521 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
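Finally, a sketch of the .invalidate callback that ties the pieces together; this is the my_umem_ops referenced in the earlier sketches, loosely modelled on how in-tree ODP drivers use this API, with my_drv_zap_device_ptes as a hypothetical HW-fencing helper:

static bool my_drv_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	u64 start, end;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/* Hypothetical: make the HW stop translating this range first. */
	my_drv_zap_device_ptes(umem_odp, start, end);

	/* umem_mutex is held, as ib_umem_odp_unmap_dma_pages() now requires. */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

static const struct mmu_interval_notifier_ops my_umem_ops = {
	.invalidate = my_drv_invalidate,
};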
810
-
811
-/* @last is not a part of the interval. See comment for function
812
- * node_last.
813
- */
814
-int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
815
- u64 start, u64 last,
816
- umem_call_back cb,
817
- bool blockable,
818
- void *cookie)
819
-{
820
- int ret_val = 0;
821
- struct umem_odp_node *node, *next;
822
- struct ib_umem_odp *umem;
823
-
824
- if (unlikely(start == last))
825
- return ret_val;
826
-
827
- for (node = rbt_ib_umem_iter_first(root, start, last - 1);
828
- node; node = next) {
829
- /* TODO move the blockable decision up to the callback */
830
- if (!blockable)
831
- return -EAGAIN;
832
- next = rbt_ib_umem_iter_next(node, start, last - 1);
833
- umem = container_of(node, struct ib_umem_odp, interval_tree);
834
- ret_val = cb(umem->umem, start, last, cookie) || ret_val;
835
- }
836
-
837
- return ret_val;
838
-}
839
-EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
840
-
841
-struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
842
- u64 addr, u64 length)
843
-{
844
- struct umem_odp_node *node;
845
-
846
- node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
847
- if (node)
848
- return container_of(node, struct ib_umem_odp, interval_tree);
849
- return NULL;
850
-
851
-}
852
-EXPORT_SYMBOL(rbt_ib_umem_lookup);