forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/infiniband/core/umem.c
@@ -37,64 +37,121 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/export.h>
-#include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/count_zeros.h>
 #include <rdma/ib_umem_odp.h>

 #include "uverbs.h"

-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;

 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);

-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
-		if (!PageDirty(page) && umem->writable && dirty)
-			set_page_dirty_lock(page);
-		put_page(page);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
+		unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
 	}

 	sg_free_table(&umem->sg_head);
 }

 /**
+ * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ * @virt: IOVA
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ *
+ * Returns 0 if the umem requires page sizes not supported by
+ * the driver to be mapped. Drivers always supporting PAGE_SIZE
+ * or smaller will never see a 0 result.
+ */
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+				     unsigned long pgsz_bitmap,
+				     unsigned long virt)
+{
+	struct scatterlist *sg;
+	unsigned long va, pgoff;
+	dma_addr_t mask;
+	int i;
+
+	/* rdma_for_each_block() has a bug if the page size is smaller than the
+	 * page size used to build the umem. For now prevent smaller page sizes
+	 * from being returned.
+	 */
+	pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);
+
+	/* At minimum, drivers must support PAGE_SIZE or smaller */
+	if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
+		return 0;
+
+	umem->iova = va = virt;
+	/* The best result is the smallest page size that results in the minimum
+	 * number of required pages. Compute the largest page size that could
+	 * work based on VA address bits that don't change.
+	 */
+	mask = pgsz_bitmap &
+	       GENMASK(BITS_PER_LONG - 1,
+		       bits_per((umem->length - 1 + virt) ^ virt));
+	/* offset into first SGL */
+	pgoff = umem->address & ~PAGE_MASK;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		/* Walk SGL and reduce max page size if VA/PA bits differ
+		 * for any address.
+		 */
+		mask |= (sg_dma_address(sg) + pgoff) ^ va;
+		va += sg_dma_len(sg) - pgoff;
+		/* Except for the last entry, the ending iova alignment sets
+		 * the maximum possible page size as the low bits of the iova
+		 * must be zero when starting the next chunk.
+		 */
+		if (i != (umem->nmap - 1))
+			mask |= va;
+		pgoff = 0;
+	}
+
+	/* The mask accumulates 1's in each position where the VA and physical
+	 * address differ, thus the length of trailing 0 is the largest page
+	 * size that can pass the VA through to the physical.
+	 */
+	if (mask)
+		pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
+	return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
+}
+EXPORT_SYMBOL(ib_umem_find_best_pgsz);
+
+/**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
- * If access flags indicate ODP memory, avoid pinning. Instead, stores
- * the mm for future page fault handling in conjunction with MMU notifiers.
- *
- * @context: userspace context to pin memory for
+ * @device: IB device to connect UMEM
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
- * @dmasync: flush in-flight DMA when the memory region is written
  */
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
-			    size_t size, int access, int dmasync)
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
			    size_t size, int access)
 {
 	struct ib_umem *umem;
 	struct page **page_list;
-	struct vm_area_struct **vma_list;
 	unsigned long lock_limit;
+	unsigned long new_pinned;
 	unsigned long cur_base;
+	unsigned long dma_attr = 0;
+	struct mm_struct *mm;
 	unsigned long npages;
 	int ret;
-	int i;
-	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg = NULL;
 	unsigned int gup_flags = FOLL_WRITE;
-
-	if (dmasync)
-		dma_attrs |= DMA_ATTR_WRITE_BARRIER;

 	/*
 	 * If the combination of the addr and size requested for this memory
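
The hunk above reworks the ib_umem_get() signature (it now takes an ib_device and drops the dmasync parameter) and adds ib_umem_find_best_pgsz(). A minimal sketch of how a driver could consume the new helpers is shown below; the function name, the SZ_4K | SZ_2M | SZ_1G bitmap, and the error handling are illustrative assumptions, not part of this patch.

/* Illustrative sketch only: not part of the patch above. */
#include <linux/sizes.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

static int example_pin_and_size_mr(struct ib_device *ibdev, u64 start,
				   u64 length, u64 virt_addr, int access)
{
	struct ib_umem *umem;
	unsigned long pgsz;

	/* New signature: device instead of ucontext, no dmasync flag. */
	umem = ib_umem_get(ibdev, start, length, access);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	/* Hypothetical HW that supports 4K, 2M and 1G page sizes. */
	pgsz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, virt_addr);
	if (!pgsz) {
		ib_umem_release(umem);
		return -EINVAL;
	}

	/*
	 * For example, a single 2 MiB chunk whose IOVA and DMA address are
	 * both 2 MiB aligned yields pgsz == SZ_2M here; any misalignment
	 * between the two drops the result to a smaller supported size.
	 * The umem would normally stay attached to the MR and be released
	 * by the driver's dereg path via ib_umem_release().
	 */
	return 0;
}
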
@@ -107,27 +164,23 @@
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);

-	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (access & IB_ACCESS_ON_DEMAND)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
 	if (!umem)
 		return ERR_PTR(-ENOMEM);
-
-	umem->context = context;
+	umem->ibdev = device;
 	umem->length = size;
 	umem->address = addr;
-	umem->page_shift = PAGE_SHIFT;
+	/*
+	 * Drivers should call ib_umem_find_best_pgsz() to set the iova
+	 * correctly.
+	 */
+	umem->iova = addr;
 	umem->writable = ib_access_writable(access);
-
-	if (access & IB_ACCESS_ON_DEMAND) {
-		ret = ib_umem_odp_get(context, umem, access);
-		if (ret)
-			goto umem_kfree;
-		return umem;
-	}
-
-	umem->odp_data = NULL;
-
-	/* We assume the memory is from hugetlb until proved otherwise */
-	umem->hugetlb = 1;
+	umem->owning_mm = mm = current->mm;
+	mmgrab(mm);

 	page_list = (struct page **) __get_free_page(GFP_KERNEL);
 	if (!page_list) {
@@ -135,75 +188,56 @@
 		goto umem_kfree;
 	}

-	/*
-	 * if we can't alloc the vma_list, it's not so bad;
-	 * just assume the memory is not hugetlb memory
-	 */
-	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
-	if (!vma_list)
-		umem->hugetlb = 0;
-
 	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}

 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm += npages;
-	if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) {
-		up_write(&current->mm->mmap_sem);
+	new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+	if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+		atomic64_sub(npages, &mm->pinned_vm);
 		ret = -ENOMEM;
-		goto vma;
+		goto out;
 	}
-	up_write(&current->mm->mmap_sem);

 	cur_base = addr & PAGE_MASK;
-
-	if (npages == 0 || npages > UINT_MAX) {
-		ret = -EINVAL;
-		goto vma;
-	}
-
-	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
-	if (ret)
-		goto vma;

 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;

-	sg_list_start = umem->sg_head.sgl;
-
-	down_read(&current->mm->mmap_sem);
 	while (npages) {
-		ret = get_user_pages_longterm(cur_base,
-				     min_t(unsigned long, npages,
-					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
-		if (ret < 0) {
-			up_read(&current->mm->mmap_sem);
+		cond_resched();
+		ret = pin_user_pages_fast(cur_base,
+					  min_t(unsigned long, npages,
+						PAGE_SIZE /
+						sizeof(struct page *)),
+					  gup_flags | FOLL_LONGTERM, page_list);
+		if (ret < 0)
+			goto umem_release;
+
+		cur_base += ret * PAGE_SIZE;
+		npages -= ret;
+		sg = __sg_alloc_table_from_pages(&umem->sg_head, page_list, ret,
+				0, ret << PAGE_SHIFT,
+				ib_dma_max_seg_size(device), sg, npages,
+				GFP_KERNEL);
+		umem->sg_nents = umem->sg_head.nents;
+		if (IS_ERR(sg)) {
+			unpin_user_pages_dirty_lock(page_list, ret, 0);
+			ret = PTR_ERR(sg);
 			goto umem_release;
 		}
-
-		umem->npages += ret;
-		cur_base += ret * PAGE_SIZE;
-		npages -= ret;
-
-		for_each_sg(sg_list_start, sg, ret, i) {
-			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
-				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
-		}
-
-		/* preparing for next loop */
-		sg_list_start = sg;
 	}
-	up_read(&current->mm->mmap_sem);

-	umem->nmap = ib_dma_map_sg_attrs(context->device,
-				  umem->sg_head.sgl,
-				  umem->npages,
-				  DMA_BIDIRECTIONAL,
-				  dma_attrs);
+	if (access & IB_ACCESS_RELAXED_ORDERING)
+		dma_attr |= DMA_ATTR_WEAK_ORDERING;
+
+	umem->nmap =
+		ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
+				    DMA_BIDIRECTIONAL, dma_attr);

 	if (!umem->nmap) {
 		ret = -ENOMEM;
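
The pinning loop above switches from get_user_pages_longterm()/put_page() to pin_user_pages_fast() with FOLL_LONGTERM, balanced by unpin_user_pages_dirty_lock() in __ib_umem_release(). A self-contained sketch of that pairing follows; the function name is illustrative and not part of this patch.

/* Illustrative sketch only: the pin/unpin pairing used by the new code. */
#include <linux/mm.h>

static int example_pin_one_page(unsigned long uaddr, bool writable)
{
	struct page *page;
	unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
	int ret;

	/* Long-term pins come from the pin_user_pages*() family ... */
	ret = pin_user_pages_fast(uaddr & PAGE_MASK, 1, gup_flags, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... access the pinned page here ... */

	/* ... and are dropped with unpin_user_pages*(), not put_page(). */
	unpin_user_pages_dirty_lock(&page, 1, writable);
	return 0;
}
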
@@ -214,32 +248,18 @@
 		goto out;

 umem_release:
-	__ib_umem_release(context->device, umem, 0);
-vma:
-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm -= ib_umem_num_pages(umem);
-	up_write(&current->mm->mmap_sem);
+	__ib_umem_release(device, umem, 0);
+	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:
-	if (vma_list)
-		free_page((unsigned long) vma_list);
 	free_page((unsigned long) page_list);
 umem_kfree:
-	if (ret)
+	if (ret) {
+		mmdrop(umem->owning_mm);
 		kfree(umem);
+	}
 	return ret ? ERR_PTR(ret) : umem;
 }
 EXPORT_SYMBOL(ib_umem_get);
-
-static void ib_umem_account(struct work_struct *work)
-{
-	struct ib_umem *umem = container_of(work, struct ib_umem, work);
-
-	down_write(&umem->mm->mmap_sem);
-	umem->mm->pinned_vm -= umem->diff;
-	up_write(&umem->mm->mmap_sem);
-	mmput(umem->mm);
-	kfree(umem);
-}

 /**
  * ib_umem_release - release memory pinned with ib_umem_get
@@ -247,72 +267,18 @@
  */
 void ib_umem_release(struct ib_umem *umem)
 {
-	struct ib_ucontext *context = umem->context;
-	struct mm_struct *mm;
-	struct task_struct *task;
-	unsigned long diff;
-
-	if (umem->odp_data) {
-		ib_umem_odp_release(umem);
+	if (!umem)
 		return;
-	}
+	if (umem->is_odp)
+		return ib_umem_odp_release(to_ib_umem_odp(umem));

-	__ib_umem_release(umem->context->device, umem, 1);
+	__ib_umem_release(umem->ibdev, umem, 1);

-	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = ib_umem_num_pages(umem);
-
-	/*
-	 * We may be called with the mm's mmap_sem already held. This
-	 * can happen when a userspace munmap() is the call that drops
-	 * the last reference to our file and calls our release
-	 * method. If there are memory regions to destroy, we'll end
-	 * up here and not be able to take the mmap_sem. In that case
-	 * we defer the vm_locked accounting to the system workqueue.
-	 */
-	if (context->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&umem->work, ib_umem_account);
-			umem->mm = mm;
-			umem->diff = diff;
-
-			queue_work(ib_wq, &umem->work);
-			return;
-		}
-	} else
-		down_write(&mm->mmap_sem);
-
-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+	mmdrop(umem->owning_mm);
 	kfree(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
-
-int ib_umem_page_count(struct ib_umem *umem)
-{
-	int i;
-	int n;
-	struct scatterlist *sg;
-
-	if (umem->odp_data)
-		return ib_umem_num_pages(umem);
-
-	n = 0;
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
-		n += sg_dma_len(sg) >> umem->page_shift;
-
-	return n;
-}
-EXPORT_SYMBOL(ib_umem_page_count);

 /*
  * Copy from the given ib_umem's pages to the given buffer.
@@ -336,7 +302,7 @@
 		return -EINVAL;
 	}

-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
 				 offset + ib_umem_offset(umem));

 	if (ret < 0)