2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -25,6 +25,7 @@

 #include <linux/kernel.h>
 #include <linux/dma-mapping.h>
+#include <linux/migrate.h>
 #include <mali_kbase.h>
 #include <gpu/mali_kbase_gpu_fault.h>
 #include <gpu/mali_kbase_gpu_regmap.h>
@@ -41,54 +42,265 @@
 #include <mmu/mali_kbase_mmu_internal.h>
 #include <mali_kbase_cs_experimental.h>
 #include <device/mali_kbase_device.h>
+#include <uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_id.h>
+#if !MALI_USE_CSF
+#include <mali_kbase_hwaccess_jm.h>
+#endif

 #include <mali_kbase_trace_gpu_mem.h>
-#define KBASE_MMU_PAGE_ENTRIES 512
+#include <backend/gpu/mali_kbase_pm_internal.h>
+
+/* Threshold used to decide whether to flush full caches or just a physical range */
+#define KBASE_PA_RANGE_THRESHOLD_NR_PAGES 20
+#define MGM_DEFAULT_PTE_GROUP (0)
+
+/* Macro to convert updated PDGs to flags indicating levels skip in flush */
+#define pgd_level_to_skip_flush(dirty_pgds) (~(dirty_pgds) & 0xF)
+
+/* Small wrapper function to factor out GPU-dependent context releasing */
+static void release_ctx(struct kbase_device *kbdev,
+ struct kbase_context *kctx)
+{
+#if MALI_USE_CSF
+ CSTD_UNUSED(kbdev);
+ kbase_ctx_sched_release_ctx_lock(kctx);
+#else /* MALI_USE_CSF */
+ kbasep_js_runpool_release_ctx(kbdev, kctx);
+#endif /* MALI_USE_CSF */
+}
+
+static void mmu_hw_operation_begin(struct kbase_device *kbdev)
+{
+#if !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
+#if MALI_USE_CSF
+ if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_GPU2019_3878)) {
+ unsigned long flags;
+
+ lockdep_assert_held(&kbdev->mmu_hw_mutex);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ WARN_ON_ONCE(kbdev->mmu_hw_operation_in_progress);
+ kbdev->mmu_hw_operation_in_progress = true;
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ }
+#endif /* MALI_USE_CSF */
+#endif /* !CONFIG_MALI_BIFROST_NO_MALI */
+}
+
+static void mmu_hw_operation_end(struct kbase_device *kbdev)
+{
+#if !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI)
+#if MALI_USE_CSF
+ if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_GPU2019_3878)) {
+ unsigned long flags;
+
+ lockdep_assert_held(&kbdev->mmu_hw_mutex);
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+ WARN_ON_ONCE(!kbdev->mmu_hw_operation_in_progress);
+ kbdev->mmu_hw_operation_in_progress = false;
+ /* Invoke the PM state machine, the L2 power off may have been
+ * skipped due to the MMU command.
+ */
+ kbase_pm_update_state(kbdev);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ }
+#endif /* MALI_USE_CSF */
+#endif /* !CONFIG_MALI_BIFROST_NO_MALI */
+}

 /**
- * kbase_mmu_flush_invalidate() - Flush and invalidate the GPU caches.
- * @kctx: The KBase context.
- * @vpfn: The virtual page frame number to start the flush on.
- * @nr: The number of pages to flush.
- * @sync: Set if the operation should be synchronous or not.
+ * mmu_flush_cache_on_gpu_ctrl() - Check if cache flush needs to be done
+ * through GPU_CONTROL interface.
 *
- * Issue a cache flush + invalidate to the GPU caches and invalidate the TLBs.
+ * @kbdev: kbase device to check GPU model ID on.
 *
- * If sync is not set then transactions still in flight when the flush is issued
- * may use the old page tables and the data they write will not be written out
- * to memory, this function returns after the flush has been issued but
- * before all accesses which might effect the flushed region have completed.
+ * This function returns whether a cache flush for page table update should
+ * run through GPU_CONTROL interface or MMU_AS_CONTROL interface.
 *
- * If sync is set then accesses in the flushed region will be drained
- * before data is flush and invalidated through L1, L2 and into memory,
- * after which point this function will return.
+ * Return: True if cache flush should be done on GPU command.
 */
-static void kbase_mmu_flush_invalidate(struct kbase_context *kctx,
- u64 vpfn, size_t nr, bool sync);
+static bool mmu_flush_cache_on_gpu_ctrl(struct kbase_device *kbdev)
+{
+ uint32_t const arch_maj_cur = (kbdev->gpu_props.props.raw_props.gpu_id &
+ GPU_ID2_ARCH_MAJOR) >>
+ GPU_ID2_ARCH_MAJOR_SHIFT;
+
+ return arch_maj_cur > 11;
+}

 /**
- * kbase_mmu_flush_invalidate_no_ctx() - Flush and invalidate the GPU caches.
- * @kbdev: Device pointer.
- * @vpfn: The virtual page frame number to start the flush on.
- * @nr: The number of pages to flush.
- * @sync: Set if the operation should be synchronous or not.
- * @as_nr: GPU address space number for which flush + invalidate is required.
+ * mmu_flush_pa_range() - Flush physical address range
 *
- * This is used for MMU tables which do not belong to a user space context.
+ * @kbdev: kbase device to issue the MMU operation on.
+ * @phys: Starting address of the physical range to start the operation on.
+ * @nr_bytes: Number of bytes to work on.
+ * @op: Type of cache flush operation to perform.
+ *
+ * Issue a cache flush physical range command.
 */
-static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev,
- u64 vpfn, size_t nr, bool sync, int as_nr);
+#if MALI_USE_CSF
+static void mmu_flush_pa_range(struct kbase_device *kbdev, phys_addr_t phys, size_t nr_bytes,
+ enum kbase_mmu_op_type op)
+{
+ u32 flush_op;
+
+ lockdep_assert_held(&kbdev->hwaccess_lock);
+
+ /* Translate operation to command */
+ if (op == KBASE_MMU_OP_FLUSH_PT)
+ flush_op = GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2;
+ else if (op == KBASE_MMU_OP_FLUSH_MEM)
+ flush_op = GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2_LSC;
+ else {
+ dev_warn(kbdev->dev, "Invalid flush request (op = %d)", op);
+ return;
+ }
+
+ if (kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op))
+ dev_err(kbdev->dev, "Flush for physical address range did not complete");
+}
+#endif

 /**
- * kbase_mmu_sync_pgd() - sync page directory to memory when needed.
- * @kbdev: Device pointer.
- * @handle: Address of DMA region.
- * @size: Size of the region to sync.
+ * mmu_invalidate() - Perform an invalidate operation on MMU caches.
+ * @kbdev: The Kbase device.
+ * @kctx: The Kbase context.
+ * @as_nr: GPU address space number for which invalidate is required.
+ * @op_param: Non-NULL pointer to struct containing information about the MMU
+ * operation to perform.
 *
- * This should be called after each page directory update.
+ * Perform an MMU invalidate operation on a particual address space
+ * by issuing a UNLOCK command.
 */
-static void kbase_mmu_sync_pgd(struct kbase_device *kbdev,
- dma_addr_t handle, size_t size)
+static void mmu_invalidate(struct kbase_device *kbdev, struct kbase_context *kctx, int as_nr,
+ const struct kbase_mmu_hw_op_param *op_param)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+
+ if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) {
+ as_nr = kctx ? kctx->as_nr : as_nr;
+ if (kbase_mmu_hw_do_unlock(kbdev, &kbdev->as[as_nr], op_param))
+ dev_err(kbdev->dev,
+ "Invalidate after GPU page table update did not complete");
+ }
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+}
+
+/* Perform a flush/invalidate on a particular address space
+ */
+static void mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as,
+ const struct kbase_mmu_hw_op_param *op_param)
+{
+ unsigned long flags;
+
+ /* AS transaction begin */
+ mutex_lock(&kbdev->mmu_hw_mutex);
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+
+ if (kbdev->pm.backend.gpu_powered && (kbase_mmu_hw_do_flush_locked(kbdev, as, op_param)))
+ dev_err(kbdev->dev, "Flush for GPU page table update did not complete");
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ mutex_unlock(&kbdev->mmu_hw_mutex);
+ /* AS transaction end */
+}
+
+/**
+ * mmu_flush_invalidate() - Perform a flush operation on GPU caches.
+ * @kbdev: The Kbase device.
+ * @kctx: The Kbase context.
+ * @as_nr: GPU address space number for which flush + invalidate is required.
+ * @op_param: Non-NULL pointer to struct containing information about the MMU
+ * operation to perform.
+ *
+ * This function performs the cache flush operation described by @op_param.
+ * The function retains a reference to the given @kctx and releases it
+ * after performing the flush operation.
+ *
+ * If operation is set to KBASE_MMU_OP_FLUSH_PT then this function will issue
+ * a cache flush + invalidate to the L2 caches and invalidate the TLBs.
+ *
+ * If operation is set to KBASE_MMU_OP_FLUSH_MEM then this function will issue
+ * a cache flush + invalidate to the L2 and GPU Load/Store caches as well as
+ * invalidating the TLBs.
+ */
+static void mmu_flush_invalidate(struct kbase_device *kbdev, struct kbase_context *kctx, int as_nr,
+ const struct kbase_mmu_hw_op_param *op_param)
+{
+ bool ctx_is_in_runpool;
+
+ /* Early out if there is nothing to do */
+ if (op_param->nr == 0)
+ return;
+
+ /* If no context is provided then MMU operation is performed on address
+ * space which does not belong to user space context. Otherwise, retain
+ * refcount to context provided and release after flush operation.
+ */
+ if (!kctx) {
+ mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], op_param);
+ } else {
+#if !MALI_USE_CSF
+ mutex_lock(&kbdev->js_data.queue_mutex);
+ ctx_is_in_runpool = kbase_ctx_sched_inc_refcount(kctx);
+ mutex_unlock(&kbdev->js_data.queue_mutex);
+#else
+ ctx_is_in_runpool = kbase_ctx_sched_inc_refcount_if_as_valid(kctx);
+#endif /* !MALI_USE_CSF */
+
+ if (ctx_is_in_runpool) {
+ KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID);
+
+ mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr], op_param);
+
+ release_ctx(kbdev, kctx);
+ }
+ }
+}
+
+/**
+ * mmu_flush_invalidate_on_gpu_ctrl() - Perform a flush operation on GPU caches via
+ * the GPU_CONTROL interface
+ * @kbdev: The Kbase device.
+ * @kctx: The Kbase context.
+ * @as_nr: GPU address space number for which flush + invalidate is required.
+ * @op_param: Non-NULL pointer to struct containing information about the MMU
+ * operation to perform.
+ *
+ * Perform a flush/invalidate on a particular address space via the GPU_CONTROL
+ * interface.
+ */
+static void mmu_flush_invalidate_on_gpu_ctrl(struct kbase_device *kbdev, struct kbase_context *kctx,
+ int as_nr, const struct kbase_mmu_hw_op_param *op_param)
+{
+ unsigned long flags;
+
+ /* AS transaction begin */
+ mutex_lock(&kbdev->mmu_hw_mutex);
+ spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
+
+ if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) {
+ as_nr = kctx ? kctx->as_nr : as_nr;
+ if (kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, &kbdev->as[as_nr], op_param))
+ dev_err(kbdev->dev, "Flush for GPU page table update did not complete");
+ }
+
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
+ mutex_unlock(&kbdev->mmu_hw_mutex);
+}
+
+static void kbase_mmu_sync_pgd_gpu(struct kbase_device *kbdev, struct kbase_context *kctx,
+ phys_addr_t phys, size_t size,
+ enum kbase_mmu_op_type flush_op)
+{
+ kbase_mmu_flush_pa_range(kbdev, kctx, phys, size, flush_op);
+}
+
+static void kbase_mmu_sync_pgd_cpu(struct kbase_device *kbdev, dma_addr_t handle, size_t size)
 {
 /* In non-coherent system, ensure the GPU can read
 * the pages from memory
@@ -96,6 +308,34 @@
 if (kbdev->system_coherency == COHERENCY_NONE)
 dma_sync_single_for_device(kbdev->dev, handle, size,
 DMA_TO_DEVICE);
+}
+
+/**
+ * kbase_mmu_sync_pgd() - sync page directory to memory when needed.
+ * @kbdev: Device pointer.
+ * @kctx: Context pointer.
+ * @phys: Starting physical address of the destination region.
+ * @handle: Address of DMA region.
+ * @size: Size of the region to sync.
+ * @flush_op: MMU cache flush operation to perform on the physical address
+ * range, if GPU control is available.
+ *
+ * This function is called whenever the association between a virtual address
+ * range and a physical address range changes, because a mapping is created or
+ * destroyed.
+ * One of the effects of this operation is performing an MMU cache flush
+ * operation only on the physical address range affected by this function, if
+ * GPU control is available.
+ *
+ * This should be called after each page directory update.
+ */
+static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, struct kbase_context *kctx,
+ phys_addr_t phys, dma_addr_t handle, size_t size,
+ enum kbase_mmu_op_type flush_op)
+{
+
+ kbase_mmu_sync_pgd_cpu(kbdev, handle, size);
+ kbase_mmu_sync_pgd_gpu(kbdev, kctx, phys, size, flush_op);
 }

 /*
@@ -107,9 +347,154 @@
 * a 4kB physical page.
 */

-static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn,
- struct tagged_addr *phys, size_t nr,
- unsigned long flags, int group_id);
+static int kbase_mmu_update_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
+ u64 vpfn, struct tagged_addr *phys, size_t nr,
+ unsigned long flags, int group_id, u64 *dirty_pgds);
+
+/**
+ * kbase_mmu_update_and_free_parent_pgds() - Update number of valid entries and
+ * free memory of the page directories
+ *
+ * @kbdev: Device pointer.
+ * @mmut: GPU MMU page table.
+ * @pgds: Physical addresses of page directories to be freed.
+ * @vpfn: The virtual page frame number.
+ * @level: The level of MMU page table.
+ * @flush_op: The type of MMU flush operation to perform.
+ * @dirty_pgds: Flags to track every level where a PGD has been updated.
+ */
+static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev,
+ struct kbase_mmu_table *mmut, phys_addr_t *pgds,
+ u64 vpfn, int level,
+ enum kbase_mmu_op_type flush_op, u64 *dirty_pgds);
+
+static void kbase_mmu_account_freed_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut)
+{
+ atomic_sub(1, &kbdev->memdev.used_pages);
+
+ /* If MMU tables belong to a context then pages will have been accounted
+ * against it, so we must decrement the usage counts here.
+ */
+ if (mmut->kctx) {
+ kbase_process_page_usage_dec(mmut->kctx, 1);
+ atomic_sub(1, &mmut->kctx->used_pages);
+ }
+
+ kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1);
+}
+
+static bool kbase_mmu_handle_isolated_pgd_page(struct kbase_device *kbdev,
+ struct kbase_mmu_table *mmut,
+ struct page *p)
+{
+ struct kbase_page_metadata *page_md = kbase_page_private(p);
+ bool page_is_isolated = false;
+
+ lockdep_assert_held(&mmut->mmu_lock);
+
+ if (!kbase_page_migration_enabled)
+ return false;
+
+ spin_lock(&page_md->migrate_lock);
+ if (PAGE_STATUS_GET(page_md->status) == PT_MAPPED) {
+ WARN_ON_ONCE(!mmut->kctx);
+ if (IS_PAGE_ISOLATED(page_md->status)) {
+ page_md->status = PAGE_STATUS_SET(page_md->status,
+ FREE_PT_ISOLATED_IN_PROGRESS);
+ page_md->data.free_pt_isolated.kbdev = kbdev;
+ page_is_isolated = true;
+ } else {
+ page_md->status =
+ PAGE_STATUS_SET(page_md->status, FREE_IN_PROGRESS);
+ }
+ } else {
+ WARN_ON_ONCE(mmut->kctx);
+ WARN_ON_ONCE(PAGE_STATUS_GET(page_md->status) != NOT_MOVABLE);
+ }
+ spin_unlock(&page_md->migrate_lock);
+
+ if (unlikely(page_is_isolated)) {
+ /* Do the CPU cache flush and accounting here for the isolated
+ * PGD page, which is done inside kbase_mmu_free_pgd() for the
+ * PGD page that did not get isolated.
+ */
+ dma_sync_single_for_device(kbdev->dev, kbase_dma_addr(p), PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ kbase_mmu_account_freed_pgd(kbdev, mmut);
+ }
+
+ return page_is_isolated;
+}
+
+/**
+ * kbase_mmu_free_pgd() - Free memory of the page directory
+ *
+ * @kbdev: Device pointer.
+ * @mmut: GPU MMU page table.
+ * @pgd: Physical address of page directory to be freed.
+ *
+ * This function is supposed to be called with mmu_lock held and after
+ * ensuring that GPU won't be able to access the page.
+ */
+static void kbase_mmu_free_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
+ phys_addr_t pgd)
+{
+ struct page *p;
+ bool page_is_isolated = false;
+
+ lockdep_assert_held(&mmut->mmu_lock);
+
+ p = pfn_to_page(PFN_DOWN(pgd));
+ page_is_isolated = kbase_mmu_handle_isolated_pgd_page(kbdev, mmut, p);
+
+ if (likely(!page_is_isolated)) {
+ kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, true);
+ kbase_mmu_account_freed_pgd(kbdev, mmut);
+ }
+}
+
+/**
+ * kbase_mmu_free_pgds_list() - Free the PGD pages present in the list
+ *
+ * @kbdev: Device pointer.
+ * @mmut: GPU MMU page table.
+ *
+ * This function will call kbase_mmu_free_pgd() on each page directory page
+ * present in the list of free PGDs inside @mmut.
+ *
+ * The function is supposed to be called after the GPU cache and MMU TLB has
+ * been invalidated post the teardown loop.
+ *
+ * The mmu_lock shall be held prior to calling the function.
+ */
+static void kbase_mmu_free_pgds_list(struct kbase_device *kbdev, struct kbase_mmu_table *mmut)
+{
+ size_t i;
+
+ lockdep_assert_held(&mmut->mmu_lock);
+
+ for (i = 0; i < mmut->scratch_mem.free_pgds.head_index; i++)
+ kbase_mmu_free_pgd(kbdev, mmut, page_to_phys(mmut->scratch_mem.free_pgds.pgds[i]));
+
+ mmut->scratch_mem.free_pgds.head_index = 0;
+}
+
+static void kbase_mmu_add_to_free_pgds_list(struct kbase_mmu_table *mmut, struct page *p)
+{
+ lockdep_assert_held(&mmut->mmu_lock);
+
+ if (WARN_ON_ONCE(mmut->scratch_mem.free_pgds.head_index > (MAX_FREE_PGDS - 1)))
+ return;
+
+ mmut->scratch_mem.free_pgds.pgds[mmut->scratch_mem.free_pgds.head_index++] = p;
+}
+
+static inline void kbase_mmu_reset_free_pgds_list(struct kbase_mmu_table *mmut)
+{
+ lockdep_assert_held(&mmut->mmu_lock);
+
+ mmut->scratch_mem.free_pgds.head_index = 0;
+}

 /**
 * reg_grow_calc_extra_pages() - Calculate the number of backed pages to add to
@@ -138,7 +523,7 @@
 if (!multiple) {
 dev_warn(
 kbdev->dev,
- "VA Region 0x%llx extension was 0, allocator needs to set this properly for KBASE_REG_PF_GROW\n",
+ "VA Region 0x%llx extension was 0, allocator needs to set this properly for KBASE_REG_PF_GROW",
 ((unsigned long long)reg->start_pfn) << PAGE_SHIFT);
 return minimum_extra;
 }
@@ -191,22 +576,70 @@
 }

 #ifdef CONFIG_MALI_CINSTR_GWT
-static void kbase_gpu_mmu_handle_write_faulting_as(
- struct kbase_device *kbdev,
- struct kbase_as *faulting_as,
- u64 start_pfn, size_t nr, u32 op)
+static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev,
+ struct kbase_as *faulting_as,
+ u64 start_pfn, size_t nr,
+ u32 kctx_id, u64 dirty_pgds)
 {
+ /* Calls to this function are inherently synchronous, with respect to
+ * MMU operations.
+ */
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC;
+ struct kbase_mmu_hw_op_param op_param;
+ int ret = 0;
+
 mutex_lock(&kbdev->mmu_hw_mutex);

 kbase_mmu_hw_clear_fault(kbdev, faulting_as,
 KBASE_MMU_FAULT_TYPE_PAGE);
- kbase_mmu_hw_do_operation(kbdev, faulting_as, start_pfn,
- nr, op, 1);
+
+ /* flush L2 and unlock the VA (resumes the MMU) */
+ op_param.vpfn = start_pfn;
+ op_param.nr = nr;
+ op_param.op = KBASE_MMU_OP_FLUSH_PT;
+ op_param.kctx_id = kctx_id;
+ op_param.mmu_sync_info = mmu_sync_info;
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags);
+ op_param.flush_skip_levels =
+ pgd_level_to_skip_flush(dirty_pgds);
+ ret = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, faulting_as, &op_param);
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags);
+ } else {
+ mmu_hw_operation_begin(kbdev);
+ ret = kbase_mmu_hw_do_flush(kbdev, faulting_as, &op_param);
+ mmu_hw_operation_end(kbdev);
+ }

 mutex_unlock(&kbdev->mmu_hw_mutex);

+ if (ret)
+ dev_err(kbdev->dev,
+ "Flush for GPU page fault due to write access did not complete");
+
 kbase_mmu_hw_enable_fault(kbdev, faulting_as,
 KBASE_MMU_FAULT_TYPE_PAGE);
+}
+
+static void set_gwt_element_page_addr_and_size(
+ struct kbasep_gwt_list_element *element,
+ u64 fault_page_addr, struct tagged_addr fault_phys)
+{
+ u64 fault_pfn = fault_page_addr >> PAGE_SHIFT;
+ unsigned int vindex = fault_pfn & (NUM_4K_PAGES_IN_2MB_PAGE - 1);
+
+ /* If the fault address lies within a 2MB page, then consider
+ * the whole 2MB page for dumping to avoid incomplete dumps.
+ */
+ if (is_huge(fault_phys) && (vindex == index_in_large_page(fault_phys))) {
+ element->page_addr = fault_page_addr & ~(SZ_2M - 1);
+ element->num_pages = NUM_4K_PAGES_IN_2MB_PAGE;
+ } else {
+ element->page_addr = fault_page_addr;
+ element->num_pages = 1;
+ }
 }

 static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx,
@@ -215,11 +648,11 @@
 struct kbasep_gwt_list_element *pos;
 struct kbase_va_region *region;
 struct kbase_device *kbdev;
+ struct tagged_addr *fault_phys_addr;
 struct kbase_fault *fault;
 u64 fault_pfn, pfn_offset;
- u32 op;
- int ret;
 int as_no;
+ u64 dirty_pgds = 0;

 as_no = faulting_as->number;
 kbdev = container_of(faulting_as, struct kbase_device, as[as_no]);
@@ -247,15 +680,18 @@
 return;
 }

+ pfn_offset = fault_pfn - region->start_pfn;
+ fault_phys_addr = &kbase_get_gpu_phy_pages(region)[pfn_offset];
+
 /* Capture addresses of faulting write location
 * for job dumping if write tracking is enabled.
 */
 if (kctx->gwt_enabled) {
- u64 page_addr = fault->addr & PAGE_MASK;
+ u64 fault_page_addr = fault->addr & PAGE_MASK;
 bool found = false;
 /* Check if this write was already handled. */
 list_for_each_entry(pos, &kctx->gwt_current_list, link) {
- if (page_addr == pos->page_addr) {
+ if (fault_page_addr == pos->page_addr) {
 found = true;
 break;
 }
@@ -265,8 +701,8 @@
 pos = kmalloc(sizeof(*pos), GFP_KERNEL);
 if (pos) {
 pos->region = region;
- pos->page_addr = page_addr;
- pos->num_pages = 1;
+ set_gwt_element_page_addr_and_size(pos,
+ fault_page_addr, *fault_phys_addr);
 list_add(&pos->link, &kctx->gwt_current_list);
 } else {
 dev_warn(kbdev->dev, "kmalloc failure");
@@ -274,17 +710,12 @@
 }
 }

- pfn_offset = fault_pfn - region->start_pfn;
 /* Now make this faulting page writable to GPU. */
- ret = kbase_mmu_update_pages_no_flush(kctx, fault_pfn,
- &kbase_get_gpu_phy_pages(region)[pfn_offset],
- 1, region->flags, region->gpu_alloc->group_id);
+ kbase_mmu_update_pages_no_flush(kbdev, &kctx->mmu, fault_pfn, fault_phys_addr, 1,
+ region->flags, region->gpu_alloc->group_id, &dirty_pgds);

- /* flush L2 and unlock the VA (resumes the MMU) */
- op = AS_COMMAND_FLUSH_PT;
-
- kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as,
- fault_pfn, 1, op);
+ kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, fault_pfn, 1,
+ kctx->id, dirty_pgds);

 kbase_gpu_vm_unlock(kctx);
 }
@@ -315,75 +746,32 @@
 }
 #endif

-#define MAX_POOL_LEVEL 2
-
 /**
- * page_fault_try_alloc - Try to allocate memory from a context pool
- * @kctx: Context pointer
- * @region: Region to grow
- * @new_pages: Number of 4 kB pages to allocate
- * @pages_to_grow: Pointer to variable to store number of outstanding pages on
- * failure. This can be either 4 kB or 2 MB pages, depending on
- * the number of pages requested.
- * @grow_2mb_pool: Pointer to variable to store which pool needs to grow - true
- * for 2 MB, false for 4 kB.
- * @prealloc_sas: Pointer to kbase_sub_alloc structures
+ * estimate_pool_space_required - Determine how much a pool should be grown by to support a future
+ * allocation
+ * @pool: The memory pool to check, including its linked pools
+ * @pages_required: Number of 4KiB pages require for the pool to support a future allocation
 *
- * This function will try to allocate as many pages as possible from the context
- * pool, then if required will try to allocate the remaining pages from the
- * device pool.
+ * The value returned is accounting for the size of @pool and the size of each memory pool linked to
+ * @pool. Hence, the caller should use @pool and (if not already satisfied) all its linked pools to
+ * allocate from.
 *
- * This function will not allocate any new memory beyond that that is already
- * present in the context or device pools. This is because it is intended to be
- * called with the vm_lock held, which could cause recursive locking if the
- * allocation caused the out-of-memory killer to run.
+ * Note: this is only an estimate, because even during the calculation the memory pool(s) involved
+ * can be updated to be larger or smaller. Hence, the result is only a guide as to whether an
+ * allocation could succeed, or an estimate of the correct amount to grow the pool by. The caller
+ * should keep attempting an allocation and then re-growing with a new value queried form this
+ * function until the allocation succeeds.
 *
- * If 2 MB pages are enabled and new_pages is >= 2 MB then pages_to_grow will be
- * a count of 2 MB pages, otherwise it will be a count of 4 kB pages.
- *
- * Return: true if successful, false on failure
+ * Return: an estimate of the amount of extra 4KiB pages in @pool that are required to satisfy an
+ * allocation, or 0 if @pool (including its linked pools) is likely to already satisfy the
+ * allocation.
 */
-static bool page_fault_try_alloc(struct kbase_context *kctx,
- struct kbase_va_region *region, size_t new_pages,
- int *pages_to_grow, bool *grow_2mb_pool,
- struct kbase_sub_alloc **prealloc_sas)
+static size_t estimate_pool_space_required(struct kbase_mem_pool *pool, const size_t pages_required)
 {
- struct tagged_addr *gpu_pages[MAX_POOL_LEVEL] = {NULL};
- struct tagged_addr *cpu_pages[MAX_POOL_LEVEL] = {NULL};
- size_t pages_alloced[MAX_POOL_LEVEL] = {0};
- struct kbase_mem_pool *pool, *root_pool;
- int pool_level = 0;
- bool alloc_failed = false;
 size_t pages_still_required;

- if (WARN_ON(region->gpu_alloc->group_id >=
- MEMORY_GROUP_MANAGER_NR_GROUPS)) {
- /* Do not try to grow the memory pool */
- *pages_to_grow = 0;
- return false;
- }
-
-#ifdef CONFIG_MALI_2MB_ALLOC
- if (new_pages >= (SZ_2M / SZ_4K)) {
- root_pool = &kctx->mem_pools.large[region->gpu_alloc->group_id];
- *grow_2mb_pool = true;
- } else {
-#endif
- root_pool = &kctx->mem_pools.small[region->gpu_alloc->group_id];
- *grow_2mb_pool = false;
-#ifdef CONFIG_MALI_2MB_ALLOC
- }
-#endif
-
- if (region->gpu_alloc != region->cpu_alloc)
- new_pages *= 2;
-
- pages_still_required = new_pages;
-
- /* Determine how many pages are in the pools before trying to allocate.
- * Don't attempt to allocate & free if the allocation can't succeed.
- */
- for (pool = root_pool; pool != NULL; pool = pool->next_pool) {
+ for (pages_still_required = pages_required; pool != NULL && pages_still_required;
+ pool = pool->next_pool) {
 size_t pool_size_4k;

 kbase_mem_pool_lock(pool);
@@ -395,10 +783,71 @@
 pages_still_required -= pool_size_4k;

 kbase_mem_pool_unlock(pool);
-
- if (!pages_still_required)
- break;
 }
+ return pages_still_required;
+}
+
+/**
+ * page_fault_try_alloc - Try to allocate memory from a context pool
+ * @kctx: Context pointer
+ * @region: Region to grow
+ * @new_pages: Number of 4 KiB pages to allocate
+ * @pages_to_grow: Pointer to variable to store number of outstanding pages on failure. This can be
+ * either 4 KiB or 2 MiB pages, depending on the number of pages requested.
+ * @grow_2mb_pool: Pointer to variable to store which pool needs to grow - true for 2 MiB, false for
+ * 4 KiB.
+ * @prealloc_sas: Pointer to kbase_sub_alloc structures
+ *
+ * This function will try to allocate as many pages as possible from the context pool, then if
+ * required will try to allocate the remaining pages from the device pool.
+ *
+ * This function will not allocate any new memory beyond that is already present in the context or
+ * device pools. This is because it is intended to be called whilst the thread has acquired the
+ * region list lock with kbase_gpu_vm_lock(), and a large enough memory allocation whilst that is
+ * held could invoke the OoM killer and cause an effective deadlock with kbase_cpu_vm_close().
+ *
+ * If 2 MiB pages are enabled and new_pages is >= 2 MiB then pages_to_grow will be a count of 2 MiB
+ * pages, otherwise it will be a count of 4 KiB pages.
+ *
+ * Return: true if successful, false on failure
+ */
+static bool page_fault_try_alloc(struct kbase_context *kctx,
+ struct kbase_va_region *region, size_t new_pages,
+ int *pages_to_grow, bool *grow_2mb_pool,
+ struct kbase_sub_alloc **prealloc_sas)
+{
+ size_t total_gpu_pages_alloced = 0;
+ size_t total_cpu_pages_alloced = 0;
+ struct kbase_mem_pool *pool, *root_pool;
+ bool alloc_failed = false;
+ size_t pages_still_required;
+ size_t total_mempools_free_4k = 0;
+
+ lockdep_assert_held(&kctx->reg_lock);
+ lockdep_assert_held(&kctx->mem_partials_lock);
+
+ if (WARN_ON(region->gpu_alloc->group_id >=
+ MEMORY_GROUP_MANAGER_NR_GROUPS)) {
+ /* Do not try to grow the memory pool */
+ *pages_to_grow = 0;
+ return false;
+ }
+
+ if (kctx->kbdev->pagesize_2mb && new_pages >= (SZ_2M / SZ_4K)) {
+ root_pool = &kctx->mem_pools.large[region->gpu_alloc->group_id];
+ *grow_2mb_pool = true;
+ } else {
+ root_pool = &kctx->mem_pools.small[region->gpu_alloc->group_id];
+ *grow_2mb_pool = false;
+ }
+
+ if (region->gpu_alloc != region->cpu_alloc)
+ new_pages *= 2;
+
+ /* Determine how many pages are in the pools before trying to allocate.
+ * Don't attempt to allocate & free if the allocation can't succeed.
+ */
+ pages_still_required = estimate_pool_space_required(root_pool, new_pages);

 if (pages_still_required) {
 /* Insufficient pages in pools. Don't try to allocate - just
@@ -409,11 +858,11 @@
 return false;
 }

- /* Since we've dropped the pool locks, the amount of memory in the pools
- * may change between the above check and the actual allocation.
+ /* Since we're not holding any of the mempool locks, the amount of memory in the pools may
+ * change between the above estimate and the actual allocation.
 */
- pool = root_pool;
- for (pool_level = 0; pool_level < MAX_POOL_LEVEL; pool_level++) {
+ pages_still_required = new_pages;
+ for (pool = root_pool; pool != NULL && pages_still_required; pool = pool->next_pool) {
 size_t pool_size_4k;
 size_t pages_to_alloc_4k;
 size_t pages_to_alloc_4k_per_alloc;
@@ -422,93 +871,91 @@

 /* Allocate as much as possible from this pool*/
 pool_size_4k = kbase_mem_pool_size(pool) << pool->order;
- pages_to_alloc_4k = MIN(new_pages, pool_size_4k);
+ total_mempools_free_4k += pool_size_4k;
+ pages_to_alloc_4k = MIN(pages_still_required, pool_size_4k);
 if (region->gpu_alloc == region->cpu_alloc)
 pages_to_alloc_4k_per_alloc = pages_to_alloc_4k;
 else
 pages_to_alloc_4k_per_alloc = pages_to_alloc_4k >> 1;

- pages_alloced[pool_level] = pages_to_alloc_4k;
 if (pages_to_alloc_4k) {
- gpu_pages[pool_level] =
- kbase_alloc_phy_pages_helper_locked(
- region->gpu_alloc, pool,
- pages_to_alloc_4k_per_alloc,
- &prealloc_sas[0]);
+ struct tagged_addr *gpu_pages =
+ kbase_alloc_phy_pages_helper_locked(region->gpu_alloc, pool,
+ pages_to_alloc_4k_per_alloc,
+ &prealloc_sas[0]);

- if (!gpu_pages[pool_level]) {
+ if (!gpu_pages)
 alloc_failed = true;
- } else if (region->gpu_alloc != region->cpu_alloc) {
- cpu_pages[pool_level] =
- kbase_alloc_phy_pages_helper_locked(
- region->cpu_alloc, pool,
- pages_to_alloc_4k_per_alloc,
- &prealloc_sas[1]);
+ else
+ total_gpu_pages_alloced += pages_to_alloc_4k_per_alloc;

- if (!cpu_pages[pool_level])
+ if (!alloc_failed && region->gpu_alloc != region->cpu_alloc) {
+ struct tagged_addr *cpu_pages = kbase_alloc_phy_pages_helper_locked(
+ region->cpu_alloc, pool, pages_to_alloc_4k_per_alloc,
+ &prealloc_sas[1]);
+
+ if (!cpu_pages)
 alloc_failed = true;
+ else
+ total_cpu_pages_alloced += pages_to_alloc_4k_per_alloc;
 }
 }

 kbase_mem_pool_unlock(pool);

 if (alloc_failed) {
- WARN_ON(!new_pages);
- WARN_ON(pages_to_alloc_4k >= new_pages);
- WARN_ON(pages_to_alloc_4k_per_alloc >= new_pages);
+ WARN_ON(!pages_still_required);
+ WARN_ON(pages_to_alloc_4k >= pages_still_required);
+ WARN_ON(pages_to_alloc_4k_per_alloc >= pages_still_required);
 break;
 }

- new_pages -= pages_to_alloc_4k;
-
- if (!new_pages)
- break;
-
- pool = pool->next_pool;
- if (!pool)
- break;
+ pages_still_required -= pages_to_alloc_4k;
 }

- if (new_pages) {
- /* Allocation was unsuccessful */
- int max_pool_level = pool_level;
-
- pool = root_pool;
-
- /* Free memory allocated so far */
- for (pool_level = 0; pool_level <= max_pool_level;
- pool_level++) {
- kbase_mem_pool_lock(pool);
-
- if (region->gpu_alloc != region->cpu_alloc) {
- if (pages_alloced[pool_level] &&
- cpu_pages[pool_level])
- kbase_free_phy_pages_helper_locked(
- region->cpu_alloc,
- pool, cpu_pages[pool_level],
- pages_alloced[pool_level]);
- }
-
- if (pages_alloced[pool_level] && gpu_pages[pool_level])
- kbase_free_phy_pages_helper_locked(
- region->gpu_alloc,
- pool, gpu_pages[pool_level],
- pages_alloced[pool_level]);
-
- kbase_mem_pool_unlock(pool);
-
- pool = pool->next_pool;
- }
-
- /*
- * If the allocation failed despite there being enough memory in
- * the pool, then just fail. Otherwise, try to grow the memory
- * pool.
+ if (pages_still_required) {
+ /* Allocation was unsuccessful. We have dropped the mem_pool lock after allocation,
+ * so must in any case use kbase_free_phy_pages_helper() rather than
+ * kbase_free_phy_pages_helper_locked()
 */
- if (alloc_failed)
+ if (total_gpu_pages_alloced > 0)
+ kbase_free_phy_pages_helper(region->gpu_alloc, total_gpu_pages_alloced);
+ if (region->gpu_alloc != region->cpu_alloc && total_cpu_pages_alloced > 0)
+ kbase_free_phy_pages_helper(region->cpu_alloc, total_cpu_pages_alloced);
+
+ if (alloc_failed) {
+ /* Note that in allocating from the above memory pools, we always ensure
+ * never to request more than is available in each pool with the pool's
+ * lock held. Hence failing to allocate in such situations would be unusual
+ * and we should cancel the growth instead (as re-growing the memory pool
+ * might not fix the situation)
+ */
+ dev_warn(
+ kctx->kbdev->dev,
+ "Page allocation failure of %zu pages: managed %zu pages, mempool (inc linked pools) had %zu pages available",
+ new_pages, total_gpu_pages_alloced + total_cpu_pages_alloced,
+ total_mempools_free_4k);
 *pages_to_grow = 0;
- else
- *pages_to_grow = new_pages;
+ } else {
+ /* Tell the caller to try to grow the memory pool
+ *
+ * Freeing pages above may have spilled or returned them to the OS, so we
+ * have to take into account how many are still in the pool before giving a
+ * new estimate for growth required of the pool. We can just re-estimate a
+ * new value.
+ */
+ pages_still_required = estimate_pool_space_required(root_pool, new_pages);
+ if (pages_still_required) {
+ *pages_to_grow = pages_still_required;
+ } else {
+ /* It's possible another thread could've grown the pool to be just
+ * big enough after we rolled back the allocation. Request at least
+ * one more page to ensure the caller doesn't fail the growth by
+ * conflating it with the alloc_failed case above
+ */
+ *pages_to_grow = 1u;
+ }
+ }

 return false;
 }
@@ -517,18 +964,6 @@
 *pages_to_grow = 0;

 return true;
-}
-
-/* Small wrapper function to factor out GPU-dependent context releasing */
-static void release_ctx(struct kbase_device *kbdev,
- struct kbase_context *kctx)
-{
-#if MALI_USE_CSF
- CSTD_UNUSED(kbdev);
- kbase_ctx_sched_release_ctx_lock(kctx);
-#else /* MALI_USE_CSF */
- kbasep_js_runpool_release_ctx(kbdev, kctx);
-#endif /* MALI_USE_CSF */
 }

 void kbase_mmu_page_fault_worker(struct work_struct *data)
@@ -554,15 +989,19 @@
 size_t pages_trimmed = 0;
 #endif

+ /* Calls to this function are inherently synchronous, with respect to
+ * MMU operations.
+ */
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC;
+
 faulting_as = container_of(data, struct kbase_as, work_pagefault);
 fault = &faulting_as->pf_data;
 fault_pfn = fault->addr >> PAGE_SHIFT;
 as_no = faulting_as->number;

 kbdev = container_of(faulting_as, struct kbase_device, as[as_no]);
- dev_dbg(kbdev->dev,
- "Entering %s %pK, fault_pfn %lld, as_no %d\n",
- __func__, (void *)data, fault_pfn, as_no);
+ dev_dbg(kbdev->dev, "Entering %s %pK, fault_pfn %lld, as_no %d", __func__, (void *)data,
+ fault_pfn, as_no);

 /* Grab the context that was already refcounted in kbase_mmu_interrupt()
 * Therefore, it cannot be scheduled out of this AS until we explicitly
@@ -585,8 +1024,7 @@
 #ifdef CONFIG_MALI_ARBITER_SUPPORT
 /* check if we still have GPU */
 if (unlikely(kbase_is_gpu_removed(kbdev))) {
- dev_dbg(kbdev->dev,
- "%s: GPU has been removed\n", __func__);
+ dev_dbg(kbdev->dev, "%s: GPU has been removed", __func__);
 goto fault_done;
 }
 #endif
@@ -649,20 +1087,24 @@
 goto fault_done;
 }

-#ifdef CONFIG_MALI_2MB_ALLOC
- /* Preallocate memory for the sub-allocation structs if necessary */
- for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) {
- prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL);
- if (!prealloc_sas[i]) {
- kbase_mmu_report_fault_and_kill(kctx, faulting_as,
- "Failed pre-allocating memory for sub-allocations' metadata",
- fault);
- goto fault_done;
+page_fault_retry:
+ if (kbdev->pagesize_2mb) {
+ /* Preallocate (or re-allocate) memory for the sub-allocation structs if necessary */
+ for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) {
+ if (!prealloc_sas[i]) {
+ prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL);
+
+ if (!prealloc_sas[i]) {
+ kbase_mmu_report_fault_and_kill(
+ kctx, faulting_as,
+ "Failed pre-allocating memory for sub-allocations' metadata",
+ fault);
+ goto fault_done;
+ }
+ }
 }
 }
-#endif /* CONFIG_MALI_2MB_ALLOC */

-page_fault_retry:
 /* so we have a translation fault,
 * let's see if it is for growable memory
 */
@@ -720,6 +1162,8 @@
 current_backed_size = kbase_reg_current_backed_size(region);

 if (fault_rel_pfn < current_backed_size) {
+ struct kbase_mmu_hw_op_param op_param;
+
 dev_dbg(kbdev->dev,
 "Page fault @ 0x%llx in allocated region 0x%llx-0x%llx of growable TMEM: Ignoring",
 fault->addr, region->start_pfn,
@@ -738,8 +1182,29 @@
 * transaction (which should cause the other page fault to be
 * raised again).
 */
- kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0,
- AS_COMMAND_UNLOCK, 1);
+ op_param.mmu_sync_info = mmu_sync_info;
+ op_param.kctx_id = kctx->id;
+ if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ mmu_hw_operation_begin(kbdev);
+ err = kbase_mmu_hw_do_unlock_no_addr(kbdev, faulting_as,
+ &op_param);
+ mmu_hw_operation_end(kbdev);
+ } else {
+ /* Can safely skip the invalidate for all levels in case
+ * of duplicate page faults.
+ */
+ op_param.flush_skip_levels = 0xF;
+ op_param.vpfn = fault_pfn;
+ op_param.nr = 1;
+ err = kbase_mmu_hw_do_unlock(kbdev, faulting_as,
+ &op_param);
+ }
+
+ if (err) {
+ dev_err(kbdev->dev,
+ "Invalidation for MMU did not complete on handling page fault @ 0x%llx",
+ fault->addr);
+ }

 mutex_unlock(&kbdev->mmu_hw_mutex);

@@ -754,18 +1219,41 @@

 /* cap to max vsize */
 new_pages = min(new_pages, region->nr_pages - current_backed_size);
- dev_dbg(kctx->kbdev->dev, "Allocate %zu pages on page fault\n",
- new_pages);
+ dev_dbg(kctx->kbdev->dev, "Allocate %zu pages on page fault", new_pages);

 if (new_pages == 0) {
+ struct kbase_mmu_hw_op_param op_param;
+
 mutex_lock(&kbdev->mmu_hw_mutex);

 /* Duplicate of a fault we've already handled, nothing to do */
 kbase_mmu_hw_clear_fault(kbdev, faulting_as,
 KBASE_MMU_FAULT_TYPE_PAGE);
+
 /* See comment [1] about UNLOCK usage */
- kbase_mmu_hw_do_operation(kbdev, faulting_as, 0, 0,
- AS_COMMAND_UNLOCK, 1);
+ op_param.mmu_sync_info = mmu_sync_info;
+ op_param.kctx_id = kctx->id;
+ if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ mmu_hw_operation_begin(kbdev);
+ err = kbase_mmu_hw_do_unlock_no_addr(kbdev, faulting_as,
+ &op_param);
+ mmu_hw_operation_end(kbdev);
+ } else {
+ /* Can safely skip the invalidate for all levels in case
+ * of duplicate page faults.
+ */
+ op_param.flush_skip_levels = 0xF;
+ op_param.vpfn = fault_pfn;
+ op_param.nr = 1;
+ err = kbase_mmu_hw_do_unlock(kbdev, faulting_as,
+ &op_param);
+ }
+
+ if (err) {
+ dev_err(kbdev->dev,
+ "Invalidation for MMU did not complete on handling page fault @ 0x%llx",
+ fault->addr);
+ }

 mutex_unlock(&kbdev->mmu_hw_mutex);

@@ -790,8 +1278,9 @@
 spin_unlock(&kctx->mem_partials_lock);

 if (grown) {
+ u64 dirty_pgds = 0;
 u64 pfn_offset;
- u32 op;
+ struct kbase_mmu_hw_op_param op_param;

 /* alloc success */
 WARN_ON(kbase_reg_current_backed_size(region) >
@@ -807,10 +1296,10 @@
 * so the no_flush version of insert_pages is used which allows
 * us to unlock the MMU as we see fit.
 */
- err = kbase_mmu_insert_pages_no_flush(kbdev, &kctx->mmu,
- region->start_pfn + pfn_offset,
- &kbase_get_gpu_phy_pages(region)[pfn_offset],
- new_pages, region->flags, region->gpu_alloc->group_id);
+ err = kbase_mmu_insert_pages_no_flush(
+ kbdev, &kctx->mmu, region->start_pfn + pfn_offset,
+ &kbase_get_gpu_phy_pages(region)[pfn_offset], new_pages, region->flags,
+ region->gpu_alloc->group_id, &dirty_pgds, region, false);
 if (err) {
 kbase_free_phy_pages_helper(region->gpu_alloc,
 new_pages);
@@ -829,23 +1318,18 @@
 (u64)new_pages);
 trace_mali_mmu_page_fault_grow(region, fault, new_pages);

-#if MALI_INCREMENTAL_RENDERING
+#if MALI_INCREMENTAL_RENDERING_JM
 /* Switch to incremental rendering if we have nearly run out of
 * memory in a JIT memory allocation.
 */
 if (region->threshold_pages &&
 kbase_reg_current_backed_size(region) >
 region->threshold_pages) {
-
- dev_dbg(kctx->kbdev->dev,
- "%zu pages exceeded IR threshold %zu\n",
- new_pages + current_backed_size,
- region->threshold_pages);
+ dev_dbg(kctx->kbdev->dev, "%zu pages exceeded IR threshold %zu",
+ new_pages + current_backed_size, region->threshold_pages);

 if (kbase_mmu_switch_to_ir(kctx, region) >= 0) {
- dev_dbg(kctx->kbdev->dev,
- "Get region %pK for IR\n",
- (void *)region);
+ dev_dbg(kctx->kbdev->dev, "Get region %pK for IR", (void *)region);
 kbase_va_region_alloc_get(kctx, region);
 }
 }
@@ -853,9 +1337,6 @@

 /* AS transaction begin */
 mutex_lock(&kbdev->mmu_hw_mutex);
-
- /* flush L2 and unlock the VA (resumes the MMU) */
- op = AS_COMMAND_FLUSH_PT;

 /* clear MMU interrupt - this needs to be done after updating
 * the page tables but before issuing a FLUSH command. The
@@ -868,9 +1349,30 @@
 kbase_mmu_hw_clear_fault(kbdev, faulting_as,
 KBASE_MMU_FAULT_TYPE_PAGE);

- kbase_mmu_hw_do_operation(kbdev, faulting_as,
- fault->addr >> PAGE_SHIFT,
- new_pages, op, 1);
+ op_param.vpfn = region->start_pfn + pfn_offset;
+ op_param.nr = new_pages;
+ op_param.op = KBASE_MMU_OP_FLUSH_PT;
+ op_param.kctx_id = kctx->id;
+ op_param.mmu_sync_info = mmu_sync_info;
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
+ /* Unlock to invalidate the TLB (and resume the MMU) */
+ op_param.flush_skip_levels =
+ pgd_level_to_skip_flush(dirty_pgds);
+ err = kbase_mmu_hw_do_unlock(kbdev, faulting_as,
+ &op_param);
+ } else {
+ /* flush L2 and unlock the VA (resumes the MMU) */
+ mmu_hw_operation_begin(kbdev);
+ err = kbase_mmu_hw_do_flush(kbdev, faulting_as,
+ &op_param);
+ mmu_hw_operation_end(kbdev);
+ }
+
+ if (err) {
+ dev_err(kbdev->dev,
+ "Flush for GPU page table update did not complete on handling page fault @ 0x%llx",
+ fault->addr);
+ }

 mutex_unlock(&kbdev->mmu_hw_mutex);
 /* AS transaction end */
@@ -915,8 +1417,7 @@
 * Otherwise fail the allocation.
 */
 if (pages_to_grow > 0) {
-#ifdef CONFIG_MALI_2MB_ALLOC
- if (grow_2mb_pool) {
+ if (kbdev->pagesize_2mb && grow_2mb_pool) {
 /* Round page requirement up to nearest 2 MB */
 struct kbase_mem_pool *const lp_mem_pool =
 &kctx->mem_pools.large[
@@ -927,25 +1428,22 @@
 >> lp_mem_pool->order;

 ret = kbase_mem_pool_grow(lp_mem_pool,
- pages_to_grow);
+ pages_to_grow, kctx->task);
 } else {
-#endif
 struct kbase_mem_pool *const mem_pool =
 &kctx->mem_pools.small[
 region->gpu_alloc->group_id];

 ret = kbase_mem_pool_grow(mem_pool,
- pages_to_grow);
-#ifdef CONFIG_MALI_2MB_ALLOC
+ pages_to_grow, kctx->task);
 }
-#endif
 }
 if (ret < 0) {
 /* failed to extend, handle as a normal PF */
 kbase_mmu_report_fault_and_kill(kctx, faulting_as,
 "Page allocation failure", fault);
 } else {
- dev_dbg(kbdev->dev, "Try again after pool_grow\n");
+ dev_dbg(kbdev->dev, "Try again after pool_grow");
 goto page_fault_retry;
 }
 }
@@ -972,27 +1470,25 @@
 release_ctx(kbdev, kctx);

 atomic_dec(&kbdev->faults_pending);
- dev_dbg(kbdev->dev, "Leaving page_fault_worker %pK\n", (void *)data);
+ dev_dbg(kbdev->dev, "Leaving page_fault_worker %pK", (void *)data);
 }

 static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev,
 struct kbase_mmu_table *mmut)
 {
 u64 *page;
- int i;
 struct page *p;
+ phys_addr_t pgd;

-#ifdef CONFIG_MALI_2MB_ALLOC
- p = kbase_mem_pool_alloc(&kbdev->mem_pools.large[mmut->group_id]);
-#else /* CONFIG_MALI_2MB_ALLOC */
 p = kbase_mem_pool_alloc(&kbdev->mem_pools.small[mmut->group_id]);
-#endif /* CONFIG_MALI_2MB_ALLOC */
 if (!p)
- return 0;
+ return KBASE_MMU_INVALID_PGD_ADDRESS;

 page = kmap(p);
 if (page == NULL)
 goto alloc_free;
+
+ pgd = page_to_phys(p);

 /* If the MMU tables belong to a context then account the memory usage
 * to that context, otherwise the MMU tables are device wide and are
@@ -1014,37 +1510,42 @@

 kbase_trace_gpu_mem_usage_inc(kbdev, mmut->kctx, 1);

- for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++)
- kbdev->mmu_mode->entry_invalidate(&page[i]);
+ kbdev->mmu_mode->entries_invalidate(page, KBASE_MMU_PAGE_ENTRIES);

- kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE);
+ /* As this page is newly created, therefore there is no content to
+ * clean or invalidate in the GPU caches.
+ */
+ kbase_mmu_sync_pgd_cpu(kbdev, kbase_dma_addr(p), PAGE_SIZE);

 kunmap(p);
- return page_to_phys(p);
+ return pgd;

 alloc_free:
-
-#ifdef CONFIG_MALI_2MB_ALLOC
- kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id], p, false);
-#else /* CONFIG_MALI_2MB_ALLOC */
 kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, false);
-#endif /* CONFIG_MALI_2MB_ALLOC */

- return 0;
+ return KBASE_MMU_INVALID_PGD_ADDRESS;
 }

-/* Given PGD PFN for level N, return PGD PFN for level N+1, allocating the
- * new table from the pool if needed and possible
+/**
+ * mmu_get_next_pgd() - Given PGD PFN for level N, return PGD PFN for level N+1
+ *
+ * @kbdev: Device pointer.
+ * @mmut: GPU MMU page table.
+ * @pgd: Physical addresse of level N page directory.
+ * @vpfn: The virtual page frame number.
+ * @level: The level of MMU page table (N).
+ *
+ * Return:
+ * * 0 - OK
+ * * -EFAULT - level N+1 PGD does not exist
+ * * -EINVAL - kmap() failed for level N PGD PFN
 */
-static int mmu_get_next_pgd(struct kbase_device *kbdev,
- struct kbase_mmu_table *mmut,
- phys_addr_t *pgd, u64 vpfn, int level)
+static int mmu_get_next_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
+ phys_addr_t *pgd, u64 vpfn, int level)
 {
 u64 *page;
 phys_addr_t target_pgd;
 struct page *p;
-
- KBASE_DEBUG_ASSERT(*pgd);

 lockdep_assert_held(&mmut->mmu_lock);

@@ -1058,25 +1559,19 @@
 p = pfn_to_page(PFN_DOWN(*pgd));
 page = kmap(p);
 if (page == NULL) {
- dev_warn(kbdev->dev, "%s: kmap failure\n", __func__);
+ dev_err(kbdev->dev, "%s: kmap failure", __func__);
 return -EINVAL;
 }

- target_pgd = kbdev->mmu_mode->pte_to_phy_addr(page[vpfn]);
-
- if (!target_pgd) {
- target_pgd = kbase_mmu_alloc_pgd(kbdev, mmut);
- if (!target_pgd) {
- dev_dbg(kbdev->dev, "%s: kbase_mmu_alloc_pgd failure\n",
- __func__);
- kunmap(p);
- return -ENOMEM;
- }
-
- kbdev->mmu_mode->entry_set_pte(&page[vpfn], target_pgd);
-
- kbase_mmu_sync_pgd(kbdev, kbase_dma_addr(p), PAGE_SIZE);
- /* Rely on the caller to update the address space flags. */
+ if (!kbdev->mmu_mode->pte_is_valid(page[vpfn], level)) {
+ dev_dbg(kbdev->dev, "%s: invalid PTE at level %d vpfn 0x%llx", __func__, level,
+ vpfn);
+ kunmap(p);
+ return -EFAULT;
+ } else {
+ target_pgd = kbdev->mmu_mode->pte_to_phy_addr(
+ kbdev->mgm_dev->ops.mgm_pte_to_original_pte(
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[vpfn]));
 }

 kunmap(p);
@@ -1085,14 +1580,69 @@
 return 0;
 }

-/*
- * Returns the PGD for the specified level of translation
+/**
+ * mmu_get_lowest_valid_pgd() - Find a valid PGD at or closest to in_level
+ *
+ * @kbdev: Device pointer.
+ * @mmut: GPU MMU page table.
+ * @vpfn: The virtual page frame number.
+ * @in_level: The level of MMU page table (N).
+ * @out_level: Set to the level of the lowest valid PGD found on success.
+ * Invalid on error.
+ * @out_pgd: Set to the lowest valid PGD found on success.
+ * Invalid on error.
+ *
+ * Does a page table walk starting from top level (L0) to in_level to find a valid PGD at or
+ * closest to in_level
+ *
+ * Terminology:
+ * Level-0 = Top-level = highest
+ * Level-3 = Bottom-level = lowest
+ *
+ * Return:
+ * * 0 - OK
+ * * -EINVAL - kmap() failed during page table walk.
 */
-static int mmu_get_pgd_at_level(struct kbase_device *kbdev,
- struct kbase_mmu_table *mmut,
- u64 vpfn,
- int level,
- phys_addr_t *out_pgd)
+static int mmu_get_lowest_valid_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
+ u64 vpfn, int in_level, int *out_level, phys_addr_t *out_pgd)
+{
+ phys_addr_t pgd;
+ int l;
+ int err = 0;
+
+ lockdep_assert_held(&mmut->mmu_lock);
+ pgd = mmut->pgd;
+
+ for (l = MIDGARD_MMU_TOPLEVEL; l < in_level; l++) {
+ err = mmu_get_next_pgd(kbdev, mmut, &pgd, vpfn, l);
+
+ /* Handle failure condition */
+ if (err) {
+ dev_dbg(kbdev->dev,
+ "%s: mmu_get_next_pgd() failed to find a valid pgd at level %d",
+ __func__, l + 1);
+ break;
+ }
+ }
+
+ *out_pgd = pgd;
+ *out_level = l;
+
+ /* -EFAULT indicates that pgd param was valid but the next pgd entry at vpfn was invalid.
+ * This implies that we have found the lowest valid pgd. Reset the error code.
+ */
+ if (err == -EFAULT)
+ err = 0;
+
+ return err;
+}
+
+/*
+ * On success, sets out_pgd to the PGD for the specified level of translation
+ * Returns -EFAULT if a valid PGD is not found
+ */
+static int mmu_get_pgd_at_level(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn,
+ int level, phys_addr_t *out_pgd)
 {
 phys_addr_t pgd;
 int l;
....@@ -1104,9 +1654,9 @@
11041654 int err = mmu_get_next_pgd(kbdev, mmut, &pgd, vpfn, l);
11051655 /* Handle failure condition */
11061656 if (err) {
1107
- dev_dbg(kbdev->dev,
1108
- "%s: mmu_get_next_pgd failure at level %d\n",
1109
- __func__, l);
1657
+ dev_err(kbdev->dev,
1658
+ "%s: mmu_get_next_pgd() failed to find a valid pgd at level %d",
1659
+ __func__, l + 1);
11101660 return err;
11111661 }
11121662 }
....@@ -1116,20 +1666,11 @@
11161666 return 0;
11171667 }
11181668
1119
-static int mmu_get_bottom_pgd(struct kbase_device *kbdev,
1120
- struct kbase_mmu_table *mmut,
1121
- u64 vpfn,
1122
- phys_addr_t *out_pgd)
1123
-{
1124
- return mmu_get_pgd_at_level(kbdev, mmut, vpfn, MIDGARD_MMU_BOTTOMLEVEL,
1125
- out_pgd);
1126
-}
1127
-
11281669 static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev,
1129
- struct kbase_mmu_table *mmut,
1130
- u64 from_vpfn, u64 to_vpfn)
1670
+ struct kbase_mmu_table *mmut, u64 from_vpfn,
1671
+ u64 to_vpfn, u64 *dirty_pgds,
1672
+ struct tagged_addr *phys, bool ignore_page_migration)
11311673 {
1132
- phys_addr_t pgd;
11331674 u64 vpfn = from_vpfn;
11341675 struct kbase_mmu_mode const *mmu_mode;
11351676
....@@ -1140,30 +1681,36 @@
11401681 lockdep_assert_held(&mmut->mmu_lock);
11411682
11421683 mmu_mode = kbdev->mmu_mode;
1684
+ kbase_mmu_reset_free_pgds_list(mmut);
11431685
11441686 while (vpfn < to_vpfn) {
1145
- unsigned int i;
11461687 unsigned int idx = vpfn & 0x1FF;
11471688 unsigned int count = KBASE_MMU_PAGE_ENTRIES - idx;
11481689 unsigned int pcount = 0;
11491690 unsigned int left = to_vpfn - vpfn;
11501691 int level;
11511692 u64 *page;
1693
+ phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1];
1694
+ phys_addr_t pgd = mmut->pgd;
1695
+ struct page *p = phys_to_page(pgd);
1696
+
1697
+ register unsigned int num_of_valid_entries;
11521698
11531699 if (count > left)
11541700 count = left;
11551701
11561702 /* need to check if this is a 2MB page or a 4kB */
1157
- pgd = mmut->pgd;
1158
-
11591703 for (level = MIDGARD_MMU_TOPLEVEL;
11601704 level <= MIDGARD_MMU_BOTTOMLEVEL; level++) {
11611705 idx = (vpfn >> ((3 - level) * 9)) & 0x1FF;
1162
- page = kmap(phys_to_page(pgd));
1706
+ pgds[level] = pgd;
1707
+ page = kmap(p);
11631708 if (mmu_mode->ate_is_valid(page[idx], level))
11641709 break; /* keep the mapping */
1165
- kunmap(phys_to_page(pgd));
1166
- pgd = mmu_mode->pte_to_phy_addr(page[idx]);
1710
+ kunmap(p);
1711
+ pgd = mmu_mode->pte_to_phy_addr(kbdev->mgm_dev->ops.mgm_pte_to_original_pte(
1712
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[idx]));
1713
+ p = phys_to_page(pgd);
11671714 }
11681715
11691716 switch (level) {
....@@ -1176,49 +1723,311 @@
11761723 pcount = count;
11771724 break;
11781725 default:
1179
- dev_warn(kbdev->dev, "%sNo support for ATEs at level %d\n",
1180
- __func__, level);
1726
+ dev_warn(kbdev->dev, "%sNo support for ATEs at level %d", __func__, level);
11811727 goto next;
11821728 }
11831729
1730
+ if (dirty_pgds && pcount > 0)
1731
+ *dirty_pgds |= 1ULL << level;
1732
+
1733
+ num_of_valid_entries = mmu_mode->get_num_valid_entries(page);
1734
+ if (WARN_ON_ONCE(num_of_valid_entries < pcount))
1735
+ num_of_valid_entries = 0;
1736
+ else
1737
+ num_of_valid_entries -= pcount;
1738
+
11841739 /* Invalidate the entries we added */
1185
- for (i = 0; i < pcount; i++)
1186
- mmu_mode->entry_invalidate(&page[idx + i]);
1740
+ mmu_mode->entries_invalidate(&page[idx], pcount);
11871741
1188
- kbase_mmu_sync_pgd(kbdev,
1189
- kbase_dma_addr(phys_to_page(pgd)) + 8 * idx,
1190
- 8 * pcount);
1191
- kunmap(phys_to_page(pgd));
1742
+ if (!num_of_valid_entries) {
1743
+ kunmap(p);
11921744
1745
+ kbase_mmu_add_to_free_pgds_list(mmut, p);
1746
+
1747
+ kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level,
1748
+ KBASE_MMU_OP_NONE, dirty_pgds);
1749
+ vpfn += count;
1750
+ continue;
1751
+ }
1752
+
1753
+ mmu_mode->set_num_valid_entries(page, num_of_valid_entries);
1754
+
1755
+ /* MMU cache flush strategy is NONE because GPU cache maintenance is
1756
+ * going to be done by the caller
1757
+ */
1758
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (idx * sizeof(u64)),
1759
+ kbase_dma_addr(p) + sizeof(u64) * idx, sizeof(u64) * pcount,
1760
+ KBASE_MMU_OP_NONE);
1761
+ kunmap(p);
11931762 next:
11941763 vpfn += count;
11951764 }
1765
+
1766
+ /* If page migration is enabled: the only way to recover from failure
1767
+ * is to mark all pages as not movable. It is not predictable what's
1768
+ * going to happen to these pages at this stage. They might return
1769
+ * movable once they are returned to a memory pool.
1770
+ */
1771
+ if (kbase_page_migration_enabled && !ignore_page_migration && phys) {
1772
+ const u64 num_pages = to_vpfn - from_vpfn + 1;
1773
+ u64 i;
1774
+
1775
+ for (i = 0; i < num_pages; i++) {
1776
+ struct page *phys_page = as_page(phys[i]);
1777
+ struct kbase_page_metadata *page_md = kbase_page_private(phys_page);
1778
+
1779
+ if (page_md) {
1780
+ spin_lock(&page_md->migrate_lock);
1781
+ page_md->status = PAGE_STATUS_SET(page_md->status, (u8)NOT_MOVABLE);
1782
+ spin_unlock(&page_md->migrate_lock);
1783
+ }
1784
+ }
1785
+ }
11961786 }
11971787
1198
-/*
1199
- * Map the single page 'phys' 'nr' of times, starting at GPU PFN 'vpfn'
1788
+static void mmu_flush_invalidate_insert_pages(struct kbase_device *kbdev,
1789
+ struct kbase_mmu_table *mmut, const u64 vpfn,
1790
+ size_t nr, u64 dirty_pgds,
1791
+ enum kbase_caller_mmu_sync_info mmu_sync_info,
1792
+ bool insert_pages_failed)
1793
+{
1794
+ struct kbase_mmu_hw_op_param op_param;
1795
+ int as_nr = 0;
1796
+
1797
+ op_param.vpfn = vpfn;
1798
+ op_param.nr = nr;
1799
+ op_param.op = KBASE_MMU_OP_FLUSH_PT;
1800
+ op_param.mmu_sync_info = mmu_sync_info;
1801
+ op_param.kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF;
1802
+ op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds);
1803
+
1804
+#if MALI_USE_CSF
1805
+ as_nr = mmut->kctx ? mmut->kctx->as_nr : MCU_AS_NR;
1806
+#else
1807
+ WARN_ON(!mmut->kctx);
1808
+#endif
1809
+
1810
+ /* MMU cache flush strategy depends on whether GPU control commands for
1811
+ * flushing physical address ranges are supported. The new physical pages
1812
+ * are not present in GPU caches therefore they don't need any cache
1813
+ * maintenance, but PGDs in the page table may or may not be created anew.
1814
+ *
1815
+ * Operations that affect the whole GPU cache shall only be done if it's
1816
+ * impossible to update physical ranges.
1817
+ *
1818
+ * On GPUs where flushing by physical address range is supported,
1819
+ * full cache flush is done when an error occurs during
1820
+ * insert_pages() to keep the error handling simpler.
1821
+ */
1822
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev) && !insert_pages_failed)
1823
+ mmu_invalidate(kbdev, mmut->kctx, as_nr, &op_param);
1824
+ else
1825
+ mmu_flush_invalidate(kbdev, mmut->kctx, as_nr, &op_param);
1826
+}
1827
+
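A note on the dirty_pgds value consumed above: the insert and teardown paths set bit N whenever a level-N PGD is written, and flush_skip_levels is derived from it so that the MMU flush can skip levels that were not touched (kbase_mmu_disable() below feeds 0xF into the same conversion precisely to prevent any level from being skipped). A small self-contained sketch of that bookkeeping, assuming the dirty-to-skip conversion is a plain complement over the four levels, as that 0xF usage suggests; the helper name is invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define PGD_LEVELS	4

/* Assumed conversion: a level may be skipped by the flush iff it was not dirtied */
static uint64_t dirty_to_skip_levels(uint64_t dirty_pgds)
{
	return ~dirty_pgds & ((1ULL << PGD_LEVELS) - 1);
}

int main(void)
{
	uint64_t dirty_pgds = 0;

	/* Suppose an insert wrote the bottom level (3) and its parent (2) */
	dirty_pgds |= 1ULL << 3;
	dirty_pgds |= 1ULL << 2;

	/* Levels 0 and 1 were untouched, so only they may be skipped: 0x3 */
	printf("dirty=0x%llx skip=0x%llx\n", (unsigned long long)dirty_pgds,
	       (unsigned long long)dirty_to_skip_levels(dirty_pgds));

	return 0;
}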
1828
+/**
1829
+ * update_parent_pgds() - Updates the page table from bottom level towards
1830
+ * the top level to insert a new ATE
1831
+ *
1832
+ * @kbdev: Device pointer.
1833
+ * @mmut: GPU MMU page table.
1834
+ * @cur_level: The level of MMU page table where the ATE needs to be added.
1835
+ * The bottom PGD level.
1836
+ * @insert_level: The level of MMU page table where the chain of newly allocated
1837
+ * PGDs needs to be linked-in/inserted.
1838
+ * The top-most PGD level to be updated.
1839
+ * @insert_vpfn: The virtual page frame number for the ATE.
1840
+ * @pgds_to_insert: Ptr to an array (size MIDGARD_MMU_BOTTOMLEVEL+1) that contains
1841
+ * the physical addresses of newly allocated PGDs from index
1842
+ * insert_level+1 to cur_level, and an existing PGD at index
1843
+ * insert_level.
1844
+ *
1845
+ * The newly allocated PGDs are linked from the bottom level up and inserted into the PGD
1846
+ * at insert_level which already exists in the MMU Page Tables. Migration status is also
1847
+ * updated for all the newly allocated PGD pages.
1848
+ *
1849
+ * Return:
1850
+ * * 0 - OK
1851
+ * * -EFAULT - level N+1 PGD does not exist
1852
+ * * -EINVAL - kmap() failed for level N PGD PFN
12001853 */
1201
-int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn,
1202
- struct tagged_addr phys, size_t nr,
1203
- unsigned long flags, int const group_id)
1854
+static int update_parent_pgds(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
1855
+ int cur_level, int insert_level, u64 insert_vpfn,
1856
+ phys_addr_t *pgds_to_insert)
1857
+{
1858
+ int pgd_index;
1859
+ int err = 0;
1860
+
1861
+ /* Add a PTE for the new PGD page at pgd_index into the parent PGD at (pgd_index-1)
1862
+ * Loop runs from the bottom-most to the top-most level so that all entries in the chain
1863
+ * are valid when they are inserted into the MMU Page table via the insert_level PGD.
1864
+ */
1865
+ for (pgd_index = cur_level; pgd_index > insert_level; pgd_index--) {
1866
+ int parent_index = pgd_index - 1;
1867
+ phys_addr_t parent_pgd = pgds_to_insert[parent_index];
1868
+ unsigned int current_valid_entries;
1869
+ u64 pte;
1870
+ phys_addr_t target_pgd = pgds_to_insert[pgd_index];
1871
+ u64 parent_vpfn = (insert_vpfn >> ((3 - parent_index) * 9)) & 0x1FF;
1872
+ struct page *parent_page = pfn_to_page(PFN_DOWN(parent_pgd));
1873
+ u64 *parent_page_va;
1874
+
1875
+ if (WARN_ON_ONCE(target_pgd == KBASE_MMU_INVALID_PGD_ADDRESS)) {
1876
+ err = -EFAULT;
1877
+ goto failure_recovery;
1878
+ }
1879
+
1880
+ parent_page_va = kmap(parent_page);
1881
+ if (unlikely(parent_page_va == NULL)) {
1882
+ dev_err(kbdev->dev, "%s: kmap failure", __func__);
1883
+ err = -EINVAL;
1884
+ goto failure_recovery;
1885
+ }
1886
+
1887
+ current_valid_entries = kbdev->mmu_mode->get_num_valid_entries(parent_page_va);
1888
+
1889
+ kbdev->mmu_mode->entry_set_pte(&pte, target_pgd);
1890
+ parent_page_va[parent_vpfn] = kbdev->mgm_dev->ops.mgm_update_gpu_pte(
1891
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, parent_index, pte);
1892
+ kbdev->mmu_mode->set_num_valid_entries(parent_page_va, current_valid_entries + 1);
1893
+ kunmap(parent_page);
1894
+
1895
+ if (parent_index != insert_level) {
1896
+ /* Newly allocated PGDs */
1897
+ kbase_mmu_sync_pgd_cpu(
1898
+ kbdev, kbase_dma_addr(parent_page) + (parent_vpfn * sizeof(u64)),
1899
+ sizeof(u64));
1900
+ } else {
1901
+ /* A new valid entry is added to an existing PGD. Perform the
1902
+ * invalidate operation for GPU cache as it could be having a
1903
+ * cacheline that contains the entry (in an invalid form).
1904
+ */
1905
+ kbase_mmu_sync_pgd(
1906
+ kbdev, mmut->kctx, parent_pgd + (parent_vpfn * sizeof(u64)),
1907
+ kbase_dma_addr(parent_page) + (parent_vpfn * sizeof(u64)),
1908
+ sizeof(u64), KBASE_MMU_OP_FLUSH_PT);
1909
+ }
1910
+
1911
+ /* Update the new target_pgd page to its stable state */
1912
+ if (kbase_page_migration_enabled) {
1913
+ struct kbase_page_metadata *page_md =
1914
+ kbase_page_private(phys_to_page(target_pgd));
1915
+
1916
+ spin_lock(&page_md->migrate_lock);
1917
+
1918
+ WARN_ON_ONCE(PAGE_STATUS_GET(page_md->status) != ALLOCATE_IN_PROGRESS ||
1919
+ IS_PAGE_ISOLATED(page_md->status));
1920
+
1921
+ if (mmut->kctx) {
1922
+ page_md->status = PAGE_STATUS_SET(page_md->status, PT_MAPPED);
1923
+ page_md->data.pt_mapped.mmut = mmut;
1924
+ page_md->data.pt_mapped.pgd_vpfn_level =
1925
+ PGD_VPFN_LEVEL_SET(insert_vpfn, parent_index);
1926
+ } else {
1927
+ page_md->status = PAGE_STATUS_SET(page_md->status, NOT_MOVABLE);
1928
+ }
1929
+
1930
+ spin_unlock(&page_md->migrate_lock);
1931
+ }
1932
+ }
1933
+
1934
+ return 0;
1935
+
1936
+failure_recovery:
1937
+ /* Cleanup PTEs from PGDs. The Parent PGD in the loop above is just "PGD" here */
1938
+ for (; pgd_index < cur_level; pgd_index++) {
1939
+ phys_addr_t pgd = pgds_to_insert[pgd_index];
1940
+ struct page *pgd_page = pfn_to_page(PFN_DOWN(pgd));
1941
+ u64 *pgd_page_va = kmap(pgd_page);
1942
+ u64 vpfn = (insert_vpfn >> ((3 - pgd_index) * 9)) & 0x1FF;
1943
+
1944
+ kbdev->mmu_mode->entries_invalidate(&pgd_page_va[vpfn], 1);
1945
+ kunmap(pgd_page);
1946
+ }
1947
+
1948
+ return err;
1949
+}
1950
+
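The loop direction in update_parent_pgds() is the important detail: the chain of new PGDs is wired from the deepest parent upwards, so the final write (the one into the PGD that already exists at insert_level) is what makes the whole chain reachable, and by then every deeper PGD already holds a valid entry. A trivial standalone model of that ordering; no kbase types are used and the level values are illustrative.

#include <stdio.h>

#define LEVELS 4

int main(void)
{
	int write_step[LEVELS] = { -1, -1, -1, -1 };
	int cur_level = 3;	/* level that will hold the new ATE */
	int insert_level = 0;	/* deepest PGD that already exists */
	int pgd_index, step = 0;

	/* Same traversal as update_parent_pgds(): deepest parent first */
	for (pgd_index = cur_level; pgd_index > insert_level; pgd_index--)
		write_step[pgd_index - 1] = step++;

	/* The pre-existing PGD at insert_level is written last of all */
	for (pgd_index = insert_level; pgd_index < cur_level; pgd_index++)
		printf("parent PGD at level %d written at step %d\n",
		       pgd_index, write_step[pgd_index]);

	return 0;
}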
1951
+/**
1952
+ * mmu_insert_alloc_pgds() - allocate memory for PGDs from level_low to
1953
+ * level_high (inclusive)
1954
+ *
1955
+ * @kbdev: Device pointer.
1956
+ * @mmut: GPU MMU page table.
1957
+ * @level_low: The lower bound for the levels for which the PGD allocs are required
1958
+ * @level_high: The higher bound for the levels for which the PGD allocs are required
1959
+ * @new_pgds: Ptr to an array (size MIDGARD_MMU_BOTTOMLEVEL+1) to write the
1960
+ * newly allocated PGD addresses to.
1961
+ *
1962
+ * Numerically, level_low < level_high, not to be confused with top level and
1963
+ * bottom level concepts for MMU PGDs. They are only used as low and high bounds
1964
+ * in an incrementing for-loop.
1965
+ *
1966
+ * Return:
1967
+ * * 0 - OK
1968
+ * * -ENOMEM - allocation failed for a PGD.
1969
+ */
1970
+static int mmu_insert_alloc_pgds(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
1971
+ phys_addr_t *new_pgds, int level_low, int level_high)
1972
+{
1973
+ int err = 0;
1974
+ int i;
1975
+
1976
+ lockdep_assert_held(&mmut->mmu_lock);
1977
+
1978
+ for (i = level_low; i <= level_high; i++) {
1979
+ do {
1980
+ new_pgds[i] = kbase_mmu_alloc_pgd(kbdev, mmut);
1981
+ if (new_pgds[i] != KBASE_MMU_INVALID_PGD_ADDRESS)
1982
+ break;
1983
+
1984
+ mutex_unlock(&mmut->mmu_lock);
1985
+ err = kbase_mem_pool_grow(&kbdev->mem_pools.small[mmut->group_id],
1986
+ level_high, NULL);
1987
+ mutex_lock(&mmut->mmu_lock);
1988
+ if (err) {
1989
+ dev_err(kbdev->dev, "%s: kbase_mem_pool_grow() returned error %d",
1990
+ __func__, err);
1991
+
1992
+ /* Free all PGDs allocated in previous successful iterations
1993
+ * from (i-1) to level_low
1994
+ */
1995
+ for (i = (i - 1); i >= level_low; i--) {
1996
+ if (new_pgds[i] != KBASE_MMU_INVALID_PGD_ADDRESS)
1997
+ kbase_mmu_free_pgd(kbdev, mmut, new_pgds[i]);
1998
+ }
1999
+
2000
+ return err;
2001
+ }
2002
+ } while (1);
2003
+ }
2004
+
2005
+ return 0;
2006
+}
2007
+
2008
+int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 start_vpfn,
2009
+ struct tagged_addr phys, size_t nr, unsigned long flags,
2010
+ int const group_id, enum kbase_caller_mmu_sync_info mmu_sync_info,
2011
+ bool ignore_page_migration)
12042012 {
12052013 phys_addr_t pgd;
12062014 u64 *pgd_page;
1207
- /* In case the insert_single_page only partially completes
1208
- * we need to be able to recover
1209
- */
1210
- bool recover_required = false;
1211
- u64 start_vpfn = vpfn;
1212
- size_t recover_count = 0;
2015
+ u64 insert_vpfn = start_vpfn;
12132016 size_t remain = nr;
12142017 int err;
12152018 struct kbase_device *kbdev;
2019
+ u64 dirty_pgds = 0;
2020
+ unsigned int i;
2021
+ phys_addr_t new_pgds[MIDGARD_MMU_BOTTOMLEVEL + 1];
2022
+ enum kbase_mmu_op_type flush_op;
2023
+ struct kbase_mmu_table *mmut = &kctx->mmu;
2024
+ int l, cur_level, insert_level;
12162025
12172026 if (WARN_ON(kctx == NULL))
12182027 return -EINVAL;
12192028
12202029 /* 64-bit address range is the max */
1221
- KBASE_DEBUG_ASSERT(vpfn <= (U64_MAX / PAGE_SIZE));
2030
+ KBASE_DEBUG_ASSERT(start_vpfn <= (U64_MAX / PAGE_SIZE));
12222031
12232032 kbdev = kctx->kbdev;
12242033
....@@ -1226,76 +2035,87 @@
12262035 if (nr == 0)
12272036 return 0;
12282037
1229
- mutex_lock(&kctx->mmu.mmu_lock);
2038
+ /* If page migration is enabled, pages involved in multiple GPU mappings
2039
+ * are always treated as not movable.
2040
+ */
2041
+ if (kbase_page_migration_enabled && !ignore_page_migration) {
2042
+ struct page *phys_page = as_page(phys);
2043
+ struct kbase_page_metadata *page_md = kbase_page_private(phys_page);
2044
+
2045
+ if (page_md) {
2046
+ spin_lock(&page_md->migrate_lock);
2047
+ page_md->status = PAGE_STATUS_SET(page_md->status, (u8)NOT_MOVABLE);
2048
+ spin_unlock(&page_md->migrate_lock);
2049
+ }
2050
+ }
2051
+
2052
+ mutex_lock(&mmut->mmu_lock);
12302053
12312054 while (remain) {
1232
- unsigned int i;
1233
- unsigned int index = vpfn & 0x1FF;
1234
- unsigned int count = KBASE_MMU_PAGE_ENTRIES - index;
2055
+ unsigned int vindex = insert_vpfn & 0x1FF;
2056
+ unsigned int count = KBASE_MMU_PAGE_ENTRIES - vindex;
12352057 struct page *p;
2058
+ register unsigned int num_of_valid_entries;
2059
+ bool newly_created_pgd = false;
12362060
12372061 if (count > remain)
12382062 count = remain;
12392063
2064
+ cur_level = MIDGARD_MMU_BOTTOMLEVEL;
2065
+ insert_level = cur_level;
2066
+
12402067 /*
1241
- * Repeatedly calling mmu_get_bottom_pte() is clearly
2068
+ * Repeatedly calling mmu_get_lowest_valid_pgd() is clearly
12422069 * suboptimal. We don't have to re-parse the whole tree
12432070 * each time (just cache the l0-l2 sequence).
12442071 * On the other hand, it's only a gain when we map more than
12452072 * 256 pages at once (on average). Do we really care?
12462073 */
1247
- do {
1248
- err = mmu_get_bottom_pgd(kbdev, &kctx->mmu,
1249
- vpfn, &pgd);
1250
- if (err != -ENOMEM)
1251
- break;
1252
- /* Fill the memory pool with enough pages for
1253
- * the page walk to succeed
1254
- */
1255
- mutex_unlock(&kctx->mmu.mmu_lock);
1256
- err = kbase_mem_pool_grow(
1257
-#ifdef CONFIG_MALI_2MB_ALLOC
1258
- &kbdev->mem_pools.large[
1259
-#else
1260
- &kbdev->mem_pools.small[
1261
-#endif
1262
- kctx->mmu.group_id],
1263
- MIDGARD_MMU_BOTTOMLEVEL);
1264
- mutex_lock(&kctx->mmu.mmu_lock);
1265
- } while (!err);
2074
+ /* insert_level < cur_level if there's no valid PGD for cur_level and insert_vpn */
2075
+ err = mmu_get_lowest_valid_pgd(kbdev, mmut, insert_vpfn, cur_level, &insert_level,
2076
+ &pgd);
2077
+
12662078 if (err) {
1267
- dev_warn(kbdev->dev, "kbase_mmu_insert_pages: mmu_get_bottom_pgd failure\n");
1268
- if (recover_required) {
1269
- /* Invalidate the pages we have partially
1270
- * completed
1271
- */
1272
- mmu_insert_pages_failure_recovery(kbdev,
1273
- &kctx->mmu,
1274
- start_vpfn,
1275
- start_vpfn + recover_count);
1276
- }
2079
+ dev_err(kbdev->dev, "%s: mmu_get_lowest_valid_pgd() returned error %d",
2080
+ __func__, err);
12772081 goto fail_unlock;
2082
+ }
2083
+
2084
+ /* No valid pgd at cur_level */
2085
+ if (insert_level != cur_level) {
2086
+ /* Allocate new pgds for all missing levels from the required level
2087
+ * down to the lowest valid pgd at insert_level
2088
+ */
2089
+ err = mmu_insert_alloc_pgds(kbdev, mmut, new_pgds, (insert_level + 1),
2090
+ cur_level);
2091
+ if (err)
2092
+ goto fail_unlock;
2093
+
2094
+ newly_created_pgd = true;
2095
+
2096
+ new_pgds[insert_level] = pgd;
2097
+
2098
+ /* If we didn't find an existing valid pgd at cur_level,
2099
+ * we've now allocated one. The ATE in the next step should
2100
+ * be inserted in this newly allocated pgd.
2101
+ */
2102
+ pgd = new_pgds[cur_level];
12782103 }
12792104
12802105 p = pfn_to_page(PFN_DOWN(pgd));
12812106 pgd_page = kmap(p);
12822107 if (!pgd_page) {
1283
- dev_warn(kbdev->dev, "kbase_mmu_insert_pages: kmap failure\n");
1284
- if (recover_required) {
1285
- /* Invalidate the pages we have partially
1286
- * completed
1287
- */
1288
- mmu_insert_pages_failure_recovery(kbdev,
1289
- &kctx->mmu,
1290
- start_vpfn,
1291
- start_vpfn + recover_count);
1292
- }
2108
+ dev_err(kbdev->dev, "%s: kmap failure", __func__);
12932109 err = -ENOMEM;
1294
- goto fail_unlock;
2110
+
2111
+ goto fail_unlock_free_pgds;
12952112 }
12962113
2114
+ num_of_valid_entries =
2115
+ kbdev->mmu_mode->get_num_valid_entries(pgd_page);
2116
+
12972117 for (i = 0; i < count; i++) {
1298
- unsigned int ofs = index + i;
2118
+ unsigned int ofs = vindex + i;
12992119
13002120 /* Fail if the current page is a valid ATE entry */
13012121 KBASE_DEBUG_ASSERT(0 == (pgd_page[ofs] & 1UL));
....@@ -1304,56 +2124,167 @@
13042124 phys, flags, MIDGARD_MMU_BOTTOMLEVEL, group_id);
13052125 }
13062126
1307
- vpfn += count;
1308
- remain -= count;
2127
+ kbdev->mmu_mode->set_num_valid_entries(
2128
+ pgd_page, num_of_valid_entries + count);
13092129
1310
- kbase_mmu_sync_pgd(kbdev,
1311
- kbase_dma_addr(p) + (index * sizeof(u64)),
1312
- count * sizeof(u64));
2130
+ dirty_pgds |= 1ULL << (newly_created_pgd ? insert_level : MIDGARD_MMU_BOTTOMLEVEL);
13132131
1314
- kunmap(p);
1315
- /* We have started modifying the page table.
1316
- * If further pages need inserting and fail we need to undo what
1317
- * has already taken place
2132
+ /* MMU cache flush operation here will depend on whether bottom level
2133
+ * PGD is newly created or not.
2134
+ *
2135
+ * If bottom level PGD is newly created then no GPU cache maintenance is
2136
+ * required as the PGD will not exist in GPU cache. Otherwise GPU cache
2137
+ * maintenance is required for existing PGD.
13182138 */
1319
- recover_required = true;
1320
- recover_count += count;
2139
+ flush_op = newly_created_pgd ? KBASE_MMU_OP_NONE : KBASE_MMU_OP_FLUSH_PT;
2140
+
2141
+ kbase_mmu_sync_pgd(kbdev, kctx, pgd + (vindex * sizeof(u64)),
2142
+ kbase_dma_addr(p) + (vindex * sizeof(u64)), count * sizeof(u64),
2143
+ flush_op);
2144
+
2145
+ if (newly_created_pgd) {
2146
+ err = update_parent_pgds(kbdev, mmut, cur_level, insert_level, insert_vpfn,
2147
+ new_pgds);
2148
+ if (err) {
2149
+ dev_err(kbdev->dev, "%s: update_parent_pgds() failed (%d)",
2150
+ __func__, err);
2151
+
2152
+ kbdev->mmu_mode->entries_invalidate(&pgd_page[vindex], count);
2153
+
2154
+ kunmap(p);
2155
+ goto fail_unlock_free_pgds;
2156
+ }
2157
+ }
2158
+
2159
+ insert_vpfn += count;
2160
+ remain -= count;
2161
+ kunmap(p);
13212162 }
1322
- mutex_unlock(&kctx->mmu.mmu_lock);
1323
- kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false);
2163
+
2164
+ mutex_unlock(&mmut->mmu_lock);
2165
+
2166
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, start_vpfn, nr, dirty_pgds, mmu_sync_info,
2167
+ false);
2168
+
13242169 return 0;
13252170
2171
+fail_unlock_free_pgds:
2172
+ /* Free the pgds allocated by us from insert_level+1 to bottom level */
2173
+ for (l = cur_level; l > insert_level; l--)
2174
+ kbase_mmu_free_pgd(kbdev, mmut, new_pgds[l]);
2175
+
13262176 fail_unlock:
1327
- mutex_unlock(&kctx->mmu.mmu_lock);
1328
- kbase_mmu_flush_invalidate(kctx, start_vpfn, nr, false);
2177
+ if (insert_vpfn != start_vpfn) {
2178
+ /* Invalidate the pages we have partially completed */
2179
+ mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, insert_vpfn, &dirty_pgds,
2180
+ NULL, true);
2181
+ }
2182
+
2183
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, start_vpfn, nr, dirty_pgds, mmu_sync_info,
2184
+ true);
2185
+ kbase_mmu_free_pgds_list(kbdev, mmut);
2186
+ mutex_unlock(&mmut->mmu_lock);
2187
+
13292188 return err;
13302189 }
13312190
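The mapping loop in kbase_mmu_insert_single_page() (and the array variant further below) advances in chunks that never straddle a bottom-level PGD: each pass covers at most KBASE_MMU_PAGE_ENTRIES minus the starting index within the current PGD. A standalone illustration of how a request gets split; the start VPFN and page count are example values only.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define KBASE_MMU_PAGE_ENTRIES 512

int main(void)
{
	uint64_t insert_vpfn = 510;	/* example start, 2 entries before a PGD boundary */
	size_t remain = 1030;		/* example number of 4kB pages to map */

	while (remain) {
		unsigned int vindex = insert_vpfn & 0x1FF;
		unsigned int count = KBASE_MMU_PAGE_ENTRIES - vindex;

		if (count > remain)
			count = remain;

		printf("map %u pages starting at vpfn 0x%llx (index %u)\n", count,
		       (unsigned long long)insert_vpfn, vindex);

		insert_vpfn += count;
		remain -= count;
	}

	return 0;
}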
1332
-static inline void cleanup_empty_pte(struct kbase_device *kbdev,
1333
- struct kbase_mmu_table *mmut, u64 *pte)
2191
+int kbase_mmu_insert_single_imported_page(struct kbase_context *kctx, u64 vpfn,
2192
+ struct tagged_addr phys, size_t nr, unsigned long flags,
2193
+ int const group_id,
2194
+ enum kbase_caller_mmu_sync_info mmu_sync_info)
13342195 {
1335
- phys_addr_t tmp_pgd;
1336
- struct page *tmp_p;
2196
+ /* The aliasing sink page has metadata and shall be moved to NOT_MOVABLE. */
2197
+ return kbase_mmu_insert_single_page(kctx, vpfn, phys, nr, flags, group_id, mmu_sync_info,
2198
+ false);
2199
+}
13372200
1338
- tmp_pgd = kbdev->mmu_mode->pte_to_phy_addr(*pte);
1339
- tmp_p = phys_to_page(tmp_pgd);
1340
-#ifdef CONFIG_MALI_2MB_ALLOC
1341
- kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id],
1342
-#else
1343
- kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id],
1344
-#endif
1345
- tmp_p, false);
2201
+int kbase_mmu_insert_single_aliased_page(struct kbase_context *kctx, u64 vpfn,
2202
+ struct tagged_addr phys, size_t nr, unsigned long flags,
2203
+ int const group_id,
2204
+ enum kbase_caller_mmu_sync_info mmu_sync_info)
2205
+{
2206
+ /* The aliasing sink page has metadata and shall be moved to NOT_MOVABLE. */
2207
+ return kbase_mmu_insert_single_page(kctx, vpfn, phys, nr, flags, group_id, mmu_sync_info,
2208
+ false);
2209
+}
13462210
1347
- /* If the MMU tables belong to a context then we accounted the memory
1348
- * usage to that context, so decrement here.
2211
+static void kbase_mmu_progress_migration_on_insert(struct tagged_addr phys,
2212
+ struct kbase_va_region *reg,
2213
+ struct kbase_mmu_table *mmut, const u64 vpfn)
2214
+{
2215
+ struct page *phys_page = as_page(phys);
2216
+ struct kbase_page_metadata *page_md = kbase_page_private(phys_page);
2217
+
2218
+ spin_lock(&page_md->migrate_lock);
2219
+
2220
+ /* If no GPU va region is given: the metadata provided are
2221
+ * invalid.
2222
+ *
2223
+ * If the page is already allocated and mapped: this is
2224
+ * an additional GPU mapping, probably to create a memory
2225
+ * alias, which means it is no longer possible to migrate
2226
+ * the page easily because tracking all the GPU mappings
2227
+ * would be too costly.
2228
+ *
2229
+ * In any case: the page becomes not movable. It is kept
2230
+ * alive, but attempts to migrate it will fail. The page
2231
+ * will be freed if it is still not movable when it returns
2232
+ * to a memory pool. Notice that the movable flag is not
2233
+ * cleared because that would require taking the page lock.
13492234 */
1350
- if (mmut->kctx) {
1351
- kbase_process_page_usage_dec(mmut->kctx, 1);
1352
- atomic_sub(1, &mmut->kctx->used_pages);
2235
+ if (!reg || PAGE_STATUS_GET(page_md->status) == (u8)ALLOCATED_MAPPED) {
2236
+ page_md->status = PAGE_STATUS_SET(page_md->status, (u8)NOT_MOVABLE);
2237
+ } else if (PAGE_STATUS_GET(page_md->status) == (u8)ALLOCATE_IN_PROGRESS) {
2238
+ page_md->status = PAGE_STATUS_SET(page_md->status, (u8)ALLOCATED_MAPPED);
2239
+ page_md->data.mapped.reg = reg;
2240
+ page_md->data.mapped.mmut = mmut;
2241
+ page_md->data.mapped.vpfn = vpfn;
13532242 }
1354
- atomic_sub(1, &kbdev->memdev.used_pages);
13552243
1356
- kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1);
2244
+ spin_unlock(&page_md->migrate_lock);
2245
+}
2246
+
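The helper above encodes a small state decision: with no VA region, or when the page is already ALLOCATED_MAPPED (i.e. this is a second GPU mapping such as an alias), the page is pinned as NOT_MOVABLE; only a page still in ALLOCATE_IN_PROGRESS becomes ALLOCATED_MAPPED with its mapping recorded. A compact standalone model of that decision; the status names mirror the ones used above, while the numeric enum values and everything else are illustrative.

#include <stdio.h>
#include <stdbool.h>

enum page_status { ALLOCATE_IN_PROGRESS, ALLOCATED_MAPPED, NOT_MOVABLE };

static enum page_status status_on_insert(enum page_status cur, bool have_reg)
{
	if (!have_reg || cur == ALLOCATED_MAPPED)
		return NOT_MOVABLE;		/* alias or unknown region: pin the page */
	if (cur == ALLOCATE_IN_PROGRESS)
		return ALLOCATED_MAPPED;	/* first mapping: page stays migratable */
	return cur;				/* e.g. already NOT_MOVABLE */
}

int main(void)
{
	printf("%d\n", status_on_insert(ALLOCATE_IN_PROGRESS, true));	/* 1: ALLOCATED_MAPPED */
	printf("%d\n", status_on_insert(ALLOCATED_MAPPED, true));	/* 2: NOT_MOVABLE */
	printf("%d\n", status_on_insert(ALLOCATE_IN_PROGRESS, false));	/* 2: NOT_MOVABLE */
	return 0;
}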
2247
+static void kbase_mmu_progress_migration_on_teardown(struct kbase_device *kbdev,
2248
+ struct tagged_addr *phys, size_t requested_nr)
2249
+{
2250
+ size_t i;
2251
+
2252
+ for (i = 0; i < requested_nr; i++) {
2253
+ struct page *phys_page = as_page(phys[i]);
2254
+ struct kbase_page_metadata *page_md = kbase_page_private(phys_page);
2255
+
2256
+ /* Skip the 4KB page that is part of a large page, as the large page is
2257
+ * excluded from the migration process.
2258
+ */
2259
+ if (is_huge(phys[i]) || is_partial(phys[i]))
2260
+ continue;
2261
+
2262
+ if (page_md) {
2263
+ u8 status;
2264
+
2265
+ spin_lock(&page_md->migrate_lock);
2266
+ status = PAGE_STATUS_GET(page_md->status);
2267
+
2268
+ if (status == ALLOCATED_MAPPED) {
2269
+ if (IS_PAGE_ISOLATED(page_md->status)) {
2270
+ page_md->status = PAGE_STATUS_SET(
2271
+ page_md->status, (u8)FREE_ISOLATED_IN_PROGRESS);
2272
+ page_md->data.free_isolated.kbdev = kbdev;
2273
+ /* At this point, we still have a reference
2274
+ * to the page via its page migration metadata,
2275
+ * and any page with the FREE_ISOLATED_IN_PROGRESS
2276
+ * status will subsequently be freed in either
2277
+ * kbase_page_migrate() or kbase_page_putback()
2278
+ */
2279
+ phys[i] = as_tagged(0);
2280
+ } else
2281
+ page_md->status = PAGE_STATUS_SET(page_md->status,
2282
+ (u8)FREE_IN_PROGRESS);
2283
+ }
2284
+
2285
+ spin_unlock(&page_md->migrate_lock);
2286
+ }
2287
+ }
13572288 }
13582289
13592290 u64 kbase_mmu_create_ate(struct kbase_device *const kbdev,
....@@ -1367,12 +2298,10 @@
13672298 group_id, level, entry);
13682299 }
13692300
1370
-int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev,
1371
- struct kbase_mmu_table *mmut,
1372
- const u64 start_vpfn,
1373
- struct tagged_addr *phys, size_t nr,
1374
- unsigned long flags,
1375
- int const group_id)
2301
+int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
2302
+ const u64 start_vpfn, struct tagged_addr *phys, size_t nr,
2303
+ unsigned long flags, int const group_id, u64 *dirty_pgds,
2304
+ struct kbase_va_region *reg, bool ignore_page_migration)
13762305 {
13772306 phys_addr_t pgd;
13782307 u64 *pgd_page;
....@@ -1380,6 +2309,9 @@
13802309 size_t remain = nr;
13812310 int err;
13822311 struct kbase_mmu_mode const *mmu_mode;
2312
+ unsigned int i;
2313
+ phys_addr_t new_pgds[MIDGARD_MMU_BOTTOMLEVEL + 1];
2314
+ int l, cur_level, insert_level;
13832315
13842316 /* Note that 0 is a valid start_vpfn */
13852317 /* 64-bit address range is the max */
....@@ -1394,11 +2326,12 @@
13942326 mutex_lock(&mmut->mmu_lock);
13952327
13962328 while (remain) {
1397
- unsigned int i;
13982329 unsigned int vindex = insert_vpfn & 0x1FF;
13992330 unsigned int count = KBASE_MMU_PAGE_ENTRIES - vindex;
14002331 struct page *p;
1401
- int cur_level;
2332
+ register unsigned int num_of_valid_entries;
2333
+ bool newly_created_pgd = false;
2334
+ enum kbase_mmu_op_type flush_op;
14022335
14032336 if (count > remain)
14042337 count = remain;
....@@ -1408,69 +2341,64 @@
14082341 else
14092342 cur_level = MIDGARD_MMU_BOTTOMLEVEL;
14102343
2344
+ insert_level = cur_level;
2345
+
14112346 /*
1412
- * Repeatedly calling mmu_get_pgd_at_level() is clearly
2347
+ * Repeatedly calling mmu_get_lowest_valid_pgd() is clearly
14132348 * suboptimal. We don't have to re-parse the whole tree
14142349 * each time (just cache the l0-l2 sequence).
14152350 * On the other hand, it's only a gain when we map more than
14162351 * 256 pages at once (on average). Do we really care?
14172352 */
1418
- do {
1419
- err = mmu_get_pgd_at_level(kbdev, mmut, insert_vpfn,
1420
- cur_level, &pgd);
1421
- if (err != -ENOMEM)
1422
- break;
1423
- /* Fill the memory pool with enough pages for
1424
- * the page walk to succeed
1425
- */
1426
- mutex_unlock(&mmut->mmu_lock);
1427
- err = kbase_mem_pool_grow(
1428
-#ifdef CONFIG_MALI_2MB_ALLOC
1429
- &kbdev->mem_pools.large[mmut->group_id],
1430
-#else
1431
- &kbdev->mem_pools.small[mmut->group_id],
1432
-#endif
1433
- cur_level);
1434
- mutex_lock(&mmut->mmu_lock);
1435
- } while (!err);
2353
+ /* insert_level < cur_level if there's no valid PGD for cur_level and insert_vpfn */
2354
+ err = mmu_get_lowest_valid_pgd(kbdev, mmut, insert_vpfn, cur_level, &insert_level,
2355
+ &pgd);
14362356
14372357 if (err) {
1438
- dev_warn(kbdev->dev,
1439
- "%s: mmu_get_bottom_pgd failure\n", __func__);
1440
- if (insert_vpfn != start_vpfn) {
1441
- /* Invalidate the pages we have partially
1442
- * completed
1443
- */
1444
- mmu_insert_pages_failure_recovery(kbdev,
1445
- mmut, start_vpfn, insert_vpfn);
1446
- }
2358
+ dev_err(kbdev->dev, "%s: mmu_get_lowest_valid_pgd() returned error %d",
2359
+ __func__, err);
14472360 goto fail_unlock;
2361
+ }
2362
+
2363
+ /* No valid pgd at cur_level */
2364
+ if (insert_level != cur_level) {
2365
+ /* Allocate new pgds for all missing levels from the required level
2366
+ * down to the lowest valid pgd at insert_level
2367
+ */
2368
+ err = mmu_insert_alloc_pgds(kbdev, mmut, new_pgds, (insert_level + 1),
2369
+ cur_level);
2370
+ if (err)
2371
+ goto fail_unlock;
2372
+
2373
+ newly_created_pgd = true;
2374
+
2375
+ new_pgds[insert_level] = pgd;
2376
+
2377
+ /* If we didn't find an existing valid pgd at cur_level,
2378
+ * we've now allocated one. The ATE in the next step should
2379
+ * be inserted in this newly allocated pgd.
2380
+ */
2381
+ pgd = new_pgds[cur_level];
14482382 }
14492383
14502384 p = pfn_to_page(PFN_DOWN(pgd));
14512385 pgd_page = kmap(p);
14522386 if (!pgd_page) {
1453
- dev_warn(kbdev->dev, "%s: kmap failure\n",
1454
- __func__);
1455
- if (insert_vpfn != start_vpfn) {
1456
- /* Invalidate the pages we have partially
1457
- * completed
1458
- */
1459
- mmu_insert_pages_failure_recovery(kbdev,
1460
- mmut, start_vpfn, insert_vpfn);
1461
- }
2387
+ dev_err(kbdev->dev, "%s: kmap failure", __func__);
14622388 err = -ENOMEM;
1463
- goto fail_unlock;
2389
+
2390
+ goto fail_unlock_free_pgds;
14642391 }
2392
+
2393
+ num_of_valid_entries =
2394
+ mmu_mode->get_num_valid_entries(pgd_page);
14652395
14662396 if (cur_level == MIDGARD_MMU_LEVEL(2)) {
14672397 int level_index = (insert_vpfn >> 9) & 0x1FF;
1468
- u64 *target = &pgd_page[level_index];
2398
+ pgd_page[level_index] =
2399
+ kbase_mmu_create_ate(kbdev, *phys, flags, cur_level, group_id);
14692400
1470
- if (mmu_mode->pte_is_valid(*target, cur_level))
1471
- cleanup_empty_pte(kbdev, mmut, target);
1472
- *target = kbase_mmu_create_ate(kbdev, *phys, flags,
1473
- cur_level, group_id);
2401
+ num_of_valid_entries++;
14742402 } else {
14752403 for (i = 0; i < count; i++) {
14762404 unsigned int ofs = vindex + i;
....@@ -1487,24 +2415,77 @@
14872415
14882416 *target = kbase_mmu_create_ate(kbdev,
14892417 phys[i], flags, cur_level, group_id);
2418
+
2419
+ /* If page migration is enabled, this is the right time
2420
+ * to update the status of the page.
2421
+ */
2422
+ if (kbase_page_migration_enabled && !ignore_page_migration &&
2423
+ !is_huge(phys[i]) && !is_partial(phys[i]))
2424
+ kbase_mmu_progress_migration_on_insert(phys[i], reg, mmut,
2425
+ insert_vpfn + i);
2426
+ }
2427
+ num_of_valid_entries += count;
2428
+ }
2429
+
2430
+ mmu_mode->set_num_valid_entries(pgd_page, num_of_valid_entries);
2431
+
2432
+ if (dirty_pgds)
2433
+ *dirty_pgds |= 1ULL << (newly_created_pgd ? insert_level : cur_level);
2434
+
2435
+ /* MMU cache flush operation here will depend on whether bottom level
2436
+ * PGD is newly created or not.
2437
+ *
2438
+ * If bottom level PGD is newly created then no GPU cache maintenance is
2439
+ * required as the PGD will not exist in GPU cache. Otherwise GPU cache
2440
+ * maintenance is required for existing PGD.
2441
+ */
2442
+ flush_op = newly_created_pgd ? KBASE_MMU_OP_NONE : KBASE_MMU_OP_FLUSH_PT;
2443
+
2444
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (vindex * sizeof(u64)),
2445
+ kbase_dma_addr(p) + (vindex * sizeof(u64)), count * sizeof(u64),
2446
+ flush_op);
2447
+
2448
+ if (newly_created_pgd) {
2449
+ err = update_parent_pgds(kbdev, mmut, cur_level, insert_level, insert_vpfn,
2450
+ new_pgds);
2451
+ if (err) {
2452
+ dev_err(kbdev->dev, "%s: update_parent_pgds() failed (%d)",
2453
+ __func__, err);
2454
+
2455
+ kbdev->mmu_mode->entries_invalidate(&pgd_page[vindex], count);
2456
+
2457
+ kunmap(p);
2458
+ goto fail_unlock_free_pgds;
14902459 }
14912460 }
14922461
14932462 phys += count;
14942463 insert_vpfn += count;
14952464 remain -= count;
1496
-
1497
- kbase_mmu_sync_pgd(kbdev,
1498
- kbase_dma_addr(p) + (vindex * sizeof(u64)),
1499
- count * sizeof(u64));
1500
-
15012465 kunmap(p);
15022466 }
15032467
1504
- err = 0;
2468
+ mutex_unlock(&mmut->mmu_lock);
2469
+
2470
+ return 0;
2471
+
2472
+fail_unlock_free_pgds:
2473
+ /* Free the pgds allocated by us from insert_level+1 to bottom level */
2474
+ for (l = cur_level; l > insert_level; l--)
2475
+ kbase_mmu_free_pgd(kbdev, mmut, new_pgds[l]);
15052476
15062477 fail_unlock:
2478
+ if (insert_vpfn != start_vpfn) {
2479
+ /* Invalidate the pages we have partially completed */
2480
+ mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, insert_vpfn, dirty_pgds,
2481
+ phys, ignore_page_migration);
2482
+ }
2483
+
2484
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, start_vpfn, nr,
2485
+ dirty_pgds ? *dirty_pgds : 0xF, CALLER_MMU_ASYNC, true);
2486
+ kbase_mmu_free_pgds_list(kbdev, mmut);
15072487 mutex_unlock(&mmut->mmu_lock);
2488
+
15082489 return err;
15092490 }
15102491
....@@ -1512,167 +2493,80 @@
15122493 * Map 'nr' pages pointed to by 'phys' at GPU PFN 'vpfn' for GPU address space
15132494 * number 'as_nr'.
15142495 */
1515
-int kbase_mmu_insert_pages(struct kbase_device *kbdev,
1516
- struct kbase_mmu_table *mmut, u64 vpfn,
1517
- struct tagged_addr *phys, size_t nr,
1518
- unsigned long flags, int as_nr, int const group_id)
2496
+int kbase_mmu_insert_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn,
2497
+ struct tagged_addr *phys, size_t nr, unsigned long flags, int as_nr,
2498
+ int const group_id, enum kbase_caller_mmu_sync_info mmu_sync_info,
2499
+ struct kbase_va_region *reg, bool ignore_page_migration)
15192500 {
15202501 int err;
2502
+ u64 dirty_pgds = 0;
15212503
1522
- err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn,
1523
- phys, nr, flags, group_id);
2504
+ /* Early out if there is nothing to do */
2505
+ if (nr == 0)
2506
+ return 0;
15242507
1525
- if (mmut->kctx)
1526
- kbase_mmu_flush_invalidate(mmut->kctx, vpfn, nr, false);
1527
- else
1528
- kbase_mmu_flush_invalidate_no_ctx(kbdev, vpfn, nr, false,
1529
- as_nr);
2508
+ err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id,
2509
+ &dirty_pgds, reg, ignore_page_migration);
2510
+ if (err)
2511
+ return err;
15302512
1531
- return err;
2513
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, vpfn, nr, dirty_pgds, mmu_sync_info, false);
2514
+
2515
+ return 0;
15322516 }
15332517
15342518 KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages);
15352519
1536
-/**
1537
- * kbase_mmu_flush_invalidate_noretain() - Flush and invalidate the GPU caches
1538
- * without retaining the kbase context.
1539
- * @kctx: The KBase context.
1540
- * @vpfn: The virtual page frame number to start the flush on.
1541
- * @nr: The number of pages to flush.
1542
- * @sync: Set if the operation should be synchronous or not.
1543
- *
1544
- * As per kbase_mmu_flush_invalidate but doesn't retain the kctx or do any
1545
- * other locking.
1546
- */
1547
-static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx,
1548
- u64 vpfn, size_t nr, bool sync)
2520
+int kbase_mmu_insert_imported_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
2521
+ u64 vpfn, struct tagged_addr *phys, size_t nr,
2522
+ unsigned long flags, int as_nr, int const group_id,
2523
+ enum kbase_caller_mmu_sync_info mmu_sync_info,
2524
+ struct kbase_va_region *reg)
15492525 {
1550
- struct kbase_device *kbdev = kctx->kbdev;
15512526 int err;
1552
- u32 op;
2527
+ u64 dirty_pgds = 0;
15532528
15542529 /* Early out if there is nothing to do */
15552530 if (nr == 0)
1556
- return;
2531
+ return 0;
15572532
1558
- if (sync)
1559
- op = AS_COMMAND_FLUSH_MEM;
1560
- else
1561
- op = AS_COMMAND_FLUSH_PT;
1562
-
1563
- err = kbase_mmu_hw_do_operation(kbdev,
1564
- &kbdev->as[kctx->as_nr],
1565
- vpfn, nr, op, 0);
1566
- if (err) {
1567
- /* Flush failed to complete, assume the
1568
- * GPU has hung and perform a reset to recover
1569
- */
1570
- dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n");
1571
-
1572
- if (kbase_prepare_to_reset_gpu_locked(kbdev, RESET_FLAGS_NONE))
1573
- kbase_reset_gpu_locked(kbdev);
1574
- }
1575
-}
1576
-
1577
-/* Perform a flush/invalidate on a particular address space
1578
- */
1579
-static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev,
1580
- struct kbase_as *as,
1581
- u64 vpfn, size_t nr, bool sync)
1582
-{
1583
- int err;
1584
- u32 op;
1585
- bool gpu_powered;
1586
- unsigned long flags;
1587
-
1588
- spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
1589
- gpu_powered = kbdev->pm.backend.gpu_powered;
1590
- spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
1591
-
1592
- /* GPU is off so there's no need to perform flush/invalidate.
1593
- * But even if GPU is not actually powered down, after gpu_powered flag
1594
- * was set to false, it is still safe to skip the flush/invalidate.
1595
- * The TLB invalidation will anyways be performed due to AS_COMMAND_UPDATE
1596
- * which is sent when address spaces are restored after gpu_powered flag
1597
- * is set to true. Flushing of L2 cache is certainly not required as L2
1598
- * cache is definitely off if gpu_powered is false.
2533
+ /* Imported allocations don't have metadata and therefore always ignore the
2534
+ * page migration logic.
15992535 */
1600
- if (!gpu_powered)
1601
- return;
2536
+ err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id,
2537
+ &dirty_pgds, reg, true);
2538
+ if (err)
2539
+ return err;
16022540
1603
- if (kbase_pm_context_active_handle_suspend(kbdev,
1604
- KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) {
1605
- /* GPU has just been powered off due to system suspend.
1606
- * So again, no need to perform flush/invalidate.
1607
- */
1608
- return;
1609
- }
2541
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, vpfn, nr, dirty_pgds, mmu_sync_info, false);
16102542
1611
- /* AS transaction begin */
1612
- mutex_lock(&kbdev->mmu_hw_mutex);
1613
-
1614
- if (sync)
1615
- op = AS_COMMAND_FLUSH_MEM;
1616
- else
1617
- op = AS_COMMAND_FLUSH_PT;
1618
-
1619
- err = kbase_mmu_hw_do_operation(kbdev,
1620
- as, vpfn, nr, op, 0);
1621
-
1622
- if (err) {
1623
- /* Flush failed to complete, assume the GPU has hung and
1624
- * perform a reset to recover
1625
- */
1626
- dev_err(kbdev->dev, "Flush for GPU page table update did not complete. Issuing GPU soft-reset to recover\n");
1627
-
1628
- if (kbase_prepare_to_reset_gpu(
1629
- kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR))
1630
- kbase_reset_gpu(kbdev);
1631
- }
1632
-
1633
- mutex_unlock(&kbdev->mmu_hw_mutex);
1634
- /* AS transaction end */
1635
-
1636
- kbase_pm_context_idle(kbdev);
2543
+ return 0;
16372544 }
16382545
1639
-static void kbase_mmu_flush_invalidate_no_ctx(struct kbase_device *kbdev,
1640
- u64 vpfn, size_t nr, bool sync, int as_nr)
2546
+int kbase_mmu_insert_aliased_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
2547
+ u64 vpfn, struct tagged_addr *phys, size_t nr,
2548
+ unsigned long flags, int as_nr, int const group_id,
2549
+ enum kbase_caller_mmu_sync_info mmu_sync_info,
2550
+ struct kbase_va_region *reg)
16412551 {
1642
- /* Skip if there is nothing to do */
1643
- if (nr) {
1644
- kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[as_nr], vpfn,
1645
- nr, sync);
1646
- }
1647
-}
1648
-
1649
-static void kbase_mmu_flush_invalidate(struct kbase_context *kctx,
1650
- u64 vpfn, size_t nr, bool sync)
1651
-{
1652
- struct kbase_device *kbdev;
1653
- bool ctx_is_in_runpool;
2552
+ int err;
2553
+ u64 dirty_pgds = 0;
16542554
16552555 /* Early out if there is nothing to do */
16562556 if (nr == 0)
1657
- return;
2557
+ return 0;
16582558
1659
- kbdev = kctx->kbdev;
1660
-#if !MALI_USE_CSF
1661
- mutex_lock(&kbdev->js_data.queue_mutex);
1662
- ctx_is_in_runpool = kbase_ctx_sched_inc_refcount(kctx);
1663
- mutex_unlock(&kbdev->js_data.queue_mutex);
1664
-#else
1665
- ctx_is_in_runpool = kbase_ctx_sched_inc_refcount_if_as_valid(kctx);
1666
-#endif /* !MALI_USE_CSF */
2559
+ /* Memory aliases are always built on top of existing allocations,
2560
+ * therefore the state of physical pages shall be updated.
2561
+ */
2562
+ err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id,
2563
+ &dirty_pgds, reg, false);
2564
+ if (err)
2565
+ return err;
16672566
1668
- if (ctx_is_in_runpool) {
1669
- KBASE_DEBUG_ASSERT(kctx->as_nr != KBASEP_AS_NR_INVALID);
2567
+ mmu_flush_invalidate_insert_pages(kbdev, mmut, vpfn, nr, dirty_pgds, mmu_sync_info, false);
16702568
1671
- kbase_mmu_flush_invalidate_as(kbdev, &kbdev->as[kctx->as_nr],
1672
- vpfn, nr, sync);
1673
-
1674
- release_ctx(kbdev, kctx);
1675
- }
2569
+ return 0;
16762570 }
16772571
16782572 void kbase_mmu_update(struct kbase_device *kbdev,
....@@ -1697,6 +2591,14 @@
16972591
16982592 void kbase_mmu_disable(struct kbase_context *kctx)
16992593 {
2594
+ /* Calls to this function are inherently asynchronous, with respect to
2595
+ * MMU operations.
2596
+ */
2597
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
2598
+ struct kbase_device *kbdev = kctx->kbdev;
2599
+ struct kbase_mmu_hw_op_param op_param = { 0 };
2600
+ int lock_err, flush_err;
2601
+
17002602 /* ASSERT that the context has a valid as_nr, which is only the case
17012603 * when it's scheduled in.
17022604 *
....@@ -1707,69 +2609,201 @@
17072609 lockdep_assert_held(&kctx->kbdev->hwaccess_lock);
17082610 lockdep_assert_held(&kctx->kbdev->mmu_hw_mutex);
17092611
1710
- /*
1711
- * The address space is being disabled, drain all knowledge of it out
1712
- * from the caches as pages and page tables might be freed after this.
1713
- *
1714
- * The job scheduler code will already be holding the locks and context
1715
- * so just do the flush.
1716
- */
1717
- kbase_mmu_flush_invalidate_noretain(kctx, 0, ~0, true);
2612
+ op_param.vpfn = 0;
2613
+ op_param.nr = ~0;
2614
+ op_param.op = KBASE_MMU_OP_FLUSH_MEM;
2615
+ op_param.kctx_id = kctx->id;
2616
+ op_param.mmu_sync_info = mmu_sync_info;
17182617
1719
- kctx->kbdev->mmu_mode->disable_as(kctx->kbdev, kctx->as_nr);
2618
+#if MALI_USE_CSF
2619
+ /* 0xF value used to prevent skipping of any levels when flushing */
2620
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev))
2621
+ op_param.flush_skip_levels = pgd_level_to_skip_flush(0xF);
2622
+#endif
2623
+
2624
+ /* lock MMU to prevent existing jobs on GPU from executing while the AS is
2625
+ * not yet disabled
2626
+ */
2627
+ lock_err = kbase_mmu_hw_do_lock(kbdev, &kbdev->as[kctx->as_nr], &op_param);
2628
+ if (lock_err)
2629
+ dev_err(kbdev->dev, "Failed to lock AS %d for ctx %d_%d", kctx->as_nr, kctx->tgid,
2630
+ kctx->id);
2631
+
2632
+ /* Issue the flush command only when L2 cache is in stable power on state.
2633
+ * Any other state for L2 cache implies that shader cores are powered off,
2634
+ * which in turn implies there is no execution happening on the GPU.
2635
+ */
2636
+ if (kbdev->pm.backend.l2_state == KBASE_L2_ON) {
2637
+ flush_err = kbase_gpu_cache_flush_and_busy_wait(kbdev,
2638
+ GPU_COMMAND_CACHE_CLN_INV_L2_LSC);
2639
+ if (flush_err)
2640
+ dev_err(kbdev->dev,
2641
+ "Failed to flush GPU cache when disabling AS %d for ctx %d_%d",
2642
+ kctx->as_nr, kctx->tgid, kctx->id);
2643
+ }
2644
+ kbdev->mmu_mode->disable_as(kbdev, kctx->as_nr);
2645
+
2646
+ if (!lock_err) {
2647
+ /* unlock the MMU to allow it to resume */
2648
+ lock_err =
2649
+ kbase_mmu_hw_do_unlock_no_addr(kbdev, &kbdev->as[kctx->as_nr], &op_param);
2650
+ if (lock_err)
2651
+ dev_err(kbdev->dev, "Failed to unlock AS %d for ctx %d_%d", kctx->as_nr,
2652
+ kctx->tgid, kctx->id);
2653
+ }
2654
+
2655
+#if !MALI_USE_CSF
2656
+ /*
2657
+ * JM GPUs have some L1 read only caches that need to be invalidated
2658
+ * with START_FLUSH configuration. Purge the MMU disabled kctx from
2659
+ * the slot_rb tracking field so such invalidation is performed when
2660
+ * a new katom is executed on the affected slots.
2661
+ */
2662
+ kbase_backend_slot_kctx_purge_locked(kbdev, kctx);
2663
+#endif
17202664 }
17212665 KBASE_EXPORT_TEST_API(kbase_mmu_disable);
17222666
1723
-/*
1724
- * We actually only discard the ATE, and not the page table
1725
- * pages. There is a potential DoS here, as we'll leak memory by
1726
- * having PTEs that are potentially unused. Will require physical
1727
- * page accounting, so MMU pages are part of the process allocation.
1728
- *
1729
- * IMPORTANT: This uses kbasep_js_runpool_release_ctx() when the context is
1730
- * currently scheduled into the runpool, and so potentially uses a lot of locks.
1731
- * These locks must be taken in the correct order with respect to others
1732
- * already held by the caller. Refer to kbasep_js_runpool_release_ctx() for more
1733
- * information.
1734
- */
1735
-int kbase_mmu_teardown_pages(struct kbase_device *kbdev,
1736
- struct kbase_mmu_table *mmut, u64 vpfn, size_t nr, int as_nr)
2667
+static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev,
2668
+ struct kbase_mmu_table *mmut, phys_addr_t *pgds,
2669
+ u64 vpfn, int level,
2670
+ enum kbase_mmu_op_type flush_op, u64 *dirty_pgds)
17372671 {
1738
- phys_addr_t pgd;
1739
- u64 start_vpfn = vpfn;
1740
- size_t requested_nr = nr;
1741
- struct kbase_mmu_mode const *mmu_mode;
1742
- int err = -EFAULT;
2672
+ int current_level;
17432673
1744
- if (nr == 0) {
1745
- /* early out if nothing to do */
1746
- return 0;
2674
+ lockdep_assert_held(&mmut->mmu_lock);
2675
+
2676
+ for (current_level = level - 1; current_level >= MIDGARD_MMU_LEVEL(0);
2677
+ current_level--) {
2678
+ phys_addr_t current_pgd = pgds[current_level];
2679
+ struct page *p = phys_to_page(current_pgd);
2680
+ u64 *current_page = kmap(p);
2681
+ unsigned int current_valid_entries =
2682
+ kbdev->mmu_mode->get_num_valid_entries(current_page);
2683
+ int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF;
2684
+
2685
+ /* We need to track every level that needs updating */
2686
+ if (dirty_pgds)
2687
+ *dirty_pgds |= 1ULL << current_level;
2688
+
2689
+ kbdev->mmu_mode->entries_invalidate(&current_page[index], 1);
2690
+ if (current_valid_entries == 1 &&
2691
+ current_level != MIDGARD_MMU_LEVEL(0)) {
2692
+ kunmap(p);
2693
+
2694
+ /* Ensure the cacheline containing the last valid entry
2695
+ * of PGD is invalidated from the GPU cache, before the
2696
+ * PGD page is freed.
2697
+ */
2698
+ kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx,
2699
+ current_pgd + (index * sizeof(u64)),
2700
+ sizeof(u64), flush_op);
2701
+
2702
+ kbase_mmu_add_to_free_pgds_list(mmut, p);
2703
+ } else {
2704
+ current_valid_entries--;
2705
+
2706
+ kbdev->mmu_mode->set_num_valid_entries(
2707
+ current_page, current_valid_entries);
2708
+
2709
+ kunmap(p);
2710
+
2711
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, current_pgd + (index * sizeof(u64)),
2712
+ kbase_dma_addr(p) + (index * sizeof(u64)), sizeof(u64),
2713
+ flush_op);
2714
+ break;
2715
+ }
17472716 }
2717
+}
17482718
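kbase_mmu_update_and_free_parent_pgds() walks upwards from the parent of the freed PGD: at each level it clears the entry that pointed at the child, and if that entry was the last valid one (and the PGD is not the top level) the PGD itself is queued for freeing and the walk continues, otherwise the valid-entry count is decremented and the walk stops. A compact standalone model of that cascade, with made-up entry counts.

#include <stdio.h>

#define TOP_LEVEL 0

int main(void)
{
	/* Made-up valid-entry counts for the PGDs on the walk path, index = level;
	 * the level-3 PGD has just been emptied and freed by the caller.
	 */
	unsigned int valid_entries[4] = { 3, 1, 1, 0 };
	int level = 3;
	int current_level;

	for (current_level = level - 1; current_level >= TOP_LEVEL; current_level--) {
		/* Clear the entry that pointed at the freed child PGD */
		if (valid_entries[current_level] == 1 && current_level != TOP_LEVEL) {
			valid_entries[current_level] = 0;
			printf("level %d PGD now empty -> freed, keep walking up\n",
			       current_level);
		} else {
			valid_entries[current_level]--;
			printf("level %d PGD keeps %u valid entries -> stop\n",
			       current_level, valid_entries[current_level]);
			break;
		}
	}

	return 0;
}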
1749
- mutex_lock(&mmut->mmu_lock);
2719
+/**
2720
+ * mmu_flush_invalidate_teardown_pages() - Perform flush operation after unmapping pages.
2721
+ *
2722
+ * @kbdev: Pointer to kbase device.
2723
+ * @kctx: Pointer to kbase context.
2724
+ * @as_nr: Address space number, for GPU cache maintenance operations
2725
+ * that happen outside a specific kbase context.
2726
+ * @phys: Array of physical pages to flush.
2727
+ * @phys_page_nr: Number of physical pages to flush.
2728
+ * @op_param: Non-NULL pointer to struct containing information about the flush
2729
+ * operation to perform.
2730
+ *
2731
+ * This function will do one of three things:
2732
+ * 1. Invalidate the MMU caches, followed by a partial GPU cache flush of the
2733
+ * individual pages that were unmapped if feature is supported on GPU.
2734
+ * 2. Perform a full GPU cache flush through the GPU_CONTROL interface if feature is
2735
+ * supported on the GPU, or
2736
+ * 3. Perform a full GPU cache flush through the MMU_CONTROL interface.
2737
+ *
2738
+ * When performing a partial GPU cache flush, the number of physical
2739
+ * pages does not have to be identical to the number of virtual pages on the MMU,
2740
+ * to support a single physical address flush for an aliased page.
2741
+ */
2742
+static void mmu_flush_invalidate_teardown_pages(struct kbase_device *kbdev,
2743
+ struct kbase_context *kctx, int as_nr,
2744
+ struct tagged_addr *phys, size_t phys_page_nr,
2745
+ struct kbase_mmu_hw_op_param *op_param)
2746
+{
2747
+ if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) {
2748
+ /* Full cache flush through the MMU_COMMAND */
2749
+ mmu_flush_invalidate(kbdev, kctx, as_nr, op_param);
2750
+ } else if (op_param->op == KBASE_MMU_OP_FLUSH_MEM) {
2751
+ /* Full cache flush through the GPU_CONTROL */
2752
+ mmu_flush_invalidate_on_gpu_ctrl(kbdev, kctx, as_nr, op_param);
2753
+ }
2754
+#if MALI_USE_CSF
2755
+ else {
2756
+ /* Partial GPU cache flush with MMU cache invalidation */
2757
+ unsigned long irq_flags;
2758
+ unsigned int i;
2759
+ bool flush_done = false;
17502760
1751
- mmu_mode = kbdev->mmu_mode;
2761
+ mmu_invalidate(kbdev, kctx, as_nr, op_param);
2762
+
2763
+ for (i = 0; !flush_done && i < phys_page_nr; i++) {
2764
+ spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags);
2765
+ if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0))
2766
+ mmu_flush_pa_range(kbdev, as_phys_addr_t(phys[i]), PAGE_SIZE,
2767
+ KBASE_MMU_OP_FLUSH_MEM);
2768
+ else
2769
+ flush_done = true;
2770
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags);
2771
+ }
2772
+ }
2773
+#endif
2774
+}
2775
+
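Taken together with the threshold check in kbase_mmu_teardown_pages() below, the dispatch above boils down to a three-way choice, where the physical-address-range path is only compiled in for CSF GPUs. A condensed standalone decision helper that spells the logic out; the function, enum names and the threshold value are invented here purely for illustration.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

enum flush_strategy {
	FLUSH_FULL_MMU_COMMAND,		/* full flush via MMU_CONTROL */
	FLUSH_FULL_GPU_CONTROL,		/* full flush via GPU_CONTROL */
	FLUSH_PA_RANGE_GPU_CONTROL	/* MMU invalidate + per-page PA flushes (CSF only) */
};

static enum flush_strategy pick_strategy(bool gpu_ctrl_flush, bool have_phys,
					 size_t nr_phys_pages)
{
	const size_t pa_range_threshold = 20; /* illustrative "small unmap" threshold */

	if (!gpu_ctrl_flush)
		return FLUSH_FULL_MMU_COMMAND;
	if (have_phys && nr_phys_pages <= pa_range_threshold)
		return FLUSH_PA_RANGE_GPU_CONTROL;
	return FLUSH_FULL_GPU_CONTROL;
}

int main(void)
{
	printf("%d\n", pick_strategy(true, true, 8));		/* 2: PA-range flush */
	printf("%d\n", pick_strategy(true, true, 4096));	/* 1: full GPU_CONTROL flush */
	printf("%d\n", pick_strategy(false, true, 8));		/* 0: full MMU_COMMAND flush */
	return 0;
}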
2776
+static int kbase_mmu_teardown_pgd_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
2777
+ u64 vpfn, size_t nr, u64 *dirty_pgds,
2778
+ struct list_head *free_pgds_list,
2779
+ enum kbase_mmu_op_type flush_op)
2780
+{
2781
+ struct kbase_mmu_mode const *mmu_mode = kbdev->mmu_mode;
2782
+
2783
+ lockdep_assert_held(&mmut->mmu_lock);
2784
+ kbase_mmu_reset_free_pgds_list(mmut);
17522785
17532786 while (nr) {
1754
- unsigned int i;
17552787 unsigned int index = vpfn & 0x1FF;
17562788 unsigned int count = KBASE_MMU_PAGE_ENTRIES - index;
17572789 unsigned int pcount;
17582790 int level;
17592791 u64 *page;
2792
+ phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1];
2793
+ register unsigned int num_of_valid_entries;
2794
+ phys_addr_t pgd = mmut->pgd;
2795
+ struct page *p = phys_to_page(pgd);
17602796
17612797 if (count > nr)
17622798 count = nr;
17632799
1764
- /* need to check if this is a 2MB or a 4kB page */
1765
- pgd = mmut->pgd;
1766
-
2800
+ /* need to check if this is a 2MB page or a 4kB */
17672801 for (level = MIDGARD_MMU_TOPLEVEL;
17682802 level <= MIDGARD_MMU_BOTTOMLEVEL; level++) {
17692803 phys_addr_t next_pgd;
17702804
17712805 index = (vpfn >> ((3 - level) * 9)) & 0x1FF;
1772
- page = kmap(phys_to_page(pgd));
2806
+ page = kmap(p);
17732807 if (mmu_mode->ate_is_valid(page[index], level))
17742808 break; /* keep the mapping */
17752809 else if (!mmu_mode->pte_is_valid(page[index], level)) {
....@@ -1792,27 +2826,31 @@
17922826 count = nr;
17932827 goto next;
17942828 }
1795
- next_pgd = mmu_mode->pte_to_phy_addr(page[index]);
1796
- kunmap(phys_to_page(pgd));
2829
+ next_pgd = mmu_mode->pte_to_phy_addr(
2830
+ kbdev->mgm_dev->ops.mgm_pte_to_original_pte(
2831
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[index]));
2832
+ kunmap(p);
2833
+ pgds[level] = pgd;
17972834 pgd = next_pgd;
2835
+ p = phys_to_page(pgd);
17982836 }
17992837
18002838 switch (level) {
18012839 case MIDGARD_MMU_LEVEL(0):
18022840 case MIDGARD_MMU_LEVEL(1):
1803
- dev_warn(kbdev->dev,
1804
- "%s: No support for ATEs at level %d\n",
1805
- __func__, level);
1806
- kunmap(phys_to_page(pgd));
2841
+ dev_warn(kbdev->dev, "%s: No support for ATEs at level %d", __func__,
2842
+ level);
2843
+ kunmap(p);
18072844 goto out;
18082845 case MIDGARD_MMU_LEVEL(2):
18092846 /* can only teardown if count >= 512 */
18102847 if (count >= 512) {
18112848 pcount = 1;
18122849 } else {
1813
- dev_warn(kbdev->dev,
1814
- "%s: limiting teardown as it tries to do a partial 2MB teardown, need 512, but have %d to tear down\n",
1815
- __func__, count);
2850
+ dev_warn(
2851
+ kbdev->dev,
2852
+ "%s: limiting teardown as it tries to do a partial 2MB teardown, need 512, but have %d to tear down",
2853
+ __func__, count);
18162854 pcount = 0;
18172855 }
18182856 break;
....@@ -1821,72 +2859,177 @@
18212859 pcount = count;
18222860 break;
18232861 default:
1824
- dev_err(kbdev->dev,
1825
- "%s: found non-mapped memory, early out\n",
1826
- __func__);
2862
+ dev_err(kbdev->dev, "%s: found non-mapped memory, early out", __func__);
18272863 vpfn += count;
18282864 nr -= count;
18292865 continue;
18302866 }
18312867
2868
+ if (pcount > 0)
2869
+ *dirty_pgds |= 1ULL << level;
2870
+
2871
+ num_of_valid_entries = mmu_mode->get_num_valid_entries(page);
2872
+ if (WARN_ON_ONCE(num_of_valid_entries < pcount))
2873
+ num_of_valid_entries = 0;
2874
+ else
2875
+ num_of_valid_entries -= pcount;
2876
+
18322877 /* Invalidate the entries we added */
1833
- for (i = 0; i < pcount; i++)
1834
- mmu_mode->entry_invalidate(&page[index + i]);
2878
+ mmu_mode->entries_invalidate(&page[index], pcount);
18352879
1836
- kbase_mmu_sync_pgd(kbdev,
1837
- kbase_dma_addr(phys_to_page(pgd)) +
1838
- 8 * index, 8*pcount);
2880
+ if (!num_of_valid_entries) {
2881
+ kunmap(p);
18392882
2883
+ /* Ensure the cacheline(s) containing the last valid entries
2884
+ * of PGD is invalidated from the GPU cache, before the
2885
+ * PGD page is freed.
2886
+ */
2887
+ kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx,
2888
+ pgd + (index * sizeof(u64)),
2889
+ pcount * sizeof(u64), flush_op);
2890
+
2891
+ kbase_mmu_add_to_free_pgds_list(mmut, p);
2892
+
2893
+ kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level,
2894
+ flush_op, dirty_pgds);
2895
+
2896
+ vpfn += count;
2897
+ nr -= count;
2898
+ continue;
2899
+ }
2900
+
2901
+ mmu_mode->set_num_valid_entries(page, num_of_valid_entries);
2902
+
2903
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (index * sizeof(u64)),
2904
+ kbase_dma_addr(p) + (index * sizeof(u64)), pcount * sizeof(u64),
2905
+ flush_op);
18402906 next:
1841
- kunmap(phys_to_page(pgd));
2907
+ kunmap(p);
18422908 vpfn += count;
18432909 nr -= count;
18442910 }
1845
- err = 0;
18462911 out:
1847
- mutex_unlock(&mmut->mmu_lock);
2912
+ return 0;
2913
+}
18482914
1849
- if (mmut->kctx)
1850
- kbase_mmu_flush_invalidate(mmut->kctx, start_vpfn, requested_nr,
1851
- true);
1852
- else
1853
- kbase_mmu_flush_invalidate_no_ctx(kbdev, start_vpfn, requested_nr,
1854
- true, as_nr);
2915
+int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn,
2916
+ struct tagged_addr *phys, size_t nr_phys_pages, size_t nr_virt_pages,
2917
+ int as_nr, bool ignore_page_migration)
2918
+{
2919
+ u64 start_vpfn = vpfn;
2920
+ enum kbase_mmu_op_type flush_op = KBASE_MMU_OP_NONE;
2921
+ struct kbase_mmu_hw_op_param op_param;
2922
+ int err = -EFAULT;
2923
+ u64 dirty_pgds = 0;
2924
+ LIST_HEAD(free_pgds_list);
2925
+
2926
+ /* Calls to this function are inherently asynchronous, with respect to
2927
+ * MMU operations.
2928
+ */
2929
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
2930
+
2931
+ /* This function performs two operations: MMU maintenance and flushing
2932
+ * the caches. To ensure internal consistency between the caches and the
2933
+ * MMU, it does not make sense to be able to flush only the physical pages
2934
+ * from the cache and keep the PTE, nor does it make sense to use this
2935
+ * function to remove a PTE and keep the physical pages in the cache.
2936
+ *
2937
+ * However, we have legitimate cases where we can try to tear down a mapping
2938
+ * with zero virtual and zero physical pages, so we must have the following
2939
+ * behaviour:
2940
+ * - if both physical and virtual page counts are zero, return early
2941
+ * - if either the physical or the virtual page count is zero, return early
2942
+ * - if there are fewer physical pages than virtual pages, return -EINVAL
2943
+ */
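/* Illustrative, stand-alone sketch only (not part of this driver or of the
 * diff): a restatement of the early-return ladder described in the comment
 * just above, using plain size_t counts. The helper name is hypothetical.
 */
#include <errno.h>
#include <stddef.h>

static int teardown_args_check(size_t nr_phys_pages, size_t nr_virt_pages)
{
	if (nr_virt_pages == 0 || nr_phys_pages == 0)
		return 0;       /* nothing to tear down: succeed early */
	if (nr_virt_pages < nr_phys_pages)
		return -EINVAL; /* more physical pages than mapped virtual pages */
	return 1;               /* caller may proceed with the real teardown */
}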
2944
+ if (unlikely(nr_virt_pages == 0 || nr_phys_pages == 0))
2945
+ return 0;
2946
+
2947
+ if (unlikely(nr_virt_pages < nr_phys_pages))
2948
+ return -EINVAL;
2949
+
2950
+ /* MMU cache flush strategy depends on the number of pages to unmap. In both cases
2951
+ * the operation is invalidate but the granularity of cache maintenance may change
2952
+ * according to the situation.
2953
+ *
2954
+ * If GPU control command operations are present and the number of pages is "small",
2955
+ * then the optimal strategy is flushing on the physical address range of the pages
2956
+ * which are affected by the operation. That implies both the PGDs which are modified
2957
+ * or removed from the page table and the physical pages which are freed from memory.
2958
+ *
2959
+ * Otherwise, there's no alternative to invalidating the whole GPU cache.
2960
+ */
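/* Not part of the driver: a rough sense of scale for the threshold checked
 * just below, assuming 4 kB pages. Up to the threshold the affected physical
 * range is assumed small enough that range-based maintenance beats a full GPU
 * cache invalidation.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int threshold_pages = 20; /* KBASE_PA_RANGE_THRESHOLD_NR_PAGES */
	const unsigned int page_size = 4096;     /* assumed page size */

	printf("range-based flush limit: %u bytes (%u KiB)\n",
	       threshold_pages * page_size, threshold_pages * page_size / 1024);
	return 0;
}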
2961
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev) && phys &&
2962
+ nr_phys_pages <= KBASE_PA_RANGE_THRESHOLD_NR_PAGES)
2963
+ flush_op = KBASE_MMU_OP_FLUSH_PT;
2964
+
2965
+ mutex_lock(&mmut->mmu_lock);
2966
+
2967
+ err = kbase_mmu_teardown_pgd_pages(kbdev, mmut, vpfn, nr_virt_pages, &dirty_pgds,
2968
+ &free_pgds_list, flush_op);
2969
+
2970
+ /* Set up MMU operation parameters. See above about MMU cache flush strategy. */
2971
+ op_param = (struct kbase_mmu_hw_op_param){
2972
+ .vpfn = start_vpfn,
2973
+ .nr = nr_virt_pages,
2974
+ .mmu_sync_info = mmu_sync_info,
2975
+ .kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF,
2976
+ .op = (flush_op == KBASE_MMU_OP_FLUSH_PT) ? KBASE_MMU_OP_FLUSH_PT :
2977
+ KBASE_MMU_OP_FLUSH_MEM,
2978
+ .flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds),
2979
+ };
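/* Minimal user-space sketch (not driver code) of the dirty_pgds bookkeeping
 * used above: every level whose PGD was updated sets one bit (level 0 = bit 0,
 * bottom level = bit 3), and the later flush is assumed to be allowed to skip
 * exactly those levels whose bit stayed clear. The values below are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dirty_pgds = 0;
	unsigned int level;

	/* Pretend the teardown only touched entries at levels 2 and 3. */
	dirty_pgds |= 1ULL << 2;
	dirty_pgds |= 1ULL << 3;

	for (level = 0; level < 4; level++)
		printf("level %u: %s\n", level,
		       (dirty_pgds & (1ULL << level)) ? "must flush" : "may be skipped");
	return 0;
}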
2980
+ mmu_flush_invalidate_teardown_pages(kbdev, mmut->kctx, as_nr, phys, nr_phys_pages,
2981
+ &op_param);
2982
+
2983
+ /* If page migration is enabled: the status of all physical pages involved
2984
+ * shall be updated, unless they are not movable. Their status shall be
2985
+ * updated before releasing the lock to protect against concurrent
2986
+ * requests to migrate the pages, if they have been isolated.
2987
+ */
2988
+ if (kbase_page_migration_enabled && phys && !ignore_page_migration)
2989
+ kbase_mmu_progress_migration_on_teardown(kbdev, phys, nr_phys_pages);
2990
+
2991
+ kbase_mmu_free_pgds_list(kbdev, mmut);
2992
+
2993
+ mutex_unlock(&mmut->mmu_lock);
18552994
18562995 return err;
18572996 }
1858
-
18592997 KBASE_EXPORT_TEST_API(kbase_mmu_teardown_pages);
18602998
18612999 /**
1862
- * kbase_mmu_update_pages_no_flush() - Update page table entries on the GPU
3000
+ * kbase_mmu_update_pages_no_flush() - Update phy pages and attributes data in GPU
3001
+ * page table entries
18633002 *
1864
- * This will update page table entries that already exist on the GPU based on
1865
- * the new flags that are passed. It is used as a response to the changes of
1866
- * the memory attributes
1867
- *
1868
- * The caller is responsible for validating the memory attributes
1869
- *
1870
- * @kctx: Kbase context
3003
+ * @kbdev: Pointer to kbase device.
3004
+ * @mmut: The involved MMU table
18713005 * @vpfn: Virtual PFN (Page Frame Number) of the first page to update
1872
- * @phys: Tagged physical addresses of the physical pages to replace the
1873
- * current mappings
3006
+ * @phys: Pointer to the array of tagged physical addresses of the physical
3007
+ * pages that are pointed to by the page table entries (that need to
3008
+ * be updated). The pointer should be within the reg->gpu_alloc->pages
3009
+ * array.
18743010 * @nr: Number of pages to update
18753011 * @flags: Flags
18763012 * @group_id: The physical memory group in which the page was allocated.
18773013 * Valid range is 0..(MEMORY_GROUP_MANAGER_NR_GROUPS-1).
3014
+ * @dirty_pgds: Flags to track every level where a PGD has been updated.
3015
+ *
3016
+ * This will update page table entries that already exist on the GPU based on
3017
+ * new flags and replace any existing phy pages that are passed (the PGD pages
3018
+ * remain unchanged). It is used as a response to the changes of phys as well
3019
+ * as the memory attributes.
3020
+ *
3021
+ * The caller is responsible for validating the memory attributes.
3022
+ *
3023
+ * Return: 0 if the attributes data in page table entries were updated
3024
+ * successfully, otherwise an error code.
18783025 */
1879
-static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn,
1880
- struct tagged_addr *phys, size_t nr,
1881
- unsigned long flags, int const group_id)
3026
+static int kbase_mmu_update_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
3027
+ u64 vpfn, struct tagged_addr *phys, size_t nr,
3028
+ unsigned long flags, int const group_id, u64 *dirty_pgds)
18823029 {
18833030 phys_addr_t pgd;
18843031 u64 *pgd_page;
18853032 int err;
1886
- struct kbase_device *kbdev;
1887
-
1888
- if (WARN_ON(kctx == NULL))
1889
- return -EINVAL;
18903033
18913034 KBASE_DEBUG_ASSERT(vpfn <= (U64_MAX / PAGE_SIZE));
18923035
....@@ -1894,146 +3037,550 @@
18943037 if (nr == 0)
18953038 return 0;
18963039
1897
- mutex_lock(&kctx->mmu.mmu_lock);
1898
-
1899
- kbdev = kctx->kbdev;
3040
+ mutex_lock(&mmut->mmu_lock);
19003041
19013042 while (nr) {
19023043 unsigned int i;
19033044 unsigned int index = vpfn & 0x1FF;
19043045 size_t count = KBASE_MMU_PAGE_ENTRIES - index;
19053046 struct page *p;
3047
+ register unsigned int num_of_valid_entries;
3048
+ int cur_level = MIDGARD_MMU_BOTTOMLEVEL;
19063049
19073050 if (count > nr)
19083051 count = nr;
19093052
1910
- do {
1911
- err = mmu_get_bottom_pgd(kbdev, &kctx->mmu,
1912
- vpfn, &pgd);
1913
- if (err != -ENOMEM)
1914
- break;
1915
- /* Fill the memory pool with enough pages for
1916
- * the page walk to succeed
1917
- */
1918
- mutex_unlock(&kctx->mmu.mmu_lock);
1919
- err = kbase_mem_pool_grow(
1920
-#ifdef CONFIG_MALI_2MB_ALLOC
1921
- &kbdev->mem_pools.large[
1922
-#else
1923
- &kbdev->mem_pools.small[
1924
-#endif
1925
- kctx->mmu.group_id],
1926
- MIDGARD_MMU_BOTTOMLEVEL);
1927
- mutex_lock(&kctx->mmu.mmu_lock);
1928
- } while (!err);
1929
- if (err) {
1930
- dev_warn(kbdev->dev,
1931
- "mmu_get_bottom_pgd failure\n");
3053
+ if (is_huge(*phys) && (index == index_in_large_page(*phys)))
3054
+ cur_level = MIDGARD_MMU_LEVEL(2);
3055
+
3056
+ err = mmu_get_pgd_at_level(kbdev, mmut, vpfn, cur_level, &pgd);
3057
+ if (WARN_ON(err))
19323058 goto fail_unlock;
1933
- }
19343059
19353060 p = pfn_to_page(PFN_DOWN(pgd));
19363061 pgd_page = kmap(p);
19373062 if (!pgd_page) {
1938
- dev_warn(kbdev->dev, "kmap failure\n");
3063
+ dev_warn(kbdev->dev, "kmap failure on update_pages");
19393064 err = -ENOMEM;
19403065 goto fail_unlock;
19413066 }
19423067
1943
- for (i = 0; i < count; i++)
1944
- pgd_page[index + i] = kbase_mmu_create_ate(kbdev,
1945
- phys[i], flags, MIDGARD_MMU_BOTTOMLEVEL,
1946
- group_id);
3068
+ num_of_valid_entries =
3069
+ kbdev->mmu_mode->get_num_valid_entries(pgd_page);
3070
+
3071
+ if (cur_level == MIDGARD_MMU_LEVEL(2)) {
3072
+ int level_index = (vpfn >> 9) & 0x1FF;
3073
+ struct tagged_addr *target_phys =
3074
+ phys - index_in_large_page(*phys);
3075
+
3076
+#ifdef CONFIG_MALI_BIFROST_DEBUG
3077
+ WARN_ON_ONCE(!kbdev->mmu_mode->ate_is_valid(
3078
+ pgd_page[level_index], MIDGARD_MMU_LEVEL(2)));
3079
+#endif
3080
+ pgd_page[level_index] = kbase_mmu_create_ate(kbdev,
3081
+ *target_phys, flags, MIDGARD_MMU_LEVEL(2),
3082
+ group_id);
3083
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (level_index * sizeof(u64)),
3084
+ kbase_dma_addr(p) + (level_index * sizeof(u64)),
3085
+ sizeof(u64), KBASE_MMU_OP_NONE);
3086
+ } else {
3087
+ for (i = 0; i < count; i++) {
3088
+#ifdef CONFIG_MALI_BIFROST_DEBUG
3089
+ WARN_ON_ONCE(!kbdev->mmu_mode->ate_is_valid(
3090
+ pgd_page[index + i],
3091
+ MIDGARD_MMU_BOTTOMLEVEL));
3092
+#endif
3093
+ pgd_page[index + i] = kbase_mmu_create_ate(kbdev,
3094
+ phys[i], flags, MIDGARD_MMU_BOTTOMLEVEL,
3095
+ group_id);
3096
+ }
3097
+
3098
+ /* MMU cache flush strategy is NONE because GPU cache maintenance
3099
+ * will be done by the caller.
3100
+ */
3101
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (index * sizeof(u64)),
3102
+ kbase_dma_addr(p) + (index * sizeof(u64)),
3103
+ count * sizeof(u64), KBASE_MMU_OP_NONE);
3104
+ }
3105
+
3106
+ kbdev->mmu_mode->set_num_valid_entries(pgd_page,
3107
+ num_of_valid_entries);
3108
+
3109
+ if (dirty_pgds && count > 0)
3110
+ *dirty_pgds |= 1ULL << cur_level;
19473111
19483112 phys += count;
19493113 vpfn += count;
19503114 nr -= count;
19513115
1952
- kbase_mmu_sync_pgd(kbdev,
1953
- kbase_dma_addr(p) + (index * sizeof(u64)),
1954
- count * sizeof(u64));
1955
-
1956
- kunmap(pfn_to_page(PFN_DOWN(pgd)));
3116
+ kunmap(p);
19573117 }
19583118
1959
- mutex_unlock(&kctx->mmu.mmu_lock);
3119
+ mutex_unlock(&mmut->mmu_lock);
19603120 return 0;
19613121
19623122 fail_unlock:
1963
- mutex_unlock(&kctx->mmu.mmu_lock);
3123
+ mutex_unlock(&mmut->mmu_lock);
19643124 return err;
19653125 }
19663126
1967
-int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn,
1968
- struct tagged_addr *phys, size_t nr,
1969
- unsigned long flags, int const group_id)
3127
+static int kbase_mmu_update_pages_common(struct kbase_device *kbdev, struct kbase_context *kctx,
3128
+ u64 vpfn, struct tagged_addr *phys, size_t nr,
3129
+ unsigned long flags, int const group_id)
19703130 {
19713131 int err;
3132
+ struct kbase_mmu_hw_op_param op_param;
3133
+ u64 dirty_pgds = 0;
3134
+ struct kbase_mmu_table *mmut;
3135
+ /* Calls to this function are inherently asynchronous, with respect to
3136
+ * MMU operations.
3137
+ */
3138
+ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
3139
+ int as_nr;
19723140
1973
- err = kbase_mmu_update_pages_no_flush(kctx, vpfn, phys, nr, flags,
1974
- group_id);
1975
- kbase_mmu_flush_invalidate(kctx, vpfn, nr, true);
3141
+#if !MALI_USE_CSF
3142
+ if (unlikely(kctx == NULL))
3143
+ return -EINVAL;
3144
+
3145
+ as_nr = kctx->as_nr;
3146
+ mmut = &kctx->mmu;
3147
+#else
3148
+ if (kctx) {
3149
+ mmut = &kctx->mmu;
3150
+ as_nr = kctx->as_nr;
3151
+ } else {
3152
+ mmut = &kbdev->csf.mcu_mmu;
3153
+ as_nr = MCU_AS_NR;
3154
+ }
3155
+#endif
3156
+
3157
+ err = kbase_mmu_update_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id,
3158
+ &dirty_pgds);
3159
+
3160
+ op_param = (const struct kbase_mmu_hw_op_param){
3161
+ .vpfn = vpfn,
3162
+ .nr = nr,
3163
+ .op = KBASE_MMU_OP_FLUSH_MEM,
3164
+ .kctx_id = kctx ? kctx->id : 0xFFFFFFFF,
3165
+ .mmu_sync_info = mmu_sync_info,
3166
+ .flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds),
3167
+ };
3168
+
3169
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev))
3170
+ mmu_flush_invalidate_on_gpu_ctrl(kbdev, kctx, as_nr, &op_param);
3171
+ else
3172
+ mmu_flush_invalidate(kbdev, kctx, as_nr, &op_param);
3173
+
19763174 return err;
19773175 }
19783176
1979
-static void mmu_teardown_level(struct kbase_device *kbdev,
1980
- struct kbase_mmu_table *mmut, phys_addr_t pgd,
1981
- int level, u64 *pgd_page_buffer)
3177
+int kbase_mmu_update_pages(struct kbase_context *kctx, u64 vpfn, struct tagged_addr *phys,
3178
+ size_t nr, unsigned long flags, int const group_id)
19823179 {
1983
- phys_addr_t target_pgd;
1984
- struct page *p;
3180
+ if (unlikely(kctx == NULL))
3181
+ return -EINVAL;
3182
+
3183
+ return kbase_mmu_update_pages_common(kctx->kbdev, kctx, vpfn, phys, nr, flags, group_id);
3184
+}
3185
+
3186
+#if MALI_USE_CSF
3187
+int kbase_mmu_update_csf_mcu_pages(struct kbase_device *kbdev, u64 vpfn, struct tagged_addr *phys,
3188
+ size_t nr, unsigned long flags, int const group_id)
3189
+{
3190
+ return kbase_mmu_update_pages_common(kbdev, NULL, vpfn, phys, nr, flags, group_id);
3191
+}
3192
+#endif /* MALI_USE_CSF */
3193
+
3194
+static void mmu_page_migration_transaction_begin(struct kbase_device *kbdev)
3195
+{
3196
+ lockdep_assert_held(&kbdev->hwaccess_lock);
3197
+
3198
+ WARN_ON_ONCE(kbdev->mmu_page_migrate_in_progress);
3199
+ kbdev->mmu_page_migrate_in_progress = true;
3200
+}
3201
+
3202
+static void mmu_page_migration_transaction_end(struct kbase_device *kbdev)
3203
+{
3204
+ lockdep_assert_held(&kbdev->hwaccess_lock);
3205
+ WARN_ON_ONCE(!kbdev->mmu_page_migrate_in_progress);
3206
+ kbdev->mmu_page_migrate_in_progress = false;
3207
+ /* Invoke the PM state machine, as the MMU page migration session
3208
+ * may have deferred a transition in L2 state machine.
3209
+ */
3210
+ kbase_pm_update_state(kbdev);
3211
+}
3212
+
3213
+int kbase_mmu_migrate_page(struct tagged_addr old_phys, struct tagged_addr new_phys,
3214
+ dma_addr_t old_dma_addr, dma_addr_t new_dma_addr, int level)
3215
+{
3216
+ struct kbase_page_metadata *page_md = kbase_page_private(as_page(old_phys));
3217
+ struct kbase_mmu_hw_op_param op_param;
3218
+ struct kbase_mmu_table *mmut = (level == MIDGARD_MMU_BOTTOMLEVEL) ?
3219
+ page_md->data.mapped.mmut :
3220
+ page_md->data.pt_mapped.mmut;
3221
+ struct kbase_device *kbdev;
3222
+ phys_addr_t pgd;
3223
+ u64 *old_page, *new_page, *pgd_page, *target, vpfn;
3224
+ int index, check_state, ret = 0;
3225
+ unsigned long hwaccess_flags = 0;
3226
+ unsigned int num_of_valid_entries;
3227
+ u8 vmap_count = 0;
3228
+
3229
+ /* Due to the hard binding of mmu_command_instr with kctx_id via kbase_mmu_hw_op_param,
3230
+ * here we skip the no kctx case, which is only used with MCU's mmut.
3231
+ */
3232
+ if (!mmut->kctx)
3233
+ return -EINVAL;
3234
+
3235
+ if (level > MIDGARD_MMU_BOTTOMLEVEL)
3236
+ return -EINVAL;
3237
+ else if (level == MIDGARD_MMU_BOTTOMLEVEL)
3238
+ vpfn = page_md->data.mapped.vpfn;
3239
+ else
3240
+ vpfn = PGD_VPFN_LEVEL_GET_VPFN(page_md->data.pt_mapped.pgd_vpfn_level);
3241
+
3242
+ kbdev = mmut->kctx->kbdev;
3243
+ index = (vpfn >> ((3 - level) * 9)) & 0x1FF;
3244
+
3245
+ /* Create all mappings before copying content.
3246
+ * This is done as early as possible because is the only operation that may
3247
+ * fail. It is possible to do this before taking any locks because the
3248
+ * pages to migrate are not going to change and even the parent PGD is not
3249
+ * going to be affected by any other concurrent operation, since the page
3250
+ * has been isolated before migration and therefore it cannot disappear in
3251
+ * the middle of this function.
3252
+ */
3253
+ old_page = kmap(as_page(old_phys));
3254
+ if (!old_page) {
3255
+ dev_warn(kbdev->dev, "%s: kmap failure for old page.", __func__);
3256
+ ret = -EINVAL;
3257
+ goto old_page_map_error;
3258
+ }
3259
+
3260
+ new_page = kmap(as_page(new_phys));
3261
+ if (!new_page) {
3262
+ dev_warn(kbdev->dev, "%s: kmap failure for new page.", __func__);
3263
+ ret = -EINVAL;
3264
+ goto new_page_map_error;
3265
+ }
3266
+
3267
+ /* GPU cache maintenance affects both memory content and page table,
3268
+ * but at two different stages. A single virtual memory page is affected
3269
+ * by the migration.
3270
+ *
3271
+ * Notice that the MMU maintenance is done in the following steps:
3272
+ *
3273
+ * 1) The MMU region is locked without performing any other operation.
3274
+ * This lock must cover the entire migration process, in order to
3275
+ * prevent any GPU access to the virtual page whose physical page
3276
+ * is being migrated.
3277
+ * 2) Immediately after locking: the MMU region content is flushed via
3278
+ * GPU control while the lock is taken and without unlocking.
3279
+ * The region must stay locked for the duration of the whole page
3280
+ * migration procedure.
3281
+ * This is necessary to make sure that pending writes to the old page
3282
+ * are finalized before copying content to the new page.
3283
+ * 3) Before unlocking: changes to the page table are flushed.
3284
+ * Finer-grained GPU control operations are used if possible, otherwise
3285
+ * the whole GPU cache shall be flushed again.
3286
+ * This is necessary to make sure that the GPU accesses the new page
3287
+ * after migration.
3288
+ * 4) The MMU region is unlocked.
3289
+ */
3290
+#define PGD_VPFN_MASK(level) (~((((u64)1) << ((3 - level) * 9)) - 1))
3291
+ op_param.mmu_sync_info = CALLER_MMU_ASYNC;
3292
+ op_param.kctx_id = mmut->kctx->id;
3293
+ op_param.vpfn = vpfn & PGD_VPFN_MASK(level);
3294
+ op_param.nr = 1 << ((3 - level) * 9);
3295
+ op_param.op = KBASE_MMU_OP_FLUSH_PT;
3296
+ /* When level is not MIDGARD_MMU_BOTTOMLEVEL, it is assumed PGD page migration */
3297
+ op_param.flush_skip_levels = (level == MIDGARD_MMU_BOTTOMLEVEL) ?
3298
+ pgd_level_to_skip_flush(1ULL << level) :
3299
+ pgd_level_to_skip_flush(3ULL << level);
3300
+
3301
+ mutex_lock(&mmut->mmu_lock);
3302
+
3303
+ /* The state was evaluated before entering this function, but it could
3304
+ * have changed before the mmu_lock was taken. However, the state
3305
+ * transitions which are possible at this point are only two, and in both
3306
+ * cases it is a stable state progressing to a "free in progress" state.
3307
+ *
3308
+ * After taking the mmu_lock the state can no longer change: read it again
3309
+ * and make sure that it hasn't changed before continuing.
3310
+ */
3311
+ spin_lock(&page_md->migrate_lock);
3312
+ check_state = PAGE_STATUS_GET(page_md->status);
3313
+ if (level == MIDGARD_MMU_BOTTOMLEVEL)
3314
+ vmap_count = page_md->vmap_count;
3315
+ spin_unlock(&page_md->migrate_lock);
3316
+
3317
+ if (level == MIDGARD_MMU_BOTTOMLEVEL) {
3318
+ if (check_state != ALLOCATED_MAPPED) {
3319
+ dev_dbg(kbdev->dev,
3320
+ "%s: state changed to %d (was %d), abort page migration", __func__,
3321
+ check_state, ALLOCATED_MAPPED);
3322
+ ret = -EAGAIN;
3323
+ goto page_state_change_out;
3324
+ } else if (vmap_count > 0) {
3325
+ dev_dbg(kbdev->dev, "%s: page was multi-mapped, abort page migration",
3326
+ __func__);
3327
+ ret = -EAGAIN;
3328
+ goto page_state_change_out;
3329
+ }
3330
+ } else {
3331
+ if (check_state != PT_MAPPED) {
3332
+ dev_dbg(kbdev->dev,
3333
+ "%s: state changed to %d (was %d), abort PGD page migration",
3334
+ __func__, check_state, PT_MAPPED);
3335
+ WARN_ON_ONCE(check_state != FREE_PT_ISOLATED_IN_PROGRESS);
3336
+ ret = -EAGAIN;
3337
+ goto page_state_change_out;
3338
+ }
3339
+ }
3340
+
3341
+ ret = mmu_get_pgd_at_level(kbdev, mmut, vpfn, level, &pgd);
3342
+ if (ret) {
3343
+ dev_err(kbdev->dev, "%s: failed to find PGD for old page.", __func__);
3344
+ goto get_pgd_at_level_error;
3345
+ }
3346
+
3347
+ pgd_page = kmap(phys_to_page(pgd));
3348
+ if (!pgd_page) {
3349
+ dev_warn(kbdev->dev, "%s: kmap failure for PGD page.", __func__);
3350
+ ret = -EINVAL;
3351
+ goto pgd_page_map_error;
3352
+ }
3353
+
3354
+ mutex_lock(&kbdev->pm.lock);
3355
+ mutex_lock(&kbdev->mmu_hw_mutex);
3356
+
3357
+ /* Lock MMU region and flush GPU cache by using GPU control,
3358
+ * in order to keep MMU region locked.
3359
+ */
3360
+ spin_lock_irqsave(&kbdev->hwaccess_lock, hwaccess_flags);
3361
+ if (unlikely(!kbase_pm_l2_allow_mmu_page_migration(kbdev))) {
3362
+ /* Defer the migration as L2 is in a transitional phase */
3363
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, hwaccess_flags);
3364
+ mutex_unlock(&kbdev->mmu_hw_mutex);
3365
+ mutex_unlock(&kbdev->pm.lock);
3366
+ dev_dbg(kbdev->dev, "%s: L2 in transition, abort PGD page migration", __func__);
3367
+ ret = -EAGAIN;
3368
+ goto l2_state_defer_out;
3369
+ }
3370
+ /* Prevent transitional phases in L2 by starting the transaction */
3371
+ mmu_page_migration_transaction_begin(kbdev);
3372
+ if (kbdev->pm.backend.gpu_powered && mmut->kctx->as_nr >= 0) {
3373
+ int as_nr = mmut->kctx->as_nr;
3374
+ struct kbase_as *as = &kbdev->as[as_nr];
3375
+
3376
+ ret = kbase_mmu_hw_do_lock(kbdev, as, &op_param);
3377
+ if (!ret) {
3378
+ ret = kbase_gpu_cache_flush_and_busy_wait(
3379
+ kbdev, GPU_COMMAND_CACHE_CLN_INV_L2_LSC);
3380
+ }
3381
+ if (ret)
3382
+ mmu_page_migration_transaction_end(kbdev);
3383
+ }
3384
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, hwaccess_flags);
3385
+
3386
+ if (ret < 0) {
3387
+ mutex_unlock(&kbdev->mmu_hw_mutex);
3388
+ mutex_unlock(&kbdev->pm.lock);
3389
+ dev_err(kbdev->dev, "%s: failed to lock MMU region or flush GPU cache", __func__);
3390
+ goto undo_mappings;
3391
+ }
3392
+
3393
+ /* Copy memory content.
3394
+ *
3395
+ * It is necessary to claim the ownership of the DMA buffer for the old
3396
+ * page before performing the copy, to make sure of reading a consistent
3397
+ * version of its content, before copying. After the copy, ownership of
3398
+ * the DMA buffer for the new page is given to the GPU in order to make
3399
+ * the content visible to potential GPU access that may happen as soon as
3400
+ * this function releases the lock on the MMU region.
3401
+ */
3402
+ dma_sync_single_for_cpu(kbdev->dev, old_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
3403
+ memcpy(new_page, old_page, PAGE_SIZE);
3404
+ dma_sync_single_for_device(kbdev->dev, new_dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
3405
+
3406
+ /* Remap GPU virtual page.
3407
+ *
3408
+ * This code rests on the assumption that page migration is only enabled
3409
+ * for 4 kB pages, that necessarily live in the bottom level of the MMU
3410
+ * page table. For this reason, the PGD level tells us unequivocally
3411
+ * whether the page being migrated is a "content page" or another PGD
3412
+ * of the page table:
3413
+ *
3414
+ * - Bottom level implies ATE (Address Translation Entry)
3415
+ * - Any other level implies PTE (Page Table Entry)
3416
+ *
3417
+ * The current implementation doesn't handle the case of a level 0 PGD,
3418
+ * that is: the root PGD of the page table.
3419
+ */
3420
+ target = &pgd_page[index];
3421
+
3422
+ /* Certain entries of a page table page encode the count of valid entries
3423
+ * present in that page. So need to save & restore the count information
3424
+ * when updating the PTE/ATE to point to the new page.
3425
+ */
3426
+ num_of_valid_entries = kbdev->mmu_mode->get_num_valid_entries(pgd_page);
3427
+
3428
+ if (level == MIDGARD_MMU_BOTTOMLEVEL) {
3429
+ WARN_ON_ONCE((*target & 1UL) == 0);
3430
+ *target =
3431
+ kbase_mmu_create_ate(kbdev, new_phys, page_md->data.mapped.reg->flags,
3432
+ level, page_md->data.mapped.reg->gpu_alloc->group_id);
3433
+ } else {
3434
+ u64 managed_pte;
3435
+
3436
+#ifdef CONFIG_MALI_BIFROST_DEBUG
3437
+ /* The PTE should be pointing to the page being migrated */
3438
+ WARN_ON_ONCE(as_phys_addr_t(old_phys) != kbdev->mmu_mode->pte_to_phy_addr(
3439
+ kbdev->mgm_dev->ops.mgm_pte_to_original_pte(
3440
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, pgd_page[index])));
3441
+#endif
3442
+ kbdev->mmu_mode->entry_set_pte(&managed_pte, as_phys_addr_t(new_phys));
3443
+ *target = kbdev->mgm_dev->ops.mgm_update_gpu_pte(
3444
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, managed_pte);
3445
+ }
3446
+
3447
+ kbdev->mmu_mode->set_num_valid_entries(pgd_page, num_of_valid_entries);
3448
+
3449
+ /* This function always updates a single entry inside an existing PGD,
3450
+ * therefore cache maintenance is necessary and affects a single entry.
3451
+ */
3452
+ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (index * sizeof(u64)),
3453
+ kbase_dma_addr(phys_to_page(pgd)) + (index * sizeof(u64)), sizeof(u64),
3454
+ KBASE_MMU_OP_FLUSH_PT);
3455
+
3456
+ /* Unlock MMU region.
3457
+ *
3458
+ * Notice that GPUs which don't issue flush commands via GPU control
3459
+ * still need an additional GPU cache flush here, this time only
3460
+ * for the page table, because the function call above to sync PGDs
3461
+ * won't have any effect on them.
3462
+ */
3463
+ spin_lock_irqsave(&kbdev->hwaccess_lock, hwaccess_flags);
3464
+ if (kbdev->pm.backend.gpu_powered && mmut->kctx->as_nr >= 0) {
3465
+ int as_nr = mmut->kctx->as_nr;
3466
+ struct kbase_as *as = &kbdev->as[as_nr];
3467
+
3468
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev)) {
3469
+ ret = kbase_mmu_hw_do_unlock(kbdev, as, &op_param);
3470
+ } else {
3471
+ ret = kbase_gpu_cache_flush_and_busy_wait(kbdev,
3472
+ GPU_COMMAND_CACHE_CLN_INV_L2);
3473
+ if (!ret)
3474
+ ret = kbase_mmu_hw_do_unlock_no_addr(kbdev, as, &op_param);
3475
+ }
3476
+ }
3477
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, hwaccess_flags);
3478
+ /* Releasing locks before checking the migration transaction error state */
3479
+ mutex_unlock(&kbdev->mmu_hw_mutex);
3480
+ mutex_unlock(&kbdev->pm.lock);
3481
+
3482
+ spin_lock_irqsave(&kbdev->hwaccess_lock, hwaccess_flags);
3483
+ /* Release the transition prevention in L2 by ending the transaction */
3484
+ mmu_page_migration_transaction_end(kbdev);
3485
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, hwaccess_flags);
3486
+
3487
+ /* Checking the final migration transaction error state */
3488
+ if (ret < 0) {
3489
+ dev_err(kbdev->dev, "%s: failed to unlock MMU region.", __func__);
3490
+ goto undo_mappings;
3491
+ }
3492
+
3493
+ /* Undertaking metadata transfer, while we are holding the mmu_lock */
3494
+ spin_lock(&page_md->migrate_lock);
3495
+ if (level == MIDGARD_MMU_BOTTOMLEVEL) {
3496
+ size_t page_array_index =
3497
+ page_md->data.mapped.vpfn - page_md->data.mapped.reg->start_pfn;
3498
+
3499
+ WARN_ON(PAGE_STATUS_GET(page_md->status) != ALLOCATED_MAPPED);
3500
+
3501
+ /* Replace page in array of pages of the physical allocation. */
3502
+ page_md->data.mapped.reg->gpu_alloc->pages[page_array_index] = new_phys;
3503
+ }
3504
+ /* Update the new page dma_addr with the transferred metadata from the old_page */
3505
+ page_md->dma_addr = new_dma_addr;
3506
+ page_md->status = PAGE_ISOLATE_SET(page_md->status, 0);
3507
+ spin_unlock(&page_md->migrate_lock);
3508
+ set_page_private(as_page(new_phys), (unsigned long)page_md);
3509
+ /* Old page metadata pointer cleared as it is now owned by the new page */
3510
+ set_page_private(as_page(old_phys), 0);
3511
+
3512
+l2_state_defer_out:
3513
+ kunmap(phys_to_page(pgd));
3514
+pgd_page_map_error:
3515
+get_pgd_at_level_error:
3516
+page_state_change_out:
3517
+ mutex_unlock(&mmut->mmu_lock);
3518
+
3519
+ kunmap(as_page(new_phys));
3520
+new_page_map_error:
3521
+ kunmap(as_page(old_phys));
3522
+old_page_map_error:
3523
+ return ret;
3524
+
3525
+undo_mappings:
3526
+ /* Unlock the MMU table and undo mappings. */
3527
+ mutex_unlock(&mmut->mmu_lock);
3528
+ kunmap(phys_to_page(pgd));
3529
+ kunmap(as_page(new_phys));
3530
+ kunmap(as_page(old_phys));
3531
+
3532
+ return ret;
3533
+}
3534
+
3535
+static void mmu_teardown_level(struct kbase_device *kbdev, struct kbase_mmu_table *mmut,
3536
+ phys_addr_t pgd, unsigned int level)
3537
+{
19853538 u64 *pgd_page;
19863539 int i;
1987
- struct kbase_mmu_mode const *mmu_mode;
3540
+ struct memory_group_manager_device *mgm_dev = kbdev->mgm_dev;
3541
+ struct kbase_mmu_mode const *mmu_mode = kbdev->mmu_mode;
3542
+ u64 *pgd_page_buffer = NULL;
3543
+ struct page *p = phys_to_page(pgd);
19883544
19893545 lockdep_assert_held(&mmut->mmu_lock);
19903546
1991
- pgd_page = kmap_atomic(pfn_to_page(PFN_DOWN(pgd)));
3547
+ pgd_page = kmap_atomic(p);
19923548 /* kmap_atomic should NEVER fail. */
1993
- if (WARN_ON(pgd_page == NULL))
3549
+ if (WARN_ON_ONCE(pgd_page == NULL))
19943550 return;
1995
- /* Copy the page to our preallocated buffer so that we can minimize
1996
- * kmap_atomic usage
3551
+ if (level < MIDGARD_MMU_BOTTOMLEVEL) {
3552
+ /* Copy the page to our preallocated buffer so that we can minimize
3553
+ * kmap_atomic usage
3554
+ */
3555
+ pgd_page_buffer = mmut->scratch_mem.teardown_pages.levels[level];
3556
+ memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE);
3557
+ }
3558
+
3559
+ /* When page migration is enabled, kbase_region_tracker_term() would ensure
3560
+ * there are no pages left mapped on the GPU for a context. Hence the count
3561
+ * of valid entries is expected to be zero here.
19973562 */
1998
- memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE);
3563
+ if (kbase_page_migration_enabled && mmut->kctx)
3564
+ WARN_ON_ONCE(kbdev->mmu_mode->get_num_valid_entries(pgd_page));
3565
+ /* Invalidate page after copying */
3566
+ mmu_mode->entries_invalidate(pgd_page, KBASE_MMU_PAGE_ENTRIES);
19993567 kunmap_atomic(pgd_page);
20003568 pgd_page = pgd_page_buffer;
20013569
2002
- mmu_mode = kbdev->mmu_mode;
2003
-
2004
- for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) {
2005
- target_pgd = mmu_mode->pte_to_phy_addr(pgd_page[i]);
2006
-
2007
- if (target_pgd) {
3570
+ if (level < MIDGARD_MMU_BOTTOMLEVEL) {
3571
+ for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) {
20083572 if (mmu_mode->pte_is_valid(pgd_page[i], level)) {
2009
- mmu_teardown_level(kbdev, mmut,
2010
- target_pgd,
2011
- level + 1,
2012
- pgd_page_buffer +
2013
- (PAGE_SIZE / sizeof(u64)));
3573
+ phys_addr_t target_pgd = mmu_mode->pte_to_phy_addr(
3574
+ mgm_dev->ops.mgm_pte_to_original_pte(mgm_dev,
3575
+ MGM_DEFAULT_PTE_GROUP,
3576
+ level, pgd_page[i]));
3577
+
3578
+ mmu_teardown_level(kbdev, mmut, target_pgd, level + 1);
20143579 }
20153580 }
20163581 }
20173582
2018
- p = pfn_to_page(PFN_DOWN(pgd));
2019
-#ifdef CONFIG_MALI_2MB_ALLOC
2020
- kbase_mem_pool_free(&kbdev->mem_pools.large[mmut->group_id],
2021
-#else
2022
- kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id],
2023
-#endif
2024
- p, true);
2025
-
2026
- atomic_sub(1, &kbdev->memdev.used_pages);
2027
-
2028
- /* If MMU tables belong to a context then pages will have been accounted
2029
- * against it, so we must decrement the usage counts here.
2030
- */
2031
- if (mmut->kctx) {
2032
- kbase_process_page_usage_dec(mmut->kctx, 1);
2033
- atomic_sub(1, &mmut->kctx->used_pages);
2034
- }
2035
-
2036
- kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1);
3583
+ kbase_mmu_free_pgd(kbdev, mmut, pgd);
20373584 }
20383585
20393586 int kbase_mmu_init(struct kbase_device *const kbdev,
....@@ -2044,31 +3591,26 @@
20443591 WARN_ON(group_id < 0))
20453592 return -EINVAL;
20463593
3594
+ compiletime_assert(KBASE_MEM_ALLOC_MAX_SIZE <= (((8ull << 30) >> PAGE_SHIFT)),
3595
+ "List of free PGDs may not be large enough.");
3596
+ compiletime_assert(MAX_PAGES_FOR_FREE_PGDS >= MIDGARD_MMU_BOTTOMLEVEL,
3597
+ "Array of MMU levels is not large enough.");
3598
+
20473599 mmut->group_id = group_id;
20483600 mutex_init(&mmut->mmu_lock);
20493601 mmut->kctx = kctx;
3602
+ mmut->pgd = KBASE_MMU_INVALID_PGD_ADDRESS;
20503603
2051
- /* Preallocate MMU depth of four pages for mmu_teardown_level to use */
2052
- mmut->mmu_teardown_pages = kmalloc(PAGE_SIZE * 4, GFP_KERNEL);
2053
-
2054
- if (mmut->mmu_teardown_pages == NULL)
2055
- return -ENOMEM;
2056
-
2057
- mmut->pgd = 0;
20583604 /* We allocate pages into the kbdev memory pool, then
20593605 * kbase_mmu_alloc_pgd will allocate out of that pool. This is done to
20603606 * avoid allocations from the kernel happening with the lock held.
20613607 */
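/* Hypothetical, stand-alone sketch (every name below is made up, not driver
 * code) of the pattern described in the comment above: the step that may
 * allocate and sleep runs with no table lock held, and the locked step only
 * consumes what was preallocated, retrying if the pool turned out to be empty.
 * Locking of the pool's own state is omitted here for brevity.
 */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pool_slot;  /* single-slot "pool", grown outside the lock */
static void *table_pgd;  /* destination guarded by table_lock */

static int pool_grow(void) /* may allocate and sleep: called unlocked */
{
	if (!pool_slot)
		pool_slot = calloc(1, 4096);
	return pool_slot ? 0 : -1;
}

static void *pool_take(void) /* never allocates: only hands out the slot */
{
	void *p = pool_slot;

	pool_slot = NULL;
	return p;
}

int alloc_pgd_with_preallocation(void)
{
	void *got = NULL;

	while (!got) {
		if (pool_grow())
			return -1; /* -ENOMEM in the real driver */

		pthread_mutex_lock(&table_lock);
		if (!table_pgd)
			table_pgd = pool_take(); /* cheap, non-allocating step */
		got = table_pgd;
		pthread_mutex_unlock(&table_lock);
		/* if the pool was drained in the meantime, grow again and retry */
	}
	return 0;
}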
2062
- while (!mmut->pgd) {
3608
+ while (mmut->pgd == KBASE_MMU_INVALID_PGD_ADDRESS) {
20633609 int err;
20643610
20653611 err = kbase_mem_pool_grow(
2066
-#ifdef CONFIG_MALI_2MB_ALLOC
2067
- &kbdev->mem_pools.large[mmut->group_id],
2068
-#else
20693612 &kbdev->mem_pools.small[mmut->group_id],
2070
-#endif
2071
- MIDGARD_MMU_BOTTOMLEVEL);
3613
+ MIDGARD_MMU_BOTTOMLEVEL, kctx ? kctx->task : NULL);
20723614 if (err) {
20733615 kbase_mmu_term(kbdev, mmut);
20743616 return -ENOMEM;
....@@ -2084,25 +3626,43 @@
20843626
20853627 void kbase_mmu_term(struct kbase_device *kbdev, struct kbase_mmu_table *mmut)
20863628 {
2087
- if (mmut->pgd) {
3629
+ WARN((mmut->kctx) && (mmut->kctx->as_nr != KBASEP_AS_NR_INVALID),
3630
+ "kctx-%d_%d must first be scheduled out to flush GPU caches+tlbs before tearing down MMU tables",
3631
+ mmut->kctx->tgid, mmut->kctx->id);
3632
+
3633
+ if (mmut->pgd != KBASE_MMU_INVALID_PGD_ADDRESS) {
20883634 mutex_lock(&mmut->mmu_lock);
2089
- mmu_teardown_level(kbdev, mmut, mmut->pgd, MIDGARD_MMU_TOPLEVEL,
2090
- mmut->mmu_teardown_pages);
3635
+ mmu_teardown_level(kbdev, mmut, mmut->pgd, MIDGARD_MMU_TOPLEVEL);
20913636 mutex_unlock(&mmut->mmu_lock);
20923637
20933638 if (mmut->kctx)
20943639 KBASE_TLSTREAM_AUX_PAGESALLOC(kbdev, mmut->kctx->id, 0);
20953640 }
20963641
2097
- kfree(mmut->mmu_teardown_pages);
20983642 mutex_destroy(&mmut->mmu_lock);
20993643 }
21003644
2101
-void kbase_mmu_as_term(struct kbase_device *kbdev, int i)
3645
+void kbase_mmu_as_term(struct kbase_device *kbdev, unsigned int i)
21023646 {
21033647 destroy_workqueue(kbdev->as[i].pf_wq);
21043648 }
21053649
3650
+void kbase_mmu_flush_pa_range(struct kbase_device *kbdev, struct kbase_context *kctx,
3651
+ phys_addr_t phys, size_t size,
3652
+ enum kbase_mmu_op_type flush_op)
3653
+{
3654
+#if MALI_USE_CSF
3655
+ unsigned long irq_flags;
3656
+
3657
+ spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags);
3658
+ if (mmu_flush_cache_on_gpu_ctrl(kbdev) && (flush_op != KBASE_MMU_OP_NONE) &&
3659
+ kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0))
3660
+ mmu_flush_pa_range(kbdev, phys, size, KBASE_MMU_OP_FLUSH_PT);
3661
+ spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags);
3662
+#endif
3663
+}
3664
+
3665
+#ifdef CONFIG_MALI_VECTOR_DUMP
21063666 static size_t kbasep_mmu_dump_level(struct kbase_context *kctx, phys_addr_t pgd,
21073667 int level, char ** const buffer, size_t *size_left)
21083668 {
....@@ -2123,7 +3683,7 @@
21233683
21243684 pgd_page = kmap(pfn_to_page(PFN_DOWN(pgd)));
21253685 if (!pgd_page) {
2126
- dev_warn(kbdev->dev, "%s: kmap failure\n", __func__);
3686
+ dev_warn(kbdev->dev, "%s: kmap failure", __func__);
21273687 return 0;
21283688 }
21293689
....@@ -2148,7 +3708,9 @@
21483708 for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) {
21493709 if (mmu_mode->pte_is_valid(pgd_page[i], level)) {
21503710 target_pgd = mmu_mode->pte_to_phy_addr(
2151
- pgd_page[i]);
3711
+ kbdev->mgm_dev->ops.mgm_pte_to_original_pte(
3712
+ kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP,
3713
+ level, pgd_page[i]));
21523714
21533715 dump_size = kbasep_mmu_dump_level(kctx,
21543716 target_pgd, level + 1,
....@@ -2242,6 +3804,7 @@
22423804 return NULL;
22433805 }
22443806 KBASE_EXPORT_TEST_API(kbase_mmu_dump);
3807
+#endif /* CONFIG_MALI_VECTOR_DUMP */
22453808
22463809 void kbase_mmu_bus_fault_worker(struct work_struct *data)
22473810 {
....@@ -2274,8 +3837,7 @@
22743837 #ifdef CONFIG_MALI_ARBITER_SUPPORT
22753838 /* check if we still have GPU */
22763839 if (unlikely(kbase_is_gpu_removed(kbdev))) {
2277
- dev_dbg(kbdev->dev,
2278
- "%s: GPU has been removed\n", __func__);
3840
+ dev_dbg(kbdev->dev, "%s: GPU has been removed", __func__);
22793841 release_ctx(kbdev, kctx);
22803842 atomic_dec(&kbdev->faults_pending);
22813843 return;
....@@ -2293,6 +3855,13 @@
22933855
22943856 }
22953857
3858
+#if MALI_USE_CSF
3859
+ /* Before the GPU power off, wait is done for the completion of
3860
+ * in-flight MMU fault work items. So GPU is expected to remain
3861
+ * powered up whilst the bus fault handling is being done.
3862
+ */
3863
+ kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault);
3864
+#else
22963865 /* NOTE: If GPU already powered off for suspend,
22973866 * we don't need to switch to unmapped
22983867 */
....@@ -2301,6 +3870,7 @@
23013870 kbase_gpu_report_bus_fault_and_kill(kctx, faulting_as, fault);
23023871 kbase_pm_context_idle(kbdev);
23033872 }
3873
+#endif
23043874
23053875 release_ctx(kbdev, kctx);
23063876