From 1543e317f1da31b75942316931e8f491a8920811 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Thu, 04 Jan 2024 10:08:02 +0000
Subject: [PATCH] disable FB

---
 kernel/drivers/gpu/arm/bifrost/mali_kbase_mem.c | 1452 +++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 1,034 insertions(+), 418 deletions(-)

diff --git a/kernel/drivers/gpu/arm/bifrost/mali_kbase_mem.c b/kernel/drivers/gpu/arm/bifrost/mali_kbase_mem.c
index 2084171..1c94e9c 100644
--- a/kernel/drivers/gpu/arm/bifrost/mali_kbase_mem.c
+++ b/kernel/drivers/gpu/arm/bifrost/mali_kbase_mem.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
 /*
  *
- * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2010-2023 ARM Limited. All rights reserved.
  *
  * This program is free software and is provided to you under the terms of the
  * GNU General Public License version 2 as published by the Free Software
@@ -20,7 +20,7 @@
  */
 
 /**
- * Base kernel memory APIs
+ * DOC: Base kernel memory APIs
  */
 #include <linux/dma-buf.h>
 #include <linux/kernel.h>
@@ -44,6 +44,11 @@
 #include <mali_kbase_config_defaults.h>
 #include <mali_kbase_trace_gpu_mem.h>
 
+#define VA_REGION_SLAB_NAME_PREFIX "va-region-slab-"
+#define VA_REGION_SLAB_NAME_SIZE (DEVNAME_SIZE + sizeof(VA_REGION_SLAB_NAME_PREFIX) + 1)
+
+#if MALI_JIT_PRESSURE_LIMIT_BASE
+
 /*
  * Alignment of objects allocated by the GPU inside a just-in-time memory
  * region whose size is given by an end address
@@ -66,6 +71,7 @@
  */
 #define KBASE_GPU_ALLOCATED_OBJECT_MAX_BYTES (512u)
 
+#endif /* MALI_JIT_PRESSURE_LIMIT_BASE */
 
 /* Forward declarations */
 static void free_partial_locked(struct kbase_context *kctx,
@@ -89,10 +95,8 @@
 #error "Unknown CPU VA width for this architecture"
 #endif
 
-#if IS_ENABLED(CONFIG_64BIT)
-	if (kbase_ctx_flag(kctx, KCTX_COMPAT))
+	if (kbase_ctx_compat_mode(kctx))
 		cpu_va_bits = 32;
-#endif
 
 	return cpu_va_bits;
 }
@@ -104,29 +108,37 @@
 								    u64 gpu_pfn)
 {
 	struct rb_root *rbtree = NULL;
-	struct kbase_reg_zone *exec_va_zone =
-		kbase_ctx_reg_zone_get(kctx, KBASE_REG_ZONE_EXEC_VA);
 
-	/* The gpu_pfn can only be greater than the starting pfn of the EXEC_VA
-	 * zone if this has been initialized.
-	 */
+	struct kbase_reg_zone *exec_va_zone = kbase_ctx_reg_zone_get(kctx, KBASE_REG_ZONE_EXEC_VA);
+
+#if MALI_USE_CSF
+	struct kbase_reg_zone *fixed_va_zone =
+		kbase_ctx_reg_zone_get(kctx, KBASE_REG_ZONE_FIXED_VA);
+
+	struct kbase_reg_zone *exec_fixed_va_zone =
+		kbase_ctx_reg_zone_get(kctx, KBASE_REG_ZONE_EXEC_FIXED_VA);
+
+	if (gpu_pfn >= fixed_va_zone->base_pfn) {
+		rbtree = &kctx->reg_rbtree_fixed;
+		return rbtree;
+	} else if (gpu_pfn >= exec_fixed_va_zone->base_pfn) {
+		rbtree = &kctx->reg_rbtree_exec_fixed;
+		return rbtree;
+	}
+#endif
 	if (gpu_pfn >= exec_va_zone->base_pfn)
 		rbtree = &kctx->reg_rbtree_exec;
 	else {
 		u64 same_va_end;
 
-#if IS_ENABLED(CONFIG_64BIT)
-		if (kbase_ctx_flag(kctx, KCTX_COMPAT)) {
-#endif /* CONFIG_64BIT */
+		if (kbase_ctx_compat_mode(kctx)) {
 			same_va_end = KBASE_REG_ZONE_CUSTOM_VA_BASE;
-#if IS_ENABLED(CONFIG_64BIT)
 		} else {
 			struct kbase_reg_zone *same_va_zone =
 				kbase_ctx_reg_zone_get(kctx,
 						       KBASE_REG_ZONE_SAME_VA);
 			same_va_end = kbase_reg_zone_end_pfn(same_va_zone);
 		}
-#endif /* CONFIG_64BIT */
 
 		if (gpu_pfn >= same_va_end)
 			rbtree = &kctx->reg_rbtree_custom;
@@ -350,7 +362,9 @@
 }
 
 /**
- * Remove a region object from the global list.
+ * kbase_remove_va_region - Remove a region object from the global list.
+ *
+ * @kbdev: The kbase device
  * @reg: Region object to remove
  *
  * The region reg is removed, possibly by merging with other free and
@@ -358,19 +372,23 @@
  * region lock held. The associated memory is not released (see
  * kbase_free_alloced_region). Internal use only.
  */
-int kbase_remove_va_region(struct kbase_va_region *reg)
+void kbase_remove_va_region(struct kbase_device *kbdev,
+			    struct kbase_va_region *reg)
 {
 	struct rb_node *rbprev;
 	struct kbase_va_region *prev = NULL;
 	struct rb_node *rbnext;
 	struct kbase_va_region *next = NULL;
 	struct rb_root *reg_rbtree = NULL;
+	struct kbase_va_region *orig_reg = reg;
 
 	int merged_front = 0;
 	int merged_back = 0;
-	int err = 0;
 
 	reg_rbtree = reg->rbtree;
+
+	if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree)))
+		return;
 
 	/* Try to merge with the previous block first */
 	rbprev = rb_prev(&(reg->rblink));
@@ -378,10 +396,14 @@
 		prev = rb_entry(rbprev, struct kbase_va_region, rblink);
 		if (prev->flags & KBASE_REG_FREE) {
 			/* We're compatible with the previous VMA, merge with
-			 * it
+			 * it, handling any gaps for robustness.
 			 */
+			u64 prev_end_pfn = prev->start_pfn + prev->nr_pages;
+
 			WARN_ON((prev->flags & KBASE_REG_ZONE_MASK) !=
 					    (reg->flags & KBASE_REG_ZONE_MASK));
+			if (!WARN_ON(reg->start_pfn < prev_end_pfn))
+				prev->nr_pages += reg->start_pfn - prev_end_pfn;
 			prev->nr_pages += reg->nr_pages;
 			rb_erase(&(reg->rblink), reg_rbtree);
 			reg = prev;
@@ -393,42 +415,76 @@
 	/* Note we do the lookup here as the tree may have been rebalanced. */
 	rbnext = rb_next(&(reg->rblink));
 	if (rbnext) {
-		/* We're compatible with the next VMA, merge with it */
 		next = rb_entry(rbnext, struct kbase_va_region, rblink);
 		if (next->flags & KBASE_REG_FREE) {
+			/* We're compatible with the next VMA, merge with it,
+			 * handling any gaps for robustness.
+			 */
+			u64 reg_end_pfn = reg->start_pfn + reg->nr_pages;
+
 			WARN_ON((next->flags & KBASE_REG_ZONE_MASK) !=
 					    (reg->flags & KBASE_REG_ZONE_MASK));
+			if (!WARN_ON(next->start_pfn < reg_end_pfn))
+				next->nr_pages += next->start_pfn - reg_end_pfn;
 			next->start_pfn = reg->start_pfn;
 			next->nr_pages += reg->nr_pages;
 			rb_erase(&(reg->rblink), reg_rbtree);
 			merged_back = 1;
-			if (merged_front) {
-				/* We already merged with prev, free it */
-				kfree(reg);
-			}
 		}
 	}
 
-	/* If we failed to merge then we need to add a new block */
-	if (!(merged_front || merged_back)) {
+	if (merged_front && merged_back) {
+		/* We already merged with prev, free it */
+		kfree(reg);
+	} else if (!(merged_front || merged_back)) {
+		/* If we failed to merge then we need to add a new block */
+
 		/*
-		 * We didn't merge anything. Add a new free
-		 * placeholder and remove the original one.
+		 * We didn't merge anything. Try to add a new free
+		 * placeholder, and in any case, remove the original one.
 		 */
 		struct kbase_va_region *free_reg;
 
-		free_reg = kbase_alloc_free_region(reg_rbtree,
-				reg->start_pfn, reg->nr_pages,
-				reg->flags & KBASE_REG_ZONE_MASK);
+		free_reg = kbase_alloc_free_region(kbdev, reg_rbtree, reg->start_pfn, reg->nr_pages,
+						   reg->flags & KBASE_REG_ZONE_MASK);
 		if (!free_reg) {
-			err = -ENOMEM;
+			/* In case of failure, we cannot allocate a replacement
+			 * free region, so we will be left with a 'gap' in the
+			 * region tracker's address range (though, the rbtree
+			 * will itself still be correct after erasing
+			 * 'reg').
+			 *
+			 * The gap will be rectified when an adjacent region is
+			 * removed by one of the above merging paths. Other
+			 * paths will gracefully fail to allocate if they try
+			 * to allocate in the gap.
+			 *
+			 * There is nothing that the caller can do, since free
+			 * paths must not fail. The existing 'reg' cannot be
+			 * repurposed as the free region as callers must have
+			 * freedom of use with it by virtue of it being owned
+			 * by them, not the region tracker insert/remove code.
+			 */
+			dev_warn(
+				kbdev->dev,
+				"Could not alloc a replacement free region for 0x%.16llx..0x%.16llx",
+				(unsigned long long)reg->start_pfn << PAGE_SHIFT,
+				(unsigned long long)(reg->start_pfn + reg->nr_pages) << PAGE_SHIFT);
+			rb_erase(&(reg->rblink), reg_rbtree);
+
 			goto out;
 		}
 		rb_replace_node(&(reg->rblink), &(free_reg->rblink), reg_rbtree);
 	}
 
- out:
-	return err;
+	/* This operation is always safe because the function never frees
+	 * the region. If the region has been merged to both front and back,
+	 * then it's the previous region that is supposed to be freed.
+	 */
+	orig_reg->start_pfn = 0;
+
+out:
+	return;
 }
 
 KBASE_EXPORT_TEST_API(kbase_remove_va_region);
@@ -437,13 +493,18 @@
  * kbase_insert_va_region_nolock - Insert a VA region to the list,
  * replacing the existing one.
  *
+ * @kbdev: The kbase device
  * @new_reg: The new region to insert
  * @at_reg: The region to replace
  * @start_pfn: The Page Frame Number to insert at
  * @nr_pages: The number of pages of the region
+ *
+ * Return: 0 on success, error code otherwise.
  */
-static int kbase_insert_va_region_nolock(struct kbase_va_region *new_reg,
-		struct kbase_va_region *at_reg, u64 start_pfn, size_t nr_pages)
+static int kbase_insert_va_region_nolock(struct kbase_device *kbdev,
+					 struct kbase_va_region *new_reg,
+					 struct kbase_va_region *at_reg, u64 start_pfn,
+					 size_t nr_pages)
 {
 	struct rb_root *reg_rbtree = NULL;
 	int err = 0;
@@ -456,6 +517,9 @@
 	KBASE_DEBUG_ASSERT((start_pfn >= at_reg->start_pfn) && (start_pfn < at_reg->start_pfn + at_reg->nr_pages));
 	/* at least nr_pages from start_pfn should be contained within at_reg */
 	KBASE_DEBUG_ASSERT(start_pfn + nr_pages <= at_reg->start_pfn + at_reg->nr_pages);
+	/* having at_reg means the rb_tree should not be empty */
+	if (WARN_ON(RB_EMPTY_ROOT(reg_rbtree)))
+		return -ENOMEM;
 
 	new_reg->start_pfn = start_pfn;
 	new_reg->nr_pages = nr_pages;
@@ -484,10 +548,9 @@
 	else {
 		struct kbase_va_region *new_front_reg;
 
-		new_front_reg = kbase_alloc_free_region(reg_rbtree,
-				at_reg->start_pfn,
-				start_pfn - at_reg->start_pfn,
-				at_reg->flags & KBASE_REG_ZONE_MASK);
+		new_front_reg = kbase_alloc_free_region(kbdev, reg_rbtree, at_reg->start_pfn,
+							start_pfn - at_reg->start_pfn,
+							at_reg->flags & KBASE_REG_ZONE_MASK);
 
 		if (new_front_reg) {
 			at_reg->nr_pages -= nr_pages + new_front_reg->nr_pages;
@@ -511,6 +574,8 @@
  * @addr: the address to insert the region at
  * @nr_pages: the number of pages in the region
  * @align: the minimum alignment in pages
+ *
+ * Return: 0 on success, error code otherwise.
  */
 int kbase_add_va_region(struct kbase_context *kctx,
 		struct kbase_va_region *reg, u64 addr,
@@ -527,12 +592,19 @@
 
 	lockdep_assert_held(&kctx->reg_lock);
 
-	/* The executable allocation from the SAME_VA zone would already have an
+	/* The executable allocation from the SAME_VA zone should already have an
 	 * appropriately aligned GPU VA chosen for it.
-	 * Also the executable allocation from EXEC_VA zone doesn't need the
-	 * special alignment.
+	 * Also, executable allocations from EXEC_VA don't need the special
+	 * alignment.
 	 */
+#if MALI_USE_CSF
+	/* The same is also true for the EXEC_FIXED_VA zone.
+	 */
+#endif
 	if (!(reg->flags & KBASE_REG_GPU_NX) && !addr &&
+#if MALI_USE_CSF
+		((reg->flags & KBASE_REG_ZONE_MASK) != KBASE_REG_ZONE_EXEC_FIXED_VA) &&
+#endif
 	    ((reg->flags & KBASE_REG_ZONE_MASK) != KBASE_REG_ZONE_EXEC_VA)) {
 		if (cpu_va_bits > gpu_pc_bits) {
 			align = max(align, (size_t)((1ULL << gpu_pc_bits)
@@ -564,15 +636,17 @@
 /**
  * kbase_add_va_region_rbtree - Insert a region into its corresponding rbtree
  *
- * Insert a region into the rbtree that was specified when the region was
- * created. If addr is 0 a free area in the rbtree is used, otherwise the
- * specified address is used.
- *
  * @kbdev: The kbase device
  * @reg: The region to add
  * @addr: The address to add the region at, or 0 to map at any available address
  * @nr_pages: The size of the region in pages
  * @align: The minimum alignment in pages
+ *
+ * Insert a region into the rbtree that was specified when the region was
+ * created. If addr is 0 a free area in the rbtree is used, otherwise the
+ * specified address is used.
+ *
+ * Return: 0 on success, error code otherwise.
  */
 int kbase_add_va_region_rbtree(struct kbase_device *kbdev,
 		struct kbase_va_region *reg,
@@ -613,8 +687,7 @@
 			goto exit;
 		}
 
-		err = kbase_insert_va_region_nolock(reg, tmp, gpu_pfn,
-				nr_pages);
+		err = kbase_insert_va_region_nolock(kbdev, reg, tmp, gpu_pfn, nr_pages);
 		if (err) {
 			dev_warn(dev, "Failed to insert va region");
 			err = -ENOMEM;
@@ -639,8 +712,7 @@
 				nr_pages, align_offset, align_mask,
 				&start_pfn);
 		if (tmp) {
-			err = kbase_insert_va_region_nolock(reg, tmp,
-							start_pfn, nr_pages);
+			err = kbase_insert_va_region_nolock(kbdev, reg, tmp, start_pfn, nr_pages);
 			if (unlikely(err)) {
 				dev_warn(dev, "Failed to insert region: 0x%08llx start_pfn, %zu nr_pages",
 					start_pfn, nr_pages);
@@ -659,6 +731,59 @@
 /*
  * @brief Initialize the internal region tracker data structure.
  */
+#if MALI_USE_CSF
+static void kbase_region_tracker_ds_init(struct kbase_context *kctx,
+					 struct kbase_va_region *same_va_reg,
+					 struct kbase_va_region *custom_va_reg,
+					 struct kbase_va_region *exec_va_reg,
+					 struct kbase_va_region *exec_fixed_va_reg,
+					 struct kbase_va_region *fixed_va_reg)
+{
+	u64 last_zone_end_pfn;
+
+	kctx->reg_rbtree_same = RB_ROOT;
+	kbase_region_tracker_insert(same_va_reg);
+
+	last_zone_end_pfn = same_va_reg->start_pfn + same_va_reg->nr_pages;
+
+	/* Although custom_va_reg doesn't always exist, initialize
+	 * unconditionally because of the mem_view debugfs
+	 * implementation which relies on it being empty.
+	 */
+	kctx->reg_rbtree_custom = RB_ROOT;
+	kctx->reg_rbtree_exec = RB_ROOT;
+
+	if (custom_va_reg) {
+		WARN_ON(custom_va_reg->start_pfn < last_zone_end_pfn);
+		kbase_region_tracker_insert(custom_va_reg);
+		last_zone_end_pfn = custom_va_reg->start_pfn + custom_va_reg->nr_pages;
+	}
+
+	/* Initialize exec, fixed and exec_fixed. These are always
+	 * initialized at this stage, if they will exist at all.
+	 */
+	kctx->reg_rbtree_fixed = RB_ROOT;
+	kctx->reg_rbtree_exec_fixed = RB_ROOT;
+
+	if (exec_va_reg) {
+		WARN_ON(exec_va_reg->start_pfn < last_zone_end_pfn);
+		kbase_region_tracker_insert(exec_va_reg);
+		last_zone_end_pfn = exec_va_reg->start_pfn + exec_va_reg->nr_pages;
+	}
+
+	if (exec_fixed_va_reg) {
+		WARN_ON(exec_fixed_va_reg->start_pfn < last_zone_end_pfn);
+		kbase_region_tracker_insert(exec_fixed_va_reg);
+		last_zone_end_pfn = exec_fixed_va_reg->start_pfn + exec_fixed_va_reg->nr_pages;
+	}
+
+	if (fixed_va_reg) {
+		WARN_ON(fixed_va_reg->start_pfn < last_zone_end_pfn);
+		kbase_region_tracker_insert(fixed_va_reg);
+		last_zone_end_pfn = fixed_va_reg->start_pfn + fixed_va_reg->nr_pages;
+	}
+}
+#else
 static void kbase_region_tracker_ds_init(struct kbase_context *kctx,
 		struct kbase_va_region *same_va_reg,
 		struct kbase_va_region *custom_va_reg)
@@ -679,6 +804,41 @@
 	if (custom_va_reg)
 		kbase_region_tracker_insert(custom_va_reg);
 }
+#endif /* MALI_USE_CSF */
+
+static struct kbase_context *kbase_reg_flags_to_kctx(struct kbase_va_region *reg)
+{
+	struct kbase_context *kctx = NULL;
+	struct rb_root *rbtree = reg->rbtree;
+
+	switch (reg->flags & KBASE_REG_ZONE_MASK) {
+	case KBASE_REG_ZONE_CUSTOM_VA:
+		kctx = container_of(rbtree, struct kbase_context, reg_rbtree_custom);
+		break;
+	case KBASE_REG_ZONE_SAME_VA:
+		kctx = container_of(rbtree, struct kbase_context, reg_rbtree_same);
+		break;
+	case KBASE_REG_ZONE_EXEC_VA:
+		kctx = container_of(rbtree, struct kbase_context, reg_rbtree_exec);
+		break;
+#if MALI_USE_CSF
+	case KBASE_REG_ZONE_EXEC_FIXED_VA:
+		kctx = container_of(rbtree, struct kbase_context, reg_rbtree_exec_fixed);
+		break;
+	case KBASE_REG_ZONE_FIXED_VA:
+		kctx = container_of(rbtree, struct kbase_context, reg_rbtree_fixed);
+		break;
+	case KBASE_REG_ZONE_MCU_SHARED:
+		/* This is only expected to be called on driver unload. */
+		break;
+#endif
+	default:
+		WARN(1, "Unknown zone in region: flags=0x%lx\n", reg->flags);
+		break;
+	}
+
+	return kctx;
+}
 
 static void kbase_region_tracker_erase_rbtree(struct rb_root *rbtree)
 {
@@ -690,7 +850,9 @@
 		if (rbnode) {
 			rb_erase(rbnode, rbtree);
 			reg = rb_entry(rbnode, struct kbase_va_region, rblink);
-			WARN_ON(reg->va_refcnt != 1);
+			WARN_ON(kbase_refcount_read(&reg->va_refcnt) != 1);
+			if (kbase_page_migration_enabled)
+				kbase_gpu_munmap(kbase_reg_flags_to_kctx(reg), reg);
 			/* Reset the start_pfn - as the rbtree is being
 			 * destroyed and we've already erased this region, there
 			 * is no further need to attempt to remove it.
@@ -707,12 +869,19 @@
 
 void kbase_region_tracker_term(struct kbase_context *kctx)
 {
+	WARN(kctx->as_nr != KBASEP_AS_NR_INVALID,
+	     "kctx-%d_%d must first be scheduled out to flush GPU caches+tlbs before erasing remaining regions",
+	     kctx->tgid, kctx->id);
+
 	kbase_gpu_vm_lock(kctx);
 	kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_same);
 	kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_custom);
 	kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_exec);
 #if MALI_USE_CSF
 	WARN_ON(!list_empty(&kctx->csf.event_pages_head));
+	kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_exec_fixed);
+	kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_fixed);
+
 #endif
 	kbase_gpu_vm_unlock(kctx);
 }
@@ -724,8 +893,8 @@
 
 static size_t kbase_get_same_va_bits(struct kbase_context *kctx)
 {
-	return min(kbase_get_num_cpu_va_bits(kctx),
-			(size_t) kctx->kbdev->gpu_props.mmu.va_bits);
+	return min_t(size_t, kbase_get_num_cpu_va_bits(kctx),
+			kctx->kbdev->gpu_props.mmu.va_bits);
 }
 
 int kbase_region_tracker_init(struct kbase_context *kctx)
@@ -734,19 +903,41 @@
 	struct kbase_va_region *custom_va_reg = NULL;
 	size_t same_va_bits = kbase_get_same_va_bits(kctx);
 	u64 custom_va_size = KBASE_REG_ZONE_CUSTOM_VA_SIZE;
-	u64 gpu_va_limit = (1ULL << kctx->kbdev->gpu_props.mmu.va_bits) >> PAGE_SHIFT;
+	u64 gpu_va_bits = kctx->kbdev->gpu_props.mmu.va_bits;
+	u64 gpu_va_limit = (1ULL << gpu_va_bits) >> PAGE_SHIFT;
 	u64 same_va_pages;
 	u64 same_va_base = 1u;
 	int err;
+#if MALI_USE_CSF
+	struct kbase_va_region *exec_va_reg;
+	struct kbase_va_region *exec_fixed_va_reg;
+	struct kbase_va_region *fixed_va_reg;
+
+	u64 exec_va_base;
+	u64 fixed_va_end;
+	u64 exec_fixed_va_base;
+	u64 fixed_va_base;
+	u64 fixed_va_pages;
+#endif
 
 	/* Take the lock as kbase_free_alloced_region requires it */
 	kbase_gpu_vm_lock(kctx);
 
 	same_va_pages = (1ULL << (same_va_bits - PAGE_SHIFT)) - same_va_base;
+
+#if MALI_USE_CSF
+	if ((same_va_base + same_va_pages) > KBASE_REG_ZONE_EXEC_VA_BASE_64) {
+		/* Depending on how the kernel is configured, it's possible (eg on aarch64) for
+		 * same_va_bits to reach 48 bits. Cap same_va_pages so that the same_va zone
+		 * doesn't cross into the exec_va zone.
+		 */
+		same_va_pages = KBASE_REG_ZONE_EXEC_VA_BASE_64 - same_va_base;
+	}
+#endif
+
 	/* all have SAME_VA */
-	same_va_reg =
-		kbase_alloc_free_region(&kctx->reg_rbtree_same, same_va_base,
-					same_va_pages, KBASE_REG_ZONE_SAME_VA);
+	same_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_same, same_va_base,
+					      same_va_pages, KBASE_REG_ZONE_SAME_VA);
 
 	if (!same_va_reg) {
 		err = -ENOMEM;
@@ -755,10 +946,7 @@
 	kbase_ctx_reg_zone_init(kctx, KBASE_REG_ZONE_SAME_VA, same_va_base,
 				same_va_pages);
 
-#if IS_ENABLED(CONFIG_64BIT)
-	/* 32-bit clients have custom VA zones */
-	if (kbase_ctx_flag(kctx, KCTX_COMPAT)) {
-#endif
+	if (kbase_ctx_compat_mode(kctx)) {
 		if (gpu_va_limit <= KBASE_REG_ZONE_CUSTOM_VA_BASE) {
 			err = -EINVAL;
 			goto fail_free_same_va;
@@ -770,10 +958,9 @@
 		if ((KBASE_REG_ZONE_CUSTOM_VA_BASE + KBASE_REG_ZONE_CUSTOM_VA_SIZE) >= gpu_va_limit)
 			custom_va_size = gpu_va_limit - KBASE_REG_ZONE_CUSTOM_VA_BASE;
 
-		custom_va_reg = kbase_alloc_free_region(
-				&kctx->reg_rbtree_custom,
-				KBASE_REG_ZONE_CUSTOM_VA_BASE,
-				custom_va_size, KBASE_REG_ZONE_CUSTOM_VA);
+		custom_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_custom,
+							KBASE_REG_ZONE_CUSTOM_VA_BASE,
+							custom_va_size, KBASE_REG_ZONE_CUSTOM_VA);
 
 		if (!custom_va_reg) {
 			err = -ENOMEM;
@@ -782,11 +969,70 @@
 		kbase_ctx_reg_zone_init(kctx, KBASE_REG_ZONE_CUSTOM_VA,
 					KBASE_REG_ZONE_CUSTOM_VA_BASE,
 					custom_va_size);
-#if IS_ENABLED(CONFIG_64BIT)
 	} else {
 		custom_va_size = 0;
 	}
-#endif
+
+#if MALI_USE_CSF
+	/* The position of EXEC_VA depends on whether the client is 32-bit or 64-bit. */
+	exec_va_base = KBASE_REG_ZONE_EXEC_VA_BASE_64;
+
+	/* Similarly the end of the FIXED_VA zone also depends on whether the client
+	 * is 32 or 64-bits.
+	 */
+	fixed_va_end = KBASE_REG_ZONE_FIXED_VA_END_64;
+
+	if (kbase_ctx_compat_mode(kctx)) {
+		exec_va_base = KBASE_REG_ZONE_EXEC_VA_BASE_32;
+		fixed_va_end = KBASE_REG_ZONE_FIXED_VA_END_32;
+	}
+
+	kbase_ctx_reg_zone_init(kctx, KBASE_REG_ZONE_EXEC_VA, exec_va_base,
+				KBASE_REG_ZONE_EXEC_VA_SIZE);
+
+	exec_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_exec, exec_va_base,
+					      KBASE_REG_ZONE_EXEC_VA_SIZE, KBASE_REG_ZONE_EXEC_VA);
+
+	if (!exec_va_reg) {
+		err = -ENOMEM;
+		goto fail_free_custom_va;
+	}
+
+	exec_fixed_va_base = exec_va_base + KBASE_REG_ZONE_EXEC_VA_SIZE;
+
+	kbase_ctx_reg_zone_init(kctx, KBASE_REG_ZONE_EXEC_FIXED_VA, exec_fixed_va_base,
+				KBASE_REG_ZONE_EXEC_FIXED_VA_SIZE);
+
+	exec_fixed_va_reg =
+		kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_exec_fixed,
+					exec_fixed_va_base, KBASE_REG_ZONE_EXEC_FIXED_VA_SIZE,
+					KBASE_REG_ZONE_EXEC_FIXED_VA);
+
+	if (!exec_fixed_va_reg) {
+		err = -ENOMEM;
+		goto fail_free_exec_va;
+	}
+
+	fixed_va_base = exec_fixed_va_base + KBASE_REG_ZONE_EXEC_FIXED_VA_SIZE;
+	fixed_va_pages = fixed_va_end - fixed_va_base;
+
+	kbase_ctx_reg_zone_init(kctx, KBASE_REG_ZONE_FIXED_VA, fixed_va_base, fixed_va_pages);
+
+	fixed_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_fixed, fixed_va_base,
+					       fixed_va_pages, KBASE_REG_ZONE_FIXED_VA);
+
+	kctx->gpu_va_end = fixed_va_end;
+
+	if (!fixed_va_reg) {
+		err = -ENOMEM;
+		goto fail_free_exec_fixed_va;
+	}
+
+	kbase_region_tracker_ds_init(kctx, same_va_reg, custom_va_reg, exec_va_reg,
+				     exec_fixed_va_reg, fixed_va_reg);
+
+	INIT_LIST_HEAD(&kctx->csf.event_pages_head);
+#else
 	/* EXEC_VA zone's codepaths are slightly easier when its base_pfn is
 	 * initially U64_MAX
 	 */
@@ -794,16 +1040,22 @@
 	/* Other zones are 0: kbase_create_context() uses vzalloc */
 
 	kbase_region_tracker_ds_init(kctx, same_va_reg, custom_va_reg);
-
 	kctx->gpu_va_end = same_va_base + same_va_pages + custom_va_size;
-	kctx->jit_va = false;
-
-#if MALI_USE_CSF
-	INIT_LIST_HEAD(&kctx->csf.event_pages_head);
 #endif
+	kctx->jit_va = false;
 
 	kbase_gpu_vm_unlock(kctx);
 	return 0;
+
+#if MALI_USE_CSF
+fail_free_exec_fixed_va:
+	kbase_free_alloced_region(exec_fixed_va_reg);
+fail_free_exec_va:
+	kbase_free_alloced_region(exec_va_reg);
+fail_free_custom_va:
+	if (custom_va_reg)
+		kbase_free_alloced_region(custom_va_reg);
+#endif
 
 fail_free_same_va:
 	kbase_free_alloced_region(same_va_reg);
@@ -834,7 +1086,9 @@
 }
 
 /**
- * Determine if any allocations have been made on a context's region tracker
+ * kbase_region_tracker_has_allocs - Determine if any allocations have been made
+ * on a context's region tracker
+ *
  * @kctx: KBase context
  *
  * Check the context to determine if any allocations have been made yet from
@@ -862,6 +1116,8 @@
 		unsigned long zone_bits = KBASE_REG_ZONE(zone_idx);
 		unsigned long reg_zone;
 
+		if (!kbase_is_ctx_reg_zone(zone_bits))
+			continue;
 		zone = kbase_ctx_reg_zone_get(kctx, zone_bits);
 		zone_base_addr = zone->base_pfn << PAGE_SHIFT;
 
@@ -901,7 +1157,6 @@
 	return false;
 }
 
-#if IS_ENABLED(CONFIG_64BIT)
 static int kbase_region_tracker_init_jit_64(struct kbase_context *kctx,
 		u64 jit_va_pages)
 {
@@ -950,9 +1205,8 @@
 	 * Create a custom VA zone at the end of the VA for allocations which
 	 * JIT can use so it doesn't have to allocate VA from the kernel.
 	 */
-	custom_va_reg =
-		kbase_alloc_free_region(&kctx->reg_rbtree_custom, jit_va_start,
-					jit_va_pages, KBASE_REG_ZONE_CUSTOM_VA);
+	custom_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_custom, jit_va_start,
+						jit_va_pages, KBASE_REG_ZONE_CUSTOM_VA);
 
 	/*
 	 * The context will be destroyed if we fail here so no point
@@ -969,7 +1223,6 @@
 	kbase_region_tracker_insert(custom_va_reg);
 	return 0;
 }
-#endif
 
 int kbase_region_tracker_init_jit(struct kbase_context *kctx, u64 jit_va_pages,
 		int max_allocations, int trim_level, int group_id,
@@ -1010,10 +1263,8 @@
 		goto exit_unlock;
 	}
 
-#if IS_ENABLED(CONFIG_64BIT)
-	if (!kbase_ctx_flag(kctx, KCTX_COMPAT))
+	if (!kbase_ctx_compat_mode(kctx))
 		err = kbase_region_tracker_init_jit_64(kctx, jit_va_pages);
-#endif
 	/*
 	 * Nothing to do for 32-bit clients, JIT uses the existing
 	 * custom VA zone.
@@ -1039,6 +1290,7 @@
 
 int kbase_region_tracker_init_exec(struct kbase_context *kctx, u64 exec_va_pages)
 {
+#if !MALI_USE_CSF
 	struct kbase_va_region *exec_va_reg;
 	struct kbase_reg_zone *exec_va_zone;
 	struct kbase_reg_zone *target_zone;
@@ -1047,6 +1299,7 @@
 	unsigned long target_zone_bits;
 	u64 exec_va_start;
 	int err;
+#endif
 
 	/* The EXEC_VA zone shall be created by making space either:
 	 * - for 64-bit clients, at the end of the process's address space
@@ -1060,6 +1313,12 @@
 	if (exec_va_pages == 0 || exec_va_pages > KBASE_REG_ZONE_EXEC_VA_MAX_PAGES)
 		return -EINVAL;
 
+#if MALI_USE_CSF
+	/* For CSF GPUs we now setup the EXEC_VA zone during initialization,
+	 * so this request is a null-op.
+	 */
+	return 0;
+#else
 	kbase_gpu_vm_lock(kctx);
 
 	/* Verify that we've not already created a EXEC_VA zone, and that the
@@ -1081,17 +1340,14 @@
 		goto exit_unlock;
 	}
 
-#if IS_ENABLED(CONFIG_64BIT)
-	if (kbase_ctx_flag(kctx, KCTX_COMPAT)) {
-#endif
+	if (kbase_ctx_compat_mode(kctx)) {
 		/* 32-bit client: take from CUSTOM_VA zone */
 		target_zone_bits = KBASE_REG_ZONE_CUSTOM_VA;
-#if IS_ENABLED(CONFIG_64BIT)
 	} else {
 		/* 64-bit client: take from SAME_VA zone */
 		target_zone_bits = KBASE_REG_ZONE_SAME_VA;
 	}
-#endif
+
 	target_zone = kbase_ctx_reg_zone_get(kctx, target_zone_bits);
 	target_zone_base_addr = target_zone->base_pfn << PAGE_SHIFT;
 
@@ -1119,10 +1375,8 @@
 	/* Taken from the end of the target zone */
 	exec_va_start = kbase_reg_zone_end_pfn(target_zone) - exec_va_pages;
 
-	exec_va_reg = kbase_alloc_free_region(&kctx->reg_rbtree_exec,
-			exec_va_start,
-			exec_va_pages,
-			KBASE_REG_ZONE_EXEC_VA);
+	exec_va_reg = kbase_alloc_free_region(kctx->kbdev, &kctx->reg_rbtree_exec, exec_va_start,
+					      exec_va_pages, KBASE_REG_ZONE_EXEC_VA);
 	if (!exec_va_reg) {
 		err = -ENOMEM;
 		goto exit_unlock;
@@ -1145,6 +1399,7 @@
 exit_unlock:
 	kbase_gpu_vm_unlock(kctx);
 	return err;
+#endif /* MALI_USE_CSF */
 }
 
 #if MALI_USE_CSF
@@ -1164,10 +1419,9 @@
 
 	kbdev->csf.shared_reg_rbtree = RB_ROOT;
 
-	shared_reg = kbase_alloc_free_region(&kbdev->csf.shared_reg_rbtree,
-					shared_reg_start_pfn,
-					shared_reg_size,
-					KBASE_REG_ZONE_MCU_SHARED);
+	shared_reg =
+		kbase_alloc_free_region(kbdev, &kbdev->csf.shared_reg_rbtree, shared_reg_start_pfn,
+					shared_reg_size, KBASE_REG_ZONE_MCU_SHARED);
 	if (!shared_reg)
 		return -ENOMEM;
 
@@ -1176,10 +1430,30 @@
 }
 #endif
 
+static void kbasep_mem_page_size_init(struct kbase_device *kbdev)
+{
+#if IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC_OVERRIDE)
+#if IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC)
+	kbdev->pagesize_2mb = true;
+	if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_LARGE_PAGE_ALLOC) != 1) {
+		dev_warn(
+			kbdev->dev,
+			"2MB page is enabled by force while current GPU-HW doesn't meet the requirement to do so.\n");
+	}
+#else /* IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC) */
+	kbdev->pagesize_2mb = false;
+#endif /* IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC) */
+#else /* IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC_OVERRIDE) */
+	/* Set it to the default based on which GPU is present */
+	kbdev->pagesize_2mb = kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_LARGE_PAGE_ALLOC);
+#endif /* IS_ENABLED(CONFIG_LARGE_PAGE_ALLOC_OVERRIDE) */
+}
+
 int kbase_mem_init(struct kbase_device *kbdev)
 {
 	int err = 0;
 	struct kbasep_mem_device *memdev;
+	char va_region_slab_name[VA_REGION_SLAB_NAME_SIZE];
 #if IS_ENABLED(CONFIG_OF)
 	struct device_node *mgm_node = NULL;
 #endif
@@ -1188,6 +1462,20 @@
 
 	memdev = &kbdev->memdev;
 
+	kbasep_mem_page_size_init(kbdev);
+
+	scnprintf(va_region_slab_name, VA_REGION_SLAB_NAME_SIZE, VA_REGION_SLAB_NAME_PREFIX "%s",
+		  kbdev->devname);
+
+	/* Initialize slab cache for kbase_va_regions */
+	kbdev->va_region_slab =
+		kmem_cache_create(va_region_slab_name, sizeof(struct kbase_va_region), 0, 0, NULL);
+	if (kbdev->va_region_slab == NULL) {
+		dev_err(kbdev->dev, "Failed to create va_region_slab\n");
+		return -ENOMEM;
+	}
+
+	kbase_mem_migrate_init(kbdev);
 	kbase_mem_pool_group_config_set_max_size(&kbdev->mem_pool_defaults,
 		KBASE_MEM_POOL_MAX_SIZE_KCTX);
 
@@ -1250,8 +1538,7 @@
 		kbase_mem_pool_group_config_set_max_size(&mem_pool_defaults,
 			KBASE_MEM_POOL_MAX_SIZE_KBDEV);
 
-		err = kbase_mem_pool_group_init(&kbdev->mem_pools, kbdev,
-			&mem_pool_defaults, NULL);
+		err = kbase_mem_pool_group_init(&kbdev->mem_pools, kbdev, &mem_pool_defaults, NULL);
 	}
 
 	return err;
@@ -1277,6 +1564,11 @@
 
 	kbase_mem_pool_group_term(&kbdev->mem_pools);
 
+	kbase_mem_migrate_term(kbdev);
+
+	kmem_cache_destroy(kbdev->va_region_slab);
+	kbdev->va_region_slab = NULL;
+
 	WARN_ON(kbdev->total_gpu_pages);
 	WARN_ON(!RB_EMPTY_ROOT(&kbdev->process_root));
 	WARN_ON(!RB_EMPTY_ROOT(&kbdev->dma_buf_root));
@@ -1288,7 +1580,9 @@
 KBASE_EXPORT_TEST_API(kbase_mem_term);
 
 /**
- * Allocate a free region object.
+ * kbase_alloc_free_region - Allocate a free region object.
+ *
+ * @kbdev:     kbase device
  * @rbtree:    Backlink to the red-black tree of memory regions.
  * @start_pfn: The Page Frame Number in GPU virtual address space.
  * @nr_pages:  The size of the region in pages.
@@ -1299,9 +1593,10 @@
  *
  * zone is KBASE_REG_ZONE_CUSTOM_VA or KBASE_REG_ZONE_SAME_VA.
  *
+ * Return: pointer to the allocated region object on success, NULL otherwise.
  */
-struct kbase_va_region *kbase_alloc_free_region(struct rb_root *rbtree,
-		u64 start_pfn, size_t nr_pages, int zone)
+struct kbase_va_region *kbase_alloc_free_region(struct kbase_device *kbdev, struct rb_root *rbtree,
+						u64 start_pfn, size_t nr_pages, int zone)
 {
 	struct kbase_va_region *new_reg;
 
@@ -1313,12 +1608,13 @@
 	/* 64-bit address range is the max */
 	KBASE_DEBUG_ASSERT(start_pfn + nr_pages <= (U64_MAX / PAGE_SIZE));
 
-	new_reg = kzalloc(sizeof(*new_reg), GFP_KERNEL);
+	new_reg = kmem_cache_zalloc(kbdev->va_region_slab, GFP_KERNEL);
 
 	if (!new_reg)
 		return NULL;
 
-	new_reg->va_refcnt = 1;
+	kbase_refcount_set(&new_reg->va_refcnt, 1);
+	atomic_set(&new_reg->no_user_free_count, 0);
 	new_reg->cpu_alloc = NULL; /* no alloc bound yet */
 	new_reg->gpu_alloc = NULL; /* no alloc bound yet */
 	new_reg->rbtree = rbtree;
@@ -1337,35 +1633,9 @@
 
 KBASE_EXPORT_TEST_API(kbase_alloc_free_region);
 
-static struct kbase_context *kbase_reg_flags_to_kctx(
-		struct kbase_va_region *reg)
-{
-	struct kbase_context *kctx = NULL;
-	struct rb_root *rbtree = reg->rbtree;
-
-	switch (reg->flags & KBASE_REG_ZONE_MASK) {
-	case KBASE_REG_ZONE_CUSTOM_VA:
-		kctx = container_of(rbtree, struct kbase_context,
-				reg_rbtree_custom);
-		break;
-	case KBASE_REG_ZONE_SAME_VA:
-		kctx = container_of(rbtree, struct kbase_context,
-				reg_rbtree_same);
-		break;
-	case KBASE_REG_ZONE_EXEC_VA:
-		kctx = container_of(rbtree, struct kbase_context,
-				reg_rbtree_exec);
-		break;
-	default:
-		WARN(1, "Unknown zone in region: flags=0x%lx\n", reg->flags);
-		break;
-	}
-
-	return kctx;
-}
-
 /**
- * Free a region object.
+ * kbase_free_alloced_region - Free a region object.
+ *
  * @reg: Region
  *
  * The described region must be freed of any mapping.
@@ -1374,6 +1644,7 @@
  * alloc object will be released.
  * It is a bug if no alloc object exists for non-free regions.
  *
+ * If region is KBASE_REG_ZONE_MCU_SHARED it is freed
  */
 void kbase_free_alloced_region(struct kbase_va_region *reg)
 {
@@ -1397,6 +1668,13 @@
 			(void *)reg);
 #if MALI_USE_CSF
 		if (reg->flags & KBASE_REG_CSF_EVENT)
+			/*
+			 * This should not be reachable if called from 'mcu_shared' functions
+			 * such as:
+			 * kbase_csf_firmware_mcu_shared_mapping_init
+			 * kbase_csf_firmware_mcu_shared_mapping_term
+			 */
+
 			kbase_unlink_event_mem_page(kctx, reg);
 #endif
 
@@ -1410,8 +1688,6 @@
 		 * on the list at termination time of the region tracker.
 		 */
 		if (!list_empty(&reg->gpu_alloc->evict_node)) {
-			mutex_unlock(&kctx->jit_evict_lock);
-
 			/*
 			 * Unlink the physical allocation before unmaking it
 			 * evictable so that the allocation isn't grown back to
@@ -1421,6 +1697,8 @@
 			reg->cpu_alloc->reg = NULL;
 			if (reg->cpu_alloc != reg->gpu_alloc)
 				reg->gpu_alloc->reg = NULL;
+
+			mutex_unlock(&kctx->jit_evict_lock);
 
 			/*
 			 * If a region has been made evictable then we must
@@ -1457,7 +1735,9 @@
 
 KBASE_EXPORT_TEST_API(kbase_free_alloced_region);
 
-int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg, u64 addr, size_t nr_pages, size_t align)
+int kbase_gpu_mmap(struct kbase_context *kctx, struct kbase_va_region *reg,
+		   u64 addr, size_t nr_pages, size_t align,
+		   enum kbase_caller_mmu_sync_info mmu_sync_info)
 {
 	int err;
 	size_t i = 0;
@@ -1494,41 +1774,46 @@
 		KBASE_DEBUG_ASSERT(alloc->imported.alias.aliased);
 		for (i = 0; i < alloc->imported.alias.nents; i++) {
 			if (alloc->imported.alias.aliased[i].alloc) {
-				err = kbase_mmu_insert_pages(kctx->kbdev,
-						&kctx->mmu,
-						reg->start_pfn + (i * stride),
-						alloc->imported.alias.aliased[i].alloc->pages + alloc->imported.alias.aliased[i].offset,
-						alloc->imported.alias.aliased[i].length,
-						reg->flags & gwt_mask,
-						kctx->as_nr,
-						group_id);
+				err = kbase_mmu_insert_aliased_pages(
+					kctx->kbdev, &kctx->mmu, reg->start_pfn + (i * stride),
+					alloc->imported.alias.aliased[i].alloc->pages +
+						alloc->imported.alias.aliased[i].offset,
+					alloc->imported.alias.aliased[i].length,
+					reg->flags & gwt_mask, kctx->as_nr, group_id, mmu_sync_info,
+					NULL);
 				if (err)
-					goto bad_insert;
+					goto bad_aliased_insert;
 
 				/* Note: mapping count is tracked at alias
 				 * creation time
 				 */
 			} else {
-				err = kbase_mmu_insert_single_page(kctx,
-					reg->start_pfn + i * stride,
-					kctx->aliasing_sink_page,
+				err = kbase_mmu_insert_single_aliased_page(
+					kctx, reg->start_pfn + i * stride, kctx->aliasing_sink_page,
 					alloc->imported.alias.aliased[i].length,
-					(reg->flags & mask & gwt_mask) | attr,
-					group_id);
+					(reg->flags & mask & gwt_mask) | attr, group_id,
+					mmu_sync_info);
 
 				if (err)
-					goto bad_insert;
+					goto bad_aliased_insert;
 			}
 		}
 	} else {
-		err = kbase_mmu_insert_pages(kctx->kbdev,
-				&kctx->mmu,
-				reg->start_pfn,
-				kbase_get_gpu_phy_pages(reg),
-				kbase_reg_current_backed_size(reg),
-				reg->flags & gwt_mask,
-				kctx->as_nr,
-				group_id);
+		if (reg->gpu_alloc->type == KBASE_MEM_TYPE_IMPORTED_UMM ||
+		    reg->gpu_alloc->type == KBASE_MEM_TYPE_IMPORTED_USER_BUF) {
+
+			err = kbase_mmu_insert_imported_pages(
+				kctx->kbdev, &kctx->mmu, reg->start_pfn,
+				kbase_get_gpu_phy_pages(reg), kbase_reg_current_backed_size(reg),
+				reg->flags & gwt_mask, kctx->as_nr, group_id, mmu_sync_info, reg);
+		} else {
+			err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
+						     kbase_get_gpu_phy_pages(reg),
+						     kbase_reg_current_backed_size(reg),
+						     reg->flags & gwt_mask, kctx->as_nr, group_id,
+						     mmu_sync_info, reg, true);
+		}
+
 		if (err)
 			goto bad_insert;
 		kbase_mem_phy_alloc_gpu_mapped(alloc);
@@ -1538,9 +1823,9 @@
 	    !WARN_ON(reg->nr_pages < reg->gpu_alloc->nents) &&
 	    reg->gpu_alloc->type == KBASE_MEM_TYPE_IMPORTED_UMM &&
 	    reg->gpu_alloc->imported.umm.current_mapping_usage_count) {
-		/* For padded imported dma-buf memory, map the dummy aliasing
-		 * page from the end of the dma-buf pages, to the end of the
-		 * region using a read only mapping.
+		/* For padded imported dma-buf or user-buf memory, map the dummy
+		 * aliasing page from the end of the imported pages, to the end of
+		 * the region using a read only mapping.
 		 *
 		 * Only map when it's imported dma-buf memory that is currently
 		 * mapped.
@@ -1548,37 +1833,46 @@
 		 * Assume reg->gpu_alloc->nents is the number of actual pages
 		 * in the dma-buf memory.
 		 */
-		err = kbase_mmu_insert_single_page(kctx,
-				reg->start_pfn + reg->gpu_alloc->nents,
-				kctx->aliasing_sink_page,
-				reg->nr_pages - reg->gpu_alloc->nents,
-				(reg->flags | KBASE_REG_GPU_RD) &
-				~KBASE_REG_GPU_WR,
-				KBASE_MEM_GROUP_SINK);
+		err = kbase_mmu_insert_single_imported_page(
+			kctx, reg->start_pfn + reg->gpu_alloc->nents, kctx->aliasing_sink_page,
+			reg->nr_pages - reg->gpu_alloc->nents,
+			(reg->flags | KBASE_REG_GPU_RD) & ~KBASE_REG_GPU_WR, KBASE_MEM_GROUP_SINK,
+			mmu_sync_info);
 		if (err)
 			goto bad_insert;
 	}
 
 	return err;
 
-bad_insert:
-	kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu,
-				 reg->start_pfn, reg->nr_pages,
-				 kctx->as_nr);
+bad_aliased_insert:
+	while (i-- > 0) {
+		struct tagged_addr *phys_alloc = NULL;
+		u64 const stride = alloc->imported.alias.stride;
 
-	kbase_remove_va_region(reg);
+		if (alloc->imported.alias.aliased[i].alloc != NULL)
+			phys_alloc = alloc->imported.alias.aliased[i].alloc->pages +
+				     alloc->imported.alias.aliased[i].offset;
+
+		kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn + (i * stride),
+					 phys_alloc, alloc->imported.alias.aliased[i].length,
+					 alloc->imported.alias.aliased[i].length, kctx->as_nr,
+					 false);
+	}
+bad_insert:
+	kbase_remove_va_region(kctx->kbdev, reg);
 
 	return err;
 }
 
 KBASE_EXPORT_TEST_API(kbase_gpu_mmap);
 
-static void kbase_jd_user_buf_unmap(struct kbase_context *kctx,
-		struct kbase_mem_phy_alloc *alloc, bool writeable);
+static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc,
+				    struct kbase_va_region *reg, bool writeable);
 
 int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg)
 {
 	int err = 0;
+	struct kbase_mem_phy_alloc *alloc;
 
 	if (reg->start_pfn == 0)
 		return 0;
@@ -1586,43 +1880,98 @@
 	if (!reg->gpu_alloc)
 		return -EINVAL;
 
-	/* Tear down down GPU page tables, depending on memory type. */
-	switch (reg->gpu_alloc->type) {
-	case KBASE_MEM_TYPE_ALIAS: /* Fall-through */
-	case KBASE_MEM_TYPE_IMPORTED_UMM:
-		err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu,
-				reg->start_pfn, reg->nr_pages, kctx->as_nr);
+	alloc = reg->gpu_alloc;
+
+	/* Tear down GPU page tables, depending on memory type. */
+	switch (alloc->type) {
+	case KBASE_MEM_TYPE_ALIAS: {
+			size_t i = 0;
+			/* Due to the way the number of valid PTEs and ATEs are tracked
+			 * currently, only the GPU virtual range that is backed & mapped
+			 * should be passed to the kbase_mmu_teardown_pages() function,
+			 * hence individual aliased regions needs to be unmapped
+			 * separately.
+			 */
+			for (i = 0; i < alloc->imported.alias.nents; i++) {
+				struct tagged_addr *phys_alloc = NULL;
+				int err_loop;
+
+				if (alloc->imported.alias.aliased[i].alloc != NULL)
+					phys_alloc = alloc->imported.alias.aliased[i].alloc->pages +
+						     alloc->imported.alias.aliased[i].offset;
+
+				err_loop = kbase_mmu_teardown_pages(
+					kctx->kbdev, &kctx->mmu,
+					reg->start_pfn + (i * alloc->imported.alias.stride),
+					phys_alloc, alloc->imported.alias.aliased[i].length,
+					alloc->imported.alias.aliased[i].length, kctx->as_nr,
+					false);
+
+				if (WARN_ON_ONCE(err_loop))
+					err = err_loop;
+			}
+		}
 		break;
-	default:
-		err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu,
-			reg->start_pfn, kbase_reg_current_backed_size(reg),
-			kctx->as_nr);
+	case KBASE_MEM_TYPE_IMPORTED_UMM: {
+			size_t nr_phys_pages = reg->nr_pages;
+			size_t nr_virt_pages = reg->nr_pages;
+			/* If the region has import padding and falls under the threshold for
+			 * issuing a partial GPU cache flush, we want to reduce the number of
+			 * physical pages that get flushed.
+
+			 * This is symmetric with case of mapping the memory, which first maps
+			 * each imported physical page to a separate virtual page, and then
+			 * maps the single aliasing sink page to each of the virtual padding
+			 * pages.
+			 */
+			if (reg->flags & KBASE_REG_IMPORT_PAD)
+				nr_phys_pages = alloc->nents + 1;
+
+			err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
+						       alloc->pages, nr_phys_pages, nr_virt_pages,
+						       kctx->as_nr, true);
+		}
+		break;
+	case KBASE_MEM_TYPE_IMPORTED_USER_BUF: {
+			size_t nr_reg_pages = kbase_reg_current_backed_size(reg);
+
+			err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
+						       alloc->pages, nr_reg_pages, nr_reg_pages,
+						       kctx->as_nr, true);
+		}
+		break;
+	default: {
+			size_t nr_reg_pages = kbase_reg_current_backed_size(reg);
+
+			err = kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
+						       alloc->pages, nr_reg_pages, nr_reg_pages,
+						       kctx->as_nr, false);
+		}
 		break;
 	}
 
 	/* Update tracking, and other cleanup, depending on memory type. */
-	switch (reg->gpu_alloc->type) {
+	switch (alloc->type) {
 	case KBASE_MEM_TYPE_ALIAS:
 		/* We mark the source allocs as unmapped from the GPU when
 		 * putting reg's allocs
 		 */
 		break;
 	case KBASE_MEM_TYPE_IMPORTED_USER_BUF: {
-			struct kbase_alloc_import_user_buf *user_buf =
-				&reg->gpu_alloc->imported.user_buf;
+		struct kbase_alloc_import_user_buf *user_buf = &alloc->imported.user_buf;
 
-			if (user_buf->current_mapping_usage_count & PINNED_ON_IMPORT) {
-				user_buf->current_mapping_usage_count &=
-					~PINNED_ON_IMPORT;
+		if (user_buf->current_mapping_usage_count & PINNED_ON_IMPORT) {
+			user_buf->current_mapping_usage_count &= ~PINNED_ON_IMPORT;
 
-				/* The allocation could still have active mappings. */
-				if (user_buf->current_mapping_usage_count == 0) {
-					kbase_jd_user_buf_unmap(kctx, reg->gpu_alloc,
-						(reg->flags & KBASE_REG_GPU_WR));
-				}
+			/* The allocation could still have active mappings. */
+			if (user_buf->current_mapping_usage_count == 0) {
+				kbase_jd_user_buf_unmap(kctx, alloc, reg,
+							(reg->flags &
+							 (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR)));
 			}
 		}
-		/* Fall-through */
+	}
+		fallthrough;
 	default:
 		kbase_mem_phy_alloc_gpu_unmapped(reg->gpu_alloc);
 		break;
@@ -1741,7 +2090,8 @@
 		BUG_ON(!cpu_page);
 		BUG_ON(offset + size > PAGE_SIZE);
 
-		dma_addr = kbase_dma_addr(cpu_page) + offset;
+		dma_addr = kbase_dma_addr_from_tagged(t_cpu_pa) + offset;
+
 		if (sync_fn == KBASE_SYNC_TO_CPU)
 			dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr,
 					size, DMA_BIDIRECTIONAL);
@@ -1752,29 +2102,30 @@
 		void *src = NULL;
 		void *dst = NULL;
 		struct page *gpu_page;
+		dma_addr_t dma_addr;
 
 		if (WARN(!gpu_pa, "No GPU PA found for infinite cache op"))
 			return;
 
 		gpu_page = pfn_to_page(PFN_DOWN(gpu_pa));
+		dma_addr = kbase_dma_addr_from_tagged(t_gpu_pa) + offset;
 
 		if (sync_fn == KBASE_SYNC_TO_DEVICE) {
 			src = ((unsigned char *)kmap(cpu_page)) + offset;
 			dst = ((unsigned char *)kmap(gpu_page)) + offset;
 		} else if (sync_fn == KBASE_SYNC_TO_CPU) {
-			dma_sync_single_for_cpu(kctx->kbdev->dev,
-					kbase_dma_addr(gpu_page) + offset,
-					size, DMA_BIDIRECTIONAL);
+			dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, size,
+						DMA_BIDIRECTIONAL);
 			src = ((unsigned char *)kmap(gpu_page)) + offset;
 			dst = ((unsigned char *)kmap(cpu_page)) + offset;
 		}
+
 		memcpy(dst, src, size);
 		kunmap(gpu_page);
 		kunmap(cpu_page);
 		if (sync_fn == KBASE_SYNC_TO_DEVICE)
-			dma_sync_single_for_device(kctx->kbdev->dev,
-					kbase_dma_addr(gpu_page) + offset,
-					size, DMA_BIDIRECTIONAL);
+			dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, size,
+						   DMA_BIDIRECTIONAL);
 	}
 }
 
@@ -1920,29 +2271,27 @@
 		__func__, (void *)reg, (void *)kctx);
 	lockdep_assert_held(&kctx->reg_lock);
 
-	if (reg->flags & KBASE_REG_NO_USER_FREE) {
+	if (kbase_va_region_is_no_user_free(reg)) {
 		dev_warn(kctx->kbdev->dev, "Attempt to free GPU memory whose freeing by user space is forbidden!\n");
 		return -EINVAL;
 	}
 
-	/*
-	 * Unlink the physical allocation before unmaking it evictable so
-	 * that the allocation isn't grown back to its last backed size
-	 * as we're going to unmap it anyway.
-	 */
-	reg->cpu_alloc->reg = NULL;
-	if (reg->cpu_alloc != reg->gpu_alloc)
-		reg->gpu_alloc->reg = NULL;
-
-	/*
-	 * If a region has been made evictable then we must unmake it
+	/* If a region has been made evictable then we must unmake it
 	 * before trying to free it.
 	 * If the memory hasn't been reclaimed it will be unmapped and freed
 	 * below, if it has been reclaimed then the operations below are no-ops.
 	 */
 	if (reg->flags & KBASE_REG_DONT_NEED) {
-		KBASE_DEBUG_ASSERT(reg->cpu_alloc->type ==
-				   KBASE_MEM_TYPE_NATIVE);
+		WARN_ON(reg->cpu_alloc->type != KBASE_MEM_TYPE_NATIVE);
+		mutex_lock(&kctx->jit_evict_lock);
+		/* Unlink the physical allocation before unmaking it evictable so
+		 * that the allocation isn't grown back to its last backed size
+		 * as we're going to unmap it anyway.
+		 */
+		reg->cpu_alloc->reg = NULL;
+		if (reg->cpu_alloc != reg->gpu_alloc)
+			reg->gpu_alloc->reg = NULL;
+		mutex_unlock(&kctx->jit_evict_lock);
 		kbase_mem_evictable_unmake(reg->gpu_alloc);
 	}
 
@@ -1952,22 +2301,35 @@
 		goto out;
 	}
 
+#if MALI_USE_CSF
+	if (((reg->flags & KBASE_REG_ZONE_MASK) == KBASE_REG_ZONE_FIXED_VA) ||
+	    ((reg->flags & KBASE_REG_ZONE_MASK) == KBASE_REG_ZONE_EXEC_FIXED_VA)) {
+		if (reg->flags & KBASE_REG_FIXED_ADDRESS)
+			atomic64_dec(&kctx->num_fixed_allocs);
+		else
+			atomic64_dec(&kctx->num_fixable_allocs);
+	}
+#endif
+
 	/* This will also free the physical pages */
 	kbase_free_alloced_region(reg);
 
- out:
+out:
 	return err;
 }
 
 KBASE_EXPORT_TEST_API(kbase_mem_free_region);
 
 /**
- * Free the region from the GPU and unregister it.
+ * kbase_mem_free - Free the region from the GPU and unregister it.
+ *
  * @kctx:  KBase context
  * @gpu_addr: GPU address to free
  *
  * This function implements the free operation on a memory segment.
  * It will loudly fail if called with outstanding mappings.
+ *
+ * Return: 0 on success.
  */
 int kbase_mem_free(struct kbase_context *kctx, u64 gpu_addr)
 {
@@ -1979,12 +2341,14 @@
 		__func__, gpu_addr, (void *)kctx);
 
 	if ((gpu_addr & ~PAGE_MASK) && (gpu_addr >= PAGE_SIZE)) {
-		dev_warn(kctx->kbdev->dev, "kbase_mem_free: gpu_addr parameter is invalid");
+		dev_warn(kctx->kbdev->dev, "%s: gpu_addr parameter is invalid", __func__);
 		return -EINVAL;
 	}
 
 	if (gpu_addr == 0) {
-		dev_warn(kctx->kbdev->dev, "gpu_addr 0 is reserved for the ringbuffer and it's an error to try to free it using kbase_mem_free\n");
+		dev_warn(kctx->kbdev->dev,
+			"gpu_addr 0 is reserved for the ringbuffer and it's an error to try to free it using %s\n",
+			__func__);
 		return -EINVAL;
 	}
 	kbase_gpu_vm_lock(kctx);
@@ -2010,8 +2374,8 @@
 		/* Validate the region */
 		reg = kbase_region_tracker_find_region_base_address(kctx, gpu_addr);
 		if (kbase_is_region_invalid_or_free(reg)) {
-			dev_warn(kctx->kbdev->dev, "kbase_mem_free called with nonexistent gpu_addr 0x%llX",
-					gpu_addr);
+			dev_warn(kctx->kbdev->dev, "%s called with nonexistent gpu_addr 0x%llX",
+				__func__, gpu_addr);
 			err = -EINVAL;
 			goto out_unlock;
 		}
@@ -2026,7 +2390,7 @@
 		err = kbase_mem_free_region(kctx, reg);
 	}
 
- out_unlock:
+out_unlock:
 	kbase_gpu_vm_unlock(kctx);
 	return err;
 }
@@ -2126,11 +2490,19 @@
 	if (flags & BASEP_MEM_PERMANENT_KERNEL_MAPPING)
 		reg->flags |= KBASE_REG_PERMANENT_KERNEL_MAPPING;
 
-	if (flags & BASEP_MEM_NO_USER_FREE)
-		reg->flags |= KBASE_REG_NO_USER_FREE;
+	if (flags & BASEP_MEM_NO_USER_FREE) {
+		kbase_gpu_vm_lock(kctx);
+		kbase_va_region_no_user_free_inc(reg);
+		kbase_gpu_vm_unlock(kctx);
+	}
 
 	if (flags & BASE_MEM_GPU_VA_SAME_4GB_PAGE)
 		reg->flags |= KBASE_REG_GPU_VA_SAME_4GB_PAGE;
+
+#if MALI_USE_CSF
+	if (flags & BASE_MEM_FIXED)
+		reg->flags |= KBASE_REG_FIXED_ADDRESS;
+#endif
 
 	return 0;
 }
@@ -2174,18 +2546,14 @@
 
 	tp = alloc->pages + alloc->nents;
 
-#ifdef CONFIG_MALI_2MB_ALLOC
 	/* Check if we have enough pages requested so we can allocate a large
 	 * page (512 * 4KB = 2MB )
 	 */
-	if (nr_left >= (SZ_2M / SZ_4K)) {
+	if (kbdev->pagesize_2mb && nr_left >= (SZ_2M / SZ_4K)) {
 		int nr_lp = nr_left / (SZ_2M / SZ_4K);
 
-		res = kbase_mem_pool_alloc_pages(
-			&kctx->mem_pools.large[alloc->group_id],
-			 nr_lp * (SZ_2M / SZ_4K),
-			 tp,
-			 true);
+		res = kbase_mem_pool_alloc_pages(&kctx->mem_pools.large[alloc->group_id],
+						 nr_lp * (SZ_2M / SZ_4K), tp, true, kctx->task);
 
 		if (res > 0) {
 			nr_left -= res;
@@ -2239,7 +2607,7 @@
 
 				err = kbase_mem_pool_grow(
 					&kctx->mem_pools.large[alloc->group_id],
-					1);
+					1, kctx->task);
 				if (err)
 					break;
 			} while (1);
@@ -2280,13 +2648,11 @@
 			}
 		}
 	}
-no_new_partial:
-#endif
 
+no_new_partial:
 	if (nr_left) {
-		res = kbase_mem_pool_alloc_pages(
-			&kctx->mem_pools.small[alloc->group_id],
-			nr_left, tp, false);
+		res = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[alloc->group_id], nr_left,
+						 tp, false, kctx->task);
 		if (res <= 0)
 			goto alloc_failed;
 	}
@@ -2345,17 +2711,16 @@
 
 	lockdep_assert_held(&pool->pool_lock);
 
-#if !defined(CONFIG_MALI_2MB_ALLOC)
-	WARN_ON(pool->order);
-#endif
+	kctx = alloc->imported.native.kctx;
+	kbdev = kctx->kbdev;
+
+	if (!kbdev->pagesize_2mb)
+		WARN_ON(pool->order);
 
 	if (alloc->reg) {
 		if (nr_pages_requested > alloc->reg->nr_pages - alloc->nents)
 			goto invalid_request;
 	}
-
-	kctx = alloc->imported.native.kctx;
-	kbdev = kctx->kbdev;
 
 	lockdep_assert_held(&kctx->mem_partials_lock);
 
@@ -2375,8 +2740,7 @@
 	tp = alloc->pages + alloc->nents;
 	new_pages = tp;
 
-#ifdef CONFIG_MALI_2MB_ALLOC
-	if (pool->order) {
+	if (kbdev->pagesize_2mb && pool->order) {
 		int nr_lp = nr_left / (SZ_2M / SZ_4K);
 
 		res = kbase_mem_pool_alloc_pages_locked(pool,
@@ -2460,15 +2824,12 @@
 		if (nr_left)
 			goto alloc_failed;
 	} else {
-#endif
 		res = kbase_mem_pool_alloc_pages_locked(pool,
 						 nr_left,
 						 tp);
 		if (res <= 0)
 			goto alloc_failed;
-#ifdef CONFIG_MALI_2MB_ALLOC
 	}
-#endif
 
 	KBASE_TLSTREAM_AUX_PAGESALLOC(
 			kbdev,
@@ -2489,8 +2850,7 @@
 
 		struct tagged_addr *start_free = alloc->pages + alloc->nents;
 
-#ifdef CONFIG_MALI_2MB_ALLOC
-		if (pool->order) {
+		if (kbdev->pagesize_2mb && pool->order) {
 			while (nr_pages_to_free) {
 				if (is_huge_head(*start_free)) {
 					kbase_mem_pool_free_pages_locked(
@@ -2508,15 +2868,12 @@
 				}
 			}
 		} else {
-#endif
 			kbase_mem_pool_free_pages_locked(pool,
 					nr_pages_to_free,
 					start_free,
 					false, /* not dirty */
 					true); /* return to pool */
-#ifdef CONFIG_MALI_2MB_ALLOC
 		}
-#endif
 	}
 
 	kbase_process_page_usage_dec(kctx, nr_pages_requested);
@@ -2778,6 +3135,13 @@
 /**
  * kbase_jd_user_buf_unpin_pages - Release the pinned pages of a user buffer.
  * @alloc: The allocation for the imported user buffer.
+ *
+ * This must only be called when terminating an alloc, when its refcount
+ * (number of users) has become 0. This also ensures it is only called once all
+ * CPU mappings have been closed.
+ *
+ * Instead call kbase_jd_user_buf_unmap() if you need to unpin pages on active
+ * allocations
  */
 static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc);
 #endif
@@ -2908,8 +3272,31 @@
 out_term:
 	return -1;
 }
-
 KBASE_EXPORT_TEST_API(kbase_alloc_phy_pages);
+
+void kbase_set_phy_alloc_page_status(struct kbase_mem_phy_alloc *alloc,
+				     enum kbase_page_status status)
+{
+	u32 i = 0;
+
+	for (; i < alloc->nents; i++) {
+		struct tagged_addr phys = alloc->pages[i];
+		struct kbase_page_metadata *page_md = kbase_page_private(as_page(phys));
+
+		/* Skip the 4KB page that is part of a large page, as the large page is
+		 * excluded from the migration process.
+		 */
+		if (is_huge(phys) || is_partial(phys))
+			continue;
+
+		if (!page_md)
+			continue;
+
+		spin_lock(&page_md->migrate_lock);
+		page_md->status = PAGE_STATUS_SET(page_md->status, (u8)status);
+		spin_unlock(&page_md->migrate_lock);
+	}
+}
 
 bool kbase_check_alloc_flags(unsigned long flags)
 {
@@ -2983,6 +3370,14 @@
 			(BASE_MEM_COHERENT_LOCAL | BASE_MEM_COHERENT_SYSTEM))
 		return false;
 
+#if MALI_USE_CSF
+	if ((flags & BASE_MEM_SAME_VA) && (flags & (BASE_MEM_FIXABLE | BASE_MEM_FIXED)))
+		return false;
+
+	if ((flags & BASE_MEM_FIXABLE) && (flags & BASE_MEM_FIXED))
+		return false;
+#endif
+
 	return true;
 }
 
@@ -3004,7 +3399,11 @@
 	if (flags & BASE_MEM_GROW_ON_GPF)
 		return false;
 
-#if !MALI_USE_CSF
+#if MALI_USE_CSF
+	/* Imported memory cannot be fixed */
+	if ((flags & (BASE_MEM_FIXED | BASE_MEM_FIXABLE)))
+		return false;
+#else
 	/* Imported memory cannot be aligned to the end of its initial commit */
 	if (flags & BASE_MEM_TILER_ALIGN_TOP)
 		return false;
@@ -3139,10 +3538,6 @@
 #undef KBASE_MSG_PRE
 }
 
-/**
- * Acquire the per-context region list lock
- * @kctx:  KBase context
- */
 void kbase_gpu_vm_lock(struct kbase_context *kctx)
 {
 	KBASE_DEBUG_ASSERT(kctx != NULL);
@@ -3151,10 +3546,6 @@
 
 KBASE_EXPORT_TEST_API(kbase_gpu_vm_lock);
 
-/**
- * Release the per-context region list lock
- * @kctx:  KBase context
- */
 void kbase_gpu_vm_unlock(struct kbase_context *kctx)
 {
 	KBASE_DEBUG_ASSERT(kctx != NULL);
@@ -3165,7 +3556,7 @@
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 struct kbase_jit_debugfs_data {
-	int (*func)(struct kbase_jit_debugfs_data *);
+	int (*func)(struct kbase_jit_debugfs_data *data);
 	struct mutex lock;
 	struct kbase_context *kctx;
 	u64 active_value;
@@ -3388,14 +3779,9 @@
 void kbase_jit_debugfs_init(struct kbase_context *kctx)
 {
 	/* prevent unprivileged use of debug file system
-         * in old kernel version
-         */
-#if (KERNEL_VERSION(4, 7, 0) <= LINUX_VERSION_CODE)
-	/* only for newer kernel version debug file system is safe */
+	 * in old kernel version
+	 */
 	const mode_t mode = 0444;
-#else
-	const mode_t mode = 0400;
-#endif
 
 	/* Caller already ensures this, but we keep the pattern for
 	 * maintenance safety.
@@ -3469,7 +3855,15 @@
 		mutex_unlock(&kctx->jit_evict_lock);
 
 		kbase_gpu_vm_lock(kctx);
-		reg->flags &= ~KBASE_REG_NO_USER_FREE;
+
+		/*
+		 * Incrementing the refcount is prevented on JIT regions.
+		 * If/when this ever changes we would need to compensate
+		 * by implementing "free on putting the last reference",
+		 * but only for JIT regions.
+		 */
+		WARN_ON(atomic_read(&reg->no_user_free_count) > 1);
+		kbase_va_region_no_user_free_dec(reg);
 		kbase_mem_free_region(kctx, reg);
 		kbase_gpu_vm_unlock(kctx);
 	} while (1);
@@ -3484,6 +3878,7 @@
 	INIT_WORK(&kctx->jit_work, kbase_jit_destroy_worker);
 
 #if MALI_USE_CSF
+	mutex_init(&kctx->csf.kcpu_queues.jit_lock);
 	INIT_LIST_HEAD(&kctx->csf.kcpu_queues.jit_cmds_head);
 	INIT_LIST_HEAD(&kctx->csf.kcpu_queues.jit_blocked_queues);
 #else /* !MALI_USE_CSF */
@@ -3698,7 +4093,8 @@
 static int kbase_jit_grow(struct kbase_context *kctx,
 			  const struct base_jit_alloc_info *info,
 			  struct kbase_va_region *reg,
-			  struct kbase_sub_alloc **prealloc_sas)
+			  struct kbase_sub_alloc **prealloc_sas,
+			  enum kbase_caller_mmu_sync_info mmu_sync_info)
 {
 	size_t delta;
 	size_t pages_required;
@@ -3728,18 +4124,14 @@
 	delta = info->commit_pages - reg->gpu_alloc->nents;
 	pages_required = delta;
 
-#ifdef CONFIG_MALI_2MB_ALLOC
-	if (pages_required >= (SZ_2M / SZ_4K)) {
+	if (kctx->kbdev->pagesize_2mb && pages_required >= (SZ_2M / SZ_4K)) {
 		pool = &kctx->mem_pools.large[kctx->jit_group_id];
 		/* Round up to number of 2 MB pages required */
 		pages_required += ((SZ_2M / SZ_4K) - 1);
 		pages_required /= (SZ_2M / SZ_4K);
 	} else {
-#endif
 		pool = &kctx->mem_pools.small[kctx->jit_group_id];
-#ifdef CONFIG_MALI_2MB_ALLOC
 	}
-#endif
 
 	if (reg->cpu_alloc != reg->gpu_alloc)
 		pages_required *= 2;
@@ -3760,7 +4152,7 @@
 		spin_unlock(&kctx->mem_partials_lock);
 
 		kbase_gpu_vm_unlock(kctx);
-		ret = kbase_mem_pool_grow(pool, pool_delta);
+		ret = kbase_mem_pool_grow(pool, pool_delta, kctx->task);
 		kbase_gpu_vm_lock(kctx);
 
 		if (ret)
@@ -3795,7 +4187,7 @@
 	spin_unlock(&kctx->mem_partials_lock);
 
 	ret = kbase_mem_grow_gpu_mapping(kctx, reg, info->commit_pages,
-			old_size);
+					 old_size, mmu_sync_info);
 	/*
 	 * The grow failed so put the allocation back in the
 	 * pool and return failure.
@@ -3920,11 +4312,11 @@
 		const struct base_jit_alloc_info *info,
 		bool ignore_pressure_limit)
 {
-#if MALI_USE_CSF
-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
-#else
+#if !MALI_USE_CSF
 	lockdep_assert_held(&kctx->jctx.lock);
-#endif
+#else /* MALI_USE_CSF */
+	lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock);
+#endif /* !MALI_USE_CSF */
 
 #if MALI_JIT_PRESSURE_LIMIT_BASE
 	if (!ignore_pressure_limit &&
@@ -4010,23 +4402,28 @@
 	struct kbase_sub_alloc *prealloc_sas[2] = { NULL, NULL };
 	int i;
 
-#if MALI_USE_CSF
-	lockdep_assert_held(&kctx->csf.kcpu_queues.lock);
-#else
+	/* Calls to this function are inherently synchronous, with respect to
+	 * MMU operations.
+	 */
+	const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC;
+
+#if !MALI_USE_CSF
 	lockdep_assert_held(&kctx->jctx.lock);
-#endif
+#else /* MALI_USE_CSF */
+	lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock);
+#endif /* !MALI_USE_CSF */
 
 	if (!jit_allow_allocate(kctx, info, ignore_pressure_limit))
 		return NULL;
 
-#ifdef CONFIG_MALI_2MB_ALLOC
-	/* Preallocate memory for the sub-allocation structs */
-	for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) {
-		prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL);
-		if (!prealloc_sas[i])
-			goto end;
+	if (kctx->kbdev->pagesize_2mb) {
+		/* Preallocate memory for the sub-allocation structs */
+		for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) {
+			prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL);
+			if (!prealloc_sas[i])
+				goto end;
+		}
 	}
-#endif
 
 	kbase_gpu_vm_lock(kctx);
 	mutex_lock(&kctx->jit_evict_lock);
@@ -4102,7 +4499,8 @@
 		 * so any state protected by that lock might need to be
 		 * re-evaluated if more code is added here in future.
 		 */
-		ret = kbase_jit_grow(kctx, info, reg, prealloc_sas);
+		ret = kbase_jit_grow(kctx, info, reg, prealloc_sas,
+				     mmu_sync_info);
 
 #if MALI_JIT_PRESSURE_LIMIT_BASE
 		if (!ignore_pressure_limit)
@@ -4114,7 +4512,7 @@
 		if (ret < 0) {
 			/*
 			 * An update to an allocation from the pool failed,
-			 * chances are slim a new allocation would fair any
+			 * chances are slim a new allocation would fare any
 			 * better so return the allocation to the pool and
 			 * return the function with failure.
 			 */
@@ -4136,6 +4534,17 @@
 			mutex_unlock(&kctx->jit_evict_lock);
 			reg = NULL;
 			goto end;
+		} else {
+			/* A suitable JIT allocation existed on the evict list, so we need
+			 * to make sure that the NOT_MOVABLE property is cleared.
+			 */
+			if (kbase_page_migration_enabled) {
+				kbase_gpu_vm_lock(kctx);
+				mutex_lock(&kctx->jit_evict_lock);
+				kbase_set_phy_alloc_page_status(reg->gpu_alloc, ALLOCATED_MAPPED);
+				mutex_unlock(&kctx->jit_evict_lock);
+				kbase_gpu_vm_unlock(kctx);
+			}
 		}
 	} else {
 		/* No suitable JIT allocation was found so create a new one */
@@ -4150,7 +4559,7 @@
 			flags |= BASE_MEM_TILER_ALIGN_TOP;
 #endif /* !MALI_USE_CSF */
 
-		flags |= base_mem_group_id_set(kctx->jit_group_id);
+		flags |= kbase_mem_group_id_set(kctx->jit_group_id);
 #if MALI_JIT_PRESSURE_LIMIT_BASE
 		if (!ignore_pressure_limit) {
 			flags |= BASEP_MEM_PERFORM_JIT_TRIM;
@@ -4165,8 +4574,8 @@
 		mutex_unlock(&kctx->jit_evict_lock);
 		kbase_gpu_vm_unlock(kctx);
 
-		reg = kbase_mem_alloc(kctx, info->va_pages, info->commit_pages,
-				      info->extension, &flags, &gpu_addr);
+		reg = kbase_mem_alloc(kctx, info->va_pages, info->commit_pages, info->extension,
+				      &flags, &gpu_addr, mmu_sync_info);
 		if (!reg) {
 			/* Most likely not enough GPU virtual space left for
 			 * the new JIT allocation.
@@ -4192,6 +4601,29 @@
 		}
 	}
 
+	/* Similarly to tiler heap init, there is a short window of time
+	 * where the (either recycled or newly allocated, in our case) region has
+	 * "no user free" count incremented but is still missing the DONT_NEED flag, and
+	 * doesn't yet have the ACTIVE_JIT_ALLOC flag either. Temporarily leaking the
+	 * allocation is the least bad option that doesn't lead to a security issue down the
+	 * line (it will eventually be cleaned up during context termination).
+	 *
+	 * We also need to call kbase_gpu_vm_lock regardless, as we're updating the region
+	 * flags.
+	 */
+	kbase_gpu_vm_lock(kctx);
+	if (unlikely(atomic_read(&reg->no_user_free_count) > 1)) {
+		kbase_gpu_vm_unlock(kctx);
+		dev_err(kctx->kbdev->dev, "JIT region has no_user_free_count > 1!\n");
+
+		mutex_lock(&kctx->jit_evict_lock);
+		list_move(&reg->jit_node, &kctx->jit_pool_head);
+		mutex_unlock(&kctx->jit_evict_lock);
+
+		reg = NULL;
+		goto end;
+	}
+
 	trace_mali_jit_alloc(reg, info->id);
 
 	kctx->jit_current_allocations++;
@@ -4209,6 +4641,7 @@
 	kbase_jit_report_update_pressure(kctx, reg, info->va_pages,
 			KBASE_JIT_REPORT_ON_ALLOC_OR_FREE);
 #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */
+	kbase_gpu_vm_unlock(kctx);
 
 end:
 	for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i)
@@ -4220,6 +4653,12 @@
 void kbase_jit_free(struct kbase_context *kctx, struct kbase_va_region *reg)
 {
 	u64 old_pages;
+
+#if !MALI_USE_CSF
+	lockdep_assert_held(&kctx->jctx.lock);
+#else /* MALI_USE_CSF */
+	lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock);
+#endif /* !MALI_USE_CSF */
 
 	/* JIT id not immediately available here, so use 0u */
 	trace_mali_jit_free(reg, 0u);
@@ -4273,6 +4712,12 @@
 
 	list_move(&reg->jit_node, &kctx->jit_pool_head);
 
+	/* Inactive JIT regions should be freed by the shrinker and not impacted
+	 * by page migration. Once freed, they will enter into the page migration
+	 * state machine via the mempools.
+	 */
+	if (kbase_page_migration_enabled)
+		kbase_set_phy_alloc_page_status(reg->gpu_alloc, NOT_MOVABLE);
 	mutex_unlock(&kctx->jit_evict_lock);
 }
 
@@ -4319,7 +4764,14 @@
 	mutex_unlock(&kctx->jit_evict_lock);
 
 	if (reg) {
-		reg->flags &= ~KBASE_REG_NO_USER_FREE;
+		/*
+		 * Incrementing the refcount is prevented on JIT regions.
+		 * If/when this ever changes we would need to compensate
+		 * by implementing "free on putting the last reference",
+		 * but only for JIT regions.
+		 */
+		WARN_ON(atomic_read(&reg->no_user_free_count) > 1);
+		kbase_va_region_no_user_free_dec(reg);
 		kbase_mem_free_region(kctx, reg);
 	}
 
@@ -4341,7 +4793,14 @@
 		list_del(&walker->jit_node);
 		list_del_init(&walker->gpu_alloc->evict_node);
 		mutex_unlock(&kctx->jit_evict_lock);
-		walker->flags &= ~KBASE_REG_NO_USER_FREE;
+		/*
+		 * Incrementing the refcount is prevented on JIT regions.
+		 * If/when this ever changes we would need to compensate
+		 * by implementing "free on putting the last reference",
+		 * but only for JIT regions.
+		 */
+		WARN_ON(atomic_read(&walker->no_user_free_count) > 1);
+		kbase_va_region_no_user_free_dec(walker);
 		kbase_mem_free_region(kctx, walker);
 		mutex_lock(&kctx->jit_evict_lock);
 	}
@@ -4353,7 +4812,14 @@
 		list_del(&walker->jit_node);
 		list_del_init(&walker->gpu_alloc->evict_node);
 		mutex_unlock(&kctx->jit_evict_lock);
-		walker->flags &= ~KBASE_REG_NO_USER_FREE;
+		/*
+		 * Incrementing the refcount is prevented on JIT regions.
+		 * If/when this ever changes we would need to compensate
+		 * by implementing "free on putting the last reference",
+		 * but only for JIT regions.
+		 */
+		WARN_ON(atomic_read(&walker->no_user_free_count) > 1);
+		kbase_va_region_no_user_free_dec(walker);
 		kbase_mem_free_region(kctx, walker);
 		mutex_lock(&kctx->jit_evict_lock);
 	}
@@ -4396,8 +4862,8 @@
 
 	addr_start = reg->heap_info_gpu_addr - jit_report_gpu_mem_offset;
 
-	ptr = kbase_vmap(kctx, addr_start, KBASE_JIT_REPORT_GPU_MEM_SIZE,
-			&mapping);
+	ptr = kbase_vmap_prot(kctx, addr_start, KBASE_JIT_REPORT_GPU_MEM_SIZE,
+			KBASE_REG_CPU_RD, &mapping);
 	if (!ptr) {
 		dev_warn(kctx->kbdev->dev,
 				"%s: JIT start=0x%llx unable to map memory near end pointer %llx\n",
@@ -4455,17 +4921,44 @@
 }
 #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */
 
+void kbase_unpin_user_buf_page(struct page *page)
+{
+#if KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE
+	put_page(page);
+#else
+	unpin_user_page(page);
+#endif
+}
+
 #if MALI_USE_CSF
 static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc)
 {
-	if (alloc->nents) {
+	/* In CSF builds, we keep pages pinned until the last reference is
+	 * released on the alloc. A refcount of 0 also means we can be sure
+	 * that all CPU mappings have been closed on this alloc, and no more
+	 * mappings of it will be created.
+	 *
+	 * Further, the WARN() below captures the restriction that this
+	 * function will not handle anything other than the alloc termination
+	 * path, because the caller of kbase_mem_phy_alloc_put() is not
+	 * required to hold the kctx's reg_lock, and so we could not handle
+	 * removing an existing CPU mapping here.
+	 *
+	 * Refer to this function's kernel-doc comments for alternatives for
+	 * unpinning a User buffer.
+	 */
+
+	if (alloc->nents && !WARN(kref_read(&alloc->kref) != 0,
+				  "must only be called on terminating an allocation")) {
 		struct page **pages = alloc->imported.user_buf.pages;
 		long i;
 
 		WARN_ON(alloc->nents != alloc->imported.user_buf.nr_pages);
 
 		for (i = 0; i < alloc->nents; i++)
-			put_page(pages[i]);
+			kbase_unpin_user_buf_page(pages[i]);
+
+		alloc->nents = 0;
 	}
 }
 #endif
@@ -4479,6 +4972,9 @@
 	struct mm_struct *mm = alloc->imported.user_buf.mm;
 	long pinned_pages;
 	long i;
+	int write;
+
+	lockdep_assert_held(&kctx->reg_lock);
 
 	if (WARN_ON(alloc->type != KBASE_MEM_TYPE_IMPORTED_USER_BUF))
 		return -EINVAL;
@@ -4493,44 +4989,28 @@
 	if (WARN_ON(reg->gpu_alloc->imported.user_buf.mm != current->mm))
 		return -EINVAL;
 
-#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
-	pinned_pages = get_user_pages(NULL, mm,
-			address,
-			alloc->imported.user_buf.nr_pages,
-#if KERNEL_VERSION(4, 4, 168) <= LINUX_VERSION_CODE && \
-KERNEL_VERSION(4, 5, 0) > LINUX_VERSION_CODE
-			reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0,
-			pages, NULL);
+	write = reg->flags & (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR);
+
+#if KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE
+	pinned_pages = get_user_pages_remote(NULL, mm, address, alloc->imported.user_buf.nr_pages,
+					     write ? FOLL_WRITE : 0, pages, NULL);
+#elif KERNEL_VERSION(5, 9, 0) > LINUX_VERSION_CODE
+	pinned_pages = get_user_pages_remote(NULL, mm, address, alloc->imported.user_buf.nr_pages,
+					     write ? FOLL_WRITE : 0, pages, NULL, NULL);
 #else
-			reg->flags & KBASE_REG_GPU_WR,
-			0, pages, NULL);
-#endif
-#elif KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE
-	pinned_pages = get_user_pages_remote(NULL, mm,
-			address,
-			alloc->imported.user_buf.nr_pages,
-			reg->flags & KBASE_REG_GPU_WR,
-			0, pages, NULL);
-#elif KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE
-	pinned_pages = get_user_pages_remote(NULL, mm,
-			address,
-			alloc->imported.user_buf.nr_pages,
-			reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0,
-			pages, NULL);
-#else
-	pinned_pages = get_user_pages_remote(NULL, mm,
-			address,
-			alloc->imported.user_buf.nr_pages,
-			reg->flags & KBASE_REG_GPU_WR ? FOLL_WRITE : 0,
-			pages, NULL, NULL);
+	pinned_pages = pin_user_pages_remote(mm, address, alloc->imported.user_buf.nr_pages,
+					     write ? FOLL_WRITE : 0, pages, NULL, NULL);
 #endif
 
 	if (pinned_pages <= 0)
 		return pinned_pages;
 
 	if (pinned_pages != alloc->imported.user_buf.nr_pages) {
+		/* Above code already ensures there will not have been a CPU
+		 * mapping by ensuring alloc->nents is 0
+		 */
 		for (i = 0; i < pinned_pages; i++)
-			put_page(pages[i]);
+			kbase_unpin_user_buf_page(pages[i]);
 		return -ENOMEM;
 	}
 
@@ -4542,46 +5022,64 @@
 static int kbase_jd_user_buf_map(struct kbase_context *kctx,
 		struct kbase_va_region *reg)
 {
-	long pinned_pages;
+	int err;
+	long pinned_pages = 0;
 	struct kbase_mem_phy_alloc *alloc;
 	struct page **pages;
 	struct tagged_addr *pa;
-	long i;
-	unsigned long address;
+	long i, dma_mapped_pages;
 	struct device *dev;
-	unsigned long offset;
-	unsigned long local_size;
 	unsigned long gwt_mask = ~0;
-	int err = kbase_jd_user_buf_pin_pages(kctx, reg);
+	/* Calls to this function are inherently asynchronous, with respect to
+	 * MMU operations.
+	 */
+	const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC;
+
+	lockdep_assert_held(&kctx->reg_lock);
+
+	err = kbase_jd_user_buf_pin_pages(kctx, reg);
 
 	if (err)
 		return err;
 
 	alloc = reg->gpu_alloc;
 	pa = kbase_get_gpu_phy_pages(reg);
-	address = alloc->imported.user_buf.address;
 	pinned_pages = alloc->nents;
 	pages = alloc->imported.user_buf.pages;
 	dev = kctx->kbdev->dev;
-	offset = address & ~PAGE_MASK;
-	local_size = alloc->imported.user_buf.size;
 
+	/* Manual CPU cache synchronization.
+	 *
+	 * The driver disables automatic CPU cache synchronization because the
+	 * memory pages that enclose the imported region may also contain
+	 * sub-regions which are not imported and that are allocated and used
+	 * by the user process. This may be the case of memory at the beginning
+	 * of the first page and at the end of the last page. Automatic CPU cache
+	 * synchronization would force some operations on those memory allocations,
+	 * unbeknown to the user process: in particular, a CPU cache invalidate
+	 * upon unmapping would destroy the content of dirty CPU caches and cause
+	 * the user process to lose CPU writes to the non-imported sub-regions.
+	 *
+	 * When the GPU claims ownership of the imported memory buffer, it shall
+	 * commit CPU writes for the whole of all pages that enclose the imported
+	 * region, otherwise the initial content of memory would be wrong.
+	 */
 	for (i = 0; i < pinned_pages; i++) {
 		dma_addr_t dma_addr;
-		unsigned long min;
-
-		min = MIN(PAGE_SIZE - offset, local_size);
-		dma_addr = dma_map_page(dev, pages[i],
-				offset, min,
-				DMA_BIDIRECTIONAL);
-		if (dma_mapping_error(dev, dma_addr))
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+		dma_addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+		dma_addr = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL,
+					      DMA_ATTR_SKIP_CPU_SYNC);
+#endif
+		err = dma_mapping_error(dev, dma_addr);
+		if (err)
 			goto unwind;
 
 		alloc->imported.user_buf.dma_addrs[i] = dma_addr;
 		pa[i] = as_tagged(page_to_phys(pages[i]));
 
-		local_size -= min;
-		offset = 0;
+		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
 	}
 
 #ifdef CONFIG_MALI_CINSTR_GWT
@@ -4589,24 +5087,46 @@
 		gwt_mask = ~KBASE_REG_GPU_WR;
 #endif
 
-	err = kbase_mmu_insert_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
-			pa, kbase_reg_current_backed_size(reg),
-			reg->flags & gwt_mask, kctx->as_nr,
-			alloc->group_id);
+	err = kbase_mmu_insert_imported_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, pa,
+					      kbase_reg_current_backed_size(reg),
+					      reg->flags & gwt_mask, kctx->as_nr, alloc->group_id,
+					      mmu_sync_info, NULL);
 	if (err == 0)
 		return 0;
 
 	/* fall down */
 unwind:
 	alloc->nents = 0;
-	while (i--) {
-		dma_unmap_page(kctx->kbdev->dev,
-				alloc->imported.user_buf.dma_addrs[i],
-				PAGE_SIZE, DMA_BIDIRECTIONAL);
+	dma_mapped_pages = i;
+	/* Run the unmap loop in the same order as map loop, and perform again
+	 * CPU cache synchronization to re-write the content of dirty CPU caches
+	 * to memory. This is precautionary measure in case a GPU job has taken
+	 * advantage of a partially GPU-mapped range to write and corrupt the
+	 * content of memory, either inside or outside the imported region.
+	 *
+	 * Notice that this error recovery path doesn't try to be optimal and just
+	 * flushes the entire page range.
+	 */
+	for (i = 0; i < dma_mapped_pages; i++) {
+		dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
+
+		dma_sync_single_for_device(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+		dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+#else
+		dma_unmap_page_attrs(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL,
+				     DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 	}
 
-	while (++i < pinned_pages) {
-		put_page(pages[i]);
+	/* The user buffer could already have been previously pinned before
+	 * entering this function, and hence there could potentially be CPU
+	 * mappings of it
+	 */
+	kbase_mem_shrink_cpu_mapping(kctx, reg, 0, pinned_pages);
+
+	for (i = 0; i < pinned_pages; i++) {
+		kbase_unpin_user_buf_page(pages[i]);
 		pages[i] = NULL;
 	}
 
@@ -4617,30 +5137,118 @@
  * GPUs, which implies that a call to kbase_jd_user_buf_pin_pages() will NOT
  * have a corresponding call to kbase_jd_user_buf_unpin_pages().
  */
-static void kbase_jd_user_buf_unmap(struct kbase_context *kctx,
-		struct kbase_mem_phy_alloc *alloc, bool writeable)
+static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc,
+				    struct kbase_va_region *reg, bool writeable)
 {
 	long i;
 	struct page **pages;
-	unsigned long size = alloc->imported.user_buf.size;
+	unsigned long offset_within_page = alloc->imported.user_buf.address & ~PAGE_MASK;
+	unsigned long remaining_size = alloc->imported.user_buf.size;
+
+	lockdep_assert_held(&kctx->reg_lock);
 
 	KBASE_DEBUG_ASSERT(alloc->type == KBASE_MEM_TYPE_IMPORTED_USER_BUF);
 	pages = alloc->imported.user_buf.pages;
+
+#if !MALI_USE_CSF
+	kbase_mem_shrink_cpu_mapping(kctx, reg, 0, alloc->nents);
+#else
+	CSTD_UNUSED(reg);
+#endif
+
 	for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) {
-		unsigned long local_size;
+		unsigned long imported_size = MIN(remaining_size, PAGE_SIZE - offset_within_page);
+		/* Notice: this is a temporary variable that is used for DMA sync
+		 * operations, and that could be incremented by an offset if the
+		 * current page contains both imported and non-imported memory
+		 * sub-regions.
+		 *
+		 * It is valid to add an offset to this value, because the offset
+		 * is always kept within the physically contiguous dma-mapped range
+		 * and there's no need to translate to physical address to offset it.
+		 *
+		 * This variable is not going to be used for the actual DMA unmap
+		 * operation, that shall always use the original DMA address of the
+		 * whole memory page.
+		 */
 		dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i];
 
-		local_size = MIN(size, PAGE_SIZE - (dma_addr & ~PAGE_MASK));
-		dma_unmap_page(kctx->kbdev->dev, dma_addr, local_size,
-				DMA_BIDIRECTIONAL);
+		/* Manual CPU cache synchronization.
+		 *
+		 * When the GPU returns ownership of the buffer to the CPU, the driver
+		 * needs to treat imported and non-imported memory differently.
+		 *
+		 * The first case to consider is non-imported sub-regions at the
+		 * beginning of the first page and at the end of last page. For these
+		 * sub-regions: CPU cache shall be committed with a clean+invalidate,
+		 * in order to keep the last CPU write.
+		 *
+		 * Imported region prefers the opposite treatment: this memory has been
+		 * legitimately mapped and used by the GPU, hence GPU writes shall be
+		 * committed to memory, while CPU cache shall be invalidated to make
+		 * sure that CPU reads the correct memory content.
+		 *
+		 * The following diagram shows the expect value of the variables
+		 * used in this loop in the corner case of an imported region encloed
+		 * by a single memory page:
+		 *
+		 * page boundary ->|---------- | <- dma_addr (initial value)
+		 *                 |           |
+		 *                 | - - - - - | <- offset_within_page
+		 *                 |XXXXXXXXXXX|\
+		 *                 |XXXXXXXXXXX| \
+		 *                 |XXXXXXXXXXX|  }- imported_size
+		 *                 |XXXXXXXXXXX| /
+		 *                 |XXXXXXXXXXX|/
+		 *                 | - - - - - | <- offset_within_page + imported_size
+		 *                 |           |\
+		 *                 |           | }- PAGE_SIZE - imported_size - offset_within_page
+		 *                 |           |/
+		 * page boundary ->|-----------|
+		 *
+		 * If the imported region is enclosed by more than one page, then
+		 * offset_within_page = 0 for any page after the first.
+		 */
+
+		/* Only for first page: handle non-imported range at the beginning. */
+		if (offset_within_page > 0) {
+			dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, offset_within_page,
+						   DMA_BIDIRECTIONAL);
+			dma_addr += offset_within_page;
+		}
+
+		/* For every page: handle imported range. */
+		if (imported_size > 0)
+			dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, imported_size,
+						DMA_BIDIRECTIONAL);
+
+		/* Only for last page (that may coincide with first page):
+		 * handle non-imported range at the end.
+		 */
+		if ((imported_size + offset_within_page) < PAGE_SIZE) {
+			dma_addr += imported_size;
+			dma_sync_single_for_device(kctx->kbdev->dev, dma_addr,
+						   PAGE_SIZE - imported_size - offset_within_page,
+						   DMA_BIDIRECTIONAL);
+		}
+
+		/* Notice: use the original DMA address to unmap the whole memory page. */
+#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE)
+		dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+#else
+		dma_unmap_page_attrs(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i],
+				     PAGE_SIZE, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+#endif
 		if (writeable)
 			set_page_dirty_lock(pages[i]);
 #if !MALI_USE_CSF
-		put_page(pages[i]);
+		kbase_unpin_user_buf_page(pages[i]);
 		pages[i] = NULL;
 #endif
 
-		size -= local_size;
+		remaining_size -= imported_size;
+		offset_within_page = 0;
 	}
 #if !MALI_USE_CSF
 	alloc->nents = 0;
@@ -4687,11 +5295,11 @@
 	return 0;
 }
 
-struct kbase_mem_phy_alloc *kbase_map_external_resource(
-		struct kbase_context *kctx, struct kbase_va_region *reg,
-		struct mm_struct *locked_mm)
+int kbase_map_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg,
+				struct mm_struct *locked_mm)
 {
-	int err;
+	int err = 0;
+	struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc;
 
 	lockdep_assert_held(&kctx->reg_lock);
 
@@ -4700,7 +5308,7 @@
 	case KBASE_MEM_TYPE_IMPORTED_USER_BUF: {
 		if ((reg->gpu_alloc->imported.user_buf.mm != locked_mm) &&
 		    (!reg->gpu_alloc->nents))
-			goto exit;
+			return -EINVAL;
 
 		reg->gpu_alloc->imported.user_buf.current_mapping_usage_count++;
 		if (reg->gpu_alloc->imported.user_buf
@@ -4708,7 +5316,7 @@
 			err = kbase_jd_user_buf_map(kctx, reg);
 			if (err) {
 				reg->gpu_alloc->imported.user_buf.current_mapping_usage_count--;
-				goto exit;
+				return err;
 			}
 		}
 	}
@@ -4716,21 +5324,30 @@
 	case KBASE_MEM_TYPE_IMPORTED_UMM: {
 		err = kbase_mem_umm_map(kctx, reg);
 		if (err)
-			goto exit;
+			return err;
 		break;
 	}
 	default:
-		goto exit;
+		dev_dbg(kctx->kbdev->dev,
+			"Invalid external resource GPU allocation type (%x) on mapping",
+			alloc->type);
+		return -EINVAL;
 	}
 
-	return kbase_mem_phy_alloc_get(reg->gpu_alloc);
-exit:
-	return NULL;
+	kbase_va_region_alloc_get(kctx, reg);
+	kbase_mem_phy_alloc_get(alloc);
+	return err;
 }
 
-void kbase_unmap_external_resource(struct kbase_context *kctx,
-		struct kbase_va_region *reg, struct kbase_mem_phy_alloc *alloc)
+void kbase_unmap_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg)
 {
+	/* gpu_alloc was used in kbase_map_external_resources, so we need to use it for the
+	 * unmapping operation.
+	 */
+	struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc;
+
+	lockdep_assert_held(&kctx->reg_lock);
+
 	switch (alloc->type) {
 	case KBASE_MEM_TYPE_IMPORTED_UMM: {
 		kbase_mem_umm_unmap(kctx, reg, alloc);
@@ -4742,26 +5359,33 @@
 		if (alloc->imported.user_buf.current_mapping_usage_count == 0) {
 			bool writeable = true;
 
-			if (!kbase_is_region_invalid_or_free(reg) &&
-					reg->gpu_alloc == alloc)
-				kbase_mmu_teardown_pages(
-						kctx->kbdev,
-						&kctx->mmu,
-						reg->start_pfn,
-						kbase_reg_current_backed_size(reg),
-						kctx->as_nr);
+			if (!kbase_is_region_invalid_or_free(reg)) {
+				kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn,
+							 alloc->pages,
+							 kbase_reg_current_backed_size(reg),
+							 kbase_reg_current_backed_size(reg),
+							 kctx->as_nr, true);
+			}
 
-			if (reg && ((reg->flags & KBASE_REG_GPU_WR) == 0))
+			if ((reg->flags & (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR)) == 0)
 				writeable = false;
 
-			kbase_jd_user_buf_unmap(kctx, alloc, writeable);
+			kbase_jd_user_buf_unmap(kctx, alloc, reg, writeable);
 		}
-	}
+		}
 	break;
 	default:
-	break;
+		WARN(1, "Invalid external resource GPU allocation type (%x) on unmapping",
+		     alloc->type);
+		return;
 	}
 	kbase_mem_phy_alloc_put(alloc);
+	kbase_va_region_alloc_put(kctx, reg);
+}
+
+static inline u64 kbasep_get_va_gpu_addr(struct kbase_va_region *reg)
+{
+	return reg->start_pfn << PAGE_SHIFT;
 }
 
 struct kbase_ctx_ext_res_meta *kbase_sticky_resource_acquire(
@@ -4777,7 +5401,7 @@
 	 * metadata which matches the region which is being acquired.
 	 */
 	list_for_each_entry(walker, &kctx->ext_res_meta_head, ext_res_node) {
-		if (walker->gpu_addr == gpu_addr) {
+		if (kbasep_get_va_gpu_addr(walker->reg) == gpu_addr) {
 			meta = walker;
 			meta->ref++;
 			break;
@@ -4789,8 +5413,7 @@
 		struct kbase_va_region *reg;
 
 		/* Find the region */
-		reg = kbase_region_tracker_find_region_enclosing_address(
-				kctx, gpu_addr);
+		reg = kbase_region_tracker_find_region_enclosing_address(kctx, gpu_addr);
 		if (kbase_is_region_invalid_or_free(reg))
 			goto failed;
 
@@ -4798,18 +5421,18 @@
 		meta = kzalloc(sizeof(*meta), GFP_KERNEL);
 		if (!meta)
 			goto failed;
-
 		/*
 		 * Fill in the metadata object and acquire a reference
 		 * for the physical resource.
 		 */
-		meta->alloc = kbase_map_external_resource(kctx, reg, NULL);
-		meta->ref = 1;
+		meta->reg = reg;
 
-		if (!meta->alloc)
+		/* Map the external resource to the GPU allocation of the region
+		 * and acquire the reference to the VA region
+		 */
+		if (kbase_map_external_resource(kctx, meta->reg, NULL))
 			goto fail_map;
-
-		meta->gpu_addr = reg->start_pfn << PAGE_SHIFT;
+		meta->ref = 1;
 
 		list_add(&meta->ext_res_node, &kctx->ext_res_meta_head);
 	}
@@ -4834,7 +5457,7 @@
 	 * metadata which matches the region which is being released.
 	 */
 	list_for_each_entry(walker, &kctx->ext_res_meta_head, ext_res_node)
-		if (walker->gpu_addr == gpu_addr)
+		if (kbasep_get_va_gpu_addr(walker->reg) == gpu_addr)
 			return walker;
 
 	return NULL;
@@ -4843,14 +5466,7 @@
 static void release_sticky_resource_meta(struct kbase_context *kctx,
 		struct kbase_ctx_ext_res_meta *meta)
 {
-	struct kbase_va_region *reg;
-
-	/* Drop the physical memory reference and free the metadata. */
-	reg = kbase_region_tracker_find_region_enclosing_address(
-			kctx,
-			meta->gpu_addr);
-
-	kbase_unmap_external_resource(kctx, reg, meta->alloc);
+	kbase_unmap_external_resource(kctx, meta->reg);
 	list_del(&meta->ext_res_node);
 	kfree(meta);
 }

--
Gitblit v1.6.2