From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/drivers/hv/hv_balloon.c | 338 ++++++++++++++++++++++++++++++++++----------------------
1 files changed, 206 insertions(+), 132 deletions(-)
diff --git a/kernel/drivers/hv/hv_balloon.c b/kernel/drivers/hv/hv_balloon.c
index e5fc719..6a71699 100644
--- a/kernel/drivers/hv/hv_balloon.c
+++ b/kernel/drivers/hv/hv_balloon.c
@@ -1,19 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2012, Microsoft Corporation.
*
* Author:
* K. Y. Srinivasan <kys@microsoft.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,6 +23,9 @@
#include <linux/percpu_counter.h>
#include <linux/hyperv.h>
+#include <asm/hyperv-tlfs.h>
+
+#include <asm/mshyperv.h>
#define CREATE_TRACE_POINTS
#include "hv_trace_balloon.h"
@@ -351,8 +344,6 @@
*
* mem_range: Memory range to hot add.
*
- * On Linux we currently don't support this since we cannot hot add
- * arbitrary granularity of memory.
*/
struct dm_hot_add {
@@ -467,6 +458,7 @@
struct work_struct wrk;
};
+static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
@@ -487,7 +479,7 @@
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);
-static int dm_ring_size = (5 * PAGE_SIZE);
+static int dm_ring_size = 20 * 1024;
/*
* Driver specific state.
@@ -503,10 +495,10 @@
};
-static __u8 recv_buffer[PAGE_SIZE];
-static __u8 *send_buffer;
-#define PAGES_IN_2M 512
-#define HA_CHUNK (32 * 1024)
+static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
+static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
+#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
+#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
struct hv_dynmem_device {
struct hv_device *dev;
@@ -541,7 +533,6 @@
* State to synchronize hot-add.
*/
struct completion ol_waitevent;
- bool ha_waiting;
/*
* This thread handles hot-add
* requests from the host as well as notifying
@@ -642,10 +633,7 @@
switch (val) {
case MEM_ONLINE:
case MEM_CANCEL_ONLINE:
- if (dm_device.ha_waiting) {
- dm_device.ha_waiting = false;
- complete(&dm_device.ol_waitevent);
- }
+ complete(&dm_device.ol_waitevent);
break;
case MEM_OFFLINE:
@@ -681,15 +669,18 @@
/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
- if (!has_pfn_is_backed(has, page_to_pfn(pg)))
+ if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
+ if (!PageOffline(pg))
+ __SetPageOffline(pg);
return;
+ }
+ if (PageOffline(pg))
+ __ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
- __online_page_set_limits(pg);
- __online_page_increment_counters(pg);
- __online_page_free(pg);
+ generic_online_page(pg, 0);
- WARN_ON_ONCE(!spin_is_locked(&dm_device.ha_lock));
+ lockdep_assert_held(&dm_device.ha_lock);
dm_device.num_pages_onlined++;
}
@@ -731,12 +722,11 @@
has->covered_end_pfn += processed_pfn;
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
- init_completion(&dm_device.ol_waitevent);
- dm_device.ha_waiting = !memhp_auto_online;
+ reinit_completion(&dm_device.ol_waitevent);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
- (HA_CHUNK << PAGE_SHIFT));
+ (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
@@ -758,20 +748,19 @@
}
/*
- * Wait for the memory block to be onlined when memory onlining
- * is done outside of kernel (memhp_auto_online). Since the hot
- * add has succeeded, it is ok to proceed even if the pages in
- * the hot added region have not been "onlined" within the
- * allowed time.
+ * Wait for memory to get onlined. If the kernel onlined the
+ * memory when adding it, this will return directly. Otherwise,
+ * it will wait for user space to online the memory. This helps
+ * to avoid adding memory faster than it is getting onlined. As
+ * adding succeeded, it is ok to proceed even if the memory was
+ * not onlined in time.
*/
- if (dm_device.ha_waiting)
- wait_for_completion_timeout(&dm_device.ol_waitevent,
- 5*HZ);
+ wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
post_status(&dm_device);
}
}
-static void hv_online_page(struct page *pg)
+static void hv_online_page(struct page *pg, unsigned int order)
{
struct hv_hotadd_state *has;
unsigned long flags;
@@ -780,10 +769,11 @@
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
/* The page belongs to a different HAS. */
- if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
+ if ((pfn < has->start_pfn) ||
+ (pfn + (1UL << order) > has->end_pfn))
continue;
- hv_page_online_one(has, pg);
+ hv_bring_pgs_online(has, pfn, 1UL << order);
break;
}
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
@@ -1057,8 +1047,12 @@
else
resp.result = 0;
- if (!do_hot_add || (resp.page_count == 0))
- pr_err("Memory hot add failed\n");
+ if (!do_hot_add || resp.page_count == 0) {
+ if (!allow_hibernation)
+ pr_err("Memory hot add failed\n");
+ else
+ pr_info("Ignore hot-add request!\n");
+ }
dm->state = DM_INITIALIZED;
resp.hdr.trans_id = atomic_inc_return(&trans_id);
@@ -1080,7 +1074,7 @@
__u64 *max_page_count = (__u64 *)&info_hdr[1];
pr_info("Max. dynamic memory size: %llu MB\n",
- (*max_page_count) >> (20 - PAGE_SHIFT));
+ (*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
}
break;
@@ -1092,6 +1086,7 @@
static unsigned long compute_balloon_floor(void)
{
unsigned long min_pages;
+ unsigned long nr_pages = totalram_pages();
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
/* Simple continuous piecewiese linear function:
* max MiB -> min MiB gradient
@@ -1104,16 +1099,16 @@
* 8192 744 (1/16)
* 32768 1512 (1/32)
*/
- if (totalram_pages < MB2PAGES(128))
- min_pages = MB2PAGES(8) + (totalram_pages >> 1);
- else if (totalram_pages < MB2PAGES(512))
- min_pages = MB2PAGES(40) + (totalram_pages >> 2);
- else if (totalram_pages < MB2PAGES(2048))
- min_pages = MB2PAGES(104) + (totalram_pages >> 3);
- else if (totalram_pages < MB2PAGES(8192))
- min_pages = MB2PAGES(232) + (totalram_pages >> 4);
+ if (nr_pages < MB2PAGES(128))
+ min_pages = MB2PAGES(8) + (nr_pages >> 1);
+ else if (nr_pages < MB2PAGES(512))
+ min_pages = MB2PAGES(40) + (nr_pages >> 2);
+ else if (nr_pages < MB2PAGES(2048))
+ min_pages = MB2PAGES(104) + (nr_pages >> 3);
+ else if (nr_pages < MB2PAGES(8192))
+ min_pages = MB2PAGES(232) + (nr_pages >> 4);
else
- min_pages = MB2PAGES(488) + (totalram_pages >> 5);
+ min_pages = MB2PAGES(488) + (nr_pages >> 5);
#undef MB2PAGES
return min_pages;
}
@@ -1200,6 +1195,7 @@
for (i = 0; i < num_pages; i++) {
pg = pfn_to_page(i + start_frame);
+ __ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
}
@@ -1212,12 +1208,12 @@
struct dm_balloon_response *bl_resp,
int alloc_unit)
{
- unsigned int i = 0;
+ unsigned int i, j;
struct page *pg;
for (i = 0; i < num_pages / alloc_unit; i++) {
if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
- PAGE_SIZE)
+ HV_HYP_PAGE_SIZE)
return i * alloc_unit;
/*
@@ -1240,6 +1236,10 @@
if (alloc_unit != 1)
split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
+
+ /* mark all pages offline */
+ for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
+ __SetPageOffline(pg + j);
bl_resp->range_count++;
bl_resp->range_array[i].finfo.start_page =
@@ -1266,9 +1266,9 @@
/*
* We will attempt 2M allocations. However, if we fail to
- * allocate 2M chunks, we will go back to 4k allocations.
+ * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
*/
- alloc_unit = 512;
+ alloc_unit = PAGES_IN_2M;
avail_pages = si_mem_available();
floor = compute_balloon_floor();
@@ -1283,8 +1283,8 @@
}
while (!done) {
- bl_resp = (struct dm_balloon_response *)send_buffer;
- memset(send_buffer, 0, PAGE_SIZE);
+ memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
+ bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
bl_resp->hdr.type = DM_BALLOON_RESPONSE;
bl_resp->hdr.size = sizeof(struct dm_balloon_response);
bl_resp->more_pages = 1;
@@ -1482,7 +1482,7 @@
memset(recv_buffer, 0, sizeof(recv_buffer));
vmbus_recvpacket(dev->channel, recv_buffer,
- PAGE_SIZE, &recvlen, &requestid);
+ HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (recvlen > 0) {
dm_msg = (struct dm_message *)recv_buffer;
@@ -1500,6 +1500,11 @@
break;
case DM_BALLOON_REQUEST:
+ if (allow_hibernation) {
+ pr_info("Ignore balloon-up request!\n");
+ break;
+ }
+
if (dm->state == DM_BALLOON_UP)
pr_warn("Currently ballooning\n");
bal_msg = (struct dm_balloon *)recv_buffer;
@@ -1509,6 +1514,11 @@
break;
case DM_UNBALLOON_REQUEST:
+ if (allow_hibernation) {
+ pr_info("Ignore balloon-down request!\n");
+ break;
+ }
+
dm->state = DM_BALLOON_DOWN;
balloon_down(dm,
(struct dm_unballoon_request *)recv_buffer);
@@ -1548,65 +1558,25 @@
break;
default:
- pr_warn("Unhandled message: type: %d\n", dm_hdr->type);
+ pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
}
}
}
-static int balloon_probe(struct hv_device *dev,
- const struct hv_vmbus_device_id *dev_id)
+static int balloon_connect_vsp(struct hv_device *dev)
{
- int ret;
- unsigned long t;
struct dm_version_request version_req;
struct dm_capabilities cap_msg;
-
-#ifdef CONFIG_MEMORY_HOTPLUG
- do_hot_add = hot_add;
-#else
- do_hot_add = false;
-#endif
-
- /*
- * First allocate a send buffer.
- */
-
- send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!send_buffer)
- return -ENOMEM;
+ unsigned long t;
+ int ret;
ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
- balloon_onchannelcallback, dev);
-
+ balloon_onchannelcallback, dev);
if (ret)
- goto probe_error0;
+ return ret;
- dm_device.dev = dev;
- dm_device.state = DM_INITIALIZING;
- dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
- init_completion(&dm_device.host_event);
- init_completion(&dm_device.config_event);
- INIT_LIST_HEAD(&dm_device.ha_region_list);
- spin_lock_init(&dm_device.ha_lock);
- INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
- INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
- dm_device.host_specified_ha_region = false;
-
- dm_device.thread =
- kthread_run(dm_thread_func, &dm_device, "hv_balloon");
- if (IS_ERR(dm_device.thread)) {
- ret = PTR_ERR(dm_device.thread);
- goto probe_error1;
- }
-
-#ifdef CONFIG_MEMORY_HOTPLUG
- set_online_page_callback(&hv_online_page);
- register_memory_notifier(&hv_memory_nb);
-#endif
-
- hv_set_drvdata(dev, &dm_device);
/*
* Initiate the hand shake with the host and negotiate
* a version that the host can support. We start with the
@@ -1622,16 +1592,15 @@
dm_device.version = version_req.version.version;
ret = vmbus_sendpacket(dev->channel, &version_req,
- sizeof(struct dm_version_request),
- (unsigned long)NULL,
- VM_PKT_DATA_INBAND, 0);
+ sizeof(struct dm_version_request),
+ (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
if (ret)
- goto probe_error2;
+ goto out;
t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
if (t == 0) {
ret = -ETIMEDOUT;
- goto probe_error2;
+ goto out;
}
/*
@@ -1639,8 +1608,8 @@
* fail the probe function.
*/
if (dm_device.state == DM_INIT_ERROR) {
- ret = -ETIMEDOUT;
- goto probe_error2;
+ ret = -EPROTO;
+ goto out;
}
pr_info("Using Dynamic Memory protocol version %u.%u\n",
@@ -1655,6 +1624,11 @@
cap_msg.hdr.size = sizeof(struct dm_capabilities);
cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
+ /*
+ * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
+ * currently still requires the bits to be set, so we have to add code
+ * to fail the host's hot-add and balloon up/down requests, if any.
+ */
cap_msg.caps.cap_bits.balloon = 1;
cap_msg.caps.cap_bits.hot_add = 1;
@@ -1673,16 +1647,15 @@
cap_msg.max_page_number = -1;
ret = vmbus_sendpacket(dev->channel, &cap_msg,
- sizeof(struct dm_capabilities),
- (unsigned long)NULL,
- VM_PKT_DATA_INBAND, 0);
+ sizeof(struct dm_capabilities),
+ (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
if (ret)
- goto probe_error2;
+ goto out;
t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
if (t == 0) {
ret = -ETIMEDOUT;
- goto probe_error2;
+ goto out;
}
/*
@@ -1690,25 +1663,72 @@
* fail the probe function.
*/
if (dm_device.state == DM_INIT_ERROR) {
- ret = -ETIMEDOUT;
- goto probe_error2;
+ ret = -EPROTO;
+ goto out;
}
+ return 0;
+out:
+ vmbus_close(dev->channel);
+ return ret;
+}
+
+static int balloon_probe(struct hv_device *dev,
+ const struct hv_vmbus_device_id *dev_id)
+{
+ int ret;
+
+ allow_hibernation = hv_is_hibernation_supported();
+ if (allow_hibernation)
+ hot_add = false;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ do_hot_add = hot_add;
+#else
+ do_hot_add = false;
+#endif
+ dm_device.dev = dev;
+ dm_device.state = DM_INITIALIZING;
+ dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
+ init_completion(&dm_device.host_event);
+ init_completion(&dm_device.config_event);
+ INIT_LIST_HEAD(&dm_device.ha_region_list);
+ spin_lock_init(&dm_device.ha_lock);
+ INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
+ INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
+ dm_device.host_specified_ha_region = false;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ set_online_page_callback(&hv_online_page);
+ init_completion(&dm_device.ol_waitevent);
+ register_memory_notifier(&hv_memory_nb);
+#endif
+
+ hv_set_drvdata(dev, &dm_device);
+
+ ret = balloon_connect_vsp(dev);
+ if (ret != 0)
+ return ret;
+
dm_device.state = DM_INITIALIZED;
- last_post_time = jiffies;
+
+ dm_device.thread =
+ kthread_run(dm_thread_func, &dm_device, "hv_balloon");
+ if (IS_ERR(dm_device.thread)) {
+ ret = PTR_ERR(dm_device.thread);
+ goto probe_error;
+ }
return 0;
-probe_error2:
+probe_error:
+ dm_device.state = DM_INIT_ERROR;
+ dm_device.thread = NULL;
+ vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&hv_memory_nb);
restore_online_page_callback(&hv_online_page);
#endif
- kthread_stop(dm_device.thread);
-
-probe_error1:
- vmbus_close(dev->channel);
-probe_error0:
- kfree(send_buffer);
return ret;
}
@@ -1725,12 +1745,11 @@
cancel_work_sync(&dm->balloon_wrk.wrk);
cancel_work_sync(&dm->ha_wrk.wrk);
- vmbus_close(dev->channel);
kthread_stop(dm->thread);
- kfree(send_buffer);
+ vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
- restore_online_page_callback(&hv_online_page);
unregister_memory_notifier(&hv_memory_nb);
+ restore_online_page_callback(&hv_online_page);
#endif
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
@@ -1744,6 +1763,59 @@
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
return 0;
+}
+
+static int balloon_suspend(struct hv_device *hv_dev)
+{
+ struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev);
+
+ tasklet_disable(&hv_dev->channel->callback_event);
+
+ cancel_work_sync(&dm->balloon_wrk.wrk);
+ cancel_work_sync(&dm->ha_wrk.wrk);
+
+ if (dm->thread) {
+ kthread_stop(dm->thread);
+ dm->thread = NULL;
+ vmbus_close(hv_dev->channel);
+ }
+
+ tasklet_enable(&hv_dev->channel->callback_event);
+
+ return 0;
+
+}
+
+static int balloon_resume(struct hv_device *dev)
+{
+ int ret;
+
+ dm_device.state = DM_INITIALIZING;
+
+ ret = balloon_connect_vsp(dev);
+
+ if (ret != 0)
+ goto out;
+
+ dm_device.thread =
+ kthread_run(dm_thread_func, &dm_device, "hv_balloon");
+ if (IS_ERR(dm_device.thread)) {
+ ret = PTR_ERR(dm_device.thread);
+ dm_device.thread = NULL;
+ goto close_channel;
+ }
+
+ dm_device.state = DM_INITIALIZED;
+ return 0;
+close_channel:
+ vmbus_close(dev->channel);
+out:
+ dm_device.state = DM_INIT_ERROR;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&hv_memory_nb);
+ restore_online_page_callback(&hv_online_page);
+#endif
+ return ret;
}
static const struct hv_vmbus_device_id id_table[] = {
@@ -1760,6 +1832,8 @@
.id_table = id_table,
.probe = balloon_probe,
.remove = balloon_remove,
+ .suspend = balloon_suspend,
+ .resume = balloon_resume,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
--
Gitblit v1.6.2