2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/block/kyber-iosched.c
@@ -1,20 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
  * scalable techniques.
  *
  * Copyright (C) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
 
 #include <linux/kernel.h>
@@ -29,19 +18,30 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+#define CREATE_TRACE_POINTS
+#include <trace/events/kyber.h>
+
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
         KYBER_READ,
-        KYBER_SYNC_WRITE,
-        KYBER_OTHER, /* Async writes, discard, etc. */
+        KYBER_WRITE,
+        KYBER_DISCARD,
+        KYBER_OTHER,
         KYBER_NUM_DOMAINS,
 };
 
-enum {
-        KYBER_MIN_DEPTH = 256,
+static const char *kyber_domain_names[] = {
+        [KYBER_READ] = "READ",
+        [KYBER_WRITE] = "WRITE",
+        [KYBER_DISCARD] = "DISCARD",
+        [KYBER_OTHER] = "OTHER",
+};
 
+enum {
         /*
          * In order to prevent starvation of synchronous requests by a flood of
          * asynchronous requests, we reserve 25% of requests for synchronous
@@ -51,25 +51,87 @@
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
         [KYBER_READ] = 256,
-        [KYBER_SYNC_WRITE] = 128,
-        [KYBER_OTHER] = 64,
+        [KYBER_WRITE] = 128,
+        [KYBER_DISCARD] = 64,
+        [KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+        [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
+        [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
+        [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
         [KYBER_READ] = 16,
-        [KYBER_SYNC_WRITE] = 8,
-        [KYBER_OTHER] = 8,
+        [KYBER_WRITE] = 8,
+        [KYBER_DISCARD] = 1,
+        [KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+        /*
+         * The width of the latency histogram buckets is
+         * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+         */
+        KYBER_LATENCY_SHIFT = 2,
+        /*
+         * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+         * thus, "good".
+         */
+        KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+        /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+        KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+        KYBER_TOTAL_LATENCY,
+        KYBER_IO_LATENCY,
+};
+
+static const char *kyber_latency_type_names[] = {
+        [KYBER_TOTAL_LATENCY] = "total",
+        [KYBER_IO_LATENCY] = "I/O",
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+        atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
@@ -88,12 +150,9 @@
 struct kyber_queue_data {
         struct request_queue *q;
 
-        struct blk_stat_callback *cb;
-
         /*
-         * The device is divided into multiple scheduling domains based on the
-         * request type. Each domain has a fixed number of in-flight requests of
-         * that type device-wide, limited by these tokens.
+         * Each scheduling domain has a limited number of in-flight requests
+         * device-wide, limited by these tokens.
          */
         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -103,8 +162,19 @@
          */
         unsigned int async_depth;
 
+        struct kyber_cpu_latency __percpu *cpu_latency;
+
+        /* Timer for stats aggregation and adjusting domain tokens. */
+        struct timer_list timer;
+
+        unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+        unsigned long latency_timeout[KYBER_OTHER];
+
+        int domain_p99[KYBER_OTHER];
+
         /* Target latencies in nanoseconds. */
-        u64 read_lat_nsec, write_lat_nsec;
+        u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -114,7 +184,7 @@
         unsigned int batching;
         struct kyber_ctx_queue *kcqs;
         struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
-        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+        struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
         struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
         atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
@@ -124,233 +194,219 @@
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-        if ((op & REQ_OP_MASK) == REQ_OP_READ)
+        switch (op & REQ_OP_MASK) {
+        case REQ_OP_READ:
                 return KYBER_READ;
-        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-                return KYBER_SYNC_WRITE;
-        else
+        case REQ_OP_WRITE:
+                return KYBER_WRITE;
+        case REQ_OP_DISCARD:
+                return KYBER_DISCARD;
+        default:
                 return KYBER_OTHER;
+        }
 }
 
-enum {
-        NONE = 0,
-        GOOD = 1,
-        GREAT = 2,
-        BAD = -1,
-        AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-                            unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+                                  struct kyber_cpu_latency *cpu_latency,
+                                  unsigned int sched_domain, unsigned int type)
 {
-        u64 latency;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+        unsigned int bucket;
 
-        if (!cb->stat[sched_domain].nr_samples)
-                return NONE;
-
-        latency = cb->stat[sched_domain].mean;
-        if (latency >= 2 * target)
-                return AWFUL;
-        else if (latency > target)
-                return BAD;
-        else if (latency <= target / 2)
-                return GREAT;
-        else /* (latency <= target) */
-                return GOOD;
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-                                  unsigned int sched_domain, int this_status,
-                                  int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int type,
+                                unsigned int percentile)
 {
-        unsigned int orig_depth, depth;
+        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+        unsigned int bucket, samples = 0, percentile_samples;
+
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+                samples += buckets[bucket];
+
+        if (!samples)
+                return -1;
 
         /*
-         * If this domain had no samples, or reads and writes are both good or
-         * both bad, don't adjust the depth.
+         * We do the calculation once we have 500 samples or one second passes
+         * since the first sample was recorded, whichever comes first.
          */
-        if (this_status == NONE ||
-            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-            (IS_BAD(this_status) && IS_BAD(other_status)))
-                return;
-
-        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
-
-        if (other_status == NONE) {
-                depth++;
-        } else {
-                switch (this_status) {
-                case GOOD:
-                        if (other_status == AWFUL)
-                                depth -= max(depth / 4, 1U);
-                        else
-                                depth -= max(depth / 8, 1U);
-                        break;
-                case GREAT:
-                        if (other_status == AWFUL)
-                                depth /= 2;
-                        else
-                                depth -= max(depth / 4, 1U);
-                        break;
-                case BAD:
-                        depth++;
-                        break;
-                case AWFUL:
-                        if (other_status == GREAT)
-                                depth += 2;
-                        else
-                                depth++;
-                        break;
-                }
+        if (!kqd->latency_timeout[sched_domain])
+                kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+        if (samples < 500 &&
+            time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+                return -1;
         }
+        kqd->latency_timeout[sched_domain] = 0;
 
+        percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+                if (buckets[bucket] >= percentile_samples)
+                        break;
+                percentile_samples -= buckets[bucket];
+        }
+        memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
+
+        trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
+                            kyber_latency_type_names[type], percentile,
+                            bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
+
+        return bucket;
+}
+
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+                                unsigned int sched_domain, unsigned int depth)
+{
         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-        if (depth != orig_depth)
+        if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+                trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
+                                   depth);
+        }
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-                                     int read_status, int write_status,
-                                     bool have_samples)
+static void kyber_timer_fn(struct timer_list *t)
 {
-        unsigned int orig_depth, depth;
-        int status;
+        struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+        unsigned int sched_domain;
+        int cpu;
+        bool bad = false;
 
-        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
+        /* Sum all of the per-cpu latency histograms. */
+        for_each_online_cpu(cpu) {
+                struct kyber_cpu_latency *cpu_latency;
 
-        if (read_status == NONE && write_status == NONE) {
-                depth += 2;
-        } else if (have_samples) {
-                if (read_status == NONE)
-                        status = write_status;
-                else if (write_status == NONE)
-                        status = read_status;
-                else
-                        status = max(read_status, write_status);
-                switch (status) {
-                case GREAT:
-                        depth += 2;
-                        break;
-                case GOOD:
-                        depth++;
-                        break;
-                case BAD:
-                        depth -= max(depth / 4, 1U);
-                        break;
-                case AWFUL:
-                        depth /= 2;
-                        break;
+                cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+                for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_TOTAL_LATENCY);
+                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                              KYBER_IO_LATENCY);
                 }
         }
 
-        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-        if (depth != orig_depth)
-                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
-}
+        /*
+         * Check if any domains have a high I/O latency, which might indicate
+         * congestion in the device. Note that we use the p90; we don't want to
+         * be too sensitive to outliers here.
+         */
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                int p90;
 
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
-{
-        struct kyber_queue_data *kqd = cb->data;
-        int read_status, write_status;
-
-        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
-
-        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-        kyber_adjust_other_depth(kqd, read_status, write_status,
-                                 cb->stat[KYBER_OTHER].nr_samples != 0);
+                p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+                                           90);
+                if (p90 >= KYBER_GOOD_BUCKETS)
+                        bad = true;
+        }
 
         /*
-         * Continue monitoring latencies if we aren't hitting the targets or
-         * we're still throttling other requests.
+         * Adjust the scheduling domain depths. If we determined that there was
+         * congestion, we throttle all domains with good latencies. Either way,
+         * we ease up on throttling domains with bad latencies.
          */
-        if (!blk_stat_is_active(kqd->cb) &&
-            ((IS_BAD(read_status) || IS_BAD(write_status) ||
-              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-                blk_stat_activate_msecs(kqd->cb, 100);
+        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                unsigned int orig_depth, depth;
+                int p99;
+
+                p99 = calculate_percentile(kqd, sched_domain,
+                                           KYBER_TOTAL_LATENCY, 99);
+                /*
+                 * This is kind of subtle: different domains will not
+                 * necessarily have enough samples to calculate the latency
+                 * percentiles during the same window, so we have to remember
+                 * the p99 for the next time we observe congestion; once we do,
+                 * we don't want to throttle again until we get more data, so we
+                 * reset it to -1.
+                 */
+                if (bad) {
+                        if (p99 < 0)
+                                p99 = kqd->domain_p99[sched_domain];
+                        kqd->domain_p99[sched_domain] = -1;
+                } else if (p99 >= 0) {
+                        kqd->domain_p99[sched_domain] = p99;
+                }
+                if (p99 < 0)
+                        continue;
+
+                /*
+                 * If this domain has bad latency, throttle less. Otherwise,
+                 * throttle more iff we determined that there is congestion.
+                 *
+                 * The new depth is scaled linearly with the p99 latency vs the
+                 * latency target. E.g., if the p99 is 3/4 of the target, then
+                 * we throttle down to 3/4 of the current depth, and if the p99
+                 * is 2x the target, then we double the depth.
+                 */
+                if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+                        orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+                        depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+                        kyber_resize_domain(kqd, sched_domain, depth);
+                }
+        }
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
         /*
          * All of the hardware queues have the same depth, so we can just grab
          * the shift of the first one.
          */
-        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-        return kyber_sched_domain(rq->cmd_flags);
+        return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
         struct kyber_queue_data *kqd;
-        unsigned int max_tokens;
         unsigned int shift;
         int ret = -ENOMEM;
         int i;
 
-        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+        kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
         if (!kqd)
                 goto err;
+
         kqd->q = q;
 
-        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-                                          KYBER_NUM_DOMAINS, kqd);
-        if (!kqd->cb)
+        kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+                                            GFP_KERNEL | __GFP_ZERO);
+        if (!kqd->cpu_latency)
                 goto err_kqd;
 
-        /*
-         * The maximum number of tokens for any scheduling domain is at least
-         * the queue depth of a single hardware queue. If the hardware doesn't
-         * have many tags, still provide a reasonable number.
-         */
-        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
-                           KYBER_MIN_DEPTH);
+        timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 WARN_ON(!kyber_depth[i]);
                 WARN_ON(!kyber_batch_size[i]);
                 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
-                                              max_tokens, -1, false, GFP_KERNEL,
-                                              q->node);
+                                              kyber_depth[i], -1, false,
+                                              GFP_KERNEL, q->node);
                 if (ret) {
                         while (--i >= 0)
                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-                        goto err_cb;
+                        goto err_buckets;
                 }
-                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
         }
 
-        shift = kyber_sched_tags_shift(kqd);
-        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+        for (i = 0; i < KYBER_OTHER; i++) {
+                kqd->domain_p99[i] = -1;
+                kqd->latency_targets[i] = kyber_latency_targets[i];
+        }
 
-        kqd->read_lat_nsec = 2000000ULL;
-        kqd->write_lat_nsec = 10000000ULL;
+        shift = kyber_sched_tags_shift(q);
+        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
         return kqd;
 
-err_cb:
-        blk_stat_free_callback(kqd->cb);
+err_buckets:
+        free_percpu(kqd->cpu_latency);
 err_kqd:
         kfree(kqd);
 err:
@@ -372,10 +428,10 @@
                 return PTR_ERR(kqd);
         }
 
+        blk_stat_enable_accounting(q);
+
         eq->elevator_data = kqd;
         q->elevator = eq;
-
-        blk_stat_add_callback(q, kqd->cb);
 
         return 0;
 }
@@ -383,14 +439,13 @@
 static void kyber_exit_sched(struct elevator_queue *e)
 {
         struct kyber_queue_data *kqd = e->elevator_data;
-        struct request_queue *q = kqd->q;
         int i;
 
-        blk_stat_remove_callback(q, kqd->cb);
+        del_timer_sync(&kqd->timer);
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                 sbitmap_queue_free(&kqd->domain_tokens[i]);
-        blk_stat_free_callback(kqd->cb);
+        free_percpu(kqd->cpu_latency);
         kfree(kqd);
 }
 
@@ -435,10 +490,11 @@
 
         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                 INIT_LIST_HEAD(&khd->rqs[i]);
-                init_waitqueue_func_entry(&khd->domain_wait[i],
+                khd->domain_wait[i].sbq = NULL;
+                init_waitqueue_func_entry(&khd->domain_wait[i].wait,
                                           kyber_domain_wake);
-                khd->domain_wait[i].private = hctx;
-                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
+                khd->domain_wait[i].wait.private = hctx;
+                INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
                 atomic_set(&khd->wait_index[i], 0);
         }
 
@@ -446,7 +502,7 @@
         khd->batching = 0;
 
         hctx->sched_data = khd;
-        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+        sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
                                         kqd->async_depth);
 
         return 0;
@@ -506,24 +562,25 @@
         }
 }
 
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
+                            unsigned int nr_segs)
 {
+        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
         struct kyber_hctx_data *khd = hctx->sched_data;
-        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
-        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
         unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
         struct list_head *rq_list = &kcq->rq_list[sched_domain];
         bool merged;
 
         spin_lock(&kcq->lock);
-        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+        merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
         spin_unlock(&kcq->lock);
-        blk_mq_put_ctx(ctx);
 
         return merged;
 }
 
-static void kyber_prepare_request(struct request *rq, struct bio *bio)
+static void kyber_prepare_request(struct request *rq)
 {
         rq_set_domain_token(rq, -1);
 }
@@ -536,7 +593,7 @@
 
         list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
-                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
                 struct list_head *head = &kcq->rq_list[sched_domain];
 
                 spin_lock(&kcq->lock);
@@ -545,7 +602,7 @@
                 else
                         list_move_tail(&rq->queuelist, head);
                 sbitmap_set_bit(&khd->kcq_map[sched_domain],
-                                rq->mq_ctx->index_hw);
+                                rq->mq_ctx->index_hw[hctx->type]);
                 blk_mq_sched_request_inserted(rq);
                 spin_unlock(&kcq->lock);
         }
@@ -558,41 +615,44 @@
         rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+                               unsigned int sched_domain, unsigned int type,
+                               u64 target, u64 latency)
 {
-        struct request_queue *q = rq->q;
-        struct kyber_queue_data *kqd = q->elevator->elevator_data;
-        unsigned int sched_domain;
-        u64 now, latency, target;
+        unsigned int bucket;
+        u64 divisor;
 
-        /*
-         * Check if this request met our latency goal. If not, quickly gather
-         * some statistics and start throttling.
-         */
-        sched_domain = kyber_sched_domain(rq->cmd_flags);
-        switch (sched_domain) {
-        case KYBER_READ:
-                target = kqd->read_lat_nsec;
-                break;
-        case KYBER_SYNC_WRITE:
-                target = kqd->write_lat_nsec;
-                break;
-        default:
-                return;
+        if (latency > 0) {
+                divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+                bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+                               KYBER_LATENCY_BUCKETS - 1);
+        } else {
+                bucket = 0;
         }
 
-        /* If we are already monitoring latencies, don't check again. */
-        if (blk_stat_is_active(kqd->cb))
+        atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
+
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+        struct kyber_cpu_latency *cpu_latency;
+        unsigned int sched_domain;
+        u64 target;
+
+        sched_domain = kyber_sched_domain(rq->cmd_flags);
+        if (sched_domain == KYBER_OTHER)
                 return;
 
-        now = ktime_get_ns();
-        if (now < rq->io_start_time_ns)
-                return;
+        cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+        target = kqd->latency_targets[sched_domain];
+        add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+                           target, now - rq->start_time_ns);
+        add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+                           now - rq->io_start_time_ns);
+        put_cpu_ptr(kqd->cpu_latency);
 
-        latency = now - rq->io_start_time_ns;
-
-        if (latency > target)
-                blk_stat_activate_msecs(kqd->cb, 10);
+        timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
 
 struct flush_kcq_data {
@@ -629,12 +689,13 @@
                                   flush_busy_kcq, &data);
 }
 
-static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
                              void *key)
 {
-        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
+        struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
+        struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
 
-        list_del_init(&wait->entry);
+        sbitmap_del_wait_queue(wait);
         blk_mq_run_hw_queue(hctx, true);
         return 1;
 }
@@ -645,7 +706,7 @@
 {
         unsigned int sched_domain = khd->cur_domain;
         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
-        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
+        struct sbq_wait *wait = &khd->domain_wait[sched_domain];
         struct sbq_wait_state *ws;
         int nr;
 
@@ -656,11 +717,11 @@
          * run when one becomes available. Note that this is serialized on
          * khd->lock, but we still need to be careful about the waker.
          */
-        if (nr < 0 && list_empty_careful(&wait->entry)) {
+        if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
                 ws = sbq_wait_ptr(domain_tokens,
                                   &khd->wait_index[sched_domain]);
                 khd->domain_ws[sched_domain] = ws;
-                add_wait_queue(&ws->wait, wait);
+                sbitmap_add_wait_queue(domain_tokens, ws, wait);
 
                 /*
                  * Try again in case a token was freed before we got on the wait
@@ -676,10 +737,10 @@
          * between the !list_empty_careful() check and us grabbing the lock, but
          * list_del_init() is okay with that.
          */
-        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+        if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
                 ws = khd->domain_ws[sched_domain];
                 spin_lock_irq(&ws->wait.lock);
-                list_del_init(&wait->entry);
+                sbitmap_del_wait_queue(wait);
                 spin_unlock_irq(&ws->wait.lock);
         }
 
@@ -713,6 +774,9 @@
                         rq_set_domain_token(rq, nr);
                         list_del_init(&rq->queuelist);
                         return rq;
+                } else {
+                        trace_kyber_throttled(kqd->q,
+                                              kyber_domain_names[khd->cur_domain]);
                 }
         } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                 nr = kyber_get_domain_token(kqd, khd, hctx);
@@ -723,6 +787,9 @@
                         rq_set_domain_token(rq, nr);
                         list_del_init(&rq->queuelist);
                         return rq;
+                } else {
+                        trace_kyber_throttled(kqd->q,
+                                              kyber_domain_names[khd->cur_domain]);
                 }
         }
 
@@ -790,17 +857,17 @@
         return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op) \
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
-                                     char *page) \
+#define KYBER_LAT_SHOW_STORE(domain, name) \
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
+                                       char *page) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
         \
-        return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+        return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
 } \
 \
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
-                                      const char *page, size_t count) \
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
+                                        const char *page, size_t count) \
 { \
         struct kyber_queue_data *kqd = e->elevator_data; \
         unsigned long long nsec; \
@@ -810,12 +877,12 @@
         if (ret) \
                 return ret; \
         \
-        kqd->op##_lat_nsec = nsec; \
+        kqd->latency_targets[domain] = nsec; \
         \
         return count; \
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -876,13 +943,14 @@
 { \
         struct blk_mq_hw_ctx *hctx = data; \
         struct kyber_hctx_data *khd = hctx->sched_data; \
-        wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
+        wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
         \
         seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
         return 0; \
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -900,20 +968,7 @@
         struct blk_mq_hw_ctx *hctx = data;
         struct kyber_hctx_data *khd = hctx->sched_data;
 
-        switch (khd->cur_domain) {
-        case KYBER_READ:
-                seq_puts(m, "READ\n");
-                break;
-        case KYBER_SYNC_WRITE:
-                seq_puts(m, "SYNC_WRITE\n");
-                break;
-        case KYBER_OTHER:
-                seq_puts(m, "OTHER\n");
-                break;
-        default:
-                seq_printf(m, "%u\n", khd->cur_domain);
-                break;
-        }
+        seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
         return 0;
 }
 
@@ -930,7 +985,8 @@
         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
         KYBER_QUEUE_DOMAIN_ATTRS(read),
-        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+        KYBER_QUEUE_DOMAIN_ATTRS(write),
+        KYBER_QUEUE_DOMAIN_ATTRS(discard),
         KYBER_QUEUE_DOMAIN_ATTRS(other),
         {"async_depth", 0400, kyber_async_depth_show},
         {},
@@ -942,7 +998,8 @@
         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
         KYBER_HCTX_DOMAIN_ATTRS(read),
-        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+        KYBER_HCTX_DOMAIN_ATTRS(write),
+        KYBER_HCTX_DOMAIN_ATTRS(discard),
         KYBER_HCTX_DOMAIN_ATTRS(other),
         {"cur_domain", 0400, kyber_cur_domain_show},
         {"batching", 0400, kyber_batching_show},
@@ -952,7 +1009,7 @@
 #endif
 
 static struct elevator_type kyber_sched = {
-        .ops.mq = {
+        .ops = {
                 .init_sched = kyber_init_sched,
                 .exit_sched = kyber_exit_sched,
                 .init_hctx = kyber_init_hctx,
@@ -967,13 +1024,13 @@
                 .dispatch_request = kyber_dispatch_request,
                 .has_work = kyber_has_work,
         },
-        .uses_mq = true,
 #ifdef CONFIG_BLK_DEBUG_FS
         .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
         .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 #endif
         .elevator_attrs = kyber_sched_attrs,
         .elevator_name = "kyber",
+        .elevator_features = ELEVATOR_F_MQ_AWARE,
         .elevator_owner = THIS_MODULE,
 };
 