@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
  *
- * Jan 23 2005 Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
  */

 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
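As a rough sense of scale for the figures quoted in the new header: for n = 2^20 elements, n*log2(n) is about 21.0 million comparisons, the +0.37*n term adds roughly 0.4 million, and glibc qsort()'s -1.26*n subtracts about 1.3 million, so the 1.63*n gap works out to roughly 1.7 million comparisons, or about 8% of the total at that size.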
@@ -11,96 +16,257 @@
 #include <linux/export.h>
 #include <linux/sort.h>

-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-        return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-                ((unsigned long)base & (align - 1)) == 0;
-}
+        unsigned char lsbits = (unsigned char)size;

-static void u32_swap(void *a, void *b, int size)
-{
-        u32 t = *(u32 *)a;
-        *(u32 *)a = *(u32 *)b;
-        *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-        u64 t = *(u64 *)a;
-        *(u64 *)a = *(u64 *)b;
-        *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-        char t;
-
-        do {
-                t = *(char *)a;
-                *(char *)a++ = *(char *)b;
-                *(char *)b++ = t;
-        } while (--size > 0);
+        (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+        lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+        return (lsbits & (align - 1)) == 0;
 }
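The effect of is_aligned() is easier to see with a small standalone sketch. The program below is a userspace reimplementation for illustration only (it hard-codes the !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS branch); the values it prints show which copy width a given base/size pair permits, and hence which of the built-in swap helpers introduced further down the patch would be selected when no swap_func is supplied.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copy of the test, assuming no efficient unaligned access. */
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
        unsigned char lsbits = (unsigned char)size;

        lsbits |= (unsigned char)(uintptr_t)base;
        return (lsbits & (align - 1)) == 0;
}

int main(void)
{
        static int data[16] __attribute__((aligned(8)));

        /* 8-byte elements at an 8-byte-aligned base: 64-bit word copies are fine. */
        printf("%d\n", is_aligned(data, 8, 8));         /* 1 */
        /* 12-byte elements: not a multiple of 8, but a multiple of 4. */
        printf("%d\n", is_aligned(data, 12, 8));        /* 0 */
        printf("%d\n", is_aligned(data, 12, 4));        /* 1 */
        /* An odd size forces the byte-at-a-time fallback. */
        printf("%d\n", is_aligned(data, 7, 4));         /* 0 */
        return 0;
}
```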

 /**
- * sort - sort an array of elements
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+        do {
+                u32 t = *(u32 *)(a + (n -= 4));
+                *(u32 *)(a + n) = *(u32 *)(b + n);
+                *(u32 *)(b + n) = t;
+        } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+        do {
+#ifdef CONFIG_64BIT
+                u64 t = *(u64 *)(a + (n -= 8));
+                *(u64 *)(a + n) = *(u64 *)(b + n);
+                *(u64 *)(b + n) = t;
+#else
+                /* Use two 32-bit transfers to avoid base+index+4 addressing */
+                u32 t = *(u32 *)(a + (n -= 4));
+                *(u32 *)(a + n) = *(u32 *)(b + n);
+                *(u32 *)(b + n) = t;
+
+                t = *(u32 *)(a + (n -= 4));
+                *(u32 *)(a + n) = *(u32 *)(b + n);
+                *(u32 *)(b + n) = t;
+#endif
+        } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+        do {
+                char t = ((char *)a)[--n];
+                ((char *)a)[n] = ((char *)b)[n];
+                ((char *)b)[n] = t;
+        } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+        if (swap_func == SWAP_WORDS_64)
+                swap_words_64(a, b, size);
+        else if (swap_func == SWAP_WORDS_32)
+                swap_words_32(a, b, size);
+        else if (swap_func == SWAP_BYTES)
+                swap_bytes(a, b, size);
+        else
+                swap_func(a, b, (int)size);
+}
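do_swap() above dispatches either to one of the built-in swappers or to a caller-supplied routine. The case for supplying one is when a plain memory exchange is not enough, e.g. when each element carries data tied to its position. The sketch below is purely illustrative (struct item, item_cmp and item_swap are hypothetical names, not kernel APIs): each element caches its own array slot, and the custom swap_func keeps that field correct as the sort moves elements around.

```c
#include <linux/sort.h>

/* Hypothetical element: a payload plus a cached copy of its array index. */
struct item {
        int key;
        int slot;               /* this element's current position in the array */
};

static int item_cmp(const void *a, const void *b)
{
        const struct item *x = a, *y = b;

        if (x->key < y->key)
                return -1;
        return x->key > y->key;
}

/* Exchange two items, then restore the position-tied slot fields. */
static void item_swap(void *a, void *b, int size)
{
        struct item *x = a, *y = b, tmp;
        int t;

        tmp = *x;
        *x = *y;
        *y = tmp;

        /* The slots describe positions, not payloads, so swap them back. */
        t = x->slot;
        x->slot = y->slot;
        y->slot = t;
}
```

A caller would then invoke something like sort(items, nitems, sizeof(*items), item_cmp, item_swap); so that the fix-up runs on every exchange.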
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+        if (cmp == _CMP_WRAPPER)
+                return ((cmp_func_t)(priv))(a, b);
+        return cmp(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought. Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2. But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit. That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+        i -= size;
+        i -= size & -(i & lsbit);
+        return i / 2;
+}
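A concrete case may help here. The snippet below is a standalone userspace copy of parent() for illustration only (the real function is static to lib/sort.c); it checks the two situations the comment describes, using 12-byte elements, for which lsbit = size & -size = 4.

```c
#include <assert.h>
#include <stddef.h>

/* Standalone copy of parent(), for illustration only. */
static size_t parent(size_t i, unsigned int lsbit, size_t size)
{
        i -= size;
        i -= size & -(i & lsbit);
        return i / 2;
}

int main(void)
{
        const size_t size = 12;                         /* element size in bytes */
        const unsigned int lsbit = size & -size;        /* = 4 */

        /*
         * Child at index 5 (byte offset 60): i -= size gives 48, an even
         * multiple of size, so nothing more is subtracted; 48 / 2 = 24,
         * which is index 2 = (5 - 1) / 2.
         */
        assert(parent(5 * size, lsbit, size) == 2 * size);

        /*
         * Child at index 6 (byte offset 72): i -= size gives 60, an odd
         * multiple of size (60 & 4 != 0), so size is subtracted again,
         * giving 48; 48 / 2 = 24, again index 2 = (6 - 1) / 2.
         */
        assert(parent(6 * size, lsbit, size) == 2 * size);
        return 0;
}
```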
+
+/**
+ * sort_r - sort an array of elements
  * @base: pointer to data to sort
  * @num: number of elements
  * @size: size of each element
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array. You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
-void sort(void *base, size_t num, size_t size,
-          int (*cmp_func)(const void *, const void *),
-          void (*swap_func)(void *, void *, int size))
+void sort_r(void *base, size_t num, size_t size,
+            cmp_r_func_t cmp_func,
+            swap_func_t swap_func,
+            const void *priv)
 {
         /* pre-scale counters for performance */
-        int i = (num/2 - 1) * size, n = num * size, c, r;
+        size_t n = num * size, a = (num/2) * size;
+        const unsigned int lsbit = size & -size;        /* Used to find parent */
+
+        if (!a)         /* num < 2 || size == 0 */
+                return;

         if (!swap_func) {
-                if (size == 4 && alignment_ok(base, 4))
-                        swap_func = u32_swap;
-                else if (size == 8 && alignment_ok(base, 8))
-                        swap_func = u64_swap;
+                if (is_aligned(base, size, 8))
+                        swap_func = SWAP_WORDS_64;
+                else if (is_aligned(base, size, 4))
+                        swap_func = SWAP_WORDS_32;
                 else
-                        swap_func = generic_swap;
+                        swap_func = SWAP_BYTES;
         }

-        /* heapify */
-        for ( ; i >= 0; i -= size) {
-                for (r = i; r * 2 + size < n; r = c) {
-                        c = r * 2 + size;
-                        if (c < n - size &&
-                                        cmp_func(base + c, base + c + size) < 0)
-                                c += size;
-                        if (cmp_func(base + r, base + c) >= 0)
-                                break;
-                        swap_func(base + r, base + c, size);
-                }
-        }
+        /*
+         * Loop invariants:
+         * 1. elements [a,n) satisfy the heap property (compare greater than
+         *    all of their children),
+         * 2. elements [n,num*size) are sorted, and
+         * 3. a <= b <= c <= d <= n (whenever they are valid).
+         */
+        for (;;) {
+                size_t b, c, d;

-        /* sort */
-        for (i = n - size; i > 0; i -= size) {
-                swap_func(base, base + i, size);
-                for (r = 0; r * 2 + size < i; r = c) {
-                        c = r * 2 + size;
-                        if (c < i - size &&
-                                        cmp_func(base + c, base + c + size) < 0)
-                                c += size;
-                        if (cmp_func(base + r, base + c) >= 0)
-                                break;
-                        swap_func(base + r, base + c, size);
+                if (a)                  /* Building heap: sift down --a */
+                        a -= size;
+                else if (n -= size)     /* Sorting: Extract root to --n */
+                        do_swap(base, base + n, size, swap_func);
+                else                    /* Sort complete */
+                        break;
+
+                /*
+                 * Sift element at "a" down into heap. This is the
+                 * "bottom-up" variant, which significantly reduces
+                 * calls to cmp_func(): we find the sift-down path all
+                 * the way to the leaves (one compare per level), then
+                 * backtrack to find where to insert the target element.
+                 *
+                 * Because elements tend to sift down close to the leaves,
+                 * this uses fewer compares than doing two per level
+                 * on the way down. (A bit more than half as many on
+                 * average, 3/4 worst-case.)
+                 */
+                for (b = a; c = 2*b + size, (d = c + size) < n;)
+                        b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+                if (d == n)     /* Special case last leaf with no sibling */
+                        b = c;
+
+                /* Now backtrack from "b" to the correct location for "a" */
+                while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
+                        b = parent(b, lsbit, size);
+                c = b;                  /* Where "a" belongs */
+                while (b != a) {        /* Shift it into place */
+                        b = parent(b, lsbit, size);
+                        do_swap(base + b, base + c, size, swap_func);
                 }
         }
 }
+EXPORT_SYMBOL(sort_r);
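Since the hunk above introduces the reentrant entry point, a brief usage sketch may show what the extra @priv argument buys. The names below (struct rec, rec_cmp, sort_recs) are hypothetical, not kernel APIs: a single comparison routine sorts by either field depending on a key passed through priv.

```c
#include <linux/sort.h>

struct rec {
        u32 id;
        u32 weight;
};

/* Sort key selected at run time and passed through @priv. */
enum rec_key { BY_ID, BY_WEIGHT };

static int rec_cmp(const void *a, const void *b, const void *priv)
{
        const struct rec *x = a, *y = b;
        const enum rec_key *key = priv;
        u32 ka = (*key == BY_ID) ? x->id : x->weight;
        u32 kb = (*key == BY_ID) ? y->id : y->weight;

        if (ka < kb)
                return -1;
        return ka > kb;
}

static void sort_recs(struct rec *recs, size_t n, enum rec_key key)
{
        /* NULL swap_func: both fields are u32, so a built-in word swap is used. */
        sort_r(recs, n, sizeof(*recs), rec_cmp, NULL, &key);
}
```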

+void sort(void *base, size_t num, size_t size,
+          cmp_func_t cmp_func,
+          swap_func_t swap_func)
+{
+        return sort_r(base, num, size, _CMP_WRAPPER, swap_func, cmp_func);
+}
 EXPORT_SYMBOL(sort);
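The non-reentrant wrapper keeps the old two-argument comparison interface by smuggling cmp_func through sort_r()'s priv slot; the _CMP_WRAPPER sentinel tells do_cmp() to unwrap it. A minimal caller looks like this (sort_example and cmp_u32 are illustrative names only):

```c
#include <linux/kernel.h>
#include <linux/sort.h>

static int cmp_u32(const void *a, const void *b)
{
        u32 x = *(const u32 *)a, y = *(const u32 *)b;

        if (x < y)
                return -1;
        return x > y;
}

static void sort_example(void)
{
        u32 vals[] = { 42, 7, 19, 7, 3 };

        /* NULL swap_func and 4-byte elements: the built-in 32-bit word swap is used. */
        sort(vals, ARRAY_SIZE(vals), sizeof(vals[0]), cmp_u32, NULL);
        /* vals is now { 3, 7, 7, 19, 42 }; cmp_u32 travelled via sort_r()'s priv. */
}
```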