.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
1 | 2 | /* |
2 | 3 | * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) |
3 | 4 | * |
4 | 5 | * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> |
5 | | - * |
6 | | - * This program is free software; you can redistribute it and/or |
7 | | - * modify it under the terms of the GNU General Public License |
8 | | - * as published by the Free Software Foundation; either version |
9 | | - * 2 of the License, or (at your option) any later version. |
10 | 6 | * |
11 | 7 | * Meant to be mostly used for locally generated traffic : |
12 | 8 | * Fast classification depends on skb->sk being set before reaching us. |
.. | .. |
54 | 50 | #include <net/tcp_states.h> |
55 | 51 | #include <net/tcp.h> |
56 | 52 | |
| 53 | +struct fq_skb_cb { |
| 54 | + u64 time_to_send; |
| 55 | +}; |
| 56 | + |
| 57 | +static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) |
| 58 | +{ |
| 59 | + qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); |
| 60 | + return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; |
| 61 | +} |
| 62 | + |
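The fq_skb_cb() helper added above carves a private control block out of the generic qdisc cb area of each skb, with qdisc_cb_private_validate() checking that the private struct fits. A rough userspace sketch of the same pattern (all names here are hypothetical, not kernel API):

```c
#include <stdint.h>

/* Hypothetical stand-in for the generic per-packet scratch area
 * (the kernel's qdisc_skb_cb(skb)->data plays this role).
 */
struct pkt {
	unsigned char cb[48];
};

/* Private per-packet state this scheduler wants to carry. */
struct my_skb_cb {
	uint64_t time_to_send;
};

static struct my_skb_cb *my_skb_cb(struct pkt *p)
{
	/* Compile-time analogue of qdisc_cb_private_validate(): the
	 * private struct must fit inside the shared scratch area.
	 */
	_Static_assert(sizeof(struct my_skb_cb) <= sizeof(((struct pkt *)0)->cb),
		       "private cb does not fit");
	return (struct my_skb_cb *)p->cb;
}
```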
57 | 63 | /* |
58 | | - * Per flow structure, dynamically allocated |
| 64 | + * Per flow structure, dynamically allocated. |
| 65 | + * If packets have monotonically increasing time_to_send, they are placed in O(1) |
| 66 | + * in a linear list (head,tail); otherwise they are placed in an rbtree (t_root). |
59 | 67 | */ |
60 | 68 | struct fq_flow { |
| 69 | +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ |
| 70 | + struct rb_root t_root; |
61 | 71 | struct sk_buff *head; /* list of skbs for this flow : first skb */ |
62 | 72 | union { |
63 | 73 | struct sk_buff *tail; /* last skb in the list */ |
64 | | - unsigned long age; /* jiffies when flow was emptied, for gc */ |
| 74 | + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ |
65 | 75 | }; |
66 | 76 | struct rb_node fq_node; /* anchor in fq_root[] trees */ |
67 | 77 | struct sock *sk; |
68 | | - int qlen; /* number of packets in flow queue */ |
69 | | - int credit; |
70 | 78 | u32 socket_hash; /* sk_hash */ |
71 | | - struct fq_flow *next; /* next pointer in RR lists, or &detached */ |
| 79 | + int qlen; /* number of packets in flow queue */ |
| 80 | + |
| 81 | +/* Second cache line, used in fq_dequeue() */ |
| 82 | + int credit; |
| 83 | + /* 32bit hole on 64bit arches */ |
| 84 | + |
| 85 | + struct fq_flow *next; /* next pointer in RR lists */ |
72 | 86 | |
73 | 87 | struct rb_node rate_node; /* anchor in q->delayed tree */ |
74 | 88 | u64 time_next_packet; |
75 | | -}; |
| 89 | +} ____cacheline_aligned_in_smp; |
76 | 90 | |
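The new layout comments and ____cacheline_aligned_in_smp group the fields shared by enqueue/gc/dequeue on the first cache line and the dequeue-only fields on the second, so each fast path touches fewer lines. A minimal userspace sketch of the same idea (field names illustrative only):

```c
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct flow {
	/* first cache line: fields shared by enqueue and dequeue */
	alignas(64) void *t_root;
	void *head;
	void *tail;
	int qlen;

	/* second cache line: fields used only by dequeue */
	alignas(64) int credit;
	struct flow *next;
};

int main(void)
{
	/* The struct itself is 64-byte aligned, so the grouping holds
	 * even when instances come from an array or slab-style allocator.
	 */
	printf("sizeof=%zu alignof=%zu\n", sizeof(struct flow),
	       (size_t)alignof(struct flow));
	return 0;
}
```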
77 | 91 | struct fq_flow_head { |
78 | 92 | struct fq_flow *first; |
.. | .. |
86 | 100 | |
87 | 101 | struct rb_root delayed; /* for rate limited flows */ |
88 | 102 | u64 time_next_delayed_flow; |
| 103 | + u64 ktime_cache; /* copy of last ktime_get_ns() */ |
89 | 104 | unsigned long unthrottle_latency_ns; |
90 | 105 | |
91 | 106 | struct fq_flow internal; /* for non classified or high prio packets */ |
92 | 107 | u32 quantum; |
93 | 108 | u32 initial_quantum; |
94 | 109 | u32 flow_refill_delay; |
95 | | - u32 flow_max_rate; /* optional max rate per flow */ |
96 | 110 | u32 flow_plimit; /* max packets per flow */ |
| 111 | + unsigned long flow_max_rate; /* optional max rate per flow */ |
| 112 | + u64 ce_threshold; |
| 113 | + u64 horizon; /* horizon in ns */ |
97 | 114 | u32 orphan_mask; /* mask for orphaned skb */ |
98 | 115 | u32 low_rate_threshold; |
99 | 116 | struct rb_root *fq_root; |
100 | 117 | u8 rate_enable; |
101 | 118 | u8 fq_trees_log; |
102 | | - |
| 119 | + u8 horizon_drop; |
103 | 120 | u32 flows; |
104 | 121 | u32 inactive_flows; |
105 | 122 | u32 throttled_flows; |
106 | 123 | |
107 | 124 | u64 stat_gc_flows; |
108 | 125 | u64 stat_internal_packets; |
109 | | - u64 stat_tcp_retrans; |
110 | 126 | u64 stat_throttled; |
| 127 | + u64 stat_ce_mark; |
| 128 | + u64 stat_horizon_drops; |
| 129 | + u64 stat_horizon_caps; |
111 | 130 | u64 stat_flows_plimit; |
112 | 131 | u64 stat_pkts_too_long; |
113 | 132 | u64 stat_allocation_errors; |
| 133 | + |
| 134 | + u32 timer_slack; /* hrtimer slack in ns */ |
114 | 135 | struct qdisc_watchdog watchdog; |
115 | 136 | }; |
116 | 137 | |
117 | | -/* special value to mark a detached flow (not on old/new list) */ |
118 | | -static struct fq_flow detached, throttled; |
119 | | - |
| 138 | +/* |
| 139 | + * f->tail and f->age share the same location. |
| 140 | + * We can use the low order bit to tell whether this location points |
| 141 | + * to a sk_buff or contains a jiffies value, if we force that value to be odd. |
| 142 | + * This works because the low order bit of f->tail is 0, since alignof(struct sk_buff) >= 2. |
| 143 | + */ |
120 | 144 | static void fq_flow_set_detached(struct fq_flow *f) |
121 | 145 | { |
122 | | - f->next = &detached; |
123 | | - f->age = jiffies; |
| 146 | + f->age = jiffies | 1UL; |
124 | 147 | } |
125 | 148 | |
126 | 149 | static bool fq_flow_is_detached(const struct fq_flow *f) |
127 | 150 | { |
128 | | - return f->next == &detached; |
| 151 | + return !!(f->age & 1UL); |
129 | 152 | } |
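Replacing the &detached sentinel pointer with an odd jiffies value stored over f->tail is a classic low-bit tagging trick: a valid sk_buff pointer is at least 2-byte aligned, so its low bit is always 0, and an odd value can never be mistaken for one. A self-contained sketch (names hypothetical):

```c
#include <assert.h>
#include <stdio.h>

struct item { int v; };	/* stands in for sk_buff; alignment >= 2 */

struct flow {
	union {
		struct item *tail;	/* valid while the flow is queued */
		unsigned long age;	/* (timestamp | 1UL) once detached */
	};
};

static void set_detached(struct flow *f, unsigned long now)
{
	f->age = now | 1UL;	/* odd: cannot be a valid item pointer */
}

static int is_detached(const struct flow *f)
{
	return f->age & 1UL;
}

int main(void)
{
	static struct item it;
	struct flow f = { .tail = &it };

	assert(((unsigned long)(void *)&it & 1UL) == 0);
	printf("%d", is_detached(&f));		/* 0: tail is a pointer */
	set_detached(&f, 12345);
	printf(" %d\n", is_detached(&f));	/* 1: age is odd */
	return 0;
}
```

This saves one pointer per flow and one store on the detach path, at the cost of the union aliasing documented in the comment above.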
| 153 | + |
| 154 | +/* special value to mark a throttled flow (not on old/new list) */ |
| 155 | +static struct fq_flow throttled; |
130 | 156 | |
131 | 157 | static bool fq_flow_is_throttled(const struct fq_flow *f) |
132 | 158 | { |
.. | .. |
192 | 218 | struct rb_root *root, |
193 | 219 | struct sock *sk) |
194 | 220 | { |
195 | | - struct fq_flow *f, *tofree[FQ_GC_MAX]; |
196 | 221 | struct rb_node **p, *parent; |
197 | | - int fcnt = 0; |
| 222 | + void *tofree[FQ_GC_MAX]; |
| 223 | + struct fq_flow *f; |
| 224 | + int i, fcnt = 0; |
198 | 225 | |
199 | 226 | p = &root->rb_node; |
200 | 227 | parent = NULL; |
.. | .. |
217 | 244 | p = &parent->rb_left; |
218 | 245 | } |
219 | 246 | |
| 247 | + if (!fcnt) |
| 248 | + return; |
| 249 | + |
| 250 | + for (i = fcnt; i > 0; ) { |
| 251 | + f = tofree[--i]; |
| 252 | + rb_erase(&f->fq_node, root); |
| 253 | + } |
220 | 254 | q->flows -= fcnt; |
221 | 255 | q->inactive_flows -= fcnt; |
222 | 256 | q->stat_gc_flows += fcnt; |
223 | | - while (fcnt) { |
224 | | - struct fq_flow *f = tofree[--fcnt]; |
225 | 257 | |
226 | | - rb_erase(&f->fq_node, root); |
227 | | - kmem_cache_free(fq_flow_cachep, f); |
228 | | - } |
| 258 | + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); |
229 | 259 | } |
230 | 260 | |
231 | 261 | static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) |
.. | .. |
256 | 286 | */ |
257 | 287 | sk = (struct sock *)((hash << 1) | 1UL); |
258 | 288 | skb_orphan(skb); |
| 289 | + } else if (sk->sk_state == TCP_CLOSE) { |
| 290 | + unsigned long hash = skb_get_hash(skb) & q->orphan_mask; |
| 291 | + /* |
| 292 | + * Sockets in TCP_CLOSE are not connected. |
| 293 | + * A typical use case is UDP sockets; they can send packets |
| 294 | + * with sendto() to many different destinations. |
| 295 | + * We probably could use a generic bit advertising |
| 296 | + * non-connected sockets, instead of sk_state == TCP_CLOSE, |
| 297 | + * if we care enough. |
| 298 | + */ |
| 299 | + sk = (struct sock *)((hash << 1) | 1UL); |
259 | 300 | } |
260 | 301 | |
261 | 302 | root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; |
.. | .. |
276 | 317 | * If not, we need to refill credit with |
277 | 318 | * initial quantum |
278 | 319 | */ |
279 | | - if (unlikely(skb->sk && |
| 320 | + if (unlikely(skb->sk == sk && |
280 | 321 | f->socket_hash != sk->sk_hash)) { |
281 | 322 | f->credit = q->initial_quantum; |
282 | 323 | f->socket_hash = sk->sk_hash; |
| 324 | + if (q->rate_enable) |
| 325 | + smp_store_release(&sk->sk_pacing_status, |
| 326 | + SK_PACING_FQ); |
283 | 327 | if (fq_flow_is_throttled(f)) |
284 | 328 | fq_flow_unset_throttled(q, f); |
285 | 329 | f->time_next_packet = 0ULL; |
.. | .. |
297 | 341 | q->stat_allocation_errors++; |
298 | 342 | return &q->internal; |
299 | 343 | } |
| 344 | + /* f->t_root is already zeroed after kmem_cache_zalloc() */ |
| 345 | + |
300 | 346 | fq_flow_set_detached(f); |
301 | 347 | f->sk = sk; |
302 | | - if (skb->sk) |
| 348 | + if (skb->sk == sk) { |
303 | 349 | f->socket_hash = sk->sk_hash; |
| 350 | + if (q->rate_enable) |
| 351 | + smp_store_release(&sk->sk_pacing_status, |
| 352 | + SK_PACING_FQ); |
| 353 | + } |
304 | 354 | f->credit = q->initial_quantum; |
305 | 355 | |
306 | 356 | rb_link_node(&f->fq_node, parent, p); |
.. | .. |
311 | 361 | return f; |
312 | 362 | } |
313 | 363 | |
314 | | - |
315 | | -/* remove one skb from head of flow queue */ |
316 | | -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) |
| 364 | +static struct sk_buff *fq_peek(struct fq_flow *flow) |
317 | 365 | { |
318 | | - struct sk_buff *skb = flow->head; |
| 366 | + struct sk_buff *skb = skb_rb_first(&flow->t_root); |
| 367 | + struct sk_buff *head = flow->head; |
319 | 368 | |
320 | | - if (skb) { |
| 369 | + if (!skb) |
| 370 | + return head; |
| 371 | + |
| 372 | + if (!head) |
| 373 | + return skb; |
| 374 | + |
| 375 | + if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) |
| 376 | + return skb; |
| 377 | + return head; |
| 378 | +} |
| 379 | + |
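fq_peek() merges two sources, the in-order linear list and the out-of-order rbtree, and returns whichever candidate is due first. The selection logic, reduced to a userspace sketch (struct and field names illustrative):

```c
#include <stdint.h>
#include <stddef.h>

struct pkt { uint64_t time_to_send; };

/* Return the earlier of the list head and the leftmost tree entry,
 * tolerating either source being empty (NULL).
 */
static struct pkt *peek_earliest(struct pkt *list_head, struct pkt *tree_first)
{
	if (!tree_first)
		return list_head;
	if (!list_head)
		return tree_first;
	return tree_first->time_to_send < list_head->time_to_send ?
	       tree_first : list_head;
}
```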
| 380 | +static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, |
| 381 | + struct sk_buff *skb) |
| 382 | +{ |
| 383 | + if (skb == flow->head) { |
321 | 384 | flow->head = skb->next; |
322 | | - skb->next = NULL; |
323 | | - flow->qlen--; |
324 | | - qdisc_qstats_backlog_dec(sch, skb); |
325 | | - sch->q.qlen--; |
| 385 | + } else { |
| 386 | + rb_erase(&skb->rbnode, &flow->t_root); |
| 387 | + skb->dev = qdisc_dev(sch); |
326 | 388 | } |
327 | | - return skb; |
328 | 389 | } |
329 | 390 | |
330 | | -/* We might add in the future detection of retransmits |
331 | | - * For the time being, just return false |
| 391 | +/* Remove one skb from flow queue. |
| 392 | + * This skb must be the return value of prior fq_peek(). |
332 | 393 | */ |
333 | | -static bool skb_is_retransmit(struct sk_buff *skb) |
| 394 | +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, |
| 395 | + struct sk_buff *skb) |
334 | 396 | { |
335 | | - return false; |
| 397 | + fq_erase_head(sch, flow, skb); |
| 398 | + skb_mark_not_on_list(skb); |
| 399 | + flow->qlen--; |
| 400 | + qdisc_qstats_backlog_dec(sch, skb); |
| 401 | + sch->q.qlen--; |
336 | 402 | } |
337 | 403 | |
338 | | -/* add skb to flow queue |
339 | | - * flow queue is a linked list, kind of FIFO, except for TCP retransmits |
340 | | - * We special case tcp retransmits to be transmitted before other packets. |
341 | | - * We rely on fact that TCP retransmits are unlikely, so we do not waste |
342 | | - * a separate queue or a pointer. |
343 | | - * head-> [retrans pkt 1] |
344 | | - * [retrans pkt 2] |
345 | | - * [ normal pkt 1] |
346 | | - * [ normal pkt 2] |
347 | | - * [ normal pkt 3] |
348 | | - * tail-> [ normal pkt 4] |
349 | | - */ |
350 | 404 | static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) |
351 | 405 | { |
352 | | - struct sk_buff *prev, *head = flow->head; |
| 406 | + struct rb_node **p, *parent; |
| 407 | + struct sk_buff *head, *aux; |
353 | 408 | |
354 | | - skb->next = NULL; |
355 | | - if (!head) { |
356 | | - flow->head = skb; |
357 | | - flow->tail = skb; |
358 | | - return; |
359 | | - } |
360 | | - if (likely(!skb_is_retransmit(skb))) { |
361 | | - flow->tail->next = skb; |
362 | | - flow->tail = skb; |
363 | | - return; |
364 | | - } |
365 | | - |
366 | | - /* This skb is a tcp retransmit, |
367 | | - * find the last retrans packet in the queue |
368 | | - */ |
369 | | - prev = NULL; |
370 | | - while (skb_is_retransmit(head)) { |
371 | | - prev = head; |
372 | | - head = head->next; |
| 409 | + head = flow->head; |
| 410 | + if (!head || |
| 411 | + fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { |
373 | 412 | if (!head) |
374 | | - break; |
375 | | - } |
376 | | - if (!prev) { /* no rtx packet in queue, become the new head */ |
377 | | - skb->next = flow->head; |
378 | | - flow->head = skb; |
379 | | - } else { |
380 | | - if (prev == flow->tail) |
381 | | - flow->tail = skb; |
| 413 | + flow->head = skb; |
382 | 414 | else |
383 | | - skb->next = prev->next; |
384 | | - prev->next = skb; |
| 415 | + flow->tail->next = skb; |
| 416 | + flow->tail = skb; |
| 417 | + skb->next = NULL; |
| 418 | + return; |
385 | 419 | } |
| 420 | + |
| 421 | + p = &flow->t_root.rb_node; |
| 422 | + parent = NULL; |
| 423 | + |
| 424 | + while (*p) { |
| 425 | + parent = *p; |
| 426 | + aux = rb_to_skb(parent); |
| 427 | + if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) |
| 428 | + p = &parent->rb_right; |
| 429 | + else |
| 430 | + p = &parent->rb_left; |
| 431 | + } |
| 432 | + rb_link_node(&skb->rbnode, parent, p); |
| 433 | + rb_insert_color(&skb->rbnode, &flow->t_root); |
| 434 | +} |
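flow_queue_add() keeps the common case O(1): if the new packet's time_to_send is not older than the current tail's, it is appended to the linear list; only out-of-order packets pay for a sorted insert into the rbtree. A sketch of that two-tier enqueue, with a plain sorted list standing in for the rbtree (all names hypothetical):

```c
#include <stdint.h>
#include <stddef.h>

struct pkt {
	uint64_t time_to_send;
	struct pkt *next;
};

struct flow {
	struct pkt *head, *tail;	/* in-order fast path */
	struct pkt *ooo;		/* out-of-order packets, kept sorted
					 * (the kernel uses an rbtree here) */
};

static void flow_enqueue(struct flow *f, struct pkt *p)
{
	/* Fast path: timestamps are non-decreasing, append in O(1). */
	if (!f->head || p->time_to_send >= f->tail->time_to_send) {
		if (!f->head)
			f->head = p;
		else
			f->tail->next = p;
		f->tail = p;
		p->next = NULL;
		return;
	}
	/* Slow path: sorted insert (O(log n) with a real rbtree). */
	struct pkt **pp = &f->ooo;

	while (*pp && (*pp)->time_to_send <= p->time_to_send)
		pp = &(*pp)->next;
	p->next = *pp;
	*pp = p;
}
```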
| 435 | + |
| 436 | +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, |
| 437 | + const struct fq_sched_data *q) |
| 438 | +{ |
| 439 | + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); |
386 | 440 | } |
387 | 441 | |
388 | 442 | static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
.. | .. |
394 | 448 | if (unlikely(sch->q.qlen >= sch->limit)) |
395 | 449 | return qdisc_drop(skb, sch, to_free); |
396 | 450 | |
| 451 | + if (!skb->tstamp) { |
| 452 | + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); |
| 453 | + } else { |
| 454 | + /* Check if packet timestamp is too far in the future. |
| 455 | + * Try our cached value first, to avoid ktime_get_ns() |
| 456 | + * cost in most cases. |
| 457 | + */ |
| 458 | + if (fq_packet_beyond_horizon(skb, q)) { |
| 459 | + /* Refresh our cache and check another time */ |
| 460 | + q->ktime_cache = ktime_get_ns(); |
| 461 | + if (fq_packet_beyond_horizon(skb, q)) { |
| 462 | + if (q->horizon_drop) { |
| 463 | + q->stat_horizon_drops++; |
| 464 | + return qdisc_drop(skb, sch, to_free); |
| 465 | + } |
| 466 | + q->stat_horizon_caps++; |
| 467 | + skb->tstamp = q->ktime_cache + q->horizon; |
| 468 | + } |
| 469 | + } |
| 470 | + fq_skb_cb(skb)->time_to_send = skb->tstamp; |
| 471 | + } |
| 472 | + |
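The enqueue path now validates EDT timestamps against a horizon: a packet stamped more than q->horizon in the future is either dropped or capped to now + horizon, depending on q->horizon_drop, and the clock is sampled at most once per enqueue thanks to ktime_cache. The decision logic as a standalone sketch, mirroring the kernel's signed comparison (names hypothetical):

```c
#include <stdint.h>
#include <stdbool.h>

struct horizon_cfg {
	uint64_t horizon;	/* ns */
	bool horizon_drop;	/* true: drop offenders; false: cap them */
};

/* Returns true if the packet must be dropped. Otherwise *tstamp is
 * left alone, or capped to now + horizon if it was beyond it.
 */
static bool check_horizon(const struct horizon_cfg *cfg, uint64_t now,
			  uint64_t *tstamp)
{
	if ((int64_t)*tstamp <= (int64_t)(now + cfg->horizon))
		return false;			/* within horizon */
	if (cfg->horizon_drop)
		return true;			/* beyond horizon: drop */
	*tstamp = now + cfg->horizon;		/* beyond horizon: cap */
	return false;
}
```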
397 | 473 | f = fq_classify(skb, q); |
398 | 474 | if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { |
399 | 475 | q->stat_flows_plimit++; |
.. | .. |
401 | 477 | } |
402 | 478 | |
403 | 479 | f->qlen++; |
404 | | - if (skb_is_retransmit(skb)) |
405 | | - q->stat_tcp_retrans++; |
406 | 480 | qdisc_qstats_backlog_inc(sch, skb); |
407 | 481 | if (fq_flow_is_detached(f)) { |
408 | | - struct sock *sk = skb->sk; |
409 | | - |
410 | 482 | fq_flow_add_tail(&q->new_flows, f); |
411 | 483 | if (time_after(jiffies, f->age + q->flow_refill_delay)) |
412 | 484 | f->credit = max_t(u32, f->credit, q->quantum); |
413 | | - if (sk && q->rate_enable) { |
414 | | - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != |
415 | | - SK_PACING_FQ)) |
416 | | - smp_store_release(&sk->sk_pacing_status, |
417 | | - SK_PACING_FQ); |
418 | | - } |
419 | 485 | q->inactive_flows--; |
420 | 486 | } |
421 | 487 | |
.. | .. |
460 | 526 | static struct sk_buff *fq_dequeue(struct Qdisc *sch) |
461 | 527 | { |
462 | 528 | struct fq_sched_data *q = qdisc_priv(sch); |
463 | | - u64 now = ktime_get_ns(); |
464 | 529 | struct fq_flow_head *head; |
465 | 530 | struct sk_buff *skb; |
466 | 531 | struct fq_flow *f; |
467 | | - u32 rate, plen; |
| 532 | + unsigned long rate; |
| 533 | + u32 plen; |
| 534 | + u64 now; |
468 | 535 | |
469 | | - skb = fq_dequeue_head(sch, &q->internal); |
470 | | - if (skb) |
| 536 | + if (!sch->q.qlen) |
| 537 | + return NULL; |
| 538 | + |
| 539 | + skb = fq_peek(&q->internal); |
| 540 | + if (unlikely(skb)) { |
| 541 | + fq_dequeue_skb(sch, &q->internal, skb); |
471 | 542 | goto out; |
| 543 | + } |
| 544 | + |
| 545 | + q->ktime_cache = now = ktime_get_ns(); |
472 | 546 | fq_check_throttled(q, now); |
473 | 547 | begin: |
474 | 548 | head = &q->new_flows; |
.. | .. |
476 | 550 | head = &q->old_flows; |
477 | 551 | if (!head->first) { |
478 | 552 | if (q->time_next_delayed_flow != ~0ULL) |
479 | | - qdisc_watchdog_schedule_ns(&q->watchdog, |
480 | | - q->time_next_delayed_flow); |
| 553 | + qdisc_watchdog_schedule_range_ns(&q->watchdog, |
| 554 | + q->time_next_delayed_flow, |
| 555 | + q->timer_slack); |
481 | 556 | return NULL; |
482 | 557 | } |
483 | 558 | } |
.. | .. |
490 | 565 | goto begin; |
491 | 566 | } |
492 | 567 | |
493 | | - skb = f->head; |
494 | | - if (unlikely(skb && now < f->time_next_packet && |
495 | | - !skb_is_tcp_pure_ack(skb))) { |
496 | | - head->first = f->next; |
497 | | - fq_flow_set_throttled(q, f); |
498 | | - goto begin; |
499 | | - } |
| 568 | + skb = fq_peek(f); |
| 569 | + if (skb) { |
| 570 | + u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, |
| 571 | + f->time_next_packet); |
500 | 572 | |
501 | | - skb = fq_dequeue_head(sch, f); |
502 | | - if (!skb) { |
| 573 | + if (now < time_next_packet) { |
| 574 | + head->first = f->next; |
| 575 | + f->time_next_packet = time_next_packet; |
| 576 | + fq_flow_set_throttled(q, f); |
| 577 | + goto begin; |
| 578 | + } |
| 579 | + prefetch(&skb->end); |
| 580 | + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { |
| 581 | + INET_ECN_set_ce(skb); |
| 582 | + q->stat_ce_mark++; |
| 583 | + } |
| 584 | + fq_dequeue_skb(sch, f, skb); |
| 585 | + } else { |
503 | 586 | head->first = f->next; |
504 | 587 | /* force a pass through old_flows to prevent starvation */ |
505 | 588 | if ((head == &q->new_flows) && q->old_flows.first) { |
.. | .. |
510 | 593 | } |
511 | 594 | goto begin; |
512 | 595 | } |
513 | | - prefetch(&skb->end); |
514 | | - f->credit -= qdisc_pkt_len(skb); |
| 596 | + plen = qdisc_pkt_len(skb); |
| 597 | + f->credit -= plen; |
515 | 598 | |
516 | 599 | if (!q->rate_enable) |
517 | 600 | goto out; |
518 | 601 | |
519 | | - /* Do not pace locally generated ack packets */ |
520 | | - if (skb_is_tcp_pure_ack(skb)) |
521 | | - goto out; |
522 | | - |
523 | 602 | rate = q->flow_max_rate; |
524 | | - if (skb->sk) |
525 | | - rate = min(skb->sk->sk_pacing_rate, rate); |
526 | 603 | |
527 | | - if (rate <= q->low_rate_threshold) { |
528 | | - f->credit = 0; |
529 | | - plen = qdisc_pkt_len(skb); |
530 | | - } else { |
531 | | - plen = max(qdisc_pkt_len(skb), q->quantum); |
532 | | - if (f->credit > 0) |
533 | | - goto out; |
| 604 | + /* If EDT time was provided for this skb, we need to |
| 605 | + * update f->time_next_packet only if this qdisc enforces |
| 606 | + * a flow max rate. |
| 607 | + */ |
| 608 | + if (!skb->tstamp) { |
| 609 | + if (skb->sk) |
| 610 | + rate = min(skb->sk->sk_pacing_rate, rate); |
| 611 | + |
| 612 | + if (rate <= q->low_rate_threshold) { |
| 613 | + f->credit = 0; |
| 614 | + } else { |
| 615 | + plen = max(plen, q->quantum); |
| 616 | + if (f->credit > 0) |
| 617 | + goto out; |
| 618 | + } |
534 | 619 | } |
535 | | - if (rate != ~0U) { |
| 620 | + if (rate != ~0UL) { |
536 | 621 | u64 len = (u64)plen * NSEC_PER_SEC; |
537 | 622 | |
538 | 623 | if (likely(rate)) |
539 | | - do_div(len, rate); |
| 624 | + len = div64_ul(len, rate); |
540 | 625 | /* Since socket rate can change later, |
541 | 626 | * clamp the delay to 1 second. |
542 | 627 | * Really, providers of too big packets should be fixed ! |
.. | .. |
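When a flow's credit is exhausted and a rate applies, the dequeue path charges plen bytes at rate bytes per second: the next transmit time moves forward by plen * NSEC_PER_SEC / rate nanoseconds, clamped to one second as the comment above notes. The arithmetic in isolation (a sketch, not the kernel function):

```c
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Delay to schedule after sending plen bytes at rate bytes/sec. */
static uint64_t pacing_delay_ns(uint32_t plen, unsigned long rate)
{
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	if (rate)
		len /= rate;		/* kernel: div64_ul() */
	if (len > NSEC_PER_SEC)		/* clamp: see comment above */
		len = NSEC_PER_SEC;
	return len;
}
```

For example, a 1500 byte packet at 125000 bytes/sec (1 Mbit/s) yields a 12 ms gap before the flow's next packet.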
560 | 645 | |
561 | 646 | static void fq_flow_purge(struct fq_flow *flow) |
562 | 647 | { |
| 648 | + struct rb_node *p = rb_first(&flow->t_root); |
| 649 | + |
| 650 | + while (p) { |
| 651 | + struct sk_buff *skb = rb_to_skb(p); |
| 652 | + |
| 653 | + p = rb_next(p); |
| 654 | + rb_erase(&skb->rbnode, &flow->t_root); |
| 655 | + rtnl_kfree_skbs(skb, skb); |
| 656 | + } |
563 | 657 | rtnl_kfree_skbs(flow->head, flow->tail); |
564 | 658 | flow->head = NULL; |
565 | 659 | flow->qlen = 0; |
.. | .. |
685 | 779 | return 0; |
686 | 780 | } |
687 | 781 | |
| 782 | +static struct netlink_range_validation iq_range = { |
| 783 | + .max = INT_MAX, |
| 784 | +}; |
| 785 | + |
688 | 786 | static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { |
| 787 | + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, |
| 788 | + |
689 | 789 | [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, |
690 | 790 | [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, |
691 | 791 | [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, |
692 | | - [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, |
| 792 | + [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range), |
693 | 793 | [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, |
694 | 794 | [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, |
695 | 795 | [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, |
.. | .. |
697 | 797 | [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, |
698 | 798 | [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, |
699 | 799 | [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, |
| 800 | + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, |
| 801 | + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, |
| 802 | + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, |
| 803 | + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, |
700 | 804 | }; |
701 | 805 | |
702 | 806 | static int fq_change(struct Qdisc *sch, struct nlattr *opt, |
.. | .. |
711 | 815 | if (!opt) |
712 | 816 | return -EINVAL; |
713 | 817 | |
714 | | - err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL); |
| 818 | + err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, |
| 819 | + NULL); |
715 | 820 | if (err < 0) |
716 | 821 | return err; |
717 | 822 | |
.. | .. |
751 | 856 | pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", |
752 | 857 | nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); |
753 | 858 | |
754 | | - if (tb[TCA_FQ_FLOW_MAX_RATE]) |
755 | | - q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
| 859 | + if (tb[TCA_FQ_FLOW_MAX_RATE]) { |
| 860 | + u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
756 | 861 | |
| 862 | + q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; |
| 863 | + } |
757 | 864 | if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) |
758 | 865 | q->low_rate_threshold = |
759 | 866 | nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); |
.. | .. |
776 | 883 | if (tb[TCA_FQ_ORPHAN_MASK]) |
777 | 884 | q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); |
778 | 885 | |
| 886 | + if (tb[TCA_FQ_CE_THRESHOLD]) |
| 887 | + q->ce_threshold = (u64)NSEC_PER_USEC * |
| 888 | + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); |
| 889 | + |
| 890 | + if (tb[TCA_FQ_TIMER_SLACK]) |
| 891 | + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); |
| 892 | + |
| 893 | + if (tb[TCA_FQ_HORIZON]) |
| 894 | + q->horizon = (u64)NSEC_PER_USEC * |
| 895 | + nla_get_u32(tb[TCA_FQ_HORIZON]); |
| 896 | + |
| 897 | + if (tb[TCA_FQ_HORIZON_DROP]) |
| 898 | + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); |
| 899 | + |
779 | 900 | if (!err) { |
| 901 | + |
780 | 902 | sch_tree_unlock(sch); |
781 | 903 | err = fq_resize(sch, fq_log); |
782 | 904 | sch_tree_lock(sch); |
.. | .. |
816 | 938 | q->quantum = 2 * psched_mtu(qdisc_dev(sch)); |
817 | 939 | q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); |
818 | 940 | q->flow_refill_delay = msecs_to_jiffies(40); |
819 | | - q->flow_max_rate = ~0U; |
| 941 | + q->flow_max_rate = ~0UL; |
820 | 942 | q->time_next_delayed_flow = ~0ULL; |
821 | 943 | q->rate_enable = 1; |
822 | 944 | q->new_flows.first = NULL; |
.. | .. |
826 | 948 | q->fq_trees_log = ilog2(1024); |
827 | 949 | q->orphan_mask = 1024 - 1; |
828 | 950 | q->low_rate_threshold = 550000 / 8; |
829 | | - qdisc_watchdog_init(&q->watchdog, sch); |
| 951 | + |
| 952 | + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ |
| 953 | + |
| 954 | + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ |
| 955 | + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ |
| 956 | + |
| 957 | + /* Default ce_threshold of 4294 seconds */ |
| 958 | + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; |
| 959 | + |
| 960 | + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); |
830 | 961 | |
831 | 962 | if (opt) |
832 | 963 | err = fq_change(sch, opt, extack); |
.. | .. |
839 | 970 | static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) |
840 | 971 | { |
841 | 972 | struct fq_sched_data *q = qdisc_priv(sch); |
| 973 | + u64 ce_threshold = q->ce_threshold; |
| 974 | + u64 horizon = q->horizon; |
842 | 975 | struct nlattr *opts; |
843 | 976 | |
844 | | - opts = nla_nest_start(skb, TCA_OPTIONS); |
| 977 | + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); |
845 | 978 | if (opts == NULL) |
846 | 979 | goto nla_put_failure; |
847 | 980 | |
848 | 981 | /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ |
| 982 | + |
| 983 | + do_div(ce_threshold, NSEC_PER_USEC); |
| 984 | + do_div(horizon, NSEC_PER_USEC); |
849 | 985 | |
850 | 986 | if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || |
851 | 987 | nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || |
852 | 988 | nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || |
853 | 989 | nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || |
854 | 990 | nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || |
855 | | - nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || |
| 991 | + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, |
| 992 | + min_t(unsigned long, q->flow_max_rate, ~0U)) || |
856 | 993 | nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, |
857 | 994 | jiffies_to_usecs(q->flow_refill_delay)) || |
858 | 995 | nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || |
859 | 996 | nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, |
860 | 997 | q->low_rate_threshold) || |
861 | | - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) |
| 998 | + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || |
| 999 | + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || |
| 1000 | + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || |
| 1001 | + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || |
| 1002 | + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) |
862 | 1003 | goto nla_put_failure; |
863 | 1004 | |
864 | 1005 | return nla_nest_end(skb, opts); |
.. | .. |
876 | 1017 | |
877 | 1018 | st.gc_flows = q->stat_gc_flows; |
878 | 1019 | st.highprio_packets = q->stat_internal_packets; |
879 | | - st.tcp_retrans = q->stat_tcp_retrans; |
| 1020 | + st.tcp_retrans = 0; |
880 | 1021 | st.throttled = q->stat_throttled; |
881 | 1022 | st.flows_plimit = q->stat_flows_plimit; |
882 | 1023 | st.pkts_too_long = q->stat_pkts_too_long; |
883 | 1024 | st.allocation_errors = q->stat_allocation_errors; |
884 | | - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); |
| 1025 | + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - |
| 1026 | + ktime_get_ns(); |
885 | 1027 | st.flows = q->flows; |
886 | 1028 | st.inactive_flows = q->inactive_flows; |
887 | 1029 | st.throttled_flows = q->throttled_flows; |
888 | 1030 | st.unthrottle_latency_ns = min_t(unsigned long, |
889 | 1031 | q->unthrottle_latency_ns, ~0U); |
| 1032 | + st.ce_mark = q->stat_ce_mark; |
| 1033 | + st.horizon_drops = q->stat_horizon_drops; |
| 1034 | + st.horizon_caps = q->stat_horizon_caps; |
890 | 1035 | sch_tree_unlock(sch); |
891 | 1036 | |
892 | 1037 | return gnet_stats_copy_app(d, &st, sizeof(st)); |
.. | .. |
934 | 1079 | module_exit(fq_module_exit) |
935 | 1080 | MODULE_AUTHOR("Eric Dumazet"); |
936 | 1081 | MODULE_LICENSE("GPL"); |
| 1082 | +MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); |
---|