2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/net/sched/sch_fq.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
  * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  *
  * Meant to be mostly used for locally generated traffic :
  * Fast classification depends on skb->sk being set before reaching us.
@@ -54,25 +50,43 @@
 #include <net/tcp_states.h>
 #include <net/tcp.h>

+struct fq_skb_cb {
+        u64 time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+        qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+        return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
 /*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
  */
 struct fq_flow {
+/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
+        struct rb_root  t_root;
         struct sk_buff  *head;          /* list of skbs for this flow : first skb */
         union {
                 struct sk_buff *tail;   /* last skb in the list */
-                unsigned long   age;    /* jiffies when flow was emptied, for gc */
+                unsigned long   age;    /* (jiffies | 1UL) when flow was emptied, for gc */
         };
         struct rb_node  fq_node;        /* anchor in fq_root[] trees */
         struct sock     *sk;
-        int             qlen;           /* number of packets in flow queue */
-        int             credit;
         u32             socket_hash;    /* sk_hash */
-        struct fq_flow  *next;          /* next pointer in RR lists, or &detached */
+        int             qlen;           /* number of packets in flow queue */
+
+/* Second cache line, used in fq_dequeue() */
+        int             credit;
+        /* 32bit hole on 64bit arches */
+
+        struct fq_flow  *next;          /* next pointer in RR lists */

         struct rb_node  rate_node;      /* anchor in q->delayed tree */
         u64             time_next_packet;
-};
+} ____cacheline_aligned_in_smp;

 struct fq_flow_head {
         struct fq_flow *first;
@@ -86,47 +100,59 @@

         struct rb_root  delayed;        /* for rate limited flows */
         u64             time_next_delayed_flow;
+        u64             ktime_cache;    /* copy of last ktime_get_ns() */
         unsigned long   unthrottle_latency_ns;

         struct fq_flow  internal;       /* for non classified or high prio packets */
         u32             quantum;
         u32             initial_quantum;
         u32             flow_refill_delay;
-        u32             flow_max_rate;  /* optional max rate per flow */
         u32             flow_plimit;    /* max packets per flow */
+        unsigned long   flow_max_rate;  /* optional max rate per flow */
+        u64             ce_threshold;
+        u64             horizon;        /* horizon in ns */
         u32             orphan_mask;    /* mask for orphaned skb */
         u32             low_rate_threshold;
         struct rb_root  *fq_root;
         u8              rate_enable;
         u8              fq_trees_log;
-
+        u8              horizon_drop;
         u32             flows;
         u32             inactive_flows;
         u32             throttled_flows;

         u64             stat_gc_flows;
         u64             stat_internal_packets;
-        u64             stat_tcp_retrans;
         u64             stat_throttled;
+        u64             stat_ce_mark;
+        u64             stat_horizon_drops;
+        u64             stat_horizon_caps;
         u64             stat_flows_plimit;
         u64             stat_pkts_too_long;
         u64             stat_allocation_errors;
+
+        u32             timer_slack;    /* hrtimer slack in ns */
         struct qdisc_watchdog watchdog;
 };

-/* special value to mark a detached flow (not on old/new list) */
-static struct fq_flow detached, throttled;
-
+/*
+ * f->tail and f->age share the same location.
+ * We can use the low order bit to differentiate if this location points
+ * to a sk_buff or contains a jiffies value, if we force this value to be odd.
+ * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
+ */
 static void fq_flow_set_detached(struct fq_flow *f)
 {
-        f->next = &detached;
-        f->age = jiffies;
+        f->age = jiffies | 1UL;
 }

 static bool fq_flow_is_detached(const struct fq_flow *f)
 {
-        return f->next == &detached;
+        return !!(f->age & 1UL);
 }
+
+/* special value to mark a throttled flow (not on old/new list) */
+static struct fq_flow throttled;

 static bool fq_flow_is_throttled(const struct fq_flow *f)
 {
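
Aside (not part of the patch): the rewritten detached test relies on alignment. A live f->tail pointer always has its low bit clear because struct sk_buff is at least 2-byte aligned, so storing jiffies | 1UL in the overlapping f->age member marks a flow as detached without the old &detached sentinel. A minimal userspace sketch of the same tagging trick; struct flow, flow_set_detached() and flow_is_detached() are illustrative names, not kernel code:

#include <assert.h>
#include <stdio.h>

struct pkt;                               /* stand-in for struct sk_buff */

struct flow {
        union {
                struct pkt *tail;         /* valid pointer => low bit is 0 */
                unsigned long age;        /* jiffies-like stamp, forced odd */
        };
};

static void flow_set_detached(struct flow *f, unsigned long now)
{
        f->age = now | 1UL;               /* an odd value can never be an aligned pointer */
}

static int flow_is_detached(const struct flow *f)
{
        return f->age & 1UL;
}

int main(void)
{
        struct flow f;
        struct pkt *fake = (struct pkt *)16UL;   /* any even address works */

        flow_set_detached(&f, 1000UL);
        assert(flow_is_detached(&f));

        f.tail = fake;                    /* flow holds packets again */
        assert(!flow_is_detached(&f));
        printf("low-bit tagging distinguishes the two states\n");
        return 0;
}

The union read in flow_is_detached() is the same type punning the kernel helpers above perform on f->tail / f->age.
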
@@ -192,9 +218,10 @@
                   struct rb_root *root,
                   struct sock *sk)
 {
-        struct fq_flow *f, *tofree[FQ_GC_MAX];
         struct rb_node **p, *parent;
-        int fcnt = 0;
+        void *tofree[FQ_GC_MAX];
+        struct fq_flow *f;
+        int i, fcnt = 0;

         p = &root->rb_node;
         parent = NULL;
@@ -217,15 +244,18 @@
                         p = &parent->rb_left;
         }

+        if (!fcnt)
+                return;
+
+        for (i = fcnt; i > 0; ) {
+                f = tofree[--i];
+                rb_erase(&f->fq_node, root);
+        }
         q->flows -= fcnt;
         q->inactive_flows -= fcnt;
         q->stat_gc_flows += fcnt;
-        while (fcnt) {
-                struct fq_flow *f = tofree[--fcnt];

-                rb_erase(&f->fq_node, root);
-                kmem_cache_free(fq_flow_cachep, f);
-        }
+        kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }

 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
@@ -256,6 +286,17 @@
                  */
                 sk = (struct sock *)((hash << 1) | 1UL);
                 skb_orphan(skb);
+        } else if (sk->sk_state == TCP_CLOSE) {
+                unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+                /*
+                 * Sockets in TCP_CLOSE are non connected.
+                 * Typical use case is UDP sockets, they can send packets
+                 * with sendto() to many different destinations.
+                 * We probably could use a generic bit advertising
+                 * non connected sockets, instead of sk_state == TCP_CLOSE,
+                 * if we care enough.
+                 */
+                sk = (struct sock *)((hash << 1) | 1UL);
         }

         root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
@@ -276,10 +317,13 @@
                          * If not, we need to refill credit with
                          * initial quantum
                          */
-                        if (unlikely(skb->sk &&
+                        if (unlikely(skb->sk == sk &&
                                      f->socket_hash != sk->sk_hash)) {
                                 f->credit = q->initial_quantum;
                                 f->socket_hash = sk->sk_hash;
+                                if (q->rate_enable)
+                                        smp_store_release(&sk->sk_pacing_status,
+                                                          SK_PACING_FQ);
                                 if (fq_flow_is_throttled(f))
                                         fq_flow_unset_throttled(q, f);
                                 f->time_next_packet = 0ULL;
@@ -297,10 +341,16 @@
                 q->stat_allocation_errors++;
                 return &q->internal;
         }
+        /* f->t_root is already zeroed after kmem_cache_zalloc() */
+
         fq_flow_set_detached(f);
         f->sk = sk;
-        if (skb->sk)
+        if (skb->sk == sk) {
                 f->socket_hash = sk->sk_hash;
+                if (q->rate_enable)
+                        smp_store_release(&sk->sk_pacing_status,
+                                          SK_PACING_FQ);
+        }
         f->credit = q->initial_quantum;

         rb_link_node(&f->fq_node, parent, p);
@@ -311,78 +361,82 @@
         return f;
 }

-
-/* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+static struct sk_buff *fq_peek(struct fq_flow *flow)
 {
-        struct sk_buff *skb = flow->head;
+        struct sk_buff *skb = skb_rb_first(&flow->t_root);
+        struct sk_buff *head = flow->head;

-        if (skb) {
+        if (!skb)
+                return head;
+
+        if (!head)
+                return skb;
+
+        if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+                return skb;
+        return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+                          struct sk_buff *skb)
+{
+        if (skb == flow->head) {
                 flow->head = skb->next;
-                skb->next = NULL;
-                flow->qlen--;
-                qdisc_qstats_backlog_dec(sch, skb);
-                sch->q.qlen--;
+        } else {
+                rb_erase(&skb->rbnode, &flow->t_root);
+                skb->dev = qdisc_dev(sch);
         }
-        return skb;
 }

-/* We might add in the future detection of retransmits
- * For the time being, just return false
+/* Remove one skb from flow queue.
+ * This skb must be the return value of prior fq_peek().
  */
-static bool skb_is_retransmit(struct sk_buff *skb)
+static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
+                           struct sk_buff *skb)
 {
-        return false;
+        fq_erase_head(sch, flow, skb);
+        skb_mark_not_on_list(skb);
+        flow->qlen--;
+        qdisc_qstats_backlog_dec(sch, skb);
+        sch->q.qlen--;
 }

-/* add skb to flow queue
- * flow queue is a linked list, kind of FIFO, except for TCP retransmits
- * We special case tcp retransmits to be transmitted before other packets.
- * We rely on fact that TCP retransmits are unlikely, so we do not waste
- * a separate queue or a pointer.
- * head->  [retrans pkt 1]
- *         [retrans pkt 2]
- *         [ normal pkt 1]
- *         [ normal pkt 2]
- *         [ normal pkt 3]
- * tail->  [ normal pkt 4]
- */
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 {
-        struct sk_buff *prev, *head = flow->head;
+        struct rb_node **p, *parent;
+        struct sk_buff *head, *aux;

-        skb->next = NULL;
-        if (!head) {
-                flow->head = skb;
-                flow->tail = skb;
-                return;
-        }
-        if (likely(!skb_is_retransmit(skb))) {
-                flow->tail->next = skb;
-                flow->tail = skb;
-                return;
-        }
-
-        /* This skb is a tcp retransmit,
-         * find the last retrans packet in the queue
-         */
-        prev = NULL;
-        while (skb_is_retransmit(head)) {
-                prev = head;
-                head = head->next;
+        head = flow->head;
+        if (!head ||
+            fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
                 if (!head)
-                        break;
-        }
-        if (!prev) { /* no rtx packet in queue, become the new head */
-                skb->next = flow->head;
-                flow->head = skb;
-        } else {
-                if (prev == flow->tail)
-                        flow->tail = skb;
+                        flow->head = skb;
                 else
-                        skb->next = prev->next;
-                prev->next = skb;
+                        flow->tail->next = skb;
+                flow->tail = skb;
+                skb->next = NULL;
+                return;
         }
+
+        p = &flow->t_root.rb_node;
+        parent = NULL;
+
+        while (*p) {
+                parent = *p;
+                aux = rb_to_skb(parent);
+                if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+                        p = &parent->rb_right;
+                else
+                        p = &parent->rb_left;
+        }
+        rb_link_node(&skb->rbnode, parent, p);
+        rb_insert_color(&skb->rbnode, &flow->t_root);
+}
+
+static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
+                                     const struct fq_sched_data *q)
+{
+        return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
 }

 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
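
Illustration (not part of the patch): flow_queue_add() above keeps the common case O(1). A packet whose time_to_send is not older than the current tail is appended to the (head, tail) linked list; only out-of-order packets go into the per-flow rbtree, and fq_peek() returns whichever of the two candidate heads is due first. A standalone sketch of that policy, using a sorted singly linked list where sch_fq uses the rbtree; struct pkt, struct flow, flow_queue_add() and flow_peek() here are simplified, hypothetical stand-ins:

#include <stdio.h>

struct pkt {
        unsigned long long time_to_send;
        struct pkt *next;
};

struct flow {
        struct pkt *head, *tail;   /* in-order packets, O(1) tail append */
        struct pkt *ooo;           /* out-of-order packets, kept sorted:
                                    * stands in for the kernel's rbtree */
};

static void flow_queue_add(struct flow *f, struct pkt *p)
{
        struct pkt **pp;

        /* Common case: time_to_send is non-decreasing, append to the list. */
        if (!f->head || p->time_to_send >= f->tail->time_to_send) {
                if (!f->head)
                        f->head = p;
                else
                        f->tail->next = p;
                f->tail = p;
                p->next = NULL;
                return;
        }

        /* Out-of-order packet: sorted insert, the rbtree's job in sch_fq. */
        for (pp = &f->ooo; *pp && (*pp)->time_to_send <= p->time_to_send;
             pp = &(*pp)->next)
                ;
        p->next = *pp;
        *pp = p;
}

static struct pkt *flow_peek(const struct flow *f)
{
        if (!f->ooo)
                return f->head;
        if (!f->head)
                return f->ooo;
        return f->ooo->time_to_send < f->head->time_to_send ? f->ooo : f->head;
}

int main(void)
{
        struct flow f = {0};
        struct pkt a = { .time_to_send = 10 };
        struct pkt b = { .time_to_send = 30 };
        struct pkt c = { .time_to_send = 20 };   /* arrives out of order */

        flow_queue_add(&f, &a);
        flow_queue_add(&f, &b);
        flow_queue_add(&f, &c);
        printf("next to send: %llu\n", flow_peek(&f)->time_to_send);  /* 10 */
        return 0;
}

Running it prints 10: the out-of-order packet (20) sits in the sorted side structure, and peeking still yields the earliest deadline first.
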
@@ -394,6 +448,28 @@
         if (unlikely(sch->q.qlen >= sch->limit))
                 return qdisc_drop(skb, sch, to_free);

+        if (!skb->tstamp) {
+                fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+        } else {
+                /* Check if packet timestamp is too far in the future.
+                 * Try first if our cached value, to avoid ktime_get_ns()
+                 * cost in most cases.
+                 */
+                if (fq_packet_beyond_horizon(skb, q)) {
+                        /* Refresh our cache and check another time */
+                        q->ktime_cache = ktime_get_ns();
+                        if (fq_packet_beyond_horizon(skb, q)) {
+                                if (q->horizon_drop) {
+                                        q->stat_horizon_drops++;
+                                        return qdisc_drop(skb, sch, to_free);
+                                }
+                                q->stat_horizon_caps++;
+                                skb->tstamp = q->ktime_cache + q->horizon;
+                        }
+                }
+                fq_skb_cb(skb)->time_to_send = skb->tstamp;
+        }
+
         f = fq_classify(skb, q);
         if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
                 q->stat_flows_plimit++;
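
Not part of the patch: the enqueue path above derives time_to_send from skb->tstamp (the EDT model) and enforces a horizon. With the defaults set in fq_init() further down (10 s horizon, horizon_drop = 1), a timestamp more than 10 s beyond the cached clock is dropped; with horizon_drop = 0 it is capped to ktime_cache + horizon, and the clock cache is only refreshed when a packet looks out of range. A condensed standalone sketch of that decision; struct fq_cfg, classify_tstamp() and the verdict enum are hypothetical, simplified names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fq_cfg {
        uint64_t ktime_cache;   /* last clock sample, in ns */
        uint64_t horizon;       /* allowed lead time, in ns */
        bool     horizon_drop;
};

enum verdict { SEND_AT_TSTAMP, CAP_TO_HORIZON, DROP };

/* "now" stands in for a fresh ktime_get_ns(); like fq_enqueue(), the cached
 * clock is trusted first and only refreshed when the timestamp looks too far
 * in the future.
 */
static enum verdict classify_tstamp(struct fq_cfg *q, uint64_t tstamp,
                                    uint64_t now)
{
        if ((int64_t)(tstamp - (q->ktime_cache + q->horizon)) <= 0)
                return SEND_AT_TSTAMP;          /* within horizon per cached clock */

        q->ktime_cache = now;                   /* refresh and re-check */
        if ((int64_t)(tstamp - (q->ktime_cache + q->horizon)) <= 0)
                return SEND_AT_TSTAMP;

        return q->horizon_drop ? DROP : CAP_TO_HORIZON;
}

int main(void)
{
        struct fq_cfg q = { .ktime_cache = 0,
                            .horizon = 10ULL * 1000000000ULL,   /* 10 s */
                            .horizon_drop = true };

        /* Timestamp 25 s ahead while the clock reads 1 s: dropped by default. */
        printf("%s\n", classify_tstamp(&q, 25ULL * 1000000000ULL,
                                       1ULL * 1000000000ULL) == DROP ?
                       "drop" : "keep");
        return 0;
}
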
@@ -401,21 +477,11 @@
         }

         f->qlen++;
-        if (skb_is_retransmit(skb))
-                q->stat_tcp_retrans++;
         qdisc_qstats_backlog_inc(sch, skb);
         if (fq_flow_is_detached(f)) {
-                struct sock *sk = skb->sk;
-
                 fq_flow_add_tail(&q->new_flows, f);
                 if (time_after(jiffies, f->age + q->flow_refill_delay))
                         f->credit = max_t(u32, f->credit, q->quantum);
-                if (sk && q->rate_enable) {
-                        if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
-                                     SK_PACING_FQ))
-                                smp_store_release(&sk->sk_pacing_status,
-                                                  SK_PACING_FQ);
-                }
                 q->inactive_flows--;
         }

@@ -460,15 +526,23 @@
 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 {
         struct fq_sched_data *q = qdisc_priv(sch);
-        u64 now = ktime_get_ns();
         struct fq_flow_head *head;
         struct sk_buff *skb;
         struct fq_flow *f;
-        u32 rate, plen;
+        unsigned long rate;
+        u32 plen;
+        u64 now;

-        skb = fq_dequeue_head(sch, &q->internal);
-        if (skb)
+        if (!sch->q.qlen)
+                return NULL;
+
+        skb = fq_peek(&q->internal);
+        if (unlikely(skb)) {
+                fq_dequeue_skb(sch, &q->internal, skb);
                 goto out;
+        }
+
+        q->ktime_cache = now = ktime_get_ns();
         fq_check_throttled(q, now);
 begin:
         head = &q->new_flows;
@@ -476,8 +550,9 @@
                 head = &q->old_flows;
                 if (!head->first) {
                         if (q->time_next_delayed_flow != ~0ULL)
-                                qdisc_watchdog_schedule_ns(&q->watchdog,
-                                                           q->time_next_delayed_flow);
+                                qdisc_watchdog_schedule_range_ns(&q->watchdog,
+                                                                 q->time_next_delayed_flow,
+                                                                 q->timer_slack);
                         return NULL;
                 }
         }
@@ -490,16 +565,24 @@
                 goto begin;
         }

-        skb = f->head;
-        if (unlikely(skb && now < f->time_next_packet &&
-                     !skb_is_tcp_pure_ack(skb))) {
-                head->first = f->next;
-                fq_flow_set_throttled(q, f);
-                goto begin;
-        }
+        skb = fq_peek(f);
+        if (skb) {
+                u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
+                                             f->time_next_packet);

-        skb = fq_dequeue_head(sch, f);
-        if (!skb) {
+                if (now < time_next_packet) {
+                        head->first = f->next;
+                        f->time_next_packet = time_next_packet;
+                        fq_flow_set_throttled(q, f);
+                        goto begin;
+                }
+                prefetch(&skb->end);
+                if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
+                        INET_ECN_set_ce(skb);
+                        q->stat_ce_mark++;
+                }
+                fq_dequeue_skb(sch, f, skb);
+        } else {
                 head->first = f->next;
                 /* force a pass through old_flows to prevent starvation */
                 if ((head == &q->new_flows) && q->old_flows.first) {
@@ -510,33 +593,35 @@
                 }
                 goto begin;
         }
-        prefetch(&skb->end);
-        f->credit -= qdisc_pkt_len(skb);
+        plen = qdisc_pkt_len(skb);
+        f->credit -= plen;

         if (!q->rate_enable)
                 goto out;

-        /* Do not pace locally generated ack packets */
-        if (skb_is_tcp_pure_ack(skb))
-                goto out;
-
         rate = q->flow_max_rate;
-        if (skb->sk)
-                rate = min(skb->sk->sk_pacing_rate, rate);

-        if (rate <= q->low_rate_threshold) {
-                f->credit = 0;
-                plen = qdisc_pkt_len(skb);
-        } else {
-                plen = max(qdisc_pkt_len(skb), q->quantum);
-                if (f->credit > 0)
-                        goto out;
+        /* If EDT time was provided for this skb, we need to
+         * update f->time_next_packet only if this qdisc enforces
+         * a flow max rate.
+         */
+        if (!skb->tstamp) {
+                if (skb->sk)
+                        rate = min(skb->sk->sk_pacing_rate, rate);
+
+                if (rate <= q->low_rate_threshold) {
+                        f->credit = 0;
+                } else {
+                        plen = max(plen, q->quantum);
+                        if (f->credit > 0)
+                                goto out;
+                }
         }
-        if (rate != ~0U) {
+        if (rate != ~0UL) {
                 u64 len = (u64)plen * NSEC_PER_SEC;

                 if (likely(rate))
-                        do_div(len, rate);
+                        len = div64_ul(len, rate);
                 /* Since socket rate can change later,
                  * clamp the delay to 1 second.
                  * Really, providers of too big packets should be fixed !
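
Worked example (not part of the patch): when a pacing rate applies, the hunk above computes the inter-packet gap as len = plen * NSEC_PER_SEC / rate and, per the comment, clamps it to one second. A 1500-byte packet paced at 12,500,000 B/s (100 Mbit/s) gives 1500 * 10^9 / 12500000 = 120000 ns, i.e. 120 us between packets. A standalone check of that arithmetic; pacing_delay_ns() is a hypothetical helper, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define MAX_DELAY_NS NSEC_PER_SEC       /* sch_fq clamps the gap to 1 second */

/* Inter-packet gap used for pacing: packet length scaled by the byte rate. */
static uint64_t pacing_delay_ns(uint32_t plen, unsigned long rate_bytes_per_sec)
{
        uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

        if (rate_bytes_per_sec)
                len /= rate_bytes_per_sec;      /* div64_ul() in the kernel */
        if (len > MAX_DELAY_NS)
                len = MAX_DELAY_NS;
        return len;
}

int main(void)
{
        /* 1500-byte packet at 12.5 MB/s (100 Mbit/s) => 120000 ns = 120 us */
        printf("%llu ns\n",
               (unsigned long long)pacing_delay_ns(1500, 12500000UL));
        return 0;
}
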
@@ -560,6 +645,15 @@

 static void fq_flow_purge(struct fq_flow *flow)
 {
+        struct rb_node *p = rb_first(&flow->t_root);
+
+        while (p) {
+                struct sk_buff *skb = rb_to_skb(p);
+
+                p = rb_next(p);
+                rb_erase(&skb->rbnode, &flow->t_root);
+                rtnl_kfree_skbs(skb, skb);
+        }
         rtnl_kfree_skbs(flow->head, flow->tail);
         flow->head = NULL;
         flow->qlen = 0;
@@ -685,11 +779,17 @@
         return 0;
 }

+static struct netlink_range_validation iq_range = {
+        .max = INT_MAX,
+};
+
 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+        [TCA_FQ_UNSPEC]                 = { .strict_start_type = TCA_FQ_TIMER_SLACK },
+
         [TCA_FQ_PLIMIT]                 = { .type = NLA_U32 },
         [TCA_FQ_FLOW_PLIMIT]            = { .type = NLA_U32 },
         [TCA_FQ_QUANTUM]                = { .type = NLA_U32 },
-        [TCA_FQ_INITIAL_QUANTUM]        = { .type = NLA_U32 },
+        [TCA_FQ_INITIAL_QUANTUM]        = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
         [TCA_FQ_RATE_ENABLE]            = { .type = NLA_U32 },
         [TCA_FQ_FLOW_DEFAULT_RATE]      = { .type = NLA_U32 },
         [TCA_FQ_FLOW_MAX_RATE]          = { .type = NLA_U32 },
@@ -697,6 +797,10 @@
         [TCA_FQ_FLOW_REFILL_DELAY]      = { .type = NLA_U32 },
         [TCA_FQ_ORPHAN_MASK]            = { .type = NLA_U32 },
         [TCA_FQ_LOW_RATE_THRESHOLD]     = { .type = NLA_U32 },
+        [TCA_FQ_CE_THRESHOLD]           = { .type = NLA_U32 },
+        [TCA_FQ_TIMER_SLACK]            = { .type = NLA_U32 },
+        [TCA_FQ_HORIZON]                = { .type = NLA_U32 },
+        [TCA_FQ_HORIZON_DROP]           = { .type = NLA_U8 },
 };

 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -711,7 +815,8 @@
         if (!opt)
                 return -EINVAL;

-        err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL);
+        err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
+                                          NULL);
         if (err < 0)
                 return err;

@@ -751,9 +856,11 @@
                 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
                                     nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

-        if (tb[TCA_FQ_FLOW_MAX_RATE])
-                q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+        if (tb[TCA_FQ_FLOW_MAX_RATE]) {
+                u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

+                q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+        }
         if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
                 q->low_rate_threshold =
                         nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -776,7 +883,22 @@
         if (tb[TCA_FQ_ORPHAN_MASK])
                 q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

+        if (tb[TCA_FQ_CE_THRESHOLD])
+                q->ce_threshold = (u64)NSEC_PER_USEC *
+                                  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
+        if (tb[TCA_FQ_TIMER_SLACK])
+                q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
+
+        if (tb[TCA_FQ_HORIZON])
+                q->horizon = (u64)NSEC_PER_USEC *
+                             nla_get_u32(tb[TCA_FQ_HORIZON]);
+
+        if (tb[TCA_FQ_HORIZON_DROP])
+                q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
+
         if (!err) {
+
                 sch_tree_unlock(sch);
                 err = fq_resize(sch, fq_log);
                 sch_tree_lock(sch);
@@ -816,7 +938,7 @@
         q->quantum = 2 * psched_mtu(qdisc_dev(sch));
         q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
         q->flow_refill_delay = msecs_to_jiffies(40);
-        q->flow_max_rate = ~0U;
+        q->flow_max_rate = ~0UL;
         q->time_next_delayed_flow = ~0ULL;
         q->rate_enable = 1;
         q->new_flows.first = NULL;
@@ -826,7 +948,16 @@
         q->fq_trees_log = ilog2(1024);
         q->orphan_mask = 1024 - 1;
         q->low_rate_threshold = 550000 / 8;
-        qdisc_watchdog_init(&q->watchdog, sch);
+
+        q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+
+        q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
+        q->horizon_drop = 1; /* by default, drop packets beyond horizon */
+
+        /* Default ce_threshold of 4294 seconds */
+        q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+
+        qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

         if (opt)
                 err = fq_change(sch, opt, extack);
@@ -839,26 +970,36 @@
 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
         struct fq_sched_data *q = qdisc_priv(sch);
+        u64 ce_threshold = q->ce_threshold;
+        u64 horizon = q->horizon;
         struct nlattr *opts;

-        opts = nla_nest_start(skb, TCA_OPTIONS);
+        opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
         if (opts == NULL)
                 goto nla_put_failure;

         /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+        do_div(ce_threshold, NSEC_PER_USEC);
+        do_div(horizon, NSEC_PER_USEC);

         if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
             nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
             nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
             nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
             nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
-            nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+            nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
+                        min_t(unsigned long, q->flow_max_rate, ~0U)) ||
             nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
                         jiffies_to_usecs(q->flow_refill_delay)) ||
             nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
             nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
                         q->low_rate_threshold) ||
-            nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+            nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
+            nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
+            nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
+            nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
+            nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
                 goto nla_put_failure;

         return nla_nest_end(skb, opts);
@@ -876,17 +1017,21 @@

         st.gc_flows               = q->stat_gc_flows;
         st.highprio_packets       = q->stat_internal_packets;
-        st.tcp_retrans            = q->stat_tcp_retrans;
+        st.tcp_retrans            = 0;
         st.throttled              = q->stat_throttled;
         st.flows_plimit           = q->stat_flows_plimit;
         st.pkts_too_long          = q->stat_pkts_too_long;
         st.allocation_errors      = q->stat_allocation_errors;
-        st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+        st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
+                                    ktime_get_ns();
         st.flows                  = q->flows;
         st.inactive_flows         = q->inactive_flows;
         st.throttled_flows        = q->throttled_flows;
         st.unthrottle_latency_ns  = min_t(unsigned long,
                                           q->unthrottle_latency_ns, ~0U);
+        st.ce_mark                = q->stat_ce_mark;
+        st.horizon_drops          = q->stat_horizon_drops;
+        st.horizon_caps           = q->stat_horizon_caps;
         sch_tree_unlock(sch);

         return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -934,3 +1079,4 @@
 module_exit(fq_module_exit)
 MODULE_AUTHOR("Eric Dumazet");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Fair Queue Packet Scheduler");