.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
1 | 2 | /* |
2 | 3 | * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) |
3 | 4 | * |
4 | 5 | * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> |
5 | | - * |
6 | | - * This program is free software; you can redistribute it and/or |
7 | | - * modify it under the terms of the GNU General Public License |
8 | | - * as published by the Free Software Foundation; either version |
9 | | - * 2 of the License, or (at your option) any later version. |
10 | 6 | * |
11 | 7 | * Meant to be mostly used for locally generated traffic : |
12 | 8 | * Fast classification depends on skb->sk being set before reaching us. |
.. | .. |
54 | 50 | #include <net/tcp_states.h> |
55 | 51 | #include <net/tcp.h> |
56 | 52 | |
| 53 | +struct fq_skb_cb { |
| 54 | + u64 time_to_send; |
| 55 | +}; |
| 56 | + |
| 57 | +static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) |
| 58 | +{ |
| 59 | + qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); |
| 60 | + return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; |
| 61 | +} |
| 62 | + |
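The fq_skb_cb() helper added above carves a private control block out of the generic qdisc cb area of each skb, with qdisc_cb_private_validate() checking that the private struct fits. A rough userspace sketch of the same pattern (all names here are hypothetical, not kernel API):

```c
#include <stdint.h>

/* Hypothetical stand-in for the generic per-packet scratch area
 * (the kernel's qdisc_skb_cb(skb)->data plays this role).
 */
struct pkt {
	unsigned char cb[48];
};

/* Private per-packet state this scheduler wants to carry. */
struct my_skb_cb {
	uint64_t time_to_send;
};

static struct my_skb_cb *my_skb_cb(struct pkt *p)
{
	/* Compile-time analogue of qdisc_cb_private_validate(): the
	 * private struct must fit inside the shared scratch area.
	 */
	_Static_assert(sizeof(struct my_skb_cb) <= sizeof(((struct pkt *)0)->cb),
		       "private cb does not fit");
	return (struct my_skb_cb *)p->cb;
}
```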
57 | 63 | /* |
58 | | - * Per flow structure, dynamically allocated |
| 64 | + * Per flow structure, dynamically allocated. |
| 65 | + * If packets have monotonically increasing time_to_send, they are placed in O(1) |
| 66 | + * in a linear list (head,tail); otherwise they are placed in an rbtree (t_root). |
59 | 67 | */ |
60 | 68 | struct fq_flow { |
| 69 | +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ |
| 70 | + struct rb_root t_root; |
61 | 71 | struct sk_buff *head; /* list of skbs for this flow : first skb */ |
62 | 72 | union { |
63 | 73 | struct sk_buff *tail; /* last skb in the list */ |
64 | | - unsigned long age; /* jiffies when flow was emptied, for gc */ |
| 74 | + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ |
65 | 75 | }; |
66 | 76 | struct rb_node fq_node; /* anchor in fq_root[] trees */ |
67 | 77 | struct sock *sk; |
68 | | - int qlen; /* number of packets in flow queue */ |
69 | | - int credit; |
70 | 78 | u32 socket_hash; /* sk_hash */ |
71 | | - struct fq_flow *next; /* next pointer in RR lists, or &detached */ |
| 79 | + int qlen; /* number of packets in flow queue */ |
| 80 | + |
| 81 | +/* Second cache line, used in fq_dequeue() */ |
| 82 | + int credit; |
| 83 | + /* 32bit hole on 64bit arches */ |
| 84 | + |
| 85 | + struct fq_flow *next; /* next pointer in RR lists */ |
72 | 86 | |
73 | 87 | struct rb_node rate_node; /* anchor in q->delayed tree */ |
74 | 88 | u64 time_next_packet; |
75 | | -}; |
| 89 | +} ____cacheline_aligned_in_smp; |
76 | 90 | |
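The new layout comments and ____cacheline_aligned_in_smp group the fields shared by enqueue/gc/dequeue on the first cache line and the dequeue-only fields on the second, so each fast path touches fewer lines. A minimal userspace sketch of the same idea (field names illustrative only):

```c
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

struct flow {
	/* first cache line: fields shared by enqueue and dequeue */
	alignas(64) void *t_root;
	void *head;
	void *tail;
	int qlen;

	/* second cache line: fields used only by dequeue */
	alignas(64) int credit;
	struct flow *next;
};

int main(void)
{
	/* The struct itself is 64-byte aligned, so the grouping holds
	 * even when instances come from an array or slab-style allocator.
	 */
	printf("sizeof=%zu alignof=%zu\n", sizeof(struct flow),
	       (size_t)alignof(struct flow));
	return 0;
}
```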
77 | 91 | struct fq_flow_head { |
78 | 92 | struct fq_flow *first; |
.. | .. |
86 | 100 | |
87 | 101 | struct rb_root delayed; /* for rate limited flows */ |
88 | 102 | u64 time_next_delayed_flow; |
| 103 | + u64 ktime_cache; /* copy of last ktime_get_ns() */ |
89 | 104 | unsigned long unthrottle_latency_ns; |
90 | 105 | |
91 | 106 | struct fq_flow internal; /* for non classified or high prio packets */ |
92 | 107 | u32 quantum; |
93 | 108 | u32 initial_quantum; |
94 | 109 | u32 flow_refill_delay; |
95 | | - u32 flow_max_rate; /* optional max rate per flow */ |
96 | 110 | u32 flow_plimit; /* max packets per flow */ |
| 111 | + unsigned long flow_max_rate; /* optional max rate per flow */ |
| 112 | + u64 ce_threshold; |
| 113 | + u64 horizon; /* horizon in ns */ |
97 | 114 | u32 orphan_mask; /* mask for orphaned skb */ |
98 | 115 | u32 low_rate_threshold; |
99 | 116 | struct rb_root *fq_root; |
100 | 117 | u8 rate_enable; |
101 | 118 | u8 fq_trees_log; |
102 | | - |
| 119 | + u8 horizon_drop; |
103 | 120 | u32 flows; |
104 | 121 | u32 inactive_flows; |
105 | 122 | u32 throttled_flows; |
106 | 123 | |
107 | 124 | u64 stat_gc_flows; |
108 | 125 | u64 stat_internal_packets; |
109 | | - u64 stat_tcp_retrans; |
110 | 126 | u64 stat_throttled; |
| 127 | + u64 stat_ce_mark; |
| 128 | + u64 stat_horizon_drops; |
| 129 | + u64 stat_horizon_caps; |
111 | 130 | u64 stat_flows_plimit; |
112 | 131 | u64 stat_pkts_too_long; |
113 | 132 | u64 stat_allocation_errors; |
| 133 | + |
| 134 | + u32 timer_slack; /* hrtimer slack in ns */ |
114 | 135 | struct qdisc_watchdog watchdog; |
115 | 136 | }; |
116 | 137 | |
117 | | -/* special value to mark a detached flow (not on old/new list) */ |
118 | | -static struct fq_flow detached, throttled; |
119 | | - |
| 138 | +/* |
| 139 | + * f->tail and f->age share the same location. |
| 140 | + * We can use the low order bit to tell whether this location points |
| 141 | + * to a sk_buff or contains a jiffies value, if we force that value to be odd. |
| 142 | + * This works because the low order bit of f->tail is 0, since alignof(struct sk_buff) >= 2. |
| 143 | + */ |
120 | 144 | static void fq_flow_set_detached(struct fq_flow *f) |
121 | 145 | { |
122 | | - f->next = &detached; |
123 | | - f->age = jiffies; |
| 146 | + f->age = jiffies | 1UL; |
124 | 147 | } |
125 | 148 | |
126 | 149 | static bool fq_flow_is_detached(const struct fq_flow *f) |
127 | 150 | { |
128 | | - return f->next == &detached; |
| 151 | + return !!(f->age & 1UL); |
129 | 152 | } |
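Replacing the &detached sentinel pointer with an odd jiffies value stored over f->tail is a classic low-bit tagging trick: a valid sk_buff pointer is at least 2-byte aligned, so its low bit is always 0, and an odd value can never be mistaken for one. A self-contained sketch (names hypothetical):

```c
#include <assert.h>
#include <stdio.h>

struct item { int v; };	/* stands in for sk_buff; alignment >= 2 */

struct flow {
	union {
		struct item *tail;	/* valid while the flow is queued */
		unsigned long age;	/* (timestamp | 1UL) once detached */
	};
};

static void set_detached(struct flow *f, unsigned long now)
{
	f->age = now | 1UL;	/* odd: cannot be a valid item pointer */
}

static int is_detached(const struct flow *f)
{
	return f->age & 1UL;
}

int main(void)
{
	static struct item it;
	struct flow f = { .tail = &it };

	assert(((unsigned long)(void *)&it & 1UL) == 0);
	printf("%d", is_detached(&f));		/* 0: tail is a pointer */
	set_detached(&f, 12345);
	printf(" %d\n", is_detached(&f));	/* 1: age is odd */
	return 0;
}
```

This saves one pointer per flow and one store on the detach path, at the cost of the union aliasing documented in the comment above.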
| 153 | + |
| 154 | +/* special value to mark a throttled flow (not on old/new list) */ |
| 155 | +static struct fq_flow throttled; |
130 | 156 | |
131 | 157 | static bool fq_flow_is_throttled(const struct fq_flow *f) |
132 | 158 | { |
.. | .. |
192 | 218 | struct rb_root *root, |
193 | 219 | struct sock *sk) |
194 | 220 | { |
195 | | - struct fq_flow *f, *tofree[FQ_GC_MAX]; |
196 | 221 | struct rb_node **p, *parent; |
197 | | - int fcnt = 0; |
| 222 | + void *tofree[FQ_GC_MAX]; |
| 223 | + struct fq_flow *f; |
| 224 | + int i, fcnt = 0; |
198 | 225 | |
199 | 226 | p = &root->rb_node; |
200 | 227 | parent = NULL; |
.. | .. |
217 | 244 | p = &parent->rb_left; |
218 | 245 | } |
219 | 246 | |
| 247 | + if (!fcnt) |
| 248 | + return; |
| 249 | + |
| 250 | + for (i = fcnt; i > 0; ) { |
| 251 | + f = tofree[--i]; |
| 252 | + rb_erase(&f->fq_node, root); |
| 253 | + } |
220 | 254 | q->flows -= fcnt; |
221 | 255 | q->inactive_flows -= fcnt; |
222 | 256 | q->stat_gc_flows += fcnt; |
223 | | - while (fcnt) { |
224 | | - struct fq_flow *f = tofree[--fcnt]; |
225 | 257 | |
226 | | - rb_erase(&f->fq_node, root); |
227 | | - kmem_cache_free(fq_flow_cachep, f); |
228 | | - } |
| 258 | + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); |
229 | 259 | } |
230 | 260 | |
231 | 261 | static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) |
.. | .. |
256 | 286 | */ |
257 | 287 | sk = (struct sock *)((hash << 1) | 1UL); |
258 | 288 | skb_orphan(skb); |
| 289 | + } else if (sk->sk_state == TCP_CLOSE) { |
| 290 | + unsigned long hash = skb_get_hash(skb) & q->orphan_mask; |
| 291 | + /* |
| 292 | + * Sockets in TCP_CLOSE are not connected. |
| 293 | + * A typical use case is UDP sockets; they can send packets |
| 294 | + * with sendto() to many different destinations. |
| 295 | + * We probably could use a generic bit advertising |
| 296 | + * non-connected sockets, instead of sk_state == TCP_CLOSE, |
| 297 | + * if we care enough. |
| 298 | + */ |
| 299 | + sk = (struct sock *)((hash << 1) | 1UL); |
259 | 300 | } |
260 | 301 | |
261 | 302 | root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; |
.. | .. |
276 | 317 | * If not, we need to refill credit with |
277 | 318 | * initial quantum |
278 | 319 | */ |
279 | | - if (unlikely(skb->sk && |
| 320 | + if (unlikely(skb->sk == sk && |
280 | 321 | f->socket_hash != sk->sk_hash)) { |
281 | 322 | f->credit = q->initial_quantum; |
282 | 323 | f->socket_hash = sk->sk_hash; |
| 324 | + if (q->rate_enable) |
| 325 | + smp_store_release(&sk->sk_pacing_status, |
| 326 | + SK_PACING_FQ); |
283 | 327 | if (fq_flow_is_throttled(f)) |
284 | 328 | fq_flow_unset_throttled(q, f); |
285 | 329 | f->time_next_packet = 0ULL; |
.. | .. |
297 | 341 | q->stat_allocation_errors++; |
298 | 342 | return &q->internal; |
299 | 343 | } |
| 344 | + /* f->t_root is already zeroed after kmem_cache_zalloc() */ |
| 345 | + |
300 | 346 | fq_flow_set_detached(f); |
301 | 347 | f->sk = sk; |
302 | | - if (skb->sk) |
| 348 | + if (skb->sk == sk) { |
303 | 349 | f->socket_hash = sk->sk_hash; |
| 350 | + if (q->rate_enable) |
| 351 | + smp_store_release(&sk->sk_pacing_status, |
| 352 | + SK_PACING_FQ); |
| 353 | + } |
304 | 354 | f->credit = q->initial_quantum; |
305 | 355 | |
306 | 356 | rb_link_node(&f->fq_node, parent, p); |
.. | .. |
311 | 361 | return f; |
312 | 362 | } |
313 | 363 | |
314 | | - |
315 | | -/* remove one skb from head of flow queue */ |
316 | | -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) |
| 364 | +static struct sk_buff *fq_peek(struct fq_flow *flow) |
317 | 365 | { |
318 | | - struct sk_buff *skb = flow->head; |
| 366 | + struct sk_buff *skb = skb_rb_first(&flow->t_root); |
| 367 | + struct sk_buff *head = flow->head; |
319 | 368 | |
320 | | - if (skb) { |
| 369 | + if (!skb) |
| 370 | + return head; |
| 371 | + |
| 372 | + if (!head) |
| 373 | + return skb; |
| 374 | + |
| 375 | + if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) |
| 376 | + return skb; |
| 377 | + return head; |
| 378 | +} |
| 379 | + |
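fq_peek() merges two sources, the in-order linear list and the out-of-order rbtree, and returns whichever candidate is due first. The selection logic, reduced to a userspace sketch (struct and field names illustrative):

```c
#include <stdint.h>
#include <stddef.h>

struct pkt { uint64_t time_to_send; };

/* Return the earlier of the list head and the leftmost tree entry,
 * tolerating either source being empty (NULL).
 */
static struct pkt *peek_earliest(struct pkt *list_head, struct pkt *tree_first)
{
	if (!tree_first)
		return list_head;
	if (!list_head)
		return tree_first;
	return tree_first->time_to_send < list_head->time_to_send ?
	       tree_first : list_head;
}
```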
| 380 | +static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, |
| 381 | + struct sk_buff *skb) |
| 382 | +{ |
| 383 | + if (skb == flow->head) { |
321 | 384 | flow->head = skb->next; |
322 | | - skb->next = NULL; |
323 | | - flow->qlen--; |
324 | | - qdisc_qstats_backlog_dec(sch, skb); |
325 | | - sch->q.qlen--; |
| 385 | + } else { |
| 386 | + rb_erase(&skb->rbnode, &flow->t_root); |
| 387 | + skb->dev = qdisc_dev(sch); |
326 | 388 | } |
327 | | - return skb; |
328 | 389 | } |
329 | 390 | |
330 | | -/* We might add in the future detection of retransmits |
331 | | - * For the time being, just return false |
| 391 | +/* Remove one skb from flow queue. |
| 392 | + * This skb must be the return value of prior fq_peek(). |
332 | 393 | */ |
333 | | -static bool skb_is_retransmit(struct sk_buff *skb) |
| 394 | +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, |
| 395 | + struct sk_buff *skb) |
334 | 396 | { |
335 | | - return false; |
| 397 | + fq_erase_head(sch, flow, skb); |
| 398 | + skb_mark_not_on_list(skb); |
| 399 | + flow->qlen--; |
| 400 | + qdisc_qstats_backlog_dec(sch, skb); |
| 401 | + sch->q.qlen--; |
336 | 402 | } |
337 | 403 | |
338 | | -/* add skb to flow queue |
339 | | - * flow queue is a linked list, kind of FIFO, except for TCP retransmits |
340 | | - * We special case tcp retransmits to be transmitted before other packets. |
341 | | - * We rely on fact that TCP retransmits are unlikely, so we do not waste |
342 | | - * a separate queue or a pointer. |
343 | | - * head-> [retrans pkt 1] |
344 | | - * [retrans pkt 2] |
345 | | - * [ normal pkt 1] |
346 | | - * [ normal pkt 2] |
347 | | - * [ normal pkt 3] |
348 | | - * tail-> [ normal pkt 4] |
349 | | - */ |
350 | 404 | static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) |
351 | 405 | { |
352 | | - struct sk_buff *prev, *head = flow->head; |
| 406 | + struct rb_node **p, *parent; |
| 407 | + struct sk_buff *head, *aux; |
353 | 408 | |
354 | | - skb->next = NULL; |
355 | | - if (!head) { |
356 | | - flow->head = skb; |
357 | | - flow->tail = skb; |
358 | | - return; |
359 | | - } |
360 | | - if (likely(!skb_is_retransmit(skb))) { |
361 | | - flow->tail->next = skb; |
362 | | - flow->tail = skb; |
363 | | - return; |
364 | | - } |
365 | | - |
366 | | - /* This skb is a tcp retransmit, |
367 | | - * find the last retrans packet in the queue |
368 | | - */ |
369 | | - prev = NULL; |
370 | | - while (skb_is_retransmit(head)) { |
371 | | - prev = head; |
372 | | - head = head->next; |
| 409 | + head = flow->head; |
| 410 | + if (!head || |
| 411 | + fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { |
373 | 412 | if (!head) |
374 | | - break; |
375 | | - } |
376 | | - if (!prev) { /* no rtx packet in queue, become the new head */ |
377 | | - skb->next = flow->head; |
378 | | - flow->head = skb; |
379 | | - } else { |
380 | | - if (prev == flow->tail) |
381 | | - flow->tail = skb; |
| 413 | + flow->head = skb; |
382 | 414 | else |
383 | | - skb->next = prev->next; |
384 | | - prev->next = skb; |
| 415 | + flow->tail->next = skb; |
| 416 | + flow->tail = skb; |
| 417 | + skb->next = NULL; |
| 418 | + return; |
385 | 419 | } |
| 420 | + |
| 421 | + p = &flow->t_root.rb_node; |
| 422 | + parent = NULL; |
| 423 | + |
| 424 | + while (*p) { |
| 425 | + parent = *p; |
| 426 | + aux = rb_to_skb(parent); |
| 427 | + if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) |
| 428 | + p = &parent->rb_right; |
| 429 | + else |
| 430 | + p = &parent->rb_left; |
| 431 | + } |
| 432 | + rb_link_node(&skb->rbnode, parent, p); |
| 433 | + rb_insert_color(&skb->rbnode, &flow->t_root); |
| 434 | +} |
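flow_queue_add() keeps the common case O(1): if the new packet's time_to_send is not older than the current tail's, it is appended to the linear list; only out-of-order packets pay for a sorted insert into the rbtree. A sketch of that two-tier enqueue, with a plain sorted list standing in for the rbtree (all names hypothetical):

```c
#include <stdint.h>
#include <stddef.h>

struct pkt {
	uint64_t time_to_send;
	struct pkt *next;
};

struct flow {
	struct pkt *head, *tail;	/* in-order fast path */
	struct pkt *ooo;		/* out-of-order packets, kept sorted
					 * (the kernel uses an rbtree here) */
};

static void flow_enqueue(struct flow *f, struct pkt *p)
{
	/* Fast path: timestamps are non-decreasing, append in O(1). */
	if (!f->head || p->time_to_send >= f->tail->time_to_send) {
		if (!f->head)
			f->head = p;
		else
			f->tail->next = p;
		f->tail = p;
		p->next = NULL;
		return;
	}
	/* Slow path: sorted insert (O(log n) with a real rbtree). */
	struct pkt **pp = &f->ooo;

	while (*pp && (*pp)->time_to_send <= p->time_to_send)
		pp = &(*pp)->next;
	p->next = *pp;
	*pp = p;
}
```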
| 435 | + |
| 436 | +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, |
| 437 | + const struct fq_sched_data *q) |
| 438 | +{ |
| 439 | + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); |
386 | 440 | } |
387 | 441 | |
388 | 442 | static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
.. | .. |
394 | 448 | if (unlikely(sch->q.qlen >= sch->limit)) |
395 | 449 | return qdisc_drop(skb, sch, to_free); |
396 | 450 | |
| 451 | + if (!skb->tstamp) { |
| 452 | + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); |
| 453 | + } else { |
| 454 | + /* Check if packet timestamp is too far in the future. |
| 455 | + * Try our cached value first, to avoid ktime_get_ns() |
| 456 | + * cost in most cases. |
| 457 | + */ |
| 458 | + if (fq_packet_beyond_horizon(skb, q)) { |
| 459 | + /* Refresh our cache and check another time */ |
| 460 | + q->ktime_cache = ktime_get_ns(); |
| 461 | + if (fq_packet_beyond_horizon(skb, q)) { |
| 462 | + if (q->horizon_drop) { |
| 463 | + q->stat_horizon_drops++; |
| 464 | + return qdisc_drop(skb, sch, to_free); |
| 465 | + } |
| 466 | + q->stat_horizon_caps++; |
| 467 | + skb->tstamp = q->ktime_cache + q->horizon; |
| 468 | + } |
| 469 | + } |
| 470 | + fq_skb_cb(skb)->time_to_send = skb->tstamp; |
| 471 | + } |
| 472 | + |
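The enqueue path now validates EDT timestamps against a horizon: a packet stamped more than q->horizon in the future is either dropped or capped to now + horizon, depending on q->horizon_drop, and the clock is sampled at most once per enqueue thanks to ktime_cache. The decision logic as a standalone sketch, mirroring the kernel's signed comparison (names hypothetical):

```c
#include <stdint.h>
#include <stdbool.h>

struct horizon_cfg {
	uint64_t horizon;	/* ns */
	bool horizon_drop;	/* true: drop offenders; false: cap them */
};

/* Returns true if the packet must be dropped. Otherwise *tstamp is
 * left alone, or capped to now + horizon if it was beyond it.
 */
static bool check_horizon(const struct horizon_cfg *cfg, uint64_t now,
			  uint64_t *tstamp)
{
	if ((int64_t)*tstamp <= (int64_t)(now + cfg->horizon))
		return false;			/* within horizon */
	if (cfg->horizon_drop)
		return true;			/* beyond horizon: drop */
	*tstamp = now + cfg->horizon;		/* beyond horizon: cap */
	return false;
}
```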
397 | 473 | f = fq_classify(skb, q); |
398 | 474 | if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { |
399 | 475 | q->stat_flows_plimit++; |
.. | .. |
401 | 477 | } |
402 | 478 | |
403 | 479 | f->qlen++; |
404 | | - if (skb_is_retransmit(skb)) |
405 | | - q->stat_tcp_retrans++; |
406 | 480 | qdisc_qstats_backlog_inc(sch, skb); |
407 | 481 | if (fq_flow_is_detached(f)) { |
408 | | - struct sock *sk = skb->sk; |
409 | | - |
410 | 482 | fq_flow_add_tail(&q->new_flows, f); |
411 | 483 | if (time_after(jiffies, f->age + q->flow_refill_delay)) |
412 | 484 | f->credit = max_t(u32, f->credit, q->quantum); |
413 | | - if (sk && q->rate_enable) { |
414 | | - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != |
415 | | - SK_PACING_FQ)) |
416 | | - smp_store_release(&sk->sk_pacing_status, |
417 | | - SK_PACING_FQ); |
418 | | - } |
419 | 485 | q->inactive_flows--; |
420 | 486 | } |
421 | 487 | |
.. | .. |
460 | 526 | static struct sk_buff *fq_dequeue(struct Qdisc *sch) |
461 | 527 | { |
462 | 528 | struct fq_sched_data *q = qdisc_priv(sch); |
463 | | - u64 now = ktime_get_ns(); |
464 | 529 | struct fq_flow_head *head; |
465 | 530 | struct sk_buff *skb; |
466 | 531 | struct fq_flow *f; |
467 | | - u32 rate, plen; |
| 532 | + unsigned long rate; |
| 533 | + u32 plen; |
| 534 | + u64 now; |
468 | 535 | |
469 | | - skb = fq_dequeue_head(sch, &q->internal); |
470 | | - if (skb) |
| 536 | + if (!sch->q.qlen) |
| 537 | + return NULL; |
| 538 | + |
| 539 | + skb = fq_peek(&q->internal); |
| 540 | + if (unlikely(skb)) { |
| 541 | + fq_dequeue_skb(sch, &q->internal, skb); |
471 | 542 | goto out; |
| 543 | + } |
| 544 | + |
| 545 | + q->ktime_cache = now = ktime_get_ns(); |
472 | 546 | fq_check_throttled(q, now); |
473 | 547 | begin: |
474 | 548 | head = &q->new_flows; |
.. | .. |
476 | 550 | head = &q->old_flows; |
477 | 551 | if (!head->first) { |
478 | 552 | if (q->time_next_delayed_flow != ~0ULL) |
479 | | - qdisc_watchdog_schedule_ns(&q->watchdog, |
480 | | - q->time_next_delayed_flow); |
| 553 | + qdisc_watchdog_schedule_range_ns(&q->watchdog, |
| 554 | + q->time_next_delayed_flow, |
| 555 | + q->timer_slack); |
481 | 556 | return NULL; |
482 | 557 | } |
483 | 558 | } |
.. | .. |
490 | 565 | goto begin; |
491 | 566 | } |
492 | 567 | |
493 | | - skb = f->head; |
494 | | - if (unlikely(skb && now < f->time_next_packet && |
495 | | - !skb_is_tcp_pure_ack(skb))) { |
496 | | - head->first = f->next; |
497 | | - fq_flow_set_throttled(q, f); |
498 | | - goto begin; |
499 | | - } |
| 568 | + skb = fq_peek(f); |
| 569 | + if (skb) { |
| 570 | + u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, |
| 571 | + f->time_next_packet); |
500 | 572 | |
501 | | - skb = fq_dequeue_head(sch, f); |
502 | | - if (!skb) { |
| 573 | + if (now < time_next_packet) { |
| 574 | + head->first = f->next; |
| 575 | + f->time_next_packet = time_next_packet; |
| 576 | + fq_flow_set_throttled(q, f); |
| 577 | + goto begin; |
| 578 | + } |
| 579 | + prefetch(&skb->end); |
| 580 | + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { |
| 581 | + INET_ECN_set_ce(skb); |
| 582 | + q->stat_ce_mark++; |
| 583 | + } |
| 584 | + fq_dequeue_skb(sch, f, skb); |
| 585 | + } else { |
503 | 586 | head->first = f->next; |
504 | 587 | /* force a pass through old_flows to prevent starvation */ |
505 | 588 | if ((head == &q->new_flows) && q->old_flows.first) { |
.. | .. |
510 | 593 | } |
511 | 594 | goto begin; |
512 | 595 | } |
513 | | - prefetch(&skb->end); |
514 | | - f->credit -= qdisc_pkt_len(skb); |
| 596 | + plen = qdisc_pkt_len(skb); |
| 597 | + f->credit -= plen; |
515 | 598 | |
516 | 599 | if (!q->rate_enable) |
517 | 600 | goto out; |
518 | 601 | |
519 | | - /* Do not pace locally generated ack packets */ |
520 | | - if (skb_is_tcp_pure_ack(skb)) |
521 | | - goto out; |
522 | | - |
523 | 602 | rate = q->flow_max_rate; |
524 | | - if (skb->sk) |
525 | | - rate = min(skb->sk->sk_pacing_rate, rate); |
526 | 603 | |
527 | | - if (rate <= q->low_rate_threshold) { |
528 | | - f->credit = 0; |
529 | | - plen = qdisc_pkt_len(skb); |
530 | | - } else { |
531 | | - plen = max(qdisc_pkt_len(skb), q->quantum); |
532 | | - if (f->credit > 0) |
533 | | - goto out; |
| 604 | + /* If EDT time was provided for this skb, we need to |
| 605 | + * update f->time_next_packet only if this qdisc enforces |
| 606 | + * a flow max rate. |
| 607 | + */ |
| 608 | + if (!skb->tstamp) { |
| 609 | + if (skb->sk) |
| 610 | + rate = min(skb->sk->sk_pacing_rate, rate); |
| 611 | + |
| 612 | + if (rate <= q->low_rate_threshold) { |
| 613 | + f->credit = 0; |
| 614 | + } else { |
| 615 | + plen = max(plen, q->quantum); |
| 616 | + if (f->credit > 0) |
| 617 | + goto out; |
| 618 | + } |
534 | 619 | } |
535 | | - if (rate != ~0U) { |
| 620 | + if (rate != ~0UL) { |
536 | 621 | u64 len = (u64)plen * NSEC_PER_SEC; |
537 | 622 | |
538 | 623 | if (likely(rate)) |
539 | | - do_div(len, rate); |
| 624 | + len = div64_ul(len, rate); |
540 | 625 | /* Since socket rate can change later, |
541 | 626 | * clamp the delay to 1 second. |
542 | 627 | * Really, providers of too big packets should be fixed ! |
.. | .. |
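When a flow's credit is exhausted and a rate applies, the dequeue path charges plen bytes at rate bytes per second: the next transmit time moves forward by plen * NSEC_PER_SEC / rate nanoseconds, clamped to one second as the comment above notes. The arithmetic in isolation (a sketch, not the kernel function):

```c
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Delay to schedule after sending plen bytes at rate bytes/sec. */
static uint64_t pacing_delay_ns(uint32_t plen, unsigned long rate)
{
	uint64_t len = (uint64_t)plen * NSEC_PER_SEC;

	if (rate)
		len /= rate;		/* kernel: div64_ul() */
	if (len > NSEC_PER_SEC)		/* clamp: see comment above */
		len = NSEC_PER_SEC;
	return len;
}
```

For example, a 1500 byte packet at 125000 bytes/sec (1 Mbit/s) yields a 12 ms gap before the flow's next packet.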
560 | 645 | |
561 | 646 | static void fq_flow_purge(struct fq_flow *flow) |
562 | 647 | { |
| 648 | + struct rb_node *p = rb_first(&flow->t_root); |
| 649 | + |
| 650 | + while (p) { |
| 651 | + struct sk_buff *skb = rb_to_skb(p); |
| 652 | + |
| 653 | + p = rb_next(p); |
| 654 | + rb_erase(&skb->rbnode, &flow->t_root); |
| 655 | + rtnl_kfree_skbs(skb, skb); |
| 656 | + } |
563 | 657 | rtnl_kfree_skbs(flow->head, flow->tail); |
564 | 658 | flow->head = NULL; |
565 | 659 | flow->qlen = 0; |
.. | .. |
685 | 779 | return 0; |
686 | 780 | } |
687 | 781 | |
| 782 | +static struct netlink_range_validation iq_range = { |
| 783 | + .max = INT_MAX, |
| 784 | +}; |
| 785 | + |
688 | 786 | static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { |
| 787 | + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, |
| 788 | + |
689 | 789 | [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, |
690 | 790 | [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, |
691 | 791 | [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, |
692 | | - [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, |
| 792 | + [TCA_FQ_INITIAL_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range), |
693 | 793 | [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, |
694 | 794 | [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, |
695 | 795 | [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, |
.. | .. |
697 | 797 | [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, |
698 | 798 | [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, |
699 | 799 | [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, |
| 800 | + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, |
| 801 | + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, |
| 802 | + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, |
| 803 | + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, |
700 | 804 | }; |
701 | 805 | |
702 | 806 | static int fq_change(struct Qdisc *sch, struct nlattr *opt, |
.. | .. |
711 | 815 | if (!opt) |
712 | 816 | return -EINVAL; |
713 | 817 | |
714 | | - err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL); |
| 818 | + err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, |
| 819 | + NULL); |
715 | 820 | if (err < 0) |
716 | 821 | return err; |
717 | 822 | |
.. | .. |
751 | 856 | pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", |
752 | 857 | nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); |
753 | 858 | |
754 | | - if (tb[TCA_FQ_FLOW_MAX_RATE]) |
755 | | - q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
| 859 | + if (tb[TCA_FQ_FLOW_MAX_RATE]) { |
| 860 | + u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
756 | 861 | |
| 862 | + q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; |
| 863 | + } |
757 | 864 | if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) |
758 | 865 | q->low_rate_threshold = |
759 | 866 | nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); |
.. | .. |
776 | 883 | if (tb[TCA_FQ_ORPHAN_MASK]) |
777 | 884 | q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); |
778 | 885 | |
| 886 | + if (tb[TCA_FQ_CE_THRESHOLD]) |
| 887 | + q->ce_threshold = (u64)NSEC_PER_USEC * |
| 888 | + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); |
| 889 | + |
| 890 | + if (tb[TCA_FQ_TIMER_SLACK]) |
| 891 | + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); |
| 892 | + |
| 893 | + if (tb[TCA_FQ_HORIZON]) |
| 894 | + q->horizon = (u64)NSEC_PER_USEC * |
| 895 | + nla_get_u32(tb[TCA_FQ_HORIZON]); |
| 896 | + |
| 897 | + if (tb[TCA_FQ_HORIZON_DROP]) |
| 898 | + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); |
| 899 | + |
779 | 900 | if (!err) { |
| 901 | + |
780 | 902 | sch_tree_unlock(sch); |
781 | 903 | err = fq_resize(sch, fq_log); |
782 | 904 | sch_tree_lock(sch); |
.. | .. |
816 | 938 | q->quantum = 2 * psched_mtu(qdisc_dev(sch)); |
817 | 939 | q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); |
818 | 940 | q->flow_refill_delay = msecs_to_jiffies(40); |
819 | | - q->flow_max_rate = ~0U; |
| 941 | + q->flow_max_rate = ~0UL; |
820 | 942 | q->time_next_delayed_flow = ~0ULL; |
821 | 943 | q->rate_enable = 1; |
822 | 944 | q->new_flows.first = NULL; |
.. | .. |
826 | 948 | q->fq_trees_log = ilog2(1024); |
827 | 949 | q->orphan_mask = 1024 - 1; |
828 | 950 | q->low_rate_threshold = 550000 / 8; |
829 | | - qdisc_watchdog_init(&q->watchdog, sch); |
| 951 | + |
| 952 | + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ |
| 953 | + |
| 954 | + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ |
| 955 | + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ |
| 956 | + |
| 957 | + /* Default ce_threshold of 4294 seconds */ |
| 958 | + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; |
| 959 | + |
| 960 | + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); |
830 | 961 | |
831 | 962 | if (opt) |
832 | 963 | err = fq_change(sch, opt, extack); |
.. | .. |
839 | 970 | static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) |
840 | 971 | { |
841 | 972 | struct fq_sched_data *q = qdisc_priv(sch); |
| 973 | + u64 ce_threshold = q->ce_threshold; |
| 974 | + u64 horizon = q->horizon; |
842 | 975 | struct nlattr *opts; |
843 | 976 | |
844 | | - opts = nla_nest_start(skb, TCA_OPTIONS); |
| 977 | + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); |
845 | 978 | if (opts == NULL) |
846 | 979 | goto nla_put_failure; |
847 | 980 | |
848 | 981 | /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ |
| 982 | + |
| 983 | + do_div(ce_threshold, NSEC_PER_USEC); |
| 984 | + do_div(horizon, NSEC_PER_USEC); |
849 | 985 | |
850 | 986 | if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || |
851 | 987 | nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || |
852 | 988 | nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || |
853 | 989 | nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || |
854 | 990 | nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || |
855 | | - nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || |
| 991 | + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, |
| 992 | + min_t(unsigned long, q->flow_max_rate, ~0U)) || |
856 | 993 | nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, |
857 | 994 | jiffies_to_usecs(q->flow_refill_delay)) || |
858 | 995 | nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || |
859 | 996 | nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, |
860 | 997 | q->low_rate_threshold) || |
861 | | - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) |
| 998 | + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || |
| 999 | + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || |
| 1000 | + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || |
| 1001 | + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || |
| 1002 | + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) |
862 | 1003 | goto nla_put_failure; |
863 | 1004 | |
864 | 1005 | return nla_nest_end(skb, opts); |
.. | .. |
876 | 1017 | |
877 | 1018 | st.gc_flows = q->stat_gc_flows; |
878 | 1019 | st.highprio_packets = q->stat_internal_packets; |
879 | | - st.tcp_retrans = q->stat_tcp_retrans; |
| 1020 | + st.tcp_retrans = 0; |
880 | 1021 | st.throttled = q->stat_throttled; |
881 | 1022 | st.flows_plimit = q->stat_flows_plimit; |
882 | 1023 | st.pkts_too_long = q->stat_pkts_too_long; |
883 | 1024 | st.allocation_errors = q->stat_allocation_errors; |
884 | | - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); |
| 1025 | + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - |
| 1026 | + ktime_get_ns(); |
885 | 1027 | st.flows = q->flows; |
886 | 1028 | st.inactive_flows = q->inactive_flows; |
887 | 1029 | st.throttled_flows = q->throttled_flows; |
888 | 1030 | st.unthrottle_latency_ns = min_t(unsigned long, |
889 | 1031 | q->unthrottle_latency_ns, ~0U); |
| 1032 | + st.ce_mark = q->stat_ce_mark; |
| 1033 | + st.horizon_drops = q->stat_horizon_drops; |
| 1034 | + st.horizon_caps = q->stat_horizon_caps; |
890 | 1035 | sch_tree_unlock(sch); |
891 | 1036 | |
892 | 1037 | return gnet_stats_copy_app(d, &st, sizeof(st)); |
.. | .. |
934 | 1079 | module_exit(fq_module_exit) |
935 | 1080 | MODULE_AUTHOR("Eric Dumazet"); |
936 | 1081 | MODULE_LICENSE("GPL"); |
| 1082 | +MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); |
---|