| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) |
|---|
| 3 | 4 | * |
|---|
| 4 | 5 | * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> |
|---|
| 5 | | - * |
|---|
| 6 | | - * This program is free software; you can redistribute it and/or |
|---|
| 7 | | - * modify it under the terms of the GNU General Public License |
|---|
| 8 | | - * as published by the Free Software Foundation; either version |
|---|
| 9 | | - * 2 of the License, or (at your option) any later version. |
|---|
| 10 | 6 | * |
|---|
| 11 | 7 | * Meant to be mostly used for locally generated traffic : |
|---|
| 12 | 8 | * Fast classification depends on skb->sk being set before reaching us. |
|---|
| .. | .. |
|---|
| 54 | 50 | #include <net/tcp_states.h> |
|---|
| 55 | 51 | #include <net/tcp.h> |
|---|
| 56 | 52 | |
|---|
| 53 | +struct fq_skb_cb { |
|---|
| 54 | + u64 time_to_send; |
|---|
| 55 | +}; |
|---|
| 56 | + |
|---|
| 57 | +static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) |
|---|
| 58 | +{ |
|---|
| 59 | + qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); |
|---|
| 60 | + return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; |
|---|
| 61 | +} |
|---|
| 62 | + |
|---|
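fq_skb_cb() above overlays a private per-packet structure on the generic qdisc control-block scratch area, after validating that it fits. A minimal standalone sketch of the same pattern, using made-up names (struct packet, my_cb) rather than kernel APIs:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CB_SIZE 48                         /* stands in for qdisc_skb_cb's data[] */

struct packet {
	unsigned char cb[CB_SIZE];         /* scratch area owned by the current layer */
};

struct my_cb {                             /* per-packet private state */
	uint64_t time_to_send;
};

static struct my_cb *my_cb(struct packet *p)
{
	/* mirrors the size check done by qdisc_cb_private_validate() */
	assert(sizeof(struct my_cb) <= sizeof(p->cb));
	return (struct my_cb *)p->cb;
}

int main(void)
{
	struct packet p;

	memset(&p, 0, sizeof(p));
	my_cb(&p)->time_to_send = 123456789ULL;
	printf("time_to_send = %llu ns\n",
	       (unsigned long long)my_cb(&p)->time_to_send);
	return 0;
}
```

The point of the pattern is that no extra allocation is needed: the scratch area already travels with the packet, and the size check catches layouts that would overflow it.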
| 57 | 63 | /* |
|---|
| 58 | | - * Per flow structure, dynamically allocated |
|---|
| 64 | + * Per flow structure, dynamically allocated. |
|---|
| 65 | + * If packets have monotonically increasing time_to_send, they are placed in O(1) |
|---|
| 66 | + * in a linear list (head, tail); otherwise they are placed in a rbtree (t_root). |
|---|
| 59 | 67 | */ |
|---|
| 60 | 68 | struct fq_flow { |
|---|
| 69 | +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ |
|---|
| 70 | + struct rb_root t_root; |
|---|
| 61 | 71 | struct sk_buff *head; /* list of skbs for this flow : first skb */ |
|---|
| 62 | 72 | union { |
|---|
| 63 | 73 | struct sk_buff *tail; /* last skb in the list */ |
|---|
| 64 | | - unsigned long age; /* jiffies when flow was emptied, for gc */ |
|---|
| 74 | + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ |
|---|
| 65 | 75 | }; |
|---|
| 66 | 76 | struct rb_node fq_node; /* anchor in fq_root[] trees */ |
|---|
| 67 | 77 | struct sock *sk; |
|---|
| 68 | | - int qlen; /* number of packets in flow queue */ |
|---|
| 69 | | - int credit; |
|---|
| 70 | 78 | u32 socket_hash; /* sk_hash */ |
|---|
| 71 | | - struct fq_flow *next; /* next pointer in RR lists, or &detached */ |
|---|
| 79 | + int qlen; /* number of packets in flow queue */ |
|---|
| 80 | + |
|---|
| 81 | +/* Second cache line, used in fq_dequeue() */ |
|---|
| 82 | + int credit; |
|---|
| 83 | + /* 32bit hole on 64bit arches */ |
|---|
| 84 | + |
|---|
| 85 | + struct fq_flow *next; /* next pointer in RR lists */ |
|---|
| 72 | 86 | |
|---|
| 73 | 87 | struct rb_node rate_node; /* anchor in q->delayed tree */ |
|---|
| 74 | 88 | u64 time_next_packet; |
|---|
| 75 | | -}; |
|---|
| 89 | +} ____cacheline_aligned_in_smp; |
|---|
| 76 | 90 | |
|---|
| 77 | 91 | struct fq_flow_head { |
|---|
| 78 | 92 | struct fq_flow *first; |
|---|
| .. | .. |
|---|
| 86 | 100 | |
|---|
| 87 | 101 | struct rb_root delayed; /* for rate limited flows */ |
|---|
| 88 | 102 | u64 time_next_delayed_flow; |
|---|
| 103 | + u64 ktime_cache; /* copy of last ktime_get_ns() */ |
|---|
| 89 | 104 | unsigned long unthrottle_latency_ns; |
|---|
| 90 | 105 | |
|---|
| 91 | 106 | struct fq_flow internal; /* for non classified or high prio packets */ |
|---|
| 92 | 107 | u32 quantum; |
|---|
| 93 | 108 | u32 initial_quantum; |
|---|
| 94 | 109 | u32 flow_refill_delay; |
|---|
| 95 | | - u32 flow_max_rate; /* optional max rate per flow */ |
|---|
| 96 | 110 | u32 flow_plimit; /* max packets per flow */ |
|---|
| 111 | + unsigned long flow_max_rate; /* optional max rate per flow */ |
|---|
| 112 | + u64 ce_threshold; |
|---|
| 113 | + u64 horizon; /* horizon in ns */ |
|---|
| 97 | 114 | u32 orphan_mask; /* mask for orphaned skb */ |
|---|
| 98 | 115 | u32 low_rate_threshold; |
|---|
| 99 | 116 | struct rb_root *fq_root; |
|---|
| 100 | 117 | u8 rate_enable; |
|---|
| 101 | 118 | u8 fq_trees_log; |
|---|
| 102 | | - |
|---|
| 119 | + u8 horizon_drop; |
|---|
| 103 | 120 | u32 flows; |
|---|
| 104 | 121 | u32 inactive_flows; |
|---|
| 105 | 122 | u32 throttled_flows; |
|---|
| 106 | 123 | |
|---|
| 107 | 124 | u64 stat_gc_flows; |
|---|
| 108 | 125 | u64 stat_internal_packets; |
|---|
| 109 | | - u64 stat_tcp_retrans; |
|---|
| 110 | 126 | u64 stat_throttled; |
|---|
| 127 | + u64 stat_ce_mark; |
|---|
| 128 | + u64 stat_horizon_drops; |
|---|
| 129 | + u64 stat_horizon_caps; |
|---|
| 111 | 130 | u64 stat_flows_plimit; |
|---|
| 112 | 131 | u64 stat_pkts_too_long; |
|---|
| 113 | 132 | u64 stat_allocation_errors; |
|---|
| 133 | + |
|---|
| 134 | + u32 timer_slack; /* hrtimer slack in ns */ |
|---|
| 114 | 135 | struct qdisc_watchdog watchdog; |
|---|
| 115 | 136 | }; |
|---|
| 116 | 137 | |
|---|
| 117 | | -/* special value to mark a detached flow (not on old/new list) */ |
|---|
| 118 | | -static struct fq_flow detached, throttled; |
|---|
| 119 | | - |
|---|
| 138 | +/* |
|---|
| 139 | + * f->tail and f->age share the same location. |
|---|
| 140 | + * We can use the low order bit to differentiate whether this location points |
|---|
| 141 | + * to a sk_buff or contains a jiffies value, provided we force this value to be odd. |
|---|
| 142 | + * This assumes the low order bit of f->tail is 0, since alignof(struct sk_buff) >= 2. |
|---|
| 143 | + */ |
|---|
| 120 | 144 | static void fq_flow_set_detached(struct fq_flow *f) |
|---|
| 121 | 145 | { |
|---|
| 122 | | - f->next = &detached; |
|---|
| 123 | | - f->age = jiffies; |
|---|
| 146 | + f->age = jiffies | 1UL; |
|---|
| 124 | 147 | } |
|---|
| 125 | 148 | |
|---|
| 126 | 149 | static bool fq_flow_is_detached(const struct fq_flow *f) |
|---|
| 127 | 150 | { |
|---|
| 128 | | - return f->next == &detached; |
|---|
| 151 | + return !!(f->age & 1UL); |
|---|
| 129 | 152 | } |
|---|
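fq_flow_set_detached()/fq_flow_is_detached() above exploit pointer alignment: an skb pointer always has its low bit clear, so forcing bit 0 to 1 in the shared tail/age slot marks it as holding a jiffies stamp instead. A standalone sketch of that tagging trick, with illustrative (non-kernel) names:

```c
#include <stdbool.h>
#include <stdio.h>

struct item { long payload; };             /* any object with alignment >= 2 */

struct slot {
	union {
		struct item   *tail;       /* valid when bit 0 is clear */
		unsigned long  age;        /* valid when bit 0 is set   */
	};
};

static void slot_set_detached(struct slot *s, unsigned long now)
{
	s->age = now | 1UL;                /* force the low bit: "this is a stamp" */
}

static bool slot_is_detached(const struct slot *s)
{
	return s->age & 1UL;               /* aligned pointers always have bit 0 == 0 */
}

int main(void)
{
	struct item it = { .payload = 42 };
	struct slot s = { .tail = &it };

	printf("detached? %d\n", slot_is_detached(&s));   /* 0: holds a pointer */
	slot_set_detached(&s, 1000);
	printf("detached? %d\n", slot_is_detached(&s));   /* 1: holds a stamp   */
	return 0;
}
```

This is also why the patch can drop the old sentinel `detached` flow entirely: the flag now lives in the union itself, freeing the `next` pointer.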
| 153 | + |
|---|
| 154 | +/* special value to mark a throttled flow (not on old/new list) */ |
|---|
| 155 | +static struct fq_flow throttled; |
|---|
| 130 | 156 | |
|---|
| 131 | 157 | static bool fq_flow_is_throttled(const struct fq_flow *f) |
|---|
| 132 | 158 | { |
|---|
| .. | .. |
|---|
| 192 | 218 | struct rb_root *root, |
|---|
| 193 | 219 | struct sock *sk) |
|---|
| 194 | 220 | { |
|---|
| 195 | | - struct fq_flow *f, *tofree[FQ_GC_MAX]; |
|---|
| 196 | 221 | struct rb_node **p, *parent; |
|---|
| 197 | | - int fcnt = 0; |
|---|
| 222 | + void *tofree[FQ_GC_MAX]; |
|---|
| 223 | + struct fq_flow *f; |
|---|
| 224 | + int i, fcnt = 0; |
|---|
| 198 | 225 | |
|---|
| 199 | 226 | p = &root->rb_node; |
|---|
| 200 | 227 | parent = NULL; |
|---|
| .. | .. |
|---|
| 217 | 244 | p = &parent->rb_left; |
|---|
| 218 | 245 | } |
|---|
| 219 | 246 | |
|---|
| 247 | + if (!fcnt) |
|---|
| 248 | + return; |
|---|
| 249 | + |
|---|
| 250 | + for (i = fcnt; i > 0; ) { |
|---|
| 251 | + f = tofree[--i]; |
|---|
| 252 | + rb_erase(&f->fq_node, root); |
|---|
| 253 | + } |
|---|
| 220 | 254 | q->flows -= fcnt; |
|---|
| 221 | 255 | q->inactive_flows -= fcnt; |
|---|
| 222 | 256 | q->stat_gc_flows += fcnt; |
|---|
| 223 | | - while (fcnt) { |
|---|
| 224 | | - struct fq_flow *f = tofree[--fcnt]; |
|---|
| 225 | 257 | |
|---|
| 226 | | - rb_erase(&f->fq_node, root); |
|---|
| 227 | | - kmem_cache_free(fq_flow_cachep, f); |
|---|
| 228 | | - } |
|---|
| 258 | + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); |
|---|
| 229 | 259 | } |
|---|
| 230 | 260 | |
|---|
| 231 | 261 | static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) |
|---|
| .. | .. |
|---|
| 256 | 286 | */ |
|---|
| 257 | 287 | sk = (struct sock *)((hash << 1) | 1UL); |
|---|
| 258 | 288 | skb_orphan(skb); |
|---|
| 289 | + } else if (sk->sk_state == TCP_CLOSE) { |
|---|
| 290 | + unsigned long hash = skb_get_hash(skb) & q->orphan_mask; |
|---|
| 291 | + /* |
|---|
| 292 | + * Sockets in TCP_CLOSE are non-connected. |
|---|
| 293 | + * A typical use case is UDP sockets: they can send packets |
|---|
| 294 | + * with sendto() to many different destinations. |
|---|
| 295 | + * We probably could use a generic bit advertising |
|---|
| 296 | + * non-connected sockets, instead of sk_state == TCP_CLOSE, |
|---|
| 297 | + * if we care enough. |
|---|
| 298 | + */ |
|---|
| 299 | + sk = (struct sock *)((hash << 1) | 1UL); |
|---|
| 259 | 300 | } |
|---|
| 260 | 301 | |
|---|
| 261 | 302 | root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; |
|---|
| .. | .. |
|---|
| 276 | 317 | * If not, we need to refill credit with |
|---|
| 277 | 318 | * initial quantum |
|---|
| 278 | 319 | */ |
|---|
| 279 | | - if (unlikely(skb->sk && |
|---|
| 320 | + if (unlikely(skb->sk == sk && |
|---|
| 280 | 321 | f->socket_hash != sk->sk_hash)) { |
|---|
| 281 | 322 | f->credit = q->initial_quantum; |
|---|
| 282 | 323 | f->socket_hash = sk->sk_hash; |
|---|
| 324 | + if (q->rate_enable) |
|---|
| 325 | + smp_store_release(&sk->sk_pacing_status, |
|---|
| 326 | + SK_PACING_FQ); |
|---|
| 283 | 327 | if (fq_flow_is_throttled(f)) |
|---|
| 284 | 328 | fq_flow_unset_throttled(q, f); |
|---|
| 285 | 329 | f->time_next_packet = 0ULL; |
|---|
| .. | .. |
|---|
| 297 | 341 | q->stat_allocation_errors++; |
|---|
| 298 | 342 | return &q->internal; |
|---|
| 299 | 343 | } |
|---|
| 344 | + /* f->t_root is already zeroed after kmem_cache_zalloc() */ |
|---|
| 345 | + |
|---|
| 300 | 346 | fq_flow_set_detached(f); |
|---|
| 301 | 347 | f->sk = sk; |
|---|
| 302 | | - if (skb->sk) |
|---|
| 348 | + if (skb->sk == sk) { |
|---|
| 303 | 349 | f->socket_hash = sk->sk_hash; |
|---|
| 350 | + if (q->rate_enable) |
|---|
| 351 | + smp_store_release(&sk->sk_pacing_status, |
|---|
| 352 | + SK_PACING_FQ); |
|---|
| 353 | + } |
|---|
| 304 | 354 | f->credit = q->initial_quantum; |
|---|
| 305 | 355 | |
|---|
| 306 | 356 | rb_link_node(&f->fq_node, parent, p); |
|---|
| .. | .. |
|---|
| 311 | 361 | return f; |
|---|
| 312 | 362 | } |
|---|
| 313 | 363 | |
|---|
| 314 | | - |
|---|
| 315 | | -/* remove one skb from head of flow queue */ |
|---|
| 316 | | -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) |
|---|
| 364 | +static struct sk_buff *fq_peek(struct fq_flow *flow) |
|---|
| 317 | 365 | { |
|---|
| 318 | | - struct sk_buff *skb = flow->head; |
|---|
| 366 | + struct sk_buff *skb = skb_rb_first(&flow->t_root); |
|---|
| 367 | + struct sk_buff *head = flow->head; |
|---|
| 319 | 368 | |
|---|
| 320 | | - if (skb) { |
|---|
| 369 | + if (!skb) |
|---|
| 370 | + return head; |
|---|
| 371 | + |
|---|
| 372 | + if (!head) |
|---|
| 373 | + return skb; |
|---|
| 374 | + |
|---|
| 375 | + if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) |
|---|
| 376 | + return skb; |
|---|
| 377 | + return head; |
|---|
| 378 | +} |
|---|
| 379 | + |
|---|
| 380 | +static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, |
|---|
| 381 | + struct sk_buff *skb) |
|---|
| 382 | +{ |
|---|
| 383 | + if (skb == flow->head) { |
|---|
| 321 | 384 | flow->head = skb->next; |
|---|
| 322 | | - skb->next = NULL; |
|---|
| 323 | | - flow->qlen--; |
|---|
| 324 | | - qdisc_qstats_backlog_dec(sch, skb); |
|---|
| 325 | | - sch->q.qlen--; |
|---|
| 385 | + } else { |
|---|
| 386 | + rb_erase(&skb->rbnode, &flow->t_root); |
|---|
| 387 | + skb->dev = qdisc_dev(sch); |
|---|
| 326 | 388 | } |
|---|
| 327 | | - return skb; |
|---|
| 328 | 389 | } |
|---|
| 329 | 390 | |
|---|
| 330 | | -/* We might add in the future detection of retransmits |
|---|
| 331 | | - * For the time being, just return false |
|---|
| 391 | +/* Remove one skb from flow queue. |
|---|
| 392 | + * This skb must be the return value of a prior fq_peek(). |
|---|
| 332 | 393 | */ |
|---|
| 333 | | -static bool skb_is_retransmit(struct sk_buff *skb) |
|---|
| 394 | +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, |
|---|
| 395 | + struct sk_buff *skb) |
|---|
| 334 | 396 | { |
|---|
| 335 | | - return false; |
|---|
| 397 | + fq_erase_head(sch, flow, skb); |
|---|
| 398 | + skb_mark_not_on_list(skb); |
|---|
| 399 | + flow->qlen--; |
|---|
| 400 | + qdisc_qstats_backlog_dec(sch, skb); |
|---|
| 401 | + sch->q.qlen--; |
|---|
| 336 | 402 | } |
|---|
| 337 | 403 | |
|---|
| 338 | | -/* add skb to flow queue |
|---|
| 339 | | - * flow queue is a linked list, kind of FIFO, except for TCP retransmits |
|---|
| 340 | | - * We special case tcp retransmits to be transmitted before other packets. |
|---|
| 341 | | - * We rely on fact that TCP retransmits are unlikely, so we do not waste |
|---|
| 342 | | - * a separate queue or a pointer. |
|---|
| 343 | | - * head-> [retrans pkt 1] |
|---|
| 344 | | - * [retrans pkt 2] |
|---|
| 345 | | - * [ normal pkt 1] |
|---|
| 346 | | - * [ normal pkt 2] |
|---|
| 347 | | - * [ normal pkt 3] |
|---|
| 348 | | - * tail-> [ normal pkt 4] |
|---|
| 349 | | - */ |
|---|
| 350 | 404 | static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) |
|---|
| 351 | 405 | { |
|---|
| 352 | | - struct sk_buff *prev, *head = flow->head; |
|---|
| 406 | + struct rb_node **p, *parent; |
|---|
| 407 | + struct sk_buff *head, *aux; |
|---|
| 353 | 408 | |
|---|
| 354 | | - skb->next = NULL; |
|---|
| 355 | | - if (!head) { |
|---|
| 356 | | - flow->head = skb; |
|---|
| 357 | | - flow->tail = skb; |
|---|
| 358 | | - return; |
|---|
| 359 | | - } |
|---|
| 360 | | - if (likely(!skb_is_retransmit(skb))) { |
|---|
| 361 | | - flow->tail->next = skb; |
|---|
| 362 | | - flow->tail = skb; |
|---|
| 363 | | - return; |
|---|
| 364 | | - } |
|---|
| 365 | | - |
|---|
| 366 | | - /* This skb is a tcp retransmit, |
|---|
| 367 | | - * find the last retrans packet in the queue |
|---|
| 368 | | - */ |
|---|
| 369 | | - prev = NULL; |
|---|
| 370 | | - while (skb_is_retransmit(head)) { |
|---|
| 371 | | - prev = head; |
|---|
| 372 | | - head = head->next; |
|---|
| 409 | + head = flow->head; |
|---|
| 410 | + if (!head || |
|---|
| 411 | + fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { |
|---|
| 373 | 412 | if (!head) |
|---|
| 374 | | - break; |
|---|
| 375 | | - } |
|---|
| 376 | | - if (!prev) { /* no rtx packet in queue, become the new head */ |
|---|
| 377 | | - skb->next = flow->head; |
|---|
| 378 | | - flow->head = skb; |
|---|
| 379 | | - } else { |
|---|
| 380 | | - if (prev == flow->tail) |
|---|
| 381 | | - flow->tail = skb; |
|---|
| 413 | + flow->head = skb; |
|---|
| 382 | 414 | else |
|---|
| 383 | | - skb->next = prev->next; |
|---|
| 384 | | - prev->next = skb; |
|---|
| 415 | + flow->tail->next = skb; |
|---|
| 416 | + flow->tail = skb; |
|---|
| 417 | + skb->next = NULL; |
|---|
| 418 | + return; |
|---|
| 385 | 419 | } |
|---|
| 420 | + |
|---|
| 421 | + p = &flow->t_root.rb_node; |
|---|
| 422 | + parent = NULL; |
|---|
| 423 | + |
|---|
| 424 | + while (*p) { |
|---|
| 425 | + parent = *p; |
|---|
| 426 | + aux = rb_to_skb(parent); |
|---|
| 427 | + if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) |
|---|
| 428 | + p = &parent->rb_right; |
|---|
| 429 | + else |
|---|
| 430 | + p = &parent->rb_left; |
|---|
| 431 | + } |
|---|
| 432 | + rb_link_node(&skb->rbnode, parent, p); |
|---|
| 433 | + rb_insert_color(&skb->rbnode, &flow->t_root); |
|---|
| 434 | +} |
|---|
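flow_queue_add() above keeps the common case cheap: a packet whose time_to_send is not earlier than the current tail's is appended to the linear list in O(1), and only out-of-order packets take the rbtree insertion path. A trivial standalone sketch of just that decision rule (illustrative names, no rbtree):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* a packet not earlier than the current tail keeps the O(1) FIFO path */
static bool takes_fast_path(uint64_t tail_tts, uint64_t new_tts)
{
	return new_tts >= tail_tts;
}

int main(void)
{
	uint64_t tail_tts = 2000;	/* time_to_send of the current list tail */

	printf("tts=2500 -> %s\n",
	       takes_fast_path(tail_tts, 2500) ? "list tail" : "rbtree");
	printf("tts=1500 -> %s\n",
	       takes_fast_path(tail_tts, 1500) ? "list tail" : "rbtree");
	return 0;
}
```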
| 435 | + |
|---|
| 436 | +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, |
|---|
| 437 | + const struct fq_sched_data *q) |
|---|
| 438 | +{ |
|---|
| 439 | + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); |
|---|
| 386 | 440 | } |
|---|
| 387 | 441 | |
|---|
| 388 | 442 | static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
|---|
| .. | .. |
|---|
| 394 | 448 | if (unlikely(sch->q.qlen >= sch->limit)) |
|---|
| 395 | 449 | return qdisc_drop(skb, sch, to_free); |
|---|
| 396 | 450 | |
|---|
| 451 | + if (!skb->tstamp) { |
|---|
| 452 | + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); |
|---|
| 453 | + } else { |
|---|
| 454 | + /* Check if packet timestamp is too far in the future. |
|---|
| 455 | + * Check first against our cached value, to avoid ktime_get_ns() |
|---|
| 456 | + * cost in most cases. |
|---|
| 457 | + */ |
|---|
| 458 | + if (fq_packet_beyond_horizon(skb, q)) { |
|---|
| 459 | + /* Refresh our cache and check another time */ |
|---|
| 460 | + q->ktime_cache = ktime_get_ns(); |
|---|
| 461 | + if (fq_packet_beyond_horizon(skb, q)) { |
|---|
| 462 | + if (q->horizon_drop) { |
|---|
| 463 | + q->stat_horizon_drops++; |
|---|
| 464 | + return qdisc_drop(skb, sch, to_free); |
|---|
| 465 | + } |
|---|
| 466 | + q->stat_horizon_caps++; |
|---|
| 467 | + skb->tstamp = q->ktime_cache + q->horizon; |
|---|
| 468 | + } |
|---|
| 469 | + } |
|---|
| 470 | + fq_skb_cb(skb)->time_to_send = skb->tstamp; |
|---|
| 471 | + } |
|---|
| 472 | + |
|---|
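The horizon handling just above either drops a packet whose EDT stamp is too far in the future or caps the stamp to now + horizon. A toy, non-kernel illustration of that decision, assuming the 10 second default horizon set by this patch:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t now = 0;                           /* pretend ktime_get_ns() == 0 */
	uint64_t horizon = 10 * NSEC_PER_SEC;       /* default horizon in this patch */
	uint64_t tstamp = 12 * NSEC_PER_SEC;        /* EDT stamp 12 s in the future */
	bool horizon_drop = false;                  /* false = cap, true = drop */

	if ((int64_t)tstamp > (int64_t)(now + horizon)) {   /* beyond horizon? */
		if (horizon_drop) {
			printf("packet dropped (stat_horizon_drops++)\n");
			return 0;
		}
		tstamp = now + horizon;             /* cap it (stat_horizon_caps++) */
	}
	printf("time_to_send = %.1f s\n", tstamp / (double)NSEC_PER_SEC);
	return 0;
}
```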
| 397 | 473 | f = fq_classify(skb, q); |
|---|
| 398 | 474 | if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { |
|---|
| 399 | 475 | q->stat_flows_plimit++; |
|---|
| .. | .. |
|---|
| 401 | 477 | } |
|---|
| 402 | 478 | |
|---|
| 403 | 479 | f->qlen++; |
|---|
| 404 | | - if (skb_is_retransmit(skb)) |
|---|
| 405 | | - q->stat_tcp_retrans++; |
|---|
| 406 | 480 | qdisc_qstats_backlog_inc(sch, skb); |
|---|
| 407 | 481 | if (fq_flow_is_detached(f)) { |
|---|
| 408 | | - struct sock *sk = skb->sk; |
|---|
| 409 | | - |
|---|
| 410 | 482 | fq_flow_add_tail(&q->new_flows, f); |
|---|
| 411 | 483 | if (time_after(jiffies, f->age + q->flow_refill_delay)) |
|---|
| 412 | 484 | f->credit = max_t(u32, f->credit, q->quantum); |
|---|
| 413 | | - if (sk && q->rate_enable) { |
|---|
| 414 | | - if (unlikely(smp_load_acquire(&sk->sk_pacing_status) != |
|---|
| 415 | | - SK_PACING_FQ)) |
|---|
| 416 | | - smp_store_release(&sk->sk_pacing_status, |
|---|
| 417 | | - SK_PACING_FQ); |
|---|
| 418 | | - } |
|---|
| 419 | 485 | q->inactive_flows--; |
|---|
| 420 | 486 | } |
|---|
| 421 | 487 | |
|---|
| .. | .. |
|---|
| 460 | 526 | static struct sk_buff *fq_dequeue(struct Qdisc *sch) |
|---|
| 461 | 527 | { |
|---|
| 462 | 528 | struct fq_sched_data *q = qdisc_priv(sch); |
|---|
| 463 | | - u64 now = ktime_get_ns(); |
|---|
| 464 | 529 | struct fq_flow_head *head; |
|---|
| 465 | 530 | struct sk_buff *skb; |
|---|
| 466 | 531 | struct fq_flow *f; |
|---|
| 467 | | - u32 rate, plen; |
|---|
| 532 | + unsigned long rate; |
|---|
| 533 | + u32 plen; |
|---|
| 534 | + u64 now; |
|---|
| 468 | 535 | |
|---|
| 469 | | - skb = fq_dequeue_head(sch, &q->internal); |
|---|
| 470 | | - if (skb) |
|---|
| 536 | + if (!sch->q.qlen) |
|---|
| 537 | + return NULL; |
|---|
| 538 | + |
|---|
| 539 | + skb = fq_peek(&q->internal); |
|---|
| 540 | + if (unlikely(skb)) { |
|---|
| 541 | + fq_dequeue_skb(sch, &q->internal, skb); |
|---|
| 471 | 542 | goto out; |
|---|
| 543 | + } |
|---|
| 544 | + |
|---|
| 545 | + q->ktime_cache = now = ktime_get_ns(); |
|---|
| 472 | 546 | fq_check_throttled(q, now); |
|---|
| 473 | 547 | begin: |
|---|
| 474 | 548 | head = &q->new_flows; |
|---|
| .. | .. |
|---|
| 476 | 550 | head = &q->old_flows; |
|---|
| 477 | 551 | if (!head->first) { |
|---|
| 478 | 552 | if (q->time_next_delayed_flow != ~0ULL) |
|---|
| 479 | | - qdisc_watchdog_schedule_ns(&q->watchdog, |
|---|
| 480 | | - q->time_next_delayed_flow); |
|---|
| 553 | + qdisc_watchdog_schedule_range_ns(&q->watchdog, |
|---|
| 554 | + q->time_next_delayed_flow, |
|---|
| 555 | + q->timer_slack); |
|---|
| 481 | 556 | return NULL; |
|---|
| 482 | 557 | } |
|---|
| 483 | 558 | } |
|---|
| .. | .. |
|---|
| 490 | 565 | goto begin; |
|---|
| 491 | 566 | } |
|---|
| 492 | 567 | |
|---|
| 493 | | - skb = f->head; |
|---|
| 494 | | - if (unlikely(skb && now < f->time_next_packet && |
|---|
| 495 | | - !skb_is_tcp_pure_ack(skb))) { |
|---|
| 496 | | - head->first = f->next; |
|---|
| 497 | | - fq_flow_set_throttled(q, f); |
|---|
| 498 | | - goto begin; |
|---|
| 499 | | - } |
|---|
| 568 | + skb = fq_peek(f); |
|---|
| 569 | + if (skb) { |
|---|
| 570 | + u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, |
|---|
| 571 | + f->time_next_packet); |
|---|
| 500 | 572 | |
|---|
| 501 | | - skb = fq_dequeue_head(sch, f); |
|---|
| 502 | | - if (!skb) { |
|---|
| 573 | + if (now < time_next_packet) { |
|---|
| 574 | + head->first = f->next; |
|---|
| 575 | + f->time_next_packet = time_next_packet; |
|---|
| 576 | + fq_flow_set_throttled(q, f); |
|---|
| 577 | + goto begin; |
|---|
| 578 | + } |
|---|
| 579 | + prefetch(&skb->end); |
|---|
| 580 | + if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { |
|---|
| 581 | + INET_ECN_set_ce(skb); |
|---|
| 582 | + q->stat_ce_mark++; |
|---|
| 583 | + } |
|---|
| 584 | + fq_dequeue_skb(sch, f, skb); |
|---|
| 585 | + } else { |
|---|
| 503 | 586 | head->first = f->next; |
|---|
| 504 | 587 | /* force a pass through old_flows to prevent starvation */ |
|---|
| 505 | 588 | if ((head == &q->new_flows) && q->old_flows.first) { |
|---|
| .. | .. |
|---|
| 510 | 593 | } |
|---|
| 511 | 594 | goto begin; |
|---|
| 512 | 595 | } |
|---|
| 513 | | - prefetch(&skb->end); |
|---|
| 514 | | - f->credit -= qdisc_pkt_len(skb); |
|---|
| 596 | + plen = qdisc_pkt_len(skb); |
|---|
| 597 | + f->credit -= plen; |
|---|
| 515 | 598 | |
|---|
| 516 | 599 | if (!q->rate_enable) |
|---|
| 517 | 600 | goto out; |
|---|
| 518 | 601 | |
|---|
| 519 | | - /* Do not pace locally generated ack packets */ |
|---|
| 520 | | - if (skb_is_tcp_pure_ack(skb)) |
|---|
| 521 | | - goto out; |
|---|
| 522 | | - |
|---|
| 523 | 602 | rate = q->flow_max_rate; |
|---|
| 524 | | - if (skb->sk) |
|---|
| 525 | | - rate = min(skb->sk->sk_pacing_rate, rate); |
|---|
| 526 | 603 | |
|---|
| 527 | | - if (rate <= q->low_rate_threshold) { |
|---|
| 528 | | - f->credit = 0; |
|---|
| 529 | | - plen = qdisc_pkt_len(skb); |
|---|
| 530 | | - } else { |
|---|
| 531 | | - plen = max(qdisc_pkt_len(skb), q->quantum); |
|---|
| 532 | | - if (f->credit > 0) |
|---|
| 533 | | - goto out; |
|---|
| 604 | + /* If EDT time was provided for this skb, we need to |
|---|
| 605 | + * update f->time_next_packet only if this qdisc enforces |
|---|
| 606 | + * a flow max rate. |
|---|
| 607 | + */ |
|---|
| 608 | + if (!skb->tstamp) { |
|---|
| 609 | + if (skb->sk) |
|---|
| 610 | + rate = min(skb->sk->sk_pacing_rate, rate); |
|---|
| 611 | + |
|---|
| 612 | + if (rate <= q->low_rate_threshold) { |
|---|
| 613 | + f->credit = 0; |
|---|
| 614 | + } else { |
|---|
| 615 | + plen = max(plen, q->quantum); |
|---|
| 616 | + if (f->credit > 0) |
|---|
| 617 | + goto out; |
|---|
| 618 | + } |
|---|
| 534 | 619 | } |
|---|
| 535 | | - if (rate != ~0U) { |
|---|
| 620 | + if (rate != ~0UL) { |
|---|
| 536 | 621 | u64 len = (u64)plen * NSEC_PER_SEC; |
|---|
| 537 | 622 | |
|---|
| 538 | 623 | if (likely(rate)) |
|---|
| 539 | | - do_div(len, rate); |
|---|
| 624 | + len = div64_ul(len, rate); |
|---|
| 540 | 625 | /* Since socket rate can change later, |
|---|
| 541 | 626 | * clamp the delay to 1 second. |
|---|
| 542 | 627 | * Really, providers of too big packets should be fixed ! |
|---|
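For a concrete feel of the delay computed above: a 1514 byte packet paced at 1 Mbit/s (125000 bytes/s) is spaced plen * NSEC_PER_SEC / rate, roughly 12.1 ms, after the previous one. A standalone sketch of that arithmetic (plain C, not kernel code):

```c
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t plen = 1514;                        /* packet length in bytes */
	unsigned long rate = 125000;                 /* pacing rate: 1 Mbit/s in bytes/s */
	uint64_t len = plen * NSEC_PER_SEC / rate;   /* inter-packet gap in ns */

	printf("gap = %llu ns (%.3f ms)\n",
	       (unsigned long long)len, len / 1e6);  /* ~12.112 ms */
	return 0;
}
```

This is also why the code clamps the result: at very low rates or with oversized packets the computed gap could otherwise exceed one second.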
| .. | .. |
|---|
| 560 | 645 | |
|---|
| 561 | 646 | static void fq_flow_purge(struct fq_flow *flow) |
|---|
| 562 | 647 | { |
|---|
| 648 | + struct rb_node *p = rb_first(&flow->t_root); |
|---|
| 649 | + |
|---|
| 650 | + while (p) { |
|---|
| 651 | + struct sk_buff *skb = rb_to_skb(p); |
|---|
| 652 | + |
|---|
| 653 | + p = rb_next(p); |
|---|
| 654 | + rb_erase(&skb->rbnode, &flow->t_root); |
|---|
| 655 | + rtnl_kfree_skbs(skb, skb); |
|---|
| 656 | + } |
|---|
| 563 | 657 | rtnl_kfree_skbs(flow->head, flow->tail); |
|---|
| 564 | 658 | flow->head = NULL; |
|---|
| 565 | 659 | flow->qlen = 0; |
|---|
| .. | .. |
|---|
| 686 | 780 | } |
|---|
| 687 | 781 | |
|---|
| 688 | 782 | static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { |
|---|
| 783 | + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, |
|---|
| 784 | + |
|---|
| 689 | 785 | [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, |
|---|
| 690 | 786 | [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, |
|---|
| 691 | 787 | [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, |
|---|
| .. | .. |
|---|
| 697 | 793 | [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, |
|---|
| 698 | 794 | [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, |
|---|
| 699 | 795 | [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, |
|---|
| 796 | + [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, |
|---|
| 797 | + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, |
|---|
| 798 | + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, |
|---|
| 799 | + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, |
|---|
| 700 | 800 | }; |
|---|
| 701 | 801 | |
|---|
| 702 | 802 | static int fq_change(struct Qdisc *sch, struct nlattr *opt, |
|---|
| .. | .. |
|---|
| 711 | 811 | if (!opt) |
|---|
| 712 | 812 | return -EINVAL; |
|---|
| 713 | 813 | |
|---|
| 714 | | - err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL); |
|---|
| 814 | + err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, |
|---|
| 815 | + NULL); |
|---|
| 715 | 816 | if (err < 0) |
|---|
| 716 | 817 | return err; |
|---|
| 717 | 818 | |
|---|
| .. | .. |
|---|
| 751 | 852 | pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", |
|---|
| 752 | 853 | nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); |
|---|
| 753 | 854 | |
|---|
| 754 | | - if (tb[TCA_FQ_FLOW_MAX_RATE]) |
|---|
| 755 | | - q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
|---|
| 855 | + if (tb[TCA_FQ_FLOW_MAX_RATE]) { |
|---|
| 856 | + u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); |
|---|
| 756 | 857 | |
|---|
| 858 | + q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; |
|---|
| 859 | + } |
|---|
| 757 | 860 | if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) |
|---|
| 758 | 861 | q->low_rate_threshold = |
|---|
| 759 | 862 | nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); |
|---|
| .. | .. |
|---|
| 776 | 879 | if (tb[TCA_FQ_ORPHAN_MASK]) |
|---|
| 777 | 880 | q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); |
|---|
| 778 | 881 | |
|---|
| 882 | + if (tb[TCA_FQ_CE_THRESHOLD]) |
|---|
| 883 | + q->ce_threshold = (u64)NSEC_PER_USEC * |
|---|
| 884 | + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); |
|---|
| 885 | + |
|---|
| 886 | + if (tb[TCA_FQ_TIMER_SLACK]) |
|---|
| 887 | + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); |
|---|
| 888 | + |
|---|
| 889 | + if (tb[TCA_FQ_HORIZON]) |
|---|
| 890 | + q->horizon = (u64)NSEC_PER_USEC * |
|---|
| 891 | + nla_get_u32(tb[TCA_FQ_HORIZON]); |
|---|
| 892 | + |
|---|
| 893 | + if (tb[TCA_FQ_HORIZON_DROP]) |
|---|
| 894 | + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); |
|---|
| 895 | + |
|---|
| 779 | 896 | if (!err) { |
|---|
| 897 | + |
|---|
| 780 | 898 | sch_tree_unlock(sch); |
|---|
| 781 | 899 | err = fq_resize(sch, fq_log); |
|---|
| 782 | 900 | sch_tree_lock(sch); |
|---|
| .. | .. |
|---|
| 816 | 934 | q->quantum = 2 * psched_mtu(qdisc_dev(sch)); |
|---|
| 817 | 935 | q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); |
|---|
| 818 | 936 | q->flow_refill_delay = msecs_to_jiffies(40); |
|---|
| 819 | | - q->flow_max_rate = ~0U; |
|---|
| 937 | + q->flow_max_rate = ~0UL; |
|---|
| 820 | 938 | q->time_next_delayed_flow = ~0ULL; |
|---|
| 821 | 939 | q->rate_enable = 1; |
|---|
| 822 | 940 | q->new_flows.first = NULL; |
|---|
| .. | .. |
|---|
| 826 | 944 | q->fq_trees_log = ilog2(1024); |
|---|
| 827 | 945 | q->orphan_mask = 1024 - 1; |
|---|
| 828 | 946 | q->low_rate_threshold = 550000 / 8; |
|---|
| 829 | | - qdisc_watchdog_init(&q->watchdog, sch); |
|---|
| 947 | + |
|---|
| 948 | + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ |
|---|
| 949 | + |
|---|
| 950 | + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ |
|---|
| 951 | + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ |
|---|
| 952 | + |
|---|
| 953 | + /* Default ce_threshold of 4294 seconds */ |
|---|
| 954 | + q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; |
|---|
| 955 | + |
|---|
| 956 | + qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC); |
|---|
| 830 | 957 | |
|---|
| 831 | 958 | if (opt) |
|---|
| 832 | 959 | err = fq_change(sch, opt, extack); |
|---|
| .. | .. |
|---|
| 839 | 966 | static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) |
|---|
| 840 | 967 | { |
|---|
| 841 | 968 | struct fq_sched_data *q = qdisc_priv(sch); |
|---|
| 969 | + u64 ce_threshold = q->ce_threshold; |
|---|
| 970 | + u64 horizon = q->horizon; |
|---|
| 842 | 971 | struct nlattr *opts; |
|---|
| 843 | 972 | |
|---|
| 844 | | - opts = nla_nest_start(skb, TCA_OPTIONS); |
|---|
| 973 | + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); |
|---|
| 845 | 974 | if (opts == NULL) |
|---|
| 846 | 975 | goto nla_put_failure; |
|---|
| 847 | 976 | |
|---|
| 848 | 977 | /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ |
|---|
| 978 | + |
|---|
| 979 | + do_div(ce_threshold, NSEC_PER_USEC); |
|---|
| 980 | + do_div(horizon, NSEC_PER_USEC); |
|---|
| 849 | 981 | |
|---|
| 850 | 982 | if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || |
|---|
| 851 | 983 | nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || |
|---|
| 852 | 984 | nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || |
|---|
| 853 | 985 | nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || |
|---|
| 854 | 986 | nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || |
|---|
| 855 | | - nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || |
|---|
| 987 | + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, |
|---|
| 988 | + min_t(unsigned long, q->flow_max_rate, ~0U)) || |
|---|
| 856 | 989 | nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, |
|---|
| 857 | 990 | jiffies_to_usecs(q->flow_refill_delay)) || |
|---|
| 858 | 991 | nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || |
|---|
| 859 | 992 | nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, |
|---|
| 860 | 993 | q->low_rate_threshold) || |
|---|
| 861 | | - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) |
|---|
| 994 | + nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || |
|---|
| 995 | + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || |
|---|
| 996 | + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || |
|---|
| 997 | + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || |
|---|
| 998 | + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) |
|---|
| 862 | 999 | goto nla_put_failure; |
|---|
| 863 | 1000 | |
|---|
| 864 | 1001 | return nla_nest_end(skb, opts); |
|---|
| .. | .. |
|---|
| 876 | 1013 | |
|---|
| 877 | 1014 | st.gc_flows = q->stat_gc_flows; |
|---|
| 878 | 1015 | st.highprio_packets = q->stat_internal_packets; |
|---|
| 879 | | - st.tcp_retrans = q->stat_tcp_retrans; |
|---|
| 1016 | + st.tcp_retrans = 0; |
|---|
| 880 | 1017 | st.throttled = q->stat_throttled; |
|---|
| 881 | 1018 | st.flows_plimit = q->stat_flows_plimit; |
|---|
| 882 | 1019 | st.pkts_too_long = q->stat_pkts_too_long; |
|---|
| 883 | 1020 | st.allocation_errors = q->stat_allocation_errors; |
|---|
| 884 | | - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); |
|---|
| 1021 | + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - |
|---|
| 1022 | + ktime_get_ns(); |
|---|
| 885 | 1023 | st.flows = q->flows; |
|---|
| 886 | 1024 | st.inactive_flows = q->inactive_flows; |
|---|
| 887 | 1025 | st.throttled_flows = q->throttled_flows; |
|---|
| 888 | 1026 | st.unthrottle_latency_ns = min_t(unsigned long, |
|---|
| 889 | 1027 | q->unthrottle_latency_ns, ~0U); |
|---|
| 1028 | + st.ce_mark = q->stat_ce_mark; |
|---|
| 1029 | + st.horizon_drops = q->stat_horizon_drops; |
|---|
| 1030 | + st.horizon_caps = q->stat_horizon_caps; |
|---|
| 890 | 1031 | sch_tree_unlock(sch); |
|---|
| 891 | 1032 | |
|---|
| 892 | 1033 | return gnet_stats_copy_app(d, &st, sizeof(st)); |
|---|
| .. | .. |
|---|
| 934 | 1075 | module_exit(fq_module_exit) |
|---|
| 935 | 1076 | MODULE_AUTHOR("Eric Dumazet"); |
|---|
| 936 | 1077 | MODULE_LICENSE("GPL"); |
|---|
| 1078 | +MODULE_DESCRIPTION("Fair Queue Packet Scheduler"); |
|---|