| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* DataCenter TCP (DCTCP) congestion control. |
|---|
| 2 | 3 | * |
|---|
| 3 | 4 | * http://simula.stanford.edu/~alizade/Site/DCTCP.html |
|---|
| .. | .. |
|---|
| 33 | 34 | * Daniel Borkmann <dborkman@redhat.com> |
|---|
| 34 | 35 | * Florian Westphal <fw@strlen.de> |
|---|
| 35 | 36 | * Glenn Judd <glenn.judd@morganstanley.com> |
|---|
| 36 | | - * |
|---|
| 37 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 38 | | - * it under the terms of the GNU General Public License as published by |
|---|
| 39 | | - * the Free Software Foundation; either version 2 of the License, or (at |
|---|
| 40 | | - * your option) any later version. |
|---|
| 41 | 37 | */ |
|---|
| 42 | 38 | |
|---|
| 43 | 39 | #include <linux/module.h> |
|---|
| 44 | 40 | #include <linux/mm.h> |
|---|
| 45 | 41 | #include <net/tcp.h> |
|---|
| 46 | 42 | #include <linux/inet_diag.h> |
|---|
| 43 | +#include "tcp_dctcp.h" |
|---|
| 47 | 44 | |
|---|
| 48 | 45 | #define DCTCP_MAX_ALPHA 1024U |
|---|
| 49 | 46 | |
|---|
| 50 | 47 | struct dctcp { |
|---|
| 51 | | - u32 acked_bytes_ecn; |
|---|
| 52 | | - u32 acked_bytes_total; |
|---|
| 53 | | - u32 prior_snd_una; |
|---|
| 48 | + u32 old_delivered; |
|---|
| 49 | + u32 old_delivered_ce; |
|---|
| 54 | 50 | u32 prior_rcv_nxt; |
|---|
| 55 | 51 | u32 dctcp_alpha; |
|---|
| 56 | 52 | u32 next_seq; |
|---|
| .. | .. |
|---|
| 72 | 68 | { |
|---|
| 73 | 69 | ca->next_seq = tp->snd_nxt; |
|---|
| 74 | 70 | |
|---|
| 75 | | - ca->acked_bytes_ecn = 0; |
|---|
| 76 | | - ca->acked_bytes_total = 0; |
|---|
| 71 | + ca->old_delivered = tp->delivered; |
|---|
| 72 | + ca->old_delivered_ce = tp->delivered_ce; |
|---|
| 77 | 73 | } |
|---|
| 78 | 74 | |
|---|
| 79 | 75 | static void dctcp_init(struct sock *sk) |
|---|
| .. | .. |
|---|
| 85 | 81 | sk->sk_state == TCP_CLOSE)) { |
|---|
| 86 | 82 | struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 87 | 83 | |
|---|
| 88 | | - ca->prior_snd_una = tp->snd_una; |
|---|
| 89 | 84 | ca->prior_rcv_nxt = tp->rcv_nxt; |
|---|
| 90 | 85 | |
|---|
| 91 | 86 | ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); |
|---|
| .. | .. |
|---|
| 113 | 108 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); |
|---|
| 114 | 109 | } |
|---|
| 115 | 110 | |
|---|
| 116 | | -/* Minimal DCTP CE state machine: |
|---|
| 117 | | - * |
|---|
| 118 | | - * S: 0 <- last pkt was non-CE |
|---|
| 119 | | - * 1 <- last pkt was CE |
|---|
| 120 | | - */ |
|---|
| 121 | | - |
|---|
| 122 | | -static void dctcp_ce_state_0_to_1(struct sock *sk) |
|---|
| 123 | | -{ |
|---|
| 124 | | - struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 125 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 126 | | - |
|---|
| 127 | | - if (!ca->ce_state) { |
|---|
| 128 | | - /* State has changed from CE=0 to CE=1, force an immediate |
|---|
| 129 | | - * ACK to reflect the new CE state. If an ACK was delayed, |
|---|
| 130 | | - * send that first to reflect the prior CE state. |
|---|
| 131 | | - */ |
|---|
| 132 | | - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) |
|---|
| 133 | | - __tcp_send_ack(sk, ca->prior_rcv_nxt); |
|---|
| 134 | | - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; |
|---|
| 135 | | - } |
|---|
| 136 | | - |
|---|
| 137 | | - ca->prior_rcv_nxt = tp->rcv_nxt; |
|---|
| 138 | | - ca->ce_state = 1; |
|---|
| 139 | | - |
|---|
| 140 | | - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
|---|
| 141 | | -} |
|---|
| 142 | | - |
|---|
| 143 | | -static void dctcp_ce_state_1_to_0(struct sock *sk) |
|---|
| 144 | | -{ |
|---|
| 145 | | - struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 146 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 147 | | - |
|---|
| 148 | | - if (ca->ce_state) { |
|---|
| 149 | | - /* State has changed from CE=1 to CE=0, force an immediate |
|---|
| 150 | | - * ACK to reflect the new CE state. If an ACK was delayed, |
|---|
| 151 | | - * send that first to reflect the prior CE state. |
|---|
| 152 | | - */ |
|---|
| 153 | | - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) |
|---|
| 154 | | - __tcp_send_ack(sk, ca->prior_rcv_nxt); |
|---|
| 155 | | - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; |
|---|
| 156 | | - } |
|---|
| 157 | | - |
|---|
| 158 | | - ca->prior_rcv_nxt = tp->rcv_nxt; |
|---|
| 159 | | - ca->ce_state = 0; |
|---|
| 160 | | - |
|---|
| 161 | | - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
|---|
| 162 | | -} |
|---|
| 163 | | - |
|---|
| 164 | 111 | static void dctcp_update_alpha(struct sock *sk, u32 flags) |
|---|
| 165 | 112 | { |
|---|
| 166 | 113 | const struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 167 | 114 | struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 168 | | - u32 acked_bytes = tp->snd_una - ca->prior_snd_una; |
|---|
| 169 | | - |
|---|
| 170 | | - /* If ack did not advance snd_una, count dupack as MSS size. |
|---|
| 171 | | - * If ack did update window, do not count it at all. |
|---|
| 172 | | - */ |
|---|
| 173 | | - if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) |
|---|
| 174 | | - acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; |
|---|
| 175 | | - if (acked_bytes) { |
|---|
| 176 | | - ca->acked_bytes_total += acked_bytes; |
|---|
| 177 | | - ca->prior_snd_una = tp->snd_una; |
|---|
| 178 | | - |
|---|
| 179 | | - if (flags & CA_ACK_ECE) |
|---|
| 180 | | - ca->acked_bytes_ecn += acked_bytes; |
|---|
| 181 | | - } |
|---|
| 182 | 115 | |
|---|
| 183 | 116 | /* Expired RTT */ |
|---|
| 184 | 117 | if (!before(tp->snd_una, ca->next_seq)) { |
|---|
| 185 | | - u64 bytes_ecn = ca->acked_bytes_ecn; |
|---|
| 118 | + u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; |
|---|
| 186 | 119 | u32 alpha = ca->dctcp_alpha; |
|---|
| 187 | 120 | |
|---|
| 188 | 121 | /* alpha = (1 - g) * alpha + g * F */ |
|---|
| 189 | 122 | |
|---|
| 190 | 123 | alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); |
|---|
| 191 | | - if (bytes_ecn) { |
|---|
| 192 | | - /* If dctcp_shift_g == 1, a 32bit value would overflow |
|---|
| 193 | | - * after 8 Mbytes. |
|---|
| 194 | | - */ |
|---|
| 195 | | - bytes_ecn <<= (10 - dctcp_shift_g); |
|---|
| 196 | | - do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); |
|---|
| 124 | + if (delivered_ce) { |
|---|
| 125 | + u32 delivered = tp->delivered - ca->old_delivered; |
|---|
| 197 | 126 | |
|---|
| 198 | | - alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); |
|---|
| 127 | + /* If dctcp_shift_g == 1, a 32bit value would overflow |
|---|
| 128 | + * after 8 M packets. |
|---|
| 129 | + */ |
|---|
| 130 | + delivered_ce <<= (10 - dctcp_shift_g); |
|---|
| 131 | + delivered_ce /= max(1U, delivered); |
|---|
| 132 | + |
|---|
| 133 | + alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); |
|---|
| 199 | 134 | } |
|---|
| 200 | 135 | /* dctcp_alpha can be read from dctcp_get_info() without |
|---|
| 201 | 136 | * synchro, so we ask compiler to not use dctcp_alpha |
|---|
| .. | .. |
|---|
| 227 | 162 | |
|---|
| 228 | 163 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) |
|---|
| 229 | 164 | { |
|---|
| 165 | + struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 166 | + |
|---|
| 230 | 167 | switch (ev) { |
|---|
| 231 | 168 | case CA_EVENT_ECN_IS_CE: |
|---|
| 232 | | - dctcp_ce_state_0_to_1(sk); |
|---|
| 233 | | - break; |
|---|
| 234 | 169 | case CA_EVENT_ECN_NO_CE: |
|---|
| 235 | | - dctcp_ce_state_1_to_0(sk); |
|---|
| 170 | + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); |
|---|
| 236 | 171 | break; |
|---|
| 237 | 172 | case CA_EVENT_LOSS: |
|---|
| 238 | 173 | dctcp_react_to_loss(sk); |
|---|
| .. | .. |
|---|
| 247 | 182 | union tcp_cc_info *info) |
|---|
| 248 | 183 | { |
|---|
| 249 | 184 | const struct dctcp *ca = inet_csk_ca(sk); |
|---|
| 185 | + const struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 250 | 186 | |
|---|
| 251 | 187 | /* Fill it also in case of VEGASINFO due to req struct limits. |
|---|
| 252 | 188 | * We can still correctly retrieve it later. |
|---|
| .. | .. |
|---|
| 258 | 194 | info->dctcp.dctcp_enabled = 1; |
|---|
| 259 | 195 | info->dctcp.dctcp_ce_state = (u16) ca->ce_state; |
|---|
| 260 | 196 | info->dctcp.dctcp_alpha = ca->dctcp_alpha; |
|---|
| 261 | | - info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; |
|---|
| 262 | | - info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; |
|---|
| 197 | + info->dctcp.dctcp_ab_ecn = tp->mss_cache * |
|---|
| 198 | + (tp->delivered_ce - ca->old_delivered_ce); |
|---|
| 199 | + info->dctcp.dctcp_ab_tot = tp->mss_cache * |
|---|
| 200 | + (tp->delivered - ca->old_delivered); |
|---|
| 263 | 201 | } |
|---|
| 264 | 202 | |
|---|
| 265 | 203 | *attr = INET_DIAG_DCTCPINFO; |
|---|