.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* DataCenter TCP (DCTCP) congestion control. |
---|
2 | 3 | * |
---|
3 | 4 | * http://simula.stanford.edu/~alizade/Site/DCTCP.html |
---|
.. | .. |
---|
33 | 34 | * Daniel Borkmann <dborkman@redhat.com> |
---|
34 | 35 | * Florian Westphal <fw@strlen.de> |
---|
35 | 36 | * Glenn Judd <glenn.judd@morganstanley.com> |
---|
36 | | - * |
---|
37 | | - * This program is free software; you can redistribute it and/or modify |
---|
38 | | - * it under the terms of the GNU General Public License as published by |
---|
39 | | - * the Free Software Foundation; either version 2 of the License, or (at |
---|
40 | | - * your option) any later version. |
---|
41 | 37 | */ |
---|
42 | 38 | |
---|
43 | 39 | #include <linux/module.h> |
---|
44 | 40 | #include <linux/mm.h> |
---|
45 | 41 | #include <net/tcp.h> |
---|
46 | 42 | #include <linux/inet_diag.h> |
---|
| 43 | +#include "tcp_dctcp.h" |
---|
47 | 44 | |
---|
48 | 45 | #define DCTCP_MAX_ALPHA 1024U |
---|
49 | 46 | |
---|
50 | 47 | struct dctcp { |
---|
51 | | - u32 acked_bytes_ecn; |
---|
52 | | - u32 acked_bytes_total; |
---|
53 | | - u32 prior_snd_una; |
---|
| 48 | + u32 old_delivered; |
---|
| 49 | + u32 old_delivered_ce; |
---|
54 | 50 | u32 prior_rcv_nxt; |
---|
55 | 51 | u32 dctcp_alpha; |
---|
56 | 52 | u32 next_seq; |
---|
.. | .. |
---|
72 | 68 | { |
---|
73 | 69 | ca->next_seq = tp->snd_nxt; |
---|
74 | 70 | |
---|
75 | | - ca->acked_bytes_ecn = 0; |
---|
76 | | - ca->acked_bytes_total = 0; |
---|
| 71 | + ca->old_delivered = tp->delivered; |
---|
| 72 | + ca->old_delivered_ce = tp->delivered_ce; |
---|
77 | 73 | } |
---|
78 | 74 | |
---|
79 | 75 | static void dctcp_init(struct sock *sk) |
---|
.. | .. |
---|
85 | 81 | sk->sk_state == TCP_CLOSE)) { |
---|
86 | 82 | struct dctcp *ca = inet_csk_ca(sk); |
---|
87 | 83 | |
---|
88 | | - ca->prior_snd_una = tp->snd_una; |
---|
89 | 84 | ca->prior_rcv_nxt = tp->rcv_nxt; |
---|
90 | 85 | |
---|
91 | 86 | ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); |
---|
.. | .. |
---|
113 | 108 | return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); |
---|
114 | 109 | } |
---|
115 | 110 | |
---|
116 | | -/* Minimal DCTP CE state machine: |
---|
117 | | - * |
---|
118 | | - * S: 0 <- last pkt was non-CE |
---|
119 | | - * 1 <- last pkt was CE |
---|
120 | | - */ |
---|
121 | | - |
---|
122 | | -static void dctcp_ce_state_0_to_1(struct sock *sk) |
---|
123 | | -{ |
---|
124 | | - struct dctcp *ca = inet_csk_ca(sk); |
---|
125 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
126 | | - |
---|
127 | | - if (!ca->ce_state) { |
---|
128 | | - /* State has changed from CE=0 to CE=1, force an immediate |
---|
129 | | - * ACK to reflect the new CE state. If an ACK was delayed, |
---|
130 | | - * send that first to reflect the prior CE state. |
---|
131 | | - */ |
---|
132 | | - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) |
---|
133 | | - __tcp_send_ack(sk, ca->prior_rcv_nxt); |
---|
134 | | - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; |
---|
135 | | - } |
---|
136 | | - |
---|
137 | | - ca->prior_rcv_nxt = tp->rcv_nxt; |
---|
138 | | - ca->ce_state = 1; |
---|
139 | | - |
---|
140 | | - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; |
---|
141 | | -} |
---|
142 | | - |
---|
143 | | -static void dctcp_ce_state_1_to_0(struct sock *sk) |
---|
144 | | -{ |
---|
145 | | - struct dctcp *ca = inet_csk_ca(sk); |
---|
146 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
147 | | - |
---|
148 | | - if (ca->ce_state) { |
---|
149 | | - /* State has changed from CE=1 to CE=0, force an immediate |
---|
150 | | - * ACK to reflect the new CE state. If an ACK was delayed, |
---|
151 | | - * send that first to reflect the prior CE state. |
---|
152 | | - */ |
---|
153 | | - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) |
---|
154 | | - __tcp_send_ack(sk, ca->prior_rcv_nxt); |
---|
155 | | - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; |
---|
156 | | - } |
---|
157 | | - |
---|
158 | | - ca->prior_rcv_nxt = tp->rcv_nxt; |
---|
159 | | - ca->ce_state = 0; |
---|
160 | | - |
---|
161 | | - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; |
---|
162 | | -} |
---|
163 | | - |
---|
164 | 111 | static void dctcp_update_alpha(struct sock *sk, u32 flags) |
---|
165 | 112 | { |
---|
166 | 113 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
167 | 114 | struct dctcp *ca = inet_csk_ca(sk); |
---|
168 | | - u32 acked_bytes = tp->snd_una - ca->prior_snd_una; |
---|
169 | | - |
---|
170 | | - /* If ack did not advance snd_una, count dupack as MSS size. |
---|
171 | | - * If ack did update window, do not count it at all. |
---|
172 | | - */ |
---|
173 | | - if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) |
---|
174 | | - acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; |
---|
175 | | - if (acked_bytes) { |
---|
176 | | - ca->acked_bytes_total += acked_bytes; |
---|
177 | | - ca->prior_snd_una = tp->snd_una; |
---|
178 | | - |
---|
179 | | - if (flags & CA_ACK_ECE) |
---|
180 | | - ca->acked_bytes_ecn += acked_bytes; |
---|
181 | | - } |
---|
182 | 115 | |
---|
183 | 116 | /* Expired RTT */ |
---|
184 | 117 | if (!before(tp->snd_una, ca->next_seq)) { |
---|
185 | | - u64 bytes_ecn = ca->acked_bytes_ecn; |
---|
| 118 | + u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; |
---|
186 | 119 | u32 alpha = ca->dctcp_alpha; |
---|
187 | 120 | |
---|
188 | 121 | /* alpha = (1 - g) * alpha + g * F */ |
---|
189 | 122 | |
---|
190 | 123 | alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); |
---|
191 | | - if (bytes_ecn) { |
---|
192 | | - /* If dctcp_shift_g == 1, a 32bit value would overflow |
---|
193 | | - * after 8 Mbytes. |
---|
194 | | - */ |
---|
195 | | - bytes_ecn <<= (10 - dctcp_shift_g); |
---|
196 | | - do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); |
---|
| 124 | + if (delivered_ce) { |
---|
| 125 | + u32 delivered = tp->delivered - ca->old_delivered; |
---|
197 | 126 | |
---|
198 | | - alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); |
---|
| 127 | + /* If dctcp_shift_g == 1, a 32bit value would overflow |
---|
| 128 | + * after 8 M packets. |
---|
| 129 | + */ |
---|
| 130 | + delivered_ce <<= (10 - dctcp_shift_g); |
---|
| 131 | + delivered_ce /= max(1U, delivered); |
---|
| 132 | + |
---|
| 133 | + alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); |
---|
199 | 134 | } |
---|
200 | 135 | /* dctcp_alpha can be read from dctcp_get_info() without |
---|
201 | 136 | * synchro, so we ask compiler to not use dctcp_alpha |
---|
.. | .. |
---|
227 | 162 | |
---|
228 | 163 | static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) |
---|
229 | 164 | { |
---|
| 165 | + struct dctcp *ca = inet_csk_ca(sk); |
---|
| 166 | + |
---|
230 | 167 | switch (ev) { |
---|
231 | 168 | case CA_EVENT_ECN_IS_CE: |
---|
232 | | - dctcp_ce_state_0_to_1(sk); |
---|
233 | | - break; |
---|
234 | 169 | case CA_EVENT_ECN_NO_CE: |
---|
235 | | - dctcp_ce_state_1_to_0(sk); |
---|
| 170 | + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); |
---|
236 | 171 | break; |
---|
237 | 172 | case CA_EVENT_LOSS: |
---|
238 | 173 | dctcp_react_to_loss(sk); |
---|
.. | .. |
---|
247 | 182 | union tcp_cc_info *info) |
---|
248 | 183 | { |
---|
249 | 184 | const struct dctcp *ca = inet_csk_ca(sk); |
---|
| 185 | + const struct tcp_sock *tp = tcp_sk(sk); |
---|
250 | 186 | |
---|
251 | 187 | /* Fill it also in case of VEGASINFO due to req struct limits. |
---|
252 | 188 | * We can still correctly retrieve it later. |
---|
.. | .. |
---|
258 | 194 | info->dctcp.dctcp_enabled = 1; |
---|
259 | 195 | info->dctcp.dctcp_ce_state = (u16) ca->ce_state; |
---|
260 | 196 | info->dctcp.dctcp_alpha = ca->dctcp_alpha; |
---|
261 | | - info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; |
---|
262 | | - info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; |
---|
| 197 | + info->dctcp.dctcp_ab_ecn = tp->mss_cache * |
---|
| 198 | + (tp->delivered_ce - ca->old_delivered_ce); |
---|
| 199 | + info->dctcp.dctcp_ab_tot = tp->mss_cache * |
---|
| 200 | + (tp->delivered - ca->old_delivered); |
---|
263 | 201 | } |
---|
264 | 202 | |
---|
265 | 203 | *attr = INET_DIAG_DCTCPINFO; |
---|