.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
---|
3 | 4 | * operating system. INET is implemented using the BSD Socket |
---|
.. | .. |
---|
6 | 7 | * Generic INET transport hashtables |
---|
7 | 8 | * |
---|
8 | 9 | * Authors: Lotsa people, from code originally in tcp |
---|
9 | | - * |
---|
10 | | - * This program is free software; you can redistribute it and/or |
---|
11 | | - * modify it under the terms of the GNU General Public License |
---|
12 | | - * as published by the Free Software Foundation; either version |
---|
13 | | - * 2 of the License, or (at your option) any later version. |
---|
14 | 10 | */ |
---|
15 | 11 | |
---|
16 | 12 | #include <linux/module.h> |
---|
.. | .. |
---|
19 | 15 | #include <linux/slab.h> |
---|
20 | 16 | #include <linux/wait.h> |
---|
21 | 17 | #include <linux/vmalloc.h> |
---|
22 | | -#include <linux/bootmem.h> |
---|
| 18 | +#include <linux/memblock.h> |
---|
23 | 19 | |
---|
24 | 20 | #include <net/addrconf.h> |
---|
25 | 21 | #include <net/inet_connection_sock.h> |
---|
26 | 22 | #include <net/inet_hashtables.h> |
---|
| 23 | +#if IS_ENABLED(CONFIG_IPV6) |
---|
| 24 | +#include <net/inet6_hashtables.h> |
---|
| 25 | +#endif |
---|
27 | 26 | #include <net/secure_seq.h> |
---|
28 | 27 | #include <net/ip.h> |
---|
29 | 28 | #include <net/tcp.h> |
---|
.. | .. |
---|
65 | 64 | struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, |
---|
66 | 65 | struct net *net, |
---|
67 | 66 | struct inet_bind_hashbucket *head, |
---|
68 | | - const unsigned short snum) |
---|
| 67 | + const unsigned short snum, |
---|
| 68 | + int l3mdev) |
---|
69 | 69 | { |
---|
70 | 70 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); |
---|
71 | 71 | |
---|
72 | 72 | if (tb) { |
---|
73 | 73 | write_pnet(&tb->ib_net, net); |
---|
| 74 | + tb->l3mdev = l3mdev; |
---|
74 | 75 | tb->port = snum; |
---|
75 | 76 | tb->fastreuse = 0; |
---|
76 | 77 | tb->fastreuseport = 0; |
---|
.. | .. |
---|
135 | 136 | table->bhash_size); |
---|
136 | 137 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; |
---|
137 | 138 | struct inet_bind_bucket *tb; |
---|
| 139 | + int l3mdev; |
---|
138 | 140 | |
---|
139 | 141 | spin_lock(&head->lock); |
---|
140 | 142 | tb = inet_csk(sk)->icsk_bind_hash; |
---|
.. | .. |
---|
143 | 145 | return -ENOENT; |
---|
144 | 146 | } |
---|
145 | 147 | if (tb->port != port) { |
---|
| 148 | + l3mdev = inet_sk_bound_l3mdev(sk); |
---|
| 149 | + |
---|
146 | 150 | /* NOTE: using tproxy and redirecting skbs to a proxy |
---|
147 | 151 | * on a different listener port breaks the assumption |
---|
148 | 152 | * that the listener socket's icsk_bind_hash is the same |
---|
.. | .. |
---|
150 | 154 | * create a new bind bucket for the child here. */ |
---|
151 | 155 | inet_bind_bucket_for_each(tb, &head->chain) { |
---|
152 | 156 | if (net_eq(ib_net(tb), sock_net(sk)) && |
---|
153 | | - tb->port == port) |
---|
| 157 | + tb->l3mdev == l3mdev && tb->port == port) |
---|
154 | 158 | break; |
---|
155 | 159 | } |
---|
156 | 160 | if (!tb) { |
---|
157 | 161 | tb = inet_bind_bucket_create(table->bind_bucket_cachep, |
---|
158 | | - sock_net(sk), head, port); |
---|
| 162 | + sock_net(sk), head, port, |
---|
| 163 | + l3mdev); |
---|
159 | 164 | if (!tb) { |
---|
160 | 165 | spin_unlock(&head->lock); |
---|
161 | 166 | return -ENOMEM; |
---|
.. | .. |
---|
226 | 231 | |
---|
227 | 232 | static inline int compute_score(struct sock *sk, struct net *net, |
---|
228 | 233 | const unsigned short hnum, const __be32 daddr, |
---|
229 | | - const int dif, const int sdif, bool exact_dif) |
---|
| 234 | + const int dif, const int sdif) |
---|
230 | 235 | { |
---|
231 | 236 | int score = -1; |
---|
232 | | - struct inet_sock *inet = inet_sk(sk); |
---|
233 | 237 | |
---|
234 | | - if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && |
---|
| 238 | + if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && |
---|
235 | 239 | !ipv6_only_sock(sk)) { |
---|
236 | | - __be32 rcv_saddr = inet->inet_rcv_saddr; |
---|
237 | | - score = sk->sk_family == PF_INET ? 2 : 1; |
---|
238 | | - if (rcv_saddr) { |
---|
239 | | - if (rcv_saddr != daddr) |
---|
240 | | - return -1; |
---|
241 | | - score += 4; |
---|
242 | | - } |
---|
243 | | - if (sk->sk_bound_dev_if || exact_dif) { |
---|
244 | | - bool dev_match = (sk->sk_bound_dev_if == dif || |
---|
245 | | - sk->sk_bound_dev_if == sdif); |
---|
| 240 | + if (sk->sk_rcv_saddr != daddr) |
---|
| 241 | + return -1; |
---|
246 | 242 | |
---|
247 | | - if (!dev_match) |
---|
248 | | - return -1; |
---|
249 | | - if (sk->sk_bound_dev_if) |
---|
250 | | - score += 4; |
---|
251 | | - } |
---|
| 243 | + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) |
---|
| 244 | + return -1; |
---|
| 245 | + score = sk->sk_bound_dev_if ? 2 : 1; |
---|
| 246 | + |
---|
| 247 | + if (sk->sk_family == PF_INET) |
---|
| 248 | + score++; |
---|
252 | 249 | if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) |
---|
253 | 250 | score++; |
---|
254 | 251 | } |
---|
255 | 252 | return score; |
---|
| 253 | +} |
---|
| 254 | + |
---|
| 255 | +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, |
---|
| 256 | + struct sk_buff *skb, int doff, |
---|
| 257 | + __be32 saddr, __be16 sport, |
---|
| 258 | + __be32 daddr, unsigned short hnum) |
---|
| 259 | +{ |
---|
| 260 | + struct sock *reuse_sk = NULL; |
---|
| 261 | + u32 phash; |
---|
| 262 | + |
---|
| 263 | + if (sk->sk_reuseport) { |
---|
| 264 | + phash = inet_ehashfn(net, daddr, hnum, saddr, sport); |
---|
| 265 | + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); |
---|
| 266 | + } |
---|
| 267 | + return reuse_sk; |
---|
256 | 268 | } |
---|
257 | 269 | |
---|
258 | 270 | /* |
---|
.. | .. |
---|
270 | 282 | const __be32 daddr, const unsigned short hnum, |
---|
271 | 283 | const int dif, const int sdif) |
---|
272 | 284 | { |
---|
273 | | - bool exact_dif = inet_exact_dif_match(net, skb); |
---|
274 | 285 | struct inet_connection_sock *icsk; |
---|
275 | 286 | struct sock *sk, *result = NULL; |
---|
276 | 287 | int score, hiscore = 0; |
---|
277 | | - u32 phash = 0; |
---|
278 | 288 | |
---|
279 | 289 | inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { |
---|
280 | 290 | sk = (struct sock *)icsk; |
---|
281 | | - score = compute_score(sk, net, hnum, daddr, |
---|
282 | | - dif, sdif, exact_dif); |
---|
| 291 | + score = compute_score(sk, net, hnum, daddr, dif, sdif); |
---|
283 | 292 | if (score > hiscore) { |
---|
284 | | - if (sk->sk_reuseport) { |
---|
285 | | - phash = inet_ehashfn(net, daddr, hnum, |
---|
286 | | - saddr, sport); |
---|
287 | | - result = reuseport_select_sock(sk, phash, |
---|
288 | | - skb, doff); |
---|
289 | | - if (result) |
---|
290 | | - return result; |
---|
291 | | - } |
---|
| 293 | + result = lookup_reuseport(net, sk, skb, doff, |
---|
| 294 | + saddr, sport, daddr, hnum); |
---|
| 295 | + if (result) |
---|
| 296 | + return result; |
---|
| 297 | + |
---|
292 | 298 | result = sk; |
---|
293 | 299 | hiscore = score; |
---|
294 | 300 | } |
---|
295 | 301 | } |
---|
296 | 302 | |
---|
297 | 303 | return result; |
---|
| 304 | +} |
---|
| 305 | + |
---|
| 306 | +static inline struct sock *inet_lookup_run_bpf(struct net *net, |
---|
| 307 | + struct inet_hashinfo *hashinfo, |
---|
| 308 | + struct sk_buff *skb, int doff, |
---|
| 309 | + __be32 saddr, __be16 sport, |
---|
| 310 | + __be32 daddr, u16 hnum) |
---|
| 311 | +{ |
---|
| 312 | + struct sock *sk, *reuse_sk; |
---|
| 313 | + bool no_reuseport; |
---|
| 314 | + |
---|
| 315 | + if (hashinfo != &tcp_hashinfo) |
---|
| 316 | + return NULL; /* only TCP is supported */ |
---|
| 317 | + |
---|
| 318 | + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, |
---|
| 319 | + saddr, sport, daddr, hnum, &sk); |
---|
| 320 | + if (no_reuseport || IS_ERR_OR_NULL(sk)) |
---|
| 321 | + return sk; |
---|
| 322 | + |
---|
| 323 | + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); |
---|
| 324 | + if (reuse_sk) |
---|
| 325 | + sk = reuse_sk; |
---|
| 326 | + return sk; |
---|
298 | 327 | } |
---|
299 | 328 | |
---|
300 | 329 | struct sock *__inet_lookup_listener(struct net *net, |
---|
.. | .. |
---|
304 | 333 | const __be32 daddr, const unsigned short hnum, |
---|
305 | 334 | const int dif, const int sdif) |
---|
306 | 335 | { |
---|
307 | | - unsigned int hash = inet_lhashfn(net, hnum); |
---|
308 | | - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; |
---|
309 | | - bool exact_dif = inet_exact_dif_match(net, skb); |
---|
310 | 336 | struct inet_listen_hashbucket *ilb2; |
---|
311 | | - struct sock *sk, *result = NULL; |
---|
312 | | - struct hlist_nulls_node *node; |
---|
313 | | - int score, hiscore = 0; |
---|
| 337 | + struct sock *result = NULL; |
---|
314 | 338 | unsigned int hash2; |
---|
315 | | - u32 phash = 0; |
---|
316 | 339 | |
---|
317 | | - if (ilb->count <= 10 || !hashinfo->lhash2) |
---|
318 | | - goto port_lookup; |
---|
319 | | - |
---|
320 | | - /* Too many sk in the ilb bucket (which is hashed by port alone). |
---|
321 | | - * Try lhash2 (which is hashed by port and addr) instead. |
---|
322 | | - */ |
---|
| 340 | + /* Lookup redirect from BPF */ |
---|
| 341 | + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { |
---|
| 342 | + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, |
---|
| 343 | + saddr, sport, daddr, hnum); |
---|
| 344 | + if (result) |
---|
| 345 | + goto done; |
---|
| 346 | + } |
---|
323 | 347 | |
---|
324 | 348 | hash2 = ipv4_portaddr_hash(net, daddr, hnum); |
---|
325 | 349 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
---|
326 | | - if (ilb2->count > ilb->count) |
---|
327 | | - goto port_lookup; |
---|
328 | 350 | |
---|
329 | 351 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
---|
330 | 352 | saddr, sport, daddr, hnum, |
---|
.. | .. |
---|
333 | 355 | goto done; |
---|
334 | 356 | |
---|
335 | 357 | /* Lookup lhash2 with INADDR_ANY */ |
---|
336 | | - |
---|
337 | 358 | hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); |
---|
338 | 359 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
---|
339 | | - if (ilb2->count > ilb->count) |
---|
340 | | - goto port_lookup; |
---|
341 | 360 | |
---|
342 | 361 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
---|
343 | | - saddr, sport, daddr, hnum, |
---|
| 362 | + saddr, sport, htonl(INADDR_ANY), hnum, |
---|
344 | 363 | dif, sdif); |
---|
345 | | - goto done; |
---|
346 | | - |
---|
347 | | -port_lookup: |
---|
348 | | - sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) { |
---|
349 | | - score = compute_score(sk, net, hnum, daddr, |
---|
350 | | - dif, sdif, exact_dif); |
---|
351 | | - if (score > hiscore) { |
---|
352 | | - if (sk->sk_reuseport) { |
---|
353 | | - phash = inet_ehashfn(net, daddr, hnum, |
---|
354 | | - saddr, sport); |
---|
355 | | - result = reuseport_select_sock(sk, phash, |
---|
356 | | - skb, doff); |
---|
357 | | - if (result) |
---|
358 | | - goto done; |
---|
359 | | - } |
---|
360 | | - result = sk; |
---|
361 | | - hiscore = score; |
---|
362 | | - } |
---|
363 | | - } |
---|
364 | 364 | done: |
---|
365 | | - if (unlikely(IS_ERR(result))) |
---|
| 365 | + if (IS_ERR(result)) |
---|
366 | 366 | return NULL; |
---|
367 | 367 | return result; |
---|
368 | 368 | } |
---|
.. | .. |
---|
410 | 410 | sk_nulls_for_each_rcu(sk, node, &head->chain) { |
---|
411 | 411 | if (sk->sk_hash != hash) |
---|
412 | 412 | continue; |
---|
413 | | - if (likely(INET_MATCH(sk, net, acookie, |
---|
414 | | - saddr, daddr, ports, dif, sdif))) { |
---|
| 413 | + if (likely(INET_MATCH(net, sk, acookie, ports, dif, sdif))) { |
---|
415 | 414 | if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) |
---|
416 | 415 | goto out; |
---|
417 | | - if (unlikely(!INET_MATCH(sk, net, acookie, |
---|
418 | | - saddr, daddr, ports, |
---|
419 | | - dif, sdif))) { |
---|
| 416 | + if (unlikely(!INET_MATCH(net, sk, acookie, |
---|
| 417 | + ports, dif, sdif))) { |
---|
420 | 418 | sock_gen_put(sk); |
---|
421 | 419 | goto begin; |
---|
422 | 420 | } |
---|
.. | .. |
---|
465 | 463 | if (sk2->sk_hash != hash) |
---|
466 | 464 | continue; |
---|
467 | 465 | |
---|
468 | | - if (likely(INET_MATCH(sk2, net, acookie, |
---|
469 | | - saddr, daddr, ports, dif, sdif))) { |
---|
| 466 | + if (likely(INET_MATCH(net, sk2, acookie, ports, dif, sdif))) { |
---|
470 | 467 | if (sk2->sk_state == TCP_TIME_WAIT) { |
---|
471 | 468 | tw = inet_twsk(sk2); |
---|
472 | 469 | if (twsk_unique(sk, sk2, twp)) |
---|
.. | .. |
---|
504 | 501 | return -EADDRNOTAVAIL; |
---|
505 | 502 | } |
---|
506 | 503 | |
---|
507 | | -static u32 inet_sk_port_offset(const struct sock *sk) |
---|
| 504 | +static u64 inet_sk_port_offset(const struct sock *sk) |
---|
508 | 505 | { |
---|
509 | 506 | const struct inet_sock *inet = inet_sk(sk); |
---|
510 | 507 | |
---|
.. | .. |
---|
513 | 510 | inet->inet_dport); |
---|
514 | 511 | } |
---|
515 | 512 | |
---|
516 | | -/* insert a socket into ehash, and eventually remove another one |
---|
517 | | - * (The another one can be a SYN_RECV or TIMEWAIT |
---|
| 513 | +/* Searches for an existing socket in the ehash bucket list. |
---|
| 514 | + * Returns true if found, false otherwise. |
---|
518 | 515 | */ |
---|
519 | | -bool inet_ehash_insert(struct sock *sk, struct sock *osk) |
---|
| 516 | +static bool inet_ehash_lookup_by_sk(struct sock *sk, |
---|
| 517 | + struct hlist_nulls_head *list) |
---|
| 518 | +{ |
---|
| 519 | + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); |
---|
| 520 | + const int sdif = sk->sk_bound_dev_if; |
---|
| 521 | + const int dif = sk->sk_bound_dev_if; |
---|
| 522 | + const struct hlist_nulls_node *node; |
---|
| 523 | + struct net *net = sock_net(sk); |
---|
| 524 | + struct sock *esk; |
---|
| 525 | + |
---|
| 526 | + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); |
---|
| 527 | + |
---|
| 528 | + sk_nulls_for_each_rcu(esk, node, list) { |
---|
| 529 | + if (esk->sk_hash != sk->sk_hash) |
---|
| 530 | + continue; |
---|
| 531 | + if (sk->sk_family == AF_INET) { |
---|
| 532 | + if (unlikely(INET_MATCH(net, esk, acookie, |
---|
| 533 | + ports, dif, sdif))) { |
---|
| 534 | + return true; |
---|
| 535 | + } |
---|
| 536 | + } |
---|
| 537 | +#if IS_ENABLED(CONFIG_IPV6) |
---|
| 538 | + else if (sk->sk_family == AF_INET6) { |
---|
| 539 | + if (unlikely(inet6_match(net, esk, |
---|
| 540 | + &sk->sk_v6_daddr, |
---|
| 541 | + &sk->sk_v6_rcv_saddr, |
---|
| 542 | + ports, dif, sdif))) { |
---|
| 543 | + return true; |
---|
| 544 | + } |
---|
| 545 | + } |
---|
| 546 | +#endif |
---|
| 547 | + } |
---|
| 548 | + return false; |
---|
| 549 | +} |
---|
| 550 | + |
---|
| 551 | +/* Insert a socket into ehash, and eventually remove another one |
---|
| 552 | + * (The other one can be a SYN_RECV or TIMEWAIT) |
---|
| 553 | + * If a matching socket already exists, socket sk is not inserted, |
---|
| 554 | + * and the found_dup_sk parameter is set to true. |
---|
| 555 | + */ |
---|
| 556 | +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) |
---|
520 | 557 | { |
---|
521 | 558 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
---|
522 | 559 | struct hlist_nulls_head *list; |
---|
.. | .. |
---|
535 | 572 | if (osk) { |
---|
536 | 573 | WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); |
---|
537 | 574 | ret = sk_nulls_del_node_init_rcu(osk); |
---|
| 575 | + } else if (found_dup_sk) { |
---|
| 576 | + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); |
---|
| 577 | + if (*found_dup_sk) |
---|
| 578 | + ret = false; |
---|
538 | 579 | } |
---|
| 580 | + |
---|
539 | 581 | if (ret) |
---|
540 | 582 | __sk_nulls_add_node_rcu(sk, list); |
---|
| 583 | + |
---|
541 | 584 | spin_unlock(lock); |
---|
| 585 | + |
---|
542 | 586 | return ret; |
---|
543 | 587 | } |
---|
544 | 588 | |
---|
545 | | -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) |
---|
| 589 | +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) |
---|
546 | 590 | { |
---|
547 | | - bool ok = inet_ehash_insert(sk, osk); |
---|
| 591 | + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); |
---|
548 | 592 | |
---|
549 | 593 | if (ok) { |
---|
550 | 594 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
---|
.. | .. |
---|
588 | 632 | int err = 0; |
---|
589 | 633 | |
---|
590 | 634 | if (sk->sk_state != TCP_LISTEN) { |
---|
591 | | - inet_ehash_nolisten(sk, osk); |
---|
| 635 | + local_bh_disable(); |
---|
| 636 | + inet_ehash_nolisten(sk, osk, NULL); |
---|
| 637 | + local_bh_enable(); |
---|
592 | 638 | return 0; |
---|
593 | 639 | } |
---|
594 | 640 | WARN_ON(!sk_unhashed(sk)); |
---|
.. | .. |
---|
620 | 666 | { |
---|
621 | 667 | int err = 0; |
---|
622 | 668 | |
---|
623 | | - if (sk->sk_state != TCP_CLOSE) { |
---|
624 | | - local_bh_disable(); |
---|
| 669 | + if (sk->sk_state != TCP_CLOSE) |
---|
625 | 670 | err = __inet_hash(sk, NULL); |
---|
626 | | - local_bh_enable(); |
---|
627 | | - } |
---|
628 | 671 | |
---|
629 | 672 | return err; |
---|
630 | 673 | } |
---|
631 | 674 | EXPORT_SYMBOL_GPL(inet_hash); |
---|
632 | 675 | |
---|
633 | | -void inet_unhash(struct sock *sk) |
---|
| 676 | +static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) |
---|
634 | 677 | { |
---|
635 | | - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
---|
636 | | - struct inet_listen_hashbucket *ilb = NULL; |
---|
637 | | - spinlock_t *lock; |
---|
638 | | - |
---|
639 | 678 | if (sk_unhashed(sk)) |
---|
640 | 679 | return; |
---|
641 | | - |
---|
642 | | - if (sk->sk_state == TCP_LISTEN) { |
---|
643 | | - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
---|
644 | | - lock = &ilb->lock; |
---|
645 | | - } else { |
---|
646 | | - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
---|
647 | | - } |
---|
648 | | - spin_lock_bh(lock); |
---|
649 | | - if (sk_unhashed(sk)) |
---|
650 | | - goto unlock; |
---|
651 | 680 | |
---|
652 | 681 | if (rcu_access_pointer(sk->sk_reuseport_cb)) |
---|
653 | 682 | reuseport_detach_sock(sk); |
---|
654 | 683 | if (ilb) { |
---|
| 684 | + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
---|
| 685 | + |
---|
655 | 686 | inet_unhash2(hashinfo, sk); |
---|
656 | 687 | ilb->count--; |
---|
657 | 688 | } |
---|
658 | 689 | __sk_nulls_del_node_init_rcu(sk); |
---|
659 | 690 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
---|
660 | | -unlock: |
---|
661 | | - spin_unlock_bh(lock); |
---|
| 691 | +} |
---|
| 692 | + |
---|
| 693 | +void inet_unhash(struct sock *sk) |
---|
| 694 | +{ |
---|
| 695 | + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
---|
| 696 | + |
---|
| 697 | + if (sk_unhashed(sk)) |
---|
| 698 | + return; |
---|
| 699 | + |
---|
| 700 | + if (sk->sk_state == TCP_LISTEN) { |
---|
| 701 | + struct inet_listen_hashbucket *ilb; |
---|
| 702 | + |
---|
| 703 | + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
---|
| 704 | + /* Don't disable bottom halves while acquiring the lock to |
---|
| 705 | + * avoid circular locking dependency on PREEMPT_RT. |
---|
| 706 | + */ |
---|
| 707 | + spin_lock(&ilb->lock); |
---|
| 708 | + __inet_unhash(sk, ilb); |
---|
| 709 | + spin_unlock(&ilb->lock); |
---|
| 710 | + } else { |
---|
| 711 | + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
---|
| 712 | + |
---|
| 713 | + spin_lock_bh(lock); |
---|
| 714 | + __inet_unhash(sk, NULL); |
---|
| 715 | + spin_unlock_bh(lock); |
---|
| 716 | + } |
---|
662 | 717 | } |
---|
663 | 718 | EXPORT_SYMBOL_GPL(inet_unhash); |
---|
664 | 719 | |
---|
| 720 | +/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm |
---|
| 721 | + * Note that we use 32bit integers (vs RFC 'short integers') |
---|
| 722 | + * because 2^16 is not a multiple of num_ephemeral and this |
---|
| 723 | + * property might be used by a clever attacker. |
---|
| 724 | + * |
---|
| 725 | + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though |
---|
| 726 | + * attacks have since been demonstrated, thus we use 65536 by default instead |
---|
| 727 | + * to really give more isolation and privacy, at the expense of 256kB |
---|
| 728 | + * of kernel memory. |
---|
| 729 | + */ |
---|
| 730 | +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) |
---|
| 731 | +static u32 *table_perturb; |
---|
| 732 | + |
---|
665 | 733 | int __inet_hash_connect(struct inet_timewait_death_row *death_row, |
---|
666 | | - struct sock *sk, u32 port_offset, |
---|
| 734 | + struct sock *sk, u64 port_offset, |
---|
667 | 735 | int (*check_established)(struct inet_timewait_death_row *, |
---|
668 | 736 | struct sock *, __u16, struct inet_timewait_sock **)) |
---|
669 | 737 | { |
---|
.. | .. |
---|
675 | 743 | struct inet_bind_bucket *tb; |
---|
676 | 744 | u32 remaining, offset; |
---|
677 | 745 | int ret, i, low, high; |
---|
678 | | - static u32 hint; |
---|
| 746 | + int l3mdev; |
---|
| 747 | + u32 index; |
---|
679 | 748 | |
---|
680 | 749 | if (port) { |
---|
681 | | - head = &hinfo->bhash[inet_bhashfn(net, port, |
---|
682 | | - hinfo->bhash_size)]; |
---|
683 | | - tb = inet_csk(sk)->icsk_bind_hash; |
---|
684 | | - spin_lock_bh(&head->lock); |
---|
685 | | - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
---|
686 | | - inet_ehash_nolisten(sk, NULL); |
---|
687 | | - spin_unlock_bh(&head->lock); |
---|
688 | | - return 0; |
---|
689 | | - } |
---|
690 | | - spin_unlock(&head->lock); |
---|
691 | | - /* No definite answer... Walk to established hash table */ |
---|
| 750 | + local_bh_disable(); |
---|
692 | 751 | ret = check_established(death_row, sk, port, NULL); |
---|
693 | 752 | local_bh_enable(); |
---|
694 | 753 | return ret; |
---|
695 | 754 | } |
---|
| 755 | + |
---|
| 756 | + l3mdev = inet_sk_bound_l3mdev(sk); |
---|
696 | 757 | |
---|
697 | 758 | inet_get_local_port_range(net, &low, &high); |
---|
698 | 759 | high++; /* [32768, 60999] -> [32768, 61000[ */ |
---|
.. | .. |
---|
700 | 761 | if (likely(remaining > 1)) |
---|
701 | 762 | remaining &= ~1U; |
---|
702 | 763 | |
---|
703 | | - offset = (hint + port_offset) % remaining; |
---|
| 764 | + get_random_slow_once(table_perturb, |
---|
| 765 | + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); |
---|
| 766 | + index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); |
---|
| 767 | + |
---|
| 768 | + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); |
---|
| 769 | + offset %= remaining; |
---|
| 770 | + |
---|
704 | 771 | /* In first pass we try ports of @low parity. |
---|
705 | 772 | * inet_csk_get_port() does the opposite choice. |
---|
706 | 773 | */ |
---|
.. | .. |
---|
720 | 787 | * the established check is already unique enough. |
---|
721 | 788 | */ |
---|
722 | 789 | inet_bind_bucket_for_each(tb, &head->chain) { |
---|
723 | | - if (net_eq(ib_net(tb), net) && tb->port == port) { |
---|
| 790 | + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && |
---|
| 791 | + tb->port == port) { |
---|
724 | 792 | if (tb->fastreuse >= 0 || |
---|
725 | 793 | tb->fastreuseport >= 0) |
---|
726 | 794 | goto next_port; |
---|
.. | .. |
---|
733 | 801 | } |
---|
734 | 802 | |
---|
735 | 803 | tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, |
---|
736 | | - net, head, port); |
---|
| 804 | + net, head, port, l3mdev); |
---|
737 | 805 | if (!tb) { |
---|
738 | 806 | spin_unlock_bh(&head->lock); |
---|
739 | 807 | return -ENOMEM; |
---|
.. | .. |
---|
753 | 821 | return -EADDRNOTAVAIL; |
---|
754 | 822 | |
---|
755 | 823 | ok: |
---|
756 | | - hint += i + 2; |
---|
| 824 | + /* Here we want to add a little bit of randomness to the next source |
---|
| 825 | + * port that will be chosen. We use a max() with a random value here so that |
---|
| 826 | + * on low contention the randomness is maximal and on high contention |
---|
| 827 | + * it may be nonexistent. |
---|
| 828 | + */ |
---|
| 829 | + i = max_t(int, i, (prandom_u32() & 7) * 2); |
---|
| 830 | + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); |
---|
757 | 831 | |
---|
758 | 832 | /* Head lock still held and bh's disabled */ |
---|
759 | 833 | inet_bind_hash(sk, tb, port); |
---|
760 | 834 | if (sk_unhashed(sk)) { |
---|
761 | 835 | inet_sk(sk)->inet_sport = htons(port); |
---|
762 | | - inet_ehash_nolisten(sk, (struct sock *)tw); |
---|
| 836 | + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); |
---|
763 | 837 | } |
---|
764 | 838 | if (tw) |
---|
765 | 839 | inet_twsk_bind_unhash(tw, hinfo); |
---|
.. | .. |
---|
776 | 850 | int inet_hash_connect(struct inet_timewait_death_row *death_row, |
---|
777 | 851 | struct sock *sk) |
---|
778 | 852 | { |
---|
779 | | - u32 port_offset = 0; |
---|
| 853 | + u64 port_offset = 0; |
---|
780 | 854 | |
---|
781 | 855 | if (!inet_sk(sk)->inet_num) |
---|
782 | 856 | port_offset = inet_sk_port_offset(sk); |
---|
.. | .. |
---|
800 | 874 | } |
---|
801 | 875 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); |
---|
802 | 876 | |
---|
| 877 | +static void init_hashinfo_lhash2(struct inet_hashinfo *h) |
---|
| 878 | +{ |
---|
| 879 | + int i; |
---|
| 880 | + |
---|
| 881 | + for (i = 0; i <= h->lhash2_mask; i++) { |
---|
| 882 | + spin_lock_init(&h->lhash2[i].lock); |
---|
| 883 | + INIT_HLIST_HEAD(&h->lhash2[i].head); |
---|
| 884 | + h->lhash2[i].count = 0; |
---|
| 885 | + } |
---|
| 886 | +} |
---|
| 887 | + |
---|
803 | 888 | void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, |
---|
804 | 889 | unsigned long numentries, int scale, |
---|
805 | 890 | unsigned long low_limit, |
---|
806 | 891 | unsigned long high_limit) |
---|
807 | 892 | { |
---|
808 | | - unsigned int i; |
---|
809 | | - |
---|
810 | 893 | h->lhash2 = alloc_large_system_hash(name, |
---|
811 | 894 | sizeof(*h->lhash2), |
---|
812 | 895 | numentries, |
---|
.. | .. |
---|
816 | 899 | &h->lhash2_mask, |
---|
817 | 900 | low_limit, |
---|
818 | 901 | high_limit); |
---|
| 902 | + init_hashinfo_lhash2(h); |
---|
819 | 903 | |
---|
820 | | - for (i = 0; i <= h->lhash2_mask; i++) { |
---|
821 | | - spin_lock_init(&h->lhash2[i].lock); |
---|
822 | | - INIT_HLIST_HEAD(&h->lhash2[i].head); |
---|
823 | | - h->lhash2[i].count = 0; |
---|
824 | | - } |
---|
| 904 | + /* this one is used for source ports of outgoing connections */ |
---|
| 905 | + table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, |
---|
| 906 | + sizeof(*table_perturb), GFP_KERNEL); |
---|
| 907 | + if (!table_perturb) |
---|
| 908 | + panic("TCP: failed to alloc table_perturb"); |
---|
825 | 909 | } |
---|
826 | 910 | |
---|
| 911 | +int inet_hashinfo2_init_mod(struct inet_hashinfo *h) |
---|
| 912 | +{ |
---|
| 913 | + h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); |
---|
| 914 | + if (!h->lhash2) |
---|
| 915 | + return -ENOMEM; |
---|
| 916 | + |
---|
| 917 | + h->lhash2_mask = INET_LHTABLE_SIZE - 1; |
---|
| 918 | + /* INET_LHTABLE_SIZE must be a power of 2 */ |
---|
| 919 | + BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); |
---|
| 920 | + |
---|
| 921 | + init_hashinfo_lhash2(h); |
---|
| 922 | + return 0; |
---|
| 923 | +} |
---|
| 924 | +EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); |
---|
| 925 | + |
---|
827 | 926 | int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) |
---|
828 | 927 | { |
---|
829 | 928 | unsigned int locksz = sizeof(spinlock_t); |
---|