| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
|---|
| 3 | 4 | * operating system. INET is implemented using the BSD Socket |
|---|
| .. | .. |
|---|
| 6 | 7 | * Generic INET transport hashtables |
|---|
| 7 | 8 | * |
|---|
| 8 | 9 | * Authors: Lotsa people, from code originally in tcp |
|---|
| 9 | | - * |
|---|
| 10 | | - * This program is free software; you can redistribute it and/or |
|---|
| 11 | | - * modify it under the terms of the GNU General Public License |
|---|
| 12 | | - * as published by the Free Software Foundation; either version |
|---|
| 13 | | - * 2 of the License, or (at your option) any later version. |
|---|
| 14 | 10 | */ |
|---|
| 15 | 11 | |
|---|
| 16 | 12 | #include <linux/module.h> |
|---|
| .. | .. |
|---|
| 19 | 15 | #include <linux/slab.h> |
|---|
| 20 | 16 | #include <linux/wait.h> |
|---|
| 21 | 17 | #include <linux/vmalloc.h> |
|---|
| 22 | | -#include <linux/bootmem.h> |
|---|
| 18 | +#include <linux/memblock.h> |
|---|
| 23 | 19 | |
|---|
| 24 | 20 | #include <net/addrconf.h> |
|---|
| 25 | 21 | #include <net/inet_connection_sock.h> |
|---|
| 26 | 22 | #include <net/inet_hashtables.h> |
|---|
| 23 | +#if IS_ENABLED(CONFIG_IPV6) |
|---|
| 24 | +#include <net/inet6_hashtables.h> |
|---|
| 25 | +#endif |
|---|
| 27 | 26 | #include <net/secure_seq.h> |
|---|
| 28 | 27 | #include <net/ip.h> |
|---|
| 29 | 28 | #include <net/tcp.h> |
|---|
| .. | .. |
|---|
| 65 | 64 | struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, |
|---|
| 66 | 65 | struct net *net, |
|---|
| 67 | 66 | struct inet_bind_hashbucket *head, |
|---|
| 68 | | - const unsigned short snum) |
|---|
| 67 | + const unsigned short snum, |
|---|
| 68 | + int l3mdev) |
|---|
| 69 | 69 | { |
|---|
| 70 | 70 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); |
|---|
| 71 | 71 | |
|---|
| 72 | 72 | if (tb) { |
|---|
| 73 | 73 | write_pnet(&tb->ib_net, net); |
|---|
| 74 | + tb->l3mdev = l3mdev; |
|---|
| 74 | 75 | tb->port = snum; |
|---|
| 75 | 76 | tb->fastreuse = 0; |
|---|
| 76 | 77 | tb->fastreuseport = 0; |
|---|
| .. | .. |
|---|
| 135 | 136 | table->bhash_size); |
|---|
| 136 | 137 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; |
|---|
| 137 | 138 | struct inet_bind_bucket *tb; |
|---|
| 139 | + int l3mdev; |
|---|
| 138 | 140 | |
|---|
| 139 | 141 | spin_lock(&head->lock); |
|---|
| 140 | 142 | tb = inet_csk(sk)->icsk_bind_hash; |
|---|
| .. | .. |
|---|
| 143 | 145 | return -ENOENT; |
|---|
| 144 | 146 | } |
|---|
| 145 | 147 | if (tb->port != port) { |
|---|
| 148 | + l3mdev = inet_sk_bound_l3mdev(sk); |
|---|
| 149 | + |
|---|
| 146 | 150 | /* NOTE: using tproxy and redirecting skbs to a proxy |
|---|
| 147 | 151 | * on a different listener port breaks the assumption |
|---|
| 148 | 152 | * that the listener socket's icsk_bind_hash is the same |
|---|
| .. | .. |
|---|
| 150 | 154 | * create a new bind bucket for the child here. */ |
|---|
| 151 | 155 | inet_bind_bucket_for_each(tb, &head->chain) { |
|---|
| 152 | 156 | if (net_eq(ib_net(tb), sock_net(sk)) && |
|---|
| 153 | | - tb->port == port) |
|---|
| 157 | + tb->l3mdev == l3mdev && tb->port == port) |
|---|
| 154 | 158 | break; |
|---|
| 155 | 159 | } |
|---|
| 156 | 160 | if (!tb) { |
|---|
| 157 | 161 | tb = inet_bind_bucket_create(table->bind_bucket_cachep, |
|---|
| 158 | | - sock_net(sk), head, port); |
|---|
| 162 | + sock_net(sk), head, port, |
|---|
| 163 | + l3mdev); |
|---|
| 159 | 164 | if (!tb) { |
|---|
| 160 | 165 | spin_unlock(&head->lock); |
|---|
| 161 | 166 | return -ENOMEM; |
|---|
| .. | .. |
|---|
| 226 | 231 | |
|---|
| 227 | 232 | static inline int compute_score(struct sock *sk, struct net *net, |
|---|
| 228 | 233 | const unsigned short hnum, const __be32 daddr, |
|---|
| 229 | | - const int dif, const int sdif, bool exact_dif) |
|---|
| 234 | + const int dif, const int sdif) |
|---|
| 230 | 235 | { |
|---|
| 231 | 236 | int score = -1; |
|---|
| 232 | | - struct inet_sock *inet = inet_sk(sk); |
|---|
| 233 | 237 | |
|---|
| 234 | | - if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && |
|---|
| 238 | + if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && |
|---|
| 235 | 239 | !ipv6_only_sock(sk)) { |
|---|
| 236 | | - __be32 rcv_saddr = inet->inet_rcv_saddr; |
|---|
| 237 | | - score = sk->sk_family == PF_INET ? 2 : 1; |
|---|
| 238 | | - if (rcv_saddr) { |
|---|
| 239 | | - if (rcv_saddr != daddr) |
|---|
| 240 | | - return -1; |
|---|
| 241 | | - score += 4; |
|---|
| 242 | | - } |
|---|
| 243 | | - if (sk->sk_bound_dev_if || exact_dif) { |
|---|
| 244 | | - bool dev_match = (sk->sk_bound_dev_if == dif || |
|---|
| 245 | | - sk->sk_bound_dev_if == sdif); |
|---|
| 240 | + if (sk->sk_rcv_saddr != daddr) |
|---|
| 241 | + return -1; |
|---|
| 246 | 242 | |
|---|
| 247 | | - if (!dev_match) |
|---|
| 248 | | - return -1; |
|---|
| 249 | | - if (sk->sk_bound_dev_if) |
|---|
| 250 | | - score += 4; |
|---|
| 251 | | - } |
|---|
| 243 | + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) |
|---|
| 244 | + return -1; |
|---|
| 245 | + score = sk->sk_bound_dev_if ? 2 : 1; |
|---|
| 246 | + |
|---|
| 247 | + if (sk->sk_family == PF_INET) |
|---|
| 248 | + score++; |
|---|
| 252 | 249 | if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) |
|---|
| 253 | 250 | score++; |
|---|
| 254 | 251 | } |
|---|
| 255 | 252 | return score; |
|---|
| 253 | +} |
|---|
| 254 | + |
|---|
| 255 | +static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, |
|---|
| 256 | + struct sk_buff *skb, int doff, |
|---|
| 257 | + __be32 saddr, __be16 sport, |
|---|
| 258 | + __be32 daddr, unsigned short hnum) |
|---|
| 259 | +{ |
|---|
| 260 | + struct sock *reuse_sk = NULL; |
|---|
| 261 | + u32 phash; |
|---|
| 262 | + |
|---|
| 263 | + if (sk->sk_reuseport) { |
|---|
| 264 | + phash = inet_ehashfn(net, daddr, hnum, saddr, sport); |
|---|
| 265 | + reuse_sk = reuseport_select_sock(sk, phash, skb, doff); |
|---|
| 266 | + } |
|---|
| 267 | + return reuse_sk; |
|---|
| 256 | 268 | } |
|---|
| 257 | 269 | |
|---|
| 258 | 270 | /* |
|---|
| .. | .. |
|---|
| 270 | 282 | const __be32 daddr, const unsigned short hnum, |
|---|
| 271 | 283 | const int dif, const int sdif) |
|---|
| 272 | 284 | { |
|---|
| 273 | | - bool exact_dif = inet_exact_dif_match(net, skb); |
|---|
| 274 | 285 | struct inet_connection_sock *icsk; |
|---|
| 275 | 286 | struct sock *sk, *result = NULL; |
|---|
| 276 | 287 | int score, hiscore = 0; |
|---|
| 277 | | - u32 phash = 0; |
|---|
| 278 | 288 | |
|---|
| 279 | 289 | inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { |
|---|
| 280 | 290 | sk = (struct sock *)icsk; |
|---|
| 281 | | - score = compute_score(sk, net, hnum, daddr, |
|---|
| 282 | | - dif, sdif, exact_dif); |
|---|
| 291 | + score = compute_score(sk, net, hnum, daddr, dif, sdif); |
|---|
| 283 | 292 | if (score > hiscore) { |
|---|
| 284 | | - if (sk->sk_reuseport) { |
|---|
| 285 | | - phash = inet_ehashfn(net, daddr, hnum, |
|---|
| 286 | | - saddr, sport); |
|---|
| 287 | | - result = reuseport_select_sock(sk, phash, |
|---|
| 288 | | - skb, doff); |
|---|
| 289 | | - if (result) |
|---|
| 290 | | - return result; |
|---|
| 291 | | - } |
|---|
| 293 | + result = lookup_reuseport(net, sk, skb, doff, |
|---|
| 294 | + saddr, sport, daddr, hnum); |
|---|
| 295 | + if (result) |
|---|
| 296 | + return result; |
|---|
| 297 | + |
|---|
| 292 | 298 | result = sk; |
|---|
| 293 | 299 | hiscore = score; |
|---|
| 294 | 300 | } |
|---|
| 295 | 301 | } |
|---|
| 296 | 302 | |
|---|
| 297 | 303 | return result; |
|---|
| 304 | +} |
|---|
| 305 | + |
|---|
| 306 | +static inline struct sock *inet_lookup_run_bpf(struct net *net, |
|---|
| 307 | + struct inet_hashinfo *hashinfo, |
|---|
| 308 | + struct sk_buff *skb, int doff, |
|---|
| 309 | + __be32 saddr, __be16 sport, |
|---|
| 310 | + __be32 daddr, u16 hnum) |
|---|
| 311 | +{ |
|---|
| 312 | + struct sock *sk, *reuse_sk; |
|---|
| 313 | + bool no_reuseport; |
|---|
| 314 | + |
|---|
| 315 | + if (hashinfo != &tcp_hashinfo) |
|---|
| 316 | + return NULL; /* only TCP is supported */ |
|---|
| 317 | + |
|---|
| 318 | + no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, |
|---|
| 319 | + saddr, sport, daddr, hnum, &sk); |
|---|
| 320 | + if (no_reuseport || IS_ERR_OR_NULL(sk)) |
|---|
| 321 | + return sk; |
|---|
| 322 | + |
|---|
| 323 | + reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); |
|---|
| 324 | + if (reuse_sk) |
|---|
| 325 | + sk = reuse_sk; |
|---|
| 326 | + return sk; |
|---|
| 298 | 327 | } |
|---|
| 299 | 328 | |
|---|
| 300 | 329 | struct sock *__inet_lookup_listener(struct net *net, |
|---|
| .. | .. |
|---|
| 304 | 333 | const __be32 daddr, const unsigned short hnum, |
|---|
| 305 | 334 | const int dif, const int sdif) |
|---|
| 306 | 335 | { |
|---|
| 307 | | - unsigned int hash = inet_lhashfn(net, hnum); |
|---|
| 308 | | - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; |
|---|
| 309 | | - bool exact_dif = inet_exact_dif_match(net, skb); |
|---|
| 310 | 336 | struct inet_listen_hashbucket *ilb2; |
|---|
| 311 | | - struct sock *sk, *result = NULL; |
|---|
| 312 | | - struct hlist_nulls_node *node; |
|---|
| 313 | | - int score, hiscore = 0; |
|---|
| 337 | + struct sock *result = NULL; |
|---|
| 314 | 338 | unsigned int hash2; |
|---|
| 315 | | - u32 phash = 0; |
|---|
| 316 | 339 | |
|---|
| 317 | | - if (ilb->count <= 10 || !hashinfo->lhash2) |
|---|
| 318 | | - goto port_lookup; |
|---|
| 319 | | - |
|---|
| 320 | | - /* Too many sk in the ilb bucket (which is hashed by port alone). |
|---|
| 321 | | - * Try lhash2 (which is hashed by port and addr) instead. |
|---|
| 322 | | - */ |
|---|
| 340 | + /* Lookup redirect from BPF */ |
|---|
| 341 | + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { |
|---|
| 342 | + result = inet_lookup_run_bpf(net, hashinfo, skb, doff, |
|---|
| 343 | + saddr, sport, daddr, hnum); |
|---|
| 344 | + if (result) |
|---|
| 345 | + goto done; |
|---|
| 346 | + } |
|---|
| 323 | 347 | |
|---|
| 324 | 348 | hash2 = ipv4_portaddr_hash(net, daddr, hnum); |
|---|
| 325 | 349 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
|---|
| 326 | | - if (ilb2->count > ilb->count) |
|---|
| 327 | | - goto port_lookup; |
|---|
| 328 | 350 | |
|---|
| 329 | 351 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
|---|
| 330 | 352 | saddr, sport, daddr, hnum, |
|---|
| .. | .. |
|---|
| 333 | 355 | goto done; |
|---|
| 334 | 356 | |
|---|
| 335 | 357 | /* Lookup lhash2 with INADDR_ANY */ |
|---|
| 336 | | - |
|---|
| 337 | 358 | hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); |
|---|
| 338 | 359 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
|---|
| 339 | | - if (ilb2->count > ilb->count) |
|---|
| 340 | | - goto port_lookup; |
|---|
| 341 | 360 | |
|---|
| 342 | 361 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
|---|
| 343 | | - saddr, sport, daddr, hnum, |
|---|
| 362 | + saddr, sport, htonl(INADDR_ANY), hnum, |
|---|
| 344 | 363 | dif, sdif); |
|---|
| 345 | | - goto done; |
|---|
| 346 | | - |
|---|
| 347 | | -port_lookup: |
|---|
| 348 | | - sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) { |
|---|
| 349 | | - score = compute_score(sk, net, hnum, daddr, |
|---|
| 350 | | - dif, sdif, exact_dif); |
|---|
| 351 | | - if (score > hiscore) { |
|---|
| 352 | | - if (sk->sk_reuseport) { |
|---|
| 353 | | - phash = inet_ehashfn(net, daddr, hnum, |
|---|
| 354 | | - saddr, sport); |
|---|
| 355 | | - result = reuseport_select_sock(sk, phash, |
|---|
| 356 | | - skb, doff); |
|---|
| 357 | | - if (result) |
|---|
| 358 | | - goto done; |
|---|
| 359 | | - } |
|---|
| 360 | | - result = sk; |
|---|
| 361 | | - hiscore = score; |
|---|
| 362 | | - } |
|---|
| 363 | | - } |
|---|
| 364 | 364 | done: |
|---|
| 365 | | - if (unlikely(IS_ERR(result))) |
|---|
| 365 | + if (IS_ERR(result)) |
|---|
| 366 | 366 | return NULL; |
|---|
| 367 | 367 | return result; |
|---|
| 368 | 368 | } |
|---|
| .. | .. |
|---|
| 410 | 410 | sk_nulls_for_each_rcu(sk, node, &head->chain) { |
|---|
| 411 | 411 | if (sk->sk_hash != hash) |
|---|
| 412 | 412 | continue; |
|---|
| 413 | | - if (likely(INET_MATCH(sk, net, acookie, |
|---|
| 414 | | - saddr, daddr, ports, dif, sdif))) { |
|---|
| 413 | + if (likely(INET_MATCH(net, sk, acookie, ports, dif, sdif))) { |
|---|
| 415 | 414 | if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) |
|---|
| 416 | 415 | goto out; |
|---|
| 417 | | - if (unlikely(!INET_MATCH(sk, net, acookie, |
|---|
| 418 | | - saddr, daddr, ports, |
|---|
| 419 | | - dif, sdif))) { |
|---|
| 416 | + if (unlikely(!INET_MATCH(net, sk, acookie, |
|---|
| 417 | + ports, dif, sdif))) { |
|---|
| 420 | 418 | sock_gen_put(sk); |
|---|
| 421 | 419 | goto begin; |
|---|
| 422 | 420 | } |
|---|
| .. | .. |
|---|
| 465 | 463 | if (sk2->sk_hash != hash) |
|---|
| 466 | 464 | continue; |
|---|
| 467 | 465 | |
|---|
| 468 | | - if (likely(INET_MATCH(sk2, net, acookie, |
|---|
| 469 | | - saddr, daddr, ports, dif, sdif))) { |
|---|
| 466 | + if (likely(INET_MATCH(net, sk2, acookie, ports, dif, sdif))) { |
|---|
| 470 | 467 | if (sk2->sk_state == TCP_TIME_WAIT) { |
|---|
| 471 | 468 | tw = inet_twsk(sk2); |
|---|
| 472 | 469 | if (twsk_unique(sk, sk2, twp)) |
|---|
| .. | .. |
|---|
| 504 | 501 | return -EADDRNOTAVAIL; |
|---|
| 505 | 502 | } |
|---|
| 506 | 503 | |
|---|
| 507 | | -static u32 inet_sk_port_offset(const struct sock *sk) |
|---|
| 504 | +static u64 inet_sk_port_offset(const struct sock *sk) |
|---|
| 508 | 505 | { |
|---|
| 509 | 506 | const struct inet_sock *inet = inet_sk(sk); |
|---|
| 510 | 507 | |
|---|
| .. | .. |
|---|
| 513 | 510 | inet->inet_dport); |
|---|
| 514 | 511 | } |
|---|
| 515 | 512 | |
|---|
| 516 | | -/* insert a socket into ehash, and eventually remove another one |
|---|
| 517 | | - * (The another one can be a SYN_RECV or TIMEWAIT |
|---|
| 513 | +/* Searches for an existing socket in the ehash bucket list. |
|---|
| 514 | + * Returns true if found, false otherwise. |
|---|
| 518 | 515 | */ |
|---|
| 519 | | -bool inet_ehash_insert(struct sock *sk, struct sock *osk) |
|---|
| 516 | +static bool inet_ehash_lookup_by_sk(struct sock *sk, |
|---|
| 517 | + struct hlist_nulls_head *list) |
|---|
| 518 | +{ |
|---|
| 519 | + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); |
|---|
| 520 | + const int sdif = sk->sk_bound_dev_if; |
|---|
| 521 | + const int dif = sk->sk_bound_dev_if; |
|---|
| 522 | + const struct hlist_nulls_node *node; |
|---|
| 523 | + struct net *net = sock_net(sk); |
|---|
| 524 | + struct sock *esk; |
|---|
| 525 | + |
|---|
| 526 | + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); |
|---|
| 527 | + |
|---|
| 528 | + sk_nulls_for_each_rcu(esk, node, list) { |
|---|
| 529 | + if (esk->sk_hash != sk->sk_hash) |
|---|
| 530 | + continue; |
|---|
| 531 | + if (sk->sk_family == AF_INET) { |
|---|
| 532 | + if (unlikely(INET_MATCH(net, esk, acookie, |
|---|
| 533 | + ports, dif, sdif))) { |
|---|
| 534 | + return true; |
|---|
| 535 | + } |
|---|
| 536 | + } |
|---|
| 537 | +#if IS_ENABLED(CONFIG_IPV6) |
|---|
| 538 | + else if (sk->sk_family == AF_INET6) { |
|---|
| 539 | + if (unlikely(inet6_match(net, esk, |
|---|
| 540 | + &sk->sk_v6_daddr, |
|---|
| 541 | + &sk->sk_v6_rcv_saddr, |
|---|
| 542 | + ports, dif, sdif))) { |
|---|
| 543 | + return true; |
|---|
| 544 | + } |
|---|
| 545 | + } |
|---|
| 546 | +#endif |
|---|
| 547 | + } |
|---|
| 548 | + return false; |
|---|
| 549 | +} |
|---|
| 550 | + |
|---|
| 551 | +/* Insert a socket into ehash, and eventually remove another one |
|---|
| 552 | + * (The other one can be a SYN_RECV or TIMEWAIT) |
|---|
| 553 | + * If a matching socket already exists, socket sk is not inserted, |
|---|
| 554 | + * and the found_dup_sk parameter is set to true. |
|---|
| 555 | + */ |
|---|
| 556 | +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) |
|---|
| 520 | 557 | { |
|---|
| 521 | 558 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
|---|
| 522 | 559 | struct hlist_nulls_head *list; |
|---|
| .. | .. |
|---|
| 535 | 572 | if (osk) { |
|---|
| 536 | 573 | WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); |
|---|
| 537 | 574 | ret = sk_nulls_del_node_init_rcu(osk); |
|---|
| 575 | + } else if (found_dup_sk) { |
|---|
| 576 | + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); |
|---|
| 577 | + if (*found_dup_sk) |
|---|
| 578 | + ret = false; |
|---|
| 538 | 579 | } |
|---|
| 580 | + |
|---|
| 539 | 581 | if (ret) |
|---|
| 540 | 582 | __sk_nulls_add_node_rcu(sk, list); |
|---|
| 583 | + |
|---|
| 541 | 584 | spin_unlock(lock); |
|---|
| 585 | + |
|---|
| 542 | 586 | return ret; |
|---|
| 543 | 587 | } |
|---|
| 544 | 588 | |
|---|
| 545 | | -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) |
|---|
| 589 | +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) |
|---|
| 546 | 590 | { |
|---|
| 547 | | - bool ok = inet_ehash_insert(sk, osk); |
|---|
| 591 | + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); |
|---|
| 548 | 592 | |
|---|
| 549 | 593 | if (ok) { |
|---|
| 550 | 594 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
|---|
| .. | .. |
|---|
| 588 | 632 | int err = 0; |
|---|
| 589 | 633 | |
|---|
| 590 | 634 | if (sk->sk_state != TCP_LISTEN) { |
|---|
| 591 | | - inet_ehash_nolisten(sk, osk); |
|---|
| 635 | + local_bh_disable(); |
|---|
| 636 | + inet_ehash_nolisten(sk, osk, NULL); |
|---|
| 637 | + local_bh_enable(); |
|---|
| 592 | 638 | return 0; |
|---|
| 593 | 639 | } |
|---|
| 594 | 640 | WARN_ON(!sk_unhashed(sk)); |
|---|
| .. | .. |
|---|
| 620 | 666 | { |
|---|
| 621 | 667 | int err = 0; |
|---|
| 622 | 668 | |
|---|
| 623 | | - if (sk->sk_state != TCP_CLOSE) { |
|---|
| 624 | | - local_bh_disable(); |
|---|
| 669 | + if (sk->sk_state != TCP_CLOSE) |
|---|
| 625 | 670 | err = __inet_hash(sk, NULL); |
|---|
| 626 | | - local_bh_enable(); |
|---|
| 627 | | - } |
|---|
| 628 | 671 | |
|---|
| 629 | 672 | return err; |
|---|
| 630 | 673 | } |
|---|
| 631 | 674 | EXPORT_SYMBOL_GPL(inet_hash); |
|---|
| 632 | 675 | |
|---|
| 633 | | -void inet_unhash(struct sock *sk) |
|---|
| 676 | +static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) |
|---|
| 634 | 677 | { |
|---|
| 635 | | - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
|---|
| 636 | | - struct inet_listen_hashbucket *ilb = NULL; |
|---|
| 637 | | - spinlock_t *lock; |
|---|
| 638 | | - |
|---|
| 639 | 678 | if (sk_unhashed(sk)) |
|---|
| 640 | 679 | return; |
|---|
| 641 | | - |
|---|
| 642 | | - if (sk->sk_state == TCP_LISTEN) { |
|---|
| 643 | | - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
|---|
| 644 | | - lock = &ilb->lock; |
|---|
| 645 | | - } else { |
|---|
| 646 | | - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
|---|
| 647 | | - } |
|---|
| 648 | | - spin_lock_bh(lock); |
|---|
| 649 | | - if (sk_unhashed(sk)) |
|---|
| 650 | | - goto unlock; |
|---|
| 651 | 680 | |
|---|
| 652 | 681 | if (rcu_access_pointer(sk->sk_reuseport_cb)) |
|---|
| 653 | 682 | reuseport_detach_sock(sk); |
|---|
| 654 | 683 | if (ilb) { |
|---|
| 684 | + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
|---|
| 685 | + |
|---|
| 655 | 686 | inet_unhash2(hashinfo, sk); |
|---|
| 656 | 687 | ilb->count--; |
|---|
| 657 | 688 | } |
|---|
| 658 | 689 | __sk_nulls_del_node_init_rcu(sk); |
|---|
| 659 | 690 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
|---|
| 660 | | -unlock: |
|---|
| 661 | | - spin_unlock_bh(lock); |
|---|
| 691 | +} |
|---|
| 692 | + |
|---|
| 693 | +void inet_unhash(struct sock *sk) |
|---|
| 694 | +{ |
|---|
| 695 | + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
|---|
| 696 | + |
|---|
| 697 | + if (sk_unhashed(sk)) |
|---|
| 698 | + return; |
|---|
| 699 | + |
|---|
| 700 | + if (sk->sk_state == TCP_LISTEN) { |
|---|
| 701 | + struct inet_listen_hashbucket *ilb; |
|---|
| 702 | + |
|---|
| 703 | + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
|---|
| 704 | + /* Don't disable bottom halves while acquiring the lock to |
|---|
| 705 | + * avoid circular locking dependency on PREEMPT_RT. |
|---|
| 706 | + */ |
|---|
| 707 | + spin_lock(&ilb->lock); |
|---|
| 708 | + __inet_unhash(sk, ilb); |
|---|
| 709 | + spin_unlock(&ilb->lock); |
|---|
| 710 | + } else { |
|---|
| 711 | + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
|---|
| 712 | + |
|---|
| 713 | + spin_lock_bh(lock); |
|---|
| 714 | + __inet_unhash(sk, NULL); |
|---|
| 715 | + spin_unlock_bh(lock); |
|---|
| 716 | + } |
|---|
| 662 | 717 | } |
|---|
| 663 | 718 | EXPORT_SYMBOL_GPL(inet_unhash); |
|---|
| 664 | 719 | |
|---|
| 720 | +/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm |
|---|
| 721 | + * Note that we use 32bit integers (vs RFC 'short integers') |
|---|
| 722 | + * because 2^16 is not a multiple of num_ephemeral and this |
|---|
| 723 | + * property might be used by a clever attacker. |
|---|
| 724 | + * |
|---|
| 725 | + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though |
|---|
| 726 | + * attacks were since demonstrated, thus we use 65536 by default instead |
|---|
| 727 | + * to really give more isolation and privacy, at the expense of 256kB |
|---|
| 728 | + * of kernel memory. |
|---|
| 729 | + */ |
|---|
| 730 | +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) |
|---|
| 731 | +static u32 *table_perturb; |
|---|
| 732 | + |
|---|
| 665 | 733 | int __inet_hash_connect(struct inet_timewait_death_row *death_row, |
|---|
| 666 | | - struct sock *sk, u32 port_offset, |
|---|
| 734 | + struct sock *sk, u64 port_offset, |
|---|
| 667 | 735 | int (*check_established)(struct inet_timewait_death_row *, |
|---|
| 668 | 736 | struct sock *, __u16, struct inet_timewait_sock **)) |
|---|
| 669 | 737 | { |
|---|
| .. | .. |
|---|
| 675 | 743 | struct inet_bind_bucket *tb; |
|---|
| 676 | 744 | u32 remaining, offset; |
|---|
| 677 | 745 | int ret, i, low, high; |
|---|
| 678 | | - static u32 hint; |
|---|
| 746 | + int l3mdev; |
|---|
| 747 | + u32 index; |
|---|
| 679 | 748 | |
|---|
| 680 | 749 | if (port) { |
|---|
| 681 | 750 | head = &hinfo->bhash[inet_bhashfn(net, port, |
|---|
| .. | .. |
|---|
| 683 | 752 | tb = inet_csk(sk)->icsk_bind_hash; |
|---|
| 684 | 753 | spin_lock_bh(&head->lock); |
|---|
| 685 | 754 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
|---|
| 686 | | - inet_ehash_nolisten(sk, NULL); |
|---|
| 755 | + inet_ehash_nolisten(sk, NULL, NULL); |
|---|
| 687 | 756 | spin_unlock_bh(&head->lock); |
|---|
| 688 | 757 | return 0; |
|---|
| 689 | 758 | } |
|---|
| .. | .. |
|---|
| 694 | 763 | return ret; |
|---|
| 695 | 764 | } |
|---|
| 696 | 765 | |
|---|
| 766 | + l3mdev = inet_sk_bound_l3mdev(sk); |
|---|
| 767 | + |
|---|
| 697 | 768 | inet_get_local_port_range(net, &low, &high); |
|---|
| 698 | 769 | high++; /* [32768, 60999] -> [32768, 61000[ */ |
|---|
| 699 | 770 | remaining = high - low; |
|---|
| 700 | 771 | if (likely(remaining > 1)) |
|---|
| 701 | 772 | remaining &= ~1U; |
|---|
| 702 | 773 | |
|---|
| 703 | | - offset = (hint + port_offset) % remaining; |
|---|
| 774 | + get_random_slow_once(table_perturb, |
|---|
| 775 | + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); |
|---|
| 776 | + index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); |
|---|
| 777 | + |
|---|
| 778 | + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); |
|---|
| 779 | + offset %= remaining; |
|---|
| 780 | + |
|---|
| 704 | 781 | /* In first pass we try ports of @low parity. |
|---|
| 705 | 782 | * inet_csk_get_port() does the opposite choice. |
|---|
| 706 | 783 | */ |
|---|
| .. | .. |
|---|
| 720 | 797 | * the established check is already unique enough. |
|---|
| 721 | 798 | */ |
|---|
| 722 | 799 | inet_bind_bucket_for_each(tb, &head->chain) { |
|---|
| 723 | | - if (net_eq(ib_net(tb), net) && tb->port == port) { |
|---|
| 800 | + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && |
|---|
| 801 | + tb->port == port) { |
|---|
| 724 | 802 | if (tb->fastreuse >= 0 || |
|---|
| 725 | 803 | tb->fastreuseport >= 0) |
|---|
| 726 | 804 | goto next_port; |
|---|
| .. | .. |
|---|
| 733 | 811 | } |
|---|
| 734 | 812 | |
|---|
| 735 | 813 | tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, |
|---|
| 736 | | - net, head, port); |
|---|
| 814 | + net, head, port, l3mdev); |
|---|
| 737 | 815 | if (!tb) { |
|---|
| 738 | 816 | spin_unlock_bh(&head->lock); |
|---|
| 739 | 817 | return -ENOMEM; |
|---|
| .. | .. |
|---|
| 753 | 831 | return -EADDRNOTAVAIL; |
|---|
| 754 | 832 | |
|---|
| 755 | 833 | ok: |
|---|
| 756 | | - hint += i + 2; |
|---|
| 834 | + /* Here we want to add a little bit of randomness to the next source |
|---|
| 835 | + * port that will be chosen. We use a max() with a random here so that |
|---|
| 836 | + * on low contention the randomness is maximal and on high contention |
|---|
| 837 | + * it may be inexistent. |
|---|
| 838 | + */ |
|---|
| 839 | + i = max_t(int, i, (prandom_u32() & 7) * 2); |
|---|
| 840 | + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); |
|---|
| 757 | 841 | |
|---|
| 758 | 842 | /* Head lock still held and bh's disabled */ |
|---|
| 759 | 843 | inet_bind_hash(sk, tb, port); |
|---|
| 760 | 844 | if (sk_unhashed(sk)) { |
|---|
| 761 | 845 | inet_sk(sk)->inet_sport = htons(port); |
|---|
| 762 | | - inet_ehash_nolisten(sk, (struct sock *)tw); |
|---|
| 846 | + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); |
|---|
| 763 | 847 | } |
|---|
| 764 | 848 | if (tw) |
|---|
| 765 | 849 | inet_twsk_bind_unhash(tw, hinfo); |
|---|
| .. | .. |
|---|
| 776 | 860 | int inet_hash_connect(struct inet_timewait_death_row *death_row, |
|---|
| 777 | 861 | struct sock *sk) |
|---|
| 778 | 862 | { |
|---|
| 779 | | - u32 port_offset = 0; |
|---|
| 863 | + u64 port_offset = 0; |
|---|
| 780 | 864 | |
|---|
| 781 | 865 | if (!inet_sk(sk)->inet_num) |
|---|
| 782 | 866 | port_offset = inet_sk_port_offset(sk); |
|---|
| .. | .. |
|---|
| 800 | 884 | } |
|---|
| 801 | 885 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); |
|---|
| 802 | 886 | |
|---|
| 887 | +static void init_hashinfo_lhash2(struct inet_hashinfo *h) |
|---|
| 888 | +{ |
|---|
| 889 | + int i; |
|---|
| 890 | + |
|---|
| 891 | + for (i = 0; i <= h->lhash2_mask; i++) { |
|---|
| 892 | + spin_lock_init(&h->lhash2[i].lock); |
|---|
| 893 | + INIT_HLIST_HEAD(&h->lhash2[i].head); |
|---|
| 894 | + h->lhash2[i].count = 0; |
|---|
| 895 | + } |
|---|
| 896 | +} |
|---|
| 897 | + |
|---|
| 803 | 898 | void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, |
|---|
| 804 | 899 | unsigned long numentries, int scale, |
|---|
| 805 | 900 | unsigned long low_limit, |
|---|
| 806 | 901 | unsigned long high_limit) |
|---|
| 807 | 902 | { |
|---|
| 808 | | - unsigned int i; |
|---|
| 809 | | - |
|---|
| 810 | 903 | h->lhash2 = alloc_large_system_hash(name, |
|---|
| 811 | 904 | sizeof(*h->lhash2), |
|---|
| 812 | 905 | numentries, |
|---|
| .. | .. |
|---|
| 816 | 909 | &h->lhash2_mask, |
|---|
| 817 | 910 | low_limit, |
|---|
| 818 | 911 | high_limit); |
|---|
| 912 | + init_hashinfo_lhash2(h); |
|---|
| 819 | 913 | |
|---|
| 820 | | - for (i = 0; i <= h->lhash2_mask; i++) { |
|---|
| 821 | | - spin_lock_init(&h->lhash2[i].lock); |
|---|
| 822 | | - INIT_HLIST_HEAD(&h->lhash2[i].head); |
|---|
| 823 | | - h->lhash2[i].count = 0; |
|---|
| 824 | | - } |
|---|
| 914 | + /* this one is used for source ports of outgoing connections */ |
|---|
| 915 | + table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, |
|---|
| 916 | + sizeof(*table_perturb), GFP_KERNEL); |
|---|
| 917 | + if (!table_perturb) |
|---|
| 918 | + panic("TCP: failed to alloc table_perturb"); |
|---|
| 825 | 919 | } |
|---|
| 826 | 920 | |
|---|
| 921 | +int inet_hashinfo2_init_mod(struct inet_hashinfo *h) |
|---|
| 922 | +{ |
|---|
| 923 | + h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); |
|---|
| 924 | + if (!h->lhash2) |
|---|
| 925 | + return -ENOMEM; |
|---|
| 926 | + |
|---|
| 927 | + h->lhash2_mask = INET_LHTABLE_SIZE - 1; |
|---|
| 928 | + /* INET_LHTABLE_SIZE must be a power of 2 */ |
|---|
| 929 | + BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); |
|---|
| 930 | + |
|---|
| 931 | + init_hashinfo_lhash2(h); |
|---|
| 932 | + return 0; |
|---|
| 933 | +} |
|---|
| 934 | +EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); |
|---|
| 935 | + |
|---|
| 827 | 936 | int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) |
|---|
| 828 | 937 | { |
|---|
| 829 | 938 | unsigned int locksz = sizeof(spinlock_t); |
|---|