hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/net/xfrm/xfrm_policy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * xfrm_policy.c
  *
@@ -26,12 +27,26 @@
 #include <linux/cache.h>
 #include <linux/cpu.h>
 #include <linux/audit.h>
+#include <linux/rhashtable.h>
+#include <linux/if_tunnel.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#ifndef __GENKSYMS__
+#include <net/inet_ecn.h>
+#endif
 #include <net/xfrm.h>
 #include <net/ip.h>
+#ifndef __GENKSYMS__
+#include <net/gre.h>
+#endif
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/mip6.h>
+#endif
 #ifdef CONFIG_XFRM_STATISTICS
 #include <net/snmp.h>
+#endif
+#ifdef CONFIG_XFRM_ESPINTCP
+#include <net/espintcp.h>
 #endif
 
 #include "xfrm_hash.h"
@@ -45,6 +60,99 @@
 	u8 flags;
 };
 
+/* prefixes smaller than this are stored in lists, not trees. */
+#define INEXACT_PREFIXLEN_IPV4	16
+#define INEXACT_PREFIXLEN_IPV6	48
+
+struct xfrm_pol_inexact_node {
+	struct rb_node node;
+	union {
+		xfrm_address_t addr;
+		struct rcu_head rcu;
+	};
+	u8 prefixlen;
+
+	struct rb_root root;
+
+	/* the policies matching this node, can be empty list */
+	struct hlist_head hhead;
+};
+
+/* xfrm inexact policy search tree:
+ * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
+ *  |
+ * +---- root_d: sorted by daddr:prefix
+ * |                 |
+ * |        xfrm_pol_inexact_node
+ * |                 |
+ * |                 +- root: sorted by saddr/prefix
+ * |                 |              |
+ * |                 |         xfrm_pol_inexact_node
+ * |                 |              |
+ * |                 |              + root: unused
+ * |                 |              |
+ * |                 |              + hhead: saddr:daddr policies
+ * |                 |
+ * |                 +- coarse policies and all any:daddr policies
+ * |
+ * +---- root_s: sorted by saddr:prefix
+ * |                 |
+ * |        xfrm_pol_inexact_node
+ * |                 |
+ * |                 + root: unused
+ * |                 |
+ * |                 + hhead: saddr:any policies
+ * |
+ * +---- coarse policies and all any:any policies
+ *
+ * Lookups return four candidate lists:
+ * 1. any:any list from top-level xfrm_pol_inexact_bin
+ * 2. any:daddr list from daddr tree
+ * 3. saddr:daddr list from 2nd level daddr tree
+ * 4. saddr:any list from saddr tree
+ *
+ * This result set then needs to be searched for the policy with
+ * the lowest priority.  If two results have same prio, youngest one wins.
+ */
+
+struct xfrm_pol_inexact_key {
+	possible_net_t net;
+	u32 if_id;
+	u16 family;
+	u8 dir, type;
+};
+
+struct xfrm_pol_inexact_bin {
+	struct xfrm_pol_inexact_key k;
+	struct rhash_head head;
+	/* list containing '*:*' policies */
+	struct hlist_head hhead;
+
+	seqcount_spinlock_t count;
+	/* tree sorted by daddr/prefix */
+	struct rb_root root_d;
+
+	/* tree sorted by saddr/prefix */
+	struct rb_root root_s;
+
+	/* slow path below */
+	struct list_head inexact_bins;
+	struct rcu_head rcu;
+};
+
+enum xfrm_pol_inexact_candidate_type {
+	XFRM_POL_CAND_BOTH,
+	XFRM_POL_CAND_SADDR,
+	XFRM_POL_CAND_DADDR,
+	XFRM_POL_CAND_ANY,
+
+	XFRM_POL_CAND_MAX,
+};
+
+struct xfrm_pol_inexact_candidates {
+	struct hlist_head *res[XFRM_POL_CAND_MAX];
+};
+
 static DEFINE_SPINLOCK(xfrm_if_cb_lock);
 static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
 
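As an illustration of the lookup rule described in the comment above (scan the up-to-four candidate lists, keep the policy with the lowest priority, and on a priority tie prefer the entry with the smaller pos value, which the comment equates with the youngest policy), here is a small standalone userspace C sketch. It is not part of the patch; the struct and function names are invented for the example.

/* Illustrative userspace sketch only; models the candidate selection rule. */
#include <stdio.h>
#include <stddef.h>

struct pol {
	unsigned int priority;	/* lower value wins */
	unsigned int pos;	/* tie-breaker, lower value wins */
	const char *name;
};

/* lists: array of NULL-terminated candidate arrays, as in the four lists above */
static const struct pol *pick_best(const struct pol ***lists, int nlists)
{
	const struct pol *best = NULL;

	for (int i = 0; i < nlists; i++) {
		for (const struct pol **p = lists[i]; p && *p; p++) {
			if (!best ||
			    (*p)->priority < best->priority ||
			    ((*p)->priority == best->priority &&
			     (*p)->pos < best->pos))
				best = *p;
		}
	}
	return best;
}

int main(void)
{
	struct pol a = { .priority = 10, .pos = 2, .name = "any:any" };
	struct pol b = { .priority = 10, .pos = 1, .name = "saddr:daddr" };
	struct pol c = { .priority = 20, .pos = 0, .name = "saddr:any" };
	const struct pol *l1[] = { &a, NULL };
	const struct pol *l2[] = { &b, &c, NULL };
	const struct pol **lists[] = { l1, l2 };

	printf("winner: %s\n", pick_best(lists, 2)->name);	/* saddr:daddr */
	return 0;
}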
@@ -53,7 +161,10 @@
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __ro_after_init;
-static __read_mostly seqcount_t xfrm_policy_hash_generation;
+static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation;
+
+static struct rhashtable xfrm_policy_inexact_table;
+static const struct rhashtable_params xfrm_pol_inexact_params;
 
 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
 static int stale_bundle(struct dst_entry *dst);
@@ -63,6 +174,25 @@
 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
+			   u32 if_id);
+
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_lookup_rcu(struct net *net,
+			       u8 type, u16 family, u8 dir, u32 if_id);
+static struct xfrm_policy *
+xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
+			bool excl);
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
+					    struct xfrm_policy *policy);
+
+static bool
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
+				    struct xfrm_pol_inexact_bin *b,
+				    const xfrm_address_t *saddr,
+				    const xfrm_address_t *daddr);
 
 static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
 {
@@ -269,6 +399,7 @@
 	if (policy) {
 		write_pnet(&policy->xp_net, net);
 		INIT_LIST_HEAD(&policy->walk.all);
+		INIT_HLIST_NODE(&policy->bydst_inexact_list);
 		INIT_HLIST_NODE(&policy->bydst);
 		INIT_HLIST_NODE(&policy->byidx);
 		rwlock_init(&policy->lock);
@@ -367,7 +498,7 @@
 	hash = __sel_hash(sel, family, hmask, dbits, sbits);
 
 	if (hash == hmask + 1)
-		return &net->xfrm.policy_inexact[dir];
+		return NULL;
 
 	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
 		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
@@ -461,9 +592,6 @@
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	write_seqcount_begin(&xfrm_policy_hash_generation);
-
-	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
-					 lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 
 	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
 					 lockdep_is_held(&net->xfrm.xfrm_policy_lock));
@@ -565,6 +693,536 @@
 	mutex_unlock(&hash_resize_mutex);
 }
 
+/* Make sure *pol can be inserted into fastbin.
+ * Useful to check that later insert requests will be successful
+ * (provided xfrm_policy_lock is held throughout).
+ */
+static struct xfrm_pol_inexact_bin *
+xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
+{
+	struct xfrm_pol_inexact_bin *bin, *prev;
+	struct xfrm_pol_inexact_key k = {
+		.family = pol->family,
+		.type = pol->type,
+		.dir = dir,
+		.if_id = pol->if_id,
+	};
+	struct net *net = xp_net(pol);
+
+	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
+
+	write_pnet(&k.net, net);
+	bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
+				     xfrm_pol_inexact_params);
+	if (bin)
+		return bin;
+
+	bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
+	if (!bin)
+		return NULL;
+
+	bin->k = k;
+	INIT_HLIST_HEAD(&bin->hhead);
+	bin->root_d = RB_ROOT;
+	bin->root_s = RB_ROOT;
+	seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);
+
+	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
+						&bin->k, &bin->head,
+						xfrm_pol_inexact_params);
+	if (!prev) {
+		list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
+		return bin;
+	}
+
+	kfree(bin);
+
+	return IS_ERR(prev) ? NULL : prev;
+}
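The allocation above follows a get-or-insert shape: look the bin up first, allocate one if it is missing, then let the insert report whether an equal-keyed bin already exists (existing bin returned), the new bin went in (NULL returned), or the insert failed (error pointer). A rough standalone userspace model of that shape, with an invented flat table standing in for the rhashtable, might look like the sketch below; it is illustrative only and not part of the patch.

/* Illustrative userspace sketch only; not the kernel rhashtable API. */
#include <stdio.h>
#include <stdlib.h>

struct bin { int key; };

#define MAX_BINS 16
static struct bin *bins[MAX_BINS];	/* toy stand-in for the hash table */
static int nbins;

static struct bin *find_bin(int key)
{
	for (int i = 0; i < nbins; i++)
		if (bins[i]->key == key)
			return bins[i];
	return NULL;
}

/* Return the bin for @key, creating it if needed; NULL only on failure. */
static struct bin *get_or_alloc_bin(int key)
{
	struct bin *bin, *prev;

	prev = find_bin(key);			/* fast-path lookup */
	if (prev)
		return prev;

	bin = calloc(1, sizeof(*bin));
	if (!bin || nbins == MAX_BINS) {	/* allocation or insert failure */
		free(bin);
		return NULL;
	}
	bin->key = key;

	prev = find_bin(key);			/* insert may find a duplicate */
	if (prev) {
		free(bin);
		return prev;
	}

	bins[nbins++] = bin;
	return bin;
}

int main(void)
{
	printf("same bin reused: %s\n",
	       get_or_alloc_bin(3) == get_or_alloc_bin(3) ? "yes" : "no");
	return 0;
}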
+
+static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
+					       int family, u8 prefixlen)
+{
+	if (xfrm_addr_any(addr, family))
+		return true;
+
+	if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
+		return true;
+
+	if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
+		return true;
+
+	return false;
+}
+
+static bool
+xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
+{
+	const xfrm_address_t *addr;
+	bool saddr_any, daddr_any;
+	u8 prefixlen;
+
+	addr = &policy->selector.saddr;
+	prefixlen = policy->selector.prefixlen_s;
+
+	saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+						       policy->family,
+						       prefixlen);
+	addr = &policy->selector.daddr;
+	prefixlen = policy->selector.prefixlen_d;
+	daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
+						       policy->family,
+						       prefixlen);
+	return saddr_any && daddr_any;
+}
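The two helpers above decide whether a selector address is specific enough to justify a tree node: anything shorter than /16 (IPv4) or /48 (IPv6), or a wildcard address, stays on the plain list. A tiny standalone check of the same thresholds could look like this; it is an illustration only, the helper name is invented, and the wildcard test is simplified to prefixlen == 0.

/* Illustrative userspace sketch only; thresholds mirror the patch above. */
#include <stdbool.h>
#include <stdio.h>

#define INEXACT_PREFIXLEN_IPV4	16
#define INEXACT_PREFIXLEN_IPV6	48

/* family is 4 or 6; true means the address goes on the "any" list */
static bool addr_use_any_list(int family, unsigned int prefixlen)
{
	if (prefixlen == 0)			/* wildcard address */
		return true;
	if (family == 6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
		return true;
	if (family == 4 && prefixlen < INEXACT_PREFIXLEN_IPV4)
		return true;
	return false;
}

int main(void)
{
	printf("10.0.0.0/8     -> %s\n", addr_use_any_list(4, 8)  ? "list" : "tree");
	printf("10.1.0.0/16    -> %s\n", addr_use_any_list(4, 16) ? "list" : "tree");
	printf("2001:db8::/32  -> %s\n", addr_use_any_list(6, 32) ? "list" : "tree");
	printf("2001:db8::/64  -> %s\n", addr_use_any_list(6, 64) ? "list" : "tree");
	return 0;
}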
+
+static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
+				       const xfrm_address_t *addr, u8 prefixlen)
+{
+	node->addr = *addr;
+	node->prefixlen = prefixlen;
+}
+
+static struct xfrm_pol_inexact_node *
+xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
+{
+	struct xfrm_pol_inexact_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_ATOMIC);
+	if (node)
+		xfrm_pol_inexact_node_init(node, addr, prefixlen);
+
+	return node;
+}
+
+static int xfrm_policy_addr_delta(const xfrm_address_t *a,
+				  const xfrm_address_t *b,
+				  u8 prefixlen, u16 family)
+{
+	u32 ma, mb, mask;
+	unsigned int pdw, pbi;
+	int delta = 0;
+
+	switch (family) {
+	case AF_INET:
+		if (prefixlen == 0)
+			return 0;
+		mask = ~0U << (32 - prefixlen);
+		ma = ntohl(a->a4) & mask;
+		mb = ntohl(b->a4) & mask;
+		if (ma < mb)
+			delta = -1;
+		else if (ma > mb)
+			delta = 1;
+		break;
+	case AF_INET6:
+		pdw = prefixlen >> 5;
+		pbi = prefixlen & 0x1f;
+
+		if (pdw) {
+			delta = memcmp(a->a6, b->a6, pdw << 2);
+			if (delta)
+				return delta;
+		}
+		if (pbi) {
+			mask = ~0U << (32 - pbi);
+			ma = ntohl(a->a6[pdw]) & mask;
+			mb = ntohl(b->a6[pdw]) & mask;
+			if (ma < mb)
+				delta = -1;
+			else if (ma > mb)
+				delta = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return delta;
+}
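xfrm_policy_addr_delta() orders addresses by their masked prefix bits, which is what lets all policies sharing a prefix hang off one tree node. A standalone IPv4-only re-implementation, useful for sanity-checking the ordering, might look like the following; it is an illustration, not the kernel function.

/* Illustrative userspace sketch only; IPv4 branch of the comparison above. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* returns <0, 0 or >0 depending on how a and b compare in the first prefixlen bits */
static int addr_delta4(const char *a, const char *b, unsigned int prefixlen)
{
	uint32_t ma, mb, mask;

	if (prefixlen == 0)
		return 0;			/* everything matches a /0 */

	mask = ~0U << (32 - prefixlen);
	ma = ntohl(inet_addr(a)) & mask;
	mb = ntohl(inet_addr(b)) & mask;

	if (ma < mb)
		return -1;
	return ma > mb ? 1 : 0;
}

int main(void)
{
	/* same /24 network -> equal; different networks -> ordered */
	printf("%d\n", addr_delta4("192.0.2.7", "192.0.2.200", 24));	/* 0 */
	printf("%d\n", addr_delta4("192.0.2.7", "192.0.3.7", 24));	/* -1 */
	printf("%d\n", addr_delta4("10.9.0.0", "10.8.0.0", 16));	/* 1 */
	return 0;
}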
+
+static void xfrm_policy_inexact_list_reinsert(struct net *net,
+					      struct xfrm_pol_inexact_node *n,
+					      u16 family)
+{
+	unsigned int matched_s, matched_d;
+	struct xfrm_policy *policy, *p;
+
+	matched_s = 0;
+	matched_d = 0;
+
+	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		struct hlist_node *newpos = NULL;
+		bool matches_s, matches_d;
+
+		if (!policy->bydst_reinsert)
+			continue;
+
+		WARN_ON_ONCE(policy->family != family);
+
+		policy->bydst_reinsert = false;
+		hlist_for_each_entry(p, &n->hhead, bydst) {
+			if (policy->priority > p->priority)
+				newpos = &p->bydst;
+			else if (policy->priority == p->priority &&
+				 policy->pos > p->pos)
+				newpos = &p->bydst;
+			else
+				break;
+		}
+
+		if (newpos)
+			hlist_add_behind_rcu(&policy->bydst, newpos);
+		else
+			hlist_add_head_rcu(&policy->bydst, &n->hhead);
+
+		/* paranoia checks follow.
+		 * Check that the reinserted policy matches at least
+		 * saddr or daddr for current node prefix.
+		 *
+		 * Matching both is fine, matching saddr in one policy
+		 * (but not daddr) and then matching only daddr in another
+		 * is a bug.
+		 */
+		matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
+						   &n->addr,
+						   n->prefixlen,
+						   family) == 0;
+		matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
+						   &n->addr,
+						   n->prefixlen,
+						   family) == 0;
+		if (matches_s && matches_d)
+			continue;
+
+		WARN_ON_ONCE(!matches_s && !matches_d);
+		if (matches_s)
+			matched_s++;
+		if (matches_d)
+			matched_d++;
+		WARN_ON_ONCE(matched_s && matched_d);
+	}
+}
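The reinsert loop above keeps each node's hlist ordered by ascending priority and then by ascending pos: it walks past every entry that should stay in front of the policy being reinserted and links the policy behind the last such entry. Reduced to a singly linked list in userspace, the same insertion rule can be sketched as below; this is illustrative only and the types are invented.

/* Illustrative userspace sketch only; ordering rule used by the reinsert. */
#include <stdio.h>

struct entry {
	unsigned int priority;
	unsigned int pos;
	struct entry *next;
};

/* Keep the list ordered by ascending priority, then ascending pos. */
static void insert_sorted(struct entry **head, struct entry *item)
{
	struct entry **link = head;

	while (*link &&
	       ((*link)->priority < item->priority ||
		((*link)->priority == item->priority && (*link)->pos < item->pos)))
		link = &(*link)->next;

	item->next = *link;
	*link = item;
}

int main(void)
{
	struct entry a = { .priority = 10, .pos = 1 };
	struct entry b = { .priority = 10, .pos = 0 };
	struct entry c = { .priority = 5,  .pos = 7 };
	struct entry *head = NULL;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);

	for (struct entry *e = head; e; e = e->next)
		printf("prio=%u pos=%u\n", e->priority, e->pos);
	/* expected order: 5/7, 10/0, 10/1 */
	return 0;
}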
906
+
907
+static void xfrm_policy_inexact_node_reinsert(struct net *net,
908
+ struct xfrm_pol_inexact_node *n,
909
+ struct rb_root *new,
910
+ u16 family)
911
+{
912
+ struct xfrm_pol_inexact_node *node;
913
+ struct rb_node **p, *parent;
914
+
915
+ /* we should not have another subtree here */
916
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
917
+restart:
918
+ parent = NULL;
919
+ p = &new->rb_node;
920
+ while (*p) {
921
+ u8 prefixlen;
922
+ int delta;
923
+
924
+ parent = *p;
925
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
926
+
927
+ prefixlen = min(node->prefixlen, n->prefixlen);
928
+
929
+ delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
930
+ prefixlen, family);
931
+ if (delta < 0) {
932
+ p = &parent->rb_left;
933
+ } else if (delta > 0) {
934
+ p = &parent->rb_right;
935
+ } else {
936
+ bool same_prefixlen = node->prefixlen == n->prefixlen;
937
+ struct xfrm_policy *tmp;
938
+
939
+ hlist_for_each_entry(tmp, &n->hhead, bydst) {
940
+ tmp->bydst_reinsert = true;
941
+ hlist_del_rcu(&tmp->bydst);
942
+ }
943
+
944
+ node->prefixlen = prefixlen;
945
+
946
+ xfrm_policy_inexact_list_reinsert(net, node, family);
947
+
948
+ if (same_prefixlen) {
949
+ kfree_rcu(n, rcu);
950
+ return;
951
+ }
952
+
953
+ rb_erase(*p, new);
954
+ kfree_rcu(n, rcu);
955
+ n = node;
956
+ goto restart;
957
+ }
958
+ }
959
+
960
+ rb_link_node_rcu(&n->node, parent, p);
961
+ rb_insert_color(&n->node, new);
962
+}
963
+
964
+/* merge nodes v and n */
965
+static void xfrm_policy_inexact_node_merge(struct net *net,
966
+ struct xfrm_pol_inexact_node *v,
967
+ struct xfrm_pol_inexact_node *n,
968
+ u16 family)
969
+{
970
+ struct xfrm_pol_inexact_node *node;
971
+ struct xfrm_policy *tmp;
972
+ struct rb_node *rnode;
973
+
974
+ /* To-be-merged node v has a subtree.
975
+ *
976
+ * Dismantle it and insert its nodes to n->root.
977
+ */
978
+ while ((rnode = rb_first(&v->root)) != NULL) {
979
+ node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
980
+ rb_erase(&node->node, &v->root);
981
+ xfrm_policy_inexact_node_reinsert(net, node, &n->root,
982
+ family);
983
+ }
984
+
985
+ hlist_for_each_entry(tmp, &v->hhead, bydst) {
986
+ tmp->bydst_reinsert = true;
987
+ hlist_del_rcu(&tmp->bydst);
988
+ }
989
+
990
+ xfrm_policy_inexact_list_reinsert(net, n, family);
991
+}
992
+
993
+static struct xfrm_pol_inexact_node *
994
+xfrm_policy_inexact_insert_node(struct net *net,
995
+ struct rb_root *root,
996
+ xfrm_address_t *addr,
997
+ u16 family, u8 prefixlen, u8 dir)
998
+{
999
+ struct xfrm_pol_inexact_node *cached = NULL;
1000
+ struct rb_node **p, *parent = NULL;
1001
+ struct xfrm_pol_inexact_node *node;
1002
+
1003
+ p = &root->rb_node;
1004
+ while (*p) {
1005
+ int delta;
1006
+
1007
+ parent = *p;
1008
+ node = rb_entry(*p, struct xfrm_pol_inexact_node, node);
1009
+
1010
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
1011
+ node->prefixlen,
1012
+ family);
1013
+ if (delta == 0 && prefixlen >= node->prefixlen) {
1014
+ WARN_ON_ONCE(cached); /* ipsec policies got lost */
1015
+ return node;
1016
+ }
1017
+
1018
+ if (delta < 0)
1019
+ p = &parent->rb_left;
1020
+ else
1021
+ p = &parent->rb_right;
1022
+
1023
+ if (prefixlen < node->prefixlen) {
1024
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
1025
+ prefixlen,
1026
+ family);
1027
+ if (delta)
1028
+ continue;
1029
+
1030
+ /* This node is a subnet of the new prefix. It needs
1031
+ * to be removed and re-inserted with the smaller
1032
+ * prefix and all nodes that are now also covered
1033
+ * by the reduced prefixlen.
1034
+ */
1035
+ rb_erase(&node->node, root);
1036
+
1037
+ if (!cached) {
1038
+ xfrm_pol_inexact_node_init(node, addr,
1039
+ prefixlen);
1040
+ cached = node;
1041
+ } else {
1042
+ /* This node also falls within the new
1043
+ * prefixlen. Merge the to-be-reinserted
1044
+ * node and this one.
1045
+ */
1046
+ xfrm_policy_inexact_node_merge(net, node,
1047
+ cached, family);
1048
+ kfree_rcu(node, rcu);
1049
+ }
1050
+
1051
+ /* restart */
1052
+ p = &root->rb_node;
1053
+ parent = NULL;
1054
+ }
1055
+ }
1056
+
1057
+ node = cached;
1058
+ if (!node) {
1059
+ node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
1060
+ if (!node)
1061
+ return NULL;
1062
+ }
1063
+
1064
+ rb_link_node_rcu(&node->node, parent, p);
1065
+ rb_insert_color(&node->node, root);
1066
+
1067
+ return node;
1068
+}
1069
+
1070
+static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
1071
+{
1072
+ struct xfrm_pol_inexact_node *node;
1073
+ struct rb_node *rn = rb_first(r);
1074
+
1075
+ while (rn) {
1076
+ node = rb_entry(rn, struct xfrm_pol_inexact_node, node);
1077
+
1078
+ xfrm_policy_inexact_gc_tree(&node->root, rm);
1079
+ rn = rb_next(rn);
1080
+
1081
+ if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
1082
+ WARN_ON_ONCE(rm);
1083
+ continue;
1084
+ }
1085
+
1086
+ rb_erase(&node->node, r);
1087
+ kfree_rcu(node, rcu);
1088
+ }
1089
+}
1090
+
1091
+static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
1092
+{
1093
+ write_seqcount_begin(&b->count);
1094
+ xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
1095
+ xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
1096
+ write_seqcount_end(&b->count);
1097
+
1098
+ if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
1099
+ !hlist_empty(&b->hhead)) {
1100
+ WARN_ON_ONCE(net_exit);
1101
+ return;
1102
+ }
1103
+
1104
+ if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
1105
+ xfrm_pol_inexact_params) == 0) {
1106
+ list_del(&b->inexact_bins);
1107
+ kfree_rcu(b, rcu);
1108
+ }
1109
+}
1110
+
1111
+static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
1112
+{
1113
+ struct net *net = read_pnet(&b->k.net);
1114
+
1115
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1116
+ __xfrm_policy_inexact_prune_bin(b, false);
1117
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1118
+}
1119
+
1120
+static void __xfrm_policy_inexact_flush(struct net *net)
1121
+{
1122
+ struct xfrm_pol_inexact_bin *bin, *t;
1123
+
1124
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
1125
+
1126
+ list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
1127
+ __xfrm_policy_inexact_prune_bin(bin, false);
1128
+}
1129
+
1130
+static struct hlist_head *
1131
+xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
1132
+ struct xfrm_policy *policy, u8 dir)
1133
+{
1134
+ struct xfrm_pol_inexact_node *n;
1135
+ struct net *net;
1136
+
1137
+ net = xp_net(policy);
1138
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
1139
+
1140
+ if (xfrm_policy_inexact_insert_use_any_list(policy))
1141
+ return &bin->hhead;
1142
+
1143
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
1144
+ policy->family,
1145
+ policy->selector.prefixlen_d)) {
1146
+ write_seqcount_begin(&bin->count);
1147
+ n = xfrm_policy_inexact_insert_node(net,
1148
+ &bin->root_s,
1149
+ &policy->selector.saddr,
1150
+ policy->family,
1151
+ policy->selector.prefixlen_s,
1152
+ dir);
1153
+ write_seqcount_end(&bin->count);
1154
+ if (!n)
1155
+ return NULL;
1156
+
1157
+ return &n->hhead;
1158
+ }
1159
+
1160
+ /* daddr is fixed */
1161
+ write_seqcount_begin(&bin->count);
1162
+ n = xfrm_policy_inexact_insert_node(net,
1163
+ &bin->root_d,
1164
+ &policy->selector.daddr,
1165
+ policy->family,
1166
+ policy->selector.prefixlen_d, dir);
1167
+ write_seqcount_end(&bin->count);
1168
+ if (!n)
1169
+ return NULL;
1170
+
1171
+ /* saddr is wildcard */
1172
+ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
1173
+ policy->family,
1174
+ policy->selector.prefixlen_s))
1175
+ return &n->hhead;
1176
+
1177
+ write_seqcount_begin(&bin->count);
1178
+ n = xfrm_policy_inexact_insert_node(net,
1179
+ &n->root,
1180
+ &policy->selector.saddr,
1181
+ policy->family,
1182
+ policy->selector.prefixlen_s, dir);
1183
+ write_seqcount_end(&bin->count);
1184
+ if (!n)
1185
+ return NULL;
1186
+
1187
+ return &n->hhead;
1188
+}
1189
+
1190
+static struct xfrm_policy *
1191
+xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
1192
+{
1193
+ struct xfrm_pol_inexact_bin *bin;
1194
+ struct xfrm_policy *delpol;
1195
+ struct hlist_head *chain;
1196
+ struct net *net;
1197
+
1198
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
1199
+ if (!bin)
1200
+ return ERR_PTR(-ENOMEM);
1201
+
1202
+ net = xp_net(policy);
1203
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
1204
+
1205
+ chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
1206
+ if (!chain) {
1207
+ __xfrm_policy_inexact_prune_bin(bin, false);
1208
+ return ERR_PTR(-ENOMEM);
1209
+ }
1210
+
1211
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
1212
+ if (delpol && excl) {
1213
+ __xfrm_policy_inexact_prune_bin(bin, false);
1214
+ return ERR_PTR(-EEXIST);
1215
+ }
1216
+
1217
+ chain = &net->xfrm.policy_inexact[dir];
1218
+ xfrm_policy_insert_inexact_list(chain, policy);
1219
+
1220
+ if (delpol)
1221
+ __xfrm_policy_inexact_prune_bin(bin, false);
1222
+
1223
+ return delpol;
1224
+}
1225
+
5681226 static void xfrm_hash_rebuild(struct work_struct *work)
5691227 {
5701228 struct net *net = container_of(work, struct net,
....@@ -593,14 +1251,66 @@
5931251 } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
5941252
5951253 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1254
+ write_seqcount_begin(&xfrm_policy_hash_generation);
1255
+
1256
+ /* make sure that we can insert the indirect policies again before
1257
+ * we start with destructive action.
1258
+ */
1259
+ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
1260
+ struct xfrm_pol_inexact_bin *bin;
1261
+ u8 dbits, sbits;
1262
+
1263
+ dir = xfrm_policy_id2dir(policy->index);
1264
+ if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
1265
+ continue;
1266
+
1267
+ if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
1268
+ if (policy->family == AF_INET) {
1269
+ dbits = rbits4;
1270
+ sbits = lbits4;
1271
+ } else {
1272
+ dbits = rbits6;
1273
+ sbits = lbits6;
1274
+ }
1275
+ } else {
1276
+ if (policy->family == AF_INET) {
1277
+ dbits = lbits4;
1278
+ sbits = rbits4;
1279
+ } else {
1280
+ dbits = lbits6;
1281
+ sbits = rbits6;
1282
+ }
1283
+ }
1284
+
1285
+ if (policy->selector.prefixlen_d < dbits ||
1286
+ policy->selector.prefixlen_s < sbits)
1287
+ continue;
1288
+
1289
+ bin = xfrm_policy_inexact_alloc_bin(policy, dir);
1290
+ if (!bin)
1291
+ goto out_unlock;
1292
+
1293
+ if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
1294
+ goto out_unlock;
1295
+ }
5961296
5971297 /* reset the bydst and inexact table in all directions */
5981298 for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
599
- INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
1299
+ struct hlist_node *n;
1300
+
1301
+ hlist_for_each_entry_safe(policy, n,
1302
+ &net->xfrm.policy_inexact[dir],
1303
+ bydst_inexact_list) {
1304
+ hlist_del_rcu(&policy->bydst);
1305
+ hlist_del_init(&policy->bydst_inexact_list);
1306
+ }
1307
+
6001308 hmask = net->xfrm.policy_bydst[dir].hmask;
6011309 odst = net->xfrm.policy_bydst[dir].table;
602
- for (i = hmask; i >= 0; i--)
603
- INIT_HLIST_HEAD(odst + i);
1310
+ for (i = hmask; i >= 0; i--) {
1311
+ hlist_for_each_entry_safe(policy, n, odst + i, bydst)
1312
+ hlist_del_rcu(&policy->bydst);
1313
+ }
6041314 if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
6051315 /* dir out => dst = remote, src = local */
6061316 net->xfrm.policy_bydst[dir].dbits4 = rbits4;
....@@ -618,15 +1328,24 @@
6181328
6191329 /* re-insert all policies by order of creation */
6201330 list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
621
- if (policy->walk.dead ||
622
- xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) {
1331
+ if (policy->walk.dead)
1332
+ continue;
1333
+ dir = xfrm_policy_id2dir(policy->index);
1334
+ if (dir >= XFRM_POLICY_MAX) {
6231335 /* skip socket policies */
6241336 continue;
6251337 }
6261338 newpos = NULL;
6271339 chain = policy_hash_bysel(net, &policy->selector,
628
- policy->family,
629
- xfrm_policy_id2dir(policy->index));
1340
+ policy->family, dir);
1341
+
1342
+ if (!chain) {
1343
+ void *p = xfrm_policy_inexact_insert(policy, dir, 0);
1344
+
1345
+ WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
1346
+ continue;
1347
+ }
1348
+
6301349 hlist_for_each_entry(pol, chain, bydst) {
6311350 if (policy->priority >= pol->priority)
6321351 newpos = &pol->bydst;
....@@ -639,6 +1358,9 @@
6391358 hlist_add_head_rcu(&policy->bydst, chain);
6401359 }
6411360
1361
+out_unlock:
1362
+ __xfrm_policy_inexact_flush(net);
1363
+ write_seqcount_end(&xfrm_policy_hash_generation);
6421364 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
6431365
6441366 mutex_unlock(&hash_resize_mutex);
....@@ -727,53 +1449,149 @@
7271449 spin_unlock_bh(&pq->hold_queue.lock);
7281450 }
7291451
730
-static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
731
- struct xfrm_policy *pol)
1452
+static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
1453
+ struct xfrm_policy *pol)
7321454 {
733
- if (policy->mark.v == pol->mark.v &&
734
- policy->priority == pol->priority)
735
- return true;
736
-
737
- return false;
1455
+ return mark->v == pol->mark.v && mark->m == pol->mark.m;
7381456 }
7391457
740
-int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
1458
+static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
7411459 {
742
- struct net *net = xp_net(policy);
743
- struct xfrm_policy *pol;
744
- struct xfrm_policy *delpol;
745
- struct hlist_head *chain;
746
- struct hlist_node *newpos;
1460
+ const struct xfrm_pol_inexact_key *k = data;
1461
+ u32 a = k->type << 24 | k->dir << 16 | k->family;
7471462
748
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
749
- chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
750
- delpol = NULL;
751
- newpos = NULL;
752
- hlist_for_each_entry(pol, chain, bydst) {
1463
+ return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
1464
+ seed);
1465
+}
1466
+
1467
+static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
1468
+{
1469
+ const struct xfrm_pol_inexact_bin *b = data;
1470
+
1471
+ return xfrm_pol_bin_key(&b->k, 0, seed);
1472
+}
1473
+
1474
+static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
1475
+ const void *ptr)
1476
+{
1477
+ const struct xfrm_pol_inexact_key *key = arg->key;
1478
+ const struct xfrm_pol_inexact_bin *b = ptr;
1479
+ int ret;
1480
+
1481
+ if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
1482
+ return -1;
1483
+
1484
+ ret = b->k.dir ^ key->dir;
1485
+ if (ret)
1486
+ return ret;
1487
+
1488
+ ret = b->k.type ^ key->type;
1489
+ if (ret)
1490
+ return ret;
1491
+
1492
+ ret = b->k.family ^ key->family;
1493
+ if (ret)
1494
+ return ret;
1495
+
1496
+ return b->k.if_id ^ key->if_id;
1497
+}
1498
+
1499
+static const struct rhashtable_params xfrm_pol_inexact_params = {
1500
+ .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
1501
+ .hashfn = xfrm_pol_bin_key,
1502
+ .obj_hashfn = xfrm_pol_bin_obj,
1503
+ .obj_cmpfn = xfrm_pol_bin_cmp,
1504
+ .automatic_shrinking = true,
1505
+};
1506
+
1507
+static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
1508
+ struct xfrm_policy *policy)
1509
+{
1510
+ struct xfrm_policy *pol, *delpol = NULL;
1511
+ struct hlist_node *newpos = NULL;
1512
+ int i = 0;
1513
+
1514
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
7531515 if (pol->type == policy->type &&
7541516 pol->if_id == policy->if_id &&
7551517 !selector_cmp(&pol->selector, &policy->selector) &&
756
- xfrm_policy_mark_match(policy, pol) &&
1518
+ xfrm_policy_mark_match(&policy->mark, pol) &&
7571519 xfrm_sec_ctx_match(pol->security, policy->security) &&
7581520 !WARN_ON(delpol)) {
759
- if (excl) {
760
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
761
- return -EEXIST;
762
- }
7631521 delpol = pol;
7641522 if (policy->priority > pol->priority)
7651523 continue;
7661524 } else if (policy->priority >= pol->priority) {
767
- newpos = &pol->bydst;
1525
+ newpos = &pol->bydst_inexact_list;
7681526 continue;
7691527 }
7701528 if (delpol)
7711529 break;
7721530 }
1531
+
7731532 if (newpos)
774
- hlist_add_behind_rcu(&policy->bydst, newpos);
1533
+ hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
1534
+ else
1535
+ hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
1536
+
1537
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
1538
+ pol->pos = i;
1539
+ i++;
1540
+ }
1541
+}
1542
+
1543
+static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
1544
+ struct xfrm_policy *policy,
1545
+ bool excl)
1546
+{
1547
+ struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;
1548
+
1549
+ hlist_for_each_entry(pol, chain, bydst) {
1550
+ if (pol->type == policy->type &&
1551
+ pol->if_id == policy->if_id &&
1552
+ !selector_cmp(&pol->selector, &policy->selector) &&
1553
+ xfrm_policy_mark_match(&policy->mark, pol) &&
1554
+ xfrm_sec_ctx_match(pol->security, policy->security) &&
1555
+ !WARN_ON(delpol)) {
1556
+ if (excl)
1557
+ return ERR_PTR(-EEXIST);
1558
+ delpol = pol;
1559
+ if (policy->priority > pol->priority)
1560
+ continue;
1561
+ } else if (policy->priority >= pol->priority) {
1562
+ newpos = pol;
1563
+ continue;
1564
+ }
1565
+ if (delpol)
1566
+ break;
1567
+ }
1568
+
1569
+ if (newpos)
1570
+ hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
7751571 else
7761572 hlist_add_head_rcu(&policy->bydst, chain);
1573
+
1574
+ return delpol;
1575
+}
1576
+
1577
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
1578
+{
1579
+ struct net *net = xp_net(policy);
1580
+ struct xfrm_policy *delpol;
1581
+ struct hlist_head *chain;
1582
+
1583
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1584
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
1585
+ if (chain)
1586
+ delpol = xfrm_policy_insert_list(chain, policy, excl);
1587
+ else
1588
+ delpol = xfrm_policy_inexact_insert(policy, dir, excl);
1589
+
1590
+ if (IS_ERR(delpol)) {
1591
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1592
+ return PTR_ERR(delpol);
1593
+ }
1594
+
7771595 __xfrm_policy_link(policy, dir);
7781596
7791597 /* After previous checking, family can either be AF_INET or AF_INET6 */
....@@ -803,50 +1621,101 @@
8031621 }
8041622 EXPORT_SYMBOL(xfrm_policy_insert);
8051623
806
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
807
- u8 type, int dir,
808
- struct xfrm_selector *sel,
809
- struct xfrm_sec_ctx *ctx, int delete,
810
- int *err)
1624
+static struct xfrm_policy *
1625
+__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
1626
+ u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
1627
+ struct xfrm_sec_ctx *ctx)
8111628 {
812
- struct xfrm_policy *pol, *ret;
1629
+ struct xfrm_policy *pol;
1630
+
1631
+ if (!chain)
1632
+ return NULL;
1633
+
1634
+ hlist_for_each_entry(pol, chain, bydst) {
1635
+ if (pol->type == type &&
1636
+ pol->if_id == if_id &&
1637
+ xfrm_policy_mark_match(mark, pol) &&
1638
+ !selector_cmp(sel, &pol->selector) &&
1639
+ xfrm_sec_ctx_match(ctx, pol->security))
1640
+ return pol;
1641
+ }
1642
+
1643
+ return NULL;
1644
+}
1645
+
1646
+struct xfrm_policy *
1647
+xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
1648
+ u8 type, int dir, struct xfrm_selector *sel,
1649
+ struct xfrm_sec_ctx *ctx, int delete, int *err)
1650
+{
1651
+ struct xfrm_pol_inexact_bin *bin = NULL;
1652
+ struct xfrm_policy *pol, *ret = NULL;
8131653 struct hlist_head *chain;
8141654
8151655 *err = 0;
8161656 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
8171657 chain = policy_hash_bysel(net, sel, sel->family, dir);
818
- ret = NULL;
819
- hlist_for_each_entry(pol, chain, bydst) {
820
- if (pol->type == type &&
821
- pol->if_id == if_id &&
822
- (mark & pol->mark.m) == pol->mark.v &&
823
- !selector_cmp(sel, &pol->selector) &&
824
- xfrm_sec_ctx_match(ctx, pol->security)) {
825
- xfrm_pol_hold(pol);
826
- if (delete) {
827
- *err = security_xfrm_policy_delete(
828
- pol->security);
829
- if (*err) {
830
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
831
- return pol;
832
- }
833
- __xfrm_policy_unlink(pol, dir);
834
- }
835
- ret = pol;
836
- break;
1658
+ if (!chain) {
1659
+ struct xfrm_pol_inexact_candidates cand;
1660
+ int i;
1661
+
1662
+ bin = xfrm_policy_inexact_lookup(net, type,
1663
+ sel->family, dir, if_id);
1664
+ if (!bin) {
1665
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1666
+ return NULL;
8371667 }
1668
+
1669
+ if (!xfrm_policy_find_inexact_candidates(&cand, bin,
1670
+ &sel->saddr,
1671
+ &sel->daddr)) {
1672
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1673
+ return NULL;
1674
+ }
1675
+
1676
+ pol = NULL;
1677
+ for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
1678
+ struct xfrm_policy *tmp;
1679
+
1680
+ tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
1681
+ if_id, type, dir,
1682
+ sel, ctx);
1683
+ if (!tmp)
1684
+ continue;
1685
+
1686
+ if (!pol || tmp->pos < pol->pos)
1687
+ pol = tmp;
1688
+ }
1689
+ } else {
1690
+ pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
1691
+ sel, ctx);
1692
+ }
1693
+
1694
+ if (pol) {
1695
+ xfrm_pol_hold(pol);
1696
+ if (delete) {
1697
+ *err = security_xfrm_policy_delete(pol->security);
1698
+ if (*err) {
1699
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1700
+ return pol;
1701
+ }
1702
+ __xfrm_policy_unlink(pol, dir);
1703
+ }
1704
+ ret = pol;
8381705 }
8391706 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
8401707
8411708 if (ret && delete)
8421709 xfrm_policy_kill(ret);
1710
+ if (bin && delete)
1711
+ xfrm_policy_inexact_prune_bin(bin);
8431712 return ret;
8441713 }
8451714 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
8461715
847
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
848
- u8 type, int dir, u32 id, int delete,
849
- int *err)
1716
+struct xfrm_policy *
1717
+xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
1718
+ u8 type, int dir, u32 id, int delete, int *err)
8501719 {
8511720 struct xfrm_policy *pol, *ret;
8521721 struct hlist_head *chain;
....@@ -861,8 +1730,7 @@
8611730 ret = NULL;
8621731 hlist_for_each_entry(pol, chain, byidx) {
8631732 if (pol->type == type && pol->index == id &&
864
- pol->if_id == if_id &&
865
- (mark & pol->mark.m) == pol->mark.v) {
1733
+ pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
8661734 xfrm_pol_hold(pol);
8671735 if (delete) {
8681736 *err = security_xfrm_policy_delete(
....@@ -889,36 +1757,19 @@
8891757 static inline int
8901758 xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
8911759 {
892
- int dir, err = 0;
1760
+ struct xfrm_policy *pol;
1761
+ int err = 0;
8931762
894
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
895
- struct xfrm_policy *pol;
896
- int i;
1763
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
1764
+ if (pol->walk.dead ||
1765
+ xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
1766
+ pol->type != type)
1767
+ continue;
8971768
898
- hlist_for_each_entry(pol,
899
- &net->xfrm.policy_inexact[dir], bydst) {
900
- if (pol->type != type)
901
- continue;
902
- err = security_xfrm_policy_delete(pol->security);
903
- if (err) {
904
- xfrm_audit_policy_delete(pol, 0, task_valid);
905
- return err;
906
- }
907
- }
908
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
909
- hlist_for_each_entry(pol,
910
- net->xfrm.policy_bydst[dir].table + i,
911
- bydst) {
912
- if (pol->type != type)
913
- continue;
914
- err = security_xfrm_policy_delete(
915
- pol->security);
916
- if (err) {
917
- xfrm_audit_policy_delete(pol, 0,
918
- task_valid);
919
- return err;
920
- }
921
- }
1769
+ err = security_xfrm_policy_delete(pol->security);
1770
+ if (err) {
1771
+ xfrm_audit_policy_delete(pol, 0, task_valid);
1772
+ return err;
9221773 }
9231774 }
9241775 return err;
....@@ -934,6 +1785,7 @@
9341785 int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
9351786 {
9361787 int dir, err = 0, cnt = 0;
1788
+ struct xfrm_policy *pol;
9371789
9381790 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
9391791
....@@ -941,48 +1793,25 @@
9411793 if (err)
9421794 goto out;
9431795
944
- for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
945
- struct xfrm_policy *pol;
946
- int i;
1796
+again:
1797
+ list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
1798
+ dir = xfrm_policy_id2dir(pol->index);
1799
+ if (pol->walk.dead ||
1800
+ dir >= XFRM_POLICY_MAX ||
1801
+ pol->type != type)
1802
+ continue;
9471803
948
- again1:
949
- hlist_for_each_entry(pol,
950
- &net->xfrm.policy_inexact[dir], bydst) {
951
- if (pol->type != type)
952
- continue;
953
- __xfrm_policy_unlink(pol, dir);
954
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
955
- cnt++;
956
-
957
- xfrm_audit_policy_delete(pol, 1, task_valid);
958
-
959
- xfrm_policy_kill(pol);
960
-
961
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
962
- goto again1;
963
- }
964
-
965
- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
966
- again2:
967
- hlist_for_each_entry(pol,
968
- net->xfrm.policy_bydst[dir].table + i,
969
- bydst) {
970
- if (pol->type != type)
971
- continue;
972
- __xfrm_policy_unlink(pol, dir);
973
- spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
974
- cnt++;
975
-
976
- xfrm_audit_policy_delete(pol, 1, task_valid);
977
- xfrm_policy_kill(pol);
978
-
979
- spin_lock_bh(&net->xfrm.xfrm_policy_lock);
980
- goto again2;
981
- }
982
- }
983
-
1804
+ __xfrm_policy_unlink(pol, dir);
1805
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
1806
+ cnt++;
1807
+ xfrm_audit_policy_delete(pol, 1, task_valid);
1808
+ xfrm_policy_kill(pol);
1809
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
1810
+ goto again;
9841811 }
985
- if (!cnt)
1812
+ if (cnt)
1813
+ __xfrm_policy_inexact_flush(net);
1814
+ else
9861815 err = -ESRCH;
9871816 out:
9881817 spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
....@@ -1081,8 +1910,174 @@
10811910 if (match)
10821911 ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
10831912 dir);
1084
-
10851913 return ret;
1914
+}
1915
+
1916
+static struct xfrm_pol_inexact_node *
1917
+xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
1918
+ seqcount_spinlock_t *count,
1919
+ const xfrm_address_t *addr, u16 family)
1920
+{
1921
+ const struct rb_node *parent;
1922
+ int seq;
1923
+
1924
+again:
1925
+ seq = read_seqcount_begin(count);
1926
+
1927
+ parent = rcu_dereference_raw(r->rb_node);
1928
+ while (parent) {
1929
+ struct xfrm_pol_inexact_node *node;
1930
+ int delta;
1931
+
1932
+ node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
1933
+
1934
+ delta = xfrm_policy_addr_delta(addr, &node->addr,
1935
+ node->prefixlen, family);
1936
+ if (delta < 0) {
1937
+ parent = rcu_dereference_raw(parent->rb_left);
1938
+ continue;
1939
+ } else if (delta > 0) {
1940
+ parent = rcu_dereference_raw(parent->rb_right);
1941
+ continue;
1942
+ }
1943
+
1944
+ return node;
1945
+ }
1946
+
1947
+ if (read_seqcount_retry(count, seq))
1948
+ goto again;
1949
+
1950
+ return NULL;
1951
+}
1952
+
1953
+static bool
1954
+xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
1955
+ struct xfrm_pol_inexact_bin *b,
1956
+ const xfrm_address_t *saddr,
1957
+ const xfrm_address_t *daddr)
1958
+{
1959
+ struct xfrm_pol_inexact_node *n;
1960
+ u16 family;
1961
+
1962
+ if (!b)
1963
+ return false;
1964
+
1965
+ family = b->k.family;
1966
+ memset(cand, 0, sizeof(*cand));
1967
+ cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
1968
+
1969
+ n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
1970
+ family);
1971
+ if (n) {
1972
+ cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
1973
+ n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
1974
+ family);
1975
+ if (n)
1976
+ cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
1977
+ }
1978
+
1979
+ n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
1980
+ family);
1981
+ if (n)
1982
+ cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
1983
+
1984
+ return true;
1985
+}
1986
+
1987
+static struct xfrm_pol_inexact_bin *
1988
+xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
1989
+ u8 dir, u32 if_id)
1990
+{
1991
+ struct xfrm_pol_inexact_key k = {
1992
+ .family = family,
1993
+ .type = type,
1994
+ .dir = dir,
1995
+ .if_id = if_id,
1996
+ };
1997
+
1998
+ write_pnet(&k.net, net);
1999
+
2000
+ return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
2001
+ xfrm_pol_inexact_params);
2002
+}
2003
+
2004
+static struct xfrm_pol_inexact_bin *
2005
+xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
2006
+ u8 dir, u32 if_id)
2007
+{
2008
+ struct xfrm_pol_inexact_bin *bin;
2009
+
2010
+ lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
2011
+
2012
+ rcu_read_lock();
2013
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
2014
+ rcu_read_unlock();
2015
+
2016
+ return bin;
2017
+}
2018
+
2019
+static struct xfrm_policy *
2020
+__xfrm_policy_eval_candidates(struct hlist_head *chain,
2021
+ struct xfrm_policy *prefer,
2022
+ const struct flowi *fl,
2023
+ u8 type, u16 family, int dir, u32 if_id)
2024
+{
2025
+ u32 priority = prefer ? prefer->priority : ~0u;
2026
+ struct xfrm_policy *pol;
2027
+
2028
+ if (!chain)
2029
+ return NULL;
2030
+
2031
+ hlist_for_each_entry_rcu(pol, chain, bydst) {
2032
+ int err;
2033
+
2034
+ if (pol->priority > priority)
2035
+ break;
2036
+
2037
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
2038
+ if (err) {
2039
+ if (err != -ESRCH)
2040
+ return ERR_PTR(err);
2041
+
2042
+ continue;
2043
+ }
2044
+
2045
+ if (prefer) {
2046
+ /* matches. Is it older than *prefer? */
2047
+ if (pol->priority == priority &&
2048
+ prefer->pos < pol->pos)
2049
+ return prefer;
2050
+ }
2051
+
2052
+ return pol;
2053
+ }
2054
+
2055
+ return NULL;
2056
+}
2057
+
2058
+static struct xfrm_policy *
2059
+xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
2060
+ struct xfrm_policy *prefer,
2061
+ const struct flowi *fl,
2062
+ u8 type, u16 family, int dir, u32 if_id)
2063
+{
2064
+ struct xfrm_policy *tmp;
2065
+ int i;
2066
+
2067
+ for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
2068
+ tmp = __xfrm_policy_eval_candidates(cand->res[i],
2069
+ prefer,
2070
+ fl, type, family, dir,
2071
+ if_id);
2072
+ if (!tmp)
2073
+ continue;
2074
+
2075
+ if (IS_ERR(tmp))
2076
+ return tmp;
2077
+ prefer = tmp;
2078
+ }
2079
+
2080
+ return prefer;
10862081 }
10872082
10882083 static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
....@@ -1090,12 +2085,13 @@
10902085 u16 family, u8 dir,
10912086 u32 if_id)
10922087 {
1093
- int err;
1094
- struct xfrm_policy *pol, *ret;
2088
+ struct xfrm_pol_inexact_candidates cand;
10952089 const xfrm_address_t *daddr, *saddr;
2090
+ struct xfrm_pol_inexact_bin *bin;
2091
+ struct xfrm_policy *pol, *ret;
10962092 struct hlist_head *chain;
10972093 unsigned int sequence;
1098
- u32 priority;
2094
+ int err;
10992095
11002096 daddr = xfrm_flowi_daddr(fl, family);
11012097 saddr = xfrm_flowi_saddr(fl, family);
....@@ -1109,7 +2105,6 @@
11092105 chain = policy_hash_direct(net, daddr, saddr, family, dir);
11102106 } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
11112107
1112
- priority = ~0U;
11132108 ret = NULL;
11142109 hlist_for_each_entry_rcu(pol, chain, bydst) {
11152110 err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
....@@ -1122,29 +2117,23 @@
11222117 }
11232118 } else {
11242119 ret = pol;
1125
- priority = ret->priority;
11262120 break;
11272121 }
11282122 }
1129
- chain = &net->xfrm.policy_inexact[dir];
1130
- hlist_for_each_entry_rcu(pol, chain, bydst) {
1131
- if ((pol->priority >= priority) && ret)
1132
- break;
2123
+ bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
2124
+ if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
2125
+ daddr))
2126
+ goto skip_inexact;
11332127
1134
- err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
1135
- if (err) {
1136
- if (err == -ESRCH)
1137
- continue;
1138
- else {
1139
- ret = ERR_PTR(err);
1140
- goto fail;
1141
- }
1142
- } else {
1143
- ret = pol;
1144
- break;
1145
- }
2128
+ pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
2129
+ family, dir, if_id);
2130
+ if (pol) {
2131
+ ret = pol;
2132
+ if (IS_ERR(pol))
2133
+ goto fail;
11462134 }
11472135
2136
+skip_inexact:
11482137 if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
11492138 goto retry;
11502139
....@@ -1236,6 +2225,7 @@
12362225 /* Socket policies are not hashed. */
12372226 if (!hlist_unhashed(&pol->bydst)) {
12382227 hlist_del_rcu(&pol->bydst);
2228
+ hlist_del_init(&pol->bydst_inexact_list);
12392229 hlist_del(&pol->byidx);
12402230 }
12412231
....@@ -1475,18 +2465,10 @@
14752465
14762466 static int xfrm_get_tos(const struct flowi *fl, int family)
14772467 {
1478
- const struct xfrm_policy_afinfo *afinfo;
1479
- int tos;
2468
+ if (family == AF_INET)
2469
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
14802470
1481
- afinfo = xfrm_policy_get_afinfo(family);
1482
- if (!afinfo)
1483
- return 0;
1484
-
1485
- tos = afinfo->get_tos(fl);
1486
-
1487
- rcu_read_unlock();
1488
-
1489
- return tos;
2471
+ return 0;
14902472 }
14912473
14922474 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
....@@ -1524,21 +2506,14 @@
15242506 return xdst;
15252507 }
15262508
1527
-static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
1528
- int nfheader_len)
2509
+static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
2510
+ int nfheader_len)
15292511 {
1530
- const struct xfrm_policy_afinfo *afinfo =
1531
- xfrm_policy_get_afinfo(dst->ops->family);
1532
- int err;
1533
-
1534
- if (!afinfo)
1535
- return -EINVAL;
1536
-
1537
- err = afinfo->init_path(path, dst, nfheader_len);
1538
-
1539
- rcu_read_unlock();
1540
-
1541
- return err;
2512
+ if (dst->ops->family == AF_INET6) {
2513
+ struct rt6_info *rt = (struct rt6_info *)dst;
2514
+ path->path_cookie = rt6_get_cookie(rt);
2515
+ path->u.rt6.rt6i_nfheader_len = nfheader_len;
2516
+ }
15422517 }
15432518
15442519 static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
....@@ -1570,10 +2545,11 @@
15702545 const struct flowi *fl,
15712546 struct dst_entry *dst)
15722547 {
2548
+ const struct xfrm_state_afinfo *afinfo;
2549
+ const struct xfrm_mode *inner_mode;
15732550 struct net *net = xp_net(policy);
15742551 unsigned long now = jiffies;
15752552 struct net_device *dev;
1576
- struct xfrm_mode *inner_mode;
15772553 struct xfrm_dst *xdst_prev = NULL;
15782554 struct xfrm_dst *xdst0 = NULL;
15792555 int i = 0;
....@@ -1619,7 +2595,7 @@
16192595 goto put_states;
16202596 }
16212597 } else
1622
- inner_mode = xfrm[i]->inner_mode;
2598
+ inner_mode = &xfrm[i]->inner_mode;
16232599
16242600 xdst->route = dst;
16252601 dst_copy_metrics(dst1, dst);
....@@ -1643,11 +2619,17 @@
16432619 xdst->xfrm_genid = xfrm[i]->genid;
16442620
16452621 dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
1646
- dst1->flags |= DST_HOST;
16472622 dst1->lastuse = now;
16482623
16492624 dst1->input = dst_discard;
1650
- dst1->output = inner_mode->afinfo->output;
2625
+
2626
+ rcu_read_lock();
2627
+ afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
2628
+ if (likely(afinfo))
2629
+ dst1->output = afinfo->output;
2630
+ else
2631
+ dst1->output = dst_discard_out;
2632
+ rcu_read_unlock();
16512633
16522634 xdst_prev = xdst;
16532635
....@@ -1703,8 +2685,10 @@
17032685 *num_xfrms = 0;
17042686 return 0;
17052687 }
1706
- if (IS_ERR(pols[0]))
2688
+ if (IS_ERR(pols[0])) {
2689
+ *num_pols = 0;
17072690 return PTR_ERR(pols[0]);
2691
+ }
17082692
17092693 *num_xfrms = pols[0]->xfrm_nr;
17102694
....@@ -1719,6 +2703,7 @@
17192703 if (pols[1]) {
17202704 if (IS_ERR(pols[1])) {
17212705 xfrm_pols_put(pols, *num_pols);
2706
+ *num_pols = 0;
17222707 return PTR_ERR(pols[1]);
17232708 }
17242709 (*num_pols)++;
....@@ -1785,6 +2770,7 @@
17852770 struct xfrm_policy_queue *pq = &pol->polq;
17862771 struct flowi fl;
17872772 struct sk_buff_head list;
2773
+ __u32 skb_mark;
17882774
17892775 spin_lock(&pq->hold_queue.lock);
17902776 skb = skb_peek(&pq->hold_queue);
....@@ -1794,7 +2780,12 @@
17942780 }
17952781 dst = skb_dst(skb);
17962782 sk = skb->sk;
2783
+
2784
+ /* Fixup the mark to support VTI. */
2785
+ skb_mark = skb->mark;
2786
+ skb->mark = pol->mark.v;
17972787 xfrm_decode_session(skb, &fl, dst->ops->family);
2788
+ skb->mark = skb_mark;
17982789 spin_unlock(&pq->hold_queue.lock);
17992790
18002791 dst_hold(xfrm_dst_path(dst));
....@@ -1811,7 +2802,7 @@
18112802 pq->timeout = pq->timeout << 1;
18122803 if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
18132804 xfrm_pol_hold(pol);
1814
- goto out;
2805
+ goto out;
18152806 }
18162807
18172808 dst_release(dst);
....@@ -1826,7 +2817,12 @@
18262817 while (!skb_queue_empty(&list)) {
18272818 skb = __skb_dequeue(&list);
18282819
2820
+ /* Fixup the mark to support VTI. */
2821
+ skb_mark = skb->mark;
2822
+ skb->mark = pol->mark.v;
18292823 xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
2824
+ skb->mark = skb_mark;
2825
+
18302826 dst_hold(xfrm_dst_path(skb_dst(skb)));
18312827 dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
18322828 if (IS_ERR(dst)) {
....@@ -1834,7 +2830,7 @@
18342830 continue;
18352831 }
18362832
1837
- nf_reset(skb);
2833
+ nf_reset_ct(skb);
18382834 skb_dst_drop(skb);
18392835 skb_dst_set(skb, dst);
18402836
....@@ -1922,7 +2918,7 @@
19222918 dst_copy_metrics(dst1, dst);
19232919
19242920 dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
1925
- dst1->flags |= DST_HOST | DST_XFRM_QUEUE;
2921
+ dst1->flags |= DST_XFRM_QUEUE;
19262922 dst1->lastuse = jiffies;
19272923
19282924 dst1->input = dst_discard;
....@@ -2212,7 +3208,7 @@
22123208 flags | XFRM_LOOKUP_QUEUE |
22133209 XFRM_LOOKUP_KEEP_DST_REF);
22143210
2215
- if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
3211
+ if (PTR_ERR(dst) == -EREMOTE)
22163212 return make_blackhole(net, dst_orig->ops->family, dst_orig);
22173213
22183214 if (IS_ERR(dst))
....@@ -2225,11 +3221,12 @@
22253221 static inline int
22263222 xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
22273223 {
3224
+ struct sec_path *sp = skb_sec_path(skb);
22283225 struct xfrm_state *x;
22293226
2230
- if (!skb->sp || idx < 0 || idx >= skb->sp->len)
3227
+ if (!sp || idx < 0 || idx >= sp->len)
22313228 return 0;
2232
- x = skb->sp->xvec[idx];
3229
+ x = sp->xvec[idx];
22333230 if (!x->type->reject)
22343231 return 0;
22353232 return x->type->reject(x, skb, fl);
....@@ -2243,7 +3240,7 @@
22433240
22443241 static inline int
22453242 xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
2246
- unsigned short family)
3243
+ unsigned short family, u32 if_id)
22473244 {
22483245 if (xfrm_state_kern(x))
22493246 return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
....@@ -2254,7 +3251,8 @@
22543251 (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
22553252 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
22563253 !(x->props.mode != XFRM_MODE_TRANSPORT &&
2257
- xfrm_state_addr_cmp(tmpl, x, family));
3254
+ xfrm_state_addr_cmp(tmpl, x, family)) &&
3255
+ (if_id == 0 || if_id == x->if_id);
22583256 }
22593257
22603258 /*
....@@ -2266,7 +3264,7 @@
22663264 */
22673265 static inline int
22683266 xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
2269
- unsigned short family)
3267
+ unsigned short family, u32 if_id)
22703268 {
22713269 int idx = start;
22723270
....@@ -2276,9 +3274,16 @@
22763274 } else
22773275 start = -1;
22783276 for (; idx < sp->len; idx++) {
2279
- if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
3277
+ if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id))
22803278 return ++idx;
22813279 if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
3280
+ if (idx < sp->verified_cnt) {
3281
+ /* Secpath entry previously verified, consider optional and
3282
+ * continue searching
3283
+ */
3284
+ continue;
3285
+ }
3286
+
22823287 if (start == -1)
22833288 start = -2-idx;
22843289 break;
....@@ -2287,20 +3292,251 @@
22873292 return start;
22883293 }
22893294
3295
+static void
3296
+decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
3297
+{
3298
+ const struct iphdr *iph = ip_hdr(skb);
3299
+ int ihl = iph->ihl;
3300
+ u8 *xprth = skb_network_header(skb) + ihl * 4;
3301
+ struct flowi4 *fl4 = &fl->u.ip4;
3302
+ int oif = 0;
3303
+
3304
+ if (skb_dst(skb) && skb_dst(skb)->dev)
3305
+ oif = skb_dst(skb)->dev->ifindex;
3306
+
3307
+ memset(fl4, 0, sizeof(struct flowi4));
3308
+ fl4->flowi4_mark = skb->mark;
3309
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
3310
+
3311
+ fl4->flowi4_proto = iph->protocol;
3312
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
3313
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
3314
+ fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;
3315
+
3316
+ if (!ip_is_fragment(iph)) {
3317
+ switch (iph->protocol) {
3318
+ case IPPROTO_UDP:
3319
+ case IPPROTO_UDPLITE:
3320
+ case IPPROTO_TCP:
3321
+ case IPPROTO_SCTP:
3322
+ case IPPROTO_DCCP:
3323
+ if (xprth + 4 < skb->data ||
3324
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
3325
+ __be16 *ports;
3326
+
3327
+ xprth = skb_network_header(skb) + ihl * 4;
3328
+ ports = (__be16 *)xprth;
3329
+
3330
+ fl4->fl4_sport = ports[!!reverse];
3331
+ fl4->fl4_dport = ports[!reverse];
3332
+ }
3333
+ break;
3334
+ case IPPROTO_ICMP:
3335
+ if (xprth + 2 < skb->data ||
3336
+ pskb_may_pull(skb, xprth + 2 - skb->data)) {
3337
+ u8 *icmp;
3338
+
3339
+ xprth = skb_network_header(skb) + ihl * 4;
3340
+ icmp = xprth;
3341
+
3342
+ fl4->fl4_icmp_type = icmp[0];
3343
+ fl4->fl4_icmp_code = icmp[1];
3344
+ }
3345
+ break;
3346
+ case IPPROTO_ESP:
3347
+ if (xprth + 4 < skb->data ||
3348
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
3349
+ __be32 *ehdr;
3350
+
3351
+ xprth = skb_network_header(skb) + ihl * 4;
3352
+ ehdr = (__be32 *)xprth;
3353
+
3354
+ fl4->fl4_ipsec_spi = ehdr[0];
3355
+ }
3356
+ break;
3357
+ case IPPROTO_AH:
3358
+ if (xprth + 8 < skb->data ||
3359
+ pskb_may_pull(skb, xprth + 8 - skb->data)) {
3360
+ __be32 *ah_hdr;
3361
+
3362
+ xprth = skb_network_header(skb) + ihl * 4;
3363
+ ah_hdr = (__be32 *)xprth;
3364
+
3365
+ fl4->fl4_ipsec_spi = ah_hdr[1];
3366
+ }
3367
+ break;
3368
+ case IPPROTO_COMP:
3369
+ if (xprth + 4 < skb->data ||
3370
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
3371
+ __be16 *ipcomp_hdr;
3372
+
3373
+ xprth = skb_network_header(skb) + ihl * 4;
3374
+ ipcomp_hdr = (__be16 *)xprth;
3375
+
3376
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
3377
+ }
3378
+ break;
3379
+ case IPPROTO_GRE:
3380
+ if (xprth + 12 < skb->data ||
3381
+ pskb_may_pull(skb, xprth + 12 - skb->data)) {
3382
+ __be16 *greflags;
3383
+ __be32 *gre_hdr;
3384
+
3385
+ xprth = skb_network_header(skb) + ihl * 4;
3386
+ greflags = (__be16 *)xprth;
3387
+ gre_hdr = (__be32 *)xprth;
3388
+
3389
+ if (greflags[0] & GRE_KEY) {
3390
+ if (greflags[0] & GRE_CSUM)
3391
+ gre_hdr++;
3392
+ fl4->fl4_gre_key = gre_hdr[1];
3393
+ }
3394
+ }
3395
+ break;
3396
+ default:
3397
+ fl4->fl4_ipsec_spi = 0;
3398
+ break;
3399
+ }
3400
+ }
3401
+}
3402
+
3403
+#if IS_ENABLED(CONFIG_IPV6)
3404
+static void
3405
+decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
3406
+{
3407
+ struct flowi6 *fl6 = &fl->u.ip6;
3408
+ int onlyproto = 0;
3409
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
3410
+ u32 offset = sizeof(*hdr);
3411
+ struct ipv6_opt_hdr *exthdr;
3412
+ const unsigned char *nh = skb_network_header(skb);
3413
+ u16 nhoff = IP6CB(skb)->nhoff;
3414
+ int oif = 0;
3415
+ u8 nexthdr;
3416
+
3417
+ if (!nhoff)
3418
+ nhoff = offsetof(struct ipv6hdr, nexthdr);
3419
+
3420
+ nexthdr = nh[nhoff];
3421
+
3422
+ if (skb_dst(skb) && skb_dst(skb)->dev)
3423
+ oif = skb_dst(skb)->dev->ifindex;
3424
+
3425
+ memset(fl6, 0, sizeof(struct flowi6));
3426
+ fl6->flowi6_mark = skb->mark;
3427
+ fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
3428
+
3429
+ fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
3430
+ fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
3431
+
3432
+ while (nh + offset + sizeof(*exthdr) < skb->data ||
3433
+ pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
3434
+ nh = skb_network_header(skb);
3435
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
3436
+
3437
+ switch (nexthdr) {
3438
+ case NEXTHDR_FRAGMENT:
3439
+ onlyproto = 1;
3440
+ fallthrough;
3441
+ case NEXTHDR_ROUTING:
3442
+ case NEXTHDR_HOP:
3443
+ case NEXTHDR_DEST:
3444
+ offset += ipv6_optlen(exthdr);
3445
+ nexthdr = exthdr->nexthdr;
3446
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
3447
+ break;
3448
+ case IPPROTO_UDP:
3449
+ case IPPROTO_UDPLITE:
3450
+ case IPPROTO_TCP:
3451
+ case IPPROTO_SCTP:
3452
+ case IPPROTO_DCCP:
3453
+ if (!onlyproto && (nh + offset + 4 < skb->data ||
3454
+ pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
3455
+ __be16 *ports;
3456
+
3457
+ nh = skb_network_header(skb);
3458
+ ports = (__be16 *)(nh + offset);
3459
+ fl6->fl6_sport = ports[!!reverse];
3460
+ fl6->fl6_dport = ports[!reverse];
3461
+ }
3462
+ fl6->flowi6_proto = nexthdr;
3463
+ return;
3464
+ case IPPROTO_ICMPV6:
3465
+ if (!onlyproto && (nh + offset + 2 < skb->data ||
3466
+ pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
3467
+ u8 *icmp;
3468
+
3469
+ nh = skb_network_header(skb);
3470
+ icmp = (u8 *)(nh + offset);
3471
+ fl6->fl6_icmp_type = icmp[0];
3472
+ fl6->fl6_icmp_code = icmp[1];
3473
+ }
3474
+ fl6->flowi6_proto = nexthdr;
3475
+ return;
3476
+ case IPPROTO_GRE:
3477
+ if (!onlyproto &&
3478
+ (nh + offset + 12 < skb->data ||
3479
+ pskb_may_pull(skb, nh + offset + 12 - skb->data))) {
3480
+ struct gre_base_hdr *gre_hdr;
3481
+ __be32 *gre_key;
3482
+
3483
+ nh = skb_network_header(skb);
3484
+ gre_hdr = (struct gre_base_hdr *)(nh + offset);
3485
+ gre_key = (__be32 *)(gre_hdr + 1);
3486
+
3487
+ if (gre_hdr->flags & GRE_KEY) {
3488
+ if (gre_hdr->flags & GRE_CSUM)
3489
+ gre_key++;
3490
+ fl6->fl6_gre_key = *gre_key;
3491
+ }
3492
+ }
3493
+ fl6->flowi6_proto = nexthdr;
3494
+ return;
3495
+
3496
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
3497
+ case IPPROTO_MH:
3498
+ offset += ipv6_optlen(exthdr);
3499
+ if (!onlyproto && (nh + offset + 3 < skb->data ||
3500
+ pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
3501
+ struct ip6_mh *mh;
3502
+
3503
+ nh = skb_network_header(skb);
3504
+ mh = (struct ip6_mh *)(nh + offset);
3505
+ fl6->fl6_mh_type = mh->ip6mh_type;
3506
+ }
3507
+ fl6->flowi6_proto = nexthdr;
3508
+ return;
3509
+#endif
3510
+ /* XXX Why are there these headers? */
3511
+ case IPPROTO_AH:
3512
+ case IPPROTO_ESP:
3513
+ case IPPROTO_COMP:
3514
+ default:
3515
+ fl6->fl6_ipsec_spi = 0;
3516
+ fl6->flowi6_proto = nexthdr;
3517
+ return;
3518
+ }
3519
+ }
3520
+}
3521
+#endif
3522
+
22903523 int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
22913524 unsigned int family, int reverse)
22923525 {
2293
- const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
2294
- int err;
2295
-
2296
- if (unlikely(afinfo == NULL))
3526
+ switch (family) {
3527
+ case AF_INET:
3528
+ decode_session4(skb, fl, reverse);
3529
+ break;
3530
+#if IS_ENABLED(CONFIG_IPV6)
3531
+ case AF_INET6:
3532
+ decode_session6(skb, fl, reverse);
3533
+ break;
3534
+#endif
3535
+ default:
22973536 return -EAFNOSUPPORT;
3537
+ }
22983538
2299
- afinfo->decode_session(skb, fl, reverse);
2300
-
2301
- err = security_xfrm_decode_session(skb, &fl->flowi_secid);
2302
- rcu_read_unlock();
2303
- return err;
3539
+ return security_xfrm_decode_session(skb, &fl->flowi_secid);
23043540 }
23053541 EXPORT_SYMBOL(__xfrm_decode_session);
23063542
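With this hunk, __xfrm_decode_session() stops going through the per-family afinfo ops (and the RCU read section that xfrm_policy_get_afinfo() implied) and instead switches on the address family directly, keeping only the LSM hook at the end. A minimal sketch of the calling pattern, assuming the signature shown in the hunk above; example_decode() is a hypothetical caller, not kernel code:

#include <net/xfrm.h>

/* Hypothetical caller: fill a flowi from an skb and reject unsupported
 * families, mirroring the switch added above. Passing reverse=1 would
 * swap source/destination in the decoded flow.
 */
static int example_decode(struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	int err;

	err = __xfrm_decode_session(skb, &fl, family, 0);
	if (err)
		return err;	/* -EAFNOSUPPORT unless AF_INET/AF_INET6 */

	/* fl.u.ip4 / fl.u.ip6 now carry addresses, protocol and, where the
	 * header could be pulled, ports, ICMP type/code, SPI or GRE key.
	 */
	return 0;
}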
....@@ -2329,6 +3565,7 @@
23293565 struct flowi fl;
23303566 int xerr_idx = -1;
23313567 const struct xfrm_if_cb *ifcb;
3568
+ struct sec_path *sp;
23323569 struct xfrm_if *xi;
23333570 u32 if_id = 0;
23343571
....@@ -2355,11 +3592,12 @@
23553592 nf_nat_decode_session(skb, &fl, family);
23563593
23573594 /* First, check used SA against their selectors. */
2358
- if (skb->sp) {
3595
+ sp = skb_sec_path(skb);
3596
+ if (sp) {
23593597 int i;
23603598
2361
- for (i = skb->sp->len-1; i >= 0; i--) {
2362
- struct xfrm_state *x = skb->sp->xvec[i];
3599
+ for (i = sp->len - 1; i >= 0; i--) {
3600
+ struct xfrm_state *x = sp->xvec[i];
23633601 if (!xfrm_selector_match(&x->sel, &fl, family)) {
23643602 XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
23653603 return 0;
....@@ -2386,7 +3624,7 @@
23863624 }
23873625
23883626 if (!pol) {
2389
- if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
3627
+ if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
23903628 xfrm_secpath_reject(xerr_idx, skb, &fl);
23913629 XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
23923630 return 0;
....@@ -2406,6 +3644,7 @@
24063644 if (pols[1]) {
24073645 if (IS_ERR(pols[1])) {
24083646 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
3647
+ xfrm_pol_put(pols[0]);
24093648 return 0;
24103649 }
24113650 pols[1]->curlft.use_time = ktime_get_real_seconds();
....@@ -2415,7 +3654,6 @@
24153654 #endif
24163655
24173656 if (pol->action == XFRM_POLICY_ALLOW) {
2418
- struct sec_path *sp;
24193657 static struct sec_path dummy;
24203658 struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
24213659 struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
....@@ -2423,7 +3661,8 @@
24233661 int ti = 0;
24243662 int i, k;
24253663
2426
- if ((sp = skb->sp) == NULL)
3664
+ sp = skb_sec_path(skb);
3665
+ if (!sp)
24273666 sp = &dummy;
24283667
24293668 for (pi = 0; pi < npols; pi++) {
....@@ -2440,8 +3679,9 @@
24403679 tpp[ti++] = &pols[pi]->xfrm_vec[i];
24413680 }
24423681 xfrm_nr = ti;
3682
+
24433683 if (npols > 1) {
2444
- xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
3684
+ xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
24453685 tpp = stp;
24463686 }
24473687
....@@ -2450,9 +3690,12 @@
24503690 * Order is _important_. Later we will implement
24513691 * some barriers, but at the moment barriers
24523692 * are implied between each two transformations.
3693
+ * Upon success, marks secpath entries as having been
3694
+ * verified to allow them to be skipped in future policy
3695
+ * checks (e.g. nested tunnels).
24533696 */
24543697 for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
2455
- k = xfrm_policy_ok(tpp[i], sp, k, family);
3698
+ k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
24563699 if (k < 0) {
24573700 if (k < -1)
24583701 /* "-2 - errored_index" returned */
....@@ -2468,6 +3711,8 @@
24683711 }
24693712
24703713 xfrm_pols_put(pols, npols);
3714
+ sp->verified_cnt = k;
3715
+
24713716 return 1;
24723717 }
24733718 XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
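The comment in this hunk describes the intent of the loop: templates are checked from the last one down, xfrm_policy_ok() returns the secpath position from which the next template may match (now additionally keyed by if_id), and the final position is recorded in sp->verified_cnt so later checks on nested tunnels can skip already-verified entries. Below is a deliberately stripped-down toy of that advancing-index idea, using hypothetical toy_* types and omitting the optional-template and transport-mode handling of the real xfrm_policy_ok():

struct toy_tmpl  { int proto; };
struct toy_entry { int proto; };

/* Return the position after the first entry at or beyond 'start' that
 * satisfies the template, or -1 if none does.
 */
static int toy_match_from(const struct toy_tmpl *t,
			  const struct toy_entry *sp, int sp_len, int start)
{
	int i;

	for (i = start; i < sp_len; i++)
		if (sp[i].proto == t->proto)
			return i + 1;
	return -1;
}

/* Templates are walked from the last one down to the first; each match
 * advances 'k', so later templates can only be satisfied by entries that
 * come after the ones already consumed.
 */
static int toy_templates_ok(const struct toy_tmpl *tp, int nr,
			    const struct toy_entry *sp, int sp_len)
{
	int i, k = 0;

	for (i = nr - 1; i >= 0; i--) {
		k = toy_match_from(&tp[i], sp, sp_len, k);
		if (k < 0)
			return 0;
	}
	return 1;
}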
....@@ -2818,13 +4063,17 @@
28184063 static int __net_init xfrm_policy_init(struct net *net)
28194064 {
28204065 unsigned int hmask, sz;
2821
- int dir;
4066
+ int dir, err;
28224067
2823
- if (net_eq(net, &init_net))
4068
+ if (net_eq(net, &init_net)) {
28244069 xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
28254070 sizeof(struct xfrm_dst),
28264071 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
28274072 NULL);
4073
+ err = rhashtable_init(&xfrm_policy_inexact_table,
4074
+ &xfrm_pol_inexact_params);
4075
+ BUG_ON(err);
4076
+ }
28284077
28294078 hmask = 8 - 1;
28304079 sz = (hmask+1) * sizeof(struct hlist_head);
....@@ -2859,6 +4108,7 @@
28594108 seqlock_init(&net->xfrm.policy_hthresh.lock);
28604109
28614110 INIT_LIST_HEAD(&net->xfrm.policy_all);
4111
+ INIT_LIST_HEAD(&net->xfrm.inexact_bins);
28624112 INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
28634113 INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
28644114 return 0;
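This hunk initializes the global rhashtable backing the inexact-policy bins once, for init_net only, and the BUG_ON() makes boot fail loudly if the parameters are rejected. For readers unfamiliar with the API, here is a generic, self-contained sketch of setting up and populating an rhashtable keyed on an embedded field; the demo_* names are hypothetical, and the real xfrm_pol_inexact_params (defined elsewhere in this file) supplies custom hash/compare callbacks that the sketch does not show:

#include <linux/rhashtable.h>
#include <linux/slab.h>

/* Hypothetical object: hashed by the embedded 'key' field. */
struct demo_obj {
	u32 key;
	struct rhash_head node;
};

static const struct rhashtable_params demo_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct demo_obj, key),
	.head_offset	= offsetof(struct demo_obj, node),
	.automatic_shrinking = true,
};

static struct rhashtable demo_ht;

static int demo_setup(void)
{
	/* Like the call above, this can fail and the error must be handled. */
	return rhashtable_init(&demo_ht, &demo_params);
}

static int demo_insert(u32 key)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	int err;

	if (!obj)
		return -ENOMEM;
	obj->key = key;

	/* The _fast helpers take the params struct by value. */
	err = rhashtable_insert_fast(&demo_ht, &obj->node, demo_params);
	if (err)
		kfree(obj);
	return err;
}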
....@@ -2877,6 +4127,7 @@
28774127
28784128 static void xfrm_policy_fini(struct net *net)
28794129 {
4130
+ struct xfrm_pol_inexact_bin *b, *t;
28804131 unsigned int sz;
28814132 int dir;
28824133
....@@ -2902,6 +4153,11 @@
29024153 sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
29034154 WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
29044155 xfrm_hash_free(net->xfrm.policy_byidx, sz);
4156
+
4157
+ spin_lock_bh(&net->xfrm.xfrm_policy_lock);
4158
+ list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
4159
+ __xfrm_policy_inexact_prune_bin(b, true);
4160
+ spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
29054161 }
29064162
29074163 static int __net_init xfrm_net_init(struct net *net)
....@@ -2955,8 +4211,12 @@
29554211 {
29564212 register_pernet_subsys(&xfrm_net_ops);
29574213 xfrm_dev_init();
2958
- seqcount_init(&xfrm_policy_hash_generation);
4214
+ seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex);
29594215 xfrm_input_init();
4216
+
4217
+#ifdef CONFIG_XFRM_ESPINTCP
4218
+ espintcp_init();
4219
+#endif
29604220
29614221 RCU_INIT_POINTER(xfrm_if_cb, NULL);
29624222 synchronize_rcu();
....@@ -3050,7 +4310,7 @@
30504310 }
30514311
30524312 static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
3053
- u8 dir, u8 type, struct net *net)
4313
+ u8 dir, u8 type, struct net *net, u32 if_id)
30544314 {
30554315 struct xfrm_policy *pol, *ret = NULL;
30564316 struct hlist_head *chain;
....@@ -3059,7 +4319,8 @@
30594319 spin_lock_bh(&net->xfrm.xfrm_policy_lock);
30604320 chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
30614321 hlist_for_each_entry(pol, chain, bydst) {
3062
- if (xfrm_migrate_selector_match(sel, &pol->selector) &&
4322
+ if ((if_id == 0 || pol->if_id == if_id) &&
4323
+ xfrm_migrate_selector_match(sel, &pol->selector) &&
30634324 pol->type == type) {
30644325 ret = pol;
30654326 priority = ret->priority;
....@@ -3067,11 +4328,12 @@
30674328 }
30684329 }
30694330 chain = &net->xfrm.policy_inexact[dir];
3070
- hlist_for_each_entry(pol, chain, bydst) {
4331
+ hlist_for_each_entry(pol, chain, bydst_inexact_list) {
30714332 if ((pol->priority >= priority) && ret)
30724333 break;
30734334
3074
- if (xfrm_migrate_selector_match(sel, &pol->selector) &&
4335
+ if ((if_id == 0 || pol->if_id == if_id) &&
4336
+ xfrm_migrate_selector_match(sel, &pol->selector) &&
30754337 pol->type == type) {
30764338 ret = pol;
30774339 break;
....@@ -3187,7 +4449,7 @@
31874449 int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
31884450 struct xfrm_migrate *m, int num_migrate,
31894451 struct xfrm_kmaddress *k, struct net *net,
3190
- struct xfrm_encap_tmpl *encap)
4452
+ struct xfrm_encap_tmpl *encap, u32 if_id)
31914453 {
31924454 int i, err, nx_cur = 0, nx_new = 0;
31934455 struct xfrm_policy *pol = NULL;
....@@ -3206,14 +4468,14 @@
32064468 }
32074469
32084470 /* Stage 1 - find policy */
3209
- if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
4471
+ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) {
32104472 err = -ENOENT;
32114473 goto out;
32124474 }
32134475
32144476 /* Stage 2 - find and update state(s) */
32154477 for (i = 0, mp = m; i < num_migrate; i++, mp++) {
3216
- if ((x = xfrm_migrate_state_find(mp, net))) {
4478
+ if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
32174479 x_cur[nx_cur] = x;
32184480 nx_cur++;
32194481 xc = xfrm_state_migrate(x, mp, encap);