2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
--- a/kernel/net/openvswitch/flow_table.c
+++ b/kernel/net/openvswitch/flow_table.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2007-2014 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
  */
 
 #include "flow.h"
@@ -42,12 +29,18 @@
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
 #include <linux/rculist.h>
+#include <linux/sort.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 
 #define TBL_MIN_BUCKETS 1024
+#define MASK_ARRAY_SIZE_MIN 16
 #define REHASH_INTERVAL (10 * 60 * HZ)
+
+#define MC_DEFAULT_HASH_ENTRIES 256
+#define MC_HASH_SHIFT 8
+#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
 
 static struct kmem_cache *flow_cache;
 struct kmem_cache *flow_stats_cache __read_mostly;
@@ -79,7 +72,7 @@
 struct sw_flow *ovs_flow_alloc(void)
 {
 	struct sw_flow *flow;
-	struct flow_stats *stats;
+	struct sw_flow_stats *stats;
 
 	flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
 	if (!flow)
@@ -111,29 +104,6 @@
 	return table->count;
 }
 
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
-	struct flex_array *buckets;
-	int i, err;
-
-	buckets = flex_array_alloc(sizeof(struct hlist_head),
-				   n_buckets, GFP_KERNEL);
-	if (!buckets)
-		return NULL;
-
-	err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
-	if (err) {
-		flex_array_free(buckets);
-		return NULL;
-	}
-
-	for (i = 0; i < n_buckets; i++)
-		INIT_HLIST_HEAD((struct hlist_head *)
-				flex_array_get(buckets, i));
-
-	return buckets;
-}
-
 static void flow_free(struct sw_flow *flow)
 {
 	int cpu;
@@ -141,12 +111,16 @@
 	if (ovs_identifier_is_key(&flow->id))
 		kfree(flow->id.unmasked_key);
 	if (flow->sf_acts)
-		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
+		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
+					  flow->sf_acts);
 	/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask))
+	for (cpu = 0; cpu < nr_cpu_ids;
+	     cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
 		if (flow->stats[cpu])
 			kmem_cache_free(flow_stats_cache,
-					(struct flow_stats __force *)flow->stats[cpu]);
+					(struct sw_flow_stats __force *)flow->stats[cpu]);
+	}
+
 	kmem_cache_free(flow_cache, flow);
 }
 
@@ -168,47 +142,291 @@
 	flow_free(flow);
 }
 
-static void free_buckets(struct flex_array *buckets)
-{
-	flex_array_free(buckets);
-}
-
-
 static void __table_instance_destroy(struct table_instance *ti)
 {
-	free_buckets(ti->buckets);
+	kvfree(ti->buckets);
 	kfree(ti);
 }
 
 static struct table_instance *table_instance_alloc(int new_size)
 {
 	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+	int i;
 
 	if (!ti)
 		return NULL;
 
-	ti->buckets = alloc_buckets(new_size);
-
+	ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
+				     GFP_KERNEL);
 	if (!ti->buckets) {
 		kfree(ti);
 		return NULL;
 	}
+
+	for (i = 0; i < new_size; i++)
+		INIT_HLIST_HEAD(&ti->buckets[i]);
+
 	ti->n_buckets = new_size;
 	ti->node_ver = 0;
-	ti->keep_flows = false;
 	get_random_bytes(&ti->hash_seed, sizeof(u32));
 
 	return ti;
 }
 
+static void __mask_array_destroy(struct mask_array *ma)
+{
+	free_percpu(ma->masks_usage_stats);
+	kfree(ma);
+}
+
+static void mask_array_rcu_cb(struct rcu_head *rcu)
+{
+	struct mask_array *ma = container_of(rcu, struct mask_array, rcu);
+
+	__mask_array_destroy(ma);
+}
+
+static void tbl_mask_array_reset_counters(struct mask_array *ma)
+{
+	int i, cpu;
+
+	/* As the per CPU counters are not atomic we can not go ahead and
+	 * reset them from another CPU. To be able to still have an approximate
+	 * zero based counter we store the value at reset, and subtract it
+	 * later when processing.
+	 */
+	for (i = 0; i < ma->max; i++) {
+		ma->masks_usage_zero_cntr[i] = 0;
+
+		for_each_possible_cpu(cpu) {
+			struct mask_array_stats *stats;
+			unsigned int start;
+			u64 counter;
+
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
+			do {
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+			ma->masks_usage_zero_cntr[i] += counter;
+		}
+	}
+}
+
+static struct mask_array *tbl_mask_array_alloc(int size)
+{
+	struct mask_array *new;
+
+	size = max(MASK_ARRAY_SIZE_MIN, size);
+	new = kzalloc(sizeof(struct mask_array) +
+		      sizeof(struct sw_flow_mask *) * size +
+		      sizeof(u64) * size, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
+					     sizeof(struct mask_array) +
+					     sizeof(struct sw_flow_mask *) *
+					     size);
+
+	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
+						sizeof(u64) * size,
+						__alignof__(u64));
+	if (!new->masks_usage_stats) {
+		kfree(new);
+		return NULL;
+	}
+
+	new->count = 0;
+	new->max = size;
+
+	return new;
+}
+
+static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
+{
+	struct mask_array *old;
+	struct mask_array *new;
+
+	new = tbl_mask_array_alloc(size);
+	if (!new)
+		return -ENOMEM;
+
+	old = ovsl_dereference(tbl->mask_array);
+	if (old) {
+		int i;
+
+		for (i = 0; i < old->max; i++) {
+			if (ovsl_dereference(old->masks[i]))
+				new->masks[new->count++] = old->masks[i];
+		}
+		call_rcu(&old->rcu, mask_array_rcu_cb);
+	}
+
+	rcu_assign_pointer(tbl->mask_array, new);
+
+	return 0;
+}
+
+static int tbl_mask_array_add_mask(struct flow_table *tbl,
+				   struct sw_flow_mask *new)
+{
+	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+	int err, ma_count = READ_ONCE(ma->count);
+
+	if (ma_count >= ma->max) {
+		err = tbl_mask_array_realloc(tbl, ma->max +
+					     MASK_ARRAY_SIZE_MIN);
+		if (err)
+			return err;
+
+		ma = ovsl_dereference(tbl->mask_array);
+	} else {
+		/* On every add or delete we need to reset the counters so
+		 * every new mask gets a fair chance of being prioritized.
+		 */
+		tbl_mask_array_reset_counters(ma);
+	}
+
+	BUG_ON(ovsl_dereference(ma->masks[ma_count]));
+
+	rcu_assign_pointer(ma->masks[ma_count], new);
+	WRITE_ONCE(ma->count, ma_count + 1);
+
+	return 0;
+}
+
+static void tbl_mask_array_del_mask(struct flow_table *tbl,
+				    struct sw_flow_mask *mask)
+{
+	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+	int i, ma_count = READ_ONCE(ma->count);
+
+	/* Remove the deleted mask pointers from the array */
+	for (i = 0; i < ma_count; i++) {
+		if (mask == ovsl_dereference(ma->masks[i]))
+			goto found;
+	}
+
+	BUG();
+	return;
+
+found:
+	WRITE_ONCE(ma->count, ma_count - 1);
+
+	rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
+	RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
+
+	kfree_rcu(mask, rcu);
+
+	/* Shrink the mask array if necessary. */
+	if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
+	    ma_count <= (ma->max / 3))
+		tbl_mask_array_realloc(tbl, ma->max / 2);
+	else
+		tbl_mask_array_reset_counters(ma);
+
+}
+
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+	if (mask) {
+		/* ovs-lock is required to protect mask-refcount and
+		 * mask list.
+		 */
+		ASSERT_OVSL();
+		BUG_ON(!mask->ref_count);
+		mask->ref_count--;
+
+		if (!mask->ref_count)
+			tbl_mask_array_del_mask(tbl, mask);
+	}
+}
+
+static void __mask_cache_destroy(struct mask_cache *mc)
+{
+	free_percpu(mc->mask_cache);
+	kfree(mc);
+}
+
+static void mask_cache_rcu_cb(struct rcu_head *rcu)
+{
+	struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);
+
+	__mask_cache_destroy(mc);
+}
+
+static struct mask_cache *tbl_mask_cache_alloc(u32 size)
+{
+	struct mask_cache_entry __percpu *cache = NULL;
+	struct mask_cache *new;
+
+	/* Only allow size to be 0, or a power of 2, and does not exceed
+	 * percpu allocation size.
+	 */
+	if ((!is_power_of_2(size) && size != 0) ||
+	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+		return NULL;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->cache_size = size;
+	if (new->cache_size > 0) {
+		cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
+						  new->cache_size),
+				       __alignof__(struct mask_cache_entry));
+		if (!cache) {
+			kfree(new);
+			return NULL;
+		}
+	}
+
+	new->mask_cache = cache;
+	return new;
+}
+int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
+{
+	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
+	struct mask_cache *new;
+
+	if (size == mc->cache_size)
+		return 0;
+
+	if ((!is_power_of_2(size) && size != 0) ||
+	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
+		return -EINVAL;
+
+	new = tbl_mask_cache_alloc(size);
+	if (!new)
+		return -ENOMEM;
+
+	rcu_assign_pointer(table->mask_cache, new);
+	call_rcu(&mc->rcu, mask_cache_rcu_cb);
+
+	return 0;
+}
+
 int ovs_flow_tbl_init(struct flow_table *table)
 {
 	struct table_instance *ti, *ufid_ti;
+	struct mask_cache *mc;
+	struct mask_array *ma;
+
+	mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
+	if (!mc)
+		return -ENOMEM;
+
+	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
+	if (!ma)
+		goto free_mask_cache;
 
 	ti = table_instance_alloc(TBL_MIN_BUCKETS);
-
 	if (!ti)
-		return -ENOMEM;
+		goto free_mask_array;
 
 	ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
 	if (!ufid_ti)
@@ -216,7 +434,8 @@
 
 	rcu_assign_pointer(table->ti, ti);
 	rcu_assign_pointer(table->ufid_ti, ufid_ti);
-	INIT_LIST_HEAD(&table->mask_list);
+	rcu_assign_pointer(table->mask_array, ma);
+	rcu_assign_pointer(table->mask_cache, mc);
 	table->last_rehash = jiffies;
 	table->count = 0;
 	table->ufid_count = 0;
@@ -224,52 +443,70 @@
 
 free_ti:
 	__table_instance_destroy(ti);
+free_mask_array:
+	__mask_array_destroy(ma);
+free_mask_cache:
+	__mask_cache_destroy(mc);
 	return -ENOMEM;
 }
 
 static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
 {
-	struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+	struct table_instance *ti;
 
+	ti = container_of(rcu, struct table_instance, rcu);
 	__table_instance_destroy(ti);
 }
 
-static void table_instance_destroy(struct table_instance *ti,
-				   struct table_instance *ufid_ti,
-				   bool deferred)
+static void table_instance_flow_free(struct flow_table *table,
+				     struct table_instance *ti,
+				     struct table_instance *ufid_ti,
+				     struct sw_flow *flow)
+{
+	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
+	table->count--;
+
+	if (ovs_identifier_is_ufid(&flow->id)) {
+		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
+		table->ufid_count--;
+	}
+
+	flow_mask_remove(table, flow->mask);
+}
+
+/* Must be called with OVS mutex held. */
+void table_instance_flow_flush(struct flow_table *table,
+			       struct table_instance *ti,
+			       struct table_instance *ufid_ti)
 {
 	int i;
 
-	if (!ti)
-		return;
-
-	BUG_ON(!ufid_ti);
-	if (ti->keep_flows)
-		goto skip_flows;
-
 	for (i = 0; i < ti->n_buckets; i++) {
-		struct sw_flow *flow;
-		struct hlist_head *head = flex_array_get(ti->buckets, i);
+		struct hlist_head *head = &ti->buckets[i];
 		struct hlist_node *n;
-		int ver = ti->node_ver;
-		int ufid_ver = ufid_ti->node_ver;
+		struct sw_flow *flow;
 
-		hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
-			hlist_del_rcu(&flow->flow_table.node[ver]);
-			if (ovs_identifier_is_ufid(&flow->id))
-				hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
-			ovs_flow_free(flow, deferred);
+		hlist_for_each_entry_safe(flow, n, head,
					  flow_table.node[ti->node_ver]) {
+
+			table_instance_flow_free(table, ti, ufid_ti,
+						 flow);
+			ovs_flow_free(flow, true);
 		}
 	}
 
-skip_flows:
-	if (deferred) {
-		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
-		call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
-	} else {
-		__table_instance_destroy(ti);
-		__table_instance_destroy(ufid_ti);
+	if (WARN_ON(table->count != 0 ||
+		    table->ufid_count != 0)) {
+		table->count = 0;
+		table->ufid_count = 0;
 	}
+}
+
+static void table_instance_destroy(struct table_instance *ti,
+				   struct table_instance *ufid_ti)
+{
+	call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+	call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
 }
 
 /* No need for locking this function is called from RCU callback or
@@ -279,8 +516,12 @@
 {
 	struct table_instance *ti = rcu_dereference_raw(table->ti);
 	struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
+	struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
+	struct mask_array *ma = rcu_dereference_raw(table->mask_array);
 
-	table_instance_destroy(ti, ufid_ti, false);
+	call_rcu(&mc->rcu, mask_cache_rcu_cb);
+	call_rcu(&ma->rcu, mask_array_rcu_cb);
+	table_instance_destroy(ti, ufid_ti);
 }
 
 struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -294,7 +535,7 @@
 	ver = ti->node_ver;
 	while (*bucket < ti->n_buckets) {
 		i = 0;
-		head = flex_array_get(ti->buckets, *bucket);
+		head = &ti->buckets[*bucket];
 		hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
 			if (i < *last) {
 				i++;
@@ -313,8 +554,7 @@
 static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
 {
 	hash = jhash_1word(hash, ti->hash_seed);
-	return flex_array_get(ti->buckets,
-			      (hash & (ti->n_buckets - 1)));
+	return &ti->buckets[hash & (ti->n_buckets - 1)];
 }
 
 static void table_instance_insert(struct table_instance *ti,
@@ -347,21 +587,19 @@
 	/* Insert in new table. */
 	for (i = 0; i < old->n_buckets; i++) {
 		struct sw_flow *flow;
-		struct hlist_head *head;
-
-		head = flex_array_get(old->buckets, i);
+		struct hlist_head *head = &old->buckets[i];
 
 		if (ufid)
-			hlist_for_each_entry(flow, head,
-					     ufid_table.node[old_ver])
+			hlist_for_each_entry_rcu(flow, head,
+						 ufid_table.node[old_ver],
+						 lockdep_ovsl_is_held())
 				ufid_table_instance_insert(new, flow);
 		else
-			hlist_for_each_entry(flow, head,
-					     flow_table.node[old_ver])
+			hlist_for_each_entry_rcu(flow, head,
+						 flow_table.node[old_ver],
+						 lockdep_ovsl_is_held())
 				table_instance_insert(new, flow);
 	}
-
-	old->keep_flows = true;
 }
 
 static struct table_instance *table_instance_rehash(struct table_instance *ti,
@@ -396,10 +634,9 @@
 	rcu_assign_pointer(flow_table->ti, new_ti);
 	rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
 	flow_table->last_rehash = jiffies;
-	flow_table->count = 0;
-	flow_table->ufid_count = 0;
 
-	table_instance_destroy(old_ti, old_ufid_ti, true);
+	table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
+	table_instance_destroy(old_ti, old_ufid_ti);
 	return 0;
 
 err_free_ti:
@@ -410,13 +647,10 @@
 static u32 flow_hash(const struct sw_flow_key *key,
 		     const struct sw_flow_key_range *range)
 {
-	int key_start = range->start;
-	int key_end = range->end;
-	const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
-	int hash_u32s = (key_end - key_start) >> 2;
+	const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);
 
 	/* Make sure number of hash bytes are multiple of u32. */
-	BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+	int hash_u32s = range_n_bytes(range) >> 2;
 
 	return jhash2(hash_key, hash_u32s, 0);
 }
@@ -427,7 +661,7 @@
 		return 0;
 	else
 		return rounddown(offsetof(struct sw_flow_key, phy),
-			    sizeof(long));
+				 sizeof(long));
 }
 
 static bool cmp_key(const struct sw_flow_key *key1,
@@ -439,7 +673,7 @@
 	long diffs = 0;
 	int i;
 
-	for (i = key_start; i < key_end;  i += sizeof(long))
+	for (i = key_start; i < key_end; i += sizeof(long))
 		diffs |= *cp1++ ^ *cp2++;
 
 	return diffs == 0;
@@ -465,7 +699,8 @@
 
 static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
 					  const struct sw_flow_key *unmasked,
-					  const struct sw_flow_mask *mask)
+					  const struct sw_flow_mask *mask,
+					  u32 *n_mask_hit)
 {
 	struct sw_flow *flow;
 	struct hlist_head *head;
@@ -475,7 +710,10 @@
 	ovs_flow_mask_key(&masked_key, unmasked, false, mask);
 	hash = flow_hash(&masked_key, &mask->range);
 	head = find_bucket(ti, hash);
-	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
+	(*n_mask_hit)++;
+
+	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
+				 lockdep_ovsl_is_held()) {
 		if (flow->mask == mask && flow->flow_table.hash == hash &&
 		    flow_cmp_masked_key(flow, &masked_key, &mask->range))
 			return flow;
@@ -483,46 +721,175 @@
 	return NULL;
 }
 
-struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
-					  const struct sw_flow_key *key,
-					  u32 *n_mask_hit)
+/* Flow lookup does full lookup on flow table. It starts with
+ * mask from index passed in *index.
+ * This function MUST be called with BH disabled due to the use
+ * of CPU specific variables.
+ */
+static struct sw_flow *flow_lookup(struct flow_table *tbl,
+				   struct table_instance *ti,
+				   struct mask_array *ma,
+				   const struct sw_flow_key *key,
+				   u32 *n_mask_hit,
+				   u32 *n_cache_hit,
+				   u32 *index)
 {
-	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
-	struct sw_flow_mask *mask;
+	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
 	struct sw_flow *flow;
+	struct sw_flow_mask *mask;
+	int i;
+
+	if (likely(*index < ma->max)) {
+		mask = rcu_dereference_ovsl(ma->masks[*index]);
+		if (mask) {
+			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+			if (flow) {
+				u64_stats_update_begin(&stats->syncp);
+				stats->usage_cntrs[*index]++;
+				u64_stats_update_end(&stats->syncp);
+				(*n_cache_hit)++;
+				return flow;
+			}
+		}
+	}
+
+	for (i = 0; i < ma->max; i++) {
+
+		if (i == *index)
+			continue;
+
+		mask = rcu_dereference_ovsl(ma->masks[i]);
+		if (unlikely(!mask))
+			break;
+
+		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+		if (flow) { /* Found */
+			*index = i;
+			u64_stats_update_begin(&stats->syncp);
+			stats->usage_cntrs[*index]++;
+			u64_stats_update_end(&stats->syncp);
+			return flow;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * mask_cache maps flow to probable mask. This cache is not tightly
+ * coupled cache, It means updates to mask list can result in inconsistent
+ * cache entry in mask cache.
+ * This is per cpu cache and is divided in MC_HASH_SEGS segments.
+ * In case of a hash collision the entry is hashed in next segment.
+ * */
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+					  const struct sw_flow_key *key,
+					  u32 skb_hash,
+					  u32 *n_mask_hit,
+					  u32 *n_cache_hit)
+{
+	struct mask_cache *mc = rcu_dereference(tbl->mask_cache);
+	struct mask_array *ma = rcu_dereference(tbl->mask_array);
+	struct table_instance *ti = rcu_dereference(tbl->ti);
+	struct mask_cache_entry *entries, *ce;
+	struct sw_flow *flow;
+	u32 hash;
+	int seg;
 
 	*n_mask_hit = 0;
-	list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
-		(*n_mask_hit)++;
-		flow = masked_flow_lookup(ti, key, mask);
-		if (flow) /* Found */
-			return flow;
+	*n_cache_hit = 0;
+	if (unlikely(!skb_hash || mc->cache_size == 0)) {
+		u32 mask_index = 0;
+		u32 cache = 0;
+
+		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache,
+				   &mask_index);
 	}
-	return NULL;
+
+	/* Pre and post recirulation flows usually have the same skb_hash
+	 * value. To avoid hash collisions, rehash the 'skb_hash' with
+	 * 'recirc_id'. */
+	if (key->recirc_id)
+		skb_hash = jhash_1word(skb_hash, key->recirc_id);
+
+	ce = NULL;
+	hash = skb_hash;
+	entries = this_cpu_ptr(mc->mask_cache);
+
+	/* Find the cache entry 'ce' to operate on. */
+	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
+		int index = hash & (mc->cache_size - 1);
+		struct mask_cache_entry *e;
+
+		e = &entries[index];
+		if (e->skb_hash == skb_hash) {
+			flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
+					   n_cache_hit, &e->mask_index);
+			if (!flow)
+				e->skb_hash = 0;
+			return flow;
+		}
+
+		if (!ce || e->skb_hash < ce->skb_hash)
+			ce = e; /* A better replacement cache candidate. */
+
+		hash >>= MC_HASH_SHIFT;
+	}
+
+	/* Cache miss, do full lookup. */
+	flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit,
+			   &ce->mask_index);
+	if (flow)
+		ce->skb_hash = skb_hash;
+
+	*n_cache_hit = 0;
+	return flow;
 }
 
 struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 				    const struct sw_flow_key *key)
 {
+	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
 	u32 __always_unused n_mask_hit;
+	u32 __always_unused n_cache_hit;
+	struct sw_flow *flow;
+	u32 index = 0;
 
-	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+	/* This function gets called trough the netlink interface and therefore
+	 * is preemptible. However, flow_lookup() function needs to be called
+	 * with BH disabled due to CPU specific variables.
+	 */
+	local_bh_disable();
+	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	local_bh_enable();
+	return flow;
 }
 
 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
 					  const struct sw_flow_match *match)
 {
-	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
-	struct sw_flow_mask *mask;
-	struct sw_flow *flow;
+	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+	int i;
 
 	/* Always called under ovs-mutex. */
-	list_for_each_entry(mask, &tbl->mask_list, list) {
-		flow = masked_flow_lookup(ti, match->key, mask);
+	for (i = 0; i < ma->max; i++) {
+		struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+		u32 __always_unused n_mask_hit;
+		struct sw_flow_mask *mask;
+		struct sw_flow *flow;
+
+		mask = ovsl_dereference(ma->masks[i]);
+		if (!mask)
+			continue;
+
+		flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
 		if (flow && ovs_identifier_is_key(&flow->id) &&
-		    ovs_flow_cmp_unmasked_key(flow, match))
+		    ovs_flow_cmp_unmasked_key(flow, match)) {
 			return flow;
+		}
 	}
+
 	return NULL;
 }
 
@@ -540,7 +907,8 @@
 	return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
 }
 
-bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
+bool ovs_flow_cmp(const struct sw_flow *flow,
+		  const struct sw_flow_match *match)
 {
 	if (ovs_identifier_is_ufid(&flow->id))
 		return flow_cmp_masked_key(flow, match->key, &match->range);
@@ -558,7 +926,8 @@
 
 	hash = ufid_hash(ufid);
 	head = find_bucket(ti, hash);
-	hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
+	hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
+				 lockdep_ovsl_is_held()) {
 		if (flow->ufid_table.hash == hash &&
 		    ovs_flow_cmp_ufid(flow, ufid))
 			return flow;
@@ -568,37 +937,21 @@
 
 int ovs_flow_tbl_num_masks(const struct flow_table *table)
 {
-	struct sw_flow_mask *mask;
-	int num = 0;
+	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+	return READ_ONCE(ma->count);
+}
 
-	list_for_each_entry(mask, &table->mask_list, list)
-		num++;
+u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
+{
+	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
 
-	return num;
+	return READ_ONCE(mc->cache_size);
 }
 
 static struct table_instance *table_instance_expand(struct table_instance *ti,
 						    bool ufid)
 {
 	return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
-}
-
-/* Remove 'mask' from the mask list, if it is not needed any more. */
-static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
-{
-	if (mask) {
-		/* ovs-lock is required to protect mask-refcount and
-		 * mask list.
-		 */
-		ASSERT_OVSL();
-		BUG_ON(!mask->ref_count);
-		mask->ref_count--;
-
-		if (!mask->ref_count) {
-			list_del_rcu(&mask->list);
-			kfree_rcu(mask, rcu);
-		}
-	}
 }
 
 /* Must be called with OVS mutex held. */
@@ -608,17 +961,7 @@
 	struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
 
 	BUG_ON(table->count == 0);
-	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
-	table->count--;
-	if (ovs_identifier_is_ufid(&flow->id)) {
-		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
-		table->ufid_count--;
-	}
-
-	/* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
-	 * accessible as long as the RCU read lock is held.
-	 */
-	flow_mask_remove(table, flow->mask);
+	table_instance_flow_free(table, ti, ufid_ti, flow);
 }
 
 static struct sw_flow_mask *mask_alloc(void)
@@ -646,13 +989,16 @@
 static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
 					   const struct sw_flow_mask *mask)
 {
-	struct list_head *ml;
+	struct mask_array *ma;
+	int i;
 
-	list_for_each(ml, &tbl->mask_list) {
-		struct sw_flow_mask *m;
-		m = container_of(ml, struct sw_flow_mask, list);
-		if (mask_equal(mask, m))
-			return m;
+	ma = ovsl_dereference(tbl->mask_array);
+	for (i = 0; i < ma->max; i++) {
+		struct sw_flow_mask *t;
+		t = ovsl_dereference(ma->masks[i]);
+
+		if (t && mask_equal(mask, t))
+			return t;
 	}
 
 	return NULL;
@@ -663,6 +1009,7 @@
 			const struct sw_flow_mask *new)
 {
 	struct sw_flow_mask *mask;
+
 	mask = flow_mask_find(tbl, new);
 	if (!mask) {
 		/* Allocate a new mask if none exsits. */
@@ -671,7 +1018,12 @@
 			return -ENOMEM;
 		mask->key = new->key;
 		mask->range = new->range;
-		list_add_rcu(&mask->list, &tbl->mask_list);
+
+		/* Add mask to mask-list. */
+		if (tbl_mask_array_add_mask(tbl, mask)) {
+			kfree(mask);
+			return -ENOMEM;
+		}
 	} else {
 		BUG_ON(!mask->ref_count);
 		mask->ref_count++;
@@ -743,6 +1095,99 @@
 	return 0;
 }
 
+static int compare_mask_and_count(const void *a, const void *b)
+{
+	const struct mask_count *mc_a = a;
+	const struct mask_count *mc_b = b;
+
+	return (s64)mc_b->counter - (s64)mc_a->counter;
+}
+
+/* Must be called with OVS mutex held. */
+void ovs_flow_masks_rebalance(struct flow_table *table)
+{
+	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+	struct mask_count *masks_and_count;
+	struct mask_array *new;
+	int masks_entries = 0;
+	int i;
+
+	/* Build array of all current entries with use counters. */
+	masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count),
+					GFP_KERNEL);
+	if (!masks_and_count)
+		return;
+
+	for (i = 0; i < ma->max; i++) {
+		struct sw_flow_mask *mask;
+		int cpu;
+
+		mask = rcu_dereference_ovsl(ma->masks[i]);
+		if (unlikely(!mask))
+			break;
+
+		masks_and_count[i].index = i;
+		masks_and_count[i].counter = 0;
+
+		for_each_possible_cpu(cpu) {
+			struct mask_array_stats *stats;
+			unsigned int start;
+			u64 counter;
+
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
+			do {
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp,
+							   start));
+
+			masks_and_count[i].counter += counter;
+		}
+
+		/* Subtract the zero count value. */
+		masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i];
+
+		/* Rather than calling tbl_mask_array_reset_counters()
+		 * below when no change is needed, do it inline here.
+		 */
+		ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter;
+	}
+
+	if (i == 0)
+		goto free_mask_entries;
+
+	/* Sort the entries */
+	masks_entries = i;
+	sort(masks_and_count, masks_entries, sizeof(*masks_and_count),
+	     compare_mask_and_count, NULL);
+
+	/* If the order is the same, nothing to do... */
+	for (i = 0; i < masks_entries; i++) {
+		if (i != masks_and_count[i].index)
+			break;
+	}
+	if (i == masks_entries)
+		goto free_mask_entries;
+
+	/* Rebuilt the new list in order of usage. */
+	new = tbl_mask_array_alloc(ma->max);
+	if (!new)
+		goto free_mask_entries;
+
+	for (i = 0; i < masks_entries; i++) {
+		int index = masks_and_count[i].index;
+
+		if (ovsl_dereference(ma->masks[index]))
+			new->masks[new->count++] = ma->masks[index];
+	}
+
+	rcu_assign_pointer(table->mask_array, new);
+	call_rcu(&ma->rcu, mask_array_rcu_cb);
+
+free_mask_entries:
+	kfree(masks_and_count);
+}
+
 /* Initializes the flow module.
  * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
@@ -752,13 +1197,13 @@
 
 	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
 				       + (nr_cpu_ids
-					  * sizeof(struct flow_stats *)),
+					  * sizeof(struct sw_flow_stats *)),
 				       0, 0, NULL);
 	if (flow_cache == NULL)
 		return -ENOMEM;
 
 	flow_stats_cache
-		= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+		= kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats),
 				    0, SLAB_HWCACHE_ALIGN, NULL);
 	if (flow_stats_cache == NULL) {
 		kmem_cache_destroy(flow_cache);
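
Editorial note, not part of the patch: the per-CPU mask cache introduced above consumes the 32-bit skb_hash in MC_HASH_SHIFT-bit segments, giving MC_HASH_SEGS candidate slots before ovs_flow_tbl_lookup_stats() falls back to a full mask-array walk. The standalone userspace program below is a simplified, hypothetical sketch of that probing scheme only; the struct and helper names (mc_entry, mc_probe) are invented for illustration, and the real kernel code additionally handles per-CPU storage, RCU, recirc_id rehashing and replacement-candidate selection.

/*
 * Simplified, userspace-only sketch of the segmented mask-cache probe.
 * The 32-bit skb_hash is consumed MC_HASH_SHIFT bits at a time, giving
 * MC_HASH_SEGS candidate slots before a full lookup would be needed.
 * All names are hypothetical; this is not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>

#define MC_HASH_ENTRIES	256	/* must be a power of two */
#define MC_HASH_SHIFT	8
#define MC_HASH_SEGS	((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)

struct mc_entry {
	uint32_t skb_hash;	/* 0 means "empty slot" */
	uint32_t mask_index;
};

/* Return the cached mask index for skb_hash, or -1 on a cache miss. */
static int mc_probe(const struct mc_entry *cache, uint32_t skb_hash)
{
	uint32_t hash = skb_hash;
	unsigned int seg;

	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
		const struct mc_entry *e = &cache[hash & (MC_HASH_ENTRIES - 1)];

		if (e->skb_hash == skb_hash)
			return (int)e->mask_index;
		hash >>= MC_HASH_SHIFT;	/* collision: try the next segment */
	}
	return -1;
}

int main(void)
{
	struct mc_entry cache[MC_HASH_ENTRIES] = { { 0, 0 } };

	cache[0x42].skb_hash = 0xdead0042;	/* lands in slot 0x42 of segment 0 */
	cache[0x42].mask_index = 3;

	printf("hit  -> mask %d\n", mc_probe(cache, 0xdead0042));
	printf("miss -> %d\n", mc_probe(cache, 0x12345678));
	return 0;
}

In the kernel code a miss instead picks the probed slot with the smallest skb_hash as the replacement candidate and, after the full mask-array lookup succeeds, stores the winning mask index there so the next packet with the same skb_hash can start from that mask.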