2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/net/bpf_jit_comp.c
@@ -1,21 +1,20 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * bpf_jit_comp.c: BPF JIT compiler
34 *
45 * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
56 * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
6
- *
7
- * This program is free software; you can redistribute it and/or
8
- * modify it under the terms of the GNU General Public License
9
- * as published by the Free Software Foundation; version 2
10
- * of the License.
117 */
128 #include <linux/netdevice.h>
139 #include <linux/filter.h>
1410 #include <linux/if_vlan.h>
1511 #include <linux/bpf.h>
16
-
12
+#include <linux/memory.h>
13
+#include <linux/sort.h>
14
+#include <asm/extable.h>
1715 #include <asm/set_memory.h>
1816 #include <asm/nospec-branch.h>
17
+#include <asm/text-patching.h>
1918
2019 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
2120 {
@@ -100,6 +99,7 @@
10099
101100 /* Pick a register outside of BPF range for JIT internal work */
102101 #define AUX_REG (MAX_BPF_JIT_REG + 1)
102
+#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
103103
104104 /*
105105 * The following table maps BPF registers to x86-64 registers.
@@ -108,8 +108,8 @@
108108 * register in load/store instructions, it always needs an
109109 * extra byte of encoding and is callee saved.
110110 *
111
- * Also x86-64 register R9 is unused. x86-64 register R10 is
112
- * used for blinding (if enabled).
111
+ * x86-64 register R9 is not used by BPF programs, but can be used by BPF
112
+ * trampoline. x86-64 register R10 is used for blinding (if enabled).
113113 */
114114 static const int reg2hex[] = {
115115 [BPF_REG_0] = 0, /* RAX */
@@ -125,6 +125,20 @@
125125 [BPF_REG_FP] = 5, /* RBP readonly */
126126 [BPF_REG_AX] = 2, /* R10 temp register */
127127 [AUX_REG] = 3, /* R11 temp register */
128
+ [X86_REG_R9] = 1, /* R9 register, 6th function argument */
129
+};
130
+
131
+static const int reg2pt_regs[] = {
132
+ [BPF_REG_0] = offsetof(struct pt_regs, ax),
133
+ [BPF_REG_1] = offsetof(struct pt_regs, di),
134
+ [BPF_REG_2] = offsetof(struct pt_regs, si),
135
+ [BPF_REG_3] = offsetof(struct pt_regs, dx),
136
+ [BPF_REG_4] = offsetof(struct pt_regs, cx),
137
+ [BPF_REG_5] = offsetof(struct pt_regs, r8),
138
+ [BPF_REG_6] = offsetof(struct pt_regs, bx),
139
+ [BPF_REG_7] = offsetof(struct pt_regs, r13),
140
+ [BPF_REG_8] = offsetof(struct pt_regs, r14),
141
+ [BPF_REG_9] = offsetof(struct pt_regs, r15),
128142 };
129143
130144 /*
....@@ -139,6 +153,7 @@
139153 BIT(BPF_REG_7) |
140154 BIT(BPF_REG_8) |
141155 BIT(BPF_REG_9) |
156
+ BIT(X86_REG_R9) |
142157 BIT(BPF_REG_AX));
143158 }
144159
@@ -197,36 +212,206 @@
197212
198213 struct jit_context {
199214 int cleanup_addr; /* Epilogue code offset */
215
+
216
+ /*
217
+ * Program specific offsets of labels in the code; these rely on the
218
+ * JIT doing at least 2 passes, recording the position on the first
219
+ * pass, only to generate the correct offset on the second pass.
220
+ */
221
+ int tail_call_direct_label;
222
+ int tail_call_indirect_label;
200223 };
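
A minimal sketch (not kernel code; assumes the kernel's u8 type and the field names above) of how these recorded label offsets are consumed: a short conditional jump to "out:" is encoded against the offset captured on the previous JIT pass, which becomes exact once the multi-pass loop converges.

static u8 *emit_short_jcc_to_out(u8 *prog, u8 *start, u8 jcc_opcode,
				 const struct jit_context *ctx)
{
	/* rel8 is measured from the end of the 2-byte jcc */
	int rel8 = ctx->tail_call_indirect_label - ((prog + 2) - start);

	*prog++ = jcc_opcode;		/* e.g. X86_JBE, X86_JA, X86_JE */
	*prog++ = (u8)rel8;
	return prog;
}
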
201224
202225 /* Maximum number of bytes emitted while JITing one eBPF insn */
203226 #define BPF_MAX_INSN_SIZE 128
204227 #define BPF_INSN_SAFETY 64
205228
206
-#define PROLOGUE_SIZE 20
229
+/* Number of bytes emit_patch() needs to generate instructions */
230
+#define X86_PATCH_SIZE 5
231
+/* Number of bytes that will be skipped on tailcall */
232
+#define X86_TAIL_CALL_OFFSET 11
207233
208
-/*
209
- * Emit x86-64 prologue code for BPF program and check its size.
210
- * bpf_tail_call helper will skip it while jumping into another program
211
- */
212
-static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
234
+static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
213235 {
214236 u8 *prog = *pprog;
215237 int cnt = 0;
216238
239
+ if (callee_regs_used[0])
240
+ EMIT1(0x53); /* push rbx */
241
+ if (callee_regs_used[1])
242
+ EMIT2(0x41, 0x55); /* push r13 */
243
+ if (callee_regs_used[2])
244
+ EMIT2(0x41, 0x56); /* push r14 */
245
+ if (callee_regs_used[3])
246
+ EMIT2(0x41, 0x57); /* push r15 */
247
+ *pprog = prog;
248
+}
249
+
250
+static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
251
+{
252
+ u8 *prog = *pprog;
253
+ int cnt = 0;
254
+
255
+ if (callee_regs_used[3])
256
+ EMIT2(0x41, 0x5F); /* pop r15 */
257
+ if (callee_regs_used[2])
258
+ EMIT2(0x41, 0x5E); /* pop r14 */
259
+ if (callee_regs_used[1])
260
+ EMIT2(0x41, 0x5D); /* pop r13 */
261
+ if (callee_regs_used[0])
262
+ EMIT1(0x5B); /* pop rbx */
263
+ *pprog = prog;
264
+}
265
+
266
+/*
267
+ * Emit x86-64 prologue code for BPF program.
268
+ * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
269
+ * while jumping to another program
270
+ */
271
+static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
272
+ bool tail_call_reachable, bool is_subprog)
273
+{
274
+ u8 *prog = *pprog;
275
+ int cnt = X86_PATCH_SIZE;
276
+
277
+ /* BPF trampoline can be made to work without these nops,
278
+ * but let's waste 5 bytes for now and optimize later
279
+ */
280
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
281
+ prog += cnt;
282
+ if (!ebpf_from_cbpf) {
283
+ if (tail_call_reachable && !is_subprog)
284
+ EMIT2(0x31, 0xC0); /* xor eax, eax */
285
+ else
286
+ EMIT2(0x66, 0x90); /* nop2 */
287
+ }
217288 EMIT1(0x55); /* push rbp */
218289 EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
219290 /* sub rsp, rounded_stack_depth */
220
- EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
221
- EMIT1(0x53); /* push rbx */
222
- EMIT2(0x41, 0x55); /* push r13 */
223
- EMIT2(0x41, 0x56); /* push r14 */
224
- EMIT2(0x41, 0x57); /* push r15 */
225
- if (!ebpf_from_cbpf) {
226
- /* zero init tail_call_cnt */
227
- EMIT2(0x6a, 0x00);
228
- BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
291
+ if (stack_depth)
292
+ EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
293
+ if (tail_call_reachable)
294
+ EMIT1(0x50); /* push rax */
295
+ *pprog = prog;
296
+}
297
+
298
+static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
299
+{
300
+ u8 *prog = *pprog;
301
+ int cnt = 0;
302
+ s64 offset;
303
+
304
+ offset = func - (ip + X86_PATCH_SIZE);
305
+ if (!is_simm32(offset)) {
306
+ pr_err("Target call %p is out of range\n", func);
307
+ return -ERANGE;
229308 }
309
+ EMIT1_off32(opcode, offset);
310
+ *pprog = prog;
311
+ return 0;
312
+}
313
+
314
+static int emit_call(u8 **pprog, void *func, void *ip)
315
+{
316
+ return emit_patch(pprog, func, ip, 0xE8);
317
+}
318
+
319
+static int emit_jump(u8 **pprog, void *func, void *ip)
320
+{
321
+ return emit_patch(pprog, func, ip, 0xE9);
322
+}
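
A standalone sketch of the displacement math emit_patch() relies on: a 5-byte x86-64 call/jmp rel32 encodes its target relative to the end of the instruction (ip + X86_PATCH_SIZE). The helper name below is illustrative, not kernel API.

#include <stdint.h>
#include <string.h>

/* opcode 0xE8 = call rel32, 0xE9 = jmp rel32 (as in emit_call()/emit_jump()) */
static int encode_rel32(uint8_t insn[5], uint8_t opcode,
			const void *ip, const void *target)
{
	int64_t off = (const uint8_t *)target - ((const uint8_t *)ip + 5);
	int32_t rel = (int32_t)off;

	if (off != rel)			/* mirrors the is_simm32() check above */
		return -1;
	insn[0] = opcode;
	memcpy(insn + 1, &rel, sizeof(rel));
	return 0;
}
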
323
+
324
+static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
325
+ void *old_addr, void *new_addr,
326
+ const bool text_live)
327
+{
328
+ const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
329
+ u8 old_insn[X86_PATCH_SIZE];
330
+ u8 new_insn[X86_PATCH_SIZE];
331
+ u8 *prog;
332
+ int ret;
333
+
334
+ memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
335
+ if (old_addr) {
336
+ prog = old_insn;
337
+ ret = t == BPF_MOD_CALL ?
338
+ emit_call(&prog, old_addr, ip) :
339
+ emit_jump(&prog, old_addr, ip);
340
+ if (ret)
341
+ return ret;
342
+ }
343
+
344
+ memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
345
+ if (new_addr) {
346
+ prog = new_insn;
347
+ ret = t == BPF_MOD_CALL ?
348
+ emit_call(&prog, new_addr, ip) :
349
+ emit_jump(&prog, new_addr, ip);
350
+ if (ret)
351
+ return ret;
352
+ }
353
+
354
+ ret = -EBUSY;
355
+ mutex_lock(&text_mutex);
356
+ if (memcmp(ip, old_insn, X86_PATCH_SIZE))
357
+ goto out;
358
+ ret = 1;
359
+ if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
360
+ if (text_live)
361
+ text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
362
+ else
363
+ memcpy(ip, new_insn, X86_PATCH_SIZE);
364
+ ret = 0;
365
+ }
366
+out:
367
+ mutex_unlock(&text_mutex);
368
+ return ret;
369
+}
370
+
371
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
372
+ void *old_addr, void *new_addr)
373
+{
374
+ if (!is_kernel_text((long)ip) &&
375
+ !is_bpf_text_address((long)ip))
376
+ /* BPF poking in modules is not supported */
377
+ return -EINVAL;
378
+
379
+ return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
380
+}
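
A hedged usage sketch (hypothetical helper, not taken from this diff) of the transitions bpf_arch_text_poke() supports at a 5-byte patch site; a NULL address stands for the ideal 5-byte nop.

static int poke_transitions_sketch(void *ip, void *tramp, void *tramp2)
{
	int err;

	/* nop -> call tramp */
	err = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, tramp);
	if (err < 0)
		return err;
	/* call tramp -> call tramp2 (old_addr must match what is there now) */
	err = bpf_arch_text_poke(ip, BPF_MOD_CALL, tramp, tramp2);
	if (err < 0)
		return err;
	/* call tramp2 -> nop */
	return bpf_arch_text_poke(ip, BPF_MOD_CALL, tramp2, NULL);
}
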
381
+
382
+#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8)
383
+
384
+static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
385
+{
386
+ u8 *prog = *pprog;
387
+ int cnt = 0;
388
+
389
+#ifdef CONFIG_RETPOLINE
390
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
391
+ EMIT_LFENCE();
392
+ EMIT2(0xFF, 0xE0 + reg);
393
+ } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
394
+ emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
395
+ } else
396
+#endif
397
+ EMIT2(0xFF, 0xE0 + reg);
398
+
399
+ *pprog = prog;
400
+}
401
+
402
+static void emit_return(u8 **pprog, u8 *ip)
403
+{
404
+ u8 *prog = *pprog;
405
+ int cnt = 0;
406
+
407
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
408
+ emit_jump(&prog, &__x86_return_thunk, ip);
409
+ } else {
410
+ EMIT1(0xC3); /* ret */
411
+ if (IS_ENABLED(CONFIG_SLS))
412
+ EMIT1(0xCC); /* int3 */
413
+ }
414
+
230415 *pprog = prog;
231416 }
232417
@@ -244,11 +429,13 @@
244429 * goto *(prog->bpf_func + prologue_size);
245430 * out:
246431 */
247
-static void emit_bpf_tail_call(u8 **pprog)
432
+static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
433
+ u32 stack_depth, u8 *ip,
434
+ struct jit_context *ctx)
248435 {
249
- u8 *prog = *pprog;
250
- int label1, label2, label3;
251
- int cnt = 0;
436
+ int tcc_off = -4 - round_up(stack_depth, 8);
437
+ u8 *prog = *pprog, *start = *pprog;
438
+ int cnt = 0, offset;
252439
253440 /*
254441 * rdi - pointer to ctx
@@ -263,52 +450,143 @@
263450 EMIT2(0x89, 0xD2); /* mov edx, edx */
264451 EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
265452 offsetof(struct bpf_array, map.max_entries));
266
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
267
- EMIT2(X86_JBE, OFFSET1); /* jbe out */
268
- label1 = cnt;
453
+
454
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
455
+ EMIT2(X86_JBE, offset); /* jbe out */
269456
270457 /*
271458 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
272459 * goto out;
273460 */
274
- EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
461
+ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
275462 EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
276
-#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE)
277
- EMIT2(X86_JA, OFFSET2); /* ja out */
278
- label2 = cnt;
463
+
464
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
465
+ EMIT2(X86_JA, offset); /* ja out */
279466 EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
280
- EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */
467
+ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
281468
282469 /* prog = array->ptrs[index]; */
283
- EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
470
+ EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
284471 offsetof(struct bpf_array, ptrs));
285472
286473 /*
287474 * if (prog == NULL)
288475 * goto out;
289476 */
290
- EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
291
-#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
292
- EMIT2(X86_JE, OFFSET3); /* je out */
293
- label3 = cnt;
477
+ EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */
294478
295
- /* goto *(prog->bpf_func + prologue_size); */
296
- EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */
479
+ offset = ctx->tail_call_indirect_label - (prog + 2 - start);
480
+ EMIT2(X86_JE, offset); /* je out */
481
+
482
+ pop_callee_regs(&prog, callee_regs_used);
483
+
484
+ EMIT1(0x58); /* pop rax */
485
+ if (stack_depth)
486
+ EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */
487
+ round_up(stack_depth, 8));
488
+
489
+ /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */
490
+ EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */
297491 offsetof(struct bpf_prog, bpf_func));
298
- EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */
299
-
492
+ EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */
493
+ X86_TAIL_CALL_OFFSET);
300494 /*
301
- * Wow we're ready to jump into next BPF program
495
+ * Now we're ready to jump into next BPF program
302496 * rdi == ctx (1st arg)
303
- * rax == prog->bpf_func + prologue_size
497
+ * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
304498 */
305
- RETPOLINE_RAX_BPF_JIT();
499
+ emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));
306500
307501 /* out: */
308
- BUILD_BUG_ON(cnt - label1 != OFFSET1);
309
- BUILD_BUG_ON(cnt - label2 != OFFSET2);
310
- BUILD_BUG_ON(cnt - label3 != OFFSET3);
502
+ ctx->tail_call_indirect_label = prog - start;
311503 *pprog = prog;
504
+}
505
+
506
+static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
507
+ u8 **pprog, u8 *ip,
508
+ bool *callee_regs_used, u32 stack_depth,
509
+ struct jit_context *ctx)
510
+{
511
+ int tcc_off = -4 - round_up(stack_depth, 8);
512
+ u8 *prog = *pprog, *start = *pprog;
513
+ int cnt = 0, offset;
514
+
515
+ /*
516
+ * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
517
+ * goto out;
518
+ */
519
+ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
520
+ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
521
+
522
+ offset = ctx->tail_call_direct_label - (prog + 2 - start);
523
+ EMIT2(X86_JA, offset); /* ja out */
524
+ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
525
+ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
526
+
527
+ poke->tailcall_bypass = ip + (prog - start);
528
+ poke->adj_off = X86_TAIL_CALL_OFFSET;
529
+ poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE;
530
+ poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE;
531
+
532
+ emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
533
+ poke->tailcall_bypass);
534
+
535
+ pop_callee_regs(&prog, callee_regs_used);
536
+ EMIT1(0x58); /* pop rax */
537
+ if (stack_depth)
538
+ EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
539
+
540
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
541
+ prog += X86_PATCH_SIZE;
542
+
543
+ /* out: */
544
+ ctx->tail_call_direct_label = prog - start;
545
+
546
+ *pprog = prog;
547
+}
548
+
549
+static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
550
+{
551
+ struct bpf_jit_poke_descriptor *poke;
552
+ struct bpf_array *array;
553
+ struct bpf_prog *target;
554
+ int i, ret;
555
+
556
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
557
+ poke = &prog->aux->poke_tab[i];
558
+ WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
559
+
560
+ if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
561
+ continue;
562
+
563
+ array = container_of(poke->tail_call.map, struct bpf_array, map);
564
+ mutex_lock(&array->aux->poke_mutex);
565
+ target = array->ptrs[poke->tail_call.key];
566
+ if (target) {
567
+ /* Plain memcpy is used when image is not live yet
568
+ * and still not locked as read-only. Once poke
569
+ * location is active (poke->tailcall_target_stable),
570
+ * any parallel bpf_arch_text_poke() might occur
571
+ * still on the read-write image until we finally
572
+ * locked it as read-only. Both modifications on
573
+ * the given image are under text_mutex to avoid
574
+ * interference.
575
+ */
576
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
577
+ BPF_MOD_JUMP, NULL,
578
+ (u8 *)target->bpf_func +
579
+ poke->adj_off, false);
580
+ BUG_ON(ret < 0);
581
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
582
+ BPF_MOD_JUMP,
583
+ (u8 *)poke->tailcall_target +
584
+ X86_PATCH_SIZE, NULL, false);
585
+ BUG_ON(ret < 0);
586
+ }
587
+ WRITE_ONCE(poke->tailcall_target_stable, true);
588
+ mutex_unlock(&array->aux->poke_mutex);
589
+ }
312590 }
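
For reference, a C-level sketch (assumes kernel headers; this is not the emitted code) of the sequence both tail-call variants above implement, with tail_call_cnt living in the stack slot at rbp - round_up(stack_depth, 8) - 4:

static void *tail_call_target_sketch(struct bpf_array *array, u32 index,
				     u32 *tail_call_cnt)
{
	struct bpf_prog *prog;

	if (index >= array->map.max_entries)
		return NULL;				/* jbe out */
	if (*tail_call_cnt > MAX_TAIL_CALL_CNT)
		return NULL;				/* ja out */
	(*tail_call_cnt)++;
	prog = array->ptrs[index];
	if (!prog)
		return NULL;				/* je out */
	/* skip the 11 prologue bytes: nop5, xor/nop2, push rbp, mov rbp,rsp */
	return (u8 *)prog->bpf_func + X86_TAIL_CALL_OFFSET;
}
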
313591
314592 static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
@@ -394,21 +672,141 @@
394672 *pprog = prog;
395673 }
396674
675
+/* LDX: dst_reg = *(u8*)(src_reg + off) */
676
+static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
677
+{
678
+ u8 *prog = *pprog;
679
+ int cnt = 0;
680
+
681
+ switch (size) {
682
+ case BPF_B:
683
+ /* Emit 'movzx rax, byte ptr [rax + off]' */
684
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
685
+ break;
686
+ case BPF_H:
687
+ /* Emit 'movzx rax, word ptr [rax + off]' */
688
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
689
+ break;
690
+ case BPF_W:
691
+ /* Emit 'mov eax, dword ptr [rax+0x14]' */
692
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
693
+ EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
694
+ else
695
+ EMIT1(0x8B);
696
+ break;
697
+ case BPF_DW:
698
+ /* Emit 'mov rax, qword ptr [rax+0x14]' */
699
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
700
+ break;
701
+ }
702
+ /*
703
+ * If insn->off == 0 we can save one extra byte, but
704
+ * special case of x86 R13 which always needs an offset
705
+ * is not worth the hassle
706
+ */
707
+ if (is_imm8(off))
708
+ EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
709
+ else
710
+ EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
711
+ *pprog = prog;
712
+}
713
+
714
+/* STX: *(u8*)(dst_reg + off) = src_reg */
715
+static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
716
+{
717
+ u8 *prog = *pprog;
718
+ int cnt = 0;
719
+
720
+ switch (size) {
721
+ case BPF_B:
722
+ /* Emit 'mov byte ptr [rax + off], al' */
723
+ if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
724
+ /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
725
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
726
+ else
727
+ EMIT1(0x88);
728
+ break;
729
+ case BPF_H:
730
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
731
+ EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
732
+ else
733
+ EMIT2(0x66, 0x89);
734
+ break;
735
+ case BPF_W:
736
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
737
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
738
+ else
739
+ EMIT1(0x89);
740
+ break;
741
+ case BPF_DW:
742
+ EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
743
+ break;
744
+ }
745
+ if (is_imm8(off))
746
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
747
+ else
748
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
749
+ *pprog = prog;
750
+}
751
+
752
+static bool ex_handler_bpf(const struct exception_table_entry *x,
753
+ struct pt_regs *regs, int trapnr,
754
+ unsigned long error_code, unsigned long fault_addr)
755
+{
756
+ u32 reg = x->fixup >> 8;
757
+
758
+ /* jump over faulting load and clear dest register */
759
+ *(unsigned long *)((void *)regs + reg) = 0;
760
+ regs->ip += x->fixup & 0xff;
761
+ return true;
762
+}
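
A small sketch (hypothetical helper; u32 is the kernel type) of the ->fixup encoding that ex_handler_bpf() decodes and that do_jit() builds further down: the low 8 bits hold the length of the faulting x86 load to skip, the upper bits hold the pt_regs offset of the destination register to clear.

static u32 bpf_ex_fixup_sketch(u32 x86_insn_len, u32 dst_bpf_reg)
{
	/* same shape as: ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8) */
	return x86_insn_len | (reg2pt_regs[dst_bpf_reg] << 8);
}
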
763
+
764
+static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
765
+ bool *regs_used, bool *tail_call_seen)
766
+{
767
+ int i;
768
+
769
+ for (i = 1; i <= insn_cnt; i++, insn++) {
770
+ if (insn->code == (BPF_JMP | BPF_TAIL_CALL))
771
+ *tail_call_seen = true;
772
+ if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6)
773
+ regs_used[0] = true;
774
+ if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7)
775
+ regs_used[1] = true;
776
+ if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8)
777
+ regs_used[2] = true;
778
+ if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9)
779
+ regs_used[3] = true;
780
+ }
781
+}
782
+
397783 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
398784 int oldproglen, struct jit_context *ctx)
399785 {
786
+ bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
400787 struct bpf_insn *insn = bpf_prog->insnsi;
788
+ bool callee_regs_used[4] = {};
401789 int insn_cnt = bpf_prog->len;
790
+ bool tail_call_seen = false;
402791 bool seen_exit = false;
403792 u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
404
- int i, cnt = 0;
793
+ int i, cnt = 0, excnt = 0;
405794 int proglen = 0;
406795 u8 *prog = temp;
407796
408
- emit_prologue(&prog, bpf_prog->aux->stack_depth,
409
- bpf_prog_was_classic(bpf_prog));
797
+ detect_reg_usage(insn, insn_cnt, callee_regs_used,
798
+ &tail_call_seen);
410799
411
- for (i = 0; i < insn_cnt; i++, insn++) {
800
+ /* tail call's presence in current prog implies it is reachable */
801
+ tail_call_reachable |= tail_call_seen;
802
+
803
+ emit_prologue(&prog, bpf_prog->aux->stack_depth,
804
+ bpf_prog_was_classic(bpf_prog), tail_call_reachable,
805
+ bpf_prog->aux->func_idx != 0);
806
+ push_callee_regs(&prog, callee_regs_used);
807
+ addrs[0] = prog - temp;
808
+
809
+ for (i = 1; i <= insn_cnt; i++, insn++) {
412810 const s32 imm32 = insn->imm;
413811 u32 dst_reg = insn->dst_reg;
414812 u32 src_reg = insn->src_reg;
@@ -734,8 +1132,7 @@
7341132 /* speculation barrier */
7351133 case BPF_ST | BPF_NOSPEC:
7361134 if (boot_cpu_has(X86_FEATURE_XMM2))
737
- /* Emit 'lfence' */
738
- EMIT3(0x0F, 0xAE, 0xE8);
1135
+ EMIT_LFENCE();
7391136 break;
7401137
7411138 /* ST: *(u8*)(dst_reg + off) = imm */
@@ -770,63 +1167,64 @@
7701167
7711168 /* STX: *(u8*)(dst_reg + off) = src_reg */
7721169 case BPF_STX | BPF_MEM | BPF_B:
773
- /* Emit 'mov byte ptr [rax + off], al' */
774
- if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
775
- /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
776
- EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
777
- else
778
- EMIT1(0x88);
779
- goto stx;
7801170 case BPF_STX | BPF_MEM | BPF_H:
781
- if (is_ereg(dst_reg) || is_ereg(src_reg))
782
- EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
783
- else
784
- EMIT2(0x66, 0x89);
785
- goto stx;
7861171 case BPF_STX | BPF_MEM | BPF_W:
787
- if (is_ereg(dst_reg) || is_ereg(src_reg))
788
- EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
789
- else
790
- EMIT1(0x89);
791
- goto stx;
7921172 case BPF_STX | BPF_MEM | BPF_DW:
793
- EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
794
-stx: if (is_imm8(insn->off))
795
- EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
796
- else
797
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
798
- insn->off);
1173
+ emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
7991174 break;
8001175
8011176 /* LDX: dst_reg = *(u8*)(src_reg + off) */
8021177 case BPF_LDX | BPF_MEM | BPF_B:
803
- /* Emit 'movzx rax, byte ptr [rax + off]' */
804
- EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
805
- goto ldx;
1178
+ case BPF_LDX | BPF_PROBE_MEM | BPF_B:
8061179 case BPF_LDX | BPF_MEM | BPF_H:
807
- /* Emit 'movzx rax, word ptr [rax + off]' */
808
- EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
809
- goto ldx;
1180
+ case BPF_LDX | BPF_PROBE_MEM | BPF_H:
8101181 case BPF_LDX | BPF_MEM | BPF_W:
811
- /* Emit 'mov eax, dword ptr [rax+0x14]' */
812
- if (is_ereg(dst_reg) || is_ereg(src_reg))
813
- EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
814
- else
815
- EMIT1(0x8B);
816
- goto ldx;
1182
+ case BPF_LDX | BPF_PROBE_MEM | BPF_W:
8171183 case BPF_LDX | BPF_MEM | BPF_DW:
818
- /* Emit 'mov rax, qword ptr [rax+0x14]' */
819
- EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
820
-ldx: /*
821
- * If insn->off == 0 we can save one extra byte, but
822
- * special case of x86 R13 which always needs an offset
823
- * is not worth the hassle
824
- */
825
- if (is_imm8(insn->off))
826
- EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
827
- else
828
- EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
829
- insn->off);
1184
+ case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
1185
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
1186
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
1187
+ struct exception_table_entry *ex;
1188
+ u8 *_insn = image + proglen;
1189
+ s64 delta;
1190
+
1191
+ if (!bpf_prog->aux->extable)
1192
+ break;
1193
+
1194
+ if (excnt >= bpf_prog->aux->num_exentries) {
1195
+ pr_err("ex gen bug\n");
1196
+ return -EFAULT;
1197
+ }
1198
+ ex = &bpf_prog->aux->extable[excnt++];
1199
+
1200
+ delta = _insn - (u8 *)&ex->insn;
1201
+ if (!is_simm32(delta)) {
1202
+ pr_err("extable->insn doesn't fit into 32-bit\n");
1203
+ return -EFAULT;
1204
+ }
1205
+ ex->insn = delta;
1206
+
1207
+ delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
1208
+ if (!is_simm32(delta)) {
1209
+ pr_err("extable->handler doesn't fit into 32-bit\n");
1210
+ return -EFAULT;
1211
+ }
1212
+ ex->handler = delta;
1213
+
1214
+ if (dst_reg > BPF_REG_9) {
1215
+ pr_err("verifier error\n");
1216
+ return -EFAULT;
1217
+ }
1218
+ /*
1219
+ * Compute size of x86 insn and its target dest x86 register.
1220
+ * ex_handler_bpf() will use lower 8 bits to adjust
1221
+ * pt_regs->ip to jump over this x86 instruction
1222
+ * and upper bits to figure out which pt_regs to zero out.
1223
+ * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
1224
+ * of 4 bytes will be ignored and rbx will be zero inited.
1225
+ */
1226
+ ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
1227
+ }
8301228 break;
8311229
8321230 /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
@@ -849,17 +1247,31 @@
8491247 /* call */
8501248 case BPF_JMP | BPF_CALL:
8511249 func = (u8 *) __bpf_call_base + imm32;
852
- jmp_offset = func - (image + addrs[i]);
853
- if (!imm32 || !is_simm32(jmp_offset)) {
854
- pr_err("unsupported BPF func %d addr %p image %p\n",
855
- imm32, func, image);
856
- return -EINVAL;
1250
+ if (tail_call_reachable) {
1251
+ /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
1252
+ EMIT3_off32(0x48, 0x8B, 0x85,
1253
+ -round_up(bpf_prog->aux->stack_depth, 8) - 8);
1254
+ if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
1255
+ return -EINVAL;
1256
+ } else {
1257
+ if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
1258
+ return -EINVAL;
8571259 }
858
- EMIT1_off32(0xE8, jmp_offset);
8591260 break;
8601261
8611262 case BPF_JMP | BPF_TAIL_CALL:
862
- emit_bpf_tail_call(&prog);
1263
+ if (imm32)
1264
+ emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
1265
+ &prog, image + addrs[i - 1],
1266
+ callee_regs_used,
1267
+ bpf_prog->aux->stack_depth,
1268
+ ctx);
1269
+ else
1270
+ emit_bpf_tail_call_indirect(&prog,
1271
+ callee_regs_used,
1272
+ bpf_prog->aux->stack_depth,
1273
+ image + addrs[i - 1],
1274
+ ctx);
8631275 break;
8641276
8651277 /* cond jump */
@@ -873,20 +1285,41 @@
8731285 case BPF_JMP | BPF_JSLT | BPF_X:
8741286 case BPF_JMP | BPF_JSGE | BPF_X:
8751287 case BPF_JMP | BPF_JSLE | BPF_X:
1288
+ case BPF_JMP32 | BPF_JEQ | BPF_X:
1289
+ case BPF_JMP32 | BPF_JNE | BPF_X:
1290
+ case BPF_JMP32 | BPF_JGT | BPF_X:
1291
+ case BPF_JMP32 | BPF_JLT | BPF_X:
1292
+ case BPF_JMP32 | BPF_JGE | BPF_X:
1293
+ case BPF_JMP32 | BPF_JLE | BPF_X:
1294
+ case BPF_JMP32 | BPF_JSGT | BPF_X:
1295
+ case BPF_JMP32 | BPF_JSLT | BPF_X:
1296
+ case BPF_JMP32 | BPF_JSGE | BPF_X:
1297
+ case BPF_JMP32 | BPF_JSLE | BPF_X:
8761298 /* cmp dst_reg, src_reg */
877
- EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
878
- add_2reg(0xC0, dst_reg, src_reg));
1299
+ if (BPF_CLASS(insn->code) == BPF_JMP)
1300
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
1301
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
1302
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
1303
+ EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
8791304 goto emit_cond_jmp;
8801305
8811306 case BPF_JMP | BPF_JSET | BPF_X:
1307
+ case BPF_JMP32 | BPF_JSET | BPF_X:
8821308 /* test dst_reg, src_reg */
883
- EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85,
884
- add_2reg(0xC0, dst_reg, src_reg));
1309
+ if (BPF_CLASS(insn->code) == BPF_JMP)
1310
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
1311
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
1312
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
1313
+ EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
8851314 goto emit_cond_jmp;
8861315
8871316 case BPF_JMP | BPF_JSET | BPF_K:
1317
+ case BPF_JMP32 | BPF_JSET | BPF_K:
8881318 /* test dst_reg, imm32 */
889
- EMIT1(add_1mod(0x48, dst_reg));
1319
+ if (BPF_CLASS(insn->code) == BPF_JMP)
1320
+ EMIT1(add_1mod(0x48, dst_reg));
1321
+ else if (is_ereg(dst_reg))
1322
+ EMIT1(add_1mod(0x40, dst_reg));
8901323 EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
8911324 goto emit_cond_jmp;
8921325
@@ -900,8 +1333,31 @@
9001333 case BPF_JMP | BPF_JSLT | BPF_K:
9011334 case BPF_JMP | BPF_JSGE | BPF_K:
9021335 case BPF_JMP | BPF_JSLE | BPF_K:
1336
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
1337
+ case BPF_JMP32 | BPF_JNE | BPF_K:
1338
+ case BPF_JMP32 | BPF_JGT | BPF_K:
1339
+ case BPF_JMP32 | BPF_JLT | BPF_K:
1340
+ case BPF_JMP32 | BPF_JGE | BPF_K:
1341
+ case BPF_JMP32 | BPF_JLE | BPF_K:
1342
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
1343
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
1344
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
1345
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
1346
+ /* test dst_reg, dst_reg to save one extra byte */
1347
+ if (imm32 == 0) {
1348
+ if (BPF_CLASS(insn->code) == BPF_JMP)
1349
+ EMIT1(add_2mod(0x48, dst_reg, dst_reg));
1350
+ else if (is_ereg(dst_reg))
1351
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
1352
+ EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
1353
+ goto emit_cond_jmp;
1354
+ }
1355
+
9031356 /* cmp dst_reg, imm8/32 */
904
- EMIT1(add_1mod(0x48, dst_reg));
1357
+ if (BPF_CLASS(insn->code) == BPF_JMP)
1358
+ EMIT1(add_1mod(0x48, dst_reg));
1359
+ else if (is_ereg(dst_reg))
1360
+ EMIT1(add_1mod(0x40, dst_reg));
9051361
9061362 if (is_imm8(imm32))
9071363 EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
@@ -998,14 +1454,9 @@
9981454 seen_exit = true;
9991455 /* Update cleanup_addr */
10001456 ctx->cleanup_addr = proglen;
1001
- if (!bpf_prog_was_classic(bpf_prog))
1002
- EMIT1(0x5B); /* get rid of tail_call_cnt */
1003
- EMIT2(0x41, 0x5F); /* pop r15 */
1004
- EMIT2(0x41, 0x5E); /* pop r14 */
1005
- EMIT2(0x41, 0x5D); /* pop r13 */
1006
- EMIT1(0x5B); /* pop rbx */
1457
+ pop_callee_regs(&prog, callee_regs_used);
10071458 EMIT1(0xC9); /* leave */
1008
- EMIT1(0xC3); /* ret */
1459
+ emit_return(&prog, image + addrs[i - 1] + (prog - temp));
10091460 break;
10101461
10111462 default:
@@ -1045,7 +1496,506 @@
10451496 addrs[i] = proglen;
10461497 prog = temp;
10471498 }
1499
+
1500
+ if (image && excnt != bpf_prog->aux->num_exentries) {
1501
+ pr_err("extable is not populated\n");
1502
+ return -EFAULT;
1503
+ }
10481504 return proglen;
1505
+}
1506
+
1507
+static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
1508
+ int stack_size)
1509
+{
1510
+ int i;
1511
+ /* Store function arguments to stack.
1512
+ * For a function that accepts two pointers the sequence will be:
1513
+ * mov QWORD PTR [rbp-0x10],rdi
1514
+ * mov QWORD PTR [rbp-0x8],rsi
1515
+ */
1516
+ for (i = 0; i < min(nr_args, 6); i++)
1517
+ emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
1518
+ BPF_REG_FP,
1519
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
1520
+ -(stack_size - i * 8));
1521
+}
1522
+
1523
+static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
1524
+ int stack_size)
1525
+{
1526
+ int i;
1527
+
1528
+ /* Restore function arguments from stack.
1529
+ * For a function that accepts two pointers the sequence will be:
1530
+ * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
1531
+ * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
1532
+ */
1533
+ for (i = 0; i < min(nr_args, 6); i++)
1534
+ emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
1535
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
1536
+ BPF_REG_FP,
1537
+ -(stack_size - i * 8));
1538
+}
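
A concrete layout example derived from the code above, for a hypothetical traced function with two arguments and a saved return value (stack_size = 2*8 + 8 = 24):

/*
 *   [rbp -  8]  return value slot written by emit_stx(..., BPF_REG_0, -8)
 *   [rbp - 16]  arg2 (rsi), i.e. -(stack_size - 1*8)
 *   [rbp - 24]  arg1 (rdi), i.e. -(stack_size - 0*8)
 */
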
1539
+
1540
+static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
1541
+ struct bpf_prog *p, int stack_size, bool save_ret)
1542
+{
1543
+ u8 *prog = *pprog;
1544
+ int cnt = 0;
1545
+
1546
+ if (p->aux->sleepable) {
1547
+ if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
1548
+ return -EINVAL;
1549
+ } else {
1550
+ if (emit_call(&prog, __bpf_prog_enter, prog))
1551
+ return -EINVAL;
1552
+ /* remember prog start time returned by __bpf_prog_enter */
1553
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
1554
+ }
1555
+
1556
+ /* arg1: lea rdi, [rbp - stack_size] */
1557
+ EMIT4(0x48, 0x8D, 0x7D, -stack_size);
1558
+ /* arg2: progs[i]->insnsi for interpreter */
1559
+ if (!p->jited)
1560
+ emit_mov_imm64(&prog, BPF_REG_2,
1561
+ (long) p->insnsi >> 32,
1562
+ (u32) (long) p->insnsi);
1563
+ /* call JITed bpf program or interpreter */
1564
+ if (emit_call(&prog, p->bpf_func, prog))
1565
+ return -EINVAL;
1566
+
1567
+ /*
1568
+ * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
1569
+ * of the previous call which is then passed on the stack to
1570
+ * the next BPF program.
1571
+ *
1572
+ * BPF_TRAMP_FENTRY trampoline may need to return the return
1573
+ * value of BPF_PROG_TYPE_STRUCT_OPS prog.
1574
+ */
1575
+ if (save_ret)
1576
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
1577
+
1578
+ if (p->aux->sleepable) {
1579
+ if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
1580
+ return -EINVAL;
1581
+ } else {
1582
+ /* arg1: mov rdi, progs[i] */
1583
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
1584
+ (u32) (long) p);
1585
+ /* arg2: mov rsi, rbx <- start time in nsec */
1586
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
1587
+ if (emit_call(&prog, __bpf_prog_exit, prog))
1588
+ return -EINVAL;
1589
+ }
1590
+
1591
+ *pprog = prog;
1592
+ return 0;
1593
+}
1594
+
1595
+static void emit_nops(u8 **pprog, unsigned int len)
1596
+{
1597
+ unsigned int i, noplen;
1598
+ u8 *prog = *pprog;
1599
+ int cnt = 0;
1600
+
1601
+ while (len > 0) {
1602
+ noplen = len;
1603
+
1604
+ if (noplen > ASM_NOP_MAX)
1605
+ noplen = ASM_NOP_MAX;
1606
+
1607
+ for (i = 0; i < noplen; i++)
1608
+ EMIT1(ideal_nops[noplen][i]);
1609
+ len -= noplen;
1610
+ }
1611
+
1612
+ *pprog = prog;
1613
+}
1614
+
1615
+static void emit_align(u8 **pprog, u32 align)
1616
+{
1617
+ u8 *target, *prog = *pprog;
1618
+
1619
+ target = PTR_ALIGN(prog, align);
1620
+ if (target != prog)
1621
+ emit_nops(&prog, target - prog);
1622
+
1623
+ *pprog = prog;
1624
+}
1625
+
1626
+static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
1627
+{
1628
+ u8 *prog = *pprog;
1629
+ int cnt = 0;
1630
+ s64 offset;
1631
+
1632
+ offset = func - (ip + 2 + 4);
1633
+ if (!is_simm32(offset)) {
1634
+ pr_err("Target %p is out of range\n", func);
1635
+ return -EINVAL;
1636
+ }
1637
+ EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
1638
+ *pprog = prog;
1639
+ return 0;
1640
+}
1641
+
1642
+static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
1643
+ struct bpf_tramp_progs *tp, int stack_size,
1644
+ bool save_ret)
1645
+{
1646
+ int i;
1647
+ u8 *prog = *pprog;
1648
+
1649
+ for (i = 0; i < tp->nr_progs; i++) {
1650
+ if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size,
1651
+ save_ret))
1652
+ return -EINVAL;
1653
+ }
1654
+ *pprog = prog;
1655
+ return 0;
1656
+}
1657
+
1658
+static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
1659
+ struct bpf_tramp_progs *tp, int stack_size,
1660
+ u8 **branches)
1661
+{
1662
+ u8 *prog = *pprog;
1663
+ int i, cnt = 0;
1664
+
1665
+ /* The first fmod_ret program will receive a garbage return value.
1666
+ * Set this to 0 to avoid confusing the program.
1667
+ */
1668
+ emit_mov_imm32(&prog, false, BPF_REG_0, 0);
1669
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
1670
+ for (i = 0; i < tp->nr_progs; i++) {
1671
+ if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true))
1672
+ return -EINVAL;
1673
+
1674
+ /* mod_ret prog stored return value into [rbp - 8]. Emit:
1675
+ * if (*(u64 *)(rbp - 8) != 0)
1676
+ * goto do_fexit;
1677
+ */
1678
+ /* cmp QWORD PTR [rbp - 0x8], 0x0 */
1679
+ EMIT4(0x48, 0x83, 0x7d, 0xf8); EMIT1(0x00);
1680
+
1681
+ /* Save the location of the branch and Generate 6 nops
1682
+ * (4 bytes for an offset and 2 bytes for the jump) These nops
1683
+ * are replaced with a conditional jump once do_fexit (i.e. the
1684
+ * start of the fexit invocation) is finalized.
1685
+ */
1686
+ branches[i] = prog;
1687
+ emit_nops(&prog, 4 + 2);
1688
+ }
1689
+
1690
+ *pprog = prog;
1691
+ return 0;
1692
+}
1693
+
1694
+static bool is_valid_bpf_tramp_flags(unsigned int flags)
1695
+{
1696
+ if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
1697
+ (flags & BPF_TRAMP_F_SKIP_FRAME))
1698
+ return false;
1699
+
1700
+ /*
1701
+ * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
1702
+ * and it must be used alone.
1703
+ */
1704
+ if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) &&
1705
+ (flags & ~BPF_TRAMP_F_RET_FENTRY_RET))
1706
+ return false;
1707
+
1708
+ return true;
1709
+}
1710
+
1711
+/* Example:
1712
+ * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
1713
+ * its 'struct btf_func_model' will be nr_args=2
1714
+ * The assembly code when eth_type_trans is executing after trampoline:
1715
+ *
1716
+ * push rbp
1717
+ * mov rbp, rsp
1718
+ * sub rsp, 16 // space for skb and dev
1719
+ * push rbx // temp regs to pass start time
1720
+ * mov qword ptr [rbp - 16], rdi // save skb pointer to stack
1721
+ * mov qword ptr [rbp - 8], rsi // save dev pointer to stack
1722
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
1723
+ * mov rbx, rax // remember start time in bpf stats are enabled
1724
+ * lea rdi, [rbp - 16] // R1==ctx of bpf prog
1725
+ * call addr_of_jited_FENTRY_prog
1726
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
1727
+ * mov rsi, rbx // prog start time
1728
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
1729
+ * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack
1730
+ * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack
1731
+ * pop rbx
1732
+ * leave
1733
+ * ret
1734
+ *
1735
+ * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
1736
+ * replaced with 'call generated_bpf_trampoline'. When it returns
1737
+ * eth_type_trans will continue executing with original skb and dev pointers.
1738
+ *
1739
+ * The assembly code when eth_type_trans is called from trampoline:
1740
+ *
1741
+ * push rbp
1742
+ * mov rbp, rsp
1743
+ * sub rsp, 24 // space for skb, dev, return value
1744
+ * push rbx // temp regs to pass start time
1745
+ * mov qword ptr [rbp - 24], rdi // save skb pointer to stack
1746
+ * mov qword ptr [rbp - 16], rsi // save dev pointer to stack
1747
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
1748
+ * mov rbx, rax // remember start time if bpf stats are enabled
1749
+ * lea rdi, [rbp - 24] // R1==ctx of bpf prog
1750
+ * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev
1751
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
1752
+ * mov rsi, rbx // prog start time
1753
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
1754
+ * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack
1755
+ * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack
1756
+ * call eth_type_trans+5 // execute body of eth_type_trans
1757
+ * mov qword ptr [rbp - 8], rax // save return value
1758
+ * call __bpf_prog_enter // rcu_read_lock and preempt_disable
1759
+ * mov rbx, rax // remember start time if bpf stats are enabled
1760
+ * lea rdi, [rbp - 24] // R1==ctx of bpf prog
1761
+ * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value
1762
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
1763
+ * mov rsi, rbx // prog start time
1764
+ * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
1765
+ * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value
1766
+ * pop rbx
1767
+ * leave
1768
+ * add rsp, 8 // skip eth_type_trans's frame
1769
+ * ret // return to its caller
1770
+ */
1771
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
1772
+ const struct btf_func_model *m, u32 flags,
1773
+ struct bpf_tramp_progs *tprogs,
1774
+ void *orig_call)
1775
+{
1776
+ int ret, i, cnt = 0, nr_args = m->nr_args;
1777
+ int stack_size = nr_args * 8;
1778
+ struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
1779
+ struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
1780
+ struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
1781
+ u8 **branches = NULL;
1782
+ u8 *prog;
1783
+ bool save_ret;
1784
+
1785
+ /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
1786
+ if (nr_args > 6)
1787
+ return -ENOTSUPP;
1788
+
1789
+ if (!is_valid_bpf_tramp_flags(flags))
1790
+ return -EINVAL;
1791
+
1792
+ /* room for return value of orig_call or fentry prog */
1793
+ save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
1794
+ if (save_ret)
1795
+ stack_size += 8;
1796
+
1797
+ if (flags & BPF_TRAMP_F_SKIP_FRAME)
1798
+ /* skip patched call instruction and point orig_call to actual
1799
+ * body of the kernel function.
1800
+ */
1801
+ orig_call += X86_PATCH_SIZE;
1802
+
1803
+ prog = image;
1804
+
1805
+ EMIT1(0x55); /* push rbp */
1806
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
1807
+ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
1808
+ EMIT1(0x53); /* push rbx */
1809
+
1810
+ save_regs(m, &prog, nr_args, stack_size);
1811
+
1812
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
1813
+ /* arg1: mov rdi, im */
1814
+ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
1815
+ if (emit_call(&prog, __bpf_tramp_enter, prog)) {
1816
+ ret = -EINVAL;
1817
+ goto cleanup;
1818
+ }
1819
+ }
1820
+
1821
+ if (fentry->nr_progs)
1822
+ if (invoke_bpf(m, &prog, fentry, stack_size,
1823
+ flags & BPF_TRAMP_F_RET_FENTRY_RET))
1824
+ return -EINVAL;
1825
+
1826
+ if (fmod_ret->nr_progs) {
1827
+ branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *),
1828
+ GFP_KERNEL);
1829
+ if (!branches)
1830
+ return -ENOMEM;
1831
+
1832
+ if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size,
1833
+ branches)) {
1834
+ ret = -EINVAL;
1835
+ goto cleanup;
1836
+ }
1837
+ }
1838
+
1839
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
1840
+ restore_regs(m, &prog, nr_args, stack_size);
1841
+
1842
+ /* call original function */
1843
+ if (emit_call(&prog, orig_call, prog)) {
1844
+ ret = -EINVAL;
1845
+ goto cleanup;
1846
+ }
1847
+ /* remember return value in a stack for bpf prog to access */
1848
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
1849
+ im->ip_after_call = prog;
1850
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
1851
+ prog += X86_PATCH_SIZE;
1852
+ }
1853
+
1854
+ if (fmod_ret->nr_progs) {
1855
+ /* From Intel 64 and IA-32 Architectures Optimization
1856
+ * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
1857
+ * Coding Rule 11: All branch targets should be 16-byte
1858
+ * aligned.
1859
+ */
1860
+ emit_align(&prog, 16);
1861
+ /* Update the branches saved in invoke_bpf_mod_ret with the
1862
+ * aligned address of do_fexit.
1863
+ */
1864
+ for (i = 0; i < fmod_ret->nr_progs; i++)
1865
+ emit_cond_near_jump(&branches[i], prog, branches[i],
1866
+ X86_JNE);
1867
+ }
1868
+
1869
+ if (fexit->nr_progs)
1870
+ if (invoke_bpf(m, &prog, fexit, stack_size, false)) {
1871
+ ret = -EINVAL;
1872
+ goto cleanup;
1873
+ }
1874
+
1875
+ if (flags & BPF_TRAMP_F_RESTORE_REGS)
1876
+ restore_regs(m, &prog, nr_args, stack_size);
1877
+
1878
+ /* This needs to be done regardless. If there were fmod_ret programs,
1879
+ * the return value is only updated on the stack and still needs to be
1880
+ * restored to R0.
1881
+ */
1882
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
1883
+ im->ip_epilogue = prog;
1884
+ /* arg1: mov rdi, im */
1885
+ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
1886
+ if (emit_call(&prog, __bpf_tramp_exit, prog)) {
1887
+ ret = -EINVAL;
1888
+ goto cleanup;
1889
+ }
1890
+ }
1891
+ /* restore return value of orig_call or fentry prog back into RAX */
1892
+ if (save_ret)
1893
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
1894
+
1895
+ EMIT1(0x5B); /* pop rbx */
1896
+ EMIT1(0xC9); /* leave */
1897
+ if (flags & BPF_TRAMP_F_SKIP_FRAME)
1898
+ /* skip our return address and return to parent */
1899
+ EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
1900
+ emit_return(&prog, prog);
1901
+ /* Make sure the trampoline generation logic doesn't overflow */
1902
+ if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
1903
+ ret = -EFAULT;
1904
+ goto cleanup;
1905
+ }
1906
+ ret = prog - (u8 *)image;
1907
+
1908
+cleanup:
1909
+ kfree(branches);
1910
+ return ret;
1911
+}
1912
+
1913
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
1914
+{
1915
+ u8 *jg_reloc, *prog = *pprog;
1916
+ int pivot, err, jg_bytes = 1, cnt = 0;
1917
+ s64 jg_offset;
1918
+
1919
+ if (a == b) {
1920
+ /* Leaf node of recursion, i.e. not a range of indices
1921
+ * anymore.
1922
+ */
1923
+ EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */
1924
+ if (!is_simm32(progs[a]))
1925
+ return -1;
1926
+ EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
1927
+ progs[a]);
1928
+ err = emit_cond_near_jump(&prog, /* je func */
1929
+ (void *)progs[a], prog,
1930
+ X86_JE);
1931
+ if (err)
1932
+ return err;
1933
+
1934
+ emit_indirect_jump(&prog, 2 /* rdx */, prog);
1935
+
1936
+ *pprog = prog;
1937
+ return 0;
1938
+ }
1939
+
1940
+ /* Not a leaf node, so we pivot, and recursively descend into
1941
+ * the lower and upper ranges.
1942
+ */
1943
+ pivot = (b - a) / 2;
1944
+ EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */
1945
+ if (!is_simm32(progs[a + pivot]))
1946
+ return -1;
1947
+ EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
1948
+
1949
+ if (pivot > 2) { /* jg upper_part */
1950
+ /* Require near jump. */
1951
+ jg_bytes = 4;
1952
+ EMIT2_off32(0x0F, X86_JG + 0x10, 0);
1953
+ } else {
1954
+ EMIT2(X86_JG, 0);
1955
+ }
1956
+ jg_reloc = prog;
1957
+
1958
+ err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */
1959
+ progs);
1960
+ if (err)
1961
+ return err;
1962
+
1963
+ /* From Intel 64 and IA-32 Architectures Optimization
1964
+ * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
1965
+ * Coding Rule 11: All branch targets should be 16-byte
1966
+ * aligned.
1967
+ */
1968
+ emit_align(&prog, 16);
1969
+ jg_offset = prog - jg_reloc;
1970
+ emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
1971
+
1972
+ err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */
1973
+ b, progs);
1974
+ if (err)
1975
+ return err;
1976
+
1977
+ *pprog = prog;
1978
+ return 0;
1979
+}
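
A C-level sketch (illustrative; s64 is the kernel type) of the branch tree this function emits: a binary search over the sorted program addresses held in rdx, ending in a direct "je func" on a hit or a retpoline-safe "jmp rdx" on a miss.

static void *dispatch_sketch(const s64 *progs, int a, int b, s64 target)
{
	int pivot = a + (b - a) / 2;

	if (a == b)					/* leaf: cmp rdx, progs[a] */
		return (void *)(target == progs[a] ? progs[a]	/* je func */
						   : target);	/* jmp rdx */
	if (target > progs[pivot])			/* jg upper_part */
		return dispatch_sketch(progs, pivot + 1, b, target);
	return dispatch_sketch(progs, a, pivot, target);	/* lower_part */
}
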
1980
+
1981
+static int cmp_ips(const void *a, const void *b)
1982
+{
1983
+ const s64 *ipa = a;
1984
+ const s64 *ipb = b;
1985
+
1986
+ if (*ipa > *ipb)
1987
+ return 1;
1988
+ if (*ipa < *ipb)
1989
+ return -1;
1990
+ return 0;
1991
+}
1992
+
1993
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
1994
+{
1995
+ u8 *prog = image;
1996
+
1997
+ sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
1998
+ return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
10491999 }
10502000
10512001 struct x64_jit_data {
@@ -1103,7 +2053,7 @@
11032053 extra_pass = true;
11042054 goto skip_init_addrs;
11052055 }
1106
- addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL);
2056
+ addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
11072057 if (!addrs) {
11082058 prog = orig_prog;
11092059 goto out_addrs;
@@ -1113,7 +2063,7 @@
11132063 * Before first pass, make a rough estimation of addrs[]
11142064 * each BPF instruction is translated to less than 64 bytes
11152065 */
1116
- for (proglen = 0, i = 0; i < prog->len; i++) {
2066
+ for (proglen = 0, i = 0; i <= prog->len; i++) {
11172067 proglen += 64;
11182068 addrs[i] = proglen;
11192069 }
@@ -1145,12 +2095,24 @@
11452095 break;
11462096 }
11472097 if (proglen == oldproglen) {
1148
- header = bpf_jit_binary_alloc(proglen, &image,
1149
- 1, jit_fill_hole);
2098
+ /*
2099
+ * The number of entries in extable is the number of BPF_LDX
2100
+ * insns that access kernel memory via "pointer to BTF type".
2101
+ * The verifier changed their opcode from LDX|MEM|size
2102
+ * to LDX|PROBE_MEM|size to make JITing easier.
2103
+ */
2104
+ u32 align = __alignof__(struct exception_table_entry);
2105
+ u32 extable_size = prog->aux->num_exentries *
2106
+ sizeof(struct exception_table_entry);
2107
+
2108
+ /* allocate module memory for x86 insns and extable */
2109
+ header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
2110
+ &image, align, jit_fill_hole);
11502111 if (!header) {
11512112 prog = orig_prog;
11522113 goto out_addrs;
11532114 }
2115
+ prog->aux->extable = (void *) image + roundup(proglen, align);
11542116 }
11552117 oldproglen = proglen;
11562118 cond_resched();
@@ -1161,6 +2123,7 @@
11612123
11622124 if (image) {
11632125 if (!prog->is_func || extra_pass) {
2126
+ bpf_tail_call_direct_fixup(prog);
11642127 bpf_jit_binary_lock_ro(header);
11652128 } else {
11662129 jit_data->addrs = addrs;
@@ -1177,8 +2140,10 @@
11772140 }
11782141
11792142 if (!image || !prog->is_func || extra_pass) {
2143
+ if (image)
2144
+ bpf_prog_fill_jited_linfo(prog, addrs + 1);
11802145 out_addrs:
1181
- kfree(addrs);
2146
+ kvfree(addrs);
11822147 kfree(jit_data);
11832148 prog->aux->jit_data = NULL;
11842149 }