Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf...
author Jakub Kicinski <kuba@kernel.org>
Tue, 12 Mar 2024 01:06:04 +0000 (18:06 -0700)
committer Jakub Kicinski <kuba@kernel.org>
Tue, 12 Mar 2024 01:06:04 +0000 (18:06 -0700)
Alexei Starovoitov says:

====================
pull-request: bpf-next 2024-03-11

We've added 59 non-merge commits during the last 9 day(s) which contain
a total of 88 files changed, 4181 insertions(+), 590 deletions(-).

The main changes are:

1) Enforce VM_IOREMAP flag and range in ioremap_page_range and introduce
   VM_SPARSE kind and vm_area_[un]map_pages to be used in bpf_arena,
   from Alexei.

2) Introduce bpf_arena, a sparse shared memory region between a bpf
   program and user space, where structures inside the arena can hold
   pointers to other areas of the arena and those pointers work seamlessly
   for both user-space programs and bpf programs, from Alexei and Andrii.

3) Introduce the may_goto instruction, which is a contract between the
   verifier and the program. The verifier allows the program to loop assuming
   it's behaving well, but reserves the right to terminate it, from Alexei
   (a brief usage sketch follows below).

4) Use IETF format for field definitions in the BPF standard
   document, from Dave.

5) Extend struct_ops libbpf APIs to allow specifying version suffixes for
   struct_ops map types, sharing the same BPF program between several map
   definitions, and other improvements, from Eduard.

6) Enable struct_ops support for trampolines spanning more than one page,
   from Kui-Feng.

7) Support kCFI + BPF on riscv64, from Puranjay.

8) Use bpf_prog_pack for arm64 bpf trampoline, from Puranjay.

9) Fix roundup_pow_of_two undefined behavior on 32-bit archs, from Toke.
====================

Link: https://lore.kernel.org/r/20240312003646.8692-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
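
As a rough illustration of the may_goto contract from the program side, the
sketch below uses the cond_break macro that this series adds to
tools/testing/selftests/bpf/bpf_experimental.h. It is not part of the patches
themselves; the exact header and macro names should be checked against the
selftest headers in this series.

  #include "bpf_experimental.h"   /* cond_break, which emits a may_goto insn */
  #include <bpf/bpf_helpers.h>    /* SEC() and license helpers */

  char _license[] SEC("license") = "GPL";

  SEC("socket")
  int big_loop(void *ctx)
  {
          long sum = 0;
          int i;

          /* The verifier accepts this loop without walking every iteration:
           * each cond_break expands to a may_goto, so the kernel keeps the
           * right to terminate the loop if it runs for too long.
           */
          for (i = 0; i < (1 << 20); i++) {
                  sum += i;
                  cond_break;
          }
          return sum != 0;
  }
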
88 files changed:
Documentation/bpf/standardization/instruction-set.rst
arch/arm/mm/ioremap.c
arch/arm64/net/bpf_jit_comp.c
arch/loongarch/kernel/setup.c
arch/mips/loongson64/init.c
arch/powerpc/kernel/isa-bridge.c
arch/riscv/include/asm/cfi.h
arch/riscv/kernel/cfi.c
arch/riscv/net/bpf_jit.h
arch/riscv/net/bpf_jit_comp32.c
arch/riscv/net/bpf_jit_comp64.c
arch/riscv/net/bpf_jit_core.c
arch/x86/net/bpf_jit_comp.c
drivers/pci/pci.c
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/bpf_verifier.h
include/linux/filter.h
include/linux/io.h
include/linux/vmalloc.h
include/uapi/linux/bpf.h
kernel/bpf/Makefile
kernel/bpf/arena.c [new file with mode: 0644]
kernel/bpf/bpf_iter.c
kernel/bpf/bpf_struct_ops.c
kernel/bpf/btf.c
kernel/bpf/core.c
kernel/bpf/devmap.c
kernel/bpf/disasm.c
kernel/bpf/hashtab.c
kernel/bpf/log.c
kernel/bpf/stackmap.c
kernel/bpf/syscall.c
kernel/bpf/trampoline.c
kernel/bpf/verifier.c
kernel/events/core.c
kernel/trace/bpf_trace.c
mm/vmalloc.c
net/bpf/bpf_dummy_struct_ops.c
net/ipv4/tcp_cong.c
tools/bpf/bpftool/Documentation/bpftool-map.rst
tools/bpf/bpftool/gen.c
tools/bpf/bpftool/map.c
tools/include/uapi/linux/bpf.h
tools/lib/bpf/bpf_helpers.h
tools/lib/bpf/btf.c
tools/lib/bpf/features.c
tools/lib/bpf/libbpf.c
tools/lib/bpf/libbpf.h
tools/lib/bpf/libbpf_internal.h
tools/lib/bpf/libbpf_probes.c
tools/testing/selftests/bpf/DENYLIST.aarch64
tools/testing/selftests/bpf/DENYLIST.s390x
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bench.c
tools/testing/selftests/bpf/benchs/bench_trigger.c
tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh [new file with mode: 0755]
tools/testing/selftests/bpf/bpf_arena_alloc.h [new file with mode: 0644]
tools/testing/selftests/bpf/bpf_arena_common.h [new file with mode: 0644]
tools/testing/selftests/bpf/bpf_arena_htab.h [new file with mode: 0644]
tools/testing/selftests/bpf/bpf_arena_list.h [new file with mode: 0644]
tools/testing/selftests/bpf/bpf_experimental.h
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h
tools/testing/selftests/bpf/prog_tests/arena_htab.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/arena_list.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/bad_struct_ops.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/btf.c
tools/testing/selftests/bpf/prog_tests/struct_ops_autocreate.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c
tools/testing/selftests/bpf/prog_tests/test_struct_ops_multi_pages.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/verifier.c
tools/testing/selftests/bpf/progs/arena_htab.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/arena_htab_asm.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/arena_list.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bad_struct_ops.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bad_struct_ops2.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/struct_ops_autocreate.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/struct_ops_autocreate2.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/struct_ops_module.c
tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/trigger_bench.c
tools/testing/selftests/bpf/progs/verifier_arena.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c
tools/testing/selftests/bpf/test_loader.c
tools/testing/selftests/bpf/test_progs.c
tools/testing/selftests/bpf/test_progs.h
tools/testing/selftests/bpf/xdp_hw_metadata.c

Documentation/bpf/standardization/instruction-set.rst
index f3269d6dd5e024cfe302c90396040b7cd62475d5..a5ab00ac0b1487437ee0261791c5ae45f31c8103 100644 (file)
@@ -24,22 +24,22 @@ a type's signedness (`S`) and bit width (`N`), respectively.
 .. table:: Meaning of signedness notation.
 
   ==== =========
-  `S`  Meaning
+  S    Meaning
   ==== =========
-  `u`  unsigned
-  `s`  signed
+  u    unsigned
+  s    signed
   ==== =========
 
 .. table:: Meaning of bit-width notation.
 
   ===== =========
-  `N`   Bit width
+  N     Bit width
   ===== =========
-  `8`   8 bits
-  `16`  16 bits
-  `32`  32 bits
-  `64`  64 bits
-  `128` 128 bits
+  8     8 bits
+  16    16 bits
+  32    32 bits
+  64    64 bits
+  128   128 bits
   ===== =========
 
 For example, `u32` is a type whose valid values are all the 32-bit unsigned
@@ -48,31 +48,31 @@ numbers.
 
 Functions
 ---------
-* `htobe16`: Takes an unsigned 16-bit number in host-endian format and
+* htobe16: Takes an unsigned 16-bit number in host-endian format and
   returns the equivalent number as an unsigned 16-bit number in big-endian
   format.
-* `htobe32`: Takes an unsigned 32-bit number in host-endian format and
+* htobe32: Takes an unsigned 32-bit number in host-endian format and
   returns the equivalent number as an unsigned 32-bit number in big-endian
   format.
-* `htobe64`: Takes an unsigned 64-bit number in host-endian format and
+* htobe64: Takes an unsigned 64-bit number in host-endian format and
   returns the equivalent number as an unsigned 64-bit number in big-endian
   format.
-* `htole16`: Takes an unsigned 16-bit number in host-endian format and
+* htole16: Takes an unsigned 16-bit number in host-endian format and
   returns the equivalent number as an unsigned 16-bit number in little-endian
   format.
-* `htole32`: Takes an unsigned 32-bit number in host-endian format and
+* htole32: Takes an unsigned 32-bit number in host-endian format and
   returns the equivalent number as an unsigned 32-bit number in little-endian
   format.
-* `htole64`: Takes an unsigned 64-bit number in host-endian format and
+* htole64: Takes an unsigned 64-bit number in host-endian format and
   returns the equivalent number as an unsigned 64-bit number in little-endian
   format.
-* `bswap16`: Takes an unsigned 16-bit number in either big- or little-endian
+* bswap16: Takes an unsigned 16-bit number in either big- or little-endian
   format and returns the equivalent number with the same bit width but
   opposite endianness.
-* `bswap32`: Takes an unsigned 32-bit number in either big- or little-endian
+* bswap32: Takes an unsigned 32-bit number in either big- or little-endian
   format and returns the equivalent number with the same bit width but
   opposite endianness.
-* `bswap64`: Takes an unsigned 64-bit number in either big- or little-endian
+* bswap64: Takes an unsigned 64-bit number in either big- or little-endian
   format and returns the equivalent number with the same bit width but
   opposite endianness.
 
@@ -127,7 +127,7 @@ This document defines the following conformance groups:
 * divmul32: includes 32-bit division, multiplication, and modulo instructions.
 * divmul64: includes divmul32, plus 64-bit division, multiplication,
   and modulo instructions.
-* legacy: deprecated packet access instructions.
+* packet: deprecated packet access instructions.
 
 Instruction encoding
 ====================
@@ -135,34 +135,63 @@ Instruction encoding
 BPF has two instruction encodings:
 
 * the basic instruction encoding, which uses 64 bits to encode an instruction
-* the wide instruction encoding, which appends a second 64-bit immediate (i.e.,
-  constant) value after the basic instruction for a total of 128 bits.
+* the wide instruction encoding, which appends a second 64 bits
+  after the basic instruction for a total of 128 bits.
 
-The fields conforming an encoded basic instruction are stored in the
-following order::
+Basic instruction encoding
+--------------------------
 
-  opcode:8 src_reg:4 dst_reg:4 offset:16 imm:32 // In little-endian BPF.
-  opcode:8 dst_reg:4 src_reg:4 offset:16 imm:32 // In big-endian BPF.
+A basic instruction is encoded as follows::
 
-**imm**
-  signed integer immediate value
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |    opcode     |     regs      |            offset             |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                              imm                              |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 
-**offset**
-  signed integer offset used with pointer arithmetic
+**opcode**
+  operation to perform, encoded as follows::
 
-**src_reg**
-  the source register number (0-10), except where otherwise specified
-  (`64-bit immediate instructions`_ reuse this field for other purposes)
+    +-+-+-+-+-+-+-+-+
+    |specific |class|
+    +-+-+-+-+-+-+-+-+
 
-**dst_reg**
-  destination register number (0-10)
+  **specific**
+    The format of these bits varies by instruction class
 
-**opcode**
-  operation to perform
+  **class**
+    The instruction class (see `Instruction classes`_)
+
+**regs**
+  The source and destination register numbers, encoded as follows
+  on a little-endian host::
+
+    +-+-+-+-+-+-+-+-+
+    |src_reg|dst_reg|
+    +-+-+-+-+-+-+-+-+
+
+  and as follows on a big-endian host::
+
+    +-+-+-+-+-+-+-+-+
+    |dst_reg|src_reg|
+    +-+-+-+-+-+-+-+-+
+
+  **src_reg**
+    the source register number (0-10), except where otherwise specified
+    (`64-bit immediate instructions`_ reuse this field for other purposes)
+
+  **dst_reg**
+    destination register number (0-10)
+
+**offset**
+  signed integer offset used with pointer arithmetic
+
+**imm**
+  signed integer immediate value
 
-Note that the contents of multi-byte fields ('imm' and 'offset') are
-stored using big-endian byte ordering in big-endian BPF and
-little-endian byte ordering in little-endian BPF.
+Note that the contents of multi-byte fields ('offset' and 'imm') are
+stored using big-endian byte ordering on big-endian hosts and
+little-endian byte ordering on little-endian hosts.
 
 For example::
 
@@ -175,66 +204,83 @@ For example::
 Note that most instructions do not use all of the fields.
 Unused fields shall be cleared to zero.
 
-As discussed below in `64-bit immediate instructions`_, a 64-bit immediate
-instruction uses two 32-bit immediate values that are constructed as follows.
-The 64 bits following the basic instruction contain a pseudo instruction
-using the same format but with 'opcode', 'dst_reg', 'src_reg', and 'offset' all
-set to zero, and imm containing the high 32 bits of the immediate value.
+Wide instruction encoding
+--------------------------
+
+Some instructions are defined to use the wide instruction encoding,
+which uses two 32-bit immediate values.  The 64 bits following
+the basic instruction format contain a pseudo instruction
+with 'opcode', 'dst_reg', 'src_reg', and 'offset' all set to zero.
 
 This is depicted in the following figure::
 
-        basic_instruction
-  .------------------------------.
-  |                              |
-  opcode:8 regs:8 offset:16 imm:32 unused:32 imm:32
-                                   |              |
-                                   '--------------'
-                                  pseudo instruction
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |    opcode     |     regs      |            offset             |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                              imm                              |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                           reserved                            |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  |                           next_imm                            |
+  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+**opcode**
+  operation to perform, encoded as explained above
+
+**regs**
+  The source and destination register numbers, encoded as explained above
+
+**offset**
+  signed integer offset used with pointer arithmetic
+
+**imm**
+  signed integer immediate value
+
+**reserved**
+  unused, set to zero
 
-Here, the imm value of the pseudo instruction is called 'next_imm'. The unused
-bytes in the pseudo instruction are reserved and shall be cleared to zero.
+**next_imm**
+  second signed integer immediate value
 
 Instruction classes
 -------------------
 
-The three LSB bits of the 'opcode' field store the instruction class:
-
-=========  =====  ===============================  ===================================
-class      value  description                      reference
-=========  =====  ===============================  ===================================
-BPF_LD     0x00   non-standard load operations     `Load and store instructions`_
-BPF_LDX    0x01   load into register operations    `Load and store instructions`_
-BPF_ST     0x02   store from immediate operations  `Load and store instructions`_
-BPF_STX    0x03   store from register operations   `Load and store instructions`_
-BPF_ALU    0x04   32-bit arithmetic operations     `Arithmetic and jump instructions`_
-BPF_JMP    0x05   64-bit jump operations           `Arithmetic and jump instructions`_
-BPF_JMP32  0x06   32-bit jump operations           `Arithmetic and jump instructions`_
-BPF_ALU64  0x07   64-bit arithmetic operations     `Arithmetic and jump instructions`_
-=========  =====  ===============================  ===================================
+The three least significant bits of the 'opcode' field store the instruction class:
+
+=====  =====  ===============================  ===================================
+class  value  description                      reference
+=====  =====  ===============================  ===================================
+LD     0x0    non-standard load operations     `Load and store instructions`_
+LDX    0x1    load into register operations    `Load and store instructions`_
+ST     0x2    store from immediate operations  `Load and store instructions`_
+STX    0x3    store from register operations   `Load and store instructions`_
+ALU    0x4    32-bit arithmetic operations     `Arithmetic and jump instructions`_
+JMP    0x5    64-bit jump operations           `Arithmetic and jump instructions`_
+JMP32  0x6    32-bit jump operations           `Arithmetic and jump instructions`_
+ALU64  0x7    64-bit arithmetic operations     `Arithmetic and jump instructions`_
+=====  =====  ===============================  ===================================
 
 Arithmetic and jump instructions
 ================================
 
-For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
-``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
+For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and
+``JMP32``), the 8-bit 'opcode' field is divided into three parts::
 
-==============  ======  =================
-4 bits (MSB)    1 bit   3 bits (LSB)
-==============  ======  =================
-code            source  instruction class
-==============  ======  =================
+  +-+-+-+-+-+-+-+-+
+  |  code |s|class|
+  +-+-+-+-+-+-+-+-+
 
 **code**
   the operation code, whose meaning varies by instruction class
 
-**source**
+**s (source)**
   the source operand location, which unless otherwise specified is one of:
 
   ======  =====  ==============================================
   source  value  description
   ======  =====  ==============================================
-  BPF_K   0x00   use 32-bit 'imm' value as source operand
-  BPF_X   0x08   use 'src_reg' register value as source operand
+  K       0      use 32-bit 'imm' value as source operand
+  X       1      use 'src_reg' register value as source operand
   ======  =====  ==============================================
 
 **instruction class**
@@ -243,75 +289,75 @@ code            source  instruction class
 Arithmetic instructions
 -----------------------
 
-``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
-otherwise identical operations. ``BPF_ALU64`` instructions belong to the
+``ALU`` uses 32-bit wide operands while ``ALU64`` uses 64-bit wide operands for
+otherwise identical operations. ``ALU64`` instructions belong to the
 base64 conformance group unless noted otherwise.
 The 'code' field encodes the operation as below, where 'src' and 'dst' refer
 to the values of the source and destination registers, respectively.
 
-=========  =====  =======  ==========================================================
-code       value  offset   description
-=========  =====  =======  ==========================================================
-BPF_ADD    0x00   0        dst += src
-BPF_SUB    0x10   0        dst -= src
-BPF_MUL    0x20   0        dst \*= src
-BPF_DIV    0x30   0        dst = (src != 0) ? (dst / src) : 0
-BPF_SDIV   0x30   1        dst = (src != 0) ? (dst s/ src) : 0
-BPF_OR     0x40   0        dst \|= src
-BPF_AND    0x50   0        dst &= src
-BPF_LSH    0x60   0        dst <<= (src & mask)
-BPF_RSH    0x70   0        dst >>= (src & mask)
-BPF_NEG    0x80   0        dst = -dst
-BPF_MOD    0x90   0        dst = (src != 0) ? (dst % src) : dst
-BPF_SMOD   0x90   1        dst = (src != 0) ? (dst s% src) : dst
-BPF_XOR    0xa0   0        dst ^= src
-BPF_MOV    0xb0   0        dst = src
-BPF_MOVSX  0xb0   8/16/32  dst = (s8,s16,s32)src
-BPF_ARSH   0xc0   0        :term:`sign extending<Sign Extend>` dst >>= (src & mask)
-BPF_END    0xd0   0        byte swap operations (see `Byte swap instructions`_ below)
-=========  =====  =======  ==========================================================
+=====  =====  =======  ==========================================================
+name   code   offset   description
+=====  =====  =======  ==========================================================
+ADD    0x0    0        dst += src
+SUB    0x1    0        dst -= src
+MUL    0x2    0        dst \*= src
+DIV    0x3    0        dst = (src != 0) ? (dst / src) : 0
+SDIV   0x3    1        dst = (src != 0) ? (dst s/ src) : 0
+OR     0x4    0        dst \|= src
+AND    0x5    0        dst &= src
+LSH    0x6    0        dst <<= (src & mask)
+RSH    0x7    0        dst >>= (src & mask)
+NEG    0x8    0        dst = -dst
+MOD    0x9    0        dst = (src != 0) ? (dst % src) : dst
+SMOD   0x9    1        dst = (src != 0) ? (dst s% src) : dst
+XOR    0xa    0        dst ^= src
+MOV    0xb    0        dst = src
+MOVSX  0xb    8/16/32  dst = (s8,s16,s32)src
+ARSH   0xc    0        :term:`sign extending<Sign Extend>` dst >>= (src & mask)
+END    0xd    0        byte swap operations (see `Byte swap instructions`_ below)
+=====  =====  =======  ==========================================================
 
 Underflow and overflow are allowed during arithmetic operations, meaning
 the 64-bit or 32-bit value will wrap. If BPF program execution would
 result in division by zero, the destination register is instead set to zero.
-If execution would result in modulo by zero, for ``BPF_ALU64`` the value of
-the destination register is unchanged whereas for ``BPF_ALU`` the upper
+If execution would result in modulo by zero, for ``ALU64`` the value of
+the destination register is unchanged whereas for ``ALU`` the upper
 32 bits of the destination register are zeroed.
 
-``BPF_ADD | BPF_X | BPF_ALU`` means::
+``{ADD, X, ALU}``, where 'code' = ``ADD``, 'source' = ``X``, and 'class' = ``ALU``, means::
 
   dst = (u32) ((u32) dst + (u32) src)
 
 where '(u32)' indicates that the upper 32 bits are zeroed.
 
-``BPF_ADD | BPF_X | BPF_ALU64`` means::
+``{ADD, X, ALU64}`` means::
 
   dst = dst + src
 
-``BPF_XOR | BPF_K | BPF_ALU`` means::
+``{XOR, K, ALU}`` means::
 
   dst = (u32) dst ^ (u32) imm
 
-``BPF_XOR | BPF_K | BPF_ALU64`` means::
+``{XOR, K, ALU64}`` means::
 
   dst = dst ^ imm
 
 Note that most instructions have instruction offset of 0. Only three instructions
-(``BPF_SDIV``, ``BPF_SMOD``, ``BPF_MOVSX``) have a non-zero offset.
+(``SDIV``, ``SMOD``, ``MOVSX``) have a non-zero offset.
 
-Division, multiplication, and modulo operations for ``BPF_ALU`` are part
+Division, multiplication, and modulo operations for ``ALU`` are part
 of the "divmul32" conformance group, and division, multiplication, and
-modulo operations for ``BPF_ALU64`` are part of the "divmul64" conformance
+modulo operations for ``ALU64`` are part of the "divmul64" conformance
 group.
 The division and modulo operations support both unsigned and signed flavors.
 
-For unsigned operations (``BPF_DIV`` and ``BPF_MOD``), for ``BPF_ALU``,
-'imm' is interpreted as a 32-bit unsigned value. For ``BPF_ALU64``,
+For unsigned operations (``DIV`` and ``MOD``), for ``ALU``,
+'imm' is interpreted as a 32-bit unsigned value. For ``ALU64``,
 'imm' is first :term:`sign extended<Sign Extend>` from 32 to 64 bits, and then
 interpreted as a 64-bit unsigned value.
 
-For signed operations (``BPF_SDIV`` and ``BPF_SMOD``), for ``BPF_ALU``,
-'imm' is interpreted as a 32-bit signed value. For ``BPF_ALU64``, 'imm'
+For signed operations (``SDIV`` and ``SMOD``), for ``ALU``,
+'imm' is interpreted as a 32-bit signed value. For ``ALU64``, 'imm'
 is first :term:`sign extended<Sign Extend>` from 32 to 64 bits, and then
 interpreted as a 64-bit signed value.
 
@@ -323,15 +369,15 @@ etc. This specification requires that signed modulo use truncated division
 
    a % n = a - n * trunc(a / n)
 
-The ``BPF_MOVSX`` instruction does a move operation with sign extension.
-``BPF_ALU | BPF_MOVSX`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into 32
+The ``MOVSX`` instruction does a move operation with sign extension.
+``{MOVSX, X, ALU}`` :term:`sign extends<Sign Extend>` 8-bit and 16-bit operands into 32
 bit operands, and zeroes the remaining upper 32 bits.
-``BPF_ALU64 | BPF_MOVSX`` :term:`sign extends<Sign Extend>` 8-bit, 16-bit, and 32-bit
+``{MOVSX, X, ALU64}`` :term:`sign extends<Sign Extend>` 8-bit, 16-bit, and 32-bit
 operands into 64 bit operands.  Unlike other arithmetic instructions,
-``BPF_MOVSX`` is only defined for register source operands (``BPF_X``).
+``MOVSX`` is only defined for register source operands (``X``).
 
-The ``BPF_NEG`` instruction is only defined when the source bit is clear
-(``BPF_K``).
+The ``NEG`` instruction is only defined when the source bit is clear
+(``K``).
 
 Shift operations use a mask of 0x3F (63) for 64-bit operations and 0x1F (31)
 for 32-bit operations.
@@ -339,24 +385,24 @@ for 32-bit operations.
 Byte swap instructions
 ----------------------
 
-The byte swap instructions use instruction classes of ``BPF_ALU`` and ``BPF_ALU64``
-and a 4-bit 'code' field of ``BPF_END``.
+The byte swap instructions use instruction classes of ``ALU`` and ``ALU64``
+and a 4-bit 'code' field of ``END``.
 
 The byte swap instructions operate on the destination register
 only and do not use a separate source register or immediate value.
 
-For ``BPF_ALU``, the 1-bit source operand field in the opcode is used to
+For ``ALU``, the 1-bit source operand field in the opcode is used to
 select what byte order the operation converts from or to. For
-``BPF_ALU64``, the 1-bit source operand field in the opcode is reserved
+``ALU64``, the 1-bit source operand field in the opcode is reserved
 and must be set to 0.
 
-=========  =========  =====  =================================================
-class      source     value  description
-=========  =========  =====  =================================================
-BPF_ALU    BPF_TO_LE  0x00   convert between host byte order and little endian
-BPF_ALU    BPF_TO_BE  0x08   convert between host byte order and big endian
-BPF_ALU64  Reserved   0x00   do byte swap unconditionally
-=========  =========  =====  =================================================
+=====  ========  =====  =================================================
+class  source    value  description
+=====  ========  =====  =================================================
+ALU    TO_LE     0      convert between host byte order and little endian
+ALU    TO_BE     1      convert between host byte order and big endian
+ALU64  Reserved  0      do byte swap unconditionally
+=====  ========  =====  =================================================
 
 The 'imm' field encodes the width of the swap operations.  The following widths
 are supported: 16, 32 and 64.  Width 64 operations belong to the base64
@@ -365,19 +411,19 @@ conformance group.
 
 Examples:
 
-``BPF_ALU | BPF_TO_LE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_LE, ALU}`` with imm = 16/32/64 means::
 
   dst = htole16(dst)
   dst = htole32(dst)
   dst = htole64(dst)
 
-``BPF_ALU | BPF_TO_BE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_BE, ALU}`` with imm = 16/32/64 means::
 
   dst = htobe16(dst)
   dst = htobe32(dst)
   dst = htobe64(dst)
 
-``BPF_ALU64 | BPF_TO_LE | BPF_END`` with imm = 16/32/64 means::
+``{END, TO_LE, ALU64}`` with imm = 16/32/64 means::
 
   dst = bswap16(dst)
   dst = bswap32(dst)
@@ -386,59 +432,59 @@ Examples:
 Jump instructions
 -----------------
 
-``BPF_JMP32`` uses 32-bit wide operands and indicates the base32
-conformance group, while ``BPF_JMP`` uses 64-bit wide operands for
+``JMP32`` uses 32-bit wide operands and indicates the base32
+conformance group, while ``JMP`` uses 64-bit wide operands for
 otherwise identical operations, and indicates the base64 conformance
 group unless otherwise specified.
 The 'code' field encodes the operation as below:
 
-========  =====  =======  ===============================  =============================================
+========  =====  =======  ===============================  ===================================================
 code      value  src_reg  description                      notes
-========  =====  =======  ===============================  =============================================
-BPF_JA    0x0    0x0      PC += offset                     BPF_JMP | BPF_K only
-BPF_JA    0x0    0x0      PC += imm                        BPF_JMP32 | BPF_K only
-BPF_JEQ   0x1    any      PC += offset if dst == src
-BPF_JGT   0x2    any      PC += offset if dst > src        unsigned
-BPF_JGE   0x3    any      PC += offset if dst >= src       unsigned
-BPF_JSET  0x4    any      PC += offset if dst & src
-BPF_JNE   0x5    any      PC += offset if dst != src
-BPF_JSGT  0x6    any      PC += offset if dst > src        signed
-BPF_JSGE  0x7    any      PC += offset if dst >= src       signed
-BPF_CALL  0x8    0x0      call helper function by address  BPF_JMP | BPF_K only, see `Helper functions`_
-BPF_CALL  0x8    0x1      call PC += imm                   BPF_JMP | BPF_K only, see `Program-local functions`_
-BPF_CALL  0x8    0x2      call helper function by BTF ID   BPF_JMP | BPF_K only, see `Helper functions`_
-BPF_EXIT  0x9    0x0      return                           BPF_JMP | BPF_K only
-BPF_JLT   0xa    any      PC += offset if dst < src        unsigned
-BPF_JLE   0xb    any      PC += offset if dst <= src       unsigned
-BPF_JSLT  0xc    any      PC += offset if dst < src        signed
-BPF_JSLE  0xd    any      PC += offset if dst <= src       signed
-========  =====  =======  ===============================  =============================================
-
-The BPF program needs to store the return value into register R0 before doing a
-``BPF_EXIT``.
+========  =====  =======  ===============================  ===================================================
+JA        0x0    0x0      PC += offset                     {JA, K, JMP} only
+JA        0x0    0x0      PC += imm                        {JA, K, JMP32} only
+JEQ       0x1    any      PC += offset if dst == src
+JGT       0x2    any      PC += offset if dst > src        unsigned
+JGE       0x3    any      PC += offset if dst >= src       unsigned
+JSET      0x4    any      PC += offset if dst & src
+JNE       0x5    any      PC += offset if dst != src
+JSGT      0x6    any      PC += offset if dst > src        signed
+JSGE      0x7    any      PC += offset if dst >= src       signed
+CALL      0x8    0x0      call helper function by address  {CALL, K, JMP} only, see `Helper functions`_
+CALL      0x8    0x1      call PC += imm                   {CALL, K, JMP} only, see `Program-local functions`_
+CALL      0x8    0x2      call helper function by BTF ID   {CALL, K, JMP} only, see `Helper functions`_
+EXIT      0x9    0x0      return                           {CALL, K, JMP} only
+JLT       0xa    any      PC += offset if dst < src        unsigned
+JLE       0xb    any      PC += offset if dst <= src       unsigned
+JSLT      0xc    any      PC += offset if dst < src        signed
+JSLE      0xd    any      PC += offset if dst <= src       signed
+========  =====  =======  ===============================  ===================================================
+
+The BPF program needs to store the return value into register R0 before doing an
+``EXIT``.
 
 Example:
 
-``BPF_JSGE | BPF_X | BPF_JMP32`` (0x7e) means::
+``{JSGE, X, JMP32}`` means::
 
   if (s32)dst s>= (s32)src goto +offset
 
 where 's>=' indicates a signed '>=' comparison.
 
-``BPF_JA | BPF_K | BPF_JMP32`` (0x06) means::
+``{JA, K, JMP32}`` means::
 
   gotol +imm
 
 where 'imm' means the branch offset comes from insn 'imm' field.
 
-Note that there are two flavors of ``BPF_JA`` instructions. The
-``BPF_JMP`` class permits a 16-bit jump offset specified by the 'offset'
-field, whereas the ``BPF_JMP32`` class permits a 32-bit jump offset
+Note that there are two flavors of ``JA`` instructions. The
+``JMP`` class permits a 16-bit jump offset specified by the 'offset'
+field, whereas the ``JMP32`` class permits a 32-bit jump offset
 specified by the 'imm' field. A > 16-bit conditional jump may be
 converted to a < 16-bit conditional jump plus a 32-bit unconditional
 jump.
 
-All ``BPF_CALL`` and ``BPF_JA`` instructions belong to the
+All ``CALL`` and ``JA`` instructions belong to the
 base32 conformance group.
 
 Helper functions
@@ -459,80 +505,83 @@ Program-local functions
 ~~~~~~~~~~~~~~~~~~~~~~~
 Program-local functions are functions exposed by the same BPF program as the
 caller, and are referenced by offset from the call instruction, similar to
-``BPF_JA``.  The offset is encoded in the imm field of the call instruction.
-A ``BPF_EXIT`` within the program-local function will return to the caller.
+``JA``.  The offset is encoded in the imm field of the call instruction.
+A ``EXIT`` within the program-local function will return to the caller.
 
 Load and store instructions
 ===========================
 
-For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
-8-bit 'opcode' field is divided as:
-
-============  ======  =================
-3 bits (MSB)  2 bits  3 bits (LSB)
-============  ======  =================
-mode          size    instruction class
-============  ======  =================
-
-The mode modifier is one of:
-
-  =============  =====  ====================================  =============
-  mode modifier  value  description                           reference
-  =============  =====  ====================================  =============
-  BPF_IMM        0x00   64-bit immediate instructions         `64-bit immediate instructions`_
-  BPF_ABS        0x20   legacy BPF packet access (absolute)   `Legacy BPF Packet access instructions`_
-  BPF_IND        0x40   legacy BPF packet access (indirect)   `Legacy BPF Packet access instructions`_
-  BPF_MEM        0x60   regular load and store operations     `Regular load and store operations`_
-  BPF_MEMSX      0x80   sign-extension load operations        `Sign-extension load operations`_
-  BPF_ATOMIC     0xc0   atomic operations                     `Atomic operations`_
-  =============  =====  ====================================  =============
-
-The size modifier is one of:
-
-  =============  =====  =====================
-  size modifier  value  description
-  =============  =====  =====================
-  BPF_W          0x00   word        (4 bytes)
-  BPF_H          0x08   half word   (2 bytes)
-  BPF_B          0x10   byte
-  BPF_DW         0x18   double word (8 bytes)
-  =============  =====  =====================
-
-Instructions using ``BPF_DW`` belong to the base64 conformance group.
+For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the
+8-bit 'opcode' field is divided as::
+
+  +-+-+-+-+-+-+-+-+
+  |mode |sz |class|
+  +-+-+-+-+-+-+-+-+
+
+**mode**
+  The mode modifier is one of:
+
+    =============  =====  ====================================  =============
+    mode modifier  value  description                           reference
+    =============  =====  ====================================  =============
+    IMM            0      64-bit immediate instructions         `64-bit immediate instructions`_
+    ABS            1      legacy BPF packet access (absolute)   `Legacy BPF Packet access instructions`_
+    IND            2      legacy BPF packet access (indirect)   `Legacy BPF Packet access instructions`_
+    MEM            3      regular load and store operations     `Regular load and store operations`_
+    MEMSX          4      sign-extension load operations        `Sign-extension load operations`_
+    ATOMIC         6      atomic operations                     `Atomic operations`_
+    =============  =====  ====================================  =============
+
+**sz (size)**
+  The size modifier is one of:
+
+    ====  =====  =====================
+    size  value  description
+    ====  =====  =====================
+    W     0      word        (4 bytes)
+    H     1      half word   (2 bytes)
+    B     2      byte
+    DW    3      double word (8 bytes)
+    ====  =====  =====================
+
+  Instructions using ``DW`` belong to the base64 conformance group.
+
+**class**
+  The instruction class (see `Instruction classes`_)
 
 Regular load and store operations
 ---------------------------------
 
-The ``BPF_MEM`` mode modifier is used to encode regular load and store
+The ``MEM`` mode modifier is used to encode regular load and store
 instructions that transfer data between a register and memory.
 
-``BPF_MEM | <size> | BPF_STX`` means::
+``{MEM, <size>, STX}`` means::
 
   *(size *) (dst + offset) = src
 
-``BPF_MEM | <size> | BPF_ST`` means::
+``{MEM, <size>, ST}`` means::
 
   *(size *) (dst + offset) = imm
 
-``BPF_MEM | <size> | BPF_LDX`` means::
+``{MEM, <size>, LDX}`` means::
 
   dst = *(unsigned size *) (src + offset)
 
-Where size is one of: ``BPF_B``, ``BPF_H``, ``BPF_W``, or ``BPF_DW`` and
-'unsigned size' is one of u8, u16, u32 or u64.
+Where '<size>' is one of: ``B``, ``H``, ``W``, or ``DW``, and
+'unsigned size' is one of: u8, u16, u32, or u64.
 
 Sign-extension load operations
 ------------------------------
 
-The ``BPF_MEMSX`` mode modifier is used to encode :term:`sign-extension<Sign Extend>` load
+The ``MEMSX`` mode modifier is used to encode :term:`sign-extension<Sign Extend>` load
 instructions that transfer data between a register and memory.
 
-``BPF_MEMSX | <size> | BPF_LDX`` means::
+``{MEMSX, <size>, LDX}`` means::
 
   dst = *(signed size *) (src + offset)
 
-Where size is one of: ``BPF_B``, ``BPF_H`` or ``BPF_W``, and
-'signed size' is one of s8, s16 or s32.
+Where size is one of: ``B``, ``H``, or ``W``, and
+'signed size' is one of: s8, s16, or s32.
 
 Atomic operations
 -----------------
@@ -542,11 +591,11 @@ interrupted or corrupted by other access to the same memory region
 by other BPF programs or means outside of this specification.
 
 All atomic operations supported by BPF are encoded as store operations
-that use the ``BPF_ATOMIC`` mode modifier as follows:
+that use the ``ATOMIC`` mode modifier as follows:
 
-* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations, which are
+* ``{ATOMIC, W, STX}`` for 32-bit operations, which are
   part of the "atomic32" conformance group.
-* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations, which are
+* ``{ATOMIC, DW, STX}`` for 64-bit operations, which are
   part of the "atomic64" conformance group.
 * 8-bit and 16-bit wide atomic operations are not supported.
 
@@ -557,18 +606,18 @@ arithmetic operations in the 'imm' field to encode the atomic operation:
 ========  =====  ===========
 imm       value  description
 ========  =====  ===========
-BPF_ADD   0x00   atomic add
-BPF_OR    0x40   atomic or
-BPF_AND   0x50   atomic and
-BPF_XOR   0xa0   atomic xor
+ADD       0x00   atomic add
+OR        0x40   atomic or
+AND       0x50   atomic and
+XOR       0xa0   atomic xor
 ========  =====  ===========
 
 
-``BPF_ATOMIC | BPF_W  | BPF_STX`` with 'imm' = BPF_ADD means::
+``{ATOMIC, W, STX}`` with 'imm' = ADD means::
 
   *(u32 *)(dst + offset) += src
 
-``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF_ADD means::
+``{ATOMIC, DW, STX}`` with 'imm' = ADD means::
 
   *(u64 *)(dst + offset) += src
 
@@ -578,20 +627,20 @@ two complex atomic operations:
 ===========  ================  ===========================
 imm          value             description
 ===========  ================  ===========================
-BPF_FETCH    0x01              modifier: return old value
-BPF_XCHG     0xe0 | BPF_FETCH  atomic exchange
-BPF_CMPXCHG  0xf0 | BPF_FETCH  atomic compare and exchange
+FETCH        0x01              modifier: return old value
+XCHG         0xe0 | FETCH      atomic exchange
+CMPXCHG      0xf0 | FETCH      atomic compare and exchange
 ===========  ================  ===========================
 
-The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
-always set for the complex atomic operations.  If the ``BPF_FETCH`` flag
+The ``FETCH`` modifier is optional for simple atomic operations, and
+always set for the complex atomic operations.  If the ``FETCH`` flag
 is set, then the operation also overwrites ``src`` with the value that
 was in memory before it was modified.
 
-The ``BPF_XCHG`` operation atomically exchanges ``src`` with the value
+The ``XCHG`` operation atomically exchanges ``src`` with the value
 addressed by ``dst + offset``.
 
-The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
+The ``CMPXCHG`` operation atomically compares the value addressed by
 ``dst + offset`` with ``R0``. If they match, the value addressed by
 ``dst + offset`` is replaced with ``src``. In either case, the
 value that was at ``dst + offset`` before the operation is zero-extended
@@ -600,25 +649,25 @@ and loaded back to ``R0``.
 64-bit immediate instructions
 -----------------------------
 
-Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
+Instructions with the ``IMM`` 'mode' modifier use the wide instruction
 encoding defined in `Instruction encoding`_, and use the 'src_reg' field of the
 basic instruction to hold an opcode subtype.
 
-The following table defines a set of ``BPF_IMM | BPF_DW | BPF_LD`` instructions
+The following table defines a set of ``{IMM, DW, LD}`` instructions
 with opcode subtypes in the 'src_reg' field, using new terms such as "map"
 defined further below:
 
-=========================  ======  =======  =========================================  ===========  ==============
-opcode construction        opcode  src_reg  pseudocode                                 imm type     dst type
-=========================  ======  =======  =========================================  ===========  ==============
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x0      dst = (next_imm << 32) | imm               integer      integer
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x1      dst = map_by_fd(imm)                       map fd       map
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x2      dst = map_val(map_by_fd(imm)) + next_imm   map fd       data pointer
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x3      dst = var_addr(imm)                        variable id  data pointer
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x4      dst = code_addr(imm)                       integer      code pointer
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x5      dst = map_by_idx(imm)                      map index    map
-BPF_IMM | BPF_DW | BPF_LD  0x18    0x6      dst = map_val(map_by_idx(imm)) + next_imm  map index    data pointer
-=========================  ======  =======  =========================================  ===========  ==============
+=======  =========================================  ===========  ==============
+src_reg  pseudocode                                 imm type     dst type
+=======  =========================================  ===========  ==============
+0x0      dst = (next_imm << 32) | imm               integer      integer
+0x1      dst = map_by_fd(imm)                       map fd       map
+0x2      dst = map_val(map_by_fd(imm)) + next_imm   map fd       data pointer
+0x3      dst = var_addr(imm)                        variable id  data pointer
+0x4      dst = code_addr(imm)                       integer      code pointer
+0x5      dst = map_by_idx(imm)                      map index    map
+0x6      dst = map_val(map_by_idx(imm)) + next_imm  map index    data pointer
+=======  =========================================  ===========  ==============
 
 where
 
@@ -657,8 +706,8 @@ Legacy BPF Packet access instructions
 
 BPF previously introduced special instructions for access to packet data that were
 carried over from classic BPF. These instructions used an instruction
-class of BPF_LD, a size modifier of BPF_W, BPF_H, or BPF_B, and a
-mode modifier of BPF_ABS or BPF_IND.  The 'dst_reg' and 'offset' fields were
-set to zero, and 'src_reg' was set to zero for BPF_ABS.  However, these
+class of ``LD``, a size modifier of ``W``, ``H``, or ``B``, and a
+mode modifier of ``ABS`` or ``IND``.  The 'dst_reg' and 'offset' fields were
+set to zero, and 'src_reg' was set to zero for ``ABS``.  However, these
 instructions are deprecated and should no longer be used.  All legacy packet
-access instructions belong to the "legacy" conformance group.
+access instructions belong to the "packet" conformance group.
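
To make the revised encoding text above concrete, here is a small host-side C
sketch (not part of the patch) that builds an {ADD, X, ALU64} instruction and
an {IMM, DW, LD} wide instruction; it assumes only struct bpf_insn and the
BPF_* encoding macros from the installed UAPI <linux/bpf.h>.

  #include <stdio.h>
  #include <linux/bpf.h>  /* struct bpf_insn, BPF_ALU64/BPF_X/BPF_ADD, BPF_LD/BPF_DW/BPF_IMM */

  int main(void)
  {
          /* {ADD, X, ALU64}: dst R1 += src R2; offset and imm stay zero */
          struct bpf_insn add64 = {
                  .code    = BPF_ALU64 | BPF_X | BPF_ADD, /* 0x07 | 0x08 | 0x00 = 0x0f */
                  .dst_reg = 1,
                  .src_reg = 2,
          };

          /* {IMM, DW, LD} with src_reg 0x0: dst R3 = (next_imm << 32) | imm,
           * i.e. a 64-bit constant split across the basic and the pseudo insn.
           */
          unsigned long long value = 0x1122334455667788ULL;
          struct bpf_insn ld64[2] = {
                  { .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = 3,
                    .imm = (__u32)value },              /* low 32 bits */
                  { .imm = (__u32)(value >> 32) },      /* pseudo insn: next_imm */
          };

          printf("add64 opcode: 0x%02x\n", add64.code);                 /* 0x0f */
          printf("lddw opcodes: 0x%02x 0x%02x\n", ld64[0].code, ld64[1].code);
          return 0;
  }
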
arch/arm/mm/ioremap.c
index 2129070065c32328baed62a73449f10bc9caf4e1..794cfea9f9d4c894d906d3032cac50cbbf0161ef 100644 (file)
@@ -110,8 +110,8 @@ void __init add_static_vm_early(struct static_vm *svm)
 int ioremap_page(unsigned long virt, unsigned long phys,
                 const struct mem_type *mtype)
 {
-       return ioremap_page_range(virt, virt + PAGE_SIZE, phys,
-                                 __pgprot(mtype->prot_pte));
+       return vmap_page_range(virt, virt + PAGE_SIZE, phys,
+                              __pgprot(mtype->prot_pte));
 }
 EXPORT_SYMBOL(ioremap_page);
 
@@ -466,8 +466,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
        if (res->end > IO_SPACE_LIMIT)
                return -EINVAL;
 
-       return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
-                                 __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte));
+       return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
+                              __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte));
 }
 EXPORT_SYMBOL(pci_remap_iospace);
 
arch/arm64/net/bpf_jit_comp.c
index 5afc7a525eca72c1d8720839c392f44636ad2206..c5b461dda4385960437d5472cc3e3945d7f9a3fe 100644 (file)
@@ -2076,7 +2076,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
                /* store return value */
                emit(A64_STR64I(A64_R(0), A64_SP, retval_off), ctx);
                /* reserve a nop for bpf_tramp_image_put */
-               im->ip_after_call = ctx->image + ctx->idx;
+               im->ip_after_call = ctx->ro_image + ctx->idx;
                emit(A64_NOP, ctx);
        }
 
@@ -2091,7 +2091,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
                                run_ctx_off, false);
 
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
-               im->ip_epilogue = ctx->image + ctx->idx;
+               im->ip_epilogue = ctx->ro_image + ctx->idx;
                emit_addr_mov_i64(A64_R(0), (const u64)im, ctx);
                emit_call((const u64)__bpf_tramp_exit, ctx);
        }
@@ -2124,9 +2124,6 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
                emit(A64_RET(A64_R(10)), ctx);
        }
 
-       if (ctx->image)
-               bpf_flush_icache(ctx->image, ctx->image + ctx->idx);
-
        kfree(branches);
 
        return ctx->idx;
@@ -2169,14 +2166,43 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
        return ret < 0 ? ret : ret * AARCH64_INSN_SIZE;
 }
 
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
-                               void *image_end, const struct btf_func_model *m,
+void *arch_alloc_bpf_trampoline(unsigned int size)
+{
+       return bpf_prog_pack_alloc(size, jit_fill_hole);
+}
+
+void arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+       bpf_prog_pack_free(image, size);
+}
+
+void arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
+                               void *ro_image_end, const struct btf_func_model *m,
                                u32 flags, struct bpf_tramp_links *tlinks,
                                void *func_addr)
 {
        int ret, nregs;
+       void *image, *tmp;
+       u32 size = ro_image_end - ro_image;
+
+       /* image doesn't need to be in module memory range, so we can
+        * use kvmalloc.
+        */
+       image = kvmalloc(size, GFP_KERNEL);
+       if (!image)
+               return -ENOMEM;
+
        struct jit_ctx ctx = {
                .image = image,
+               .ro_image = ro_image,
                .idx = 0,
        };
 
@@ -2185,15 +2211,26 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
        if (nregs > 8)
                return -ENOTSUPP;
 
-       jit_fill_hole(image, (unsigned int)(image_end - image));
+       jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image));
        ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 
-       if (ret > 0 && validate_code(&ctx) < 0)
+       if (ret > 0 && validate_code(&ctx) < 0) {
                ret = -EINVAL;
+               goto out;
+       }
 
        if (ret > 0)
                ret *= AARCH64_INSN_SIZE;
 
+       tmp = bpf_arch_text_copy(ro_image, image, size);
+       if (IS_ERR(tmp)) {
+               ret = PTR_ERR(tmp);
+               goto out;
+       }
+
+       bpf_flush_icache(ro_image, ro_image + size);
+out:
+       kvfree(image);
        return ret;
 }
 
arch/loongarch/kernel/setup.c
index 634ef17fd38bf10d8bd9deef8a6693f0f4777c1e..fd915ad69c09b8f0d0da3f2934b4018078015706 100644 (file)
@@ -490,7 +490,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode,
        }
 
        vaddr = (unsigned long)(PCI_IOBASE + range->io_start);
-       ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
+       vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
 
        return 0;
 }
arch/mips/loongson64/init.c
index 553142c1f14fe2261d963b3784f3ed9e6c086cd2..a35dd731179582f981de5517ee827295d6796173 100644 (file)
@@ -180,7 +180,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode, resource_size_
 
        vaddr = PCI_IOBASE + range->io_start;
 
-       ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
+       vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
 
        return 0;
 }
arch/powerpc/kernel/isa-bridge.c
index 48e0eaf1ad61559a374fe83b33c65aee6ef35e7f..5c064485197a9059835f23530c0841f78803fbe8 100644 (file)
@@ -46,8 +46,8 @@ static void remap_isa_base(phys_addr_t pa, unsigned long size)
        WARN_ON_ONCE(size & ~PAGE_MASK);
 
        if (slab_is_available()) {
-               if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
-                               pgprot_noncached(PAGE_KERNEL)))
+               if (vmap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
+                                   pgprot_noncached(PAGE_KERNEL)))
                        vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size);
        } else {
                early_ioremap_range(ISA_IO_BASE, pa, size,
arch/riscv/include/asm/cfi.h
index 8f7a6225704499a9b32ffb40c1ade6fca9d03c0c..fb9696d7a3f256bf96acd27cf0fd84832a7d0d2f 100644 (file)
@@ -13,11 +13,28 @@ struct pt_regs;
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+static inline int cfi_get_offset(void)
+{
+       return 4;
+}
+
+#define cfi_get_offset cfi_get_offset
+extern u32 cfi_bpf_hash;
+extern u32 cfi_bpf_subprog_hash;
+extern u32 cfi_get_func_hash(void *func);
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
        return BUG_TRAP_TYPE_NONE;
 }
+
+#define cfi_bpf_hash 0U
+#define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+       return 0;
+}
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_RISCV_CFI_H */
arch/riscv/kernel/cfi.c
index 6ec9dbd7292eecad5d27787904ba890a37b1426f..64bdd3e1ab8ca48e57be56db9a153493af13316f 100644 (file)
@@ -75,3 +75,56 @@ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 
        return report_cfi_failure(regs, regs->epc, &target, type);
 }
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+                                   const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+"      .pushsection    .data..ro_after_init,\"aw\",@progbits   \n"
+"      .type   cfi_bpf_hash,@object                            \n"
+"      .globl  cfi_bpf_hash                                    \n"
+"      .p2align        2, 0x0                                  \n"
+"cfi_bpf_hash:                                                 \n"
+"      .word   __kcfi_typeid___bpf_prog_runX                   \n"
+"      .size   cfi_bpf_hash, 4                                 \n"
+"      .popsection                                             \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+"      .pushsection    .data..ro_after_init,\"aw\",@progbits   \n"
+"      .type   cfi_bpf_subprog_hash,@object                    \n"
+"      .globl  cfi_bpf_subprog_hash                            \n"
+"      .p2align        2, 0x0                                  \n"
+"cfi_bpf_subprog_hash:                                         \n"
+"      .word   __kcfi_typeid___bpf_callback_fn                 \n"
+"      .size   cfi_bpf_subprog_hash, 4                         \n"
+"      .popsection                                             \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+       u32 hash;
+
+       if (get_kernel_nofault(hash, func - cfi_get_offset()))
+               return 0;
+
+       return hash;
+}
+#endif
arch/riscv/net/bpf_jit.h
index 8b35f12a44527306e29348c5cf66e5f8e1bc1dc6..f4b6b3b9edda3668c2075e0c42f8a8da2d92cac5 100644 (file)
@@ -1223,7 +1223,7 @@ out_be:
 
 #endif /* __riscv_xlen == 64 */
 
-void bpf_jit_build_prologue(struct rv_jit_context *ctx);
+void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog);
 void bpf_jit_build_epilogue(struct rv_jit_context *ctx);
 
 int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
arch/riscv/net/bpf_jit_comp32.c
index 529a83b85c1c934791ed8d18e79b9527c73b83fc..f5ba73bb153d7ea0117eae54b379836bc0a0d8fc 100644 (file)
@@ -1301,7 +1301,7 @@ notsupported:
        return 0;
 }
 
-void bpf_jit_build_prologue(struct rv_jit_context *ctx)
+void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog)
 {
        const s8 *fp = bpf2rv32[BPF_REG_FP];
        const s8 *r1 = bpf2rv32[BPF_REG_1];
arch/riscv/net/bpf_jit_comp64.c
index 869e4282a2c4214fc57ebf21be9b3b5e3ff47e5f..aac190085472411f4552c54b36a802f896e72cd3 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/memory.h>
 #include <linux/stop_machine.h>
 #include <asm/patch.h>
+#include <asm/cfi.h>
 #include "bpf_jit.h"
 
 #define RV_FENTRY_NINSNS 2
@@ -455,6 +456,12 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx)
        return emit_jump_and_link(RV_REG_RA, off, fixed_addr, ctx);
 }
 
+static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx)
+{
+       if (IS_ENABLED(CONFIG_CFI_CLANG))
+               emit(hash, ctx);
+}
+
 static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
                        struct rv_jit_context *ctx)
 {
@@ -869,6 +876,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
                emit_sd(RV_REG_SP, stack_size - 16, RV_REG_FP, ctx);
                emit_addi(RV_REG_FP, RV_REG_SP, stack_size, ctx);
        } else {
+               /* emit kcfi hash */
+               emit_kcfi(cfi_get_func_hash(func_addr), ctx);
                /* For the trampoline called directly, just handle
                 * the frame of trampoline.
                 */
@@ -1711,7 +1720,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
        return 0;
 }
 
-void bpf_jit_build_prologue(struct rv_jit_context *ctx)
+void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog)
 {
        int i, stack_adjust = 0, store_offset, bpf_stack_adjust;
 
@@ -1740,6 +1749,9 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx)
 
        store_offset = stack_adjust - 8;
 
+       /* emit kcfi type preamble immediately before the  first insn */
+       emit_kcfi(is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash, ctx);
+
        /* nops reserved for auipc+jalr pair */
        for (i = 0; i < RV_FENTRY_NINSNS; i++)
                emit(rv_nop(), ctx);
arch/riscv/net/bpf_jit_core.c
index 7b70ccb7fec345847b2dc2ab4664472f0d9ca574..6b3acac30c06199480a8397b2f59886184e3d1c3 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/filter.h>
 #include <linux/memory.h>
 #include <asm/patch.h>
+#include <asm/cfi.h>
 #include "bpf_jit.h"
 
 /* Number of iterations to try until offsets converge. */
@@ -100,7 +101,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                pass++;
                ctx->ninsns = 0;
 
-               bpf_jit_build_prologue(ctx);
+               bpf_jit_build_prologue(ctx, bpf_is_subprog(prog));
                ctx->prologue_len = ctx->ninsns;
 
                if (build_body(ctx, extra_pass, ctx->offset)) {
@@ -160,7 +161,7 @@ skip_init_ctx:
        ctx->ninsns = 0;
        ctx->nexentries = 0;
 
-       bpf_jit_build_prologue(ctx);
+       bpf_jit_build_prologue(ctx, bpf_is_subprog(prog));
        if (build_body(ctx, extra_pass, NULL)) {
                prog = orig_prog;
                goto out_free_hdr;
@@ -170,9 +171,9 @@ skip_init_ctx:
        if (bpf_jit_enable > 1)
                bpf_jit_dump(prog->len, prog_size, pass, ctx->insns);
 
-       prog->bpf_func = (void *)ctx->ro_insns;
+       prog->bpf_func = (void *)ctx->ro_insns + cfi_get_offset();
        prog->jited = 1;
-       prog->jited_len = prog_size;
+       prog->jited_len = prog_size - cfi_get_offset();
 
        if (!prog->is_func || extra_pass) {
                if (WARN_ON(bpf_jit_binary_pack_finalize(prog, jit_data->ro_header,
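Editor's note on the riscv kCFI change above: the 4-byte kCFI type hash is emitted immediately before the first instruction of the image, so the callable entry point and the reported JIT length have to skip it. A minimal sketch of that adjustment follows (helper name is mine, not kernel code):

/* Hedged sketch: cfi_off bytes of type hash sit at the start of the image,
 * so callers must enter after it and jited_len must not count it.
 * Mirrors the two lines changed in bpf_int_jit_compile() above.
 */
static void *jit_entry_point(void *ro_insns, unsigned int prog_size,
			     unsigned int cfi_off, unsigned int *jited_len)
{
	*jited_len = prog_size - cfi_off;	/* executable bytes after the hash */
	return (char *)ro_insns + cfi_off;	/* prog->bpf_func points here */
}
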
index e1390d1e331b589270a5d614b357d726e717679e..27058d7395f66090dfb98b5a81f93185b338007b 100644 (file)
@@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 #define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
+#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)
 
 /*
  * The following table maps BPF registers to x86-64 registers.
@@ -139,6 +140,7 @@ static const int reg2hex[] = {
        [BPF_REG_AX] = 2, /* R10 temp register */
        [AUX_REG] = 3,    /* R11 temp register */
        [X86_REG_R9] = 1, /* R9 register, 6th function argument */
+       [X86_REG_R12] = 4, /* R12 callee saved */
 };
 
 static const int reg2pt_regs[] = {
@@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
                             BIT(BPF_REG_8) |
                             BIT(BPF_REG_9) |
                             BIT(X86_REG_R9) |
+                            BIT(X86_REG_R12) |
                             BIT(BPF_REG_AX));
 }
 
@@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
        return byte;
 }
 
+static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
+{
+       if (is_ereg(r1))
+               byte |= 1;
+       if (is_ereg(index))
+               byte |= 2;
+       if (is_ereg(r2))
+               byte |= 4;
+       return byte;
+}
+
 /* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
@@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
                pop_r12(&prog);
        } else {
                pop_callee_regs(&prog, callee_regs_used);
+               if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+                       pop_r12(&prog);
        }
 
        EMIT1(0x58);                              /* pop rax */
@@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
                pop_r12(&prog);
        } else {
                pop_callee_regs(&prog, callee_regs_used);
+               if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+                       pop_r12(&prog);
        }
 
        EMIT1(0x58);                                  /* pop rax */
@@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
        *pprog = prog;
 }
 
+static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
+{
+       u8 *prog = *pprog;
+
+       if (is_imm8(off)) {
+               EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+       } else {
+               EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+       }
+       *pprog = prog;
+}
+
 /*
  * Emit a REX byte if it will be necessary to address these registers
  */
@@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
        *pprog = prog;
 }
 
+static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+       u8 *prog = *pprog;
+
+       switch (size) {
+       case BPF_B:
+               /* movzx rax, byte ptr [rax + r12 + off] */
+               EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
+               break;
+       case BPF_H:
+               /* movzx rax, word ptr [rax + r12 + off] */
+               EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
+               break;
+       case BPF_W:
+               /* mov eax, dword ptr [rax + r12 + off] */
+               EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
+               break;
+       case BPF_DW:
+               /* mov rax, qword ptr [rax + r12 + off] */
+               EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
+               break;
+       }
+       emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
+       *pprog = prog;
+}
+
+static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+       emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
 /* STX: *(u8*)(dst_reg + off) = src_reg */
 static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 {
@@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
        *pprog = prog;
 }
 
+/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
+static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+       u8 *prog = *pprog;
+
+       switch (size) {
+       case BPF_B:
+               /* mov byte ptr [rax + r12 + off], al */
+               EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
+               break;
+       case BPF_H:
+               /* mov word ptr [rax + r12 + off], ax */
+               EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+               break;
+       case BPF_W:
+               /* mov dword ptr [rax + r12 + off], eax */
+               EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+               break;
+       case BPF_DW:
+               /* mov qword ptr [rax + r12 + off], rax */
+               EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
+               break;
+       }
+       emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+       *pprog = prog;
+}
+
+static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+       emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
+/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
+static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
+{
+       u8 *prog = *pprog;
+
+       switch (size) {
+       case BPF_B:
+               /* mov byte ptr [rax + r12 + off], imm8 */
+               EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
+               break;
+       case BPF_H:
+               /* mov word ptr [rax + r12 + off], imm16 */
+               EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+               break;
+       case BPF_W:
+               /* mov dword ptr [rax + r12 + off], imm32 */
+               EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+               break;
+       case BPF_DW:
+               /* mov qword ptr [rax + r12 + off], imm32 */
+               EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
+               break;
+       }
+       emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
+       EMIT(imm, bpf_size_to_x86_bytes(size));
+       *pprog = prog;
+}
+
+static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
+{
+       emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
+}
+
 static int emit_atomic(u8 **pprog, u8 atomic_op,
                       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
 {
@@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
        return 0;
 }
 
+#define DONT_CLEAR 1
+
 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
 {
        u32 reg = x->fixup >> 8;
 
        /* jump over faulting load and clear dest register */
-       *(unsigned long *)((void *)regs + reg) = 0;
+       if (reg != DONT_CLEAR)
+               *(unsigned long *)((void *)regs + reg) = 0;
        regs->ip += x->fixup & 0xff;
        return true;
 }
@@ -1147,11 +1276,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
        bool tail_call_seen = false;
        bool seen_exit = false;
        u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+       u64 arena_vm_start, user_vm_start;
        int i, excnt = 0;
        int ilen, proglen = 0;
        u8 *prog = temp;
        int err;
 
+       arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
+       user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
+
        detect_reg_usage(insn, insn_cnt, callee_regs_used,
                         &tail_call_seen);
 
@@ -1172,8 +1305,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                push_r12(&prog);
                push_callee_regs(&prog, all_callee_regs_used);
        } else {
+               if (arena_vm_start)
+                       push_r12(&prog);
                push_callee_regs(&prog, callee_regs_used);
        }
+       if (arena_vm_start)
+               emit_mov_imm64(&prog, X86_REG_R12,
+                              arena_vm_start >> 32, (u32) arena_vm_start);
 
        ilen = prog - temp;
        if (rw_image)
@@ -1213,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                        break;
 
                case BPF_ALU64 | BPF_MOV | BPF_X:
+                       if (insn->off == BPF_ADDR_SPACE_CAST &&
+                           insn->imm == 1U << 16) {
+                               if (dst_reg != src_reg)
+                                       /* 32-bit mov */
+                                       emit_mov_reg(&prog, false, dst_reg, src_reg);
+                               /* shl dst_reg, 32 */
+                               maybe_emit_1mod(&prog, dst_reg, true);
+                               EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);
+
+                               /* or dst_reg, user_vm_start */
+                               maybe_emit_1mod(&prog, dst_reg, true);
+                               if (is_axreg(dst_reg))
+                                       EMIT1_off32(0x0D,  user_vm_start >> 32);
+                               else
+                                       EMIT2_off32(0x81, add_1reg(0xC8, dst_reg),  user_vm_start >> 32);
+
+                               /* rol dst_reg, 32 */
+                               maybe_emit_1mod(&prog, dst_reg, true);
+                               EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);
+
+                               /* xor r11, r11 */
+                               EMIT3(0x4D, 0x31, 0xDB);
+
+                               /* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
+                               maybe_emit_mod(&prog, dst_reg, dst_reg, false);
+                               EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
+
+                               /* cmove r11, dst_reg; if so, set dst_reg to zero */
+                               /* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
+                               maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
+                               EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
+                               break;
+                       }
+                       fallthrough;
                case BPF_ALU | BPF_MOV | BPF_X:
                        if (insn->off == 0)
                                emit_mov_reg(&prog,
@@ -1564,6 +1736,56 @@ st:                      if (is_imm8(insn->off))
                        emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
                        break;
 
+               case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+               case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+               case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+               case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+                       start_of_ldx = prog;
+                       emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
+                       goto populate_extable;
+
+                       /* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
+               case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+               case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+               case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+               case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
+               case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
+               case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
+               case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
+               case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
+                       start_of_ldx = prog;
+                       if (BPF_CLASS(insn->code) == BPF_LDX)
+                               emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+                       else
+                               emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+populate_extable:
+                       {
+                               struct exception_table_entry *ex;
+                               u8 *_insn = image + proglen + (start_of_ldx - temp);
+                               s64 delta;
+
+                               if (!bpf_prog->aux->extable)
+                                       break;
+
+                               if (excnt >= bpf_prog->aux->num_exentries) {
+                                       pr_err("mem32 extable bug\n");
+                                       return -EFAULT;
+                               }
+                               ex = &bpf_prog->aux->extable[excnt++];
+
+                               delta = _insn - (u8 *)&ex->insn;
+                               /* switch ex to rw buffer for writes */
+                               ex = (void *)rw_image + ((void *)ex - (void *)image);
+
+                               ex->insn = delta;
+
+                               ex->data = EX_TYPE_BPF;
+
+                               ex->fixup = (prog - start_of_ldx) |
+                                       ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
+                       }
+                       break;
+
                        /* LDX: dst_reg = *(u8*)(src_reg + off) */
                case BPF_LDX | BPF_MEM | BPF_B:
                case BPF_LDX | BPF_PROBE_MEM | BPF_B:
@@ -2036,6 +2258,8 @@ emit_jmp:
                                pop_r12(&prog);
                        } else {
                                pop_callee_regs(&prog, callee_regs_used);
+                               if (arena_vm_start)
+                                       pop_r12(&prog);
                        }
                        EMIT1(0xC9);         /* leave */
                        emit_return(&prog, image + addrs[i - 1] + (prog - temp));
@@ -3243,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
        }
 }
 
+bool bpf_jit_supports_arena(void)
+{
+       return true;
+}
+
 bool bpf_jit_supports_ptr_xchg(void)
 {
        return true;
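Editor's note on the addr_space_cast lowering in the x86 hunk above (the shl/or/rol/test/cmove sequence for imm == 1U << 16): it keeps the low 32 bits of the arena pointer, splices in the upper 32 bits of user_vm_start, and preserves NULL. A hedged C model, purely illustrative (the JIT emits machine code, not this function):

#include <stdint.h>

/* Sketch of what the emitted sequence computes:
 * result = (u32)ptr ? (user_vm_start & 0xffffffff00000000) | (u32)ptr : 0
 */
static inline uint64_t arena_cast_to_user(uint64_t ptr, uint64_t user_vm_start)
{
	uint32_t lo = (uint32_t)ptr;

	if (!lo)
		return 0;	/* the cmove keeps NULL as NULL */
	return (user_vm_start & 0xffffffff00000000ULL) | lo;
}
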
index c3585229c12a2145401d675ff84c20288b8f158e..ccee56615f784ce1f43f848eede4dfb697461472 100644 (file)
@@ -4353,8 +4353,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
        if (res->end > IO_SPACE_LIMIT)
                return -EINVAL;
 
-       return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
-                                 pgprot_device(PAGE_KERNEL));
+       return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
+                              pgprot_device(PAGE_KERNEL));
 #else
        /*
         * This architecture does not have memory mapped I/O space,
index 814dc913a96857513552ec767955b6f62596be17..4f20f62f9d63da87800af4ac21cbc7c92dae5fb9 100644 (file)
@@ -37,6 +37,7 @@ struct perf_event;
 struct bpf_prog;
 struct bpf_prog_aux;
 struct bpf_map;
+struct bpf_arena;
 struct sock;
 struct seq_file;
 struct btf;
@@ -139,6 +140,9 @@ struct bpf_map_ops {
        int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
        __poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
                             struct poll_table_struct *pts);
+       unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr,
+                                              unsigned long len, unsigned long pgoff,
+                                              unsigned long flags);
 
        /* Functions called by bpf_local_storage maps */
        int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
@@ -525,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
                        struct bpf_spin_lock *spin_lock);
 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
                      struct bpf_spin_lock *spin_lock);
-
-
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
@@ -708,6 +712,7 @@ enum bpf_arg_type {
         * on eBPF program stack
         */
        ARG_PTR_TO_MEM,         /* pointer to valid memory (stack, packet, map value) */
+       ARG_PTR_TO_ARENA,
 
        ARG_CONST_SIZE,         /* number of bytes accessed from memory */
        ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
@@ -879,6 +884,7 @@ enum bpf_reg_type {
         * an explicit null check is required for this struct.
         */
        PTR_TO_MEM,              /* reg points to valid memory region */
+       PTR_TO_ARENA,
        PTR_TO_BUF,              /* reg points to a read/write buffer */
        PTR_TO_FUNC,             /* reg points to a bpf program function */
        CONST_PTR_TO_DYNPTR,     /* reg points to a const struct bpf_dynptr */
@@ -1449,11 +1455,11 @@ struct bpf_prog_aux {
        bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
        bool attach_tracing_prog; /* true if tracing another tracing program */
        bool func_proto_unreliable;
-       bool sleepable;
        bool tail_call_reachable;
        bool xdp_has_frags;
        bool exception_cb;
        bool exception_boundary;
+       struct bpf_arena *arena;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
        /* function name for valid attach_btf_id */
@@ -1534,7 +1540,8 @@ struct bpf_prog {
                                enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
                                call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
                                call_get_func_ip:1, /* Do we call get_func_ip() */
-                               tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
+                               tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
+                               sleepable:1;    /* BPF program is sleepable */
        enum bpf_prog_type      type;           /* Type of BPF program */
        enum bpf_attach_type    expected_attach_type; /* For some prog types */
        u32                     len;            /* Number of filter blocks */
@@ -1763,7 +1770,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                      struct bpf_tramp_link *link,
                                      const struct btf_func_model *model,
                                      void *stub_func,
-                                     void *image, void *image_end);
+                                     void **image, u32 *image_off,
+                                     bool allow_alloc);
+void bpf_struct_ops_image_free(void *image);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
        if (owner == BPF_MODULE_OWNER)
@@ -2103,14 +2112,14 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu,
        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
        item = &array->items[0];
        while ((prog = READ_ONCE(item->prog))) {
-               if (!prog->aux->sleepable)
+               if (!prog->sleepable)
                        rcu_read_lock();
 
                run_ctx.bpf_cookie = item->bpf_cookie;
                ret &= run_prog(prog, ctx);
                item++;
 
-               if (!prog->aux->sleepable)
+               if (!prog->sleepable)
                        rcu_read_unlock();
        }
        bpf_reset_run_ctx(old_run_ctx);
@@ -2210,6 +2219,8 @@ int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+                       unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG_KMEM
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
                           int node);
index 94baced5a1ad6439e9f43f1c52a81f3d784368b4..9f2a6b83b49e144782f036f023f61b7674ad3ba7 100644 (file)
@@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
index 84365e6dd85d5f1ce3303c25c5f27bb58a775fd4..7cb1b75eee381979c0c56d38f871c93c48d352ed 100644 (file)
@@ -449,6 +449,7 @@ struct bpf_verifier_state {
        u32 jmp_history_cnt;
        u32 dfs_depth;
        u32 callback_unroll_depth;
+       u32 may_goto_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)                         \
@@ -547,6 +548,7 @@ struct bpf_insn_aux_data {
        u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
        bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
        bool zext_dst; /* this insn zero extends dst reg */
+       bool needs_zext; /* alu op needs to clear upper bits */
        bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
        bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
        bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
@@ -619,6 +621,7 @@ struct bpf_subprog_info {
        u32 start; /* insn idx of function entry point */
        u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
        u16 stack_depth; /* max. stack depth used by this function */
+       u16 stack_extra;
        bool has_tail_call: 1;
        bool tail_call_reachable: 1;
        bool has_ld_abs: 1;
index 36cc29a2934cb1e299211fb47e6a6fbb6921e713..c99bc3df2d28e35f73b21d62ca50f4a53c515e1b 100644 (file)
@@ -72,6 +72,9 @@ struct ctl_table_header;
 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */
 #define BPF_PROBE_MEMSX        0x40
 
+/* unused opcode to mark special load instruction. Same as BPF_MSH */
+#define BPF_PROBE_MEM32        0xa0
+
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS  0xe0
 
@@ -959,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
+bool bpf_jit_supports_arena(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
index 7304f2a69960a3493d7b218e43ee9a2155df78f2..235ba7d80a8f0d76e9d33b772226ebab40bfe8fd 100644 (file)
@@ -23,12 +23,19 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 #ifdef CONFIG_MMU
 int ioremap_page_range(unsigned long addr, unsigned long end,
                       phys_addr_t phys_addr, pgprot_t prot);
+int vmap_page_range(unsigned long addr, unsigned long end,
+                   phys_addr_t phys_addr, pgprot_t prot);
 #else
 static inline int ioremap_page_range(unsigned long addr, unsigned long end,
                                     phys_addr_t phys_addr, pgprot_t prot)
 {
        return 0;
 }
+static inline int vmap_page_range(unsigned long addr, unsigned long end,
+                                 phys_addr_t phys_addr, pgprot_t prot)
+{
+       return 0;
+}
 #endif
 
 /*
index c720be70c8ddde9cc947c685e64923139c66c3f9..0f72c85a377be9d2c817a989b38da6d7c9c54b36 100644 (file)
@@ -35,6 +35,7 @@ struct iov_iter;              /* in uio.h */
 #else
 #define VM_DEFER_KMEMLEAK      0
 #endif
+#define VM_SPARSE              0x00001000      /* sparse vm_area. not all pages are present. */
 
 /* bits [20..32] reserved for arch specific ioremap internals */
 
@@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr)
 }
 
 #ifdef CONFIG_MMU
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+                     unsigned long end, struct page **pages);
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+                        unsigned long end);
 void vunmap_range(unsigned long addr, unsigned long end);
 static inline void set_vm_flush_reset_perms(void *addr)
 {
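Editor's note on the new VM_SPARSE kind and vm_area_[un]map_pages() declared above: the intended pattern is to reserve a sparse vmalloc range once and populate individual pages on demand. A hedged kernel-side sketch of that usage (mirrors how bpf_arena uses the helpers later in this series; names are mine, not a complete user):

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static struct vm_struct *demo_area;

/* Reserve a sparse range once; nothing is mapped yet. */
static int demo_reserve(unsigned long size)
{
	demo_area = get_vm_area(size, VM_SPARSE);
	return demo_area ? 0 : -ENOMEM;
}

/* Populate one page at byte offset 'off' within the reserved range. */
static int demo_map_one(unsigned long off)
{
	unsigned long addr = (unsigned long)demo_area->addr + off;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return -ENOMEM;
	return vm_area_map_pages(demo_area, addr, addr + PAGE_SIZE, &page);
}
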
index a241f407c23414cbd9bf44be0879462ba37a4f02..3c42b9f1bada3d5bcd7b2608a71bdb277fb1ea6d 100644 (file)
@@ -42,6 +42,7 @@
 #define BPF_JSGE       0x70    /* SGE is signed '>=', GE in x86 */
 #define BPF_JSLT       0xc0    /* SLT is signed, '<' */
 #define BPF_JSLE       0xd0    /* SLE is signed, '<=' */
+#define BPF_JCOND      0xe0    /* conditional pseudo jumps: may_goto, goto_or_nop */
 #define BPF_CALL       0x80    /* function call */
 #define BPF_EXIT       0x90    /* function return */
 
 #define BPF_XCHG       (0xe0 | BPF_FETCH)      /* atomic exchange */
 #define BPF_CMPXCHG    (0xf0 | BPF_FETCH)      /* atomic compare-and-write */
 
+enum bpf_cond_pseudo_jmp {
+       BPF_MAY_GOTO = 0,
+};
+
 /* Register numbers */
 enum {
        BPF_REG_0 = 0,
@@ -1004,6 +1009,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_BLOOM_FILTER,
        BPF_MAP_TYPE_USER_RINGBUF,
        BPF_MAP_TYPE_CGRP_STORAGE,
+       BPF_MAP_TYPE_ARENA,
        __MAX_BPF_MAP_TYPE
 };
 
@@ -1333,6 +1339,10 @@ enum {
  */
 #define BPF_PSEUDO_KFUNC_CALL  2
 
+enum bpf_addr_space_cast {
+       BPF_ADDR_SPACE_CAST = 1,
+};
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
        BPF_ANY         = 0, /* create new element or update existing */
@@ -1391,6 +1401,12 @@ enum {
 
 /* BPF token FD is passed in a corresponding command's token_fd field */
        BPF_F_TOKEN_FD          = (1U << 16),
+
+/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
+       BPF_F_SEGV_ON_FAULT     = (1U << 17),
+
+/* Do not translate kernel bpf_arena pointers to user pointers */
+       BPF_F_NO_USER_CONV      = (1U << 18),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1462,6 +1478,9 @@ union bpf_attr {
                 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
                 * number of hash functions (if 0, the bloom filter will default
                 * to using 5 hash functions).
+                *
+                * BPF_MAP_TYPE_ARENA - contains the address where user space
+                * is going to mmap() the arena. It has to be page aligned.
                 */
                __u64   map_extra;
 
index 4ce95acfcaa728aa85294e56fc707518dc4dba3f..368c5d86b5b7c81da952e5fd5fe3bab06d9d748b 100644 (file)
@@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM}   += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o
+endif
 obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644 (file)
index 0000000..86571e7
--- /dev/null
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000     // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ *   mov eax, eax  // eax has lower 32-bit of user pointer
+ *   mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
+#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
+
+struct bpf_arena {
+       struct bpf_map map;
+       u64 user_vm_start;
+       u64 user_vm_end;
+       struct vm_struct *kern_vm;
+       struct maple_tree mt;
+       struct list_head vma_list;
+       struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+       return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+       return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+       return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+       return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+       return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+       return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+       return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+       return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+       struct vm_struct *kern_vm;
+       int numa_node = bpf_map_attr_numa_node(attr);
+       struct bpf_arena *arena;
+       u64 vm_range;
+       int err = -ENOMEM;
+
+       if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+           /* BPF_F_MMAPABLE must be set */
+           !(attr->map_flags & BPF_F_MMAPABLE) ||
+           /* No unsupported flags present */
+           (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+               return ERR_PTR(-EINVAL);
+
+       if (attr->map_extra & ~PAGE_MASK)
+               /* If non-zero the map_extra is an expected user VMA start address */
+               return ERR_PTR(-EINVAL);
+
+       vm_range = (u64)attr->max_entries * PAGE_SIZE;
+       if (vm_range > (1ull << 32))
+               return ERR_PTR(-E2BIG);
+
+       if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+               /* user vma must not cross 32-bit boundary */
+               return ERR_PTR(-ERANGE);
+
+       kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+       if (!kern_vm)
+               return ERR_PTR(-ENOMEM);
+
+       arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+       if (!arena)
+               goto err;
+
+       arena->kern_vm = kern_vm;
+       arena->user_vm_start = attr->map_extra;
+       if (arena->user_vm_start)
+               arena->user_vm_end = arena->user_vm_start + vm_range;
+
+       INIT_LIST_HEAD(&arena->vma_list);
+       bpf_map_init_from_attr(&arena->map, attr);
+       mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
+       mutex_init(&arena->lock);
+
+       return &arena->map;
+err:
+       free_vm_area(kern_vm);
+       return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+       struct page *page;
+       pte_t pte;
+
+       pte = ptep_get(ptep);
+       if (!pte_present(pte)) /* sanity check */
+               return 0;
+       page = pte_page(pte);
+       /*
+        * We do not update pte here:
+        * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+        * 2. TLB flushing is batched or deferred. Even if we clear pte,
+        * the TLB entries can stick around and continue to permit access to
+        * the freed page. So it all relies on 1.
+        */
+       __free_page(page);
+       return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+       /*
+        * Check that user vma-s are not around when bpf map is freed.
+        * mmap() holds vm_file which holds bpf_map refcnt.
+        * munmap() must have happened on vma followed by arena_vm_close()
+        * which would clear arena->vma_list.
+        */
+       if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+               return;
+
+       /*
+        * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+        * It unmaps everything from vmalloc area and clears pgtables.
+        * Call apply_to_existing_page_range() first to find populated ptes and
+        * free those pages.
+        */
+       apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+                                    KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+       free_vm_area(arena->kern_vm);
+       mtree_destroy(&arena->mt);
+       bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+       return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+                                 void *value, u64 flags)
+{
+       return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+                              const struct btf_type *key_type, const struct btf_type *value_type)
+{
+       return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+       return 0;
+}
+
+struct vma_list {
+       struct vm_area_struct *vma;
+       struct list_head head;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+       struct vma_list *vml;
+
+       vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+       if (!vml)
+               return -ENOMEM;
+       vma->vm_private_data = vml;
+       vml->vma = vma;
+       list_add(&vml->head, &arena->vma_list);
+       return 0;
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+       struct bpf_map *map = vma->vm_file->private_data;
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+       struct vma_list *vml;
+
+       guard(mutex)(&arena->lock);
+       vml = vma->vm_private_data;
+       list_del(&vml->head);
+       vma->vm_private_data = NULL;
+       kfree(vml);
+}
+
+#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+       struct bpf_map *map = vmf->vma->vm_file->private_data;
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+       struct page *page;
+       long kbase, kaddr;
+       int ret;
+
+       kbase = bpf_arena_get_kern_vm_start(arena);
+       kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
+
+       guard(mutex)(&arena->lock);
+       page = vmalloc_to_page((void *)kaddr);
+       if (page)
+               /* already have a page vmap-ed */
+               goto out;
+
+       if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+               /* User space requested to segfault when page is not allocated by bpf prog */
+               return VM_FAULT_SIGSEGV;
+
+       ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
+       if (ret)
+               return VM_FAULT_SIGSEGV;
+
+       /* Account into memcg of the process that created bpf_arena */
+       ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+       if (ret) {
+               mtree_erase(&arena->mt, vmf->pgoff);
+               return VM_FAULT_SIGSEGV;
+       }
+
+       ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+       if (ret) {
+               mtree_erase(&arena->mt, vmf->pgoff);
+               __free_page(page);
+               return VM_FAULT_SIGSEGV;
+       }
+out:
+       page_ref_add(page, 1);
+       vmf->page = page;
+       return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+       .close          = arena_vm_close,
+       .fault          = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+                                            unsigned long len, unsigned long pgoff,
+                                            unsigned long flags)
+{
+       struct bpf_map *map = filp->private_data;
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+       long ret;
+
+       if (pgoff)
+               return -EINVAL;
+       if (len > (1ull << 32))
+               return -E2BIG;
+
+       /* if user_vm_start was specified at arena creation time */
+       if (arena->user_vm_start) {
+               if (len > arena->user_vm_end - arena->user_vm_start)
+                       return -E2BIG;
+               if (len != arena->user_vm_end - arena->user_vm_start)
+                       return -EINVAL;
+               if (addr != arena->user_vm_start)
+                       return -EINVAL;
+       }
+
+       ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+       if (IS_ERR_VALUE(ret))
+               return ret;
+       if ((ret >> 32) == ((ret + len - 1) >> 32))
+               return ret;
+       if (WARN_ON_ONCE(arena->user_vm_start))
+               /* checks at map creation time should prevent this */
+               return -EFAULT;
+       return round_up(ret, 1ull << 32);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+       guard(mutex)(&arena->lock);
+       if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+               /*
+                * If map_extra was not specified at arena creation time then
+                * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+                * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+                *   or
+                * specify addr in map_extra and
+                * use the same addr later with mmap(addr, MAP_FIXED..);
+                */
+               return -EBUSY;
+
+       if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+               /* all user processes must have the same size of mmap-ed region */
+               return -EBUSY;
+
+       /* Earlier checks should prevent this */
+       if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
+               return -EFAULT;
+
+       if (remember_vma(arena, vma))
+               return -ENOMEM;
+
+       arena->user_vm_start = vma->vm_start;
+       arena->user_vm_end = vma->vm_end;
+       /*
+        * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+        * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+        * potential change of user_vm_start.
+        */
+       vm_flags_set(vma, VM_DONTEXPAND);
+       vma->vm_ops = &arena_vm_ops;
+       return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+       if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+               return -ERANGE;
+       *imm = (unsigned long)arena->user_vm_start;
+       return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+       .map_meta_equal = bpf_map_meta_equal,
+       .map_alloc = arena_map_alloc,
+       .map_free = arena_map_free,
+       .map_direct_value_addr = arena_map_direct_value_addr,
+       .map_mmap = arena_map_mmap,
+       .map_get_unmapped_area = arena_get_unmapped_area,
+       .map_get_next_key = arena_map_get_next_key,
+       .map_push_elem = arena_map_push_elem,
+       .map_peek_elem = arena_map_peek_elem,
+       .map_pop_elem = arena_map_pop_elem,
+       .map_lookup_elem = arena_map_lookup_elem,
+       .map_update_elem = arena_map_update_elem,
+       .map_delete_elem = arena_map_delete_elem,
+       .map_check_btf = arena_map_check_btf,
+       .map_mem_usage = arena_map_mem_usage,
+       .map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+       return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+       /* user_vm_end/start are fixed before bpf prog runs */
+       long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+       u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+       struct page **pages;
+       long pgoff = 0;
+       u32 uaddr32;
+       int ret, i;
+
+       if (page_cnt > page_cnt_max)
+               return 0;
+
+       if (uaddr) {
+               if (uaddr & ~PAGE_MASK)
+                       return 0;
+               pgoff = compute_pgoff(arena, uaddr);
+               if (pgoff + page_cnt > page_cnt_max)
+                       /* requested address will be outside of user VMA */
+                       return 0;
+       }
+
+       /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
+       pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+       if (!pages)
+               return 0;
+
+       guard(mutex)(&arena->lock);
+
+       if (uaddr)
+               ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
+                                        MT_ENTRY, GFP_KERNEL);
+       else
+               ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
+                                       page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
+       if (ret)
+               goto out_free_pages;
+
+       ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
+                                 node_id, page_cnt, pages);
+       if (ret)
+               goto out;
+
+       uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+       /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
+       ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+                               kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+       if (ret) {
+               for (i = 0; i < page_cnt; i++)
+                       __free_page(pages[i]);
+               goto out;
+       }
+       kvfree(pages);
+       return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+       mtree_erase(&arena->mt, pgoff);
+out_free_pages:
+       kvfree(pages);
+       return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+       struct vma_list *vml;
+
+       list_for_each_entry(vml, &arena->vma_list, head)
+               zap_page_range_single(vml->vma, uaddr,
+                                     PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+       u64 full_uaddr, uaddr_end;
+       long kaddr, pgoff, i;
+       struct page *page;
+
+       /* only aligned lower 32-bit are relevant */
+       uaddr = (u32)uaddr;
+       uaddr &= PAGE_MASK;
+       full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+       uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+       if (full_uaddr >= uaddr_end)
+               return;
+
+       page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+       guard(mutex)(&arena->lock);
+
+       pgoff = compute_pgoff(arena, uaddr);
+       /* clear range */
+       mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
+
+       if (page_cnt > 1)
+               /* bulk zap if multiple pages being freed */
+               zap_pages(arena, full_uaddr, page_cnt);
+
+       kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+       for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+               page = vmalloc_to_page((void *)kaddr);
+               if (!page)
+                       continue;
+               if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+                       zap_pages(arena, full_uaddr, 1);
+               vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+               __free_page(page);
+       }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+                                       int node_id, u64 flags)
+{
+       struct bpf_map *map = p__map;
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+       if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+               return NULL;
+
+       return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+       struct bpf_map *map = p__map;
+       struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+       if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+               return;
+       arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+       .owner = THIS_MODULE,
+       .set   = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+       return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
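Editor's note on the new arena map above: from user space the arena is created like any other map (key_size and value_size must be 0, max_entries is the size in pages, BPF_F_MMAPABLE is mandatory) and then mmap()ed through the map fd. A hedged, minimal sketch assuming uapi headers that already define BPF_MAP_TYPE_ARENA; error handling is deliberately sparse:

#include <linux/bpf.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	const long page_sz = sysconf(_SC_PAGESIZE);
	const unsigned int nr_pages = 64;
	union bpf_attr attr;
	void *area;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARENA;
	attr.key_size = 0;			/* arena has no keys/values */
	attr.value_size = 0;
	attr.max_entries = nr_pages;		/* arena size in pages */
	attr.map_flags = BPF_F_MMAPABLE;	/* mandatory for arena */
	attr.map_extra = 0;			/* let mmap() pick user_vm_start */

	fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	area = mmap(NULL, nr_pages * page_sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (area == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Faulting in a byte allocates the backing page, unless the map
	 * was created with BPF_F_SEGV_ON_FAULT.
	 */
	((char *)area)[0] = 1;
	printf("arena mapped at %p\n", area);
	return 0;
}
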
index 0fae79164187094d77def3bdc9cd6e6cac99c733..112581cf97e7fe913e550ee2b19adb45da28df00 100644 (file)
@@ -548,7 +548,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
                return -ENOENT;
 
        /* Only allow sleepable program for resched-able iterator */
-       if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+       if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
                return -EINVAL;
 
        link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
@@ -697,7 +697,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
        struct bpf_run_ctx run_ctx, *old_run_ctx;
        int ret;
 
-       if (prog->aux->sleepable) {
+       if (prog->sleepable) {
                rcu_read_lock_trace();
                migrate_disable();
                might_fault();
index a6019087b467c2612ee20f8205a3268cc51beb22..43356faaa0578e8e61fb0b28ea3a4d80a96ccebe 100644 (file)
@@ -18,6 +18,8 @@ struct bpf_struct_ops_value {
        char data[] ____cacheline_aligned_in_smp;
 };
 
+#define MAX_TRAMP_IMAGE_PAGES 8
+
 struct bpf_struct_ops_map {
        struct bpf_map map;
        struct rcu_head rcu;
@@ -30,12 +32,11 @@ struct bpf_struct_ops_map {
         */
        struct bpf_link **links;
        u32 links_cnt;
-       /* image is a page that has all the trampolines
+       u32 image_pages_cnt;
+       /* image_pages is an array of pages that has all the trampolines
         * that stores the func args before calling the bpf_prog.
-        * A PAGE_SIZE "image" is enough to store all trampoline for
-        * "links[]".
         */
-       void *image;
+       void *image_pages[MAX_TRAMP_IMAGE_PAGES];
        /* The owner moduler's btf. */
        struct btf *btf;
        /* uvalue->data stores the kernel struct
@@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
        return true;
 }
 
+static void *bpf_struct_ops_image_alloc(void)
+{
+       void *image;
+       int err;
+
+       err = bpf_jit_charge_modmem(PAGE_SIZE);
+       if (err)
+               return ERR_PTR(err);
+       image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+       if (!image) {
+               bpf_jit_uncharge_modmem(PAGE_SIZE);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       return image;
+}
+
+void bpf_struct_ops_image_free(void *image)
+{
+       if (image) {
+               arch_free_bpf_trampoline(image, PAGE_SIZE);
+               bpf_jit_uncharge_modmem(PAGE_SIZE);
+       }
+}
+
 #define MAYBE_NULL_SUFFIX "__nullable"
 #define MAX_STUB_NAME 128
 
@@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
        }
 }
 
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+       int i;
+
+       for (i = 0; i < st_map->image_pages_cnt; i++)
+               bpf_struct_ops_image_free(st_map->image_pages[i]);
+       st_map->image_pages_cnt = 0;
+}
+
 static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
 {
        const struct btf_member *member;
@@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                      struct bpf_tramp_link *link,
                                      const struct btf_func_model *model,
-                                     void *stub_func, void *image, void *image_end)
+                                     void *stub_func,
+                                     void **_image, u32 *_image_off,
+                                     bool allow_alloc)
 {
-       u32 flags = BPF_TRAMP_F_INDIRECT;
+       u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+       void *image = *_image;
        int size;
 
        tlinks[BPF_TRAMP_FENTRY].links[0] = link;
@@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                flags |= BPF_TRAMP_F_RET_FENTRY_RET;
 
        size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
-       if (size < 0)
-               return size;
-       if (size > (unsigned long)image_end - (unsigned long)image)
-               return -E2BIG;
-       return arch_prepare_bpf_trampoline(NULL, image, image_end,
+       if (size <= 0)
+               return size ? : -EFAULT;
+
+       /* Allocate image buffer if necessary */
+       if (!image || size > PAGE_SIZE - image_off) {
+               if (!allow_alloc)
+                       return -E2BIG;
+
+               image = bpf_struct_ops_image_alloc();
+               if (IS_ERR(image))
+                       return PTR_ERR(image);
+               image_off = 0;
+       }
+
+       size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+                                          image + PAGE_SIZE,
                                           model, flags, tlinks, stub_func);
+       if (size <= 0) {
+               if (image != *_image)
+                       bpf_struct_ops_image_free(image);
+               return size ? : -EFAULT;
+       }
+
+       *_image = image;
+       *_image_off = image_off + size;
+       return 0;
 }
 
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
        struct bpf_tramp_links *tlinks;
        void *udata, *kdata;
        int prog_fd, err;
-       void *image, *image_end;
-       u32 i;
+       u32 i, trampoline_start, image_off = 0;
+       void *cur_image = NULL, *image = NULL;
 
        if (flags)
                return -EINVAL;
@@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
        udata = &uvalue->data;
        kdata = &kvalue->data;
-       image = st_map->image;
-       image_end = st_map->image + PAGE_SIZE;
 
        module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
        for_each_member(i, t, member) {
@@ -658,28 +714,39 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
                              &bpf_struct_ops_link_lops, prog);
                st_map->links[i] = &link->link;
 
+               trampoline_start = image_off;
                err = bpf_struct_ops_prepare_trampoline(tlinks, link,
-                                                       &st_ops->func_models[i],
-                                                       *(void **)(st_ops->cfi_stubs + moff),
-                                                       image, image_end);
+                                               &st_ops->func_models[i],
+                                               *(void **)(st_ops->cfi_stubs + moff),
+                                               &image, &image_off,
+                                               st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+               if (err)
+                       goto reset_unlock;
+
+               if (cur_image != image) {
+                       st_map->image_pages[st_map->image_pages_cnt++] = image;
+                       cur_image = image;
+                       trampoline_start = 0;
+               }
                if (err < 0)
                        goto reset_unlock;
 
-               *(void **)(kdata + moff) = image + cfi_get_offset();
-               image += err;
+               *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
 
                /* put prog_id to udata */
                *(unsigned long *)(udata + moff) = prog->aux->id;
        }
 
+       if (st_ops->validate) {
+               err = st_ops->validate(kdata);
+               if (err)
+                       goto reset_unlock;
+       }
+       for (i = 0; i < st_map->image_pages_cnt; i++)
+               arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+
        if (st_map->map.map_flags & BPF_F_LINK) {
                err = 0;
-               if (st_ops->validate) {
-                       err = st_ops->validate(kdata);
-                       if (err)
-                               goto reset_unlock;
-               }
-               arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
                /* Let bpf_link handle registration & unregistration.
                 *
                 * Pair with smp_load_acquire() during lookup_elem().
@@ -688,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
                goto unlock;
        }
 
-       arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
        err = st_ops->reg(kdata);
        if (likely(!err)) {
                /* This refcnt increment on the map here after
@@ -711,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
         * there was a race in registering the struct_ops (under the same name) to
         * a sub-system through different struct_ops's maps.
         */
-       arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
 
 reset_unlock:
+       bpf_struct_ops_map_free_image(st_map);
        bpf_struct_ops_map_put_progs(st_map);
        memset(uvalue, 0, map->value_size);
        memset(kvalue, 0, map->value_size);
@@ -780,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
        if (st_map->links)
                bpf_struct_ops_map_put_progs(st_map);
        bpf_map_area_free(st_map->links);
-       if (st_map->image) {
-               arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
-               bpf_jit_uncharge_modmem(PAGE_SIZE);
-       }
+       bpf_struct_ops_map_free_image(st_map);
        bpf_map_area_free(st_map->uvalue);
        bpf_map_area_free(st_map);
 }
@@ -893,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
        st_map->st_ops_desc = st_ops_desc;
        map = &st_map->map;
 
-       ret = bpf_jit_charge_modmem(PAGE_SIZE);
-       if (ret)
-               goto errout_free;
-
-       st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-       if (!st_map->image) {
-               /* __bpf_struct_ops_map_free() uses st_map->image as flag
-                * for "charged or not". In this case, we need to unchange
-                * here.
-                */
-               bpf_jit_uncharge_modmem(PAGE_SIZE);
-               ret = -ENOMEM;
-               goto errout_free;
-       }
        st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
        st_map->links_cnt = btf_type_vlen(t);
        st_map->links =
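
The error paths above now funnel through bpf_struct_ops_map_free_image(), whose definition sits earlier in bpf_struct_ops.c and is not part of these hunks. A minimal sketch of what it is expected to do, generalizing the single-page free/uncharge pair that the old code open-coded to the new image_pages[] array; this is an illustration of the intent, not the exact upstream definition:

	static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
	{
		int i;

		/* free every trampoline page accumulated during update_elem() */
		for (i = 0; i < st_map->image_pages_cnt; i++)
			arch_free_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
		bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt);
		st_map->image_pages_cnt = 0;
	}
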
index 6ff0bd1a91d5617fb928e9f6338cb35e0e68b347..90c4a32d89ff36da3df48bd26b71c42a936fe2e4 100644 (file)
@@ -809,9 +809,23 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
        return __btf_name_valid(btf, offset);
 }
 
+/* Allow any printable character in DATASEC names */
 static bool btf_name_valid_section(const struct btf *btf, u32 offset)
 {
-       return __btf_name_valid(btf, offset);
+       /* offset must be valid */
+       const char *src = btf_str_by_offset(btf, offset);
+       const char *src_limit;
+
+       /* set a limit on identifier length */
+       src_limit = src + KSYM_NAME_LEN;
+       src++;
+       while (*src && src < src_limit) {
+               if (!isprint(*src))
+                       return false;
+               src++;
+       }
+
+       return !*src;
 }
 
 static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
@@ -7097,10 +7111,11 @@ cand_cache_unlock:
 }
 
 enum btf_arg_tag {
-       ARG_TAG_CTX = 0x1,
-       ARG_TAG_NONNULL = 0x2,
-       ARG_TAG_TRUSTED = 0x4,
-       ARG_TAG_NULLABLE = 0x8,
+       ARG_TAG_CTX      = BIT_ULL(0),
+       ARG_TAG_NONNULL  = BIT_ULL(1),
+       ARG_TAG_TRUSTED  = BIT_ULL(2),
+       ARG_TAG_NULLABLE = BIT_ULL(3),
+       ARG_TAG_ARENA    = BIT_ULL(4),
 };
 
 /* Process BTF of a function to produce high-level expectation of function
@@ -7212,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
                                tags |= ARG_TAG_NONNULL;
                        } else if (strcmp(tag, "nullable") == 0) {
                                tags |= ARG_TAG_NULLABLE;
+                       } else if (strcmp(tag, "arena") == 0) {
+                               tags |= ARG_TAG_ARENA;
                        } else {
                                bpf_log(log, "arg#%d has unsupported set of tags\n", i);
                                return -EOPNOTSUPP;
@@ -7266,6 +7283,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
                        sub->args[i].btf_id = kern_type_id;
                        continue;
                }
+               if (tags & ARG_TAG_ARENA) {
+                       if (tags & ~ARG_TAG_ARENA) {
+                               bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+                               return -EINVAL;
+                       }
+                       sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+                       continue;
+               }
                if (is_global) { /* generic user data pointer */
                        u32 mem_size;
 
index 71c459a51d9e144b04d343f7de46d2ab8ce01dc7..696bc55de8e82ea9358ede9c222b4927871e60be 100644 (file)
@@ -88,13 +88,18 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
        return NULL;
 }
 
+/* tell bpf programs that include vmlinux.h the kernel's PAGE_SIZE */
+enum page_size_enum {
+       __PAGE_SIZE = PAGE_SIZE
+};
+
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
 {
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog_aux *aux;
        struct bpf_prog *fp;
 
-       size = round_up(size, PAGE_SIZE);
+       size = round_up(size, __PAGE_SIZE);
        fp = __vmalloc(size, gfp_flags);
        if (fp == NULL)
                return NULL;
@@ -888,7 +893,12 @@ static LIST_HEAD(pack_list);
  * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
  */
 #ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+/* PMD_SIZE can be very large on some archs, and it doesn't make sense to
+ * reserve that much memory in a single allocation. Hardcode BPF_PROG_PACK_SIZE
+ * to 2MiB * num_possible_nodes(); on most architectures PMD_SIZE is greater
+ * than or equal to 2MiB anyway.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
 #else
 #define BPF_PROG_PACK_SIZE PAGE_SIZE
 #endif
@@ -1675,6 +1685,7 @@ bool bpf_opcode_in_insntable(u8 code)
                [BPF_LD | BPF_IND | BPF_B] = true,
                [BPF_LD | BPF_IND | BPF_H] = true,
                [BPF_LD | BPF_IND | BPF_W] = true,
+               [BPF_JMP | BPF_JCOND] = true,
        };
 #undef BPF_INSN_3_TBL
 #undef BPF_INSN_2_TBL
@@ -2695,7 +2706,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
        bool sleepable;
        u32 i;
 
-       sleepable = aux->sleepable;
+       sleepable = aux->prog->sleepable;
        for (i = 0; i < len; i++) {
                map = used_maps[i];
                if (map->ops->map_poke_untrack)
@@ -2926,6 +2937,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
        return false;
 }
 
+bool __weak bpf_jit_supports_arena(void)
+{
+       return false;
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
@@ -2970,6 +2986,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
 {
 }
 
+/* weak stubs for configs without an MMU or on 32-bit archs */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+       return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+       return 0;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
 {
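
The page_size_enum above exists only so that PAGE_SIZE lands in vmlinux BTF under the name __PAGE_SIZE; bpf programs that include the generated vmlinux.h can then use it as a compile-time constant instead of hardcoding 4096. A small sketch of the intended use on the bpf side (the variable name is illustrative):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	/* size buffers in whole kernel pages without guessing the page size */
	char scratch[__PAGE_SIZE];
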
index a936c704d4e773c213563e5cdae1091eb3bde879..4e2cdbb5629f22fc1464c5cba296688c45fc8d26 100644 (file)
@@ -130,13 +130,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
        bpf_map_init_from_attr(&dtab->map, attr);
 
        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-               dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
-               if (!dtab->n_buckets) /* Overflow check */
+               /* hash table size must be a power of 2; roundup_pow_of_two()
+                * can overflow into UB on 32-bit arches, so check that first
+                */
+               if (dtab->map.max_entries > 1UL << 31)
                        return -EINVAL;
-       }
 
-       if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+               dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
                dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
                                                           dtab->map.numa_node);
                if (!dtab->dev_index_head)
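
The check above guards the non-constant expansion of roundup_pow_of_two(), which is roughly 1UL << fls_long(n - 1); with max_entries above 2^31 on a 32-bit arch that becomes a shift by the full width of unsigned long, which is undefined behavior. A worked sketch of the failing case, assuming the usual include/linux/log2.h definition:

	/* n = 0x80000001 (just above 2^31), BITS_PER_LONG == 32:
	 *   fls_long(n - 1) = fls_long(0x80000000) = 32
	 *   1UL << 32  ->  UB, shift count equals the width of unsigned long
	 * hence reject max_entries > 1UL << 31 before calling the helper.
	 * The same reasoning applies to the hashtab and stackmap hunks below.
	 */
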
index 49940c26a227496937c0bad4e8cc0fa2e89825b1..bd2e2dd04740c1810ba559b67ac9ba69092dc11a 100644 (file)
@@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn)
               (insn->off == 8 || insn->off == 16 || insn->off == 32);
 }
 
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+       return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+               insn->off == BPF_ADDR_SPACE_CAST;
+}
+
 void print_bpf_insn(const struct bpf_insn_cbs *cbs,
                    const struct bpf_insn *insn,
                    bool allow_ptr_leaks)
@@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
                                insn->code, class == BPF_ALU ? 'w' : 'r',
                                insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
                                insn->dst_reg);
+               } else if (is_addr_space_cast(insn)) {
+                       verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+                               insn->code, insn->dst_reg,
+                               insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
                } else if (BPF_SRC(insn->code) == BPF_X) {
                        verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
                                insn->code, class == BPF_ALU ? 'w' : 'r',
@@ -322,6 +332,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
                } else if (insn->code == (BPF_JMP | BPF_JA)) {
                        verbose(cbs->private_data, "(%02x) goto pc%+d\n",
                                insn->code, insn->off);
+               } else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+                          insn->src_reg == BPF_MAY_GOTO) {
+                       verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+                               insn->code, insn->off);
                } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
                        verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
                                insn->code, insn->imm);
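
For reference, with the formats above the two new instructions show up in the verifier log roughly as follows; the opcode bytes are derived from BPF_ALU64|BPF_MOV|BPF_X = 0xbf and BPF_JMP|BPF_JCOND = 0xe5, and the register numbers and offsets are illustrative:

	(bf) r2 = addr_space_cast(r2, 0, 1)	/* imm == 1: arena as(1) -> kernel as(0) */
	(bf) r2 = addr_space_cast(r2, 1, 0)	/* imm == 1 << 16: as(0) -> as(1) */
	(e5) may_goto pc+5
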
index 03a6a2500b6aba26772bd4cdc1ebd8401379a930..3a088a5349bc0e2f4b06f8d4e44ad81ad00ff110 100644 (file)
@@ -499,7 +499,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
                                                          num_possible_cpus());
        }
 
-       /* hash table size must be power of 2 */
+       /* hash table size must be a power of 2; roundup_pow_of_two() can
+        * overflow into UB on 32-bit arches, so check that first
+        */
+       err = -E2BIG;
+       if (htab->map.max_entries > 1UL << 31)
+               goto free_htab;
+
        htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 
        htab->elem_size = sizeof(struct htab_elem) +
@@ -509,10 +515,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
        else
                htab->elem_size += round_up(htab->map.value_size, 8);
 
-       err = -E2BIG;
-       /* prevent zero size kmalloc and check for u32 overflow */
-       if (htab->n_buckets == 0 ||
-           htab->n_buckets > U32_MAX / sizeof(struct bucket))
+       /* check for u32 overflow */
+       if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
                goto free_htab;
 
        err = bpf_map_init_elem_count(&htab->map);
index 63c34e7b07155488784d2e738ea1f7d203241d4f..2a243cf37c60b1dbca97817dcde61c68af9f5cd5 100644 (file)
@@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
                [PTR_TO_XDP_SOCK]       = "xdp_sock",
                [PTR_TO_BTF_ID]         = "ptr_",
                [PTR_TO_MEM]            = "mem",
+               [PTR_TO_ARENA]          = "arena",
                [PTR_TO_BUF]            = "buf",
                [PTR_TO_FUNC]           = "func",
                [PTR_TO_MAP_KEY]        = "map_key",
@@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
        }
 
        verbose(env, "%s", reg_type_str(env, t));
+       if (t == PTR_TO_ARENA)
+               return;
        if (t == PTR_TO_STACK) {
                if (state->frameno != reg->frameno)
                        verbose(env, "[%d]", reg->frameno);
index dff7ba5397015520d8163d62bb4393ff9d224402..c99f8e5234ac484a48506249b392d520aefe5a85 100644 (file)
@@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        } else if (value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);
 
-       /* hash table size must be power of 2 */
-       n_buckets = roundup_pow_of_two(attr->max_entries);
-       if (!n_buckets)
+       /* hash table size must be a power of 2; roundup_pow_of_two() can
+        * overflow into UB on 32-bit arches, so check that first
+        */
+       if (attr->max_entries > 1UL << 31)
                return ERR_PTR(-E2BIG);
 
+       n_buckets = roundup_pow_of_two(attr->max_entries);
+
        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
        if (!smap)
index b2750b79ac8050c448533a23aed14a9c2aa1939b..ae2ff73bde7e79aa905f060f753bfc8261451972 100644 (file)
@@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
        if (bpf_map_is_offloaded(map)) {
                return bpf_map_offload_update_elem(map, key, value, flags);
        } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+                  map->map_type == BPF_MAP_TYPE_ARENA ||
                   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
                return map->ops->map_update_elem(map, key, value, flags);
        } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
@@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+                       unsigned long nr_pages, struct page **pages)
+{
+       unsigned long i, j;
+       struct page *pg;
+       int ret = 0;
+#ifdef CONFIG_MEMCG_KMEM
+       struct mem_cgroup *memcg, *old_memcg;
+
+       memcg = bpf_map_get_memcg(map);
+       old_memcg = set_active_memcg(memcg);
+#endif
+       for (i = 0; i < nr_pages; i++) {
+               pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+
+               if (pg) {
+                       pages[i] = pg;
+                       continue;
+               }
+               for (j = 0; j < i; j++)
+                       __free_page(pages[j]);
+               ret = -ENOMEM;
+               break;
+       }
+
+#ifdef CONFIG_MEMCG_KMEM
+       set_active_memcg(old_memcg);
+       mem_cgroup_put(memcg);
+#endif
+       return ret;
+}
+
+
 static int btf_field_cmp(const void *a, const void *b)
 {
        const struct btf_field *f1 = a, *f2 = b;
@@ -937,6 +971,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
        return EPOLLERR;
 }
 
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+                                          unsigned long len, unsigned long pgoff,
+                                          unsigned long flags)
+{
+       struct bpf_map *map = filp->private_data;
+
+       if (map->ops->map_get_unmapped_area)
+               return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+       return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+#else
+       return addr;
+#endif
+}
+
 const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_map_show_fdinfo,
@@ -946,6 +995,7 @@ const struct file_operations bpf_map_fops = {
        .write          = bpf_dummy_write,
        .mmap           = bpf_map_mmap,
        .poll           = bpf_map_poll,
+       .get_unmapped_area = bpf_get_unmapped_area,
 };
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -1160,6 +1210,7 @@ static int map_create(union bpf_attr *attr)
        }
 
        if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+           attr->map_type != BPF_MAP_TYPE_ARENA &&
            attr->map_extra != 0)
                return -EINVAL;
 
@@ -1249,6 +1300,7 @@ static int map_create(union bpf_attr *attr)
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
        case BPF_MAP_TYPE_STRUCT_OPS:
        case BPF_MAP_TYPE_CPUMAP:
+       case BPF_MAP_TYPE_ARENA:
                if (!bpf_token_capable(token, CAP_BPF))
                        goto put_token;
                break;
@@ -2196,7 +2248,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
                btf_put(prog->aux->attach_btf);
 
        if (deferred) {
-               if (prog->aux->sleepable)
+               if (prog->sleepable)
                        call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
                else
                        call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -2761,11 +2813,11 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
        }
 
        prog->expected_attach_type = attr->expected_attach_type;
+       prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
        prog->aux->attach_btf = attach_btf;
        prog->aux->attach_btf_id = attr->attach_btf_id;
        prog->aux->dst_prog = dst_prog;
        prog->aux->dev_bound = !!attr->prog_ifindex;
-       prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
        prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
        /* move token into prog->aux, reuse taken refcnt */
@@ -4401,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
                        continue;
                }
 
+               if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+                    BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+                       insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+                       continue;
+               }
+
                if (code != (BPF_LD | BPF_IMM | BPF_DW))
                        continue;
 
@@ -5496,7 +5554,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
        /* The bpf program will not access the bpf map, but for the sake of
         * simplicity, increase sleepable_refcnt for sleepable program as well.
         */
-       if (prog->aux->sleepable)
+       if (prog->sleepable)
                atomic64_inc(&map->sleepable_refcnt);
        memcpy(used_maps_new, used_maps_old,
               sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
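
With map_create() and bpf_map_update_value() now accepting BPF_MAP_TYPE_ARENA, a program can declare the arena like any other map. A sketch of the declaration, following the bpf_helpers.h declaration style and the conventions used by the selftests in this series; assumptions here are that max_entries counts the pages backing the arena and that user space either presets the address via map_extra or mmap()s the map fd later:

	struct {
		__uint(type, BPF_MAP_TYPE_ARENA);
		__uint(map_flags, BPF_F_MMAPABLE);
		__uint(max_entries, 10);	/* number of pages backing the arena */
	} arena SEC(".maps");
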
index d382f5ebe06c8f02be96679d2277508e0af0644f..db7599c59c78a66f1b85ef969ba64be084edd181 100644 (file)
@@ -1014,7 +1014,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
 
 bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
 {
-       bool sleepable = prog->aux->sleepable;
+       bool sleepable = prog->sleepable;
 
        if (bpf_prog_check_recur(prog))
                return sleepable ? __bpf_prog_enter_sleepable_recur :
@@ -1029,7 +1029,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
 
 bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
 {
-       bool sleepable = prog->aux->sleepable;
+       bool sleepable = prog->sleepable;
 
        if (bpf_prog_check_recur(prog))
                return sleepable ? __bpf_prog_exit_sleepable_recur :
index 21f80383f8b215b72df159fcdce0e6073d58c25f..63749ad5ac6b8d63f108b92690897f032c7eacb6 100644 (file)
@@ -533,6 +533,16 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn)
        return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
 }
 
+static bool is_may_goto_insn(struct bpf_insn *insn)
+{
+       return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+       return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
 static bool is_storage_get_function(enum bpf_func_id func_id)
 {
        return func_id == BPF_FUNC_sk_storage_get ||
@@ -1429,6 +1439,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
        dst_state->dfs_depth = src->dfs_depth;
        dst_state->callback_unroll_depth = src->callback_unroll_depth;
        dst_state->used_as_loop_entry = src->used_as_loop_entry;
+       dst_state->may_goto_depth = src->may_goto_depth;
        for (i = 0; i <= src->curframe; i++) {
                dst = dst_state->frame[i];
                if (!dst) {
@@ -4375,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
        case PTR_TO_MEM:
        case PTR_TO_FUNC:
        case PTR_TO_MAP_KEY:
+       case PTR_TO_ARENA:
                return true;
        default:
                return false;
@@ -5262,7 +5274,7 @@ bad_type:
 
 static bool in_sleepable(struct bpf_verifier_env *env)
 {
-       return env->prog->aux->sleepable;
+       return env->prog->sleepable;
 }
 
 /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -5817,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
        case PTR_TO_XDP_SOCK:
                pointer_desc = "xdp_sock ";
                break;
+       case PTR_TO_ARENA:
+               return 0;
        default:
                break;
        }
@@ -6926,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
                if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
                        mark_reg_unknown(env, regs, value_regno);
+       } else if (reg->type == PTR_TO_ARENA) {
+               if (t == BPF_READ && value_regno >= 0)
+                       mark_reg_unknown(env, regs, value_regno);
        } else {
                verbose(env, "R%d invalid mem access '%s'\n", regno,
                        reg_type_str(env, reg->type));
@@ -8397,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
        case PTR_TO_MEM | MEM_RINGBUF:
        case PTR_TO_BUF:
        case PTR_TO_BUF | MEM_RDONLY:
+       case PTR_TO_ARENA:
        case SCALAR_VALUE:
                return 0;
        /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -9361,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
                                bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
                                return -EINVAL;
                        }
+               } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+                       /*
+                        * Any value can be passed and the kernel won't crash,
+                        * but only PTR_TO_ARENA or SCALAR makes sense here.
+                        * Anything else is a bug in the bpf program, so point
+                        * it out to the user at verification time instead of
+                        * leaving a run-time debugging nightmare.
+                        */
+                       if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+                               bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+                               return -EINVAL;
+                       }
                } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
                        ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
                        if (ret)
@@ -10741,6 +10771,11 @@ static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *a
        return btf_param_match_suffix(btf, arg, "__ign");
 }
 
+static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
+{
+       return btf_param_match_suffix(btf, arg, "__map");
+}
+
 static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
 {
        return btf_param_match_suffix(btf, arg, "__alloc");
@@ -10910,6 +10945,7 @@ enum kfunc_ptr_arg_type {
        KF_ARG_PTR_TO_RB_NODE,
        KF_ARG_PTR_TO_NULL,
        KF_ARG_PTR_TO_CONST_STR,
+       KF_ARG_PTR_TO_MAP,
 };
 
 enum special_kfunc_type {
@@ -11063,6 +11099,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
        if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_CONST_STR;
 
+       if (is_kfunc_arg_map(meta->btf, &args[argno]))
+               return KF_ARG_PTR_TO_MAP;
+
        if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
                if (!btf_type_is_struct(ref_t)) {
                        verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11663,6 +11702,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
                switch (kf_arg_type) {
                case KF_ARG_PTR_TO_NULL:
                        continue;
+               case KF_ARG_PTR_TO_MAP:
                case KF_ARG_PTR_TO_ALLOC_BTF_ID:
                case KF_ARG_PTR_TO_BTF_ID:
                        if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
@@ -11879,6 +11919,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
                        if (ret < 0)
                                return ret;
                        break;
+               case KF_ARG_PTR_TO_MAP:
+                       /* If argument has '__map' suffix expect 'struct bpf_map *' */
+                       ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
+                       ref_t = btf_type_by_id(btf_vmlinux, ref_id);
+                       ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+                       fallthrough;
                case KF_ARG_PTR_TO_BTF_ID:
                        /* Only base_type is checked, further checks are done here */
                        if ((base_type(reg->type) != PTR_TO_BTF_ID ||
@@ -12353,6 +12399,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                                        meta.func_name);
                                return -EFAULT;
                        }
+               } else if (btf_type_is_void(ptr_type)) {
+                       /* kfunc returning 'void *' is equivalent to returning scalar */
+                       mark_reg_unknown(env, regs, BPF_REG_0);
                } else if (!__btf_type_is_struct(ptr_type)) {
                        if (!meta.r0_size) {
                                __u32 sz;
@@ -13822,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
        dst_reg = &regs[insn->dst_reg];
        src_reg = NULL;
+
+       if (dst_reg->type == PTR_TO_ARENA) {
+               struct bpf_insn_aux_data *aux = cur_aux(env);
+
+               if (BPF_CLASS(insn->code) == BPF_ALU64)
+                       /*
+                        * 32-bit operations zero the upper bits automatically;
+                        * 64-bit operations need to be converted to 32-bit.
+                        */
+                       aux->needs_zext = true;
+
+               /* Any arithmetic operations are allowed on arena pointers */
+               return 0;
+       }
+
        if (dst_reg->type != SCALAR_VALUE)
                ptr_reg = dst_reg;
        else
@@ -13939,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
        } else if (opcode == BPF_MOV) {
 
                if (BPF_SRC(insn->code) == BPF_X) {
-                       if (insn->imm != 0) {
-                               verbose(env, "BPF_MOV uses reserved fields\n");
-                               return -EINVAL;
-                       }
-
                        if (BPF_CLASS(insn->code) == BPF_ALU) {
-                               if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+                               if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+                                   insn->imm) {
                                        verbose(env, "BPF_MOV uses reserved fields\n");
                                        return -EINVAL;
                                }
+                       } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+                               if (insn->imm != 1 && insn->imm != 1u << 16) {
+                                       verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+                                       return -EINVAL;
+                               }
                        } else {
-                               if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
-                                   insn->off != 32) {
+                               if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+                                    insn->off != 32) || insn->imm) {
                                        verbose(env, "BPF_MOV uses reserved fields\n");
                                        return -EINVAL;
                                }
@@ -13978,7 +14043,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
 
                        if (BPF_CLASS(insn->code) == BPF_ALU64) {
-                               if (insn->off == 0) {
+                               if (insn->imm) {
+                                       /* off == BPF_ADDR_SPACE_CAST */
+                                       mark_reg_unknown(env, regs, insn->dst_reg);
+                                       if (insn->imm == 1) /* cast from as(1) to as(0) */
+                                               dst_reg->type = PTR_TO_ARENA;
+                               } else if (insn->off == 0) {
                                        /* case: R1 = R2
                                         * copy register state to dest reg
                                         */
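
On the C side these casts are normally not written by hand: a recent LLVM emits the addr_space_cast instruction when a pointer is converted between the default address space and address space 1. A sketch under that assumption; the __arena shorthand and function are illustrative, only the verifier behavior (imm == 1 yields PTR_TO_ARENA) comes from the hunks above:

	#define __arena __attribute__((address_space(1)))

	static void touch(int __arena *p)
	{
		int *q = (int *)p;	/* compiler emits addr_space_cast from as(1) to as(0) */

		if (q)
			*q = 1;		/* verifier tracks q as PTR_TO_ARENA */
	}
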
@@ -14871,11 +14941,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
        int err;
 
        /* Only conditional jumps are expected to reach here. */
-       if (opcode == BPF_JA || opcode > BPF_JSLE) {
+       if (opcode == BPF_JA || opcode > BPF_JCOND) {
                verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
                return -EINVAL;
        }
 
+       if (opcode == BPF_JCOND) {
+               struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+               int idx = *insn_idx;
+
+               if (insn->code != (BPF_JMP | BPF_JCOND) ||
+                   insn->src_reg != BPF_MAY_GOTO ||
+                   insn->dst_reg || insn->imm || insn->off == 0) {
+                       verbose(env, "invalid may_goto off %d imm %d\n",
+                               insn->off, insn->imm);
+                       return -EINVAL;
+               }
+               prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+               /* branch out 'fallthrough' insn as a new state to explore */
+               queued_st = push_stack(env, idx + 1, idx, false);
+               if (!queued_st)
+                       return -ENOMEM;
+
+               queued_st->may_goto_depth++;
+               if (prev_st)
+                       widen_imprecise_scalars(env, prev_st, queued_st);
+               *insn_idx += insn->off;
+               return 0;
+       }
+
        /* check src2 operand */
        err = check_reg_arg(env, insn->dst_reg, SRC_OP);
        if (err)
@@ -15127,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
        if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
            insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+               if (map->map_type == BPF_MAP_TYPE_ARENA) {
+                       __mark_reg_unknown(env, dst_reg);
+                       return 0;
+               }
                dst_reg->type = PTR_TO_MAP_VALUE;
                dst_reg->off = aux->map_off;
                WARN_ON_ONCE(map->max_entries != 1);
@@ -15659,6 +15758,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
        default:
                /* conditional jump with two edges */
                mark_prune_point(env, t);
+               if (is_may_goto_insn(insn))
+                       mark_force_checkpoint(env, t);
 
                ret = push_insn(t, t + 1, FALLTHROUGH, env);
                if (ret)
@@ -16222,8 +16323,8 @@ static int check_btf_info(struct bpf_verifier_env *env,
 }
 
 /* check %cur's range satisfies %old's */
-static bool range_within(struct bpf_reg_state *old,
-                        struct bpf_reg_state *cur)
+static bool range_within(const struct bpf_reg_state *old,
+                        const struct bpf_reg_state *cur)
 {
        return old->umin_value <= cur->umin_value &&
               old->umax_value >= cur->umax_value &&
@@ -16387,21 +16488,28 @@ static bool regs_exact(const struct bpf_reg_state *rold,
               check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
 }
 
+enum exact_level {
+       NOT_EXACT,
+       EXACT,
+       RANGE_WITHIN
+};
+
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
-                   struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
+                   struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+                   enum exact_level exact)
 {
-       if (exact)
+       if (exact == EXACT)
                return regs_exact(rold, rcur, idmap);
 
-       if (!(rold->live & REG_LIVE_READ))
+       if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
                /* explored state didn't use this */
                return true;
-       if (rold->type == NOT_INIT)
-               /* explored state can't have used this */
-               return true;
-       if (rcur->type == NOT_INIT)
-               return false;
+       if (rold->type == NOT_INIT) {
+               if (exact == NOT_EXACT || rcur->type == NOT_INIT)
+                       /* explored state can't have used this */
+                       return true;
+       }
 
        /* Enforce that register types have to match exactly, including their
         * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
@@ -16436,7 +16544,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                        return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
                               check_scalar_ids(rold->id, rcur->id, idmap);
                }
-               if (!rold->precise)
+               if (!rold->precise && exact == NOT_EXACT)
                        return true;
                /* Why check_ids() for scalar registers?
                 *
@@ -16504,6 +16612,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                 * the same stack frame, since fp-8 in foo != fp-8 in bar
                 */
                return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+       case PTR_TO_ARENA:
+               return true;
        default:
                return regs_exact(rold, rcur, idmap);
        }
@@ -16547,7 +16657,8 @@ static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
 }
 
 static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
-                     struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
+                     struct bpf_func_state *cur, struct bpf_idmap *idmap,
+                     enum exact_level exact)
 {
        int i, spi;
 
@@ -16560,12 +16671,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 
                spi = i / BPF_REG_SIZE;
 
-               if (exact &&
+               if (exact != NOT_EXACT &&
                    old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
                    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
                        return false;
 
-               if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
+               if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
+                   && exact == NOT_EXACT) {
                        i += BPF_REG_SIZE - 1;
                        /* explored state didn't use this */
                        continue;
@@ -16711,7 +16823,7 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
  * the current state will reach 'bpf_exit' instruction safely
  */
 static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
-                             struct bpf_func_state *cur, bool exact)
+                             struct bpf_func_state *cur, enum exact_level exact)
 {
        int i;
 
@@ -16741,7 +16853,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env)
 static bool states_equal(struct bpf_verifier_env *env,
                         struct bpf_verifier_state *old,
                         struct bpf_verifier_state *cur,
-                        bool exact)
+                        enum exact_level exact)
 {
        int i;
 
@@ -17115,7 +17227,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
                         * => unsafe memory access at 11 would not be caught.
                         */
                        if (is_iter_next_insn(env, insn_idx)) {
-                               if (states_equal(env, &sl->state, cur, true)) {
+                               if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
                                        struct bpf_func_state *cur_frame;
                                        struct bpf_reg_state *iter_state, *iter_reg;
                                        int spi;
@@ -17138,15 +17250,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
                                }
                                goto skip_inf_loop_check;
                        }
+                       if (is_may_goto_insn_at(env, insn_idx)) {
+                               if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+                                       update_loop_entry(cur, &sl->state);
+                                       goto hit;
+                               }
+                               goto skip_inf_loop_check;
+                       }
                        if (calls_callback(env, insn_idx)) {
-                               if (states_equal(env, &sl->state, cur, true))
+                               if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
                                        goto hit;
                                goto skip_inf_loop_check;
                        }
                        /* attempt to detect infinite loop to avoid unnecessary doomed work */
                        if (states_maybe_looping(&sl->state, cur) &&
-                           states_equal(env, &sl->state, cur, true) &&
+                           states_equal(env, &sl->state, cur, EXACT) &&
                            !iter_active_depths_differ(&sl->state, cur) &&
+                           sl->state.may_goto_depth == cur->may_goto_depth &&
                            sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
                                verbose_linfo(env, insn_idx, "; ");
                                verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -17202,7 +17322,7 @@ skip_inf_loop_check:
                 */
                loop_entry = get_loop_entry(&sl->state);
                force_exact = loop_entry && loop_entry->branches > 0;
-               if (states_equal(env, &sl->state, cur, force_exact)) {
+               if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
                        if (force_exact)
                                update_loop_entry(cur, loop_entry);
 hit:
@@ -17372,6 +17492,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
        case PTR_TO_TCP_SOCK:
        case PTR_TO_XDP_SOCK:
        case PTR_TO_BTF_ID:
+       case PTR_TO_ARENA:
                return false;
        default:
                return true;
@@ -18019,7 +18140,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                return -EINVAL;
        }
 
-       if (prog->aux->sleepable)
+       if (prog->sleepable)
                switch (map->map_type) {
                case BPF_MAP_TYPE_HASH:
                case BPF_MAP_TYPE_LRU_HASH:
@@ -18037,6 +18158,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                case BPF_MAP_TYPE_CGRP_STORAGE:
                case BPF_MAP_TYPE_QUEUE:
                case BPF_MAP_TYPE_STACK:
+               case BPF_MAP_TYPE_ARENA:
                        break;
                default:
                        verbose(env,
@@ -18206,7 +18328,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
                                return -E2BIG;
                        }
 
-                       if (env->prog->aux->sleepable)
+                       if (env->prog->sleepable)
                                atomic64_inc(&map->sleepable_refcnt);
                        /* hold the map. If the program is rejected by verifier,
                         * the map will be released by release_maps() or it
@@ -18224,6 +18346,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
                                fdput(f);
                                return -EBUSY;
                        }
+                       if (map->map_type == BPF_MAP_TYPE_ARENA) {
+                               if (env->prog->aux->arena) {
+                                       verbose(env, "Only one arena per program\n");
+                                       fdput(f);
+                                       return -EBUSY;
+                               }
+                               if (!env->allow_ptr_leaks || !env->bpf_capable) {
+                                       verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+                                       fdput(f);
+                                       return -EPERM;
+                               }
+                               if (!env->prog->jit_requested) {
+                                       verbose(env, "JIT is required to use arena\n");
+                                       return -EOPNOTSUPP;
+                               }
+                               if (!bpf_jit_supports_arena()) {
+                                       verbose(env, "JIT doesn't support arena\n");
+                                       return -EOPNOTSUPP;
+                               }
+                               env->prog->aux->arena = (void *)map;
+                               if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+                                       verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+                                       return -EINVAL;
+                               }
+                       }
 
                        fdput(f);
 next_insn:
@@ -18845,6 +18992,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                                env->prog->aux->num_exentries++;
                        }
                        continue;
+               case PTR_TO_ARENA:
+                       if (BPF_MODE(insn->code) == BPF_MEMSX) {
+                               verbose(env, "sign extending loads from arena are not supported yet\n");
+                               return -EOPNOTSUPP;
+                       }
+                       insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+                       env->prog->aux->num_exentries++;
+                       continue;
                default:
                        continue;
                }
@@ -19030,13 +19185,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                func[i]->aux->nr_linfo = prog->aux->nr_linfo;
                func[i]->aux->jited_linfo = prog->aux->jited_linfo;
                func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+               func[i]->aux->arena = prog->aux->arena;
                num_exentries = 0;
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
                        if (BPF_CLASS(insn->code) == BPF_LDX &&
                            (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+                            BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
                             BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
                                num_exentries++;
+                       if ((BPF_CLASS(insn->code) == BPF_STX ||
+                            BPF_CLASS(insn->code) == BPF_ST) &&
+                            BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+                               num_exentries++;
                }
                func[i]->aux->num_exentries = num_exentries;
                func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
@@ -19411,7 +19572,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
        struct bpf_insn insn_buf[16];
        struct bpf_prog *new_prog;
        struct bpf_map *map_ptr;
-       int i, ret, cnt, delta = 0;
+       int i, ret, cnt, delta = 0, cur_subprog = 0;
+       struct bpf_subprog_info *subprogs = env->subprog_info;
+       u16 stack_depth = subprogs[cur_subprog].stack_depth;
+       u16 stack_depth_extra = 0;
 
        if (env->seen_exception && !env->exception_callback_subprog) {
                struct bpf_insn patch[] = {
@@ -19431,7 +19595,22 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                mark_subprog_exc_cb(env, env->exception_callback_subprog);
        }
 
-       for (i = 0; i < insn_cnt; i++, insn++) {
+       for (i = 0; i < insn_cnt;) {
+               if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+                       if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+                           (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+                               /* convert to 32-bit mov that clears the upper 32 bits */
+                               insn->code = BPF_ALU | BPF_MOV | BPF_X;
+                               /* clear off, so it's a normal 'wX = wY' from JIT pov */
+                               insn->off = 0;
+                       } /* cast from as(0) to as(1) should be handled by JIT */
+                       goto next_insn;
+               }
+
+               if (env->insn_aux_data[i + delta].needs_zext)
+                       /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+                       insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
                /* Make divide-by-zero exceptions impossible. */
                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
@@ -19470,7 +19649,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
@@ -19490,7 +19669,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Rewrite pointer arithmetic to mitigate speculation attacks. */
@@ -19505,7 +19684,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        aux = &env->insn_aux_data[i + delta];
                        if (!aux->alu_state ||
                            aux->alu_state == BPF_ALU_NON_POINTER)
-                               continue;
+                               goto next_insn;
 
                        isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
                        issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
@@ -19543,19 +19722,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
+               }
+
+               if (is_may_goto_insn(insn)) {
+                       int stack_off = -stack_depth - 8;
+
+                       stack_depth_extra = 8;
+                       insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+                       insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+                       insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+                       insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+                       cnt = 4;
+
+                       new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+                       if (!new_prog)
+                               return -ENOMEM;
+
+                       delta += cnt - 1;
+                       env->prog = prog = new_prog;
+                       insn = new_prog->insnsi + i + delta;
+                       goto next_insn;
                }
 
                if (insn->code != (BPF_JMP | BPF_CALL))
-                       continue;
+                       goto next_insn;
                if (insn->src_reg == BPF_PSEUDO_CALL)
-                       continue;
+                       goto next_insn;
                if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
                        ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
                        if (ret)
                                return ret;
                        if (cnt == 0)
-                               continue;
+                               goto next_insn;
 
                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
@@ -19564,7 +19763,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                if (insn->imm == BPF_FUNC_get_route_realm)
@@ -19612,11 +19811,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                                }
 
                                insn->imm = ret + 1;
-                               continue;
+                               goto next_insn;
                        }
 
                        if (!bpf_map_ptr_unpriv(aux))
-                               continue;
+                               goto next_insn;
 
                        /* instead of changing every JIT dealing with tail_call
                         * emit two extra insns:
@@ -19645,7 +19844,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                if (insn->imm == BPF_FUNC_timer_set_callback) {
@@ -19757,7 +19956,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
                                delta    += cnt - 1;
                                env->prog = prog = new_prog;
                                insn      = new_prog->insnsi + i + delta;
-                               continue;
+                               goto next_insn;
                        }
 
                        BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
@@ -19788,31 +19987,31 @@ patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_update_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_update_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_delete_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_push_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_push_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_pop_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_peek_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_redirect_map:
                                insn->imm = BPF_CALL_IMM(ops->map_redirect);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_for_each_map_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
-                               continue;
+                               goto next_insn;
                        case BPF_FUNC_map_lookup_percpu_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
-                               continue;
+                               goto next_insn;
                        }
 
                        goto patch_call_imm;
@@ -19840,7 +20039,7 @@ patch_map_ops_generic:
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement bpf_get_func_arg inline. */
@@ -19865,7 +20064,7 @@ patch_map_ops_generic:
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement bpf_get_func_ret inline. */
@@ -19893,7 +20092,7 @@ patch_map_ops_generic:
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement get_func_arg_cnt inline. */
@@ -19908,7 +20107,7 @@ patch_map_ops_generic:
 
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement bpf_get_func_ip inline. */
@@ -19923,7 +20122,7 @@ patch_map_ops_generic:
 
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 
                /* Implement bpf_kptr_xchg inline */
@@ -19941,7 +20140,7 @@ patch_map_ops_generic:
                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
-                       continue;
+                       goto next_insn;
                }
 patch_call_imm:
                fn = env->ops->get_func_proto(insn->imm, env->prog);
@@ -19955,6 +20154,40 @@ patch_call_imm:
                        return -EFAULT;
                }
                insn->imm = fn->func - __bpf_call_base;
+next_insn:
+               if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+                       subprogs[cur_subprog].stack_depth += stack_depth_extra;
+                       subprogs[cur_subprog].stack_extra = stack_depth_extra;
+                       cur_subprog++;
+                       stack_depth = subprogs[cur_subprog].stack_depth;
+                       stack_depth_extra = 0;
+               }
+               i++;
+               insn++;
+       }
+
+       env->prog->aux->stack_depth = subprogs[0].stack_depth;
+       for (i = 0; i < env->subprog_cnt; i++) {
+               int subprog_start = subprogs[i].start;
+               int stack_slots = subprogs[i].stack_extra / 8;
+
+               if (!stack_slots)
+                       continue;
+               if (stack_slots > 1) {
+                       verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+                       return -EFAULT;
+               }
+
+               /* Add ST insn to subprog prologue to init extra stack */
+               insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+                                        -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+               /* Copy first actual insn to preserve it */
+               insn_buf[1] = env->prog->insnsi[subprog_start];
+
+               new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+               if (!new_prog)
+                       return -ENOMEM;
+               env->prog = prog = new_prog;
        }
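
For readers tracking the may_goto plumbing: the loop above accounts one extra 8-byte stack slot per subprogram (stack_extra), and the ST insn added to the prologue seeds that slot with BPF_MAX_LOOPS. Roughly, every may_goto site then behaves like the C sketch below after fixup; the real rewrite operates on raw BPF instructions against a frame-pointer-relative slot, so names and structure here are purely illustrative.

	/* illustration only, not the emitted instruction sequence */
	static int may_goto_sketch(void)
	{
		unsigned long long budget = 8 * 1024 * 1024;	/* BPF_MAX_LOOPS, seeded by the injected ST_MEM */
		int work = 0;

		for (;;) {
			/* "may_goto exit": once the budget is gone, take the jump */
			if (budget == 0)
				goto exit;
			budget--;

			work++;		/* the loop body the program actually wanted to run */
		}
	exit:
		return work;
	}
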
 
        /* Since poke tab is now finalized, publish aux to tracker. */
@@ -20230,6 +20463,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
                                reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
                                reg->btf_id = arg->btf_id;
                                reg->id = ++env->id_gen;
+                       } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+                               /* caller can pass either PTR_TO_ARENA or SCALAR */
+                               mark_reg_unknown(env, regs, i);
                        } else {
                                WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
                                          i - BPF_REG_1, arg->arg_type);
@@ -20705,7 +20941,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
                        }
                }
 
-               if (prog->aux->sleepable) {
+               if (prog->sleepable) {
                        ret = -EINVAL;
                        switch (prog->type) {
                        case BPF_PROG_TYPE_TRACING:
@@ -20816,14 +21052,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
        u64 key;
 
        if (prog->type == BPF_PROG_TYPE_SYSCALL) {
-               if (prog->aux->sleepable)
+               if (prog->sleepable)
                        /* attach_btf_id checked to be zero already */
                        return 0;
                verbose(env, "Syscall programs can only be sleepable\n");
                return -EINVAL;
        }
 
-       if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+       if (prog->sleepable && !can_be_sleepable(prog)) {
                verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
                return -EINVAL;
        }
index 5ecfa57e3b97f6983dfe413cc17d8c5db903faf5..724e6d7e128f3766f89791861c258fb317297216 100644 (file)
@@ -10553,7 +10553,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
            (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
                return -EINVAL;
 
-       if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+       if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
                /* only uprobe programs are allowed to be sleepable */
                return -EINVAL;
 
index 241ddf5e38953e2dc4c275ac8136b9611b25936b..0a5c4efc73c3674fa225757c6f4ccc921f758b57 100644 (file)
@@ -3256,7 +3256,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
                .uprobe = uprobe,
        };
        struct bpf_prog *prog = link->link.prog;
-       bool sleepable = prog->aux->sleepable;
+       bool sleepable = prog->sleepable;
        struct bpf_run_ctx *old_run_ctx;
        int err = 0;
 
index d12a17fc0c171cc40ad86bfe4b40d5badd1c0ee9..1e36322d83d895ca8964240d8c5fa9dfb14868a9 100644 (file)
@@ -304,8 +304,8 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
        return err;
 }
 
-int ioremap_page_range(unsigned long addr, unsigned long end,
-               phys_addr_t phys_addr, pgprot_t prot)
+int vmap_page_range(unsigned long addr, unsigned long end,
+                   phys_addr_t phys_addr, pgprot_t prot)
 {
        int err;
 
@@ -318,6 +318,26 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
        return err;
 }
 
+int ioremap_page_range(unsigned long addr, unsigned long end,
+               phys_addr_t phys_addr, pgprot_t prot)
+{
+       struct vm_struct *area;
+
+       area = find_vm_area((void *)addr);
+       if (!area || !(area->flags & VM_IOREMAP)) {
+               WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
+               return -EINVAL;
+       }
+       if (addr != (unsigned long)area->addr ||
+           (void *)end != area->addr + get_vm_area_size(area)) {
+               WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
+                         addr, end, (long)area->addr,
+                         (long)area->addr + get_vm_area_size(area));
+               return -ERANGE;
+       }
+       return vmap_page_range(addr, end, phys_addr, prot);
+}
+
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
 {
@@ -635,6 +655,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
        return err;
 }
 
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+                               unsigned long end)
+{
+       might_sleep();
+       if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+               return -EINVAL;
+       if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+               return -EINVAL;
+       if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+               return -EINVAL;
+       if ((end - start) >> PAGE_SHIFT > totalram_pages())
+               return -E2BIG;
+       if (start < (unsigned long)area->addr ||
+           (void *)end > area->addr + get_vm_area_size(area))
+               return -ERANGE;
+       return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+                     unsigned long end, struct page **pages)
+{
+       int err;
+
+       err = check_sparse_vm_area(area, start, end);
+       if (err)
+               return err;
+
+       return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+                        unsigned long end)
+{
+       if (check_sparse_vm_area(area, start, end))
+               return;
+
+       vunmap_range(start, end);
+}
+
 int is_vmalloc_or_module_addr(const void *x)
 {
        /*
@@ -3809,9 +3881,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
                if (flags & VMAP_RAM)
                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
-               else if (!(vm && (vm->flags & VM_IOREMAP)))
+               else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
                        copied = aligned_vread_iter(iter, addr, n);
-               else /* IOREMAP area is treated as memory hole */
+               else /* IOREMAP | SPARSE area is treated as memory hole */
                        copied = zero_iter(iter, n);
 
                addr += copied;
@@ -4402,6 +4474,9 @@ static int s_show(struct seq_file *m, void *p)
        if (v->flags & VM_IOREMAP)
                seq_puts(m, " ioremap");
 
+       if (v->flags & VM_SPARSE)
+               seq_puts(m, " sparse");
+
        if (v->flags & VM_ALLOC)
                seq_puts(m, " vmalloc");
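
For context on how the new VM_SPARSE helpers are meant to be consumed (bpf_arena is their first real user), here is a hedged in-kernel sketch: reserve a sparse area up front, then map and unmap individual pages on demand. The size and function name are illustrative and error handling is kept minimal.

	#include <linux/vmalloc.h>
	#include <linux/gfp.h>
	#include <linux/mm.h>
	#include <linux/string.h>
	#include <linux/errno.h>

	static int sparse_area_demo(void)
	{
		struct vm_struct *area;
		struct page *page;
		unsigned long start, end;
		int err;

		/* reserve 4MB of kernel virtual space; nothing is mapped yet */
		area = get_vm_area(4 * 1024 * 1024, VM_SPARSE);
		if (!area)
			return -ENOMEM;

		page = alloc_page(GFP_KERNEL);
		if (!page) {
			err = -ENOMEM;
			goto out_area;
		}

		/* plug one page into the first slot; the range is validated by check_sparse_vm_area() */
		start = (unsigned long)area->addr;
		end = start + PAGE_SIZE;
		err = vm_area_map_pages(area, start, end, &page);
		if (err)
			goto out_page;

		memset((void *)start, 0, PAGE_SIZE);	/* the mapping is now usable */
		vm_area_unmap_pages(area, start, end);

	out_page:
		__free_page(page);
	out_area:
		free_vm_area(area);
		return err;
	}
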
 
index 02de71719aeda7c4c16f59206f2046a0b8d6af77..de33dc1b0daadc0258950bee1adcc1cd99512143 100644 (file)
@@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
        struct bpf_tramp_link *link = NULL;
        void *image = NULL;
        unsigned int op_idx;
+       u32 image_off = 0;
        int prog_ret;
        s32 type_id;
        int err;
@@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
                goto out;
        }
 
-       image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-       if (!image) {
-               err = -ENOMEM;
-               goto out;
-       }
-
        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
@@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
        err = bpf_struct_ops_prepare_trampoline(tlinks, link,
                                                &st_ops->func_models[op_idx],
                                                &dummy_ops_test_ret_function,
-                                               image, image + PAGE_SIZE);
+                                               &image, &image_off,
+                                               true);
        if (err < 0)
                goto out;
 
@@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
                err = -EFAULT;
 out:
        kfree(args);
-       arch_free_bpf_trampoline(image, PAGE_SIZE);
+       bpf_struct_ops_image_free(image);
        if (link)
                bpf_link_put(&link->link);
        kfree(tlinks);
@@ -178,7 +174,7 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
        case offsetof(struct bpf_dummy_ops, test_sleepable):
                break;
        default:
-               if (prog->aux->sleepable)
+               if (prog->sleepable)
                        return -EINVAL;
        }
 
index 1b34050a7538be1130fbc84002a7150029fc1c03..28ffcfbeef14e5bb57e62f1610cdf52caaafba89 100644 (file)
@@ -146,11 +146,7 @@ EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
 {
        struct tcp_congestion_ops *existing;
-       int ret;
-
-       ret = tcp_validate_congestion_control(ca);
-       if (ret)
-               return ret;
+       int ret = 0;
 
        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
 
index 3b7ba037af95da0a05d2b9ed137e429aa88fe725..9d6a314dfd7a593cc6dbe6338f5962e0c411575c 100644 (file)
@@ -55,7 +55,7 @@ MAP COMMANDS
 |              | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
 |              | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
 |              | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-|              | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** }
+|              | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** }
 
 DESCRIPTION
 ===========
index 1f579eacd9d413786bca1f21014629cc57e6eb1c..4fa4ade1ce7445eadac1cc92437218db97697e33 100644 (file)
@@ -120,6 +120,12 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz)
        static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" };
        int i, n;
 
+       /* recognize hard coded LLVM section name */
+       if (strcmp(sec_name, ".arena.1") == 0) {
+               /* this is the name to use in skeleton */
+               snprintf(buf, buf_sz, "arena");
+               return true;
+       }
        for  (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) {
                const char *pfx = pfxs[i];
 
@@ -248,8 +254,15 @@ static const struct btf_type *find_type_for_map(struct btf *btf, const char *map
        return NULL;
 }
 
-static bool is_internal_mmapable_map(const struct bpf_map *map, char *buf, size_t sz)
+static bool is_mmapable_map(const struct bpf_map *map, char *buf, size_t sz)
 {
+       size_t tmp_sz;
+
+       if (bpf_map__type(map) == BPF_MAP_TYPE_ARENA && bpf_map__initial_value(map, &tmp_sz)) {
+               snprintf(buf, sz, "arena");
+               return true;
+       }
+
        if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE))
                return false;
 
@@ -274,7 +287,7 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name)
 
        bpf_object__for_each_map(map, obj) {
                /* only generate definitions for memory-mapped internal maps */
-               if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident)))
+               if (!is_mmapable_map(map, map_ident, sizeof(map_ident)))
                        continue;
 
                sec = find_type_for_map(btf, map_ident);
@@ -327,7 +340,7 @@ static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name
 
        bpf_object__for_each_map(map, obj) {
                /* only generate definitions for memory-mapped internal maps */
-               if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident)))
+               if (!is_mmapable_map(map, map_ident, sizeof(map_ident)))
                        continue;
 
                sec = find_type_for_map(btf, map_ident);
@@ -504,7 +517,7 @@ static void codegen_asserts(struct bpf_object *obj, const char *obj_name)
                ", obj_name);
 
        bpf_object__for_each_map(map, obj) {
-               if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident)))
+               if (!is_mmapable_map(map, map_ident, sizeof(map_ident)))
                        continue;
 
                sec = find_type_for_map(btf, map_ident);
@@ -720,7 +733,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
                const void *mmap_data = NULL;
                size_t mmap_size = 0;
 
-               if (!is_internal_mmapable_map(map, ident, sizeof(ident)))
+               if (!is_mmapable_map(map, ident, sizeof(ident)))
                        continue;
 
                codegen("\
@@ -782,7 +795,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
        bpf_object__for_each_map(map, obj) {
                const char *mmap_flags;
 
-               if (!is_internal_mmapable_map(map, ident, sizeof(ident)))
+               if (!is_mmapable_map(map, ident, sizeof(ident)))
                        continue;
 
                if (bpf_map__map_flags(map) & BPF_F_RDONLY_PROG)
@@ -871,7 +884,7 @@ codegen_maps_skeleton(struct bpf_object *obj, size_t map_cnt, bool mmaped)
                        ",
                        i, bpf_map__name(map), i, ident);
                /* memory-mapped internal maps */
-               if (mmaped && is_internal_mmapable_map(map, ident, sizeof(ident))) {
+               if (mmaped && is_mmapable_map(map, ident, sizeof(ident))) {
                        printf("\ts->maps[%zu].mmaped = (void **)&obj->%s;\n",
                                i, ident);
                }
@@ -1617,7 +1630,7 @@ static int do_subskeleton(int argc, char **argv)
                /* Also count all maps that have a name */
                map_cnt++;
 
-               if (!is_internal_mmapable_map(map, ident, sizeof(ident)))
+               if (!is_mmapable_map(map, ident, sizeof(ident)))
                        continue;
 
                map_type_id = bpf_map__btf_value_type_id(map);
@@ -1739,7 +1752,7 @@ static int do_subskeleton(int argc, char **argv)
 
        /* walk through each symbol and emit the runtime representation */
        bpf_object__for_each_map(map, obj) {
-               if (!is_internal_mmapable_map(map, ident, sizeof(ident)))
+               if (!is_mmapable_map(map, ident, sizeof(ident)))
                        continue;
 
                map_type_id = bpf_map__btf_value_type_id(map);
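
The ".arena.1" special case above means globals placed in the arena get their own skeleton member named "arena", just like ".data" or ".bss" variables do. A hedged BPF-side sketch follows; the __arena macro is the usual address_space(1) convention from the selftests rather than something this patch defines, and it assumes UAPI headers new enough to know BPF_MAP_TYPE_ARENA.

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>

	#define __arena __attribute__((address_space(1)))

	struct {
		__uint(type, BPF_MAP_TYPE_ARENA);
		__uint(map_flags, BPF_F_MMAPABLE);
		__uint(max_entries, 10);		/* pages */
	} arena SEC(".maps");

	int __arena shared_cnt;				/* compiler emits this into ".arena.1" */

	SEC("syscall")
	int bump(void *ctx)
	{
		shared_cnt++;
		return 0;
	}

	char _license[] SEC("license") = "GPL";

From user space the generated skeleton then exposes the variable as, e.g., skel->arena->shared_cnt (skeleton and variable names hypothetical).
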
index f98f7bbea2b1582304afbf220a29fe804187eec5..b89bd792c1d510ab69b1c255639eba56a8722023 100644 (file)
@@ -1463,7 +1463,7 @@ static int do_help(int argc, char **argv)
                "                 devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
                "                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
                "                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
-               "                 task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n"
+               "                 task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena }\n"
                "       " HELP_SPEC_OPTIONS " |\n"
                "                    {-f|--bpffs} | {-n|--nomount} }\n"
                "",
index a241f407c23414cbd9bf44be0879462ba37a4f02..3c42b9f1bada3d5bcd7b2608a71bdb277fb1ea6d 100644 (file)
@@ -42,6 +42,7 @@
 #define BPF_JSGE       0x70    /* SGE is signed '>=', GE in x86 */
 #define BPF_JSLT       0xc0    /* SLT is signed, '<' */
 #define BPF_JSLE       0xd0    /* SLE is signed, '<=' */
+#define BPF_JCOND      0xe0    /* conditional pseudo jumps: may_goto, goto_or_nop */
 #define BPF_CALL       0x80    /* function call */
 #define BPF_EXIT       0x90    /* function return */
 
 #define BPF_XCHG       (0xe0 | BPF_FETCH)      /* atomic exchange */
 #define BPF_CMPXCHG    (0xf0 | BPF_FETCH)      /* atomic compare-and-write */
 
+enum bpf_cond_pseudo_jmp {
+       BPF_MAY_GOTO = 0,
+};
+
 /* Register numbers */
 enum {
        BPF_REG_0 = 0,
@@ -1004,6 +1009,7 @@ enum bpf_map_type {
        BPF_MAP_TYPE_BLOOM_FILTER,
        BPF_MAP_TYPE_USER_RINGBUF,
        BPF_MAP_TYPE_CGRP_STORAGE,
+       BPF_MAP_TYPE_ARENA,
        __MAX_BPF_MAP_TYPE
 };
 
@@ -1333,6 +1339,10 @@ enum {
  */
 #define BPF_PSEUDO_KFUNC_CALL  2
 
+enum bpf_addr_space_cast {
+       BPF_ADDR_SPACE_CAST = 1,
+};
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
        BPF_ANY         = 0, /* create new element or update existing */
@@ -1391,6 +1401,12 @@ enum {
 
 /* BPF token FD is passed in a corresponding command's token_fd field */
        BPF_F_TOKEN_FD          = (1U << 16),
+
+/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
+       BPF_F_SEGV_ON_FAULT     = (1U << 17),
+
+/* Do not translate kernel bpf_arena pointers to user pointers */
+       BPF_F_NO_USER_CONV      = (1U << 18),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1462,6 +1478,9 @@ union bpf_attr {
                 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
                 * number of hash functions (if 0, the bloom filter will default
                 * to using 5 hash functions).
+                *
+                * BPF_MAP_TYPE_ARENA - contains the address where user space
+                * is going to mmap() the arena. It has to be page aligned.
                 */
                __u64   map_extra;
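
Put together, the new UAPI pieces above are enough to create and map an arena from user space with the low-level libbpf wrappers. A hedged sketch (the page count is illustrative; BPF_F_SEGV_ON_FAULT and BPF_F_NO_USER_CONV could be OR-ed into map_flags as needed):

	#include <unistd.h>
	#include <sys/mman.h>
	#include <bpf/bpf.h>

	static int create_and_mmap_arena(void)
	{
		const __u32 nr_pages = 100;
		LIBBPF_OPTS(bpf_map_create_opts, opts,
			.map_flags = BPF_F_MMAPABLE,	/* required for ARENA */
			.map_extra = 0,			/* 0: no fixed mmap() address */
		);
		long page_sz = sysconf(_SC_PAGE_SIZE);
		void *base;
		int fd;

		/* key_size and value_size must be 0; max_entries is the page count */
		fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena", 0, 0, nr_pages, &opts);
		if (fd < 0)
			return fd;

		base = mmap(NULL, nr_pages * page_sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		if (base == MAP_FAILED)
			return -1;

		/* user space and bpf programs now share this region */
		return fd;
	}
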
 
index 79eaa581be98a1c4d5273829cdf28014a8e56cbc..cd17f6d0791fefadbecc68362a4444515177a9b8 100644 (file)
@@ -13,6 +13,7 @@
 #define __uint(name, val) int (*name)[val]
 #define __type(name, val) typeof(val) *name
 #define __array(name, val) typeof(val) *name[]
+#define __ulong(name, val) enum { ___bpf_concat(__unique_value, __COUNTER__) = val } name
 
 /*
  * Helper macro to place programs, maps, license in
@@ -192,6 +193,7 @@ enum libbpf_tristate {
 #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull")))
 #define __arg_nullable __attribute((btf_decl_tag("arg:nullable")))
 #define __arg_trusted __attribute((btf_decl_tag("arg:trusted")))
+#define __arg_arena __attribute((btf_decl_tag("arg:arena")))
 
 #ifndef ___bpf_concat
 #define ___bpf_concat(a, b) a ## b
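
__ulong() exists mostly so that 64-bit map attributes, map_extra in particular, can be spelled in BTF-defined map definitions; the libbpf side further below (get_map_field_long()) parses the resulting enum/enum64 constant. A sketch of an arena map asking for a fixed user-space address (the address is purely illustrative):

	struct {
		__uint(type, BPF_MAP_TYPE_ARENA);
		__uint(map_flags, BPF_F_MMAPABLE);
		__uint(max_entries, 1000);		/* pages backing the arena */
		__ulong(map_extra, 1ull << 44);		/* page-aligned address user space will mmap() */
	} arena SEC(".maps");
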
index a17b4c9c4213daabbdab9108e335608ff9f5ac46..2d0840ef599aff7d54a1efab818e88e6dca2af2e 100644 (file)
@@ -4968,7 +4968,7 @@ struct btf *btf__load_vmlinux_btf(void)
                        pr_warn("failed to read kernel BTF from '%s': %d\n", sysfs_btf_path, err);
                        return libbpf_err_ptr(err);
                }
-               pr_debug("loaded kernel BTF from '%s'\n", path);
+               pr_debug("loaded kernel BTF from '%s'\n", sysfs_btf_path);
                return btf;
        }
 
index 6b0738ad7063674e34f0e1229149f4c5615d5fd0..4e783cc7fc4b591cf57d9908100644e70ac65ee8 100644 (file)
@@ -147,6 +147,25 @@ static int probe_kern_btf_datasec(int token_fd)
                                             strs, sizeof(strs), token_fd));
 }
 
+static int probe_kern_btf_qmark_datasec(int token_fd)
+{
+       static const char strs[] = "\0x\0?.data";
+       /* static int a; */
+       __u32 types[] = {
+               /* int */
+               BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+               /* VAR x */                                     /* [2] */
+               BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+               BTF_VAR_STATIC,
+               /* DATASEC ?.data */                            /* [3] */
+               BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+               BTF_VAR_SECINFO_ENC(2, 0, 4),
+       };
+
+       return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+                                            strs, sizeof(strs), token_fd));
+}
+
 static int probe_kern_btf_float(int token_fd)
 {
        static const char strs[] = "\0float";
@@ -534,6 +553,9 @@ static struct kern_feature_desc {
        [FEAT_ARG_CTX_TAG] = {
                "kernel-side __arg_ctx tag", probe_kern_arg_ctx_tag,
        },
+       [FEAT_BTF_QMARK_DATASEC] = {
+               "BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec,
+       },
 };
 
 bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
index 6c2979f1b4712d6db9e2599fc8b5694bf95ede1b..efab29b8935bd9f7027859efee6a1383046105f4 100644 (file)
@@ -185,6 +185,7 @@ static const char * const map_type_name[] = {
        [BPF_MAP_TYPE_BLOOM_FILTER]             = "bloom_filter",
        [BPF_MAP_TYPE_USER_RINGBUF]             = "user_ringbuf",
        [BPF_MAP_TYPE_CGRP_STORAGE]             = "cgrp_storage",
+       [BPF_MAP_TYPE_ARENA]                    = "arena",
 };
 
 static const char * const prog_type_name[] = {
@@ -497,6 +498,7 @@ struct bpf_struct_ops {
 #define KSYMS_SEC ".ksyms"
 #define STRUCT_OPS_SEC ".struct_ops"
 #define STRUCT_OPS_LINK_SEC ".struct_ops.link"
+#define ARENA_SEC ".arena.1"
 
 enum libbpf_map_type {
        LIBBPF_MAP_UNSPEC,
@@ -612,6 +614,7 @@ enum sec_type {
        SEC_BSS,
        SEC_DATA,
        SEC_RODATA,
+       SEC_ST_OPS,
 };
 
 struct elf_sec_desc {
@@ -627,8 +630,7 @@ struct elf_state {
        Elf *elf;
        Elf64_Ehdr *ehdr;
        Elf_Data *symbols;
-       Elf_Data *st_ops_data;
-       Elf_Data *st_ops_link_data;
+       Elf_Data *arena_data;
        size_t shstrndx; /* section index for section name strings */
        size_t strtabidx;
        struct elf_sec_desc *secs;
@@ -637,8 +639,8 @@ struct elf_state {
        __u32 btf_maps_sec_btf_id;
        int text_shndx;
        int symbols_shndx;
-       int st_ops_shndx;
-       int st_ops_link_shndx;
+       bool has_st_ops;
+       int arena_data_shndx;
 };
 
 struct usdt_manager;
@@ -698,6 +700,10 @@ struct bpf_object {
 
        struct usdt_manager *usdt_man;
 
+       struct bpf_map *arena_map;
+       void *arena_data;
+       size_t arena_data_sz;
+
        struct kern_feature_cache *feat_cache;
        char *token_path;
        int token_fd;
@@ -948,7 +954,7 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
                                   const char *name, __u32 kind);
 
 static int
-find_struct_ops_kern_types(struct bpf_object *obj, const char *tname,
+find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw,
                           struct module_btf **mod_btf,
                           const struct btf_type **type, __u32 *type_id,
                           const struct btf_type **vtype, __u32 *vtype_id,
@@ -958,8 +964,12 @@ find_struct_ops_kern_types(struct bpf_object *obj, const char *tname,
        const struct btf_member *kern_data_member;
        struct btf *btf;
        __s32 kern_vtype_id, kern_type_id;
+       char tname[256];
        __u32 i;
 
+       snprintf(tname, sizeof(tname), "%.*s",
+                (int)bpf_core_essential_name_len(tname_raw), tname_raw);
+
        kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT,
                                        &btf, mod_btf);
        if (kern_type_id < 0) {
@@ -1027,6 +1037,48 @@ static bool is_valid_st_ops_program(struct bpf_object *obj,
        return false;
 }
 
+/* For each struct_ops program P, referenced from some struct_ops map M,
+ * enable P.autoload if there are Ms for which M.autocreate is true,
+ * disable P.autoload if for all Ms M.autocreate is false.
+ * Don't change P.autoload for programs that are not referenced from any maps.
+ */
+static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj)
+{
+       struct bpf_program *prog, *slot_prog;
+       struct bpf_map *map;
+       int i, j, k, vlen;
+
+       for (i = 0; i < obj->nr_programs; ++i) {
+               int should_load = false;
+               int use_cnt = 0;
+
+               prog = &obj->programs[i];
+               if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+                       continue;
+
+               for (j = 0; j < obj->nr_maps; ++j) {
+                       map = &obj->maps[j];
+                       if (!bpf_map__is_struct_ops(map))
+                               continue;
+
+                       vlen = btf_vlen(map->st_ops->type);
+                       for (k = 0; k < vlen; ++k) {
+                               slot_prog = map->st_ops->progs[k];
+                               if (prog != slot_prog)
+                                       continue;
+
+                               use_cnt++;
+                               if (map->autocreate)
+                                       should_load = true;
+                       }
+               }
+               if (use_cnt)
+                       prog->autoload = should_load;
+       }
+
+       return 0;
+}
+
 /* Init the map's fields that depend on kern_btf */
 static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 {
@@ -1142,8 +1194,32 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map)
 
                        if (mod_btf)
                                prog->attach_btf_obj_fd = mod_btf->fd;
-                       prog->attach_btf_id = kern_type_id;
-                       prog->expected_attach_type = kern_member_idx;
+
+                       /* if we haven't yet processed this BPF program, record proper
+                        * attach_btf_id and member_idx
+                        */
+                       if (!prog->attach_btf_id) {
+                               prog->attach_btf_id = kern_type_id;
+                               prog->expected_attach_type = kern_member_idx;
+                       }
+
+                       /* struct_ops BPF prog can be re-used between multiple
+                        * .struct_ops & .struct_ops.link as long as it's the
+                        * same struct_ops struct definition and the same
+                        * function pointer field
+                        */
+                       if (prog->attach_btf_id != kern_type_id) {
+                               pr_warn("struct_ops init_kern %s func ptr %s: invalid reuse of prog %s in sec %s with type %u: attach_btf_id %u != kern_type_id %u\n",
+                                       map->name, mname, prog->name, prog->sec_name, prog->type,
+                                       prog->attach_btf_id, kern_type_id);
+                               return -EINVAL;
+                       }
+                       if (prog->expected_attach_type != kern_member_idx) {
+                               pr_warn("struct_ops init_kern %s func ptr %s: invalid reuse of prog %s in sec %s with type %u: expected_attach_type %u != kern_member_idx %u\n",
+                                       map->name, mname, prog->name, prog->sec_name, prog->type,
+                                       prog->expected_attach_type, kern_member_idx);
+                               return -EINVAL;
+                       }
 
                        st_ops->kern_func_off[i] = kern_data_off + kern_moff;
 
@@ -1184,6 +1260,9 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
                if (!bpf_map__is_struct_ops(map))
                        continue;
 
+               if (!map->autocreate)
+                       continue;
+
                err = bpf_map__init_kern_struct_ops(map);
                if (err)
                        return err;
@@ -1193,7 +1272,7 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj)
 }
 
 static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
-                               int shndx, Elf_Data *data, __u32 map_flags)
+                               int shndx, Elf_Data *data)
 {
        const struct btf_type *type, *datasec;
        const struct btf_var_secinfo *vsi;
@@ -1251,11 +1330,20 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
                        return -ENOMEM;
                map->btf_value_type_id = type_id;
 
+               /* Follow same convention as for programs autoload:
+                * SEC("?.struct_ops") means map is not created by default.
+                */
+               if (sec_name[0] == '?') {
+                       map->autocreate = false;
+                       /* from now on forget there was ? in section name */
+                       sec_name++;
+               }
+
                map->def.type = BPF_MAP_TYPE_STRUCT_OPS;
                map->def.key_size = sizeof(int);
                map->def.value_size = type->size;
                map->def.max_entries = 1;
-               map->def.map_flags = map_flags;
+               map->def.map_flags = strcmp(sec_name, STRUCT_OPS_LINK_SEC) == 0 ? BPF_F_LINK : 0;
 
                map->st_ops = calloc(1, sizeof(*map->st_ops));
                if (!map->st_ops)
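
The '?' prefix handled here gives struct_ops maps the same opt-in semantics that '?' already gives programs: the map starts with autocreate disabled, and bpf_object_adjust_struct_ops_autoload() above keeps autoload of the referenced programs consistent with that. A hedged sketch with a made-up ops type:

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	SEC("struct_ops/do_work")
	int BPF_PROG(do_work_prog, int x)
	{
		return x;
	}

	/* skipped on load unless user space flips autocreate back on */
	SEC("?.struct_ops.link")
	struct my_test_ops optional_ops = {
		.do_work = (void *)do_work_prog,
	};

User space can then enable it conditionally after __open() and before __load(), e.g. bpf_map__set_autocreate(skel->maps.optional_ops, true); programs referenced only by maps that stay non-autocreate have their autoload switched off automatically, so the object still loads.
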
@@ -1290,15 +1378,25 @@ static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name,
 
 static int bpf_object_init_struct_ops(struct bpf_object *obj)
 {
-       int err;
+       const char *sec_name;
+       int sec_idx, err;
 
-       err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx,
-                                  obj->efile.st_ops_data, 0);
-       err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC,
-                                         obj->efile.st_ops_link_shndx,
-                                         obj->efile.st_ops_link_data,
-                                         BPF_F_LINK);
-       return err;
+       for (sec_idx = 0; sec_idx < obj->efile.sec_cnt; ++sec_idx) {
+               struct elf_sec_desc *desc = &obj->efile.secs[sec_idx];
+
+               if (desc->sec_type != SEC_ST_OPS)
+                       continue;
+
+               sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx));
+               if (!sec_name)
+                       return -LIBBPF_ERRNO__FORMAT;
+
+               err = init_struct_ops_maps(obj, sec_name, sec_idx, desc->data);
+               if (err)
+                       return err;
+       }
+
+       return 0;
 }
 
 static struct bpf_object *bpf_object__new(const char *path,
@@ -1336,8 +1434,6 @@ static struct bpf_object *bpf_object__new(const char *path,
        obj->efile.obj_buf = obj_buf;
        obj->efile.obj_buf_sz = obj_buf_sz;
        obj->efile.btf_maps_shndx = -1;
-       obj->efile.st_ops_shndx = -1;
-       obj->efile.st_ops_link_shndx = -1;
        obj->kconfig_map_idx = -1;
 
        obj->kern_version = get_kernel_version();
@@ -1354,8 +1450,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj)
        elf_end(obj->efile.elf);
        obj->efile.elf = NULL;
        obj->efile.symbols = NULL;
-       obj->efile.st_ops_data = NULL;
-       obj->efile.st_ops_link_data = NULL;
+       obj->efile.arena_data = NULL;
 
        zfree(&obj->efile.secs);
        obj->efile.sec_cnt = 0;
@@ -1598,7 +1693,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj)
        return map;
 }
 
-static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
+static size_t array_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
 {
        const long page_sz = sysconf(_SC_PAGE_SIZE);
        size_t map_sz;
@@ -1608,6 +1703,20 @@ static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries)
        return map_sz;
 }
 
+static size_t bpf_map_mmap_sz(const struct bpf_map *map)
+{
+       const long page_sz = sysconf(_SC_PAGE_SIZE);
+
+       switch (map->def.type) {
+       case BPF_MAP_TYPE_ARRAY:
+               return array_map_mmap_sz(map->def.value_size, map->def.max_entries);
+       case BPF_MAP_TYPE_ARENA:
+               return page_sz * map->def.max_entries;
+       default:
+               return 0; /* not supported */
+       }
+}
+
 static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz)
 {
        void *mmaped;
@@ -1750,7 +1859,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
        def->value_size = data_sz;
        def->max_entries = 1;
        def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG
-                        ? BPF_F_RDONLY_PROG : 0;
+               ? BPF_F_RDONLY_PROG : 0;
 
        /* failures are fine because of maps like .rodata.str1.1 */
        (void) map_fill_btf_type_info(obj, map);
@@ -1761,7 +1870,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type,
        pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n",
                 map->name, map->sec_idx, map->sec_offset, def->map_flags);
 
-       mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
+       mmap_sz = bpf_map_mmap_sz(map);
        map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (map->mmaped == MAP_FAILED) {
@@ -2249,6 +2358,46 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf,
        return true;
 }
 
+static bool get_map_field_long(const char *map_name, const struct btf *btf,
+                              const struct btf_member *m, __u64 *res)
+{
+       const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL);
+       const char *name = btf__name_by_offset(btf, m->name_off);
+
+       if (btf_is_ptr(t)) {
+               __u32 res32;
+               bool ret;
+
+               ret = get_map_field_int(map_name, btf, m, &res32);
+               if (ret)
+                       *res = (__u64)res32;
+               return ret;
+       }
+
+       if (!btf_is_enum(t) && !btf_is_enum64(t)) {
+               pr_warn("map '%s': attr '%s': expected ENUM or ENUM64, got %s.\n",
+                       map_name, name, btf_kind_str(t));
+               return false;
+       }
+
+       if (btf_vlen(t) != 1) {
+               pr_warn("map '%s': attr '%s': invalid __ulong\n",
+                       map_name, name);
+               return false;
+       }
+
+       if (btf_is_enum(t)) {
+               const struct btf_enum *e = btf_enum(t);
+
+               *res = e->val;
+       } else {
+               const struct btf_enum64 *e = btf_enum64(t);
+
+               *res = btf_enum64_value(e);
+       }
+       return true;
+}
+
 static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name)
 {
        int len;
@@ -2482,9 +2631,9 @@ int parse_btf_map_def(const char *map_name, struct btf *btf,
                        map_def->pinning = val;
                        map_def->parts |= MAP_DEF_PINNING;
                } else if (strcmp(name, "map_extra") == 0) {
-                       __u32 map_extra;
+                       __u64 map_extra;
 
-                       if (!get_map_field_int(map_name, btf, m, &map_extra))
+                       if (!get_map_field_long(map_name, btf, m, &map_extra))
                                return -EINVAL;
                        map_def->map_extra = map_extra;
                        map_def->parts |= MAP_DEF_MAP_EXTRA;
@@ -2702,6 +2851,32 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
        return 0;
 }
 
+static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
+                              const char *sec_name, int sec_idx,
+                              void *data, size_t data_sz)
+{
+       const long page_sz = sysconf(_SC_PAGE_SIZE);
+       size_t mmap_sz;
+
+       mmap_sz = bpf_map_mmap_sz(obj->arena_map);
+       if (roundup(data_sz, page_sz) > mmap_sz) {
+               pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n",
+                       sec_name, mmap_sz, data_sz);
+               return -E2BIG;
+       }
+
+       obj->arena_data = malloc(data_sz);
+       if (!obj->arena_data)
+               return -ENOMEM;
+       memcpy(obj->arena_data, data, data_sz);
+       obj->arena_data_sz = data_sz;
+
+       /* make bpf_map__init_value() work for ARENA maps */
+       map->mmaped = obj->arena_data;
+
+       return 0;
+}
+
 static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
                                          const char *pin_root_path)
 {
@@ -2751,6 +2926,33 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
                        return err;
        }
 
+       for (i = 0; i < obj->nr_maps; i++) {
+               struct bpf_map *map = &obj->maps[i];
+
+               if (map->def.type != BPF_MAP_TYPE_ARENA)
+                       continue;
+
+               if (obj->arena_map) {
+                       pr_warn("map '%s': only single ARENA map is supported (map '%s' is also ARENA)\n",
+                               map->name, obj->arena_map->name);
+                       return -EINVAL;
+               }
+               obj->arena_map = map;
+
+               if (obj->efile.arena_data) {
+                       err = init_arena_map_data(obj, map, ARENA_SEC, obj->efile.arena_data_shndx,
+                                                 obj->efile.arena_data->d_buf,
+                                                 obj->efile.arena_data->d_size);
+                       if (err)
+                               return err;
+               }
+       }
+       if (obj->efile.arena_data && !obj->arena_map) {
+               pr_warn("elf: sec '%s': to use global __arena variables the ARENA map should be explicitly declared in SEC(\".maps\")\n",
+                       ARENA_SEC);
+               return -ENOENT;
+       }
+
        return 0;
 }
 
@@ -2783,6 +2985,11 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx)
        return sh->sh_flags & SHF_EXECINSTR;
 }
 
+static bool starts_with_qmark(const char *s)
+{
+       return s && s[0] == '?';
+}
+
 static bool btf_needs_sanitization(struct bpf_object *obj)
 {
        bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC);
@@ -2792,9 +2999,10 @@ static bool btf_needs_sanitization(struct bpf_object *obj)
        bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG);
        bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG);
        bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64);
+       bool has_qmark_datasec = kernel_supports(obj, FEAT_BTF_QMARK_DATASEC);
 
        return !has_func || !has_datasec || !has_func_global || !has_float ||
-              !has_decl_tag || !has_type_tag || !has_enum64;
+              !has_decl_tag || !has_type_tag || !has_enum64 || !has_qmark_datasec;
 }
 
 static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
@@ -2806,6 +3014,7 @@ static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
        bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG);
        bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG);
        bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64);
+       bool has_qmark_datasec = kernel_supports(obj, FEAT_BTF_QMARK_DATASEC);
        int enum64_placeholder_id = 0;
        struct btf_type *t;
        int i, j, vlen;
@@ -2832,7 +3041,7 @@ static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
 
                        name = (char *)btf__name_by_offset(btf, t->name_off);
                        while (*name) {
-                               if (*name == '.')
+                               if (*name == '.' || *name == '?')
                                        *name = '_';
                                name++;
                        }
@@ -2847,6 +3056,14 @@ static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
                                vt = (void *)btf__type_by_id(btf, v->type);
                                m->name_off = vt->name_off;
                        }
+               } else if (!has_qmark_datasec && btf_is_datasec(t) &&
+                          starts_with_qmark(btf__name_by_offset(btf, t->name_off))) {
+                       /* replace '?' prefix with '_' for DATASEC names */
+                       char *name;
+
+                       name = (char *)btf__name_by_offset(btf, t->name_off);
+                       if (name[0] == '?')
+                               name[0] = '_';
                } else if (!has_func && btf_is_func_proto(t)) {
                        /* replace FUNC_PROTO with ENUM */
                        vlen = btf_vlen(t);
@@ -2900,14 +3117,13 @@ static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
 static bool libbpf_needs_btf(const struct bpf_object *obj)
 {
        return obj->efile.btf_maps_shndx >= 0 ||
-              obj->efile.st_ops_shndx >= 0 ||
-              obj->efile.st_ops_link_shndx >= 0 ||
+              obj->efile.has_st_ops ||
               obj->nr_extern > 0;
 }
 
 static bool kernel_needs_btf(const struct bpf_object *obj)
 {
-       return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0;
+       return obj->efile.has_st_ops;
 }
 
 static int bpf_object__init_btf(struct bpf_object *obj,
@@ -3608,12 +3824,17 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                                sec_desc->sec_type = SEC_RODATA;
                                sec_desc->shdr = sh;
                                sec_desc->data = data;
-                       } else if (strcmp(name, STRUCT_OPS_SEC) == 0) {
-                               obj->efile.st_ops_data = data;
-                               obj->efile.st_ops_shndx = idx;
-                       } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) {
-                               obj->efile.st_ops_link_data = data;
-                               obj->efile.st_ops_link_shndx = idx;
+                       } else if (strcmp(name, STRUCT_OPS_SEC) == 0 ||
+                                  strcmp(name, STRUCT_OPS_LINK_SEC) == 0 ||
+                                  strcmp(name, "?" STRUCT_OPS_SEC) == 0 ||
+                                  strcmp(name, "?" STRUCT_OPS_LINK_SEC) == 0) {
+                               sec_desc->sec_type = SEC_ST_OPS;
+                               sec_desc->shdr = sh;
+                               sec_desc->data = data;
+                               obj->efile.has_st_ops = true;
+                       } else if (strcmp(name, ARENA_SEC) == 0) {
+                               obj->efile.arena_data = data;
+                               obj->efile.arena_data_shndx = idx;
                        } else {
                                pr_info("elf: skipping unrecognized data section(%d) %s\n",
                                        idx, name);
@@ -3629,6 +3850,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
                        if (!section_have_execinstr(obj, targ_sec_idx) &&
                            strcmp(name, ".rel" STRUCT_OPS_SEC) &&
                            strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) &&
+                           strcmp(name, ".rel?" STRUCT_OPS_SEC) &&
+                           strcmp(name, ".rel?" STRUCT_OPS_LINK_SEC) &&
                            strcmp(name, ".rel" MAPS_ELF_SEC)) {
                                pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n",
                                        idx, name, targ_sec_idx,
@@ -4241,6 +4464,15 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
        type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
        sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
 
+       /* arena data relocation */
+       if (shdr_idx == obj->efile.arena_data_shndx) {
+               reloc_desc->type = RELO_DATA;
+               reloc_desc->insn_idx = insn_idx;
+               reloc_desc->map_idx = obj->arena_map - obj->maps;
+               reloc_desc->sym_off = sym->st_value;
+               return 0;
+       }
+
        /* generic map reference relocation */
        if (type == LIBBPF_MAP_UNSPEC) {
                if (!bpf_object__shndx_is_maps(obj, shdr_idx)) {
@@ -4781,6 +5013,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
                        bpf_gen__map_freeze(obj->gen_loader, map - obj->maps);
                return 0;
        }
+
        err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
        if (err) {
                err = -errno;
@@ -4873,6 +5106,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
        case BPF_MAP_TYPE_SOCKHASH:
        case BPF_MAP_TYPE_QUEUE:
        case BPF_MAP_TYPE_STACK:
+       case BPF_MAP_TYPE_ARENA:
                create_attr.btf_fd = 0;
                create_attr.btf_key_type_id = 0;
                create_attr.btf_value_type_id = 0;
@@ -5117,7 +5351,23 @@ retry:
                                if (err < 0)
                                        goto err_out;
                        }
-
+                       if (map->def.type == BPF_MAP_TYPE_ARENA) {
+                               map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map),
+                                                  PROT_READ | PROT_WRITE,
+                                                  map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
+                                                  map->fd, 0);
+                               if (map->mmaped == MAP_FAILED) {
+                                       err = -errno;
+                                       map->mmaped = NULL;
+                                       pr_warn("map '%s': failed to mmap arena: %d\n",
+                                               map->name, err);
+                                       return err;
+                               }
+                               if (obj->arena_data) {
+                                       memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz);
+                                       zfree(&obj->arena_data);
+                               }
+                       }
                        if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) {
                                err = init_map_in_map_slots(obj, map);
                                if (err < 0)
@@ -6926,12 +7176,12 @@ static int bpf_object__collect_relos(struct bpf_object *obj)
                data = sec_desc->data;
                idx = shdr->sh_info;
 
-               if (shdr->sh_type != SHT_REL) {
+               if (shdr->sh_type != SHT_REL || idx < 0 || idx >= obj->efile.sec_cnt) {
                        pr_warn("internal error at %d\n", __LINE__);
                        return -LIBBPF_ERRNO__INTERNAL;
                }
 
-               if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx)
+               if (obj->efile.secs[idx].sec_type == SEC_ST_OPS)
                        err = bpf_object__collect_st_ops_relos(obj, shdr, data);
                else if (idx == obj->efile.btf_maps_shndx)
                        err = bpf_object__collect_map_relos(obj, shdr, data);
@@ -8105,11 +8355,20 @@ static void bpf_map_prepare_vdata(const struct bpf_map *map)
 
 static int bpf_object_prepare_struct_ops(struct bpf_object *obj)
 {
+       struct bpf_map *map;
        int i;
 
-       for (i = 0; i < obj->nr_maps; i++)
-               if (bpf_map__is_struct_ops(&obj->maps[i]))
-                       bpf_map_prepare_vdata(&obj->maps[i]);
+       for (i = 0; i < obj->nr_maps; i++) {
+               map = &obj->maps[i];
+
+               if (!bpf_map__is_struct_ops(map))
+                       continue;
+
+               if (!map->autocreate)
+                       continue;
+
+               bpf_map_prepare_vdata(map);
+       }
 
        return 0;
 }
@@ -8135,6 +8394,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
        err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
        err = err ? : bpf_object__sanitize_maps(obj);
        err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
+       err = err ? : bpf_object_adjust_struct_ops_autoload(obj);
        err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path);
        err = err ? : bpf_object__sanitize_and_load_btf(obj);
        err = err ? : bpf_object__create_maps(obj);
@@ -8604,13 +8864,9 @@ static void bpf_map__destroy(struct bpf_map *map)
        zfree(&map->init_slots);
        map->init_slots_sz = 0;
 
-       if (map->mmaped) {
-               size_t mmap_sz;
-
-               mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
-               munmap(map->mmaped, mmap_sz);
-               map->mmaped = NULL;
-       }
+       if (map->mmaped && map->mmaped != map->obj->arena_data)
+               munmap(map->mmaped, bpf_map_mmap_sz(map));
+       map->mmaped = NULL;
 
        if (map->st_ops) {
                zfree(&map->st_ops->data);
@@ -8670,6 +8926,8 @@ void bpf_object__close(struct bpf_object *obj)
        if (obj->token_fd > 0)
                close(obj->token_fd);
 
+       zfree(&obj->arena_data);
+
        free(obj);
 }
 
@@ -9424,27 +9682,6 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
                        return -EINVAL;
                }
 
-               /* if we haven't yet processed this BPF program, record proper
-                * attach_btf_id and member_idx
-                */
-               if (!prog->attach_btf_id) {
-                       prog->attach_btf_id = st_ops->type_id;
-                       prog->expected_attach_type = member_idx;
-               }
-
-               /* struct_ops BPF prog can be re-used between multiple
-                * .struct_ops & .struct_ops.link as long as it's the
-                * same struct_ops struct definition and the same
-                * function pointer field
-                */
-               if (prog->attach_btf_id != st_ops->type_id ||
-                   prog->expected_attach_type != member_idx) {
-                       pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n",
-                               map->name, prog->name, prog->sec_name, prog->type,
-                               prog->attach_btf_id, prog->expected_attach_type, name);
-                       return -EINVAL;
-               }
-
                st_ops->progs[member_idx] = prog;
 
                /* st_ops->data will be exposed to users, being returned by
@@ -9862,11 +10099,14 @@ int bpf_map__set_value_size(struct bpf_map *map, __u32 size)
                return libbpf_err(-EBUSY);
 
        if (map->mmaped) {
-               int err;
                size_t mmap_old_sz, mmap_new_sz;
+               int err;
+
+               if (map->def.type != BPF_MAP_TYPE_ARRAY)
+                       return -EOPNOTSUPP;
 
-               mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
-               mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries);
+               mmap_old_sz = bpf_map_mmap_sz(map);
+               mmap_new_sz = array_map_mmap_sz(size, map->def.max_entries);
                err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz);
                if (err) {
                        pr_warn("map '%s': failed to resize memory-mapped region: %d\n",
@@ -9899,18 +10139,26 @@ __u32 bpf_map__btf_value_type_id(const struct bpf_map *map)
 int bpf_map__set_initial_value(struct bpf_map *map,
                               const void *data, size_t size)
 {
+       size_t actual_sz;
+
        if (map->obj->loaded || map->reused)
                return libbpf_err(-EBUSY);
 
-       if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG ||
-           size != map->def.value_size)
+       if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG)
+               return libbpf_err(-EINVAL);
+
+       if (map->def.type == BPF_MAP_TYPE_ARENA)
+               actual_sz = map->obj->arena_data_sz;
+       else
+               actual_sz = map->def.value_size;
+       if (size != actual_sz)
                return libbpf_err(-EINVAL);
 
        memcpy(map->mmaped, data, size);
        return 0;
 }
 
-void *bpf_map__initial_value(struct bpf_map *map, size_t *psize)
+void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize)
 {
        if (bpf_map__is_struct_ops(map)) {
                if (psize)
@@ -9920,7 +10168,12 @@ void *bpf_map__initial_value(struct bpf_map *map, size_t *psize)
 
        if (!map->mmaped)
                return NULL;
-       *psize = map->def.value_size;
+
+       if (map->def.type == BPF_MAP_TYPE_ARENA)
+               *psize = map->obj->arena_data_sz;
+       else
+               *psize = map->def.value_size;
+
        return map->mmaped;
 }
 
@@ -13397,7 +13650,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
 
        for (i = 0; i < s->map_cnt; i++) {
                struct bpf_map *map = *s->maps[i].map;
-               size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries);
+               size_t mmap_sz = bpf_map_mmap_sz(map);
                int prot, map_fd = map->fd;
                void **mmaped = s->maps[i].mmaped;
 
@@ -13409,6 +13662,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
                        continue;
                }
 
+               if (map->def.type == BPF_MAP_TYPE_ARENA) {
+                       *mmaped = map->mmaped;
+                       continue;
+               }
+
                if (map->def.map_flags & BPF_F_RDONLY_PROG)
                        prot = PROT_READ;
                else
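
One note on the skeleton hunk just above: the arena's existing mapping is reused instead of re-mmap()ing the map fd, because bpf_object__create_maps() may already have placed it at the fixed map_extra address. Application code therefore needs nothing special (sketch; the skeleton name is hypothetical and the "arena" member matches the bpftool codegen shown earlier):

	struct demo_bpf *skel = demo_bpf__open_and_load();

	if (!skel)
		return -1;
	/* same pages the kernel and the BPF program see; plain stores suffice */
	skel->arena->shared_cnt = 0;
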
index 5723cbbfcc41d53d20871dc9f1ecfa0befbb8a67..7b510761f545d09eaafc43ae69fa746aeb4f15b3 100644 (file)
@@ -1014,7 +1014,7 @@ LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra);
 
 LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map,
                                          const void *data, size_t size);
-LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize);
+LIBBPF_API void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize);
 
 /**
  * @brief **bpf_map__is_internal()** tells the caller whether or not the
index ad936ac5e6397b8121673451dd880aec2eee1295..864b361774240767f172d5944c46968b58546959 100644 (file)
@@ -374,6 +374,8 @@ enum kern_feature_id {
        FEAT_UPROBE_MULTI_LINK,
        /* Kernel supports arg:ctx tag (__arg_ctx) for global subprogs natively */
        FEAT_ARG_CTX_TAG,
+       /* Kernel supports '?' at the front of datasec names */
+       FEAT_BTF_QMARK_DATASEC,
        __FEAT_CNT,
 };
 
index ee9b1dbea9eb877f190413044780bd17c2eeb7b1..3021881224392a52b082e02fef18e93f8b698ec5 100644 (file)
@@ -338,6 +338,13 @@ static int probe_map_create(enum bpf_map_type map_type)
                key_size = 0;
                max_entries = 1;
                break;
+       case BPF_MAP_TYPE_ARENA:
+               key_size        = 0;
+               value_size      = 0;
+               max_entries     = 1; /* one page */
+               opts.map_extra  = 0; /* can mmap() at any address */
+               opts.map_flags  = BPF_F_MMAPABLE;
+               break;
        case BPF_MAP_TYPE_HASH:
        case BPF_MAP_TYPE_ARRAY:
        case BPF_MAP_TYPE_PROG_ARRAY:
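
The ARENA entry above makes feature probing work the same way as for any other map type, for example (sketch):

	#include <stdbool.h>
	#include <bpf/libbpf.h>

	static bool kernel_supports_arena(void)
	{
		/* 1 == supported, 0 == not supported, <0 == probing error */
		return libbpf_probe_bpf_map_type(BPF_MAP_TYPE_ARENA, NULL) == 1;
	}
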
index 0445ac38bc07de1337d78a5ce63f6875724403c5..d8ade15e27898961554726158babaa03b3c18f0d 100644 (file)
@@ -10,3 +10,5 @@ fill_link_info/kprobe_multi_link_info            # bpf_program__attach_kprobe_mu
 fill_link_info/kretprobe_multi_link_info         # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 fill_link_info/kprobe_multi_invalid_ubuff        # bpf_program__attach_kprobe_multi_opts unexpected error: -95
 missed/kprobe_recursion                          # missed_kprobe_recursion__attach unexpected error: -95 (errno 95)
+verifier_arena                                   # JIT does not support arena
+arena_htab                                       # JIT does not support arena
index 1a63996c0304bc76c6a3727340c1c63c029a5ac5..f4a2f66a683ddbd91d7f81ae5264335be09f067e 100644 (file)
@@ -3,3 +3,6 @@
 exceptions                              # JIT does not support calling kfunc bpf_throw                                (exceptions)
 get_stack_raw_tp                         # user_stack corrupted user stack                                             (no backchain userspace)
 stacktrace_build_id                      # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2                   (?)
+verifier_iterating_callbacks
+verifier_arena                           # JIT does not support arena
+arena_htab                               # JIT does not support arena
index 84cb5500e8eff6791cc215c21a5e2f39f0d4b4b5..3b9eb40d63436f84b1fe63e9a85f0b0f6154863d 100644 (file)
@@ -34,7 +34,7 @@ LIBELF_CFLAGS := $(shell $(PKG_CONFIG) libelf --cflags 2>/dev/null)
 LIBELF_LIBS    := $(shell $(PKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf)
 
 CFLAGS += -g $(OPT_FLAGS) -rdynamic                                    \
-         -Wall -Werror                                                 \
+         -Wall -Werror -fno-omit-frame-pointer                         \
          $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS)                    \
          -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)          \
          -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT)
index 1724d50ba9421a4d5408c178c475573e85d2f4fc..b2b4c391eb0ac295282c306db7a346f75b2b9ddd 100644 (file)
@@ -495,14 +495,20 @@ extern const struct bench bench_trig_base;
 extern const struct bench bench_trig_tp;
 extern const struct bench bench_trig_rawtp;
 extern const struct bench bench_trig_kprobe;
+extern const struct bench bench_trig_kretprobe;
+extern const struct bench bench_trig_kprobe_multi;
+extern const struct bench bench_trig_kretprobe_multi;
 extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fexit;
 extern const struct bench bench_trig_fentry_sleep;
 extern const struct bench bench_trig_fmodret;
 extern const struct bench bench_trig_uprobe_base;
-extern const struct bench bench_trig_uprobe_with_nop;
-extern const struct bench bench_trig_uretprobe_with_nop;
-extern const struct bench bench_trig_uprobe_without_nop;
-extern const struct bench bench_trig_uretprobe_without_nop;
+extern const struct bench bench_trig_uprobe_nop;
+extern const struct bench bench_trig_uretprobe_nop;
+extern const struct bench bench_trig_uprobe_push;
+extern const struct bench bench_trig_uretprobe_push;
+extern const struct bench bench_trig_uprobe_ret;
+extern const struct bench bench_trig_uretprobe_ret;
 extern const struct bench bench_rb_libbpf;
 extern const struct bench bench_rb_custom;
 extern const struct bench bench_pb_libbpf;
@@ -537,14 +543,20 @@ static const struct bench *benchs[] = {
        &bench_trig_tp,
        &bench_trig_rawtp,
        &bench_trig_kprobe,
+       &bench_trig_kretprobe,
+       &bench_trig_kprobe_multi,
+       &bench_trig_kretprobe_multi,
        &bench_trig_fentry,
+       &bench_trig_fexit,
        &bench_trig_fentry_sleep,
        &bench_trig_fmodret,
        &bench_trig_uprobe_base,
-       &bench_trig_uprobe_with_nop,
-       &bench_trig_uretprobe_with_nop,
-       &bench_trig_uprobe_without_nop,
-       &bench_trig_uretprobe_without_nop,
+       &bench_trig_uprobe_nop,
+       &bench_trig_uretprobe_nop,
+       &bench_trig_uprobe_push,
+       &bench_trig_uretprobe_push,
+       &bench_trig_uprobe_ret,
+       &bench_trig_uretprobe_ret,
        &bench_rb_libbpf,
        &bench_rb_custom,
        &bench_pb_libbpf,
index dbd362771d6ab3d5c8dc357a2558e739364e3b16..ace0d1011a8e517589ebcf492d49c2a35f58bf0a 100644 (file)
@@ -85,12 +85,36 @@ static void trigger_kprobe_setup(void)
        attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
 }
 
+static void trigger_kretprobe_setup(void)
+{
+       setup_ctx();
+       attach_bpf(ctx.skel->progs.bench_trigger_kretprobe);
+}
+
+static void trigger_kprobe_multi_setup(void)
+{
+       setup_ctx();
+       attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi);
+}
+
+static void trigger_kretprobe_multi_setup(void)
+{
+       setup_ctx();
+       attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi);
+}
+
 static void trigger_fentry_setup(void)
 {
        setup_ctx();
        attach_bpf(ctx.skel->progs.bench_trigger_fentry);
 }
 
+static void trigger_fexit_setup(void)
+{
+       setup_ctx();
+       attach_bpf(ctx.skel->progs.bench_trigger_fexit);
+}
+
 static void trigger_fentry_sleep_setup(void)
 {
        setup_ctx();
@@ -113,12 +137,25 @@ static void trigger_fmodret_setup(void)
  * GCC doesn't generate a stack setup preamble for these functions because they
  * have no input arguments and do nothing in the body.
  */
-__weak void uprobe_target_with_nop(void)
+__weak void uprobe_target_nop(void)
 {
        asm volatile ("nop");
 }
 
-__weak void uprobe_target_without_nop(void)
+__weak void opaque_noop_func(void)
+{
+}
+
+__weak int uprobe_target_push(void)
+{
+       /* overhead of function call is negligible compared to uprobe
+        * triggering, so this shouldn't affect benchmark results much
+        */
+       opaque_noop_func();
+       return 1;
+}
+
+__weak void uprobe_target_ret(void)
 {
        asm volatile ("");
 }
@@ -126,27 +163,34 @@ __weak void uprobe_target_without_nop(void)
 static void *uprobe_base_producer(void *input)
 {
        while (true) {
-               uprobe_target_with_nop();
+               uprobe_target_nop();
                atomic_inc(&base_hits.value);
        }
        return NULL;
 }
 
-static void *uprobe_producer_with_nop(void *input)
+static void *uprobe_producer_nop(void *input)
+{
+       while (true)
+               uprobe_target_nop();
+       return NULL;
+}
+
+static void *uprobe_producer_push(void *input)
 {
        while (true)
-               uprobe_target_with_nop();
+               uprobe_target_push();
        return NULL;
 }
 
-static void *uprobe_producer_without_nop(void *input)
+static void *uprobe_producer_ret(void *input)
 {
        while (true)
-               uprobe_target_without_nop();
+               uprobe_target_ret();
        return NULL;
 }
 
-static void usetup(bool use_retprobe, bool use_nop)
+static void usetup(bool use_retprobe, void *target_addr)
 {
        size_t uprobe_offset;
        struct bpf_link *link;
@@ -159,11 +203,7 @@ static void usetup(bool use_retprobe, bool use_nop)
                exit(1);
        }
 
-       if (use_nop)
-               uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop);
-       else
-               uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop);
-
+       uprobe_offset = get_uprobe_offset(target_addr);
        link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
                                          use_retprobe,
                                          -1 /* all PIDs */,
@@ -176,24 +216,34 @@ static void usetup(bool use_retprobe, bool use_nop)
        ctx.skel->links.bench_trigger_uprobe = link;
 }
 
-static void uprobe_setup_with_nop(void)
+static void uprobe_setup_nop(void)
 {
-       usetup(false, true);
+       usetup(false, &uprobe_target_nop);
 }
 
-static void uretprobe_setup_with_nop(void)
+static void uretprobe_setup_nop(void)
 {
-       usetup(true, true);
+       usetup(true, &uprobe_target_nop);
 }
 
-static void uprobe_setup_without_nop(void)
+static void uprobe_setup_push(void)
 {
-       usetup(false, false);
+       usetup(false, &uprobe_target_push);
 }
 
-static void uretprobe_setup_without_nop(void)
+static void uretprobe_setup_push(void)
 {
-       usetup(true, false);
+       usetup(true, &uprobe_target_push);
+}
+
+static void uprobe_setup_ret(void)
+{
+       usetup(false, &uprobe_target_ret);
+}
+
+static void uretprobe_setup_ret(void)
+{
+       usetup(true, &uprobe_target_ret);
 }
 
 const struct bench bench_trig_base = {
@@ -235,6 +285,36 @@ const struct bench bench_trig_kprobe = {
        .report_final = hits_drops_report_final,
 };
 
+const struct bench bench_trig_kretprobe = {
+       .name = "trig-kretprobe",
+       .validate = trigger_validate,
+       .setup = trigger_kretprobe_setup,
+       .producer_thread = trigger_producer,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_kprobe_multi = {
+       .name = "trig-kprobe-multi",
+       .validate = trigger_validate,
+       .setup = trigger_kprobe_multi_setup,
+       .producer_thread = trigger_producer,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_kretprobe_multi = {
+       .name = "trig-kretprobe-multi",
+       .validate = trigger_validate,
+       .setup = trigger_kretprobe_multi_setup,
+       .producer_thread = trigger_producer,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
 const struct bench bench_trig_fentry = {
        .name = "trig-fentry",
        .validate = trigger_validate,
@@ -245,6 +325,16 @@ const struct bench bench_trig_fentry = {
        .report_final = hits_drops_report_final,
 };
 
+const struct bench bench_trig_fexit = {
+       .name = "trig-fexit",
+       .validate = trigger_validate,
+       .setup = trigger_fexit_setup,
+       .producer_thread = trigger_producer,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
 const struct bench bench_trig_fentry_sleep = {
        .name = "trig-fentry-sleep",
        .validate = trigger_validate,
@@ -274,37 +364,55 @@ const struct bench bench_trig_uprobe_base = {
        .report_final = hits_drops_report_final,
 };
 
-const struct bench bench_trig_uprobe_with_nop = {
-       .name = "trig-uprobe-with-nop",
-       .setup = uprobe_setup_with_nop,
-       .producer_thread = uprobe_producer_with_nop,
+const struct bench bench_trig_uprobe_nop = {
+       .name = "trig-uprobe-nop",
+       .setup = uprobe_setup_nop,
+       .producer_thread = uprobe_producer_nop,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uretprobe_nop = {
+       .name = "trig-uretprobe-nop",
+       .setup = uretprobe_setup_nop,
+       .producer_thread = uprobe_producer_nop,
+       .measure = trigger_measure,
+       .report_progress = hits_drops_report_progress,
+       .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_uprobe_push = {
+       .name = "trig-uprobe-push",
+       .setup = uprobe_setup_push,
+       .producer_thread = uprobe_producer_push,
        .measure = trigger_measure,
        .report_progress = hits_drops_report_progress,
        .report_final = hits_drops_report_final,
 };
 
-const struct bench bench_trig_uretprobe_with_nop = {
-       .name = "trig-uretprobe-with-nop",
-       .setup = uretprobe_setup_with_nop,
-       .producer_thread = uprobe_producer_with_nop,
+const struct bench bench_trig_uretprobe_push = {
+       .name = "trig-uretprobe-push",
+       .setup = uretprobe_setup_push,
+       .producer_thread = uprobe_producer_push,
        .measure = trigger_measure,
        .report_progress = hits_drops_report_progress,
        .report_final = hits_drops_report_final,
 };
 
-const struct bench bench_trig_uprobe_without_nop = {
-       .name = "trig-uprobe-without-nop",
-       .setup = uprobe_setup_without_nop,
-       .producer_thread = uprobe_producer_without_nop,
+const struct bench bench_trig_uprobe_ret = {
+       .name = "trig-uprobe-ret",
+       .setup = uprobe_setup_ret,
+       .producer_thread = uprobe_producer_ret,
        .measure = trigger_measure,
        .report_progress = hits_drops_report_progress,
        .report_final = hits_drops_report_final,
 };
 
-const struct bench bench_trig_uretprobe_without_nop = {
-       .name = "trig-uretprobe-without-nop",
-       .setup = uretprobe_setup_without_nop,
-       .producer_thread = uprobe_producer_without_nop,
+const struct bench bench_trig_uretprobe_ret = {
+       .name = "trig-uretprobe-ret",
+       .setup = uretprobe_setup_ret,
+       .producer_thread = uprobe_producer_ret,
        .measure = trigger_measure,
        .report_progress = hits_drops_report_progress,
        .report_final = hits_drops_report_final,
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
new file mode 100755 (executable)
index 0000000..9bdcc74
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+for i in base {uprobe,uretprobe}-{nop,push,ret}
+do
+       summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+       printf "%-15s: %s\n" $i "$summary"
+done
diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h
new file mode 100644 (file)
index 0000000..c276782
--- /dev/null
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+#include "bpf_arena_common.h"
+
+#ifndef __round_mask
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#endif
+#ifndef round_up
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+#endif
+
+#ifdef __BPF__
+#define NR_CPUS (sizeof(struct cpumask) * 8)
+
+static void __arena * __arena page_frag_cur_page[NR_CPUS];
+static int __arena page_frag_cur_offset[NR_CPUS];
+
+/* Simple page_frag allocator */
+static inline void __arena* bpf_alloc(unsigned int size)
+{
+       __u64 __arena *obj_cnt;
+       __u32 cpu = bpf_get_smp_processor_id();
+       void __arena *page = page_frag_cur_page[cpu];
+       int __arena *cur_offset = &page_frag_cur_offset[cpu];
+       int offset;
+
+       size = round_up(size, 8);
+       if (size >= PAGE_SIZE - 8)
+               return NULL;
+       if (!page) {
+refill:
+               page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+               if (!page)
+                       return NULL;
+               cast_kern(page);
+               page_frag_cur_page[cpu] = page;
+               *cur_offset = PAGE_SIZE - 8;
+               obj_cnt = page + PAGE_SIZE - 8;
+               *obj_cnt = 0;
+       } else {
+               cast_kern(page);
+               obj_cnt = page + PAGE_SIZE - 8;
+       }
+
+       offset = *cur_offset - size;
+       if (offset < 0)
+               goto refill;
+
+       (*obj_cnt)++;
+       *cur_offset = offset;
+       return page + offset;
+}
+
+static inline void bpf_free(void __arena *addr)
+{
+       __u64 __arena *obj_cnt;
+
+       addr = (void __arena *)(((long)addr) & ~(PAGE_SIZE - 1));
+       obj_cnt = addr + PAGE_SIZE - 8;
+       if (--(*obj_cnt) == 0)
+               bpf_arena_free_pages(&arena, addr, 1);
+}
+#else
+static inline void __arena* bpf_alloc(unsigned int size) { return NULL; }
+static inline void bpf_free(void __arena *addr) {}
+#endif
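
A minimal sketch of how this page_frag allocator is meant to be used from a BPF program (it assumes an arena map named arena, as the selftest programs below define; struct foo and the program name are hypothetical):

	#include "bpf_arena_alloc.h"

	struct foo { int a; int b; };

	SEC("syscall")
	int use_arena_alloc(void *ctx)
	{
		struct foo __arena *p = bpf_alloc(sizeof(*p));

		if (!p)
			return 1;
		p->a = 1;	/* p is a kernel-view arena pointer, directly dereferenceable */
		bpf_free(p);	/* the page is freed once its per-page obj_cnt drops to zero */
		return 0;
	}
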
diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h
new file mode 100644 (file)
index 0000000..bcf195c
--- /dev/null
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifndef WRITE_ONCE
+#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
+#endif
+
+#ifndef NUMA_NO_NODE
+#define        NUMA_NO_NODE    (-1)
+#endif
+
+#ifndef arena_container_of
+#define arena_container_of(ptr, type, member)                  \
+       ({                                                      \
+               void __arena *__mptr = (void __arena *)(ptr);   \
+               ((type *)(__mptr - offsetof(type, member)));    \
+       })
+#endif
+
+#ifdef __BPF__ /* when compiled as bpf program */
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE __PAGE_SIZE
+/*
+ * for older kernels try sizeof(struct genradix_node)
+ * or flexible:
+ * static inline long __bpf_page_size(void) {
+ *   return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
+ * }
+ * but generated code is not great.
+ */
+#endif
+
+#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM)
+#define __arena __attribute__((address_space(1)))
+#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
+#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
+#else
+#define __arena
+#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
+#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
+#endif
+
+void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
+                                   int node_id, __u64 flags) __ksym __weak;
+void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
+
+#else /* when compiled as user space code */
+
+#define __arena
+#define __arg_arena
+#define cast_kern(ptr) /* nop for user space */
+#define cast_user(ptr) /* nop for user space */
+__weak char arena[1];
+
+#ifndef offsetof
+#define offsetof(type, member)  ((unsigned long)&((type *)0)->member)
+#endif
+
+static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
+                                                 int node_id, __u64 flags)
+{
+       return NULL;
+}
+static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
+{
+}
+
+#endif
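
To make the two pointer views explicit, a small sketch of the cast_kern()/cast_user() pattern (assumes an arena map named arena; shared_ptr is a hypothetical global read by user space afterwards; with compilers supporting __BPF_FEATURE_ARENA_CAST both macros compile to nothing and LLVM emits the address-space casts itself):

	void __arena *shared_ptr;	/* read by user space after the prog runs */

	SEC("syscall")
	int publish(void *ctx)
	{
		void __arena *p = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);

		if (!p)
			return 1;
		cast_kern(p);			/* kernel view: usable for loads/stores in the prog */
		*(__u64 __arena *)p = 42;
		cast_user(p);			/* user view: usable as a pointer handed to user space */
		shared_ptr = p;
		return 0;
	}
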
diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h
new file mode 100644 (file)
index 0000000..acc01a8
--- /dev/null
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+#include <errno.h>
+#include "bpf_arena_alloc.h"
+#include "bpf_arena_list.h"
+
+struct htab_bucket {
+       struct arena_list_head head;
+};
+typedef struct htab_bucket __arena htab_bucket_t;
+
+struct htab {
+       htab_bucket_t *buckets;
+       int n_buckets;
+};
+typedef struct htab __arena htab_t;
+
+static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash)
+{
+       htab_bucket_t *b = htab->buckets;
+
+       cast_kern(b);
+       return &b[hash & (htab->n_buckets - 1)];
+}
+
+static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash)
+{
+       return &__select_bucket(htab, hash)->head;
+}
+
+struct hashtab_elem {
+       int hash;
+       int key;
+       int value;
+       struct arena_list_node hash_node;
+};
+typedef struct hashtab_elem __arena hashtab_elem_t;
+
+static hashtab_elem_t *lookup_elem_raw(arena_list_head_t *head, __u32 hash, int key)
+{
+       hashtab_elem_t *l;
+
+       list_for_each_entry(l, head, hash_node)
+               if (l->hash == hash && l->key == key)
+                       return l;
+
+       return NULL;
+}
+
+static int htab_hash(int key)
+{
+       return key;
+}
+
+__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key)
+{
+       hashtab_elem_t *l_old;
+       arena_list_head_t *head;
+
+       cast_kern(htab);
+       head = select_bucket(htab, key);
+       l_old = lookup_elem_raw(head, htab_hash(key), key);
+       if (l_old)
+               return l_old->value;
+       return 0;
+}
+
+__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value)
+{
+       hashtab_elem_t *l_new = NULL, *l_old;
+       arena_list_head_t *head;
+
+       cast_kern(htab);
+       head = select_bucket(htab, key);
+       l_old = lookup_elem_raw(head, htab_hash(key), key);
+
+       l_new = bpf_alloc(sizeof(*l_new));
+       if (!l_new)
+               return -ENOMEM;
+       l_new->key = key;
+       l_new->hash = htab_hash(key);
+       l_new->value = value;
+
+       list_add_head(&l_new->hash_node, head);
+       if (l_old) {
+               list_del(&l_old->hash_node);
+               bpf_free(l_old);
+       }
+       return 0;
+}
+
+void htab_init(htab_t *htab)
+{
+       void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
+
+       cast_user(buckets);
+       htab->buckets = buckets;
+       htab->n_buckets = 2 * PAGE_SIZE / sizeof(struct htab_bucket);
+}
diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h
new file mode 100644 (file)
index 0000000..b99b9f4
--- /dev/null
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+#include "bpf_arena_common.h"
+
+struct arena_list_node;
+
+typedef struct arena_list_node __arena arena_list_node_t;
+
+struct arena_list_node {
+       arena_list_node_t *next;
+       arena_list_node_t * __arena *pprev;
+};
+
+struct arena_list_head {
+       struct arena_list_node __arena *first;
+};
+typedef struct arena_list_head __arena arena_list_head_t;
+
+#define list_entry(ptr, type, member) arena_container_of(ptr, type, member)
+
+#define list_entry_safe(ptr, type, member) \
+       ({ typeof(*ptr) * ___ptr = (ptr); \
+        ___ptr ? ({ cast_kern(___ptr); list_entry(___ptr, type, member); }) : NULL; \
+        })
+
+#ifndef __BPF__
+static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; }
+static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {}
+static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; }
+#define cond_break ({})
+#endif
+
+/* Safely walk linked list elements. Deletion of elements is allowed. */
+#define list_for_each_entry(pos, head, member)                                 \
+       for (void * ___tmp = (pos = list_entry_safe((head)->first,              \
+                                                   typeof(*(pos)), member),    \
+                             (void *)0);                                       \
+            pos && ({ ___tmp = (void *)pos->member.next; 1; });                \
+            cond_break,                                                        \
+            pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member))
+
+static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h)
+{
+       arena_list_node_t *first = h->first, * __arena *tmp;
+
+       cast_user(first);
+       cast_kern(n);
+       WRITE_ONCE(n->next, first);
+       cast_kern(first);
+       if (first) {
+               tmp = &n->next;
+               cast_user(tmp);
+               WRITE_ONCE(first->pprev, tmp);
+       }
+       cast_user(n);
+       WRITE_ONCE(h->first, n);
+
+       tmp = &h->first;
+       cast_user(tmp);
+       cast_kern(n);
+       WRITE_ONCE(n->pprev, tmp);
+}
+
+static inline void __list_del(arena_list_node_t *n)
+{
+       arena_list_node_t *next = n->next, *tmp;
+       arena_list_node_t * __arena *pprev = n->pprev;
+
+       cast_user(next);
+       cast_kern(pprev);
+       tmp = *pprev;
+       cast_kern(tmp);
+       WRITE_ONCE(tmp, next);
+       if (next) {
+               cast_user(pprev);
+               cast_kern(next);
+               WRITE_ONCE(next->pprev, pprev);
+       }
+}
+
+#define POISON_POINTER_DELTA 0
+
+#define LIST_POISON1  ((void __arena *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void __arena *) 0x122 + POISON_POINTER_DELTA)
+
+static inline void list_del(arena_list_node_t *n)
+{
+       __list_del(n);
+       n->next = LIST_POISON1;
+       n->pprev = LIST_POISON2;
+}
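
A short usage sketch of the list API (element layout and head placement are assumptions; the arena_list selftest below exercises the same pattern in full). Deleting elements while iterating is allowed, and list_for_each_entry() embeds cond_break so the verifier may cut a long walk short:

	struct elem { struct arena_list_node node; __u64 value; };
	struct arena_list_head __arena *head;	/* assumed to point into the arena */

	static inline __u64 drain(void)
	{
		struct elem __arena *e;
		__u64 sum = 0;

		list_for_each_entry(e, head, node) {
			sum += e->value;
			list_del(&e->node);
			bpf_free(e);
		}
		return sum;
	}
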
index 0d749006d107577877e56a89f62af5e4492a585f..a5b9df38c162595c8c63420d3eb425149eddeef7 100644 (file)
@@ -326,11 +326,66 @@ l_true:                                                                                           \
        })
 #endif
 
+#define cond_break                                     \
+       ({ __label__ l_break, l_continue;               \
+        asm volatile goto("1:.byte 0xe5;                       \
+                     .byte 0;                          \
+                     .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;      \
+                     .short 0"                         \
+                     :::: l_break);                    \
+       goto l_continue;                                \
+       l_break: break;                                 \
+       l_continue:;                                    \
+       })
+
 #ifndef bpf_nop_mov
 #define bpf_nop_mov(var) \
        asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var))
 #endif
 
+/* emit instruction:
+ * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
+ */
+#ifndef bpf_addr_space_cast
+#define bpf_addr_space_cast(var, dst_as, src_as)\
+       asm volatile(".byte 0xBF;               \
+                    .ifc %[reg], r0;           \
+                    .byte 0x00;                \
+                    .endif;                    \
+                    .ifc %[reg], r1;           \
+                    .byte 0x11;                \
+                    .endif;                    \
+                    .ifc %[reg], r2;           \
+                    .byte 0x22;                \
+                    .endif;                    \
+                    .ifc %[reg], r3;           \
+                    .byte 0x33;                \
+                    .endif;                    \
+                    .ifc %[reg], r4;           \
+                    .byte 0x44;                \
+                    .endif;                    \
+                    .ifc %[reg], r5;           \
+                    .byte 0x55;                \
+                    .endif;                    \
+                    .ifc %[reg], r6;           \
+                    .byte 0x66;                \
+                    .endif;                    \
+                    .ifc %[reg], r7;           \
+                    .byte 0x77;                \
+                    .endif;                    \
+                    .ifc %[reg], r8;           \
+                    .byte 0x88;                \
+                    .endif;                    \
+                    .ifc %[reg], r9;           \
+                    .byte 0x99;                \
+                    .endif;                    \
+                    .short %[off];             \
+                    .long %[as]"               \
+                    : [reg]"+r"(var)           \
+                    : [off]"i"(BPF_ADDR_SPACE_CAST) \
+                    , [as]"i"((dst_as << 16) | src_as));
+#endif
+
 /* Description
  *     Assert that a conditional expression is true.
  * Returns
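
A usage sketch of the cond_break macro added above (cnt is hypothetical): the inline asm emits a may_goto instruction, so the verifier can accept the loop without proving a bound while reserving the right to terminate it at the cond_break point.

	__u64 sum = 0;
	int i;

	/* cnt may be arbitrarily large; without cond_break the verifier
	 * would have to prove termination of this loop on its own
	 */
	for (i = 0; i < cnt; cond_break, i++)
		sum += i;
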
index 098ddd0672244f3373642dae611b3bcaec54dbab..39ad96a18123f626c311aaa2fdf8213594b7d67d 100644 (file)
@@ -564,6 +564,8 @@ static int bpf_dummy_reg(void *kdata)
 {
        struct bpf_testmod_ops *ops = kdata;
 
+       if (ops->test_1)
+               ops->test_1();
        /* Some test cases (ex. struct_ops_maybe_null) may not have test_2
         * initialized, so we need to check for NULL.
         */
@@ -609,6 +611,29 @@ struct bpf_struct_ops bpf_bpf_testmod_ops = {
        .owner = THIS_MODULE,
 };
 
+static int bpf_dummy_reg2(void *kdata)
+{
+       struct bpf_testmod_ops2 *ops = kdata;
+
+       ops->test_1();
+       return 0;
+}
+
+static struct bpf_testmod_ops2 __bpf_testmod_ops2 = {
+       .test_1 = bpf_testmod_test_1,
+};
+
+struct bpf_struct_ops bpf_testmod_ops2 = {
+       .verifier_ops = &bpf_testmod_verifier_ops,
+       .init = bpf_testmod_ops_init,
+       .init_member = bpf_testmod_ops_init_member,
+       .reg = bpf_dummy_reg2,
+       .unreg = bpf_dummy_unreg,
+       .cfi_stubs = &__bpf_testmod_ops2,
+       .name = "bpf_testmod_ops2",
+       .owner = THIS_MODULE,
+};
+
 extern int bpf_fentry_test1(int a);
 
 static int bpf_testmod_init(void)
@@ -620,6 +645,7 @@ static int bpf_testmod_init(void)
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_testmod_kfunc_set);
        ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_testmod_kfunc_set);
        ret = ret ?: register_bpf_struct_ops(&bpf_bpf_testmod_ops, bpf_testmod_ops);
+       ret = ret ?: register_bpf_struct_ops(&bpf_testmod_ops2, bpf_testmod_ops2);
        if (ret < 0)
                return ret;
        if (bpf_fentry_test1(0) < 0)
index 971458acfac350c0e09c7d76457a31ae9564b4b0..23fa1872ee674f5b224587e525c4afb1a379d02d 100644 (file)
@@ -43,6 +43,54 @@ struct bpf_testmod_ops {
                int b;
        } unsupported;
        int data;
+
+       /* The following pointers are used to test maps whose trampolines
+        * span multiple pages.
+        */
+       int (*tramp_1)(int value);
+       int (*tramp_2)(int value);
+       int (*tramp_3)(int value);
+       int (*tramp_4)(int value);
+       int (*tramp_5)(int value);
+       int (*tramp_6)(int value);
+       int (*tramp_7)(int value);
+       int (*tramp_8)(int value);
+       int (*tramp_9)(int value);
+       int (*tramp_10)(int value);
+       int (*tramp_11)(int value);
+       int (*tramp_12)(int value);
+       int (*tramp_13)(int value);
+       int (*tramp_14)(int value);
+       int (*tramp_15)(int value);
+       int (*tramp_16)(int value);
+       int (*tramp_17)(int value);
+       int (*tramp_18)(int value);
+       int (*tramp_19)(int value);
+       int (*tramp_20)(int value);
+       int (*tramp_21)(int value);
+       int (*tramp_22)(int value);
+       int (*tramp_23)(int value);
+       int (*tramp_24)(int value);
+       int (*tramp_25)(int value);
+       int (*tramp_26)(int value);
+       int (*tramp_27)(int value);
+       int (*tramp_28)(int value);
+       int (*tramp_29)(int value);
+       int (*tramp_30)(int value);
+       int (*tramp_31)(int value);
+       int (*tramp_32)(int value);
+       int (*tramp_33)(int value);
+       int (*tramp_34)(int value);
+       int (*tramp_35)(int value);
+       int (*tramp_36)(int value);
+       int (*tramp_37)(int value);
+       int (*tramp_38)(int value);
+       int (*tramp_39)(int value);
+       int (*tramp_40)(int value);
+};
+
+struct bpf_testmod_ops2 {
+       int (*test_1)(void);
 };
 
 #endif /* _BPF_TESTMOD_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_htab.c b/tools/testing/selftests/bpf/prog_tests/arena_htab.c
new file mode 100644 (file)
index 0000000..0766702
--- /dev/null
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <sys/mman.h>
+#include <network_helpers.h>
+
+#include "arena_htab_asm.skel.h"
+#include "arena_htab.skel.h"
+
+#define PAGE_SIZE 4096
+
+#include "bpf_arena_htab.h"
+
+static void test_arena_htab_common(struct htab *htab)
+{
+       int i;
+
+       printf("htab %p buckets %p n_buckets %d\n", htab, htab->buckets, htab->n_buckets);
+       ASSERT_OK_PTR(htab->buckets, "htab->buckets shouldn't be NULL");
+       for (i = 0; htab->buckets && i < 16; i += 4) {
+               /*
+                * Walk htab buckets and linked lists, since all pointers are correct
+                * even though they were written by the bpf program.
+                */
+               int val = htab_lookup_elem(htab, i);
+
+               ASSERT_EQ(i, val, "key == value");
+       }
+}
+
+static void test_arena_htab_llvm(void)
+{
+       LIBBPF_OPTS(bpf_test_run_opts, opts);
+       struct arena_htab *skel;
+       struct htab *htab;
+       size_t arena_sz;
+       void *area;
+       int ret;
+
+       skel = arena_htab__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "arena_htab__open_and_load"))
+               return;
+
+       area = bpf_map__initial_value(skel->maps.arena, &arena_sz);
+       /* fault-in a page with pgoff == 0 as sanity check */
+       *(volatile int *)area = 0x55aa;
+
+       /* bpf prog will allocate more pages */
+       ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_llvm), &opts);
+       ASSERT_OK(ret, "ret");
+       ASSERT_OK(opts.retval, "retval");
+       if (skel->bss->skip) {
+               printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
+               test__skip();
+               goto out;
+       }
+       htab = skel->bss->htab_for_user;
+       test_arena_htab_common(htab);
+out:
+       arena_htab__destroy(skel);
+}
+
+static void test_arena_htab_asm(void)
+{
+       LIBBPF_OPTS(bpf_test_run_opts, opts);
+       struct arena_htab_asm *skel;
+       struct htab *htab;
+       int ret;
+
+       skel = arena_htab_asm__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "arena_htab_asm__open_and_load"))
+               return;
+
+       ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_htab_asm), &opts);
+       ASSERT_OK(ret, "ret");
+       ASSERT_OK(opts.retval, "retval");
+       htab = skel->bss->htab_for_user;
+       test_arena_htab_common(htab);
+       arena_htab_asm__destroy(skel);
+}
+
+void test_arena_htab(void)
+{
+       if (test__start_subtest("arena_htab_llvm"))
+               test_arena_htab_llvm();
+       if (test__start_subtest("arena_htab_asm"))
+               test_arena_htab_asm();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c
new file mode 100644 (file)
index 0000000..e61886d
--- /dev/null
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <sys/mman.h>
+#include <network_helpers.h>
+
+#define PAGE_SIZE 4096
+
+#include "bpf_arena_list.h"
+#include "arena_list.skel.h"
+
+struct elem {
+       struct arena_list_node node;
+       __u64 value;
+};
+
+static int list_sum(struct arena_list_head *head)
+{
+       struct elem __arena *n;
+       int sum = 0;
+
+       list_for_each_entry(n, head, node)
+               sum += n->value;
+       return sum;
+}
+
+static void test_arena_list_add_del(int cnt)
+{
+       LIBBPF_OPTS(bpf_test_run_opts, opts);
+       struct arena_list *skel;
+       int expected_sum = (u64)cnt * (cnt - 1) / 2;
+       int ret, sum;
+
+       skel = arena_list__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load"))
+               return;
+
+       skel->bss->cnt = cnt;
+       ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts);
+       ASSERT_OK(ret, "ret_add");
+       ASSERT_OK(opts.retval, "retval");
+       if (skel->bss->skip) {
+               printf("%s:SKIP:compiler doesn't support arena_cast\n", __func__);
+               test__skip();
+               goto out;
+       }
+       sum = list_sum(skel->bss->list_head);
+       ASSERT_EQ(sum, expected_sum, "sum of elems");
+       ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems");
+       ASSERT_EQ(skel->arena->test_val, cnt + 1, "num of elems");
+
+       ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_del), &opts);
+       ASSERT_OK(ret, "ret_del");
+       sum = list_sum(skel->bss->list_head);
+       ASSERT_EQ(sum, 0, "sum of list elems after del");
+       ASSERT_EQ(skel->bss->list_sum, expected_sum, "sum of list elems computed by prog");
+       ASSERT_EQ(skel->arena->arena_sum, expected_sum, "__arena sum of elems");
+out:
+       arena_list__destroy(skel);
+}
+
+void test_arena_list(void)
+{
+       if (test__start_subtest("arena_list_1"))
+               test_arena_list_add_del(1);
+       if (test__start_subtest("arena_list_1000"))
+               test_arena_list_add_del(1000);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bad_struct_ops.c b/tools/testing/selftests/bpf/prog_tests/bad_struct_ops.c
new file mode 100644 (file)
index 0000000..6a70721
--- /dev/null
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "bad_struct_ops.skel.h"
+#include "bad_struct_ops2.skel.h"
+
+static void invalid_prog_reuse(void)
+{
+       struct bad_struct_ops *skel;
+       char *log = NULL;
+       int err;
+
+       skel = bad_struct_ops__open();
+       if (!ASSERT_OK_PTR(skel, "bad_struct_ops__open"))
+               return;
+
+       if (start_libbpf_log_capture())
+               goto cleanup;
+
+       err = bad_struct_ops__load(skel);
+       log = stop_libbpf_log_capture();
+       ASSERT_ERR(err, "bad_struct_ops__load should fail");
+       ASSERT_HAS_SUBSTR(log,
+               "struct_ops init_kern testmod_2 func ptr test_1: invalid reuse of prog test_1",
+               "expected init_kern message");
+
+cleanup:
+       free(log);
+       bad_struct_ops__destroy(skel);
+}
+
+static void unused_program(void)
+{
+       struct bad_struct_ops2 *skel;
+       char *log = NULL;
+       int err;
+
+       skel = bad_struct_ops2__open();
+       if (!ASSERT_OK_PTR(skel, "bad_struct_ops2__open"))
+               return;
+
+       /* struct_ops programs not referenced from any map are opened
+        * with autoload set to true.
+        */
+       ASSERT_TRUE(bpf_program__autoload(skel->progs.foo), "foo autoload == true");
+
+       if (start_libbpf_log_capture())
+               goto cleanup;
+
+       err = bad_struct_ops2__load(skel);
+       ASSERT_ERR(err, "bad_struct_ops2__load should fail");
+       log = stop_libbpf_log_capture();
+       ASSERT_HAS_SUBSTR(log, "prog 'foo': failed to load",
+                         "message about 'foo' failing to load");
+
+cleanup:
+       free(log);
+       bad_struct_ops2__destroy(skel);
+}
+
+void test_bad_struct_ops(void)
+{
+       if (test__start_subtest("invalid_prog_reuse"))
+               invalid_prog_reuse();
+       if (test__start_subtest("unused_program"))
+               unused_program();
+}
index 816145bcb64765dcf9112774aa1fc3c6c518e2ad..00965a6e83bb252f3ee109b6c6b0f4d655a42c87 100644 (file)
@@ -3535,6 +3535,32 @@ static struct btf_raw_test raw_tests[] = {
        .value_type_id = 1,
        .max_entries = 1,
 },
+{
+       .descr = "datasec: name '?.foo bar:buz' is ok",
+       .raw_types = {
+               /* int */
+               BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+               /* VAR x */                                     /* [2] */
+               BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+               BTF_VAR_STATIC,
+               /* DATASEC ?.data */                            /* [3] */
+               BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+               BTF_VAR_SECINFO_ENC(2, 0, 4),
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0x\0?.foo bar:buz"),
+},
+{
+       .descr = "type name '?foo' is not ok",
+       .raw_types = {
+               /* union ?foo; */
+               BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_FWD, 1, 0), 0), /* [1] */
+               BTF_END_RAW,
+       },
+       BTF_STR_SEC("\0?foo"),
+       .err_str = "Invalid name",
+       .btf_load_err = true,
+},
 
 {
        .descr = "float test #1, well-formed",
@@ -4363,6 +4389,9 @@ static void do_test_raw(unsigned int test_num)
        if (err || btf_fd < 0)
                goto done;
 
+       if (!test->map_type)
+               goto done;
+
        opts.btf_fd = btf_fd;
        opts.btf_key_type_id = test->key_type_id;
        opts.btf_value_type_id = test->value_type_id;
diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_autocreate.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_autocreate.c
new file mode 100644 (file)
index 0000000..a5cc593
--- /dev/null
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "struct_ops_autocreate.skel.h"
+#include "struct_ops_autocreate2.skel.h"
+
+static void cant_load_full_object(void)
+{
+       struct struct_ops_autocreate *skel;
+       char *log = NULL;
+       int err;
+
+       skel = struct_ops_autocreate__open();
+       if (!ASSERT_OK_PTR(skel, "struct_ops_autocreate__open"))
+               return;
+
+       if (start_libbpf_log_capture())
+               goto cleanup;
+       /* The testmod_2 map BTF type (struct bpf_testmod_ops___v2) doesn't
+        * match the BTF of the actual struct bpf_testmod_ops defined in the
+        * kernel, so we should fail to load it if we don't disable autocreate
+        * for that map.
+        */
+       err = struct_ops_autocreate__load(skel);
+       log = stop_libbpf_log_capture();
+       if (!ASSERT_ERR(err, "struct_ops_autocreate__load"))
+               goto cleanup;
+
+       ASSERT_HAS_SUBSTR(log, "libbpf: struct_ops init_kern", "init_kern message");
+       ASSERT_EQ(err, -ENOTSUP, "errno should be ENOTSUP");
+
+cleanup:
+       free(log);
+       struct_ops_autocreate__destroy(skel);
+}
+
+static int check_test_1_link(struct struct_ops_autocreate *skel, struct bpf_map *map)
+{
+       struct bpf_link *link;
+       int err;
+
+       link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+       if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops"))
+               return -1;
+
+       /* test_1() would be called from bpf_dummy_reg2() in bpf_testmod.c */
+       err = ASSERT_EQ(skel->bss->test_1_result, 42, "test_1_result");
+       bpf_link__destroy(link);
+       return err;
+}
+
+static void can_load_partial_object(void)
+{
+       struct struct_ops_autocreate *skel;
+       int err;
+
+       skel = struct_ops_autocreate__open();
+       if (!ASSERT_OK_PTR(skel, "struct_ops_autocreate__open_opts"))
+               return;
+
+       err = bpf_map__set_autocreate(skel->maps.testmod_2, false);
+       if (!ASSERT_OK(err, "bpf_map__set_autocreate"))
+               goto cleanup;
+
+       ASSERT_TRUE(bpf_program__autoload(skel->progs.test_1), "test_1 default autoload");
+       ASSERT_TRUE(bpf_program__autoload(skel->progs.test_2), "test_2 default autoload");
+
+       err = struct_ops_autocreate__load(skel);
+       if (ASSERT_OK(err, "struct_ops_autocreate__load"))
+               goto cleanup;
+
+       ASSERT_TRUE(bpf_program__autoload(skel->progs.test_1), "test_1 actual autoload");
+       ASSERT_FALSE(bpf_program__autoload(skel->progs.test_2), "test_2 actual autoload");
+
+       check_test_1_link(skel, skel->maps.testmod_1);
+
+cleanup:
+       struct_ops_autocreate__destroy(skel);
+}
+
+static void optional_maps(void)
+{
+       struct struct_ops_autocreate *skel;
+       int err;
+
+       skel = struct_ops_autocreate__open();
+       if (!ASSERT_OK_PTR(skel, "struct_ops_autocreate__open"))
+               return;
+
+       ASSERT_TRUE(bpf_map__autocreate(skel->maps.testmod_1), "testmod_1 autocreate");
+       ASSERT_TRUE(bpf_map__autocreate(skel->maps.testmod_2), "testmod_2 autocreate");
+       ASSERT_FALSE(bpf_map__autocreate(skel->maps.optional_map), "optional_map autocreate");
+       ASSERT_FALSE(bpf_map__autocreate(skel->maps.optional_map2), "optional_map2 autocreate");
+
+       err  = bpf_map__set_autocreate(skel->maps.testmod_1, false);
+       err |= bpf_map__set_autocreate(skel->maps.testmod_2, false);
+       err |= bpf_map__set_autocreate(skel->maps.optional_map2, true);
+       if (!ASSERT_OK(err, "bpf_map__set_autocreate"))
+               goto cleanup;
+
+       err = struct_ops_autocreate__load(skel);
+       if (ASSERT_OK(err, "struct_ops_autocreate__load"))
+               goto cleanup;
+
+       check_test_1_link(skel, skel->maps.optional_map2);
+
+cleanup:
+       struct_ops_autocreate__destroy(skel);
+}
+
+/* Swap test_mod1->test_1 program from 'bar' to 'foo' using shadow vars.
+ * test_mod1 load should enable autoload for 'foo'.
+ */
+static void autoload_and_shadow_vars(void)
+{
+       struct struct_ops_autocreate2 *skel = NULL;
+       struct bpf_link *link = NULL;
+       int err;
+
+       skel = struct_ops_autocreate2__open();
+       if (!ASSERT_OK_PTR(skel, "struct_ops_autocreate__open_opts"))
+               return;
+
+       ASSERT_FALSE(bpf_program__autoload(skel->progs.foo), "foo default autoload");
+       ASSERT_FALSE(bpf_program__autoload(skel->progs.bar), "bar default autoload");
+
+       /* loading map testmod_1 would switch foo's autoload to true */
+       skel->struct_ops.testmod_1->test_1 = skel->progs.foo;
+
+       err = struct_ops_autocreate2__load(skel);
+       if (ASSERT_OK(err, "struct_ops_autocreate__load"))
+               goto cleanup;
+
+       ASSERT_TRUE(bpf_program__autoload(skel->progs.foo), "foo actual autoload");
+       ASSERT_FALSE(bpf_program__autoload(skel->progs.bar), "bar actual autoload");
+
+       link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
+       if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops"))
+               goto cleanup;
+
+       /* test_1() would be called from bpf_dummy_reg2() in bpf_testmod.c */
+       err = ASSERT_EQ(skel->bss->test_1_result, 42, "test_1_result");
+
+cleanup:
+       bpf_link__destroy(link);
+       struct_ops_autocreate2__destroy(skel);
+}
+
+void test_struct_ops_autocreate(void)
+{
+       if (test__start_subtest("cant_load_full_object"))
+               cant_load_full_object();
+       if (test__start_subtest("can_load_partial_object"))
+               can_load_partial_object();
+       if (test__start_subtest("autoload_and_shadow_vars"))
+               autoload_and_shadow_vars();
+       if (test__start_subtest("optional_maps"))
+               optional_maps();
+}
index 7d6facf46ebb21ac01c0880063d6f3735d4fa473..ee5372c7f2c7c92b6b278a156753fcbecc5a8059 100644 (file)
@@ -30,11 +30,29 @@ cleanup:
        close(fd);
 }
 
+static int attach_ops_and_check(struct struct_ops_module *skel,
+                               struct bpf_map *map,
+                               int expected_test_2_result)
+{
+       struct bpf_link *link;
+
+       link = bpf_map__attach_struct_ops(map);
+       ASSERT_OK_PTR(link, "attach_test_mod_1");
+       if (!link)
+               return -1;
+
+       /* test_{1,2}() would be called from bpf_dummy_reg() in bpf_testmod.c */
+       ASSERT_EQ(skel->bss->test_1_result, 0xdeadbeef, "test_1_result");
+       ASSERT_EQ(skel->bss->test_2_result, expected_test_2_result, "test_2_result");
+
+       bpf_link__destroy(link);
+       return 0;
+}
+
 static void test_struct_ops_load(void)
 {
        struct struct_ops_module *skel;
        struct bpf_map_info info = {};
-       struct bpf_link *link;
        int err;
        u32 len;
 
@@ -59,20 +77,17 @@ static void test_struct_ops_load(void)
        if (!ASSERT_OK(err, "bpf_map_get_info_by_fd"))
                goto cleanup;
 
-       link = bpf_map__attach_struct_ops(skel->maps.testmod_1);
-       ASSERT_OK_PTR(link, "attach_test_mod_1");
-
+       check_map_info(&info);
        /* test_3() will be called from bpf_dummy_reg() in bpf_testmod.c
         *
         * In bpf_testmod.c it will pass 4 and 13 (the value of data) to
         * .test_2.  So, the value of test_2_result should be 20 (4 + 13 +
         * 3).
         */
-       ASSERT_EQ(skel->bss->test_2_result, 20, "check_shadow_variables");
-
-       bpf_link__destroy(link);
-
-       check_map_info(&info);
+       if (!attach_ops_and_check(skel, skel->maps.testmod_1, 20))
+               goto cleanup;
+       if (!attach_ops_and_check(skel, skel->maps.testmod_2, 12))
+               goto cleanup;
 
 cleanup:
        struct_ops_module__destroy(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_multi_pages.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_multi_pages.c
new file mode 100644 (file)
index 0000000..645d32b
--- /dev/null
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+
+#include "struct_ops_multi_pages.skel.h"
+
+static void do_struct_ops_multi_pages(void)
+{
+       struct struct_ops_multi_pages *skel;
+       struct bpf_link *link;
+
+       /* The size of all trampolines of skel->maps.multi_pages should be
+        * over 1 page (at least for x86).
+        */
+       skel = struct_ops_multi_pages__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "struct_ops_multi_pages_open_and_load"))
+               return;
+
+       link = bpf_map__attach_struct_ops(skel->maps.multi_pages);
+       ASSERT_OK_PTR(link, "attach_multi_pages");
+
+       bpf_link__destroy(link);
+       struct_ops_multi_pages__destroy(skel);
+}
+
+void test_struct_ops_multi_pages(void)
+{
+       if (test__start_subtest("multi_pages"))
+               do_struct_ops_multi_pages();
+}
index 9c6072a197456929fda693d6fea4de527ff49adb..985273832f891c291a308d03d35d1469204f9708 100644 (file)
@@ -4,6 +4,7 @@
 
 #include "cap_helpers.h"
 #include "verifier_and.skel.h"
+#include "verifier_arena.skel.h"
 #include "verifier_array_access.skel.h"
 #include "verifier_basic_stack.skel.h"
 #include "verifier_bitfield_write.skel.h"
@@ -118,6 +119,7 @@ static void run_tests_aux(const char *skel_name,
 #define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL)
 
 void test_verifier_and(void)                  { RUN(verifier_and); }
+void test_verifier_arena(void)                { RUN(verifier_arena); }
 void test_verifier_basic_stack(void)          { RUN(verifier_basic_stack); }
 void test_verifier_bitfield_write(void)       { RUN(verifier_bitfield_write); }
 void test_verifier_bounds(void)               { RUN(verifier_bounds); }
diff --git a/tools/testing/selftests/bpf/progs/arena_htab.c b/tools/testing/selftests/bpf/progs/arena_htab.c
new file mode 100644 (file)
index 0000000..b7bb712
--- /dev/null
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARENA);
+       __uint(map_flags, BPF_F_MMAPABLE);
+       __uint(max_entries, 100); /* number of pages */
+} arena SEC(".maps");
+
+#include "bpf_arena_htab.h"
+
+void __arena *htab_for_user;
+bool skip = false;
+
+int zero = 0;
+
+SEC("syscall")
+int arena_htab_llvm(void *ctx)
+{
+#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM)
+       struct htab __arena *htab;
+       __u64 i;
+
+       htab = bpf_alloc(sizeof(*htab));
+       cast_kern(htab);
+       htab_init(htab);
+
+       /* first run. No old elems in the table */
+       for (i = zero; i < 1000; i++)
+               htab_update_elem(htab, i, i);
+
+       /* should replace all elems with new ones */
+       for (i = zero; i < 1000; i++)
+               htab_update_elem(htab, i, i);
+       cast_user(htab);
+       htab_for_user = htab;
+#else
+       skip = true;
+#endif
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/arena_htab_asm.c b/tools/testing/selftests/bpf/progs/arena_htab_asm.c
new file mode 100644 (file)
index 0000000..6cd70ea
--- /dev/null
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#define BPF_ARENA_FORCE_ASM
+#define arena_htab_llvm arena_htab_asm
+#include "arena_htab.c"
diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c
new file mode 100644 (file)
index 0000000..cd35b84
--- /dev/null
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARENA);
+       __uint(map_flags, BPF_F_MMAPABLE);
+       __uint(max_entries, 100); /* number of pages */
+#ifdef __TARGET_ARCH_arm64
+       __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */
+#else
+       __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */
+#endif
+} arena SEC(".maps");
+
+#include "bpf_arena_alloc.h"
+#include "bpf_arena_list.h"
+
+struct elem {
+       struct arena_list_node node;
+       __u64 value;
+};
+
+struct arena_list_head __arena *list_head;
+int list_sum;
+int cnt;
+bool skip = false;
+
+#ifdef __BPF_FEATURE_ARENA_CAST
+long __arena arena_sum;
+int __arena test_val = 1;
+struct arena_list_head __arena global_head;
+#else
+long arena_sum SEC(".arena.1");
+int test_val SEC(".arena.1");
+#endif
+
+int zero;
+
+SEC("syscall")
+int arena_list_add(void *ctx)
+{
+#ifdef __BPF_FEATURE_ARENA_CAST
+       __u64 i;
+
+       list_head = &global_head;
+
+       for (i = zero; i < cnt; cond_break, i++) {
+               struct elem __arena *n = bpf_alloc(sizeof(*n));
+
+               test_val++;
+               n->value = i;
+               arena_sum += i;
+               list_add_head(&n->node, list_head);
+       }
+#else
+       skip = true;
+#endif
+       return 0;
+}
+
+SEC("syscall")
+int arena_list_del(void *ctx)
+{
+#ifdef __BPF_FEATURE_ARENA_CAST
+       struct elem __arena *n;
+       int sum = 0;
+
+       arena_sum = 0;
+       list_for_each_entry(n, list_head, node) {
+               sum += n->value;
+               arena_sum += n->value;
+               list_del(&n->node);
+               bpf_free(n);
+       }
+       list_sum = sum;
+#else
+       skip = true;
+#endif
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bad_struct_ops.c b/tools/testing/selftests/bpf/progs/bad_struct_ops.c
new file mode 100644 (file)
index 0000000..b7e175c
--- /dev/null
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_1) { return 0; }
+
+SEC("struct_ops/test_2")
+int BPF_PROG(test_2) { return 0; }
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops testmod_1 = {
+       .test_1 = (void *)test_1,
+       .test_2 = (void *)test_2
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops2 testmod_2 = {
+       .test_1 = (void *)test_1
+};
diff --git a/tools/testing/selftests/bpf/progs/bad_struct_ops2.c b/tools/testing/selftests/bpf/progs/bad_struct_ops2.c
new file mode 100644 (file)
index 0000000..64a95f6
--- /dev/null
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* This is an unused struct_ops program; it lacks a corresponding
+ * struct_ops map, which would provide attachment information.
+ * Without additional configuration, an attempt to load such a
+ * BPF object file would fail.
+ */
+SEC("struct_ops/foo")
+void foo(void) {}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_autocreate.c b/tools/testing/selftests/bpf/progs/struct_ops_autocreate.c
new file mode 100644 (file)
index 0000000..ba10c38
--- /dev/null
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int test_1_result = 0;
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_1)
+{
+       test_1_result = 42;
+       return 0;
+}
+
+SEC("struct_ops/test_1")
+int BPF_PROG(test_2)
+{
+       return 0;
+}
+
+struct bpf_testmod_ops___v1 {
+       int (*test_1)(void);
+};
+
+struct bpf_testmod_ops___v2 {
+       int (*test_1)(void);
+       int (*does_not_exist)(void);
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops___v1 testmod_1 = {
+       .test_1 = (void *)test_1
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops___v2 testmod_2 = {
+       .test_1 = (void *)test_1,
+       .does_not_exist = (void *)test_2
+};
+
+SEC("?.struct_ops")
+struct bpf_testmod_ops___v1 optional_map = {
+       .test_1 = (void *)test_1,
+};
+
+SEC("?.struct_ops.link")
+struct bpf_testmod_ops___v1 optional_map2 = {
+       .test_1 = (void *)test_1,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_autocreate2.c b/tools/testing/selftests/bpf/progs/struct_ops_autocreate2.c
new file mode 100644 (file)
index 0000000..6049d9c
--- /dev/null
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int test_1_result = 0;
+
+SEC("?struct_ops/test_1")
+int BPF_PROG(foo)
+{
+       test_1_result = 42;
+       return 0;
+}
+
+SEC("?struct_ops/test_1")
+int BPF_PROG(bar)
+{
+       test_1_result = 24;
+       return 0;
+}
+
+struct bpf_testmod_ops {
+       int (*test_1)(void);
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops testmod_1 = {
+       .test_1 = (void *)bar
+};
index 25952fa093482fa2d89e36510460f5119d48f5e8..026cabfa7f1f671ce988da8332607ae25ee9db85 100644 (file)
@@ -7,12 +7,14 @@
 
 char _license[] SEC("license") = "GPL";
 
+int test_1_result = 0;
 int test_2_result = 0;
 
 SEC("struct_ops/test_1")
 int BPF_PROG(test_1)
 {
-       return 0xdeadbeef;
+       test_1_result = 0xdeadbeef;
+       return 0;
 }
 
 SEC("struct_ops/test_2")
@@ -35,3 +37,20 @@ struct bpf_testmod_ops testmod_1 = {
        .data = 0x1,
 };
 
+SEC("struct_ops/test_2")
+void BPF_PROG(test_2_v2, int a, int b)
+{
+       test_2_result = a * b;
+}
+
+struct bpf_testmod_ops___v2 {
+       int (*test_1)(void);
+       void (*test_2)(int a, int b);
+       int (*test_maybe_null)(int dummy, struct task_struct *task);
+};
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops___v2 testmod_2 = {
+       .test_1 = (void *)test_1,
+       .test_2 = (void *)test_2_v2,
+};
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c b/tools/testing/selftests/bpf/progs/struct_ops_multi_pages.c
new file mode 100644 (file)
index 0000000..9efcc6e
--- /dev/null
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../bpf_testmod/bpf_testmod.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define TRAMP(x) \
+       SEC("struct_ops/tramp_" #x)             \
+       int BPF_PROG(tramp_ ## x, int a)        \
+       {                                       \
+               return a;                       \
+       }
+
+TRAMP(1)
+TRAMP(2)
+TRAMP(3)
+TRAMP(4)
+TRAMP(5)
+TRAMP(6)
+TRAMP(7)
+TRAMP(8)
+TRAMP(9)
+TRAMP(10)
+TRAMP(11)
+TRAMP(12)
+TRAMP(13)
+TRAMP(14)
+TRAMP(15)
+TRAMP(16)
+TRAMP(17)
+TRAMP(18)
+TRAMP(19)
+TRAMP(20)
+TRAMP(21)
+TRAMP(22)
+TRAMP(23)
+TRAMP(24)
+TRAMP(25)
+TRAMP(26)
+TRAMP(27)
+TRAMP(28)
+TRAMP(29)
+TRAMP(30)
+TRAMP(31)
+TRAMP(32)
+TRAMP(33)
+TRAMP(34)
+TRAMP(35)
+TRAMP(36)
+TRAMP(37)
+TRAMP(38)
+TRAMP(39)
+TRAMP(40)
+
+#define F_TRAMP(x) .tramp_ ## x = (void *)tramp_ ## x
+
+SEC(".struct_ops.link")
+struct bpf_testmod_ops multi_pages = {
+       F_TRAMP(1),
+       F_TRAMP(2),
+       F_TRAMP(3),
+       F_TRAMP(4),
+       F_TRAMP(5),
+       F_TRAMP(6),
+       F_TRAMP(7),
+       F_TRAMP(8),
+       F_TRAMP(9),
+       F_TRAMP(10),
+       F_TRAMP(11),
+       F_TRAMP(12),
+       F_TRAMP(13),
+       F_TRAMP(14),
+       F_TRAMP(15),
+       F_TRAMP(16),
+       F_TRAMP(17),
+       F_TRAMP(18),
+       F_TRAMP(19),
+       F_TRAMP(20),
+       F_TRAMP(21),
+       F_TRAMP(22),
+       F_TRAMP(23),
+       F_TRAMP(24),
+       F_TRAMP(25),
+       F_TRAMP(26),
+       F_TRAMP(27),
+       F_TRAMP(28),
+       F_TRAMP(29),
+       F_TRAMP(30),
+       F_TRAMP(31),
+       F_TRAMP(32),
+       F_TRAMP(33),
+       F_TRAMP(34),
+       F_TRAMP(35),
+       F_TRAMP(36),
+       F_TRAMP(37),
+       F_TRAMP(38),
+       F_TRAMP(39),
+       F_TRAMP(40),
+};
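
On the user-space side a struct_ops map such as multi_pages is normally attached as a link; building the trampoline image for these 40 callbacks is what exercises support for images larger than one page. A hypothetical sketch (skeleton names assumed for illustration) using bpf_map__attach_struct_ops():

#include "struct_ops_multi_pages.skel.h"

/* Sketch: load and attach the 40-member struct_ops map so the kernel
 * must build a multi-page trampoline image.
 */
static int attach_multi_pages(void)
{
	struct struct_ops_multi_pages *skel;
	struct bpf_link *link;

	skel = struct_ops_multi_pages__open_and_load();
	if (!skel)
		return -1;
	link = bpf_map__attach_struct_ops(skel->maps.multi_pages);
	if (!link) {
		struct_ops_multi_pages__destroy(skel);
		return -1;
	}
	bpf_link__destroy(link);
	struct_ops_multi_pages__destroy(skel);
	return 0;
}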
index 694e7cec1823ccae864171eaca2f65decf14eac8..5fda43901033a306d263145703b7705284007523 100644 (file)
@@ -33,6 +33,27 @@ int bench_trigger_kprobe(void *ctx)
        return 0;
 }
 
+SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
+int bench_trigger_kretprobe(void *ctx)
+{
+       __sync_add_and_fetch(&hits, 1);
+       return 0;
+}
+
+SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
+int bench_trigger_kprobe_multi(void *ctx)
+{
+       __sync_add_and_fetch(&hits, 1);
+       return 0;
+}
+
+SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
+int bench_trigger_kretprobe_multi(void *ctx)
+{
+       __sync_add_and_fetch(&hits, 1);
+       return 0;
+}
+
 SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry(void *ctx)
 {
@@ -40,6 +61,13 @@ int bench_trigger_fentry(void *ctx)
        return 0;
 }
 
+SEC("fexit/" SYS_PREFIX "sys_getpgid")
+int bench_trigger_fexit(void *ctx)
+{
+       __sync_add_and_fetch(&hits, 1);
+       return 0;
+}
+
 SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry_sleep(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
new file mode 100644 (file)
index 0000000..5540b05
--- /dev/null
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARENA);
+       __uint(map_flags, BPF_F_MMAPABLE);
+       __uint(max_entries, 2); /* arena of two pages close to 32-bit boundary */
+       __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */
+} arena SEC(".maps");
+
+SEC("syscall")
+__success __retval(0)
+int basic_alloc1(void *ctx)
+{
+#if defined(__BPF_FEATURE_ARENA_CAST)
+       volatile int __arena *page1, *page2, *no_page, *page3;
+
+       page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+       if (!page1)
+               return 1;
+       *page1 = 1;
+       page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+       if (!page2)
+               return 2;
+       *page2 = 2;
+       no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+       if (no_page)
+               return 3;
+       if (*page1 != 1)
+               return 4;
+       if (*page2 != 2)
+               return 5;
+       bpf_arena_free_pages(&arena, (void __arena *)page2, 1);
+       if (*page1 != 1)
+               return 6;
+       if (*page2 != 0) /* use-after-free should return 0 */
+               return 7;
+       page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+       if (!page3)
+               return 8;
+       *page3 = 3;
+       if (page2 != page3)
+               return 9;
+       if (*page1 != 1)
+               return 10;
+#endif
+       return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int basic_alloc2(void *ctx)
+{
+#if defined(__BPF_FEATURE_ARENA_CAST)
+       volatile char __arena *page1, *page2, *page3, *page4;
+
+       page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
+       if (!page1)
+               return 1;
+       page2 = page1 + __PAGE_SIZE;
+       page3 = page1 + __PAGE_SIZE * 2;
+       page4 = page1 - __PAGE_SIZE;
+       *page1 = 1;
+       *page2 = 2;
+       *page3 = 3;
+       *page4 = 4;
+       if (*page1 != 1)
+               return 1;
+       if (*page2 != 2)
+               return 2;
+       if (*page3 != 0)
+               return 3;
+       if (*page4 != 0)
+               return 4;
+       bpf_arena_free_pages(&arena, (void __arena *)page1, 2);
+       if (*page1 != 0)
+               return 5;
+       if (*page2 != 0)
+               return 6;
+       if (*page3 != 0)
+               return 7;
+       if (*page4 != 0)
+               return 8;
+#endif
+       return 0;
+}
+
+struct bpf_arena___l {
+        struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+SEC("syscall")
+__success __retval(0) __log_level(2)
+int basic_alloc3(void *ctx)
+{
+       struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena;
+       volatile char __arena *pages;
+
+       pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0);
+       if (!pages)
+               return 1;
+       return 0;
+}
+
+SEC("iter.s/bpf_map")
+__success __log_level(2)
+int iter_maps1(struct bpf_iter__bpf_map *ctx)
+{
+       struct bpf_map *map = ctx->map;
+
+       if (!map)
+               return 0;
+       bpf_arena_alloc_pages(map, NULL, map->max_entries, 0, 0);
+       return 0;
+}
+
+SEC("iter.s/bpf_map")
+__failure __msg("expected pointer to STRUCT bpf_map")
+int iter_maps2(struct bpf_iter__bpf_map *ctx)
+{
+       struct seq_file *seq = ctx->meta->seq;
+
+       bpf_arena_alloc_pages((void *)seq, NULL, 1, 0, 0);
+       return 0;
+}
+
+SEC("iter.s/bpf_map")
+__failure __msg("untrusted_ptr_bpf_map")
+int iter_maps3(struct bpf_iter__bpf_map *ctx)
+{
+       struct bpf_map *map = ctx->map;
+
+       if (!map)
+               return 0;
+       bpf_arena_alloc_pages(map->inner_map_meta, NULL, map->max_entries, 0, 0);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
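
Because the arena map is declared BPF_F_MMAPABLE, user space can map the same pages these programs allocate and read or write them directly. A rough sketch follows; the mmap() call mirrors how other mmapable BPF maps are used and is an assumption here, not code from this patch:

#include <sys/mman.h>
#include <bpf/libbpf.h>

/* Sketch: map two arena pages into user space; reads and writes then
 * hit the same memory the BPF programs above operate on.
 */
static void *map_arena(struct bpf_map *arena_map, size_t two_pages)
{
	return mmap(NULL, two_pages, PROT_READ | PROT_WRITE, MAP_SHARED,
		    bpf_map__fd(arena_map), 0);
}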
index a955a6358206eac8a4b5065b531e0171d4cc2a62..99e561f18f9b67aa44af233c8941cb1e37776248 100644 (file)
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bpf.h>
-#include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
+#include "bpf_experimental.h"
 
 struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
@@ -309,4 +307,103 @@ int iter_limit_bug(struct __sk_buff *skb)
        return 0;
 }
 
+#define ARR_SZ 1000000
+int zero;
+char arr[ARR_SZ];
+
+SEC("socket")
+__success __retval(0xd495cdc0)
+int cond_break1(const void *ctx)
+{
+       unsigned long i;
+       unsigned int sum = 0;
+
+       for (i = zero; i < ARR_SZ; cond_break, i++)
+               sum += i;
+       for (i = zero; i < ARR_SZ; i++) {
+               barrier_var(i);
+               sum += i + arr[i];
+               cond_break;
+       }
+
+       return sum;
+}
+
+SEC("socket")
+__success __retval(999000000)
+int cond_break2(const void *ctx)
+{
+       int i, j;
+       int sum = 0;
+
+       for (i = zero; i < 1000; cond_break, i++)
+               for (j = zero; j < 1000; j++) {
+                       sum += i + j;
+                       cond_break;
+               }
+
+       return sum;
+}
+
+static __noinline int loop(void)
+{
+       int i, sum = 0;
+
+       for (i = zero; i <= 1000000; i++, cond_break)
+               sum += i;
+
+       return sum;
+}
+
+SEC("socket")
+__success __retval(0x6a5a2920)
+int cond_break3(const void *ctx)
+{
+       return loop();
+}
+
+SEC("socket")
+__success __retval(1)
+int cond_break4(const void *ctx)
+{
+       int cnt = zero;
+
+       for (;;) {
+               /* should eventually break out of the loop */
+               cond_break;
+               cnt++;
+       }
+       /* if we looped a bit, it's a success */
+       return cnt > 1 ? 1 : 0;
+}
+
+static __noinline int static_subprog(void)
+{
+       int cnt = zero;
+
+       for (;;) {
+               cond_break;
+               cnt++;
+       }
+
+       return cnt;
+}
+
+SEC("socket")
+__success __retval(1)
+int cond_break5(const void *ctx)
+{
+       int cnt1 = zero, cnt2;
+
+       for (;;) {
+               cond_break;
+               cnt1++;
+       }
+
+       cnt2 = static_subprog();
+
+       /* main and subprog have to loop a bit */
+       return cnt1 > 1 && cnt2 > 1 ? 1 : 0;
+}
+
 char _license[] SEC("license") = "GPL";
index ba57601c2a4d5b98fdf5513812efab6f40c389c5..524c38e9cde48feb8977bb26555d29df3656c85a 100644 (file)
@@ -501,7 +501,7 @@ static bool is_unpriv_capable_map(struct bpf_map *map)
        }
 }
 
-static int do_prog_test_run(int fd_prog, int *retval)
+static int do_prog_test_run(int fd_prog, int *retval, bool empty_opts)
 {
        __u8 tmp_out[TEST_DATA_LEN << 2] = {};
        __u8 tmp_in[TEST_DATA_LEN] = {};
@@ -514,6 +514,10 @@ static int do_prog_test_run(int fd_prog, int *retval)
                .repeat = 1,
        );
 
+       if (empty_opts) {
+               memset(&topts, 0, sizeof(struct bpf_test_run_opts));
+               topts.sz = sizeof(struct bpf_test_run_opts);
+       }
        err = bpf_prog_test_run_opts(fd_prog, &topts);
        saved_errno = errno;
 
@@ -649,7 +653,8 @@ void run_subtest(struct test_loader *tester,
                        }
                }
 
-               do_prog_test_run(bpf_program__fd(tprog), &retval);
+               do_prog_test_run(bpf_program__fd(tprog), &retval,
+                                bpf_program__type(tprog) == BPF_PROG_TYPE_SYSCALL ? true : false);
                if (retval != subspec->retval && subspec->retval != POINTER_VALUE) {
                        PRINT_FAIL("Unexpected retval: %d != %d\n", retval, subspec->retval);
                        goto tobj_cleanup;
index 808550986f306530be2010f7e417ced77d7ba3a3..89ff704e9dad5eabfe35047616f109f6f133a27e 100644 (file)
@@ -683,11 +683,69 @@ static const struct argp_option opts[] = {
        {},
 };
 
+static FILE *libbpf_capture_stream;
+
+static struct {
+       char *buf;
+       size_t buf_sz;
+} libbpf_output_capture;
+
+/* Creates a global memstream capturing INFO and WARN level output
+ * passed to libbpf_print_fn.
+ * Returns 0 on success, a negative value on failure.
+ * On failure the error is reported using PRINT_FAIL and the
+ * current test case is marked as failed.
+ */
+int start_libbpf_log_capture(void)
+{
+       if (libbpf_capture_stream) {
+               PRINT_FAIL("%s: libbpf_capture_stream != NULL\n", __func__);
+               return -EINVAL;
+       }
+
+       libbpf_capture_stream = open_memstream(&libbpf_output_capture.buf,
+                                              &libbpf_output_capture.buf_sz);
+       if (!libbpf_capture_stream) {
+               PRINT_FAIL("%s: open_memstream failed errno=%d\n", __func__, errno);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* Destroys the global memstream created by start_libbpf_log_capture().
+ * Returns a pointer to the captured data, which has to be freed by
+ * the caller. The returned buffer is null terminated.
+ */
+char *stop_libbpf_log_capture(void)
+{
+       char *buf;
+
+       if (!libbpf_capture_stream)
+               return NULL;
+
+       fputc(0, libbpf_capture_stream);
+       fclose(libbpf_capture_stream);
+       libbpf_capture_stream = NULL;
+       /* get 'buf' after fclose(), see open_memstream() documentation */
+       buf = libbpf_output_capture.buf;
+       memset(&libbpf_output_capture, 0, sizeof(libbpf_output_capture));
+       return buf;
+}
+
 static int libbpf_print_fn(enum libbpf_print_level level,
                           const char *format, va_list args)
 {
+       if (libbpf_capture_stream && level != LIBBPF_DEBUG) {
+               va_list args2;
+
+               va_copy(args2, args);
+               vfprintf(libbpf_capture_stream, format, args2);
+       }
+
        if (env.verbosity < VERBOSE_VERY && level == LIBBPF_DEBUG)
                return 0;
+
        vfprintf(stdout, format, args);
        return 0;
 }
@@ -1081,6 +1139,7 @@ static void run_one_test(int test_num)
                cleanup_cgroup_environment();
 
        stdio_restore();
+       free(stop_libbpf_log_capture());
 
        dump_test_log(test, state, false, false, NULL);
 }
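
A test case could use the new capture helpers roughly as follows, with the existing ASSERT_HAS_SUBSTR macro from test_progs.h; the test body and expected message are illustrative only:

/* Sketch: capture libbpf warnings emitted by an operation expected to
 * fail, then assert on the captured text.
 */
static void check_expected_warning(void)
{
	char *log;

	if (start_libbpf_log_capture())
		return;
	/* ... perform an operation that makes libbpf print a warning ... */
	log = stop_libbpf_log_capture();
	ASSERT_HAS_SUBSTR(log, "expected warning text", "libbpf_log");
	free(log);
}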
index 80df5124488667ece3bbe157bbf7df04c0bd0d73..0ba5a20b19ba8e8e0ec3f7ca774ba847c1cb3320 100644 (file)
@@ -397,6 +397,9 @@ int test__join_cgroup(const char *path);
                system(cmd);                                            \
        })
 
+int start_libbpf_log_capture(void);
+char *stop_libbpf_log_capture(void);
+
 static inline __u64 ptr_to_u64(const void *ptr)
 {
        return (__u64) (unsigned long) ptr;
index 878d68db0325673c0fa16ccefe583ea283903f88..bdf5d8180067f8d2bfeb2ad520aae86134876a10 100644 (file)
@@ -480,7 +480,7 @@ peek:
                                        for (int j = 0; j < 500; j++) {
                                                if (complete_tx(xsk, clock_id))
                                                        break;
-                                               usleep(10*1000);
+                                               usleep(10);
                                        }
                                }
                        }