Analysis of the eBPF perf buffer mechanism
What is a perf buffer
In eBPF, high-volume data exchange between user space and kernel space relies on the perf buffer mechanism, which keeps that exchange efficient.
Usage with libbpf
The code below comes from the kernel source tree, under the samples directory.
< samples/bpf/trace_output_kern.c >
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(u32));
	__uint(max_entries, 2);
} my_map SEC(".maps");
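A BPF_MAP_TYPE_PERF_EVENT_ARRAY does not store the samples itself: each element holds a perf event fd installed from user space, and the usual convention, which libbpf's perf_buffer__new() follows, is one element per CPU, indexed by CPU id. max_entries therefore bounds how many CPUs can have a ring attached.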
Next, a kprobe is attached to the sys_write syscall; every time the syscall fires, the program records the current pid/tgid and pushes it to the map.
< samples/bpf/trace_output_kern.c >
SEC("kprobe/" SYSCALL(sys_write)) /* 使用kprobe监控sys_write调用 */
int bpf_prog1(struct pt_regs *ctx)
{struct S {u64 pid;u64 cookie;} data;data.pid = bpf_get_current_pid_tgid();data.cookie = 0x12345678;bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));return 0;
}
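The third argument of bpf_perf_event_output() selects the map slot, and therefore the per-CPU ring, that receives the sample. Passing a literal 0 works in this sample because the event-generating workload is pinned to CPU 0 with taskset; the more common pattern is to let the helper pick the slot of the CPU the program is currently running on:

	bpf_perf_event_output(ctx, &my_map, BPF_F_CURRENT_CPU, &data, sizeof(data)); /* write to the current CPU's ring */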
The user-space side
< samples/bpf/trace_output_user.c >
static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) /* the per-sample callback */
{
	struct {
		__u64 pid;
		__u64 cookie;
	} *e = data;

	if (e->cookie != 0x12345678) {
		printf("BUG pid %llx cookie %llx sized %d\n",
		       e->pid, e->cookie, size);
		return;
	}

	cnt++;
	if (cnt == MAX_CNT) {
		printf("recv %lld events per sec\n",
		       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
		return;
	}
}

int main(int argc, char **argv)
{
	struct bpf_link *link = NULL;
	struct bpf_program *prog;
	struct perf_buffer *pb;
	struct bpf_object *obj;
	int map_fd, ret = 0;
	char filename[256];
	FILE *f;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	obj = bpf_object__open_file(filename, NULL);
	if (libbpf_get_error(obj)) {
		fprintf(stderr, "ERROR: opening BPF object file failed\n");
		return 0;
	}

	/* load BPF program */
	if (bpf_object__load(obj)) {
		fprintf(stderr, "ERROR: loading BPF object file failed\n");
		goto cleanup;
	}

	map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); /* look up the map defined on the BPF side by name; returns an fd */
	if (map_fd < 0) {
		fprintf(stderr, "ERROR: finding a map in obj file failed\n");
		goto cleanup;
	}

	prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); /* look up the program defined on the BPF side by name */
	if (libbpf_get_error(prog)) {
		fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
		goto cleanup;
	}

	link = bpf_program__attach(prog);
	if (libbpf_get_error(link)) {
		fprintf(stderr, "ERROR: bpf_program__attach failed\n");
		link = NULL;
		goto cleanup;
	}

	pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); /* set up the perf buffer; print_bpf_output is the sample callback */
	ret = libbpf_get_error(pb);
	if (ret) {
		printf("failed to setup perf_buffer: %d\n", ret);
		return 1;
	}

	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
	(void) f;

	start_time = time_get_ns();
	while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { /* poll for data coming from the kernel */
	}
	kill(0, SIGINT);

cleanup:
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return ret;
}
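One detail worth noting: the fourth argument of perf_buffer__new() is a lost-event callback, which this sample leaves NULL. When a per-CPU ring overflows, samples are silently dropped, so production consumers usually register one. A minimal sketch (handle_lost is a hypothetical name):

	static void handle_lost(void *ctx, int cpu, __u64 lost_cnt) /* matches libbpf's perf_buffer_lost_fn */
	{
		fprintf(stderr, "lost %llu samples on CPU %d\n", lost_cnt, cpu);
	}

	pb = perf_buffer__new(map_fd, 8 /* pages per CPU ring */, print_bpf_output, handle_lost, NULL, NULL);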
How the kernel implements the perf buffer mechanism
The user-space program calls bpf_object__open_file() (implemented on top of libbpf's internal interfaces) to parse trace_output_kern.o and extract the map definitions declared in the SEC("maps") and SEC(".maps") sections.
For an eBPF map to actually become operational and carry data between user space and the kernel, a map object still has to be created inside the kernel. Let's walk through the whole creation flow.
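Before following the kernel code, it helps to see what libbpf ultimately issues on the map's behalf. Stripped of the object-file plumbing, creating my_map boils down to a single bpf(2) call; a minimal sketch using the same attributes:

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	int create_perf_event_array(void)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_type    = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
		attr.key_size    = sizeof(int);
		attr.value_size  = sizeof(__u32);
		attr.max_entries = 2;
		strncpy(attr.map_name, "my_map", sizeof(attr.map_name) - 1);

		/* on success the return value is the new map fd */
		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	}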
__sys_bpf()
< kernel/bpf/syscall.c >
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
}

static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
	union bpf_attr attr;
	bool capable;
	int err;

	capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;

	/* Intent here is for unprivileged_bpf_disabled to block key object
	 * creation commands for unprivileged users; other actions depend
	 * of fd availability and access to bpffs, so are dependent on
	 * object creation success. Capabilities are later verified for
	 * operations such as load and map create, so even with unprivileged
	 * BPF disabled, capability checks are still carried out for these
	 * and other operations.
	 */
	if (!capable &&
	    (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
		return -EPERM;

	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
	if (err)
		return err;
	size = min_t(u32, size, sizeof(attr));

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	memset(&attr, 0, sizeof(attr));
	if (copy_from_bpfptr(&attr, uattr, size) != 0)
		return -EFAULT;

	err = security_bpf(cmd, &attr, size);
	if (err < 0)
		return err;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr); /* entry point for map creation */
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr, uattr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr, uattr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_MAP_FREEZE:
		err = map_freeze(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr, uattr);
		break;
	case BPF_OBJ_PIN:
		err = bpf_obj_pin(&attr);
		break;
	case BPF_OBJ_GET:
		err = bpf_obj_get(&attr);
		break;
	case BPF_PROG_ATTACH:
		err = bpf_prog_attach(&attr);
		break;
	case BPF_PROG_DETACH:
		err = bpf_prog_detach(&attr);
		break;
	case BPF_PROG_QUERY:
		err = bpf_prog_query(&attr, uattr.user);
		break;
	case BPF_PROG_TEST_RUN:
		err = bpf_prog_test_run(&attr, uattr.user);
		break;
	case BPF_PROG_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr.user,
					  &prog_idr, &prog_idr_lock);
		break;
	case BPF_MAP_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr.user,
					  &map_idr, &map_idr_lock);
		break;
	case BPF_BTF_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr.user,
					  &btf_idr, &btf_idr_lock);
		break;
	case BPF_PROG_GET_FD_BY_ID:
		err = bpf_prog_get_fd_by_id(&attr);
		break;
	case BPF_MAP_GET_FD_BY_ID:
		err = bpf_map_get_fd_by_id(&attr);
		break;
	case BPF_OBJ_GET_INFO_BY_FD:
		err = bpf_obj_get_info_by_fd(&attr, uattr.user);
		break;
	case BPF_RAW_TRACEPOINT_OPEN:
		err = bpf_raw_tracepoint_open(&attr);
		break;
	case BPF_BTF_LOAD:
		err = bpf_btf_load(&attr, uattr);
		break;
	case BPF_BTF_GET_FD_BY_ID:
		err = bpf_btf_get_fd_by_id(&attr);
		break;
	case BPF_TASK_FD_QUERY:
		err = bpf_task_fd_query(&attr, uattr.user);
		break;
	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
		err = map_lookup_and_delete_elem(&attr);
		break;
	case BPF_MAP_LOOKUP_BATCH:
		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
		break;
	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
		err = bpf_map_do_batch(&attr, uattr.user,
				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
		break;
	case BPF_MAP_UPDATE_BATCH:
		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
		break;
	case BPF_MAP_DELETE_BATCH:
		err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
		break;
	case BPF_LINK_CREATE:
		err = link_create(&attr, uattr);
		break;
	case BPF_LINK_UPDATE:
		err = link_update(&attr);
		break;
	case BPF_LINK_GET_FD_BY_ID:
		err = bpf_link_get_fd_by_id(&attr);
		break;
	case BPF_LINK_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr.user,
					  &link_idr, &link_idr_lock);
		break;
	case BPF_ENABLE_STATS:
		err = bpf_enable_stats(&attr);
		break;
	case BPF_ITER_CREATE:
		err = bpf_iter_create(&attr);
		break;
	case BPF_LINK_DETACH:
		err = link_detach(&attr);
		break;
	case BPF_PROG_BIND_MAP:
		err = bpf_prog_bind_map(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
Map type definitions
< include/uapi/linux/bpf.h >
enum bpf_map_type {
	BPF_MAP_TYPE_UNSPEC,
	BPF_MAP_TYPE_HASH,
	BPF_MAP_TYPE_ARRAY,
	BPF_MAP_TYPE_PROG_ARRAY,
	BPF_MAP_TYPE_PERF_EVENT_ARRAY, /* the perf buffer map type */
	BPF_MAP_TYPE_PERCPU_HASH,
	BPF_MAP_TYPE_PERCPU_ARRAY,
	BPF_MAP_TYPE_STACK_TRACE,
	BPF_MAP_TYPE_CGROUP_ARRAY,
	BPF_MAP_TYPE_LRU_HASH,
	BPF_MAP_TYPE_LRU_PERCPU_HASH,
	BPF_MAP_TYPE_LPM_TRIE,
	BPF_MAP_TYPE_ARRAY_OF_MAPS,
	BPF_MAP_TYPE_HASH_OF_MAPS,
	BPF_MAP_TYPE_DEVMAP,
	BPF_MAP_TYPE_SOCKMAP,
	BPF_MAP_TYPE_CPUMAP,
	BPF_MAP_TYPE_XSKMAP,
	BPF_MAP_TYPE_SOCKHASH,
	BPF_MAP_TYPE_CGROUP_STORAGE,
	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
	BPF_MAP_TYPE_QUEUE,
	BPF_MAP_TYPE_STACK,
	BPF_MAP_TYPE_SK_STORAGE,
	BPF_MAP_TYPE_DEVMAP_HASH,
	BPF_MAP_TYPE_STRUCT_OPS,
	BPF_MAP_TYPE_RINGBUF,
	BPF_MAP_TYPE_INODE_STORAGE,
	BPF_MAP_TYPE_TASK_STORAGE,
	BPF_MAP_TYPE_BLOOM_FILTER,
	BPF_MAP_TYPE_USER_RINGBUF,
};
map_create()
< kernel/bpf/syscall.c >
static int map_create(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map *map;
	int f_flags;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	if (attr->btf_vmlinux_value_type_id) {
		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
		    attr->btf_key_type_id || attr->btf_value_type_id)
			return -EINVAL;
	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
		return -EINVAL;
	}

	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
	    attr->map_extra != 0)
		return -EINVAL;

	f_flags = bpf_get_file_flag(attr->map_flags);
	if (f_flags < 0)
		return f_flags;

	if (numa_node != NUMA_NO_NODE &&
	    ((unsigned int)numa_node >= nr_node_ids ||
	     !node_online(numa_node)))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr); /* create the struct bpf_map object */
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = bpf_obj_name_cpy(map->name, attr->map_name,
			       sizeof(attr->map_name)); /* copy the map name */
	if (err < 0)
		goto free_map;

	atomic64_set(&map->refcnt, 1);
	atomic64_set(&map->usercnt, 1);
	mutex_init(&map->freeze_mutex);
	spin_lock_init(&map->owner.lock);

	map->spin_lock_off = -EINVAL;
	map->timer_off = -EINVAL;
	if (attr->btf_key_type_id || attr->btf_value_type_id ||
	    /* Even the map's value is a kernel's struct,
	     * the bpf_prog.o must have BTF to begin with
	     * to figure out the corresponding kernel's
	     * counter part. Thus, attr->btf_fd has
	     * to be valid also.
	     */
	    attr->btf_vmlinux_value_type_id) {
		struct btf *btf;

		btf = btf_get_by_fd(attr->btf_fd);
		if (IS_ERR(btf)) {
			err = PTR_ERR(btf);
			goto free_map;
		}
		if (btf_is_kernel(btf)) {
			btf_put(btf);
			err = -EACCES;
			goto free_map;
		}
		map->btf = btf;

		if (attr->btf_value_type_id) {
			err = map_check_btf(map, btf, attr->btf_key_type_id,
					    attr->btf_value_type_id);
			if (err)
				goto free_map;
		}

		map->btf_key_type_id = attr->btf_key_type_id;
		map->btf_value_type_id = attr->btf_value_type_id;
		map->btf_vmlinux_value_type_id =
			attr->btf_vmlinux_value_type_id;
	}

	err = bpf_map_alloc_off_arr(map);
	if (err)
		goto free_map;

	err = security_bpf_map_alloc(map);
	if (err)
		goto free_map_off_arr;

	err = bpf_map_alloc_id(map);
	if (err)
		goto free_map_sec;

	bpf_map_save_memcg(map);

	err = bpf_map_new_fd(map, f_flags); /* returns the map's file descriptor; on success `err` actually holds the new fd, so the variable name is a little misleading */
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_map_put_with_uref() is needed because the above
		 * bpf_map_alloc_id() has published the map
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
		 */
		bpf_map_put_with_uref(map);
		return err;
	}

	return err;

free_map_sec:
	security_bpf_map_free(map);
free_map_off_arr:
	kfree(map->off_arr);
free_map:
	btf_put(map->btf);
	map->ops->map_free(map);
	return err;
}
find_and_alloc_map()
< kernel/bpf/syscall.c >
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	const struct bpf_map_ops *ops;
	u32 type = attr->map_type;
	struct bpf_map *map;
	int err;

	if (type >= ARRAY_SIZE(bpf_map_types))
		return ERR_PTR(-EINVAL);
	type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
	ops = bpf_map_types[type]; /* look up the struct bpf_map_ops that matches attr->map_type */
	if (!ops)
		return ERR_PTR(-EINVAL);

	if (ops->map_alloc_check) {
		err = ops->map_alloc_check(attr);
		if (err)
			return ERR_PTR(err);
	}
	if (attr->map_ifindex)
		ops = &bpf_map_offload_ops;
	map = ops->map_alloc(attr); /* allocate the struct bpf_map object */
	if (IS_ERR(map))
		return map;
	map->ops = ops;
	map->map_type = type;
	return map;
}
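The bpf_map_types[] table consulted here is generated at compile time: include/linux/bpf_types.h lists every map type together with its ops, and kernel/bpf/syscall.c expands that list into an array indexed by the enum value. This is roughly how BPF_MAP_TYPE_PERF_EVENT_ARRAY resolves to perf_event_array_map_ops (abridged):

< kernel/bpf/syscall.c >
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

< include/linux/bpf_types.h >
BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)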
perf_event_array_map_ops
The ops for the BPF_MAP_TYPE_PERF_EVENT_ARRAY map type:
< kernel/bpf/arraymap.c >
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc, /* map_alloc allocates the map's memory */
	.map_free = perf_event_fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
	.map_check_btf = map_check_no_btf,
	.map_btf_id = &array_map_btf_ids[0],
};
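Note that the kernel map code only provides the slots; the perf events themselves are opened from user space and installed through map updates, which is when map_fd_get_ptr (perf_event_fd_array_get_ptr) runs. The sketch below approximates what perf_buffer__new() does for each online CPU; it is a simplification of libbpf's internals, not a verbatim copy:

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/perf_event.h>
	#include <bpf/bpf.h>

	/* sketch: set up one per-CPU ring of a perf event array (simplified) */
	static int open_cpu_ring(int map_fd, int cpu, size_t page_cnt)
	{
		struct perf_event_attr attr = {
			.size          = sizeof(struct perf_event_attr),
			.type          = PERF_TYPE_SOFTWARE,
			.config        = PERF_COUNT_SW_BPF_OUTPUT, /* fed by bpf_perf_event_output() */
			.sample_type   = PERF_SAMPLE_RAW,
			.sample_period = 1,
			.wakeup_events = 1,
		};
		long page_size = sysconf(_SC_PAGESIZE);
		void *ring;
		int pfd;

		pfd = syscall(__NR_perf_event_open, &attr, -1 /* any pid */, cpu, -1,
			      PERF_FLAG_FD_CLOEXEC);
		if (pfd < 0)
			return -1;

		/* one control page plus page_cnt data pages per CPU ring */
		ring = mmap(NULL, (page_cnt + 1) * page_size, PROT_READ | PROT_WRITE,
			    MAP_SHARED, pfd, 0);
		if (ring == MAP_FAILED)
			return -1;

		/* storing the fd in slot `cpu` is what triggers perf_event_fd_array_get_ptr() */
		if (bpf_map_update_elem(map_fd, &cpu, &pfd, BPF_ANY))
			return -1;

		return ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0);
	}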
array_map_alloc()
< kernel/bpf/arraymap.c >
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool bypass_spec_v1 = bpf_bypass_spec_v1();
	u64 array_size, mask64;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (!bypass_spec_v1) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu) {
		array_size += (u64) max_entries * sizeof(void *);
	} else {
		/* rely on vmalloc() to return page-aligned memory and
		 * ensure array->value is exactly page-aligned
		 */
		if (attr->map_flags & BPF_F_MMAPABLE) {
			array_size = PAGE_ALIGN(array_size);
			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
		} else {
			array_size += (u64) max_entries * elem_size;
		}
	}

	/* allocate all map elements and zero-initialize them */
	if (attr->map_flags & BPF_F_MMAPABLE) {
		void *data;

		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
		if (!data)
			return ERR_PTR(-ENOMEM);
		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
			- offsetof(struct bpf_array, value);
	} else {
		array = bpf_map_area_alloc(array_size, numa_node); /* allocate the map's memory */
	}
	if (!array)
		return ERR_PTR(-ENOMEM);
	array->index_mask = index_mask;
	array->map.bypass_spec_v1 = bypass_spec_v1;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}
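Plugging my_map's attributes into this function makes the numbers concrete: value_size = 4 rounds up to elem_size = 8, fls_long(2 - 1) = 1 gives index_mask = (1ULL << 1) - 1 = 1, the rounded max_entries stays 2, and the non-percpu, non-mmapable path ends with array_size = sizeof(struct bpf_array) + 2 * 8. A standalone user-space replica of the mask math, as a sketch:

	#include <stdio.h>

	/* highest set bit position, 1-based; fls(0) == 0, mirroring the kernel's fls_long() */
	static unsigned int fls_long(unsigned long x)
	{
		unsigned int r = 0;

		while (x) {
			x >>= 1;
			r++;
		}
		return r;
	}

	int main(void)
	{
		unsigned int max_entries = 2; /* my_map's max_entries */
		unsigned long long mask64 = (1ULL << fls_long(max_entries - 1)) - 1;

		/* prints: index_mask=1 rounded_max_entries=2 */
		printf("index_mask=%llu rounded_max_entries=%llu\n", mask64, mask64 + 1);
		return 0;
	}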
struct bpf_array
< include/linux/bpf.h >
struct bpf_array {
	struct bpf_map map;
	u32 elem_size;
	u32 index_mask;
	struct bpf_array_aux *aux;
	union { /* storage for the map elements */
		char value[0] __aligned(8);
		void *ptrs[0] __aligned(8);
		void __percpu *pptrs[0] __aligned(8);
	};
};
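For reference, this layout is exactly what the array map's lookup path indexes into: elements live in the flexible value[] area right after the header, addressed by elem_size, with index_mask applied as a Spectre-v1 mitigation (quoted from the same kernel tree, modulo minor version drift):

< kernel/bpf/arraymap.c >
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + (u64)array->elem_size * (index & array->index_mask);
}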
bpf_map_new_fd()
< kernel/bpf/syscall.c >
int bpf_map_new_fd(struct bpf_map *map, int flags)
{
	int ret;

	ret = security_bpf_map(map, OPEN_FMODE(flags));
	if (ret < 0)
		return ret;

	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
				flags | O_CLOEXEC); /* this is where the fd comes from; recall map_fd = bpf_object__find_map_fd_by_name(obj, "my_map") on the user-space side */
}
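anon_inode_getfd() wraps the map in an anonymous-inode file backed by bpf_map_fops, so the map behaves like an ordinary file object: its lifetime follows the fd's reference count, close() releases the reference, and O_CLOEXEC keeps the fd from leaking across exec().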
After this chain of calls, user space can interact with the kernel map object through the returned fd.
Summary
The map backing the perf event buffer is created as follows:
1) Parse the SEC("maps") and SEC(".maps") sections of trace_output_kern.o and extract each declared map definition to use as arguments for the subsequent system call.
2) Call syscall(__NR_bpf, BPF_MAP_CREATE, attr, attr_size) to create the map object in the kernel, which is handed back to user space as an fd.