/*
 * Copyright (c) 2015 PLUMgrid, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <inttypes.h>
#include <poll.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/perf_event.h>

#include "libbpf.h"
#include "perf_reader.h"

enum {
  RB_NOT_USED = 0,        // ring buffer not used
  RB_USED_IN_MUNMAP = 1,  // used in munmap
  RB_USED_IN_READ = 2,    // used in read
};

struct perf_reader {
  perf_reader_raw_cb raw_cb;
  perf_reader_lost_cb lost_cb;
  void *cb_cookie;    // to be returned in the cb
  void *buf;          // for keeping segmented data
  size_t buf_size;    // current allocation size of buf
  void *base;         // mmap'ed metadata page + ring buffer
  int rb_use_state;   // RB_* state; guards the ring against concurrent munmap
  pid_t rb_read_tid;  // tid of the thread currently reading the ring
  int page_size;
  int page_cnt;       // number of data pages (excludes the metadata page)
  int fd;             // perf event fd
};

struct perf_reader *perf_reader_new(perf_reader_raw_cb raw_cb,
                                    perf_reader_lost_cb lost_cb,
                                    void *cb_cookie, int page_cnt) {
  struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
  if (!reader)
    return NULL;
  reader->raw_cb = raw_cb;
  reader->lost_cb = lost_cb;
  reader->cb_cookie = cb_cookie;
  reader->fd = -1;
  reader->page_size = getpagesize();
  reader->page_cnt = page_cnt;
  return reader;
}
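
/*
 * Typical lifecycle (a minimal sketch using only the functions in this file;
 * it assumes the caller has already opened a perf event fd via the
 * perf_event_open(2) syscall, configured with PERF_SAMPLE_RAW -- that setup
 * is not shown here):
 *
 *   struct perf_reader *r = perf_reader_new(my_raw_cb, my_lost_cb, ctx, 8);
 *   perf_reader_set_fd(r, perf_fd);   // fd from perf_event_open()
 *   if (perf_reader_mmap(r) == 0)
 *     perf_reader_poll(1, &r, -1);    // block until events, dispatch cbs
 *   perf_reader_free(r);
 *
 * my_raw_cb, my_lost_cb, ctx and perf_fd are hypothetical caller-supplied
 * names.
 */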

void perf_reader_free(void *ptr) {
  if (ptr) {
    struct perf_reader *reader = ptr;
    pid_t tid = syscall(__NR_gettid);
    while (!__sync_bool_compare_and_swap(&reader->rb_use_state, RB_NOT_USED, RB_USED_IN_MUNMAP)) {
      // Same thread: we were called from inside the callback handler, so the
      // ring is already held by us and no locking is needed
      if (tid == reader->rb_read_tid)
        break;
    }
    if (reader->base)
      munmap(reader->base, reader->page_size * (reader->page_cnt + 1));
    if (reader->fd >= 0) {
      ioctl(reader->fd, PERF_EVENT_IOC_DISABLE, 0);
      close(reader->fd);
    }
    free(reader->buf);
    free(ptr);
  }
}

int perf_reader_mmap(struct perf_reader *reader) {
  // one metadata page plus page_cnt data pages
  int mmap_size = reader->page_size * (reader->page_cnt + 1);

  if (reader->fd < 0) {
    fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
    return -1;
  }

  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader->fd, 0);
  if (reader->base == MAP_FAILED) {
    perror("mmap");
    return -1;
  }

  return 0;
}

struct perf_sample_trace_common {
  uint16_t id;
  uint8_t flags;
  uint8_t preempt_count;
  int pid;
};

struct perf_sample_trace_kprobe {
  struct perf_sample_trace_common common;
  uint64_t ip;
};
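
/*
 * parse_sw() below expects the body of a PERF_RECORD_SAMPLE whose only
 * enabled sample field is PERF_SAMPLE_RAW, i.e. (a sketch, following the
 * layout documented in linux/perf_event.h):
 *
 *   struct perf_event_header header;
 *   u32  size;
 *   char data[size];
 */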

static void parse_sw(struct perf_reader *reader, void *data, int size) {
  uint8_t *ptr = data;
  struct perf_event_header *header = (void *)data;

  struct {
    uint32_t size;
    char data[0];
  } *raw = NULL;

  ptr += sizeof(*header);
  if (ptr > (uint8_t *)data + size) {
    fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
    return;
  }

  raw = (void *)ptr;
  ptr += sizeof(raw->size) + raw->size;
  if (ptr > (uint8_t *)data + size) {
    fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
    return;
  }

  // sanity check
  if (ptr != (uint8_t *)data + size) {
    fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
    return;
  }

  if (reader->raw_cb)
    reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
}

static uint64_t read_data_head(volatile struct perf_event_mmap_page *perf_header) {
  uint64_t data_head = perf_header->data_head;
  asm volatile("" ::: "memory");
  return data_head;
}

static void write_data_tail(volatile struct perf_event_mmap_page *perf_header, uint64_t data_tail) {
  asm volatile("" ::: "memory");
  perf_header->data_tail = data_tail;
}
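
/*
 * data_head is advanced by the kernel (the producer); data_tail is advanced
 * here (the consumer). The compiler barriers above keep the ring-data loads
 * from being reordered across the data_head load, and the data_tail store
 * from being reordered before those loads. Note this relies on x86-style
 * strong hardware memory ordering; on weakly ordered architectures real
 * barriers (e.g. __sync_synchronize()) would be needed.
 */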

void perf_reader_event_read(struct perf_reader *reader) {
  volatile struct perf_event_mmap_page *perf_header = reader->base;
  uint64_t buffer_size = (uint64_t)reader->page_size * reader->page_cnt;
  uint64_t data_head;
  uint8_t *base = (uint8_t *)reader->base + reader->page_size;
  uint8_t *sentinel = (uint8_t *)reader->base + buffer_size + reader->page_size;
  uint8_t *begin, *end;

  reader->rb_read_tid = syscall(__NR_gettid);
  if (!__sync_bool_compare_and_swap(&reader->rb_use_state, RB_NOT_USED, RB_USED_IN_READ))
    return;

  // Consume all the events on this ring, calling the cb function for each one.
  // The message may fall on the ring boundary, in which case copy the message
  // into a malloced buffer.
  for (data_head = read_data_head(perf_header); perf_header->data_tail != data_head;
       data_head = read_data_head(perf_header)) {
    uint64_t data_tail = perf_header->data_tail;
    uint8_t *ptr;

    begin = base + data_tail % buffer_size;
    // event header is u64, won't wrap
    struct perf_event_header *e = (void *)begin;
    ptr = begin;
    end = base + (data_tail + e->size) % buffer_size;
    if (end < begin) {
      // perf event wraps around the ring, make a contiguous copy
      if (reader->buf_size < e->size) {
        void *tmp = realloc(reader->buf, e->size);
        if (!tmp)
          break; // out of memory; leave the event for a later read
        reader->buf = tmp;
        reader->buf_size = e->size;
      }
      size_t len = sentinel - begin;
      memcpy(reader->buf, begin, len);
      memcpy((uint8_t *)reader->buf + len, base, e->size - len);
      ptr = reader->buf;
    }

    if (e->type == PERF_RECORD_LOST) {
      /*
       * struct {
       *   struct perf_event_header header;
       *   u64 id;
       *   u64 lost;
       *   struct sample_id sample_id;
       * };
       */
      uint64_t lost = *(uint64_t *)(ptr + sizeof(*e) + sizeof(uint64_t));
      if (reader->lost_cb) {
        reader->lost_cb(reader->cb_cookie, lost);
      } else {
        fprintf(stderr, "Possibly lost %" PRIu64 " samples\n", lost);
      }
    } else if (e->type == PERF_RECORD_SAMPLE) {
      parse_sw(reader, ptr, e->size);
    } else {
      fprintf(stderr, "%s: unknown event type %d\n", __FUNCTION__, e->type);
    }

    write_data_tail(perf_header, perf_header->data_tail + e->size);
  }
  reader->rb_use_state = RB_NOT_USED;
  __sync_synchronize();
  reader->rb_read_tid = 0;
}
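
/*
 * Wraparound sketch: with a 4096-byte page size and page_cnt == 8 the data
 * area holds 32768 bytes. If data_tail % 32768 == 32760 and the incoming
 * event header reports size == 16, then begin sits 8 bytes before sentinel
 * and end wraps to base + 8, so the copy above stitches the event back
 * together in reader->buf (8 bytes from the tail of the ring, 8 from the
 * head).
 */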

int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout) {
  struct pollfd pfds[num_readers];
  int i;

  for (i = 0; i < num_readers; ++i) {
    pfds[i].fd = readers[i]->fd;
    pfds[i].events = POLLIN;
  }

  if (poll(pfds, num_readers, timeout) > 0) {
    for (i = 0; i < num_readers; ++i) {
      if (pfds[i].revents & POLLIN)
        perf_reader_event_read(readers[i]);
    }
  }
  return 0;
}

void perf_reader_set_fd(struct perf_reader *reader, int fd) {
  reader->fd = fd;
}

int perf_reader_fd(struct perf_reader *reader) {
  return reader->fd;
}