iovisor / bcc Goto Github PK

View Code? Open in Web Editor NEW

19.9K 19.9K 3.8K 20.13 MB

BCC - Tools for BPF-based Linux IO analysis, networking, monitoring, and more

License: Apache License 2.0

CMake 0.28% Python 7.26% C 86.40% C++ 4.48% HTML 0.01% Shell 0.10% Lua 1.43% Makefile 0.03%

bcc's Introduction

PLUMgrid iovisor source code

net/ - kernel module
compat/ - compatibility code for older kernels
bld/ - .ko build

Distributed bridge demo using BPF programs

test_l2ls.sh - demonstrates L2 learning switch functionality defining a Topology with two Linux namespaces connected by a switch.
test_l2ls_tun.sh - demonstrates distributed L2 learning switch functionality defining a Topology with two switches connected through Linux bridge and kernel vxlan/gre drivers. note: requires kernel 3.13+
l2ls.c - C code for BPF bridge
l2ls_bpf.h - compiled BPF code
tunnel_port.c - C code for BPF tunnel
tunnel_port_bpf.h - compiled BPF code

bcc's People

Stargazers

Watchers

Forkers

hgn tuxology nkwilson fortitudepub presto53 brendangregg affansyed rlane mbudiu-bfn arslanabbasi bpowers ashhadsheikh mraqkhan ceeaspb mcanthony somalden grandkarabas benroeder yannayl costingalan antmd lcp alagalah princetonuniversity swj753357 c6226 hans-liu noscripter xushengping joelagnel sjas wenhuizhang giovanniarmano tempbottle goldshtn zaheersm jshook gjasny woolenwang aebm yadutaf billyom goryszewskig mhiramat lpefferkorn zaafar chen0031 dongxingshui vmg att-innovate fuyou001 mulianov vishnudxb chubbymaggie aoj thtanaka danburkert shyamalschandra ruoshan markmuir87 saltsa tpxtask linearregression geassdb allengaller zhangyangisme ragnard zyh329 jwarni talsonthomas kartikeyap zined lucacanali larsx2 shuxiang1990 outmanzhaohu patrick-park ruissalo bobrik arunkgupta miauwuffmiau emaxerrno sjanulonoks wujcheng cloudxtreme sublimino lnussbaum shenlianghao jameskyle rnav abirchall unws agaurav mulengakatebe mqasimsarfraz valkum gila harpsichord mujtabaahmed1991 iamkafai

bcc's Issues

All BPF.Tables inherit from MutableMapping...instead some should be array-like

Currently all BPF.Table behave like a python dict. However, array and prog_array should behave more like list or array.array.

Power-of-2 histograms should be a common use case for analyzing distributions. Here is how they can be populated in C (code largely from https://github.com/torvalds/linux/blob/master/samples/bpf/tracex2_kern.c):

BPF_TABLE("array", int, u64, dist, 64);

/*
 * Integer base-2 logarithm, rounded down, of a 32-bit value.
 *
 * Works as a branch-free binary search for the highest set bit: each step
 * checks whether anything is set above the lower half of the current window,
 * shifts that half away if so, and records the shift amount in the result.
 * Both 0 and 1 yield 0.
 */
static unsigned int log2(unsigned int v)
{
    unsigned int bits = 0;
    unsigned int step;

    /* Anything above the low 16 bits? */
    step = (v > 0xFFFF) << 4;
    v >>= step;
    bits |= step;

    /* Above the low 8 bits of what is left? */
    step = (v > 0xFF) << 3;
    v >>= step;
    bits |= step;

    /* Above the low 4 bits? */
    step = (v > 0xF) << 2;
    v >>= step;
    bits |= step;

    /* Above the low 2 bits? */
    step = (v > 0x3) << 1;
    v >>= step;
    bits |= step;

    /* v is now in 0..3; its upper bit is the last bit of the result. */
    bits |= (v >> 1);
    return bits;
}

/*
 * Histogram slot for a (up to) 64-bit value: log2() of the value plus one,
 * so 0 and 1 land in slot 1, 2..3 in slot 2, 4..7 in slot 3, and so on.
 *
 * The value is split into 32-bit halves because log2() only handles
 * unsigned int. If the upper half is non-zero it alone decides the slot.
 */
static unsigned int log2l(unsigned long v)
{
    /*
     * Widen before shifting: `v >> 32` is undefined behaviour on targets
     * where unsigned long is only 32 bits wide.
     */
    unsigned int hi = (unsigned int)((unsigned long long)v >> 32);

    if (hi)
        return log2(hi) + 32 + 1;

    /* Upper half is zero, so truncating to 32 bits loses nothing. */
    return log2((unsigned int)v) + 1;
}

[...]
    int index = log2l(req->__data_len / 1024);
    u64 *leaf = dist.lookup(&index);
    if (leaf) (*leaf)++;

Adding a BPF_LOG2_IDX macro for the above two C functions would be a big improvement, eg:

BPF_TABLE("array", int, u64, dist, 64);
[...]
    int index = BPF_LOG2_IDX(req->__data_len / 1024);
    u64 *leaf = dist.lookup(&index);
    if (leaf) (*leaf)++;

Even better would be a BPF_HIST macro, and a BPF helper:

BPF_HIST(dist, log2);
[...]
    dist.increment(req->__data_len / 1024);

non-power-of-2 value size not handled correctly

We should review how to handle non-power-of-2 key/value size.

At this point, I found that the compiler did not generate correct code to retrieve ARP packet values:

I added ARP support to proto.b:

state ethernet {
switch $ethernet.type {
case 0x0800 {
next proto::ip;
};
case 0x0806 {
next proto::arp;
};
case 0x8100 {
next proto::dot1q;
};
case * {
goto EOP;
};
}
}

struct arp {
u8 htype:16;
u8 ptype:16;
u32 hlen:8;
u32 plen:8;
u32 oper:16;
u64 sha:48;
u32 spa:32;
u64 tha:48;
u32 tpa:32;
};

state arp {
goto EOP;
}

For arp packet, in bpfdev1.b file, I have the following code to process arp packet:

state proto::arp {
if (skb.pkt_type) {
if $arp.oper == 1 {
struct MacaddrKey mac_key = {.ip = $arp.spa};
struct MacaddrLeaf mac_leaf = {.mac = $arp.sha};
log("sha\n", $arp.spa);
log("sha\n", $arp.sha);
log("sha\n", $arp.tpa);
log("sha\n", $arp.tha);
u64 sha:64 = $arp.sha;
log("sha\n", sha);
macaddr_map.update(mac_key, mac_leaf);
}
goto EOP;
}
}

For a particular arp packet, it prints out the following values:
[ 1523.499639] 0 1 8 0 6 4 0 1
[ 1523.499641] f2 8d dd 96 0 8f 14 1
[ 1523.499643] 1 3 0 0 0 0 0 0
[ 1523.499645] 14 0 0 4
[ 1523.499649] bpfdev_printk: 14010103
[ 1523.499651] bpfdev_printk: 61dddf280f00
[ 1523.499652] bpfdev_printk: 14000004
[ 1523.499654] bpfdev_printk: 1400000000
[ 1523.499656] bpfdev_printk: 61dddf280f00

ip addresses are correct, but mac addresses are not correct.

Hacked log support in the linux kernel as below.

diff --git a/net/core/filter.c b/net/core/filter.c
index ee6fe82..4a2218a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1201,6 +1201,14 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
char buf[16];
void *ptr;

  unsigned char *data = (unsigned char *)from;

  printk("%s: %s: %d, offset = %d, len = %u\n", __FILE__, __FUNCTION__, __LINE__, offset, len);

  if (len == 4) printk("%x %x %x %x\n", data[0], data[1], data[2], data[3]);

  if (len == 6) printk("%x %x %x %x %x %x\n", data[0], data[1], data[2], data[3], data[4], data[5]);

```
  if (len == 8) printk("%x %x %x %x %x %x %x %x\n", data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
```
+}
+
/* bpf verifier guarantees that:
* 'from' pointer points to bpf program stack
* 'len' bytes of it were initialized
@@ -1333,6 +1341,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}

+static u64 bpfdev_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{

  printk("%s: %llx\n", __FUNCTION__, r3);

```
  return 0;
```
+}
+
const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.func = bpf_l4_csum_replace,
.gpl_only = false,
@@ -1344,6 +1358,14 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};

+const struct bpf_func_proto bpf_trace_printk_proto = {

```
  .func           = bpfdev_printk,
```
```
  .gpl_only       = false,
```
```
  .ret_type       = RET_INTEGER,
```
```
  .arg1_type      = ARG_PTR_TO_STACK,
```
```
  .arg2_type      = ARG_CONST_STACK_SIZE,
```
+};
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -1388,6 +1410,8 @@ bpfdev_func_proto(enum bpf_func_id func_id)
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
return &bpf_l4_csum_replace_proto;
```
  case BPF_FUNC_trace_printk:
```

          return &bpf_trace_printk_proto;
default:
        return sk_filter_func_proto(func_id);
}

plumgrid@yhs-plumgrid:~/misc/net-next$

kprobe events need to be deleted on exit

after running tests:
$ cat kprobe_events
p:kprobes/p_sys_write sys_write
p:kprobes/p_sys_read sys_read
p:kprobes/p_htab_map_get_next_key htab_map_get_next_key
p:kprobes/p_schedule_50 schedule+50
p:kprobes/p_blk_start_request blk_start_request
p:kprobes/p_blk_update_request blk_update_request
p:kprobes/p_sys_clone sys_clone

both exit and Ctrl-C need to delete kprobe events

Add support for sharing of "extern" tables between source files

Users may want to write an application in multiple components, with related code grouped into different source files. The modules will communicate within the set of co-existing functions using one set of maps. Between different modules, however, those local tables will not be visible, but the modules may still need to communicate global state.

Add support for something like an "extern" reference to a table, which can be seen across multiple BPFModules, and these modules may load and unload over time, but the table should only be initialized once.

improve memory access errors

still working on #209:

Here's an example of the confusion. This is attached to vfs_read():

int trace_entry(struct pt_regs *ctx, struct file *file)
{
        struct dentry *dent;
[...]
                dent = file->f_path.dentry;
                bpf_trace_printk("read %s\\n", dent->d_iname);

That works fine.

If I change the print line to this:

                bpf_trace_printk("read flags %x %s\\n", dent->d_flags, dent->d_iname);

It errors:

bpf: Permission denied
0: (79) r6 = *(u64 *)(r1 +112)
1: (85) call 14
2: (63) *(u32 *)(r10 -20) = r0
3: (15) if r6 == 0x0 goto pc+53
 R0=inv R6=inv R10=fp
4: (b7) r7 = 0
5: (7b) *(u64 *)(r10 -32) = r7
6: (7b) *(u64 *)(r10 -40) = r7
7: (07) r6 += 16
8: (bf) r1 = r10
9: (07) r1 += -40
10: (b7) r2 = 16
11: (bf) r3 = r6
12: (85) call 4
13: (79) r1 = *(u64 *)(r10 -32)
14: (15) if r1 == 0x0 goto pc+42
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
15: (7b) *(u64 *)(r10 -48) = r7
16: (7b) *(u64 *)(r10 -56) = r7
17: (bf) r1 = r10
18: (07) r1 += -56
19: (b7) r2 = 16
20: (bf) r3 = r6
21: (85) call 4
22: (79) r1 = *(u64 *)(r10 -56)
23: (15) if r1 == 0x0 goto pc+33
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
24: (b7) r1 = 0
25: (7b) *(u64 *)(r10 -64) = r1
26: (7b) *(u64 *)(r10 -72) = r1
27: (bf) r1 = r10
28: (07) r1 += -72
29: (b7) r2 = 16
30: (bf) r3 = r6
31: (85) call 4
32: (79) r1 = *(u64 *)(r10 -64)
33: (7b) *(u64 *)(r10 -8) = r1
34: (18) r1 = 0x95457120
36: (bf) r2 = r10
37: (07) r2 += -20
38: (bf) r3 = r10
39: (07) r3 += -8
40: (b7) r4 = 0
41: (85) call 2
42: (b7) r1 = 10
43: (6b) *(u16 *)(r10 -80) = r1
44: (18) r1 = 0x25207367
46: (7b) *(u64 *)(r10 -88) = r1
47: (18) r1 = 0x64616572
49: (7b) *(u64 *)(r10 -96) = r1
50: (79) r4 = *(u64 *)(r10 -8)
51: (61) r3 = *(u32 *)(r4 +0)
R4 invalid mem access 'inv'

Traceback (most recent call last):
  File "./vfsslower", line 83, in <module>
    b.attach_kprobe(event="vfs_read", fn_name="trace_entry")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 498, in attach_kprobe
    fn = self.load_func(fn_name, BPF.KPROBE)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 388, in load_func
    raise Exception("Failed to load BPF program %s" % func_name)
Exception: Failed to load BPF program trace_entry

This doesn't make much sense: I can access a string member, but not an int member.

Changing it to the following:

                uint d_flags;
                bpf_probe_read(&d_flags, sizeof(d_flags), &dent->d_flags);
                bpf_trace_printk("read flags %x %s\\n", d_flags, dent->d_iname);

Fails:

bpf: Permission denied
0: (79) r6 = *(u64 *)(r1 +112)
1: (85) call 14
2: (63) *(u32 *)(r10 -20) = r0
3: (15) if r6 == 0x0 goto pc+57
 R0=inv R6=inv R10=fp
4: (b7) r7 = 0
5: (7b) *(u64 *)(r10 -32) = r7
6: (7b) *(u64 *)(r10 -40) = r7
7: (07) r6 += 16
8: (bf) r1 = r10
9: (07) r1 += -40
10: (b7) r2 = 16
11: (bf) r3 = r6
12: (85) call 4
13: (79) r1 = *(u64 *)(r10 -32)
14: (15) if r1 == 0x0 goto pc+46
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
15: (7b) *(u64 *)(r10 -48) = r7
16: (7b) *(u64 *)(r10 -56) = r7
17: (bf) r1 = r10
18: (07) r1 += -56
19: (b7) r2 = 16
20: (bf) r3 = r6
21: (85) call 4
22: (79) r1 = *(u64 *)(r10 -56)
23: (15) if r1 == 0x0 goto pc+37
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
24: (b7) r1 = 0
25: (7b) *(u64 *)(r10 -64) = r1
26: (7b) *(u64 *)(r10 -72) = r1
27: (bf) r1 = r10
28: (07) r1 += -72
29: (b7) r2 = 16
30: (bf) r3 = r6
31: (85) call 4
32: (79) r3 = *(u64 *)(r10 -64)
33: (7b) *(u64 *)(r10 -8) = r3
34: (bf) r1 = r10
35: (07) r1 += -76
36: (b7) r2 = 4
37: (85) call 4
invalid indirect read from stack off -76+0 size 4

Traceback (most recent call last):
  File "./vfsslower", line 83, in <module>
    b.attach_kprobe(event="vfs_read", fn_name="trace_entry")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 498, in attach_kprobe
    fn = self.load_func(fn_name, BPF.KPROBE)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 388, in load_func
    raise Exception("Failed to load BPF program %s" % func_name)
Exception: Failed to load BPF program trace_entry

But if I set d_flags to zero:

                uint d_flags = 0;
                bpf_probe_read(&d_flags, sizeof(d_flags), &dent->d_flags);
                bpf_trace_printk("read flags %x %s\\n", d_flags, dent->d_iname);

It works. But I'm left pretty confused.

In lieu of improving the errors, maybe a good example script with comments explaining each step would help.

variable has incomplete type Error

I am using Ubuntu 14.04 with kernel 4.2.0-999-generic (build date 03/Sep/2015) as a VM on VirtualBox, and when I try to execute the distributed-bridge code using the root account I receive the following error. The weird part about this error is that if I reboot the machine and execute the distributed-bridge code, it works fine on the first attempt only; on all subsequent executions it gives the following error.

/usr/share/bcc/examples/distributed_bridge/tunnel.c:31:25: error: variable has incomplete type 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:31:10: note: forward declaration of 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:32:3: warning: implicit declaration of function 'bpf_skb_get_tunnel_key' is invalid in C99 [-Wimplicit-function-declaration]
bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0);
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:61:25: error: variable has incomplete type 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:61:10: note: forward declaration of 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:66:5: warning: implicit declaration of function 'bpf_skb_set_tunnel_key' is invalid in C99 [-Wimplicit-function-declaration]
bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:69:27: error: variable has incomplete type 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
/usr/share/bcc/examples/distributed_bridge/tunnel.c:61:10: note: forward declaration of 'struct bpf_tunnel_key'
struct bpf_tunnel_key tkey = {};
^
2 warnings and 3 errors generated.
Traceback (most recent call last):
File "tunnel.py", line 18, in <module>
b = BPF(src_file="tunnel.c")
File "/usr/lib/python3/dist-packages/bcc/__init__.py", line 298, in __init__
raise Exception("Failed to compile BPF module %s" % src_file)
Exception: Failed to compile BPF module tunnel.c
Validating connectivity

Also, where is the "struct bpf_tunnel_key" defined?

Zaafar

/usr/share/bcc/examples not in Trusty binaries from INSTALL.md

I followed the instructions for pulling the Ubuntu binaries after installing the same 4.2 version listed.

There are no examples in /usr/share/bcc/.

Also the examples under bcc git repo fail with BPF not found.

When I followed the "Ubuntu build from source" examples everything worked fine (helloworld.py /taskswitcher.py).

I'd consider this really low priority and only logging it so I can contribute to fixing it when I get time :)

pkt rewrite not working

in test2, pkt.rewrite_field is used to rewrite the fields:

state proto::ip {
orig_dip = $ip.dst;
orig_sip = $ip.src;
struct IPKey key = {.dip=orig_dip, .sip=orig_sip};
xlate.lookup(key, xleaf) {};
on_valid(xleaf) {
incr_cksum(@ip.hchecksum, orig_dip, xleaf.xdip);
incr_cksum(@ip.hchecksum, orig_sip, xleaf.xsip);
pkt.rewrite_field($ip.dst, xleaf.xdip);
pkt.rewrite_field($ip.src, xleaf.xsip);
atomic_add(xleaf.xlated_pkts, 1);
}

But actually pkt.rewrite_field is not implemented in bcc.

If you add the following change to bcc,
plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$ git diff
diff --git a/src/cc/codegen_llvm.cc b/src/cc/codegen_llvm.cc
index 6ba9f23..6c7ce28 100644
--- a/src/cc/codegen_llvm.cc
+++ b/src/cc/codegen_llvm.cc
@@ -741,6 +741,8 @@ StatusTuple CodegenLLVM::visit_method_call_expr_node(MethodCallExprNode *n) {
TRY2(emit_table_update(n));
} else if (n->id_->sub_name_ == "delete") {
TRY2(emit_table_delete(n));

} else {
```
 return mkstatus_(n, "unsupported");
```
}
} else if (n->id_->name_ == "atomic_add") {
TRY2(emit_atomic_add(n));

The compilation will fail. Basically, for pkt.rewrite_field, the compiler assumes "pkt" is a map name.
This is not correct.
In codegen_llvm.cc,

StatusTuple CodegenLLVM::visit_packet_expr_node(PacketExprNode *n) {
.....
}

it looks like if PacketExprNode ($ip.dst) is the lvalue of assignment expression,
packet rewrite will be translated. Currently,

assign_expr
: dotted_ident TEQUAL expr
{ $$ = new AssignExprNode(IdentExprNode::Ptr($1), ExprNode::Ptr($3));
parser.set_loc($$, @$); }
| dotted_ident bitop TEQUAL expr
{ $$ = new AssignExprNode(IdentExprNode::Ptr($1), ExprNode::Ptr($4)); $$->bitop_ = BitopExprNode::Ptr($2);
parser.set_loc($$, @$); }
;

It does not support assign lvalue as a packet expr node at this point.

In codegen_llvm.h, there are a lot of unimplemented functions, such as
STATUS_RETURN emit_packet_push_header(MethodCallExprNode* n);
STATUS_RETURN emit_packet_pop_header(MethodCallExprNode* n);
STATUS_RETURN emit_packet_push_vlan(MethodCallExprNode* n);
STATUS_RETURN emit_packet_pop_vlan(MethodCallExprNode* n);
STATUS_RETURN emit_packet_rewrite_field(MethodCallExprNode* n);
.....

Assign to Brenden who can make proper decision about how to support pkt rewrite.

trace_fields() should quit on KeyboardInterrupt

... like trace_readline(). Test program:

vfs_read.py:

#!/usr/bin/python

from bcc import BPF

# load BPF program
b = BPF(src_file="vfs_read.c")
b.attach_kprobe(event="vfs_read", fn_name="run_read")

# format output
while 1:
    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
    print("got: %s" % msg)

vfs_read.c:

# cat vfs_read.c
#include <linux/fs.h>

/*
 * Probe body attached to vfs_read() by the accompanying Python script.
 *
 * Prints the short name (d_iname) of the file being read via
 * bpf_trace_printk(), or "was null" when either the file pointer or its
 * dentry is NULL.
 */
void run_read(struct pt_regs *ctx, struct file *file)
{
    /* Check both pointers before following file->f_path.dentry. */
    if (file != NULL && file->f_path.dentry != NULL) {
        bpf_trace_printk("read %s\n", file->f_path.dentry->d_iname);
    } else {
        bpf_trace_printk("was null");
    }
}

output:

# ./vfs_read.py
[...]
got: read ptmx
got: read ptmx
^CTraceback (most recent call last):
  File "./vfs_read.py", line 11, in <module>
    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 512, in trace_fields
    return (task, int(pid), int(cpu), flags, float(ts), msg)
KeyboardInterrupt

this sometimes happens instead:

# ./vfs_read.py
[...]
got: read ptmx
got: read trace_pipe
got: read ptmx
^CTraceback (most recent call last):
  File "./vfs_read.py", line 12, in <module>
    print("got: %s" % msg)
KeyboardInterrupt
close failed in file object destructor:
sys.excepthook is missing
lost sys.stderr

Automatically pull in maps

Instead of pulling in maps (Python):

stats = b.get_table("stats", c_int, c_ulonglong)

The BPF init function should pull these in to the python namespace by default, unless instructed not to (eg, a flag: load_maps = false).

At times this might pull in an unnecessary map, but the value of having them automatically loaded should outweigh that. (If this did become a nuisance, which I doubt, perhaps there could be a BPF_PRIVATE_TABLE() macro in C, for non-auto-loaded maps.)

Drop hello_world return 0

From examples/hello_world.py:

prog = """
int hello(void *ctx) {
  bpf_trace_printk("Hello, World!\\n");
  return 0;
};

Is the int ... return 0 necessary? I don't see how it's handled. If it isn't necessary, it would be good to drop it from the hello world example, and make this void.

Add macro for BPF_COUNTER

Instead of this code (C):

enum stat_types {
    S_COUNT = 1,
    S_MAXSTAT
};

BPF_TABLE("array", int, u64, stats, S_MAXSTAT + 1);

This could be just:

BPF_COUNTER(count);

Where the name is "count".

There could then be a macro for incrementing BPF_COUNTERs. Eg (two examples):

BPF_INC(count);
BPF_INC(sum, value);

And/or, an increment BPF helper:

count.increment(1);
sum.increment(value);

Zero maps from Python

When using maps to store per-interval sums, it would be handy if we could clear the maps from Python, otherwise it's necessary to store the prior values and then calculate the delta each interval. Of course, Python could pretend to clear the maps and keep its own tally of what to expose.

Install failure on Ubuntu 15.04 with 4.2.0-rc4

I tried to follow INSTALL.md with a new Ubuntu 15.04 VM with kernel 4.2.0-rc4. I could not complete it because cmake was finding LLVM in the wrong path. I could verify this from the following output:

Found LLVM: /usr/include

Output of ${LLVM_LIBRARY_DIRS} was /usr/lib as well, whereas it should be /usr/lib/llvm-3.7/lib.

I proceeded to make my own install scripts for llvm and bcc setup based on your kickstart file. This resulted in another issue, #100. My lab folks were really interested in bcc. The fix for #100 did get it to work, and I demoed it.

hypothetical syntax

Is it possible to support the following .py?

#!/usr/bin/env python

from ctypes import c_uint, c_ulong, Structure
from bpf import BPF
from time import sleep
import sys

s = """
#include <linux/ptrace.h>
#include "../../src/cc/bpf_helpers.h"
struct Ptr { u64 ptr; };
struct Counters { u64 stat1; };

BPF_TABLE("hash", struct Ptr, struct Counters, stats, 1024);

int count_sched(struct pt_regs *ctx) {
  struct Ptr key = {.ptr=ctx->bx};
  struct Counters zleaf = {0};
  stats.upsert(&key, &zleaf)->stat1++;
  return 0;
}

int count_sched2(struct pt_regs *ctx) {
  return 0;
}
"""

class Ptr(Structure):
    _fields_ = [("ptr", c_ulong)]
class Counters(Structure):
    _fields_ = [("stat1", c_ulong)]

blob = BPF(s)

blob["count_sched"].load(BPF.BPF_PROG_TYPE_KPROBE)
blob["count_sched2"].load(BPF.BPF_PROG_TYPE_KPROBE)

stats[2] = blob["count_sched"].fd
stats[3] = blob["count_sched2"].fd

stats = blob.table("stats", Ptr, Counters)
attach_kprobe(blob["count_sched"].fd, "schedule", 0, -1)

for i in range(0, 100):
    sleep(0.01)
for key in stats.iter():
    leaf = stats.get(key)
    print("ptr %x:" % key.ptr, "stat1 %x" % leaf.stat1)

Print log2 histograms from Python

There should be a print_log2_hist() function. Example tool output:

# ./bitehist.py
Tracing... Hit Ctrl-C to end.

     kbytes          : count     distribution
       0 -> 1        : 0        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 26       |*************                         |
       8 -> 15       : 3        |*                                     |
      16 -> 31       : 5        |**                                    |
      32 -> 63       : 6        |***                                   |
      64 -> 127      : 7        |***                                   |
     128 -> 255      : 75       |**************************************|

The Python code used (feel free to take/improve):

last = {}
for i in range(1, dist_max + 1):
    last[i] = 0

# functions
stars_max = 38
def stars(val, val_max, width):
    """Render `val` as a bar of "*" scaled so that `val_max` fills `width` columns.

    The bar is never longer than `width` characters. When `val` exceeds
    `val_max`, the last character of the bar is replaced by "+" to flag
    the overflow.
    """
    # Highest column index the scaled value reaches.
    limit = (width * val / val_max) - 1
    bar = ""
    filled = 0
    # Grow one column at a time until the scaled value or the width is hit.
    while not (filled > limit or filled > width - 1):
        bar += "*"
        filled += 1
    if val > val_max:
        bar = bar[:-1] + "+"
    return bar

def print_log2_hist(d, val_type):
    """Print a power-of-2 histogram of the counts added to table `d` since the last call.

    d        -- table indexed by c_int(slot); each entry exposes a `.value` count
    val_type -- label printed in the header for the value column (e.g. "kbytes")

    Uses the module-level `dist_max`, `last`, `stars_max` and `stars()`.
    `last` holds the count seen for each slot on the previous call; it is
    updated for every slot printed, so each call shows only that interval's
    delta.
    """
    # Pass 1: find the highest slot with new counts and the largest delta,
    # which fixes the number of rows and the scale of the bars.
    idx_max = -1
    val_max = 0
    for i in range(1, dist_max + 1):
        try:
            val = d[c_int(i)].value - last[i]
        except Exception:
            # Slot not readable: stop scanning. KeyboardInterrupt is not an
            # Exception subclass, so Ctrl-C still propagates.
            break
        if val > 0:
            idx_max = i
        if val > val_max:
            val_max = val

    if idx_max > 0:
        print("     %-15s : count     distribution" % val_type)

    # Pass 2: one row per slot up to the highest active one.
    for i in range(1, idx_max + 1):
        # Slot i covers 2^(i-1) .. 2^i - 1; slot 1 is widened to 0 .. 1.
        low = (1 << i) >> 1
        high = (1 << i) - 1
        if low == high:
            low -= 1
        try:
            # Read the live counter once so the printed delta and the saved
            # baseline agree even if the count changes meanwhile.
            current = d[c_int(i)].value
        except Exception:
            break
        val = current - last[i]
        print("%8d -> %-8d : %-8d |%-*s|" % (low, high, val,
            stars_max, stars(val, val_max, stars_max)))
        last[i] = current

[...]
    print_log2_hist(dist, "kbytes")

The only hitch here is that I'm printing a per-interval histogram, and needed to subtract the previous map values, hence the use of the "last" dictionary.

If there was a way to clear a map (or pretend to; this is issue #142), then the "last" logic can be dropped, simplifying this a bit.

BPF.attach_kprobe() to accept glob or regexp

It may be handy to use globs or regexps when specifying kprobe function names. Eg (Python):

BPF.attach_kprobe(fn, "vfs_*")

... for globbing, or "vfs_.*" for regexp.

Handle strings for bpf_trace_printk()

Eg, attaching the following to do_execve/sys_execve:

int do_exec(struct pt_regs *ctx, struct filename *filename) {
    bpf_trace_printk("just executed: %s\n", filename->name);
[...]

%s isn't currently supported.

Use of invalid format specifiers in bpf_trace_printk should error

Doing something like bpf_trace_printk("%0.2f" ...) silently fails. The valid format specifiers are %d, %u, %p, and %x. The frontend should be able to easily scan for such format strings and raise an error, or at least a warning, to the user, as the kernel verifier will otherwise reject the printk and return -EINVAL.

LLVM ERROR: Cannot select

# ./vfsslower| head -50
LLVM ERROR: Cannot select: 0x29e97a0: ch,glue = BPFISD::CALL 0x1e16d00, 0x29c4f50, 0x29b9240, 0x2ac89b8, 0x29e9550, 0x1e16d00:1 [ORD=4] [ID=40]
  0x29c4f50: i64 = TargetExternalSymbol'memset' [ID=9]
  0x29b9240: i64 = Register %R1 [ID=6]
  0x2ac89b8: i64 = Register %R2 [ID=7]
  0x29e9550: i64 = Register %R3 [ID=8]
  0x1e16d00: ch,glue = CopyToReg 0x1e17078, 0x29e9550, 0x2ac8640, 0x1e17078:1 [ORD=4] [ID=39]
    0x29e9550: i64 = Register %R3 [ID=8]
    0x2ac8640: i64 = Constant<1216> [ID=3]
    0x1e17078: ch,glue = CopyToReg 0x29b95b8, 0x2ac89b8, 0x29baa58, 0x29b95b8:1 [ORD=4] [ID=38]
      0x2ac89b8: i64 = Register %R2 [ID=7]
      0x29baa58: i64 = Constant<0> [ID=4]
      0x29b95b8: ch,glue = CopyToReg 0x29c4988, 0x29b9240, 0x29bb020 [ORD=4] [ID=37]
        0x29b9240: i64 = Register %R1 [ID=6]
        0x29bb020: i64 = FrameIndex<5> [ID=2]
In function: trace_entry

Same program as #209, still trying to exclude non-storage-device I/O. Attaching this to vfs_read():

int trace_entry(struct pt_regs *ctx, struct file *file)
{
[...]
                struct super_block mnt_sb = {}, *mnt_sbp = file->f_path.mnt->mnt_sb;
                bpf_probe_read(&mnt_sb, sizeof(mnt_sb), &mnt_sbp);

... was hoping something out of the super_block could identify if this was a storage-backed FS or not.

incorrect pkt rewrite field

In my dp program, I have:

  pkt.rewrite_field($ethernet.dst, dst_mac);
  pkt.rewrite_field($ethernet.src, src_mac);

The compiler translates it into an 8-byte write. This causes packet corruption.

support bpf_trace_printk("format string", arg1, arg2, arg3) to make debug prints easier

rewriter can convert bpf_trace_printk("format string", arg1, arg2, arg3) into
char fmt[] = "format string";
bpf_trace_printk(fmt, sizeof(fmt), arg1, arg2, arg3)

NoneType object error in trace_fields()

Test workload:

$ dd if=/dev/zero of=/dev/null bs=1

vfs_read.py:

#!/usr/bin/python

from bcc import BPF

# load BPF program
b = BPF(src_file="vfs_read.c")
b.attach_kprobe(event="vfs_read", fn_name="run_read")

# format output
while 1:
    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
    print("got: %s" % msg)

vfs_read.c:

# cat vfs_read.c
#include <linux/fs.h>

/*
 * Probe body attached to vfs_read() by the accompanying Python script.
 *
 * Prints the short name (d_iname) of the file being read via
 * bpf_trace_printk(), or "was null" when either the file pointer or its
 * dentry is NULL.
 */
void run_read(struct pt_regs *ctx, struct file *file)
{
    /* Check both pointers before following file->f_path.dentry. */
    if (file != NULL && file->f_path.dentry != NULL) {
        bpf_trace_printk("read %s\n", file->f_path.dentry->d_iname);
    } else {
        bpf_trace_printk("was null");
    }
}

output:

# ./vfs_read.py
[...]
got: read ptmx
got: read ptmx
got: read ptmx
got: read ptmx
Traceback (most recent call last):
  File "./vfs_read.py", line 11, in <module>
    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
TypeError: 'NoneType' object is not iterable

Create a TC example that shows how to do complex classifications more simply

For example, try implementing Figure 12 from http://people.netfilter.org/pablo/netdev0.1/papers/Linux-Traffic-Control-Classifier-Action-Subsystem-Architecture.pdf

verifier failure with a particular dp program

For the following B program:

plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$ cat bpfdev1.b
#packed "false"

// hash
struct FwdKey {
  u32 dip:32;
};
struct FwdLeaf {
  u32 fwd_idx:32;
};
Table<FwdKey, FwdLeaf, FIXED_MATCH, NONE> fwd_map(1);

// array
struct ConfigKey {
  u32 index:32;
};
struct ConfigLeaf {
  u32 bpfdev_ip:32;
  u32 slave_ip:32;
};
Table<ConfigKey, ConfigLeaf, INDEXED, AUTO> config_map(1);

// hash
struct MacaddrKey {
  u32 ip:32;
};
struct MacaddrLeaf {
  u64 mac:48;
};
Table<MacaddrKey, MacaddrLeaf, FIXED_MATCH, AUTO> macaddr_map(11);

// hash
struct SlaveKey {
  u32 slave_ip:32;
};
struct SlaveLeaf {
  u32 slave_ifindex:32;
};
Table<SlaveKey, SlaveLeaf, FIXED_MATCH, NONE> slave_map(10);

u32 main(struct proto::skbuff *skb) {
  u32 ret:32;

  if skb.pkt_type == 0 {
    // tx
    // make sure configured
    u32 slave_ip:32;

    struct ConfigKey cfg_key = {.index = 0};
    struct ConfigLeaf *cfg_leaf;
    config_map.lookup(cfg_key, cfg_leaf) {};
    on_valid(cfg_leaf) {
      slave_ip = cfg_leaf->slave_ip;
    } else {
      return 0xffffffff;
    }

    // make sure slave configured
    // tx, default to the single slave
    struct SlaveKey slave_key = {.slave_ip = slave_ip};
    struct SlaveLeaf *slave_leaf;
    slave_map.lookup(slave_key, slave_leaf);
    on_valid(slave_leaf) {
      ret = slave_leaf->slave_ifindex;
    } else {
      return 0xffffffff;
    }
  } else {
    // rx, default to stack
    ret = 0;
  }

  goto proto::ethernet;

  state proto::ethernet {
  }

  state proto::dot1q {
  }

  state proto::arp {
    if (skb.pkt_type) {
      if $arp.oper == 1 {
        struct MacaddrKey mac_key = {.ip = $arp.spa};
        struct MacaddrLeaf mac_leaf = {.mac = $arp.sha};
        macaddr_map.update(mac_key, mac_leaf);
      }
      goto EOP;
    }
  }

  state proto::ip {
  }
  state proto::udp {
    if $udp.dport != 5000 {
       goto EOP;
    }
    if (skb.pkt_type) {
      // lookup and then forward
      struct FwdKey fwd_key = {.dip = $ip.dst};
      struct FwdLeaf *fwd_val;
      fwd_map.lookup(fwd_key, fwd_val) {};
      on_valid(fwd_val) {
         return fwd_val.fwd_idx;
      } else {
         goto EOP;
      }
    } else {
      // rewrite the packet and send to a pre-configured index if needed
      u32 new_ip:32;
      u32 old_ip:32;
      u64 src_mac:48;
      u64 dst_mac:48;

      struct ConfigKey cfg_key = {.index = 0};
      struct ConfigLeaf *cfg_leaf;
      config_map.lookup(cfg_key, cfg_leaf) {};
      on_valid(cfg_leaf) {
        struct MacaddrKey mac_key = {.ip = cfg_leaf->bpfdev_ip};
        struct MacaddrLeaf *mac_leaf;

        mac_key.ip = cfg_leaf->bpfdev_ip;
        macaddr_map.lookup(mac_key, mac_leaf) {};
        on_valid (mac_leaf) {
          src_mac = mac_leaf->mac;
        } else {
      goto EOP;
        }

        mac_key.ip = cfg_leaf->slave_ip;
        macaddr_map.lookup(mac_key, mac_leaf) {};
        on_valid (mac_leaf) {
          dst_mac = mac_leaf->mac;
        } else {
      goto EOP;
        }

        // rewrite ethernet header
        pkt.rewrite_field($ethernet.dst, dst_mac);
        pkt.rewrite_field($ethernet.src, src_mac);

        // ip & udp checksum
        incr_cksum(@ip.hchecksum, $ip.src, cfg_leaf->bpfdev_ip);
        incr_cksum(@ip.hchecksum, $ip.dst, cfg_leaf->slave_ip);
        incr_cksum(@udp.crc, $ip.src, cfg_leaf->bpfdev_ip, 1);
        incr_cksum(@udp.crc, $ip.dst, cfg_leaf->slave_ip, 1);

        // rewrite ip src/dst fields
        pkt.rewrite_field($ip.src, cfg_leaf->bpfdev_ip);
        pkt.rewrite_field($ip.dst, cfg_leaf->slave_ip);

        goto EOP;

      } else {
        goto EOP;
      }
    }
  }

  state EOP {
    return ret;
  }
}

plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$

The control plane app looks like:

plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$ cat bpfdev1.py

#!/usr/bin/env python

# test program for a simple bpfdev

import sys
import commands
from ctypes import c_uint, c_ulong, Structure
from netaddr import IPAddress, EUI
from bpf import BPF
from subprocess import check_call
from unittest import main, TestCase

# map structures
class FwdKey(Structure):
    _fields_ = [("dip", c_uint)]
class FwdLeaf(Structure):
    _fields_ = [("ifc_idx", c_uint)]

class ConfigKey(Structure):
    _fields_ = [("idx", c_uint)]
class ConfigLeaf(Structure):
    _fields_ = [("bpfdev_ip", c_uint),
                ("slave_ip", c_uint)]

class MacaddrKey(Structure):
    _fields_ = [("ip", c_uint)]
class MacaddrLeaf(Structure):
    _fields_ = [("mac", c_ulong)]

class SlaveKey(Structure):
    _fields_ = [("slave_ip", c_uint)]
class SlaveLeaf(Structure):
    _fields_ = [("ifc_idx", c_uint)]

class TestBPFDev(TestCase):
    def config(self, bpfdev, ns, bpfdev_ip, bpfdev_mac, slave_ip):
        # ifup bpfdev
        check_call(["ip", "link", "set", "dev", bpfdev, "up"])
        check_call(["ifconfig", bpfdev, bpfdev_ip])
        check_call(["ifconfig", bpfdev, "hw", "ether", bpfdev_mac])

        # setup a namespace for the VM
        if_se = ns + ".eth0.se"
        if_vm = ns + ".eth0.vm"
        check_call(["ip", "netns", "add", ns])
        check_call(["ip", "link", "add", "name", if_se, "type", "veth", "peer", "name", if_vm])
        check_call(["ip", "link", "set", if_vm, "netns", ns])
        check_call(["ip", "netns", "exec", ns, "ip", "link", "set", if_vm, "name", "eth0"])
        check_call(["ip", "link", "set", if_se, "up"])
        check_call(["ip", "netns", "exec", ns, "ip", "link", "set", "eth0", "up"])
        check_call(["ip", "link", "set", "dev", if_se, "promisc", "on"])
        check_call(["ip", "netns", "exec", ns, "ifconfig", "eth0", slave_ip])

        # establish the master-slave relationships
        check_call(["ip", "link", "set", "dev", if_se, "master", bpfdev])

    def setUp(self):
        sys.stderr.write("build bpfdev programs for br22 and br33\n")
        self.prog1 = BPF("main", "bpfdev1.b", "proto.b", prog_type=BPF.BPF_PROG_TYPE_BPFDEV, debug=0)
        self.prog2 = BPF("main", "bpfdev1.b", "proto.b", prog_type=BPF.BPF_PROG_TYPE_BPFDEV, debug=0)

    # create two bpf devices
        sys.stderr.write("creating bpfdev br22 and br33\n")
        self.prog1.create_bpfdev("br22")
        self.prog2.create_bpfdev("br33")

        # configure bpfdev
        sys.stderr.write("configuring bpfdev br22 and br33\n")
        self.config("br22", "ns0", "10.0.0.4", "02:02:02:02:02:02", "10.1.1.3")
        self.config("br33", "ns1", "20.0.0.4", "04:04:04:04:04:04", "20.1.1.3")

        # prog1 table configuration
        sys.stderr.write("configuring bpfdev br22 table\n")
        fwd_if = int(commands.getoutput('ip -o link show dev br33 | awk -F\': \' \'{print $1}\''))
        sys.stderr.write("br22 special rx packet forward to %d\n" % fwd_if)
        fwd_map = self.prog1.table("fwd_map", FwdKey, FwdLeaf)
        key = FwdKey(IPAddress("10.0.0.4").value)
        leaf = FwdLeaf(fwd_if)
        fwd_map.put(key, leaf)

        config_map = self.prog1.table("config_map", ConfigKey, ConfigLeaf)
        key = ConfigKey(0)
        leaf = ConfigLeaf(IPAddress("10.0.0.4").value, IPAddress("10.1.1.3").value)
        config_map.put(key, leaf)

        macaddr_map = self.prog1.table("macaddr_map", MacaddrKey, MacaddrLeaf)
        key = MacaddrKey(IPAddress("10.0.0.4").value)
        leaf = MacaddrLeaf(EUI("02-02-02-02-02-02").value)    # 02:02:02:02:02:02
        macaddr_map.put(key, leaf)

        slave_map = self.prog1.table("slave_map", SlaveKey, SlaveLeaf)
        fwd_if = int(commands.getoutput('ip -o link show dev ns0.eth0.se | awk -F\': \' \'{print $1}\''))
        sys.stderr.write("br22 special tx packet forward to %d\n" % fwd_if)
        key = SlaveKey(IPAddress("10.1.1.3").value)
        leaf = SlaveLeaf(fwd_if)
        slave_map.put(key, leaf)

        # prog2 table configuratioin
        sys.stderr.write("configuring bpfdev br33 table\n")
        fwd_if = int(commands.getoutput('ip -o link show dev br22 | awk -F\': \' \'{print $1}\''))
        sys.stderr.write("br33 special rx packet forward to %d\n" % fwd_if)
        fwd_map = self.prog2.table("fwd_map", FwdKey, FwdLeaf)
        key = FwdKey(IPAddress("20.0.0.4").value)
        leaf = FwdLeaf(fwd_if)
        fwd_map.put(key, leaf)

        config_map = self.prog2.table("config_map", ConfigKey, ConfigLeaf)
        key = ConfigKey(0)
        leaf = ConfigLeaf(IPAddress("20.0.0.4").value, IPAddress("20.1.1.3").value)
        config_map.put(key, leaf)

        macaddr_map = self.prog2.table("macaddr_map", MacaddrKey, MacaddrLeaf)
        key = MacaddrKey(IPAddress("20.0.0.4").value)
        leaf = MacaddrLeaf(EUI("04-04-04-04-04-04").value)    # 04:04:04:04:04:04
        macaddr_map.put(key, leaf)

        slave_map = self.prog2.table("slave_map", SlaveKey, SlaveLeaf)
        fwd_if = int(commands.getoutput('ip -o link show dev ns1.eth0.se | awk -F\': \' \'{print $1}\''))
        sys.stderr.write("br33 special tx packet forward to %d\n" % fwd_if)
        key = SlaveKey(IPAddress("20.1.1.3").value)
        leaf = SlaveLeaf(fwd_if)
        slave_map.put(key, leaf)

    sys.stderr.write("prog1 config_map")
        config_map = self.prog1.table("config_map", ConfigKey, ConfigLeaf)
        key = ConfigKey(0)
    leaf = config_map.get(key)
        print "config1", leaf.bpfdev_ip, leaf.slave_ip

    sys.stderr.write("prog2 config_map")
        config_map = self.prog2.table("config_map", ConfigKey, ConfigLeaf)
        key = ConfigKey(0)
    leaf = config_map.get(key)
        print "config2", leaf.bpfdev_ip, leaf.slave_ip

    def test_ping(self):
        sys.stderr.write("testing ping between master and slave\n")
        check_call(["ip", "netns", "exec", "ns0", "ping", "-c4", "10.0.0.4"])
        check_call(["ip", "netns", "exec", "ns1", "ping", "-c4", "20.0.0.4"])

        # sys.stderr.write("testing forwarding from br22 to br33\n")
        # check_call(["ip", "netns", "exec", "ns1", "/usr/bin/python", "/home/plumgrid/bpf/recv_udp.py", "&"])
        # check_call(["ip", "netns", "exec", "ns0", "/usr/bin/python", "/home/plumgrid/bpf/send_udp.py"])

if __name__ == "__main__":
    main()

plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$

To run the complete test, other changes to bcc are needed to support the new bpfdev device; these changes are not included here.

The test failed with the following symptom:

5: from 39 to 89: R0=imm5000 R1=imm0 R6=ctx R7=imm0 R8=imm14 R9=inv R10=fp
5: 89: (b7) r9 = 0
5: 90: (63) *(u32 *)(r10 -56) = r9
5: 91: (18) r1 = 0x587280
5: 93: (bf) r2 = r10
5: 94: (07) r2 += -56
5: 95: (85) call 1
5: 96: (bf) r1 = r0
5: 97: (15) if r0 == 0x0 goto pc-11
5:  R0=map_value(ks=4,vs=8) R1=map_value_or_null(ks=4,vs=8) R6=ctx R7=imm0 R8=imm14 R9=imm0 R10=fp
5: 98: (05) goto pc+0
5: 99: (63) *(u32 *)(r10 -64) = r9
5: 100: (bf) r2 = r1
5: 101: (7b) *(u64 *)(r10 -72) = r2
5: 102: (61) r1 = *(u32 *)(r2 +0)
5: R2 invalid mem access 'map_value_or_null'
5: 
5: ERROR: test_ping (__main__.TestBPFDev)
5: ----------------------------------------------------------------------
5: Traceback (most recent call last):
5:   File "/home/plumgrid/iovisor/bcc/tests/jit/bpfdev1.py", line 59, in setUp
5:     self.prog1 = BPF("main", "bpfdev1.b", "proto.b", prog_type=BPF.BPF_PROG_TYPE_BPFDEV, debug=0)
5:   File "/home/plumgrid/iovisor/bcc/src/bpf.py", line 64, in __init__
5:     self.load(self.name)
5:   File "/home/plumgrid/iovisor/bcc/src/bpf.py", line 78, in load
5:     raise Exception("Failed to load BPF program %s" % self.name)
5: Exception: Failed to load BPF program main
5: 
5: ----------------------------------------------------------------------

Instruction "96" copies r0 to r1, and the verifier thinks r1 could be map_value or null,
even though instruction "97" checks r0 for null: the check narrows r0 to map_value, but it is not propagated to the copy in r1.

The compiler ought to generate better code. The instruction "r1 = r0" is not necessary.

I dumped the IR (by changing debug=0 to debug=1 in the Python program) and fed the IR to llc:
`llc -march=bpf -filetype=asm -O3 b.ll`

llc also generates similar code:

LBB8_16:                                # %if.else20
        mov     r9, 0
        stw     -56(r10), r9
        ld_pseudo       r1, 1, 6
        mov     r2, r10
        addi    r2, -56
        call    1
        mov     r1, r0
        jeqi    r0, 0 goto LBB8_12
        jmp     LBB8_18
LBB8_18:                                # %onvalid.then25
        stw     -64(r10), r9
        mov     r2, r1
        std     -72(r10), r2
        ldw     r1, 0(r2)
        stw     -64(r10), r1

-O2 generates similar code.

Studying the LLVM optimization passes, there is a pass in LLVM called "virtual register rewrite" that does remove SOME of the above redundant copies, but not all of them, hence the issue.

FYI, I changed LLVM to print out the passes applied during bcc compiler optimization; the result is below:

5: Target Transform Information
5: Target Pass Configuration
5: No Alias Analysis (always returns 'may' alias)
5: Type-Based Alias Analysis
5: Scoped NoAlias Alias Analysis
5: Assumption Cache Tracker
5: Target Library Information
5: Basic Alias Analysis (stateless AA impl)
5: Create Garbage Collector Module Metadata
5: Machine Module Information
5: Machine Branch Probability Analysis
5:   ModulePass Manager
5:     FunctionPass Manager
5:       Dominator Tree Construction
5:       Natural Loop Information
5:       Canonicalize natural loops
5:       Scalar Evolution Analysis
5:       Loop Pass Manager
5:         Induction Variable Users
5:         Loop Strength Reduction
5:       Lower Garbage Collection Instructions
5:       Shadow Stack GC Lowering
5:       Remove unreachable blocks from the CFG
5:       Dominator Tree Construction
5:       Constant Hoisting
5:       Partially inline calls to library functions
5:       CodeGen Prepare
5:     Rewrite Symbols
5:     FunctionPass Manager
5:       Lower invoke and unwind, for unwindless code generators
5:       Remove unreachable blocks from the CFG
5:       Insert stack protectors
5:       Machine Function Analysis
5:       Dominator Tree Construction
5:       Natural Loop Information
5:       Branch Probability Analysis
5:       BPF DAG->DAG Pattern Instruction Selection
5:       Expand ISel Pseudo-instructions
5:       Tail Duplication
5:       Optimize machine instruction PHIs
5:       MachineDominator Tree Construction
5:       Slot index numbering
5:       Merge disjoint stack slots
5:       Local Stack Slot Allocation
5:       Remove dead machine instructions
5:       MachineDominator Tree Construction
5:       Machine Natural Loop Construction
5:       Machine Loop Invariant Code Motion
5:       Machine Common Subexpression Elimination
5:       MachinePostDominator Tree Construction
5:       Machine Block Frequency Analysis
5:       Machine code sinking
5:       Peephole Optimizations
5:       Remove dead machine instructions
5:       Process Implicit Definitions
5:       Remove unreachable machine basic blocks
5:       Live Variable Analysis
5:       MachineDominator Tree Construction
5:       Machine Natural Loop Construction
5:       Eliminate PHI nodes for register allocation
5:       Two-Address instruction pass
5:       Slot index numbering
5:       Live Interval Analysis
5:       Simple Register Coalescing
5:       Machine Instruction Scheduler
5:       Machine Block Frequency Analysis
5:       Debug Variable Analysis
5:       Live Stack Slot Analysis
5:       Virtual Register Map
5:       Live Register Matrix
5:       Bundle Machine CFG Edges
5:       Spill Code Placement Analysis
5:       Greedy Register Allocator
5:       Virtual Register Rewriter
5:       Stack Slot Coloring
5:       Machine Loop Invariant Code Motion
5:       Prologue/Epilogue Insertion & Frame Finalization
5:       Machine Block Frequency Analysis
5:       Control Flow Optimizer
5:       Tail Duplication
5:       Machine Copy Propagation Pass
5:       Post-RA pseudo instruction expansion pass
5:       MachineDominator Tree Construction
5:       Machine Natural Loop Construction
5:       Post RA top-down list latency scheduler
5:       Analyze Machine Code For Garbage Collection
5:       Machine Block Frequency Analysis
5:       Branch Probability Basic Block Placement
5:       StackMap Liveness Analysis
5:       BPF Assembly Printer

Checking the llc compiler passes, they are very similar to the above for function passes
(I did not compare them one-to-one).

In summary, this is an LLVM issue and we may have to fix it there.

make install not updating libbcc.so

Just had to use:

bcc# cp build/src/cc/libbcc.so.0.1.6-e292255e /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.6

To get it to work. make install wasn't updating it. Not sure why -- maybe my system is messed up. Looks like it was updating /usr/lib/libbcc.so.0, but not the x86_64-linux-gnu dir.

need better compiler error/diagnostics output

The following is a buggy b-language program.
plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$ cat bpfdev1.b

packed "false"

// hash
struct FwdKey {
u32 dip:32;
};
struct FwdLeaf {
u32 ifindex:32;
};
Table<FwdKey, FwdLeaf, FIXED_MATCH, NONE> fwd_map(1);

// array
struct ConfigKey {
u32 index:32;
};
struct ConfigLeaf {
u32 bpfdev_ip:32;
u32 slave_ip:32;
};
Table<ConfigKey, ConfigLeaf, INDEXED, NONE> config_map(1);

// hash
struct MacaddrKey {
u32 ip:32;
};
struct MacaddrLeaf {
u64 mac:48;
};
Table<MacaddrKey, MacaddrLeaf, FIXED_MATCH, AUTO> macaddr_map(11);

// hash
struct SlaveKey {
u32 slave_ip:32;
};
struct SlaveLeaf {
u32 slave_ifindex:32;
};
Table<SlaveKey, SlaveLeaf, FIXED_MATCH, NONE> slave_map(10);

u32 main(struct proto::skbuff *skb) {
u32 ret:32 = 0xffffffff;

goto proto::ethernet;

state proto::ethernet {
}

state proto::dot1q {
}

if (skb.pkt_type) {
state proto::arp {
if $arp.oper != 1 {
return 0;
}
macaddr_map[$arp.spa] = $arp.sha;
}
state proto::ip {
}
state proto::udp {
if $udp.dport != 5000 {
return 0;
}
FwdKey fwd_key = {.dip = $ip.dst};
FwdLeaf *fwd_idx_p = fwd_map[$ip.dst];
if (fwd_idx_p) {
return fwd_idx_p->fwd_idx;
} else {
return 0;
}
}
} else {
state proto::ip {
}
}

state EOP {
return ret;
}
}
plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$

The compiler error message looks like this:

Checking test dependency graph end
test 5
Start 5: py_bpfdev1

5: Test command: /home/plumgrid/iovisor/bcc/build/tests/wrapper.sh "py_bpfdev1" "/home/plumgrid/iovisor/bcc/tests/jit/bpfdev1.py" "namespace"
5: Test timeout computed to be: 9.99988e+06
5: Error: 42.1-5 syntax error
5: In file: bpfdev1.b
5: E

It does not provide sufficient information about where the problem occurs.
Maybe 42.1-5 indicates something, but the message should be better communicated.

helper for per-task variables

It will likely be common to store data with the current on-CPU task (ie, PID/thread), for example, timestamps, arguments, etc., that are then later retrieved by another event for the same task.

Eg, the vfsreadlat.c example does this to store a timestamp from function entry to return:

struct key_t {
    u32 pid;
};

BPF_HASH(start, struct key_t);

int do_entry(struct pt_regs *ctx)
{
        struct key_t key = {};
        u64 ts, *val;

        key.pid = bpf_get_current_pid_tgid();
        ts = bpf_ktime_get_ns();
        start.update(&key, &ts);
        return 0;
}

int do_return(struct pt_regs *ctx)
{
        struct key_t key = {};
        u64 *tsp, delta;

        key.pid = bpf_get_current_pid_tgid();
        tsp = start.lookup(&key);
[...]

Imagine something like:

BPF_TASK_VAR(start, u64);

void do_entry(struct pt_regs *ctx)
{
        start.set(bpf_ktime_get_ns());
}

void do_return(struct pt_regs *ctx)
{
        u64 *tsp, delta;

        tsp = start.get();
[...]

Or better ("start = bpf_ktime_get_ns()", etc).

Add macro for BPF_HASH

It's nice to have a lot of control over BPF maps, but I wonder if we
could have some simplified macros for the really common cases, like: hash
of one key -> u64.

Before:

struct key_t {
  struct request *req;
};
BPF_TABLE("hash", struct key_t, u64, start, 10240);

After:

BPF_HASH(start, struct request *);

Where the name is "start", the single key is "struct request *", and it
assumes the value is u64 (common case), and there is a global default for
number of entries (10240). If people want to customize, then do BPF_TABLE.

mismatch between type and bitfield width

I added the arp support in proto.h as below:

state ethernet {
switch $ethernet.type {
case 0x0800 {
next proto::ip;
};
case 0x0806 {
next proto::arp;
};
case 0x8100 {
next proto::dot1q;
};
case * {
goto EOP;
};
}
}

struct arp {
u8 htype:16;
u8 ptype:16;
u32 hlen:8;
u32 plen:8;
u32 oper:16;
u32 sha:48;
u32 spa:32;
u64 tha:48;
u32 tpa:32;
};

state arp {
goto EOP;
}

No compiler warning/error is given at this point.
Notice that there are some mismatches between type and bitfield width (e.g. `u8 htype:16` and `u32 sha:48`).

Easily save linear histograms

A follow on from #144, this would allow a linear histogram to be easily saved. Eg:

BPF_HIST(myhist, linear, 0, 100, 10);
[...]
    myhist.increment(value);

Where the start is 0, end is 100, and a step of 10.

More exotic histograms can already be coded using BPF_TABLE directly. It's not too hard to do linear either, this is just for minor convenience.

Python function to print the bpf_log

For debugging. This could be done in an error handler.

Add a read_trace_fields() Python function

As a follow on from #136, a function could be added that returned the fields printed by bpf_trace_printk(). Eg, so one can use:

b = BPF(...)
[...]
task, pid, cpu, flags, timestamp, function, arguments = b.read_trace_fields()

This is slightly harder than it sounds: I believe the task name can contain spaces.

Missing helper functions

I did an install using my llvm install script and bcc setup script (which are based on your kickstart file) on Linux kernel 4.2-rc4. Things go just fine and the llvm backend and bcc get compiled. However, when I try to run a simple example, sudo ./hello_world.py, I get this error:

In file included from <built-in>:316:
In file included from <command line>:3:
/usr/share/bcc/include/bcc/helpers.h:78:18: error: use of undeclared identifier 'BPF_FUNC_get_cgroup_classid'
        (void *) BPF_FUNC_get_cgroup_classid;
                 ^
/usr/share/bcc/include/bcc/helpers.h:80:18: error: use of undeclared identifier 'BPF_FUNC_skb_vlan_push'
        (void *) BPF_FUNC_skb_vlan_push;
                 ^
/usr/share/bcc/include/bcc/helpers.h:82:18: error: use of undeclared identifier 'BPF_FUNC_skb_vlan_pop'
        (void *) BPF_FUNC_skb_vlan_pop;
                 ^
/usr/share/bcc/include/bcc/helpers.h:87:12: error: use of undeclared identifier 'BPF_FUNC_skb_get_tunnel_key'
  (void *) BPF_FUNC_skb_get_tunnel_key;
           ^
/usr/share/bcc/include/bcc/helpers.h:89:12: error: use of undeclared identifier 'BPF_FUNC_skb_set_tunnel_key'
  (void *) BPF_FUNC_skb_set_tunnel_key;
           ^
5 errors generated.
Traceback (most recent call last):
  File "./hello_world.py", line 17, in <module>
    b = BPF(text=prog)
  File "/usr/lib/python2.7/dist-packages/bpf/__init__.py", line 182, in __init__
    raise Exception("Failed to compile BPF module %s" % src_file)
Exception: Failed to compile BPF module

I checked the uapi/linux/bpf.h supplied in 4.2-rc4 and found that these functions were missing from the bpf_func_id enum but are present in your compat directory. Simply disabling them works for me for now. Also, thanks for BCC 👍

return type "int32" is not supported by B language implementation

plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$ cat bpfdev1.b

packed "false"

u32 main(struct proto::skbuff *skb) {
u32 ret:32 = 0xffffffff;

goto proto::ethernet;

state EOP {
return ret;
}
}
plumgrid@yhs-plumgrid:~/iovisor/bcc/tests/jit$

In the kernel, people may use negative values to represent certain conditions. It would be good if we supported int32 directly in the B language.

make install print() error for python2.6

Trying the latest...

build# make install
[...]
copying build/lib.linux-x86_64-2.7/bpf/__init__.py -> /usr/lib/python2.7/dist-packages/bpf
byte-compiling /usr/lib/python2.7/dist-packages/bpf/__init__.py to __init__.pyc
  File "/usr/lib/python2.7/dist-packages/bpf/__init__.py", line 312
    print(log_buf.value.decode(), file=sys.stderr)
                                      ^
SyntaxError: invalid syntax

If I add:

from __future__ import print_function

To src/python/bpf/__init__.py, it works.

error matching many kprobes (eg, "tcp*")

I can attach kprobes to "vfs_*" successfully, but not "tcp_*".

# ./funccount 'vfs_*'
Tracing... Ctrl-C to end.
^C
ADDR             FUNC                          COUNT
ffffffff811e8d71 vfs_fstat                        22
ffffffff811e8dd1 vfs_fstatat                      42
ffffffff811e4381 vfs_write                        58
ffffffff811e8c71 vfs_getattr_nosec                63
ffffffff811e8d41 vfs_getattr                      63
ffffffff811e4251 vfs_read                         91
ffffffff811e3221 vfs_open                        108

# ./funccount 'tcp_*'
write(kprobe_events): Invalid argument
Traceback (most recent call last):
  File "./funccount", line 84, in <module>
    b.attach_kprobe(event_re=pattern, fn_name="trace_count")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 485, in attach_kprobe
    cpu=cpu, group_fd=group_fd)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 494, in attach_kprobe
    raise Exception("Failed to attach BPF to kprobe")
Exception: Failed to attach BPF to kprobe

This is using funccount, which is a port of my simple but useful ftrace function profiling tool of the same name. That original tool had some limitations, eg, it could not match a PID (set_ftrace_pid was ignored, likely due to a bug). This bcc/eBPF tool can fix that issue and more.

funccount:

#!/usr/bin/python
#
# funccount Count kernel function calls.
#       For Linux, uses BCC, eBPF. See .c file.
#
# USAGE: funccount [-h] [-p PID] [-i INTERVAL] [-t TOP] [-T] [-r] pattern
#
# The pattern is a string with optional '*' wildcards, similar to file globbing.
# If you'd prefer to use regular expressions, use the -r option.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
#09-Sep-2015   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
import re

# arguments
examples = """examples:
    ./funccount 'tcp_*'         # count kernel functions starting with "tcp"
    ./funccount 'vfs_*'         # count kernel functions starting with "vfs"
    ./funccount -r '^vfs.*'     # same as above, using regular expressions
    ./funccount -Ti 5 'vfs_*'   # output every 5 seconds, with timestamps
    ./funccount -p 185 'vfs_*'  # count vfs calls for PID 181 only
"""
parser = argparse.ArgumentParser(
    description="Count kernel function calls",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-p", "--pid",
    help="Trace this PID only")
parser.add_argument("-i", "--interval", default=99999999,
    help="Summary interval, seconds")
parser.add_argument("-t", "--top",
    help="Show top number, count")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="Include timestamp on output")
parser.add_argument("-r", "--regexp", action="store_true",
    help="Use regular expressions. Default is \"*\" wildcards only.")
parser.add_argument("pattern",
    help="search expression for kernel functions")
# Parse the command line and take the positional function-name pattern.
args = parser.parse_args()
pattern = args.pattern
if not args.regexp:
    # Without -r the pattern is treated as a glob: each '*' becomes the regex
    # '.*' and both ends are anchored so the whole function name must match.
    # Only '*' is translated; any other regex metacharacter in the glob is
    # passed through unescaped.
    pattern = pattern.replace('*', '.*')
    pattern = '^' + pattern + '$'
# When truthy, the generated BPF program text is printed before it is loaded.
debug = 1

# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>

struct key_t {
    u64 ip;
};
BPF_HASH(counts, struct key_t);

int trace_count(struct pt_regs *ctx) {
    FILTER_START
    struct key_t key = {};
    u64 zero = 0, *val;
    key.ip = ctx->ip;
    val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    FILTER_DONE
    return 0;
}
"""
if args.pid:
    bpf_text = bpf_text.replace('FILTER_START',
        ('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
        'if (pid == %s) {') % (args.pid))
    bpf_text = bpf_text.replace('FILTER_DONE', '}')
else:
    bpf_text = bpf_text.replace('FILTER_START', '')
    bpf_text = bpf_text.replace('FILTER_DONE', '')
if debug:
    print(bpf_text)
b = BPF(text=bpf_text)
b.attach_kprobe(event_re=pattern, fn_name="trace_count")

# header
print("Tracing... Ctrl-C to end.")

# output
# Each pass sleeps for the interval (or until Ctrl-C), prints the counts table
# and then clears it, so every summary covers only the latest interval.
# A falsy interval means: print one summary and exit.
exiting = 0 if args.interval else 1
while (1):
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        # Ctrl-C ends the sleep early; still print a final summary below.
        exiting=1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    print("%-16s %-26s %8s" % ("ADDR", "FUNC", "COUNT"))
    counts = b.get_table("counts")
    # Sort (key, leaf) items by leaf value, ascending. The lambda's parameter
    # is one such item; it only shadows the table name inside the lambda.
    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
        # k.ip is the address stored by trace_count; b.ksym() is presumably
        # the symbol-name lookup for that address (shown in the FUNC column).
        print("%-16x %-26s %8d" % (k.ip, b.ksym(k.ip), v.value))
    counts.clear()

    if exiting:
        exit()

USAGE:

# ./funccount -h
usage: funccount [-h] [-p PID] [-i INTERVAL] [-t TOP] [-T] [-r] pattern

Count kernel function calls

positional arguments:
  pattern               search expression for kernel functions

optional arguments:
  -h, --help            show this help message and exit
  -p PID, --pid PID     Trace this PID only
  -i INTERVAL, --interval INTERVAL
                        Summary interval, seconds
  -t TOP, --top TOP     Show top number, count
  -T, --timestamp       Include timestamp on output
  -r, --regexp          Use regular expressions. Default is "*" wildcards
                        only.

examples:
    ./funccount 'tcp_*'         # count kernel functions starting with "tcp"
    ./funccount 'vfs_*'         # count kernel functions starting with "vfs"
    ./funccount -r '^vfs.*'     # same as above, using regular expressions
    ./funccount -Ti 5 'vfs_*'   # output every 5 seconds, with timestamps
    ./funccount -p 185 'vfs_*'  # count vfs calls for PID 181 only

Now to get it to work on more than just vfs*. :)

The ftrace function profiling version does do this ok, eg:

# /apps/perf-tools/bin/funccount 'tcp*'
Tracing "tcp*"... Ctrl-C to end.
^C
FUNC                              COUNT
tcp_tasklet_func                      1
tcp_tsq_handler.part.31               1
tcp_write_timer                       2
tcp_write_timer_handler               2
tcp_delack_timer                      5
tcp_delack_timer_handler              5
tcp_cleanup_rbuf                      6
tcp_event_data_recv                   6
tcp_queue_rcv                         6
tcp_rcv_space_adjust                  6
tcp_recvmsg                           6
tcp_send_delayed_ack                  6
tcp_event_new_data_sent              11
tcp_init_tso_segs                    11
tcp_nagle_check                      11
tcp_options_write                    11
tcp_push                             11
tcp_sendmsg                          11
tcp_send_mss                         11
tcp_transmit_skb                     11
tcp_v4_send_check                    11
tcp_wfree                            11
tcp_write_xmit                       11
tcp_current_mss                      12
tcp_data_queue                       14
tcp_urg                              14
tcp_validate_incoming                14
tcp_ack                              15
tcp_check_space                      15
tcp4_gro_receive                     16
tcp_gro_receive                      16
tcp_parse_aligned_timestamp.pa       16
tcp_parse_md5sig_option              16
tcp_prequeue                         16
tcp_rcv_established                  16
tcp_schedule_loss_probe              16
tcp_v4_do_rcv                        16
tcp_v4_early_demux                   16
tcp_v4_rcv                           16
tcp_release_cb                       17
tcp_established_options              23
tcp_v4_md5_lookup                    23
tcp_rearm_rto                        31
tcp_md5_do_lookup                    39
tcp_poll                             49

Ending tracing...

support C-style /* ... */ comment?

I am adding arp support in proto.b, and I found /* ... */ is not supported.

......
state ethernet {
switch $ethernet.type {
case 0x0800 {
next proto::ip;
};
case 0x0806 {
next proto::arp;
};
case 0x8100 {
next proto::dot1q;
};
case * {
goto EOP;
};
}
}

/*
// struct arp {
// u32 htype:16
// u32 ptype:16
// u32 hlen:8
// u32 plen:8
// u32 oper:16
// u64 sha:48
// u32 spa:32
// u64 tha:48
// u32 tpa:32
// }

// state arp {
// goto EOP;
// }
*/
.....

If a user wants to comment out a large portion of code, a single /* ... */ is better than a bunch of //.
What do you think?

Probably a project logo?

Also, I thought while we are at it, I would make a small logo for you in a few minutes. I can put a scaled down version in the README.md if you want. Ping me if you need the vector image as well.

Choice 1:

Choice 2:

Add support for reading debug symbols from kernel for use in kprobe programs

Currently only raw register access is supported, which makes writing useful programs difficult.

invalid reads: type=inv expected=fp

I can't figure out some errors like the following:

bpf: Permission denied
0: (79) r6 = *(u64 *)(r1 +112)
1: (85) call 14
2: (63) *(u32 *)(r10 -12) = r0
3: (15) if r6 == 0x0 goto pc+55
 R0=inv R6=inv R10=fp
4: (b7) r7 = 0
5: (7b) *(u64 *)(r10 -24) = r7
6: (7b) *(u64 *)(r10 -32) = r7
7: (07) r6 += 16
8: (bf) r1 = r10
9: (07) r1 += -32
10: (b7) r2 = 16
11: (bf) r3 = r6
12: (85) call 4
13: (79) r1 = *(u64 *)(r10 -24)
14: (15) if r1 == 0x0 goto pc+44
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
15: (7b) *(u64 *)(r10 -40) = r7
16: (7b) *(u64 *)(r10 -48) = r7
17: (bf) r1 = r10
18: (07) r1 += -48
19: (b7) r2 = 16
20: (bf) r3 = r6
21: (85) call 4
22: (79) r1 = *(u64 *)(r10 -48)
23: (15) if r1 == 0x0 goto pc+35
 R0=inv R1=inv R6=inv R7=imm0 R10=fp
24: (b7) r7 = 0
25: (7b) *(u64 *)(r10 -56) = r7
26: (7b) *(u64 *)(r10 -64) = r7
27: (7b) *(u64 *)(r10 -72) = r7
28: (bf) r1 = r10
29: (07) r1 += -72
30: (b7) r2 = 16
31: (bf) r3 = r6
32: (85) call 4
33: (79) r6 = *(u64 *)(r10 -64)
34: (07) r6 += 56
35: (bf) r1 = r10
36: (07) r1 += -56
37: (b7) r2 = 8
38: (bf) r3 = r6
39: (85) call 4
40: (18) r1 = 0x93c8960
42: (bf) r2 = r10
43: (07) r2 += -12
44: (bf) r3 = r6
45: (b7) r4 = 0
46: (85) call 2
R3 type=inv expected=fp

Traceback (most recent call last):
  File "./vfsslower", line 83, in <module>
    b.attach_kprobe(event="vfs_read", fn_name="trace_entry")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 498, in attach_kprobe
    fn = self.load_func(fn_name, BPF.KPROBE)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 388, in load_func
    raise Exception("Failed to load BPF program %s" % func_name)
Exception: Failed to load BPF program trace_entry

What actually is "type=inv expected=fp"?

Here's a test program, that in this state works:

#!/usr/bin/python
#
# vfsslower Trace file system read/writes (via VFS) slower than a threshold.
#       For Linux, uses BCC, eBPF.
#
# In development. Incomplete. DO NOT USE.
#
# Copyright (c) 2015 Brendan Gregg.
# Licensed under the Apache License, Version 2.0 (the "License")
#
#12-Sep-2015   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF

REQ_WRITE = 1       # from include/linux/blk_types.h

# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
#include <linux/mount.h>

BPF_HASH(start, u32);
BPF_HASH(filebypid, u32, char *);

int trace_entry(struct pt_regs *ctx, struct file *file)
{
    u64 ts;
    u32 pid = bpf_get_current_pid_tgid();


    if (file != NULL && file->f_path.dentry != NULL &&
        file->f_path.mnt != NULL) {
// XXX can't get this to work:
//      struct vfsmount *mnt = file->f_path.mnt;
//      if (!(mnt->mnt_flags & MNT_NODEV)) {
//          return 0;
//      }

        void *name = 0;
        struct dentry *dent = file->f_path.dentry;

        // get filename via d_iname
        bpf_probe_read(&name, sizeof(name), dent->d_iname);
// XXX can't get this to work:
//      filebypid.update(&pid, (char **)&dent->d_iname);

// debug only:
        bpf_trace_printk("read d_iname %s\\n", &dent->d_iname);
    }

    ts = bpf_ktime_get_ns();
    start.update(&pid, &ts);
    return 0;
}

int trace_return(struct pt_regs *ctx)
{
    u64 *tsp, delta;
    u32 pid = bpf_get_current_pid_tgid();

    pid = bpf_get_current_pid_tgid();
    tsp = start.lookup(&pid);

    if (tsp != 0) {
        delta = bpf_ktime_get_ns() - *tsp;
        if (delta > MIN_LATENCY_NS) {
            char *file = (char *)filebypid.lookup(&pid);
            bpf_trace_printk("%d us %s\\n", delta / 1000, file);
            if (file != NULL) {
                filebypid.delete(&pid);
            }
        }
        start.delete(&pid);
    }

    return 0;
}
"""
# Substitute the latency threshold into the BPF C text; trace_return compares
# it against a bpf_ktime_get_ns() delta, so the value is in nanoseconds.
bpf_text = bpf_text.replace('MIN_LATENCY_NS', '100000')
b = BPF(text=bpf_text)
# Attach trace_entry to vfs_read() entry and trace_return to its return.
b.attach_kprobe(event="vfs_read", fn_name="trace_entry")
b.attach_kretprobe(event="vfs_read", fn_name="trace_return")

# presumably prints the bpf_trace_printk() output until interrupted -- confirm against the bcc API
b.trace_print()

There are a couple of blocks that, when uncommented, break it.

It's pretty noisy to run, since it prints tty vfs_read()s from ssh (hence trying to filter it based on MNT_NODEV, which I don't know yet if it works). So I'd been testing it like this:

window1# ./vfsslower > out
window2# echo 1 > /proc/sys/vm/drop_caches; cksum /usr/bin/*
window1# grep cksum out

auto attach BPF functions to kprobes

I think this is worth suggesting... I keep declaring C function names that only exist for an attach_kprobe() mapping, and wonder if this could be simplified.

Eg, here's the existing hello_world:

b = BPF(text='void hello(void *ctx) { bpf_trace_printk("Hello, World!\\n"); }')
b.attach_kprobe(event="sys_clone", fn_name="hello")
b.trace_print()

imagine:

b = BPF(text='kprobe:sys_clone(void *ctx) { bpf_trace_printk("Hello, World!\\n"); }')
b.trace_print()

I think this will work well with tracepoints, when their support is added; eg, imagine tracing the "sched:sched_process_fork" tracepoint using:

b = BPF(text='sched:sched_process_fork(void *ctx) { bpf_trace_printk("Hello, World!\\n"); }')
b.trace_print()

C-style macro support?

In a C program, the user can define a macro like the one below:

#define MAX_IFS 10

and later on use MAX_IFS in the program.
.... (MAX_IFS) ...
... (MAX_IFS + 1) ...

Do we want to support macros in the B language?

example of BPF_HASH without key_t

In a few cases, the struct key_t for a BPF_HASH only contains one pointer. Eg, disksnoop.c:

struct key_t {
    struct request *req;
};
BPF_HASH(start, struct key_t);

It seems redundant. But I can't seem to use a pointer directly. Eg, changing disksnoop.c to become:

#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

BPF_HASH(start, struct request *);

/*
 * Store the current bpf_ktime_get_ns() value in the `start` map under the
 * request pointer. Always returns 0.
 *
 * NOTE(review): `req` is handed to start.update() as-is rather than by
 * address; this is the call flagged by the first -Wincompatible-pointer-types
 * warning in the quoted compiler output.
 */
int do_request(struct pt_regs *ctx, struct request *req) {
    u64 ts;

    // stash start timestamp by request ptr
    ts = bpf_ktime_get_ns();
    start.update(req, &ts);

    return 0;
}

/*
 * Look up the timestamp stored in `start` under the request pointer. When an
 * entry exists, print the request's __data_len, its cmd_flags and the elapsed
 * time divided by 1000, then remove the entry. Always returns 0.
 *
 * NOTE(review): start.lookup(req) and start.delete(req) pass `req` directly;
 * these are the calls named in the quoted warnings, and the lookup is the
 * statement shown in the quoted "use of undeclared identifier" error.
 */
int do_completion(struct pt_regs *ctx, struct request *req) {
    u64 *tsp, delta;

    tsp = start.lookup(req);

    // only report requests for which a start timestamp was recorded
    if (tsp != 0) {
        // elapsed time since the stored timestamp
        // (presumably nanoseconds, going by the bpf_ktime_get_ns name)
        delta = bpf_ktime_get_ns() - *tsp;
        bpf_trace_printk("%d %x %d\n", req->__data_len,
            req->cmd_flags, delta / 1000);
        start.delete(req);
    }

    return 0;
}

output:

# ./disksnoop.py 
/home/bgregg-buildtest/bcc/examples/disksnoop.c:23:15: warning: incompatible pointer types passing 'struct request *' to parameter of type 'struct request **'; take the
      address with & [-Wincompatible-pointer-types]
        start.update(req, &ts);
                     ^~~
                     &
/home/bgregg-buildtest/bcc/examples/disksnoop.c:31:21: warning: incompatible pointer types passing 'struct request *' to parameter of type 'struct request **'; take the
      address with & [-Wincompatible-pointer-types]
        tsp = start.lookup(req);
                           ^~~
                           &
/home/bgregg-buildtest/bcc/examples/disksnoop.c:37:16: warning: incompatible pointer types passing 'struct request *' to parameter of type 'struct request **'; take the
      address with & [-Wincompatible-pointer-types]
                start.delete(req);
                             ^~~
                             &
3 warnings generated.
<bcc-memory-buffer>:33:59: error: use of undeclared identifier 'u'
        tsp = bpf_map_lookup_elem((void *)bpf_pseudo_fd(1, 3), ((u);
                                                                 ^
<bcc-memory-buffer>:43:2: error: expected '}'
}
 ^
<bcc-memory-buffer>:30:61: note: to match this '{'
int do_completion(struct pt_regs *ctx, struct request *req) {
                                                            ^
2 errors generated.
Traceback (most recent call last):
  File "./disksnoop.py", line 19, in <module>
    b = BPF(src_file="disksnoop.c")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 345, in __init__
    raise Exception("Failed to compile BPF module %s" % src_file)
Exception: Failed to compile BPF module disksnoop.c

Kernel symbol translation

There should be a Python function for translating kernel addresses to symbol names: eg, BPF.ksym(addr). As an example, here's some code I was using (feel free to use/improve):

# kernel symbol translation
ksym_addrs = []         # addresses for binary search
ksym_names = []         # same index as ksym_addrs
def load_kallsyms():
    """Fill ksym_addrs and ksym_names from /proc/kallsyms.

    Each usable line contributes its first column (a hexadecimal address)
    to ksym_addrs and its third column (the symbol name) to ksym_names, at
    the same index. Lines with fewer than three columns are skipped.

    If the file cannot be opened, an error is written to stderr and the
    process exits with status 1.

    NOTE(review): ksym() binary-searches ksym_addrs, so this assumes the
    file lists addresses in ascending order -- confirm.
    """
    import sys  # local import: the enclosing file's import block is not shown

    symfile = "/proc/kallsyms"
    try:
        syms = open(symfile, "r")
    except (IOError, OSError):
        sys.stderr.write("ERROR: reading " + symfile + "\n")
        sys.exit(1)
    try:
        # Every line is a symbol; there is no header line to discard.
        for line in syms:
            cols = line.split()
            if len(cols) < 3:
                continue
            ksym_addrs.append(int(cols[0], 16))
            ksym_names.append(cols[2])
    finally:
        # close the file even if a malformed address makes int() raise
        syms.close()
def ksym(addr):
    """Translate a kernel address to a symbol name.

    Returns the name stored at the last index of ksym_addrs whose address
    is less than or equal to addr, or "[unknown]" when addr is below every
    loaded address (including when nothing has been loaded).

    ksym_addrs must be in ascending order for the search to be valid.
    """
    from bisect import bisect_right

    # bisect_right gives the insertion point after all entries <= addr, so
    # the entry just before it is the closest symbol at or below addr.
    idx = bisect_right(ksym_addrs, addr) - 1
    if idx < 0:
        return "[unknown]"
    return ksym_names[idx]
load_kallsyms()

The python program could then call ksym() on an address, to get the symbol name from /proc/kallsyms.

In hindsight I'd adjust this so that ksym() checked if symbols were loaded, and if not, called load_kallsyms() itself. That would avoid needing to call load_kallsyms() explicitly. The end-user would simply use ksym() if they needed it.

It may also be useful to provide a ksymaddr() function, which returned the symbol name followed by the instruction offset. Eg, "vfs_read+0x18" instead of just "vfs_read".

Easier trace_pipe handling

Some code I'm commonly using (Python):

b = BPF(...)

# open trace pipe
try:
    trace = open("/sys/kernel/debug/tracing/trace_pipe", "r")
except:
    print >> sys.stderr, "ERROR: opening trace_pipe"
    exit(1)

# format output
while 1:
    try:
        line = trace.readline().rstrip()
    except KeyboardInterrupt:
        pass; exit()

This could be improved. Eg:

b = BPF(...)

# format output
while 1:
    try:
        line = b.read_trace_pipe()
    except KeyboardInterrupt:
        pass; exit()

BPF can provide a read_trace_pipe() function to perform a readline() and rstrip() from the trace_pipe. It can open the trace_pipe if it wasn't already open (therefore the trace_pipe isn't opened unless the user explicitly uses the BPF.read_trace_pipe() function).

Perhaps the implementation could also use trace instances (see Instances in https://github.com/torvalds/linux/blob/master/Documentation/trace/ftrace.txt), so that multiple concurrent users would use separate trace_pipes.

Scripts packaging and distribution network

It would be useful to have some infrastructure where people can submit their BPF 'analysis script bundles' and easily find and download scripts written by other people. For example, a person wanting to do an analysis could search through scripts by description, public rating, certification, number of downloads, etc. We could extend it to a minimal bundle packaging system as well. There could be commands like:

# Install an analysis script bundle
bpf-get install syscall-latency

# Search analysis scripts
bpf-get search task-switch

# Show details about script, examples, if it's stable or testing etc.
bpf-get info task-switch 

# List all analysis scripts
bpf-get --list all

# List all scripts containing 'sched_switch' tracepoint
bpf-get --list --tracepoint sched_switch

# Remove script
bpf-get remove task-switch

# Upgrade scripts
bpf-get update task-switch

# Get source files of the script bundle
bpf-get source task-switch

# Package the script directory to a BPF bundle
bpf-bundle package ./task-switch

# Submit scripts to public review pool. After certain karma threshold or
# moderator intervention we can add it to the stable distribution pool 
bpf-bundle submit task-switch-0.3.bun

Then, run them using bpf-run

In the long run, we could also provide a web interface to browse the different scripts and see what they do, just as in https://atlas.hashicorp.com/boxes/search, along with the history of all additions to the scripts. In addition, we could show a description and what a typical analysis and its output would look like.

Error digging out d_name from struct file

Fetching a file pathname from a struct file should be a common routine, but I can't get it to work.

vfs_read.py:

#!/usr/bin/python

from bcc import BPF

# load BPF program
b = BPF(src_file="vfs_read.c")
b.attach_kprobe(event="vfs_read", fn_name="run_read")

# format output
while 1:
    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
    print("got: %s" % msg)

vfs_read.c:

# cat vfs_read.c
#include <linux/fs.h>

/*
 * Print the name of the file passed as the probed function's argument.
 *
 * When both `file` and file->f_path.dentry are non-NULL, prints the dentry's
 * d_name.name; otherwise prints "was null".
 */
void run_read(struct pt_regs *ctx, struct file *file)
{
    // not referenced anywhere else in this function
    void *name = 0;

    if (file != NULL && file->f_path.dentry != NULL) {
        // alternative that reads d_iname instead of d_name.name:
        //bpf_trace_printk("read %s\n", file->f_path.dentry->d_iname);
        bpf_trace_printk("read %s\n", file->f_path.dentry->d_name.name);
    } else {
        bpf_trace_printk("was null");
    }
}

The line commented out works, but when I dig further to the d_name it does this:

# ./vfs_read.py 
bpf: Permission denied
0: (bf) r6 = r1
1: (79) r3 = *(u64 *)(r6 +112)
2: (15) if r3 == 0x0 goto pc+29
 R1=ctx R3=inv R6=ctx R10=fp
3: (b7) r7 = 0
4: (7b) *(u64 *)(r10 -8) = r7
5: (7b) *(u64 *)(r10 -16) = r7
6: (07) r3 += 16
7: (bf) r1 = r10
8: (07) r1 += -16
9: (b7) r2 = 16
10: (85) call 4
11: (79) r1 = *(u64 *)(r10 -8)
12: (15) if r1 == 0x0 goto pc+19
 R0=inv R1=inv R6=ctx R7=imm0 R10=fp
13: (18) r1 = 0x64616572
15: (7b) *(u64 *)(r10 -32) = r1
16: (73) *(u8 *)(r10 -24) = r7
17: (7b) *(u64 *)(r10 -40) = r7
18: (7b) *(u64 *)(r10 -48) = r7
19: (79) r3 = *(u64 *)(r6 +112)
20: (07) r3 += 16
21: (bf) r1 = r10
22: (07) r1 += -48
23: (b7) r2 = 16
24: (85) call 4
25: (79) r1 = *(u64 *)(r10 -40)
26: (79) r3 = *(u64 *)(r1 +40)
R1 invalid mem access 'inv'

Traceback (most recent call last):
  File "./vfs_read.py", line 7, in <module>
    b.attach_kprobe(event="vfs_read", fn_name="run_read")
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 424, in attach_kprobe
    fn = self.load_func(fn_name, BPF.KPROBE)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 320, in load_func
    raise Exception("Failed to load BPF program %s" % func_name)
Exception: Failed to load BPF program run_read

Maybe I'm doing it wrong?

iovisor / bcc Goto Github PK

bcc's Introduction

bcc's People

Stargazers

Watchers

Forkers

bcc's Issues

packed "false"

packed "false"

define MAX_IFS 10

Recommend Projects

Recommend Topics

Recommend Org