Mercurial > hg > nginx
comparison src/event/quic/bpf/ngx_quic_reuseport_helper.c @ 8676:7df607cb2d11 quic
QUIC: ngx_quic_bpf module.
The QUIC kernel BPF helper inspects the packet payload for the DCID, extracts
the key, and routes the packet to the socket matching that key.
Thanks to the reuseport feature, each worker owns its own socket, which is
identified by the same key that is used to create the DCID.
BPF objects are locked in RAM and are subject to RLIMIT_MEMLOCK.
The "ulimit -l" command may be used to set up proper limits if map
creation fails with EPERM or a map update fails with ETOOLONG.
author | Vladimir Homutov <vl@nginx.com> |
---|---|
date | Fri, 25 Dec 2020 15:01:15 +0300 |
parents | |
children | 1a489587e1c8 |
comparison
equal
deleted
inserted
replaced
8675:d3747ba486e7 | 8676:7df607cb2d11 |
---|---|
1 #include <errno.h> | |
2 #include <linux/string.h> | |
3 #include <linux/udp.h> | |
4 #include <linux/bpf.h> | |
5 /* | |
6 * the bpf_helpers.h is not included into linux-headers, only available | |
7 * with kernel sources in "tools/lib/bpf/bpf_helpers.h" or in libbpf. | |
8 */ | |
9 #include <bpf/bpf_helpers.h> | |
10 | |
11 | |
/*
 * Fallback definition, used only when no SEC() macro has been defined by
 * the headers included above: places the annotated object into the named
 * ELF section and marks it "used" so that it is not discarded.
 */
#ifndef SEC
#define SEC(NAME)  __attribute__((section(NAME), used))
#endif
15 | |
16 | |
#ifdef LICENSE_GPL

/*
 * With LICENSE_GPL defined, debugmsg() forwards to bpf_trace_printk().
 * The trace output can be watched with:
 *
 *     echo 1 > /sys/kernel/debug/tracing/events/bpf_trace/enable
 *     cat /sys/kernel/debug/tracing/trace_pipe
 *     echo 0 > /sys/kernel/debug/tracing/events/bpf_trace/enable
 */

/* the format string is copied into a local array before being passed on */
#define debugmsg(fmt, ...) \
    do { \
        char ngx_dbg_fmt_[] = fmt; \
        bpf_trace_printk(ngx_dbg_fmt_, sizeof(ngx_dbg_fmt_), ##__VA_ARGS__); \
    } while (0)

#else

/* without LICENSE_GPL, debug messages (and their arguments) compile away */
#define debugmsg(fmt, ...)

#endif
38 | |
39 char _license[] SEC("license") = LICENSE; | |
40 | |
41 /*****************************************************************************/ | |
42 | |
enum {
    NGX_QUIC_PKT_LONG       = 0x80,  /* header form bit of the first byte */
    NGX_QUIC_SERVER_CID_LEN = 20     /* DCID length used for short headers */
};
45 | |
46 | |
/*
 * Consume "nbytes" bytes of the packet.
 *
 * Adds "nbytes" to "offset"; if that runs past "end", logs a debug message
 * and jumps to the "failed" label without touching "data".  Otherwise
 * "data" is left pointing at the LAST byte consumed, so data[0] is that
 * byte and &data[1] is the first byte not yet consumed.
 *
 * The macro relies on "start", "end", "offset", "data" and a "failed"
 * label being available in the calling function.  It is wrapped in
 * do { } while (0) so that it behaves as a single statement (e.g. inside
 * an unbraced if/else).
 */
#define advance_data(nbytes) \
    do { \
        offset += (nbytes); \
        if (start + offset > end) { \
            debugmsg("cannot read %ld bytes at offset %ld", (nbytes), offset); \
            goto failed; \
        } \
        data = start + offset - 1; \
    } while (0)
54 | |
55 | |
/*
 * Assemble a 64-bit value from the 8 bytes at "p", most significant byte
 * first.
 *
 * Every byte is widened to __u64 before it is shifted: an unsigned char
 * shifted as a plain int can turn negative (byte 4 >= 0x80, shifted by 24)
 * and would then sign-extend into the upper 32 bits when combined with
 * the 64-bit terms.
 */
#define ngx_quic_parse_uint64(p) \
    (((__u64) (p)[0] << 56) | \
     ((__u64) (p)[1] << 48) | \
     ((__u64) (p)[2] << 40) | \
     ((__u64) (p)[3] << 32) | \
     ((__u64) (p)[4] << 24) | \
     ((__u64) (p)[5] << 16) | \
     ((__u64) (p)[6] << 8)  | \
     ((__u64) (p)[7]))
65 | |
66 /* | |
67 * actual map object is created by the "bpf" system call, | |
68 * all pointers to this variable are replaced by the bpf loader | |
69 */ | |
70 struct bpf_map_def SEC("maps") ngx_quic_sockmap; | |
71 | |
72 | |
73 SEC(PROGNAME) | |
74 int ngx_quic_select_socket_by_dcid(struct sk_reuseport_md *ctx) | |
75 { | |
76 int rc; | |
77 __u64 key; | |
78 size_t len, offset; | |
79 unsigned char *start, *end, *data, *dcid; | |
80 | |
81 start = ctx->data; | |
82 end = (unsigned char *) ctx->data_end; | |
83 offset = 0; | |
84 | |
85 advance_data(sizeof(struct udphdr)); /* skip UDP header */ | |
86 advance_data(1); /* QUIC flags */ | |
87 | |
88 if (data[0] & NGX_QUIC_PKT_LONG) { | |
89 | |
90 advance_data(4); /* skip QUIC version */ | |
91 len = data[0]; /* read DCID length */ | |
92 | |
93 if (len < 8) { | |
94 /* it's useless to search for key in such short DCID */ | |
95 return SK_PASS; | |
96 } | |
97 | |
98 advance_data(1); /* skip DCID len */ | |
99 | |
100 } else { | |
101 len = NGX_QUIC_SERVER_CID_LEN; | |
102 } | |
103 | |
104 dcid = &data[1]; | |
105 advance_data(len); /* we expect the packet to have full DCID */ | |
106 | |
107 /* make verifier happy */ | |
108 if (dcid + sizeof(__u64) > end) { | |
109 goto failed; | |
110 } | |
111 | |
112 key = ngx_quic_parse_uint64(dcid); | |
113 | |
114 rc = bpf_sk_select_reuseport(ctx, &ngx_quic_sockmap, &key, 0); | |
115 | |
116 switch (rc) { | |
117 case 0: | |
118 debugmsg("nginx quic socket selected by key 0x%x", key); | |
119 return SK_PASS; | |
120 | |
121 /* kernel returns positive error numbers, errno.h defines positive */ | |
122 case -ENOENT: | |
123 debugmsg("nginx quic default route for key 0x%x", key); | |
124 /* let the default reuseport logic decide which socket to choose */ | |
125 return SK_PASS; | |
126 | |
127 default: | |
128 debugmsg("nginx quic bpf_sk_select_reuseport err: %d key 0x%x", | |
129 rc, key); | |
130 goto failed; | |
131 } | |
132 | |
133 failed: | |
134 /* | |
135 * SK_DROP will generate ICMP, but we may want to process "invalid" packet | |
136 * in userspace quic to investigate further and finally react properly | |
137 * (maybe ignore, maybe send something in response or close connection) | |
138 */ | |
139 return SK_PASS; | |
140 } |