comparison src/event/quic/bpf/ngx_quic_reuseport_helper.c @ 8269:7df607cb2d11 quic

QUIC: ngx_quic_bpf module. The QUIC kernel BPF helper inspects the packet payload for the DCID, extracts the key, and routes the packet to the socket matching that key. Thanks to the reuseport feature, each worker owns its own socket, which is identified by the same key that is used to create the DCID. BPF objects are locked in RAM and are subject to RLIMIT_MEMLOCK. The "ulimit -l" command may be used to set proper limits if map creation fails with EPERM or map updates fail with ETOOLONG.
author Vladimir Homutov <vl@nginx.com>
date Fri, 25 Dec 2020 15:01:15 +0300
parents
children 1a489587e1c8
comparison
equal deleted inserted replaced
8268:d3747ba486e7 8269:7df607cb2d11
1 #include <errno.h>
2 #include <linux/string.h>
3 #include <linux/udp.h>
4 #include <linux/bpf.h>
5 /*
6 * the bpf_helpers.h is not included into linux-headers, only available
7 * with kernel sources in "tools/lib/bpf/bpf_helpers.h" or in libbpf.
8 */
9 #include <bpf/bpf_helpers.h>
10
11
/*
 * Fallback used when no previously included header defines SEC():
 * place the annotated object into the ELF section NAME and mark it
 * "used" so that it is emitted even if nothing references it.
 */
#ifndef SEC
#define SEC(NAME)  __attribute__((section(NAME), used))
#endif
15
16
#ifdef LICENSE_GPL

/*
 * Debug output is compiled in only when LICENSE_GPL is defined.
 *
 * To see the messages:
 *
 *   echo 1 > /sys/kernel/debug/tracing/events/bpf_trace/enable
 *   cat /sys/kernel/debug/tracing/trace_pipe
 *   echo 0 > /sys/kernel/debug/tracing/events/bpf_trace/enable
 */

/* the format is copied into a local array, then given to bpf_trace_printk() */
#define debugmsg(fmt, ...)                                                    \
    do {                                                                      \
        char __fmt[] = fmt;                                                   \
        bpf_trace_printk(__fmt, sizeof(__fmt), ##__VA_ARGS__);                \
    } while (0)

#else

/* debug disabled: expands to nothing, so the arguments are never evaluated */
#define debugmsg(fmt, ...)

#endif
38
39 char _license[] SEC("license") = LICENSE;
40
41 /*****************************************************************************/
42
/* bit of the first QUIC byte that is set for long-header packets */
#define NGX_QUIC_PKT_LONG        0x80

/* DCID length assumed for short-header packets */
#define NGX_QUIC_SERVER_CID_LEN  20
45
46
/*
 * Consume "nbytes" bytes of the packet: increase "offset", jump to the
 * "failed" label if that runs past "end", otherwise leave "data" pointing
 * at the last byte consumed (start + offset - 1).
 *
 * The macro relies on the caller's locals "start", "end", "offset" and
 * "data", and on a "failed" label in the enclosing function.
 *
 * It is wrapped in do/while (0) so that it expands to a single statement:
 * without the wrapper, "if (cond) advance_data(n);" would guard only the
 * "offset +=" part and run the bounds check and "data" update always.
 */
#define advance_data(nbytes)                                                  \
    do {                                                                      \
        offset += (nbytes);                                                   \
        if (start + offset > end) {                                           \
            debugmsg("cannot read %ld bytes at offset %ld", nbytes, offset);  \
            goto failed;                                                      \
        }                                                                     \
        data = start + offset - 1;                                            \
    } while (0)
54
55
/*
 * Assemble a 64-bit value from the 8 bytes at "p", most significant byte
 * first.
 *
 * Every byte is widened to __u64 before it is shifted.  An unsigned char
 * promotes to int, so "(p)[4] << 24" with the top bit set would produce a
 * negative int that sign-extends into the upper 32 bits once it is OR-ed
 * with the 64-bit terms, corrupting the result.
 */
#define ngx_quic_parse_uint64(p)                                              \
    (((__u64)(p)[0] << 56) |                                                  \
     ((__u64)(p)[1] << 48) |                                                  \
     ((__u64)(p)[2] << 40) |                                                  \
     ((__u64)(p)[3] << 32) |                                                  \
     ((__u64)(p)[4] << 24) |                                                  \
     ((__u64)(p)[5] << 16) |                                                  \
     ((__u64)(p)[6] << 8)  |                                                  \
     ((__u64)(p)[7]))
65
66 /*
67 * actual map object is created by the "bpf" system call,
68 * all pointers to this variable are replaced by the bpf loader
69 */
70 struct bpf_map_def SEC("maps") ngx_quic_sockmap;
71
72
73 SEC(PROGNAME)
74 int ngx_quic_select_socket_by_dcid(struct sk_reuseport_md *ctx)
75 {
76 int rc;
77 __u64 key;
78 size_t len, offset;
79 unsigned char *start, *end, *data, *dcid;
80
81 start = ctx->data;
82 end = (unsigned char *) ctx->data_end;
83 offset = 0;
84
85 advance_data(sizeof(struct udphdr)); /* skip UDP header */
86 advance_data(1); /* QUIC flags */
87
88 if (data[0] & NGX_QUIC_PKT_LONG) {
89
90 advance_data(4); /* skip QUIC version */
91 len = data[0]; /* read DCID length */
92
93 if (len < 8) {
94 /* it's useless to search for key in such short DCID */
95 return SK_PASS;
96 }
97
98 advance_data(1); /* skip DCID len */
99
100 } else {
101 len = NGX_QUIC_SERVER_CID_LEN;
102 }
103
104 dcid = &data[1];
105 advance_data(len); /* we expect the packet to have full DCID */
106
107 /* make verifier happy */
108 if (dcid + sizeof(__u64) > end) {
109 goto failed;
110 }
111
112 key = ngx_quic_parse_uint64(dcid);
113
114 rc = bpf_sk_select_reuseport(ctx, &ngx_quic_sockmap, &key, 0);
115
116 switch (rc) {
117 case 0:
118 debugmsg("nginx quic socket selected by key 0x%x", key);
119 return SK_PASS;
120
121 /* kernel returns positive error numbers, errno.h defines positive */
122 case -ENOENT:
123 debugmsg("nginx quic default route for key 0x%x", key);
124 /* let the default reuseport logic decide which socket to choose */
125 return SK_PASS;
126
127 default:
128 debugmsg("nginx quic bpf_sk_select_reuseport err: %d key 0x%x",
129 rc, key);
130 goto failed;
131 }
132
133 failed:
134 /*
135 * SK_DROP will generate ICMP, but we may want to process "invalid" packet
136 * in userspace quic to investigate further and finally react properly
137 * (maybe ignore, maybe send something in response or close connection)
138 */
139 return SK_PASS;
140 }