diff src/event/quic/bpf/ngx_quic_reuseport_helper.c @ 8269:7df607cb2d11 quic

QUIC: ngx_quic_bpf module. The QUIC kernel BPF helper inspects the packet payload for the DCID, extracts the key, and routes the packet to the socket matching that key. Due to the reuseport feature, each worker owns a personal socket, which is identified by the same key that is used to create the DCID. BPF objects are locked in RAM and are subject to RLIMIT_MEMLOCK. The "ulimit -l" command may be used to set up proper limits if map creation fails with EPERM or a map update fails with ETOOLONG.
author Vladimir Homutov <vl@nginx.com>
date Fri, 25 Dec 2020 15:01:15 +0300
parents
children 1a489587e1c8
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/src/event/quic/bpf/ngx_quic_reuseport_helper.c
@@ -0,0 +1,140 @@
+#include <errno.h>
+#include <linux/string.h>
+#include <linux/udp.h>
+#include <linux/bpf.h>
+/*
+ * bpf_helpers.h is not shipped with linux-headers; it is only available
+ * in the kernel sources ("tools/lib/bpf/bpf_helpers.h") or in libbpf.
+ */
+#include <bpf/bpf_helpers.h>
+
+
+/*
+ * Fallback definition of SEC(), used only if the headers included above
+ * did not define it: places the annotated object into the ELF section NAME
+ * and marks it "used" so that it is emitted even when unreferenced.
+ */
+#if !defined(SEC)
+#define SEC(NAME)  __attribute__((section(NAME), used))
+#endif
+
+
+/*
+ * debugmsg(fmt, ...) emits a trace message via bpf_trace_printk().
+ * It is compiled in only when LICENSE_GPL is defined and expands to
+ * nothing otherwise.
+ *
+ * To see debug:
+ *
+ *  echo 1 > /sys/kernel/debug/tracing/events/bpf_trace/enable
+ *  cat /sys/kernel/debug/tracing/trace_pipe
+ *  echo 0 > /sys/kernel/debug/tracing/events/bpf_trace/enable
+ */
+
+#ifndef LICENSE_GPL
+
+#define debugmsg(fmt, ...)
+
+#else
+
+/* the format string is copied into a local array passed with its size */
+#define debugmsg(fmt, ...)                                                    \
+do {                                                                          \
+    char ngx_fmt[] = fmt;                                                     \
+    bpf_trace_printk(ngx_fmt, sizeof(ngx_fmt), ##__VA_ARGS__);                \
+} while (0)
+
+#endif
+
+/*
+ * license string stored in the "license" section of the object;
+ * LICENSE is not defined in this file and is presumably supplied
+ * by the build (compiler command line) - TODO confirm
+ */
+char _license[] SEC("license") = LICENSE;
+
+/*****************************************************************************/
+
+/* bit of the first QUIC byte that is tested to detect a long header packet */
+#define NGX_QUIC_PKT_LONG        0x80  /* header form */
+/*
+ * DCID length used for packets without the long header bit:
+ * no length is read from the packet in that case
+ */
+#define NGX_QUIC_SERVER_CID_LEN  20
+
+
+/*
+ * advance_data(nbytes) consumes the next nbytes of the packet.
+ *
+ * The macro relies on the caller's locals "start", "end", "offset" and
+ * "data" and on a "failed" label: "offset" is increased by nbytes, and if
+ * that runs past "end", control jumps to "failed".  On success "data" points
+ * to the LAST consumed byte, i.e. data[0] is the final byte just stepped over
+ * and &data[1] is the first byte not consumed yet.
+ *
+ * The body is wrapped into do { } while (0) so that the macro is a single
+ * statement and can be safely used in unbraced if/else bodies; the argument
+ * is parenthesized where it takes part in an expression.
+ */
+#define advance_data(nbytes)                                                  \
+do {                                                                          \
+    offset += (nbytes);                                                       \
+    if (start + offset > end) {                                               \
+        debugmsg("cannot read %ld bytes at offset %ld", nbytes, offset);      \
+        goto failed;                                                          \
+    }                                                                         \
+    data = start + offset - 1;                                                \
+} while (0)
+
+
+/*
+ * ngx_quic_parse_uint64(p) assembles a 64-bit value from the 8 bytes at p,
+ * most significant byte first.
+ *
+ * Every byte is converted to __u64 before shifting.  Without the cast an
+ * unsigned char is promoted to int, so a byte >= 0x80 shifted left by 24
+ * becomes a negative int and is sign-extended into the upper 32 bits
+ * when combined with the 64-bit part of the expression.
+ */
+#define ngx_quic_parse_uint64(p)                                              \
+    (((__u64)(p)[0] << 56) |                                                  \
+     ((__u64)(p)[1] << 48) |                                                  \
+     ((__u64)(p)[2] << 40) |                                                  \
+     ((__u64)(p)[3] << 32) |                                                  \
+     ((__u64)(p)[4] << 24) |                                                  \
+     ((__u64)(p)[5] << 16) |                                                  \
+     ((__u64)(p)[6] << 8)  |                                                  \
+     ((__u64)(p)[7]))
+
+/*
+ * actual map object is created by the "bpf" system call,
+ * all pointers to this variable are replaced by the bpf loader;
+ * the program below passes this map to bpf_sk_select_reuseport()
+ * together with the 64-bit key extracted from DCID
+ */
+struct bpf_map_def SEC("maps") ngx_quic_sockmap;
+
+
+/*
+ * Selects the reuseport socket for an incoming QUIC packet by its DCID.
+ *
+ * The data between ctx->data and ctx->data_end is parsed as UDP header,
+ * QUIC flags byte and, for long header packets, version and DCID length.
+ * The first 8 bytes of DCID are read as a big-endian 64-bit key, which is
+ * passed to bpf_sk_select_reuseport() together with ngx_quic_sockmap.
+ *
+ * The function always returns SK_PASS and never drops a packet: this holds
+ * for truncated packets, for long header packets with DCID shorter than
+ * 8 bytes, for keys missing in the map and for any other lookup error.
+ */
+SEC(PROGNAME)
+int ngx_quic_select_socket_by_dcid(struct sk_reuseport_md *ctx)
+{
+    int             rc;
+    __u64           key;
+    size_t          len, offset;
+    unsigned char  *start, *end, *data, *dcid;
+
+    start = ctx->data;
+    end = (unsigned char *) ctx->data_end;
+    offset = 0;
+
+    /* after each advance_data() "data" points to the last consumed byte */
+
+    advance_data(sizeof(struct udphdr)); /* data at the end of UDP header */
+    advance_data(1); /* data at QUIC flags */
+
+    if (data[0] & NGX_QUIC_PKT_LONG) {
+
+        advance_data(4); /* data at the last byte of QUIC version */
+        advance_data(1); /* data at DCID len */
+
+        /*
+         * the length is read only after stepping onto its byte: right
+         * after the version "data" still points to the version's low byte
+         */
+        len = data[0];   /* read DCID length */
+
+        if (len < 8) {
+            /* it's useless to search for key in such short DCID */
+            return SK_PASS;
+        }
+
+    } else {
+        len = NGX_QUIC_SERVER_CID_LEN;
+    }
+
+    /* DCID starts right after the last consumed byte */
+    dcid = &data[1];
+    advance_data(len); /* we expect the packet to have full DCID */
+
+    /* make verifier happy */
+    if (dcid + sizeof(__u64) > end) {
+        goto failed;
+    }
+
+    key = ngx_quic_parse_uint64(dcid);
+
+    rc = bpf_sk_select_reuseport(ctx, &ngx_quic_sockmap, &key, 0);
+
+    switch (rc) {
+    case 0:
+        /* key is 64-bit, so it is printed with %llx */
+        debugmsg("nginx quic socket selected by key 0x%llx", key);
+        return SK_PASS;
+
+    /* kernel returns negative error numbers, errno.h defines positive */
+    case -ENOENT:
+        debugmsg("nginx quic default route for key 0x%llx", key);
+        /* let the default reuseport logic decide which socket to choose */
+        return SK_PASS;
+
+    default:
+        debugmsg("nginx quic bpf_sk_select_reuseport err: %d key 0x%llx",
+                  rc, key);
+        goto failed;
+    }
+
+failed:
+    /*
+     * SK_DROP will generate ICMP, but we may want to process "invalid" packet
+     * in userspace quic to investigate further and finally react properly
+     * (maybe ignore, maybe send something in response or close connection)
+     */
+    return SK_PASS;
+}