summaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2020-08-24 14:35:01 -0700
committerAlexei Starovoitov <ast@kernel.org>2020-08-24 14:39:18 -0700
commit890f4365e47ecbf554c043448ccec7ea10435120 (patch)
treeb1faac2bd96331562c4ef6cd40e05c4947d13969 /include/linux
parent9c0f8cbdc0e9cf8a2a3a96045778b8d759f172c2 (diff)
parent267cf9fa43d1c9d525d5d818a8651f2900e3aa9e (diff)
downloadcachepc-linux-890f4365e47ecbf554c043448ccec7ea10435120.tar.gz
cachepc-linux-890f4365e47ecbf554c043448ccec7ea10435120.zip
Merge branch 'bpf-tcp-header-opts'
Martin KaFai Lau says: ==================== The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to a production environment. The same flexibility can be extended to writing TCP header options. It is not uncommon that people want to test a new TCP header option to improve TCP performance. Another use case is for a data center that has a more controlled environment and has more flexibility in putting header options for internal traffic only. This patch set introduces the necessary BPF logic and API to allow a bpf program to write and parse header options. There are also some changes to TCP and they are mostly to provide the needed sk and skb info to the bpf program to make decisions. Patch 9 is the main patch and has more details on the API and design. The set includes an example which sends the max delay ack in the BPF TCP header option and the receiving side can then adjust its RTO accordingly. v5: - Move some of the comments from the git commit message to the UAPI bpf.h in patch 9 - Some variable clean up in the tests (patch 11). v4: - Since bpf-next is currently closed, tag the set with RFC to keep the review cadence - Separate the tcp changes into their own patches (5, 6, 7). It is a bit tricky since most of the tcp changes are to call out the bpf prog to write and parse the header. The write and parse callout has been modularized into a few bpf_skops_* functions in v3. This revision (v4) tries to move those bpf_skops_* functions into separate TCP patches. However, they will be half implemented to highlight the changes to the TCP stack, mainly: - when the bpf prog will be called in the TCP stack and - what information needs to be pumped through the TCP stack to the actual bpf prog callsite. The bpf_skops_* functions will be fully implemented in patch 9 together with other bpf pieces. 
- Use struct_size() in patch 1 (Eric) - Add saw_unknown to struct tcp_options_received in patch 4 (Eric) v3: - Add kdoc for tcp_make_synack (Jakub Kicinski) - Add BPF_WRITE_HDR_TCP_CURRENT_MSS and BPF_WRITE_HDR_TCP_SYNACK_COOKIE in bpf.h to give a clearer meaning to sock_ops->args[0] when writing a header option. - Rename BPF_SOCK_OPS_PARSE_UNKWN_HDR_OPT_CB_FLAG to BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG v2: - Instead of limiting the bpf prog to write an experimental option (kind:254, magic:0xeB9F), this revision allows the bpf prog to write any TCP header option through the bpf_store_hdr_opt() helper. That will allow different bpf-progs to write their own options and the helper will guarantee there is no duplication. - Add bpf_load_hdr_opt() helper to search a particular option by kind. Some of the get_syn logic is refactored to bpf_sock_ops_get_syn(). - Since bpf prog is no longer limited to option (254, 0xeB9F), the TCP_SKB_CB(skb)->bpf_hdr_opt_off is no longer needed. Instead, when there is any option the kernel cannot recognize, the bpf prog will be called if the BPF_SOCK_OPS_PARSE_UNKWN_HDR_OPT_CB_FLAG is set. [ The "unknown_opt" is learned in tcp_parse_options() in patch 4. ] - Add BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG. If this flag is set, the bpf-prog will be called on all tcp packets received at an established sk. It will be useful to ensure a previously written header option is received by the peer. e.g. The latter test is using this on the active-side during syncookie. - The test_tcp_hdr_options.c is adjusted accordingly to test writing both experimental and regular TCP header options. - The test_misc_tcp_hdr_options.c is added mainly to test different cases on the new helpers. - Break up the TCP_BPF_RTO_MIN and TCP_BPF_DELACK_MAX into two patches. 
- Directly store the tcp_hdrlen in "struct saved_syn" instead of going back to the tcp header to obtain it by "th->doff * 4" - Add a new optval(==2) for setsockopt(TCP_SAVE_SYN) such that it will also store the mac header (patch 9). ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/bpf-cgroup.h25
-rw-r--r--include/linux/filter.h8
-rw-r--r--include/linux/tcp.h20
3 files changed, 46 insertions, 7 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 64f367044e25..2f98d2fce62e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
+/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
+ * fullsock and its parent fullsock cannot be traced by
+ * sk_to_full_sk().
+ *
+ * e.g. sock_ops->sk is a request_sock and it is under syncookie mode.
+ * Its listener-sk is not attached to the rsk_listener.
+ * In this case, the caller holds the listener-sk (unlocked),
+ * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with
+ * the listener-sk such that the cgroup-bpf-progs of the
+ * listener-sk will be run.
+ *
+ * Regardless of syncookie mode or not,
+ * calling bpf_setsockopt on listener-sk will not make sense anyway,
+ * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here.
+ */
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \
+ sock_ops, \
+ BPF_CGROUP_SOCK_OPS); \
+ __ret; \
+})
+
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
({ \
int __ret = 0; \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0a355b005bf4..995625950cc1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1236,13 +1236,17 @@ struct bpf_sock_addr_kern {
struct bpf_sock_ops_kern {
struct sock *sk;
- u32 op;
union {
u32 args[4];
u32 reply;
u32 replylong[4];
};
- u32 is_fullsock;
+ struct sk_buff *syn_skb;
+ struct sk_buff *skb;
+ void *skb_data_end;
+ u8 op;
+ u8 is_fullsock;
+ u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
* the BPF program. New fields that
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 14b62d7df942..56ff2952edaf 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -92,6 +92,8 @@ struct tcp_options_received {
smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
+ u8 saw_unknown:1, /* Received unknown option */
+ unused:7;
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
@@ -237,14 +239,13 @@ struct tcp_sock {
repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue;
- u8 syn_data:1, /* SYN includes data */
+ u8 save_syn:2, /* Save headers of SYN packet */
+ syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
- save_syn:1, /* Save headers of SYN packet */
- is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
- syn_smc:1; /* SYN includes SMC */
+ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
u32 tlp_high_seq; /* snd_nxt at the time of TLP */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
@@ -391,6 +392,9 @@ struct tcp_sock {
#if IS_ENABLED(CONFIG_MPTCP)
bool is_mptcp;
#endif
+#if IS_ENABLED(CONFIG_SMC)
+ bool syn_smc; /* SYN includes SMC */
+#endif
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
@@ -406,7 +410,7 @@ struct tcp_sock {
* socket. Used to retransmit SYNACKs etc.
*/
struct request_sock __rcu *fastopen_rsk;
- u32 *saved_syn;
+ struct saved_syn *saved_syn;
};
enum tsq_enum {
@@ -484,6 +488,12 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
tp->saved_syn = NULL;
}
+static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
+{
+ return saved_syn->mac_hdrlen + saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
const struct sk_buff *orig_skb);