summaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2021-04-01 10:56:15 -0700
committerAlexei Starovoitov <ast@kernel.org>2021-04-01 10:56:15 -0700
commit89d69c5d0fbcabd8656459bc8b1a476d6f1efee4 (patch)
treebdcfa87c0cf209ce0de77a4b8cfaec4302e4f10f /include/linux
parente27bfefb21f28d5295432f042b5d9d7871100c35 (diff)
parent8d7cb74f2ccb5486ab8c631a8fcdc7621bbbc42c (diff)
downloadcachepc-linux-89d69c5d0fbcabd8656459bc8b1a476d6f1efee4.tar.gz
cachepc-linux-89d69c5d0fbcabd8656459bc8b1a476d6f1efee4.zip
Merge branch 'sockmap: introduce BPF_SK_SKB_VERDICT and support UDP'
Cong Wang says: ==================== From: Cong Wang <cong.wang@bytedance.com> We have thousands of services connected to a daemon on every host via AF_UNIX dgram sockets, after they are moved into VM, we have to add a proxy to forward these communications from VM to host, because rewriting thousands of them is not practical. This proxy uses an AF_UNIX socket connected to services and a UDP socket to connect to the host. It is inefficient because data is copied between kernel space and user space twice, and we can not use splice() which only supports TCP. Therefore, we want to use sockmap to do the splicing without going to user-space at all (after the initial setup). Currently sockmap only fully supports TCP, UDP is partially supported as it is only allowed to add into sockmap. This patchset, as the second part of the original large patchset, extends sockmap with: 1) cross-protocol support with BPF_SK_SKB_VERDICT; 2) full UDP support. On the high level, ->read_sock() is required for each protocol to support sockmap redirection, and in order to do sock proto update, a new ops ->psock_update_sk_prot() is introduced, which is also required. And the BPF ->recvmsg() is also needed to replace the original ->recvmsg() to retrieve skmsg. To make life easier, we have to get rid of lock_sock() in sk_psock_handle_skb(), otherwise we would have to implement ->sendmsg_locked() on top of ->sendmsg(), which is ugly. Please see each patch for more details. To see the big picture, the original patchset is available here: https://github.com/congwang/linux/tree/sockmap this patchset is also available: https://github.com/congwang/linux/tree/sockmap2 --- v8: get rid of 'offset' in udp_read_sock() add checks for skb_verdict/stream_verdict conflict add two cleanup patches for sock_map_link() add a new test case v7: use work_mutex to protect psock->work return err in udp_read_sock() add patch 6/13 clean up test case v6: get rid of sk_psock_zap_ingress() add rcu work patch v5: use INDIRECT_CALL_2() for function pointers use ingress_lock to fix a race condition found by Jacub rename two helper functions v4: get rid of lock_sock() in sk_psock_handle_skb() get rid of udp_sendmsg_locked() remove an empty line update cover letter v3: export tcp/udp_update_proto() rename sk->sk_prot->psock_update_sk_prot() improve changelogs v2: separate from the original large patchset rebase to the latest bpf-next split UDP test case move inet_csk_has_ulp() check to tcp_bpf.c clean up udp_read_sock() ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/skbuff.h1
-rw-r--r--include/linux/skmsg.h77
2 files changed, 59 insertions, 19 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8def85fcc22..dbf820a50a39 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6c09d94be2e9..f78e90a04a69 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,7 @@ struct sk_psock_progs {
struct bpf_prog *msg_parser;
struct bpf_prog *stream_parser;
struct bpf_prog *stream_verdict;
+ struct bpf_prog *skb_verdict;
};
enum sk_psock_state_bits {
@@ -89,6 +90,7 @@ struct sk_psock {
#endif
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
+ spinlock_t ingress_lock;
unsigned long state;
struct list_head link;
spinlock_t link_lock;
@@ -97,13 +99,12 @@ struct sk_psock {
void (*saved_close)(struct sock *sk, long timeout);
void (*saved_write_space)(struct sock *sk);
void (*saved_data_ready)(struct sock *sk);
+ int (*psock_update_sk_prot)(struct sock *sk, bool restore);
struct proto *sk_proto;
+ struct mutex work_mutex;
struct sk_psock_work_state work_state;
struct work_struct work;
- union {
- struct rcu_head rcu;
- struct work_struct gc;
- };
+ struct rcu_work rwork;
};
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
@@ -124,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+ long timeo, int *err);
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags);
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
{
@@ -284,7 +289,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
+ spin_lock_bh(&psock->ingress_lock);
list_add_tail(&msg->list, &psock->ingress_msg);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
+static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg;
+
+ spin_lock_bh(&psock->ingress_lock);
+ msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+ if (msg)
+ list_del(&msg->list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return msg;
+}
+
+static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg;
+
+ spin_lock_bh(&psock->ingress_lock);
+ msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return msg;
+}
+
+static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ struct sk_msg *ret;
+
+ spin_lock_bh(&psock->ingress_lock);
+ if (list_is_last(&msg->list, &psock->ingress_msg))
+ ret = NULL;
+ else
+ ret = list_next_entry(msg, list);
+ spin_unlock_bh(&psock->ingress_lock);
+ return ret;
}
static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
@@ -292,6 +335,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
return psock ? list_empty(&psock->ingress_msg) : true;
}
+static inline void kfree_sk_msg(struct sk_msg *msg)
+{
+ if (msg->skb)
+ consume_skb(msg->skb);
+ kfree(msg);
+}
+
static inline void sk_psock_report_error(struct sk_psock *psock, int err)
{
struct sock *sk = psock->sk;
@@ -301,6 +351,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
}
struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock, bool wait);
#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
@@ -349,25 +400,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock)
}
}
-static inline void sk_psock_update_proto(struct sock *sk,
- struct sk_psock *psock,
- struct proto *ops)
-{
- /* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, ops);
-}
-
static inline void sk_psock_restore_proto(struct sock *sk,
struct sk_psock *psock)
{
sk->sk_prot->unhash = psock->saved_unhash;
- if (inet_csk_has_ulp(sk)) {
- tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
- } else {
- sk->sk_write_space = psock->saved_write_space;
- /* Pairs with lockless read in sk_clone_lock() */
- WRITE_ONCE(sk->sk_prot, psock->sk_proto);
- }
+ if (psock->psock_update_sk_prot)
+ psock->psock_update_sk_prot(sk, true);
}
static inline void sk_psock_set_state(struct sk_psock *psock,
@@ -442,6 +480,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
psock_set_prog(&progs->msg_parser, NULL);
psock_set_prog(&progs->stream_parser, NULL);
psock_set_prog(&progs->stream_verdict, NULL);
+ psock_set_prog(&progs->skb_verdict, NULL);
}
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);