cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

subflow.c (54629B)


      1// SPDX-License-Identifier: GPL-2.0
      2/* Multipath TCP
      3 *
      4 * Copyright (c) 2017 - 2019, Intel Corporation.
      5 */
      6
      7#define pr_fmt(fmt) "MPTCP: " fmt
      8
      9#include <linux/kernel.h>
     10#include <linux/module.h>
     11#include <linux/netdevice.h>
     12#include <crypto/algapi.h>
     13#include <crypto/sha2.h>
     14#include <net/sock.h>
     15#include <net/inet_common.h>
     16#include <net/inet_hashtables.h>
     17#include <net/protocol.h>
     18#include <net/tcp.h>
     19#if IS_ENABLED(CONFIG_MPTCP_IPV6)
     20#include <net/ip6_route.h>
     21#include <net/transp_v6.h>
     22#endif
     23#include <net/mptcp.h>
     24#include <uapi/linux/mptcp.h>
     25#include "protocol.h"
     26#include "mib.h"
     27
     28#include <trace/events/mptcp.h>
     29
     30static void mptcp_subflow_ops_undo_override(struct sock *ssk);
     31
     32static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
     33				  enum linux_mptcp_mib_field field)
     34{
     35	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
     36}
     37
     38static void subflow_req_destructor(struct request_sock *req)
     39{
     40	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
     41
     42	pr_debug("subflow_req=%p", subflow_req);
     43
     44	if (subflow_req->msk)
     45		sock_put((struct sock *)subflow_req->msk);
     46
     47	mptcp_token_destroy_request(req);
     48	tcp_request_sock_ops.destructor(req);
     49}
     50
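/* Compute HMAC-SHA256 keyed with key1/key2 over the two 32-bit nonces
 * (8 bytes total, big-endian); callers truncate the 32-byte digest as
 * needed, e.g. to the leftmost 64 bits for the SYN-ACK thmac.
 */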
     51static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
     52				  void *hmac)
     53{
     54	u8 msg[8];
     55
     56	put_unaligned_be32(nonce1, &msg[0]);
     57	put_unaligned_be32(nonce2, &msg[4]);
     58
     59	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
     60}
     61
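/* A new JOIN subflow is accepted only when the MPTCP connection is fully
 * established and either the userspace path manager is active or the
 * in-kernel PM still allows additional subflows.
 */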
     62static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
     63{
     64	return mptcp_is_fully_established((void *)msk) &&
     65		((mptcp_pm_is_userspace(msk) &&
     66		  mptcp_userspace_pm_active(msk)) ||
     67		 READ_ONCE(msk->pm.accept_subflow));
     68}
     69
     70/* validate received token and create truncated hmac and nonce for SYN-ACK */
     71static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
     72{
     73	struct mptcp_sock *msk = subflow_req->msk;
     74	u8 hmac[SHA256_DIGEST_SIZE];
     75
     76	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
     77
     78	subflow_generate_hmac(msk->local_key, msk->remote_key,
     79			      subflow_req->local_nonce,
     80			      subflow_req->remote_nonce, hmac);
     81
     82	subflow_req->thmac = get_unaligned_be64(hmac);
     83}
     84
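/* Look up the msk owning the token carried by an MP_JOIN request and ask
 * the PM for a local address id; returns the msk with a reference held,
 * or NULL on failure.
 */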
     85static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
     86{
     87	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
     88	struct mptcp_sock *msk;
     89	int local_id;
     90
     91	msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
     92	if (!msk) {
     93		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
     94		return NULL;
     95	}
     96
     97	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
     98	if (local_id < 0) {
     99		sock_put((struct sock *)msk);
    100		return NULL;
    101	}
    102	subflow_req->local_id = local_id;
    103
    104	return msk;
    105}
    106
    107static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
    108{
    109	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
    110
    111	subflow_req->mp_capable = 0;
    112	subflow_req->mp_join = 0;
    113	subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
    114	subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
    115	subflow_req->msk = NULL;
    116	mptcp_token_init_request(req);
    117}
    118
    119static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
    120{
    121	return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
    122}
    123
    124static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
    125{
    126	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
    127
    128	if (mpext) {
    129		memset(mpext, 0, sizeof(*mpext));
    130		mpext->reset_reason = reason;
    131	}
    132}
    133
    134/* Init mptcp request socket.
    135 *
    136 * Returns an error code if a JOIN has failed and a TCP reset
    137 * should be sent.
    138 */
    139static int subflow_check_req(struct request_sock *req,
    140			     const struct sock *sk_listener,
    141			     struct sk_buff *skb)
    142{
    143	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
    144	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
    145	struct mptcp_options_received mp_opt;
    146	bool opt_mp_capable, opt_mp_join;
    147
    148	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
    149
    150#ifdef CONFIG_TCP_MD5SIG
    151	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
    152	 * TCP option space.
    153	 */
    154	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
    155		return -EINVAL;
    156#endif
    157
    158	mptcp_get_options(skb, &mp_opt);
    159
    160	opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
    161	opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
    162	if (opt_mp_capable) {
    163		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
    164
    165		if (opt_mp_join)
    166			return 0;
    167	} else if (opt_mp_join) {
    168		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
    169	}
    170
    171	if (opt_mp_capable && listener->request_mptcp) {
    172		int err, retries = MPTCP_TOKEN_MAX_RETRIES;
    173
    174		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
    175again:
    176		do {
    177			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
    178		} while (subflow_req->local_key == 0);
    179
    180		if (unlikely(req->syncookie)) {
    181			mptcp_crypto_key_sha(subflow_req->local_key,
    182					     &subflow_req->token,
    183					     &subflow_req->idsn);
    184			if (mptcp_token_exists(subflow_req->token)) {
    185				if (retries-- > 0)
    186					goto again;
    187				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
    188			} else {
    189				subflow_req->mp_capable = 1;
    190			}
    191			return 0;
    192		}
    193
    194		err = mptcp_token_new_request(req);
    195		if (err == 0)
    196			subflow_req->mp_capable = 1;
    197		else if (retries-- > 0)
    198			goto again;
    199		else
    200			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
    201
    202	} else if (opt_mp_join && listener->request_mptcp) {
    203		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
    204		subflow_req->mp_join = 1;
    205		subflow_req->backup = mp_opt.backup;
    206		subflow_req->remote_id = mp_opt.join_id;
    207		subflow_req->token = mp_opt.token;
    208		subflow_req->remote_nonce = mp_opt.nonce;
    209		subflow_req->msk = subflow_token_join_request(req);
    210
    211		/* Can't fall back to TCP in this case. */
    212		if (!subflow_req->msk) {
    213			subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
    214			return -EPERM;
    215		}
    216
    217		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
    218			pr_debug("syn inet_sport=%d %d",
    219				 ntohs(inet_sk(sk_listener)->inet_sport),
    220				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
    221			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
    222				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
    223				return -EPERM;
    224			}
    225			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
    226		}
    227
    228		subflow_req_create_thmac(subflow_req);
    229
    230		if (unlikely(req->syncookie)) {
    231			if (mptcp_can_accept_new_subflow(subflow_req->msk))
    232				subflow_init_req_cookie_join_save(subflow_req, skb);
    233			else
    234				return -EPERM;
    235		}
    236
    237		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
    238			 subflow_req->remote_nonce, subflow_req->msk);
    239	}
    240
    241	return 0;
    242}
    243
    244int mptcp_subflow_init_cookie_req(struct request_sock *req,
    245				  const struct sock *sk_listener,
    246				  struct sk_buff *skb)
    247{
    248	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
    249	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
    250	struct mptcp_options_received mp_opt;
    251	bool opt_mp_capable, opt_mp_join;
    252	int err;
    253
    254	subflow_init_req(req, sk_listener);
    255	mptcp_get_options(skb, &mp_opt);
    256
    257	opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
    258	opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
    259	if (opt_mp_capable && opt_mp_join)
    260		return -EINVAL;
    261
    262	if (opt_mp_capable && listener->request_mptcp) {
    263		if (mp_opt.sndr_key == 0)
    264			return -EINVAL;
    265
    266		subflow_req->local_key = mp_opt.rcvr_key;
    267		err = mptcp_token_new_request(req);
    268		if (err)
    269			return err;
    270
    271		subflow_req->mp_capable = 1;
    272		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
    273	} else if (opt_mp_join && listener->request_mptcp) {
    274		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
    275			return -EINVAL;
    276
    277		subflow_req->mp_join = 1;
    278		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
    279	}
    280
    281	return 0;
    282}
    283EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
    284
    285static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
    286					      struct sk_buff *skb,
    287					      struct flowi *fl,
    288					      struct request_sock *req)
    289{
    290	struct dst_entry *dst;
    291	int err;
    292
    293	tcp_rsk(req)->is_mptcp = 1;
    294	subflow_init_req(req, sk);
    295
    296	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
    297	if (!dst)
    298		return NULL;
    299
    300	err = subflow_check_req(req, sk, skb);
    301	if (err == 0)
    302		return dst;
    303
    304	dst_release(dst);
    305	if (!req->syncookie)
    306		tcp_request_sock_ops.send_reset(sk, skb);
    307	return NULL;
    308}
    309
    310#if IS_ENABLED(CONFIG_MPTCP_IPV6)
    311static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
    312					      struct sk_buff *skb,
    313					      struct flowi *fl,
    314					      struct request_sock *req)
    315{
    316	struct dst_entry *dst;
    317	int err;
    318
    319	tcp_rsk(req)->is_mptcp = 1;
    320	subflow_init_req(req, sk);
    321
    322	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
    323	if (!dst)
    324		return NULL;
    325
    326	err = subflow_check_req(req, sk, skb);
    327	if (err == 0)
    328		return dst;
    329
    330	dst_release(dst);
    331	if (!req->syncookie)
    332		tcp6_request_sock_ops.send_reset(sk, skb);
    333	return NULL;
    334}
    335#endif
    336
    337/* validate received truncated hmac and create hmac for third ACK */
    338static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
    339{
    340	u8 hmac[SHA256_DIGEST_SIZE];
    341	u64 thmac;
    342
    343	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
    344			      subflow->remote_nonce, subflow->local_nonce,
    345			      hmac);
    346
    347	thmac = get_unaligned_be64(hmac);
    348	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
    349		 subflow, subflow->token, thmac, subflow->thmac);
    350
    351	return thmac == subflow->thmac;
    352}
    353
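/* Abort the subflow: move it to TCP_CLOSE, send an active reset and defer
 * the msk cleanup to the worker when possible.
 */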
    354void mptcp_subflow_reset(struct sock *ssk)
    355{
    356	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    357	struct sock *sk = subflow->conn;
    358
    359	/* must hold: tcp_done() could drop last reference on parent */
    360	sock_hold(sk);
    361
    362	tcp_set_state(ssk, TCP_CLOSE);
    363	tcp_send_active_reset(ssk, GFP_ATOMIC);
    364	tcp_done(ssk);
    365	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
    366	    schedule_work(&mptcp_sk(sk)->work))
    367		return; /* worker will put sk for us */
    368
    369	sock_put(sk);
    370}
    371
    372static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
    373{
    374	return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
    375}
    376
    377void __mptcp_set_connected(struct sock *sk)
    378{
    379	if (sk->sk_state == TCP_SYN_SENT) {
    380		inet_sk_state_store(sk, TCP_ESTABLISHED);
    381		sk->sk_state_change(sk);
    382	}
    383}
    384
    385static void mptcp_set_connected(struct sock *sk)
    386{
    387	mptcp_data_lock(sk);
    388	if (!sock_owned_by_user(sk))
    389		__mptcp_set_connected(sk);
    390	else
    391		__set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
    392	mptcp_data_unlock(sk);
    393}
    394
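/* Process the SYN-ACK of an actively opened subflow: complete the
 * MP_CAPABLE or MP_JOIN handshake, or fall back to plain TCP when the
 * peer did not echo the expected options.
 */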
    395static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
    396{
    397	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    398	struct mptcp_options_received mp_opt;
    399	struct sock *parent = subflow->conn;
    400
    401	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
    402
    403	/* be sure no special action on any packet other than syn-ack */
    404	if (subflow->conn_finished)
    405		return;
    406
    407	mptcp_propagate_sndbuf(parent, sk);
    408	subflow->rel_write_seq = 1;
    409	subflow->conn_finished = 1;
    410	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
    411	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
    412
    413	mptcp_get_options(skb, &mp_opt);
    414	if (subflow->request_mptcp) {
    415		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
    416			MPTCP_INC_STATS(sock_net(sk),
    417					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
    418			mptcp_do_fallback(sk);
    419			pr_fallback(mptcp_sk(subflow->conn));
    420			goto fallback;
    421		}
    422
    423		if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
    424			WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true);
    425		if (mp_opt.deny_join_id0)
    426			WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true);
    427		subflow->mp_capable = 1;
    428		subflow->can_ack = 1;
    429		subflow->remote_key = mp_opt.sndr_key;
    430		pr_debug("subflow=%p, remote_key=%llu", subflow,
    431			 subflow->remote_key);
    432		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
    433		mptcp_finish_connect(sk);
    434		mptcp_set_connected(parent);
    435	} else if (subflow->request_join) {
    436		u8 hmac[SHA256_DIGEST_SIZE];
    437
    438		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) {
    439			subflow->reset_reason = MPTCP_RST_EMPTCP;
    440			goto do_reset;
    441		}
    442
    443		subflow->backup = mp_opt.backup;
    444		subflow->thmac = mp_opt.thmac;
    445		subflow->remote_nonce = mp_opt.nonce;
    446		subflow->remote_id = mp_opt.join_id;
    447		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
    448			 subflow, subflow->thmac, subflow->remote_nonce,
    449			 subflow->backup);
    450
    451		if (!subflow_thmac_valid(subflow)) {
    452			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
    453			subflow->reset_reason = MPTCP_RST_EMPTCP;
    454			goto do_reset;
    455		}
    456
    457		if (!mptcp_finish_join(sk))
    458			goto do_reset;
    459
    460		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
    461				      subflow->local_nonce,
    462				      subflow->remote_nonce,
    463				      hmac);
    464		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
    465
    466		subflow->mp_join = 1;
    467		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
    468
    469		if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
    470			pr_debug("synack inet_dport=%d %d",
    471				 ntohs(inet_sk(sk)->inet_dport),
    472				 ntohs(inet_sk(parent)->inet_dport));
    473			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
    474		}
    475	} else if (mptcp_check_fallback(sk)) {
    476fallback:
    477		mptcp_rcv_space_init(mptcp_sk(parent), sk);
    478		mptcp_set_connected(parent);
    479	}
    480	return;
    481
    482do_reset:
    483	subflow->reset_transient = 0;
    484	mptcp_subflow_reset(sk);
    485}
    486
    487static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
    488{
    489	subflow->local_id = local_id;
    490	subflow->local_id_valid = 1;
    491}
    492
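/* Lazily fetch the local address id from the PM the first time this
 * subflow (re)builds its headers.
 */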
    493static int subflow_chk_local_id(struct sock *sk)
    494{
    495	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    496	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
    497	int err;
    498
    499	if (likely(subflow->local_id_valid))
    500		return 0;
    501
    502	err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
    503	if (err < 0)
    504		return err;
    505
    506	subflow_set_local_id(subflow, err);
    507	return 0;
    508}
    509
    510static int subflow_rebuild_header(struct sock *sk)
    511{
    512	int err = subflow_chk_local_id(sk);
    513
    514	if (unlikely(err < 0))
    515		return err;
    516
    517	return inet_sk_rebuild_header(sk);
    518}
    519
    520#if IS_ENABLED(CONFIG_MPTCP_IPV6)
    521static int subflow_v6_rebuild_header(struct sock *sk)
    522{
    523	int err = subflow_chk_local_id(sk);
    524
    525	if (unlikely(err < 0))
    526		return err;
    527
    528	return inet6_sk_rebuild_header(sk);
    529}
    530#endif
    531
    532struct request_sock_ops mptcp_subflow_request_sock_ops;
    533static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
    534
    535static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
    536{
    537	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    538
    539	pr_debug("subflow=%p", subflow);
    540
    541	/* Never answer to SYNs sent to broadcast or multicast */
    542	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
    543		goto drop;
    544
    545	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
    546				&subflow_request_sock_ipv4_ops,
    547				sk, skb);
    548drop:
    549	tcp_listendrop(sk);
    550	return 0;
    551}
    552
    553#if IS_ENABLED(CONFIG_MPTCP_IPV6)
    554static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
    555static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
    556static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
    557static struct proto tcpv6_prot_override;
    558
    559static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
    560{
    561	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
    562
    563	pr_debug("subflow=%p", subflow);
    564
    565	if (skb->protocol == htons(ETH_P_IP))
    566		return subflow_v4_conn_request(sk, skb);
    567
    568	if (!ipv6_unicast_destination(skb))
    569		goto drop;
    570
    571	if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
    572		__IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
    573		return 0;
    574	}
    575
    576	return tcp_conn_request(&mptcp_subflow_request_sock_ops,
    577				&subflow_request_sock_ipv6_ops, sk, skb);
    578
    579drop:
    580	tcp_listendrop(sk);
    581	return 0; /* don't send reset */
    582}
    583#endif
    584
    585/* validate hmac received in third ACK */
    586static bool subflow_hmac_valid(const struct request_sock *req,
    587			       const struct mptcp_options_received *mp_opt)
    588{
    589	const struct mptcp_subflow_request_sock *subflow_req;
    590	u8 hmac[SHA256_DIGEST_SIZE];
    591	struct mptcp_sock *msk;
    592
    593	subflow_req = mptcp_subflow_rsk(req);
    594	msk = subflow_req->msk;
    595	if (!msk)
    596		return false;
    597
    598	subflow_generate_hmac(msk->remote_key, msk->local_key,
    599			      subflow_req->remote_nonce,
    600			      subflow_req->local_nonce, hmac);
    601
    602	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
    603}
    604
    605static void mptcp_sock_destruct(struct sock *sk)
    606{
     607	/* if the new mptcp socket isn't accepted, it is freed
     608	 * from the tcp listener socket's request queue, linked
    609	 * from req->sk.  The tcp socket is released.
    610	 * This calls the ULP release function which will
    611	 * also remove the mptcp socket, via
    612	 * sock_put(ctx->conn).
    613	 *
    614	 * Problem is that the mptcp socket will be in
    615	 * ESTABLISHED state and will not have the SOCK_DEAD flag.
    616	 * Both result in warnings from inet_sock_destruct.
    617	 */
    618	if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
    619		sk->sk_state = TCP_CLOSE;
    620		WARN_ON_ONCE(sk->sk_socket);
    621		sock_orphan(sk);
    622	}
    623
    624	mptcp_destroy_common(mptcp_sk(sk));
    625	inet_sock_destruct(sk);
    626}
    627
    628static void mptcp_force_close(struct sock *sk)
    629{
    630	/* the msk is not yet exposed to user-space */
    631	inet_sk_state_store(sk, TCP_CLOSE);
    632	sk_common_release(sk);
    633}
    634
    635static void subflow_ulp_fallback(struct sock *sk,
    636				 struct mptcp_subflow_context *old_ctx)
    637{
    638	struct inet_connection_sock *icsk = inet_csk(sk);
    639
    640	mptcp_subflow_tcp_fallback(sk, old_ctx);
    641	icsk->icsk_ulp_ops = NULL;
    642	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
    643	tcp_sk(sk)->is_mptcp = 0;
    644
    645	mptcp_subflow_ops_undo_override(sk);
    646}
    647
    648static void subflow_drop_ctx(struct sock *ssk)
    649{
    650	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
    651
    652	if (!ctx)
    653		return;
    654
    655	subflow_ulp_fallback(ssk, ctx);
    656	if (ctx->conn)
    657		sock_put(ctx->conn);
    658
    659	kfree_rcu(ctx, rcu);
    660}
    661
    662void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
    663				     struct mptcp_options_received *mp_opt)
    664{
    665	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
    666
    667	subflow->remote_key = mp_opt->sndr_key;
    668	subflow->fully_established = 1;
    669	subflow->can_ack = 1;
    670	WRITE_ONCE(msk->fully_established, true);
    671}
    672
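/* Create the child socket for an incoming request: attach it to a newly
 * cloned msk for MP_CAPABLE, join the existing msk for MP_JOIN, or fall
 * back to plain TCP when the required options or HMAC are missing.
 */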
    673static struct sock *subflow_syn_recv_sock(const struct sock *sk,
    674					  struct sk_buff *skb,
    675					  struct request_sock *req,
    676					  struct dst_entry *dst,
    677					  struct request_sock *req_unhash,
    678					  bool *own_req)
    679{
    680	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
    681	struct mptcp_subflow_request_sock *subflow_req;
    682	struct mptcp_options_received mp_opt;
    683	bool fallback, fallback_is_fatal;
    684	struct sock *new_msk = NULL;
    685	struct sock *child;
    686
    687	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
    688
    689	/* After child creation we must look for MPC even when options
    690	 * are not parsed
    691	 */
    692	mp_opt.suboptions = 0;
    693
    694	/* hopefully temporary handling for MP_JOIN+syncookie */
    695	subflow_req = mptcp_subflow_rsk(req);
    696	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
    697	fallback = !tcp_rsk(req)->is_mptcp;
    698	if (fallback)
    699		goto create_child;
    700
    701	/* if the sk is MP_CAPABLE, we try to fetch the client key */
    702	if (subflow_req->mp_capable) {
    703		/* we can receive and accept an in-window, out-of-order pkt,
    704		 * which may not carry the MP_CAPABLE opt even on mptcp enabled
     705		 * paths: always try to extract the peer key, and fall back
     706		 * for packets missing it.
     707		 * Even OoO DSS packets coming legitimately after dropped or
    708		 * reordered MPC will cause fallback, but we don't have other
    709		 * options.
    710		 */
    711		mptcp_get_options(skb, &mp_opt);
    712		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
    713			fallback = true;
    714			goto create_child;
    715		}
    716
    717		new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
    718		if (!new_msk)
    719			fallback = true;
    720	} else if (subflow_req->mp_join) {
    721		mptcp_get_options(skb, &mp_opt);
    722		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
    723		    !subflow_hmac_valid(req, &mp_opt) ||
    724		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
    725			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
    726			fallback = true;
    727		}
    728	}
    729
    730create_child:
    731	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
    732						     req_unhash, own_req);
    733
    734	if (child && *own_req) {
    735		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
    736
    737		tcp_rsk(req)->drop_req = false;
    738
     739		/* we need to fall back on ctx allocation failure and on the pre-reqs
     740		 * checking above. In the latter scenario we additionally need
     741		 * to reset the context to non-MPTCP status.
    742		 */
    743		if (!ctx || fallback) {
    744			if (fallback_is_fatal) {
    745				subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
    746				goto dispose_child;
    747			}
    748
    749			subflow_drop_ctx(child);
    750			goto out;
    751		}
    752
    753		/* ssk inherits options of listener sk */
    754		ctx->setsockopt_seq = listener->setsockopt_seq;
    755
    756		if (ctx->mp_capable) {
    757			/* this can't race with mptcp_close(), as the msk is
     758		 * not yet exposed to user-space
    759			 */
    760			inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED);
    761
    762			/* record the newly created socket as the first msk
    763			 * subflow, but don't link it yet into conn_list
    764			 */
    765			WRITE_ONCE(mptcp_sk(new_msk)->first, child);
    766
    767			/* new mpc subflow takes ownership of the newly
    768			 * created mptcp socket
    769			 */
    770			new_msk->sk_destruct = mptcp_sock_destruct;
    771			mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
    772			mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
    773			mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
    774			ctx->conn = new_msk;
    775			new_msk = NULL;
    776
    777			/* with OoO packets we can reach here without ingress
    778			 * mpc option
    779			 */
    780			if (mp_opt.suboptions & OPTIONS_MPTCP_MPC)
    781				mptcp_subflow_fully_established(ctx, &mp_opt);
    782		} else if (ctx->mp_join) {
    783			struct mptcp_sock *owner;
    784
    785			owner = subflow_req->msk;
    786			if (!owner) {
    787				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
    788				goto dispose_child;
    789			}
    790
    791			/* move the msk reference ownership to the subflow */
    792			subflow_req->msk = NULL;
    793			ctx->conn = (struct sock *)owner;
    794
    795			if (subflow_use_different_sport(owner, sk)) {
    796				pr_debug("ack inet_sport=%d %d",
    797					 ntohs(inet_sk(sk)->inet_sport),
    798					 ntohs(inet_sk((struct sock *)owner)->inet_sport));
    799				if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
    800					SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
    801					goto dispose_child;
    802				}
    803				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
    804			}
    805
    806			if (!mptcp_finish_join(child))
    807				goto dispose_child;
    808
    809			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
    810			tcp_rsk(req)->drop_req = true;
    811		}
    812	}
    813
    814out:
     815	/* dispose of the leftover mptcp master, if any */
    816	if (unlikely(new_msk))
    817		mptcp_force_close(new_msk);
    818
     819	/* check for expected invariant - should never trigger, just helps
     820	 * catch earlier subtle bugs
    821	 */
    822	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
    823		     (!mptcp_subflow_ctx(child) ||
    824		      !mptcp_subflow_ctx(child)->conn));
    825	return child;
    826
    827dispose_child:
    828	subflow_drop_ctx(child);
    829	tcp_rsk(req)->drop_req = true;
    830	inet_csk_prepare_for_destroy_sock(child);
    831	tcp_done(child);
    832	req->rsk_ops->send_reset(sk, skb);
    833
    834	/* The last child reference will be released by the caller */
    835	return child;
    836}
    837
    838static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
    839static struct proto tcp_prot_override;
    840
    841enum mapping_status {
    842	MAPPING_OK,
    843	MAPPING_INVALID,
    844	MAPPING_EMPTY,
    845	MAPPING_DATA_FIN,
    846	MAPPING_DUMMY,
    847	MAPPING_BAD_CSUM
    848};
    849
    850static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
    851{
    852	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
    853		 ssn, subflow->map_subflow_seq, subflow->map_data_len);
    854}
    855
    856static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
    857{
    858	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    859	unsigned int skb_consumed;
    860
    861	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
    862	if (WARN_ON_ONCE(skb_consumed >= skb->len))
    863		return true;
    864
    865	return skb->len - skb_consumed <= subflow->map_data_len -
    866					  mptcp_subflow_get_map_offset(subflow);
    867}
    868
    869static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
    870{
    871	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    872	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
    873
    874	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
    875		/* Mapping covers data later in the subflow stream,
    876		 * currently unsupported.
    877		 */
    878		dbg_bad_map(subflow, ssn);
    879		return false;
    880	}
    881	if (unlikely(!before(ssn, subflow->map_subflow_seq +
    882				  subflow->map_data_len))) {
     883		/* Mapping only covers past subflow data, invalid */
    884		dbg_bad_map(subflow, ssn);
    885		return false;
    886	}
    887	return true;
    888}
    889
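/* Accumulate the DSS checksum over the receive queue; returns MAPPING_OK
 * once the whole mapping is covered and the checksum matches,
 * MAPPING_EMPTY when more data is needed, or MAPPING_BAD_CSUM on mismatch.
 */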
    890static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
    891					      bool csum_reqd)
    892{
    893	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    894	u32 offset, seq, delta;
    895	__sum16 csum;
    896	int len;
    897
    898	if (!csum_reqd)
    899		return MAPPING_OK;
    900
    901	/* mapping already validated on previous traversal */
    902	if (subflow->map_csum_len == subflow->map_data_len)
    903		return MAPPING_OK;
    904
    905	/* traverse the receive queue, ensuring it contains a full
    906	 * DSS mapping and accumulating the related csum.
     907	 * Preserve the accumulated csum across multiple calls, to compute
    908	 * the csum only once
    909	 */
    910	delta = subflow->map_data_len - subflow->map_csum_len;
    911	for (;;) {
    912		seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
    913		offset = seq - TCP_SKB_CB(skb)->seq;
    914
    915		/* if the current skb has not been accounted yet, csum its contents
    916		 * up to the amount covered by the current DSS
    917		 */
    918		if (offset < skb->len) {
    919			__wsum csum;
    920
    921			len = min(skb->len - offset, delta);
    922			csum = skb_checksum(skb, offset, len, 0);
    923			subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
    924								subflow->map_csum_len);
    925
    926			delta -= len;
    927			subflow->map_csum_len += len;
    928		}
    929		if (delta == 0)
    930			break;
    931
    932		if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
    933			/* if this subflow is closed, the partial mapping
     934			/* will never be completed; flush the pending skbs, so
    935			 * that subflow_sched_work_if_closed() can kick in
    936			 */
    937			if (unlikely(ssk->sk_state == TCP_CLOSE))
    938				while ((skb = skb_peek(&ssk->sk_receive_queue)))
    939					sk_eat_skb(ssk, skb);
    940
    941			/* not enough data to validate the csum */
    942			return MAPPING_EMPTY;
    943		}
    944
    945		/* the DSS mapping for next skbs will be validated later,
     946		 * when a later get_mapping_status call processes those skbs
    947		 */
    948		skb = skb->next;
    949	}
    950
     951	/* note that 'map_data_len' accounts only for the carried data and does
     952	 * not include the eventual seq increment due to the DATA_FIN,
     953	 * while the pseudo header requires the original DSS data length,
     954	 * including it
    955	 */
    956	csum = __mptcp_make_csum(subflow->map_seq,
    957				 subflow->map_subflow_seq,
    958				 subflow->map_data_len + subflow->map_data_fin,
    959				 subflow->map_data_csum);
    960	if (unlikely(csum)) {
    961		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
    962		return MAPPING_BAD_CSUM;
    963	}
    964
    965	subflow->valid_csum_seen = 1;
    966	return MAPPING_OK;
    967}
    968
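/* Parse the DSS/MPTCP extension of the skb at the head of the receive
 * queue and update the subflow mapping state accordingly.
 */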
    969static enum mapping_status get_mapping_status(struct sock *ssk,
    970					      struct mptcp_sock *msk)
    971{
    972	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    973	bool csum_reqd = READ_ONCE(msk->csum_enabled);
    974	struct mptcp_ext *mpext;
    975	struct sk_buff *skb;
    976	u16 data_len;
    977	u64 map_seq;
    978
    979	skb = skb_peek(&ssk->sk_receive_queue);
    980	if (!skb)
    981		return MAPPING_EMPTY;
    982
    983	if (mptcp_check_fallback(ssk))
    984		return MAPPING_DUMMY;
    985
    986	mpext = mptcp_get_ext(skb);
    987	if (!mpext || !mpext->use_map) {
    988		if (!subflow->map_valid && !skb->len) {
     989			/* the TCP stack delivers 0-len FIN pkts to the receive
     990			 * queue; those are the only 0-len pkts ever expected here,
     991			 * and we can admit no mapping only for 0-len pkts
    992			 */
    993			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
    994				WARN_ONCE(1, "0len seq %d:%d flags %x",
    995					  TCP_SKB_CB(skb)->seq,
    996					  TCP_SKB_CB(skb)->end_seq,
    997					  TCP_SKB_CB(skb)->tcp_flags);
    998			sk_eat_skb(ssk, skb);
    999			return MAPPING_EMPTY;
   1000		}
   1001
   1002		if (!subflow->map_valid)
   1003			return MAPPING_INVALID;
   1004
   1005		goto validate_seq;
   1006	}
   1007
   1008	trace_get_mapping_status(mpext);
   1009
   1010	data_len = mpext->data_len;
   1011	if (data_len == 0) {
   1012		pr_debug("infinite mapping received");
   1013		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
   1014		subflow->map_data_len = 0;
   1015		return MAPPING_INVALID;
   1016	}
   1017
   1018	if (mpext->data_fin == 1) {
   1019		if (data_len == 1) {
   1020			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
   1021								 mpext->dsn64);
   1022			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
   1023			if (subflow->map_valid) {
   1024				/* A DATA_FIN might arrive in a DSS
   1025				 * option before the previous mapping
   1026				 * has been fully consumed. Continue
   1027				 * handling the existing mapping.
   1028				 */
   1029				skb_ext_del(skb, SKB_EXT_MPTCP);
   1030				return MAPPING_OK;
   1031			} else {
   1032				if (updated && schedule_work(&msk->work))
   1033					sock_hold((struct sock *)msk);
   1034
   1035				return MAPPING_DATA_FIN;
   1036			}
   1037		} else {
   1038			u64 data_fin_seq = mpext->data_seq + data_len - 1;
   1039
   1040			/* If mpext->data_seq is a 32-bit value, data_fin_seq
   1041			 * must also be limited to 32 bits.
   1042			 */
   1043			if (!mpext->dsn64)
   1044				data_fin_seq &= GENMASK_ULL(31, 0);
   1045
   1046			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
   1047			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
   1048				 data_fin_seq, mpext->dsn64);
   1049		}
   1050
   1051		/* Adjust for DATA_FIN using 1 byte of sequence space */
   1052		data_len--;
   1053	}
   1054
   1055	map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
   1056	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
   1057
   1058	if (subflow->map_valid) {
   1059		/* Allow replacing only with an identical map */
   1060		if (subflow->map_seq == map_seq &&
   1061		    subflow->map_subflow_seq == mpext->subflow_seq &&
   1062		    subflow->map_data_len == data_len &&
   1063		    subflow->map_csum_reqd == mpext->csum_reqd) {
   1064			skb_ext_del(skb, SKB_EXT_MPTCP);
   1065			goto validate_csum;
   1066		}
   1067
    1068		/* If this skb's data is fully covered by the current mapping,
   1069		 * the new map would need caching, which is not supported
   1070		 */
   1071		if (skb_is_fully_mapped(ssk, skb)) {
   1072			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
   1073			return MAPPING_INVALID;
   1074		}
   1075
   1076		/* will validate the next map after consuming the current one */
   1077		goto validate_csum;
   1078	}
   1079
   1080	subflow->map_seq = map_seq;
   1081	subflow->map_subflow_seq = mpext->subflow_seq;
   1082	subflow->map_data_len = data_len;
   1083	subflow->map_valid = 1;
   1084	subflow->map_data_fin = mpext->data_fin;
   1085	subflow->mpc_map = mpext->mpc_map;
   1086	subflow->map_csum_reqd = mpext->csum_reqd;
   1087	subflow->map_csum_len = 0;
   1088	subflow->map_data_csum = csum_unfold(mpext->csum);
   1089
    1090	/* Cf. RFC 8684 Section 3.3 */
   1091	if (unlikely(subflow->map_csum_reqd != csum_reqd))
   1092		return MAPPING_INVALID;
   1093
   1094	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
   1095		 subflow->map_seq, subflow->map_subflow_seq,
   1096		 subflow->map_data_len, subflow->map_csum_reqd,
   1097		 subflow->map_data_csum);
   1098
   1099validate_seq:
    1100	/* we revalidate the valid mapping on each new skb, because we must ensure
   1101	 * the current skb is completely covered by the available mapping
   1102	 */
   1103	if (!validate_mapping(ssk, skb)) {
   1104		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
   1105		return MAPPING_INVALID;
   1106	}
   1107
   1108	skb_ext_del(skb, SKB_EXT_MPTCP);
   1109
   1110validate_csum:
   1111	return validate_data_csum(ssk, skb, csum_reqd);
   1112}
   1113
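/* Drop up to 'limit' bytes of data that the MPTCP level has already
 * received (duplicate data), advancing copied_seq as if it had been read.
 */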
   1114static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
   1115				       u64 limit)
   1116{
   1117	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
   1118	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
   1119	u32 incr;
   1120
   1121	incr = limit >= skb->len ? skb->len + fin : limit;
   1122
   1123	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
   1124		 subflow->map_subflow_seq);
   1125	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
   1126	tcp_sk(ssk)->copied_seq += incr;
   1127	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
   1128		sk_eat_skb(ssk, skb);
   1129	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
   1130		subflow->map_valid = 0;
   1131}
   1132
    1133/* schedule the mptcp worker to remove the subflow if no more data is pending */
   1134static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
   1135{
   1136	struct sock *sk = (struct sock *)msk;
   1137
   1138	if (likely(ssk->sk_state != TCP_CLOSE))
   1139		return;
   1140
   1141	if (skb_queue_empty(&ssk->sk_receive_queue) &&
   1142	    !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) {
   1143		sock_hold(sk);
   1144		if (!schedule_work(&msk->work))
   1145			sock_put(sk);
   1146	}
   1147}
   1148
   1149static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
   1150{
   1151	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
   1152
   1153	if (subflow->mp_join)
   1154		return false;
   1155	else if (READ_ONCE(msk->csum_enabled))
   1156		return !subflow->valid_csum_seen;
   1157	else
   1158		return !subflow->fully_established;
   1159}
   1160
   1161static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
   1162{
   1163	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
   1164	unsigned long fail_tout;
   1165
    1166	/* graceful failure can happen only on the MPC subflow */
   1167	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
   1168		return;
   1169
    1170	/* since the close timeout takes precedence over the fail one,
   1171	 * no need to start the latter when the first is already set
   1172	 */
   1173	if (sock_flag((struct sock *)msk, SOCK_DEAD))
   1174		return;
   1175
    1176	/* we don't need extreme accuracy here; a zero fail_tout is a special
    1177	 * value meaning no fail timeout at all, so avoid it.
   1178	 */
   1179	fail_tout = jiffies + TCP_RTO_MAX;
   1180	if (!fail_tout)
   1181		fail_tout = 1;
   1182	WRITE_ONCE(subflow->fail_tout, fail_tout);
   1183	tcp_send_ack(ssk);
   1184
   1185	mptcp_reset_timeout(msk, subflow->fail_tout);
   1186}
   1187
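/* Core of the subflow receive path: validate the current mapping, handle
 * fallback and MP_FAIL/reset on checksum errors, and report whether data
 * is available for the MPTCP-level socket.
 */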
   1188static bool subflow_check_data_avail(struct sock *ssk)
   1189{
   1190	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
   1191	enum mapping_status status;
   1192	struct mptcp_sock *msk;
   1193	struct sk_buff *skb;
   1194
   1195	if (!skb_peek(&ssk->sk_receive_queue))
   1196		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
   1197	if (subflow->data_avail)
   1198		return true;
   1199
   1200	msk = mptcp_sk(subflow->conn);
   1201	for (;;) {
   1202		u64 ack_seq;
   1203		u64 old_ack;
   1204
   1205		status = get_mapping_status(ssk, msk);
   1206		trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
   1207		if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
   1208			     status == MAPPING_BAD_CSUM))
   1209			goto fallback;
   1210
   1211		if (status != MAPPING_OK)
   1212			goto no_data;
   1213
   1214		skb = skb_peek(&ssk->sk_receive_queue);
   1215		if (WARN_ON_ONCE(!skb))
   1216			goto no_data;
   1217
   1218		/* if msk lacks the remote key, this subflow must provide an
   1219		 * MP_CAPABLE-based mapping
   1220		 */
   1221		if (unlikely(!READ_ONCE(msk->can_ack))) {
   1222			if (!subflow->mpc_map)
   1223				goto fallback;
   1224			WRITE_ONCE(msk->remote_key, subflow->remote_key);
   1225			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
   1226			WRITE_ONCE(msk->can_ack, true);
   1227		}
   1228
   1229		old_ack = READ_ONCE(msk->ack_seq);
   1230		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
   1231		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
   1232			 ack_seq);
   1233		if (unlikely(before64(ack_seq, old_ack))) {
   1234			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
   1235			continue;
   1236		}
   1237
   1238		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
   1239		break;
   1240	}
   1241	return true;
   1242
   1243no_data:
   1244	subflow_sched_work_if_closed(msk, ssk);
   1245	return false;
   1246
   1247fallback:
   1248	if (!__mptcp_check_fallback(msk)) {
   1249		/* RFC 8684 section 3.7. */
   1250		if (status == MAPPING_BAD_CSUM &&
   1251		    (subflow->mp_join || subflow->valid_csum_seen)) {
   1252			subflow->send_mp_fail = 1;
   1253
   1254			if (!READ_ONCE(msk->allow_infinite_fallback)) {
   1255				subflow->reset_transient = 0;
   1256				subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
   1257				goto reset;
   1258			}
   1259			mptcp_subflow_fail(msk, ssk);
   1260			WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
   1261			return true;
   1262		}
   1263
   1264		if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
   1265			/* fatal protocol error, close the socket.
   1266			 * subflow_error_report() will introduce the appropriate barriers
   1267			 */
   1268			subflow->reset_transient = 0;
   1269			subflow->reset_reason = MPTCP_RST_EMPTCP;
   1270
   1271reset:
   1272			ssk->sk_err = EBADMSG;
   1273			tcp_set_state(ssk, TCP_CLOSE);
   1274			while ((skb = skb_peek(&ssk->sk_receive_queue)))
   1275				sk_eat_skb(ssk, skb);
   1276			tcp_send_active_reset(ssk, GFP_ATOMIC);
   1277			WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
   1278			return false;
   1279		}
   1280
   1281		mptcp_do_fallback(ssk);
   1282	}
   1283
   1284	skb = skb_peek(&ssk->sk_receive_queue);
   1285	subflow->map_valid = 1;
   1286	subflow->map_seq = READ_ONCE(msk->ack_seq);
   1287	subflow->map_data_len = skb->len;
   1288	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
   1289	WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
   1290	return true;
   1291}
   1292
   1293bool mptcp_subflow_data_available(struct sock *sk)
   1294{
   1295	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
   1296
   1297	/* check if current mapping is still valid */
   1298	if (subflow->map_valid &&
   1299	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
   1300		subflow->map_valid = 0;
   1301		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
   1302
   1303		pr_debug("Done with mapping: seq=%u data_len=%u",
   1304			 subflow->map_subflow_seq,
   1305			 subflow->map_data_len);
   1306	}
   1307
   1308	return subflow_check_data_avail(sk);
   1309}
   1310
   1311/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
   1312 * not the ssk one.
   1313 *
   1314 * In mptcp, rwin is about the mptcp-level connection data.
   1315 *
   1316 * Data that is still on the ssk rx queue can thus be ignored,
    1317 * as far as the mptcp peer is concerned, that data is still in flight.
   1318 * DSS ACK is updated when skb is moved to the mptcp rx queue.
   1319 */
   1320void mptcp_space(const struct sock *ssk, int *space, int *full_space)
   1321{
   1322	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
   1323	const struct sock *sk = subflow->conn;
   1324
   1325	*space = __mptcp_space(sk);
   1326	*full_space = tcp_full_space(sk);
   1327}
   1328
   1329void __mptcp_error_report(struct sock *sk)
   1330{
   1331	struct mptcp_subflow_context *subflow;
   1332	struct mptcp_sock *msk = mptcp_sk(sk);
   1333
   1334	mptcp_for_each_subflow(msk, subflow) {
   1335		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
   1336		int err = sock_error(ssk);
   1337
   1338		if (!err)
   1339			continue;
   1340
   1341		/* only propagate errors on fallen-back sockets or
   1342		 * on MPC connect
   1343		 */
   1344		if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
   1345			continue;
   1346
   1347		inet_sk_state_store(sk, inet_sk_state_load(ssk));
   1348		sk->sk_err = -err;
   1349
   1350		/* This barrier is coupled with smp_rmb() in mptcp_poll() */
   1351		smp_wmb();
   1352		sk_error_report(sk);
   1353		break;
   1354	}
   1355}
   1356
   1357static void subflow_error_report(struct sock *ssk)
   1358{
   1359	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
   1360
   1361	mptcp_data_lock(sk);
   1362	if (!sock_owned_by_user(sk))
   1363		__mptcp_error_report(sk);
   1364	else
   1365		__set_bit(MPTCP_ERROR_REPORT,  &mptcp_sk(sk)->cb_flags);
   1366	mptcp_data_unlock(sk);
   1367}
   1368
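/* Data-ready callback installed on the subflow: wake the listener for new
 * connection requests, otherwise propagate data availability (or errors)
 * to the MPTCP-level parent socket.
 */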
   1369static void subflow_data_ready(struct sock *sk)
   1370{
   1371	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
   1372	u16 state = 1 << inet_sk_state_load(sk);
   1373	struct sock *parent = subflow->conn;
   1374	struct mptcp_sock *msk;
   1375
   1376	msk = mptcp_sk(parent);
   1377	if (state & TCPF_LISTEN) {
    1378		/* MPJ subflows are removed from the accept queue before reaching here,
   1379		 * avoid stray wakeups
   1380		 */
   1381		if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
   1382			return;
   1383
   1384		parent->sk_data_ready(parent);
   1385		return;
   1386	}
   1387
   1388	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
   1389		     !subflow->mp_join && !(state & TCPF_CLOSE));
   1390
   1391	if (mptcp_subflow_data_available(sk))
   1392		mptcp_data_ready(parent, sk);
   1393	else if (unlikely(sk->sk_err))
   1394		subflow_error_report(sk);
   1395}
   1396
   1397static void subflow_write_space(struct sock *ssk)
   1398{
   1399	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
   1400
   1401	mptcp_propagate_sndbuf(sk, ssk);
   1402	mptcp_write_space(sk);
   1403}
   1404
   1405static const struct inet_connection_sock_af_ops *
   1406subflow_default_af_ops(struct sock *sk)
   1407{
   1408#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1409	if (sk->sk_family == AF_INET6)
   1410		return &subflow_v6_specific;
   1411#endif
   1412	return &subflow_specific;
   1413}
   1414
   1415#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1416void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
   1417{
   1418	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
   1419	struct inet_connection_sock *icsk = inet_csk(sk);
   1420	const struct inet_connection_sock_af_ops *target;
   1421
   1422	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
   1423
   1424	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
   1425		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
   1426
   1427	if (likely(icsk->icsk_af_ops == target))
   1428		return;
   1429
   1430	subflow->icsk_af_ops = icsk->icsk_af_ops;
   1431	icsk->icsk_af_ops = target;
   1432}
   1433#endif
   1434
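/* Convert an mptcp_addr_info into a sockaddr_storage of the given family,
 * mapping IPv4 addresses into v4-mapped IPv6 ones when needed.
 */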
   1435void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
   1436			 struct sockaddr_storage *addr,
   1437			 unsigned short family)
   1438{
   1439	memset(addr, 0, sizeof(*addr));
   1440	addr->ss_family = family;
   1441	if (addr->ss_family == AF_INET) {
   1442		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
   1443
   1444		if (info->family == AF_INET)
   1445			in_addr->sin_addr = info->addr;
   1446#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1447		else if (ipv6_addr_v4mapped(&info->addr6))
   1448			in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
   1449#endif
   1450		in_addr->sin_port = info->port;
   1451	}
   1452#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1453	else if (addr->ss_family == AF_INET6) {
   1454		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
   1455
   1456		if (info->family == AF_INET)
   1457			ipv6_addr_set_v4mapped(info->addr.s_addr,
   1458					       &in6_addr->sin6_addr);
   1459		else
   1460			in6_addr->sin6_addr = info->addr6;
   1461		in6_addr->sin6_port = info->port;
   1462	}
   1463#endif
   1464}
   1465
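/* Create, bind and connect an additional outgoing (MP_JOIN) subflow from
 * the local address 'loc' towards 'remote'.
 */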
   1466int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
   1467			    const struct mptcp_addr_info *remote)
   1468{
   1469	struct mptcp_sock *msk = mptcp_sk(sk);
   1470	struct mptcp_subflow_context *subflow;
   1471	struct sockaddr_storage addr;
   1472	int remote_id = remote->id;
   1473	int local_id = loc->id;
   1474	int err = -ENOTCONN;
   1475	struct socket *sf;
   1476	struct sock *ssk;
   1477	u32 remote_token;
   1478	int addrlen;
   1479	int ifindex;
   1480	u8 flags;
   1481
   1482	if (!mptcp_is_fully_established(sk))
   1483		goto err_out;
   1484
   1485	err = mptcp_subflow_create_socket(sk, &sf);
   1486	if (err)
   1487		goto err_out;
   1488
   1489	ssk = sf->sk;
   1490	subflow = mptcp_subflow_ctx(ssk);
   1491	do {
   1492		get_random_bytes(&subflow->local_nonce, sizeof(u32));
   1493	} while (!subflow->local_nonce);
   1494
   1495	if (local_id)
   1496		subflow_set_local_id(subflow, local_id);
   1497
   1498	mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
   1499					     &flags, &ifindex);
   1500	subflow->remote_key = msk->remote_key;
   1501	subflow->local_key = msk->local_key;
   1502	subflow->token = msk->token;
   1503	mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
   1504
   1505	addrlen = sizeof(struct sockaddr_in);
   1506#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1507	if (addr.ss_family == AF_INET6)
   1508		addrlen = sizeof(struct sockaddr_in6);
   1509#endif
   1510	mptcp_sockopt_sync(msk, ssk);
   1511
   1512	ssk->sk_bound_dev_if = ifindex;
   1513	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
   1514	if (err)
   1515		goto failed;
   1516
   1517	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
   1518	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
   1519		 remote_token, local_id, remote_id);
   1520	subflow->remote_token = remote_token;
   1521	subflow->remote_id = remote_id;
   1522	subflow->request_join = 1;
   1523	subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
   1524	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
   1525
   1526	sock_hold(ssk);
   1527	list_add_tail(&subflow->node, &msk->conn_list);
   1528	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
   1529	if (err && err != -EINPROGRESS)
   1530		goto failed_unlink;
   1531
   1532	/* discard the subflow socket */
   1533	mptcp_sock_graft(ssk, sk->sk_socket);
   1534	iput(SOCK_INODE(sf));
   1535	WRITE_ONCE(msk->allow_infinite_fallback, false);
   1536	return err;
   1537
   1538failed_unlink:
   1539	list_del(&subflow->node);
   1540	sock_put(mptcp_subflow_tcp_sock(subflow));
   1541
   1542failed:
   1543	subflow->disposable = 1;
   1544	sock_release(sf);
   1545
   1546err_out:
    1547	/* we account subflows before the creation, and these failures will not
   1548	 * be caught by sk_state_change()
   1549	 */
   1550	mptcp_pm_close_subflow(msk);
   1551	return err;
   1552}
   1553
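/* Additional subflows are created from a kernel worker: move the new
 * socket into the same cgroup (and memcg) as the MPTCP parent.
 */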
   1554static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
   1555{
   1556#ifdef CONFIG_SOCK_CGROUP_DATA
   1557	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
   1558				*child_skcd = &child->sk_cgrp_data;
   1559
   1560	/* only the additional subflows created by kworkers have to be modified */
   1561	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
   1562	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
   1563#ifdef CONFIG_MEMCG
   1564		struct mem_cgroup *memcg = parent->sk_memcg;
   1565
   1566		mem_cgroup_sk_free(child);
   1567		if (memcg && css_tryget(&memcg->css))
   1568			child->sk_memcg = memcg;
   1569#endif /* CONFIG_MEMCG */
   1570
   1571		cgroup_sk_free(child_skcd);
   1572		*child_skcd = *parent_skcd;
   1573		cgroup_sk_clone(child_skcd);
   1574	}
   1575#endif /* CONFIG_SOCK_CGROUP_DATA */
   1576}
   1577
   1578static void mptcp_subflow_ops_override(struct sock *ssk)
   1579{
   1580#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1581	if (ssk->sk_prot == &tcpv6_prot)
   1582		ssk->sk_prot = &tcpv6_prot_override;
   1583	else
   1584#endif
   1585		ssk->sk_prot = &tcp_prot_override;
   1586}
   1587
   1588static void mptcp_subflow_ops_undo_override(struct sock *ssk)
   1589{
   1590#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1591	if (ssk->sk_prot == &tcpv6_prot_override)
   1592		ssk->sk_prot = &tcpv6_prot;
   1593	else
   1594#endif
   1595		ssk->sk_prot = &tcp_prot;
   1596}
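
/* Allocate a kernel TCP socket for a new subflow, attach the "mptcp" ULP
 * and tie it to the owning MPTCP socket.
 */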
   1597int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
   1598{
   1599	struct mptcp_subflow_context *subflow;
   1600	struct net *net = sock_net(sk);
   1601	struct socket *sf;
   1602	int err;
   1603
   1604	/* un-accepted server sockets can reach here - on bad configuration
   1605	 * bail early to avoid greater trouble later
   1606	 */
   1607	if (unlikely(!sk->sk_socket))
   1608		return -EINVAL;
   1609
   1610	err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
   1611			       &sf);
   1612	if (err)
   1613		return err;
   1614
   1615	lock_sock(sf->sk);
   1616
   1617	/* the newly created socket has to be in the same cgroup as its parent */
   1618	mptcp_attach_cgroup(sk, sf->sk);
   1619
    1620	/* kernel sockets do not by default acquire a net ref, but the TCP timer
   1621	 * needs it.
   1622	 */
   1623	sf->sk->sk_net_refcnt = 1;
   1624	get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
   1625	sock_inuse_add(net, 1);
   1626	err = tcp_set_ulp(sf->sk, "mptcp");
   1627	release_sock(sf->sk);
   1628
   1629	if (err) {
   1630		sock_release(sf);
   1631		return err;
   1632	}
   1633
   1634	/* the newly created socket really belongs to the owning MPTCP master
   1635	 * socket, even if for additional subflows the allocation is performed
   1636	 * by a kernel workqueue. Adjust inode references, so that the
    1637	 * procfs/diag interfaces really show this one belonging to the correct
   1638	 * user.
   1639	 */
   1640	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
   1641	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
   1642	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
   1643
   1644	subflow = mptcp_subflow_ctx(sf->sk);
   1645	pr_debug("subflow=%p", subflow);
   1646
   1647	*new_sock = sf;
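        	/* the subflow ctx holds a reference to the msk for its whole
        	 * lifetime, paired with the sock_put() in subflow_ulp_release()
        	 */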
   1648	sock_hold(sk);
   1649	subflow->conn = sk;
   1650	mptcp_subflow_ops_override(sf->sk);
   1651
   1652	return 0;
   1653}
   1654
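        /* allocate the per-subflow context and publish it through the ULP
         * data pointer (icsk_ulp_data) of the TCP socket
         */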
   1655static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
   1656							gfp_t priority)
   1657{
   1658	struct inet_connection_sock *icsk = inet_csk(sk);
   1659	struct mptcp_subflow_context *ctx;
   1660
   1661	ctx = kzalloc(sizeof(*ctx), priority);
   1662	if (!ctx)
   1663		return NULL;
   1664
   1665	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
   1666	INIT_LIST_HEAD(&ctx->node);
   1667	INIT_LIST_HEAD(&ctx->delegated_node);
   1668
   1669	pr_debug("subflow=%p", ctx);
   1670
   1671	ctx->tcp_sock = sk;
   1672
   1673	return ctx;
   1674}
   1675
   1676static void __subflow_state_change(struct sock *sk)
   1677{
   1678	struct socket_wq *wq;
   1679
   1680	rcu_read_lock();
   1681	wq = rcu_dereference(sk->sk_wq);
   1682	if (skwq_has_sleeper(wq))
   1683		wake_up_interruptible_all(&wq->wait);
   1684	rcu_read_unlock();
   1685}
   1686
   1687static bool subflow_is_done(const struct sock *sk)
   1688{
   1689	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
   1690}
   1691
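        /* sk_state_change callback installed on subflow sockets: wake any
         * waiters, handle the simultaneous-connect fallback case and
         * propagate data-available / EOF events to the msk
         */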
   1692static void subflow_state_change(struct sock *sk)
   1693{
   1694	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
   1695	struct sock *parent = subflow->conn;
   1696
   1697	__subflow_state_change(sk);
   1698
   1699	if (subflow_simultaneous_connect(sk)) {
   1700		mptcp_propagate_sndbuf(parent, sk);
   1701		mptcp_do_fallback(sk);
   1702		mptcp_rcv_space_init(mptcp_sk(parent), sk);
   1703		pr_fallback(mptcp_sk(parent));
   1704		subflow->conn_finished = 1;
   1705		mptcp_set_connected(parent);
   1706	}
   1707
    1708	/* as recvmsg() does not acquire the subflow socket for ssk selection,
    1709	 * a FIN packet carrying a DSS can go unnoticed if we don't trigger
    1710	 * the data available machinery here.
   1711	 */
   1712	if (mptcp_subflow_data_available(sk))
   1713		mptcp_data_ready(parent, sk);
   1714	else if (unlikely(sk->sk_err))
   1715		subflow_error_report(sk);
   1716
   1717	subflow_sched_work_if_closed(mptcp_sk(parent), sk);
   1718
   1719	if (__mptcp_check_fallback(mptcp_sk(parent)) &&
   1720	    !subflow->rx_eof && subflow_is_done(sk)) {
   1721		subflow->rx_eof = 1;
   1722		mptcp_subflow_eof(parent);
   1723	}
   1724}
   1725
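        /* walk the listener's accept queue and detach every still-unaccepted
         * MPTCP socket from the listener subflow (clear msk->first); the
         * listener lock is temporarily released while the msk locks are
         * taken, to avoid ABBA deadlock
         */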
   1726void mptcp_subflow_queue_clean(struct sock *listener_ssk)
   1727{
   1728	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
   1729	struct mptcp_sock *msk, *next, *head = NULL;
   1730	struct request_sock *req;
   1731
   1732	/* build a list of all unaccepted mptcp sockets */
   1733	spin_lock_bh(&queue->rskq_lock);
   1734	for (req = queue->rskq_accept_head; req; req = req->dl_next) {
   1735		struct mptcp_subflow_context *subflow;
   1736		struct sock *ssk = req->sk;
   1737		struct mptcp_sock *msk;
   1738
   1739		if (!sk_is_mptcp(ssk))
   1740			continue;
   1741
   1742		subflow = mptcp_subflow_ctx(ssk);
   1743		if (!subflow || !subflow->conn)
   1744			continue;
   1745
   1746		/* skip if already in list */
   1747		msk = mptcp_sk(subflow->conn);
   1748		if (msk->dl_next || msk == head)
   1749			continue;
   1750
   1751		msk->dl_next = head;
   1752		head = msk;
   1753	}
   1754	spin_unlock_bh(&queue->rskq_lock);
   1755	if (!head)
   1756		return;
   1757
    1758	/* we can't acquire the msk socket lock under the subflow one,
    1759	 * or it would cause an ABBA deadlock
   1760	 */
   1761	release_sock(listener_ssk);
   1762
   1763	for (msk = head; msk; msk = next) {
   1764		struct sock *sk = (struct sock *)msk;
   1765		bool slow;
   1766
   1767		slow = lock_sock_fast_nested(sk);
   1768		next = msk->dl_next;
   1769		msk->first = NULL;
   1770		msk->dl_next = NULL;
   1771		unlock_sock_fast(sk, slow);
   1772	}
   1773
   1774	/* we are still under the listener msk socket lock */
   1775	lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
   1776}
   1777
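        /* ULP init hook, reached via tcp_set_ulp(ssk, "mptcp"): allocate the
         * subflow context and redirect the socket callbacks to their
         * MPTCP-aware variants, saving the originals so they can be chained
         */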
   1778static int subflow_ulp_init(struct sock *sk)
   1779{
   1780	struct inet_connection_sock *icsk = inet_csk(sk);
   1781	struct mptcp_subflow_context *ctx;
   1782	struct tcp_sock *tp = tcp_sk(sk);
   1783	int err = 0;
   1784
   1785	/* disallow attaching ULP to a socket unless it has been
   1786	 * created with sock_create_kern()
   1787	 */
   1788	if (!sk->sk_kern_sock) {
   1789		err = -EOPNOTSUPP;
   1790		goto out;
   1791	}
   1792
   1793	ctx = subflow_create_ctx(sk, GFP_KERNEL);
   1794	if (!ctx) {
   1795		err = -ENOMEM;
   1796		goto out;
   1797	}
   1798
   1799	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
   1800
   1801	tp->is_mptcp = 1;
   1802	ctx->icsk_af_ops = icsk->icsk_af_ops;
   1803	icsk->icsk_af_ops = subflow_default_af_ops(sk);
   1804	ctx->tcp_state_change = sk->sk_state_change;
   1805	ctx->tcp_error_report = sk->sk_error_report;
   1806
   1807	WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
   1808	WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);
   1809
   1810	sk->sk_data_ready = subflow_data_ready;
   1811	sk->sk_write_space = subflow_write_space;
   1812	sk->sk_state_change = subflow_state_change;
   1813	sk->sk_error_report = subflow_error_report;
   1814out:
   1815	return err;
   1816}
   1817
   1818static void subflow_ulp_release(struct sock *ssk)
   1819{
   1820	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
   1821	bool release = true;
   1822	struct sock *sk;
   1823
   1824	if (!ctx)
   1825		return;
   1826
   1827	sk = ctx->conn;
   1828	if (sk) {
   1829		/* if the msk has been orphaned, keep the ctx
    1830		 * alive; it will be freed by __mptcp_close_ssk()
    1831		 * when the subflow is still unaccepted
   1832		 */
   1833		release = ctx->disposable || list_empty(&ctx->node);
   1834		sock_put(sk);
   1835	}
   1836
   1837	mptcp_subflow_ops_undo_override(ssk);
   1838	if (release)
   1839		kfree_rcu(ctx, rcu);
   1840}
   1841
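        /* ULP clone hook for passively created sockets: carry the MPTCP state
         * gathered on the request socket (keys, token, sequence offsets) over
         * to the child's subflow context, or fall back to plain TCP when the
         * request negotiated no MPTCP options
         */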
   1842static void subflow_ulp_clone(const struct request_sock *req,
   1843			      struct sock *newsk,
   1844			      const gfp_t priority)
   1845{
   1846	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
   1847	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
   1848	struct mptcp_subflow_context *new_ctx;
   1849
   1850	if (!tcp_rsk(req)->is_mptcp ||
   1851	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
   1852		subflow_ulp_fallback(newsk, old_ctx);
   1853		return;
   1854	}
   1855
   1856	new_ctx = subflow_create_ctx(newsk, priority);
   1857	if (!new_ctx) {
   1858		subflow_ulp_fallback(newsk, old_ctx);
   1859		return;
   1860	}
   1861
   1862	new_ctx->conn_finished = 1;
   1863	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
   1864	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
   1865	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
   1866	new_ctx->rel_write_seq = 1;
   1867	new_ctx->tcp_sock = newsk;
   1868
   1869	if (subflow_req->mp_capable) {
   1870		/* see comments in subflow_syn_recv_sock(), MPTCP connection
   1871		 * is fully established only after we receive the remote key
   1872		 */
   1873		new_ctx->mp_capable = 1;
   1874		new_ctx->local_key = subflow_req->local_key;
   1875		new_ctx->token = subflow_req->token;
   1876		new_ctx->ssn_offset = subflow_req->ssn_offset;
   1877		new_ctx->idsn = subflow_req->idsn;
   1878
   1879		/* this is the first subflow, id is always 0 */
   1880		new_ctx->local_id_valid = 1;
   1881	} else if (subflow_req->mp_join) {
   1882		new_ctx->ssn_offset = subflow_req->ssn_offset;
   1883		new_ctx->mp_join = 1;
   1884		new_ctx->fully_established = 1;
   1885		new_ctx->backup = subflow_req->backup;
   1886		new_ctx->remote_id = subflow_req->remote_id;
   1887		new_ctx->token = subflow_req->token;
   1888		new_ctx->thmac = subflow_req->thmac;
   1889
   1890		/* the subflow req id is valid, fetched via subflow_check_req()
   1891		 * and subflow_token_join_request()
   1892		 */
   1893		subflow_set_local_id(new_ctx, subflow_req->local_id);
   1894	}
   1895}
   1896
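        /* release_cb used by the overridden sk_prot: flush any delegated
         * MPTCP actions for this subflow before the regular TCP release work
         */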
   1897static void tcp_release_cb_override(struct sock *ssk)
   1898{
   1899	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
   1900
   1901	if (mptcp_subflow_has_delegated_action(subflow))
   1902		mptcp_subflow_process_delegated(ssk);
   1903
   1904	tcp_release_cb(ssk);
   1905}
   1906
   1907static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
   1908	.name		= "mptcp",
   1909	.owner		= THIS_MODULE,
   1910	.init		= subflow_ulp_init,
   1911	.release	= subflow_ulp_release,
   1912	.clone		= subflow_ulp_clone,
   1913};
   1914
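        /* set up a dedicated request_sock slab for subflow requests; the ops
         * table itself is a copy of tcp_request_sock_ops with our destructor
         */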
   1915static int subflow_ops_init(struct request_sock_ops *subflow_ops)
   1916{
   1917	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
   1918	subflow_ops->slab_name = "request_sock_subflow";
   1919
   1920	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
   1921					      subflow_ops->obj_size, 0,
   1922					      SLAB_ACCOUNT |
   1923					      SLAB_TYPESAFE_BY_RCU,
   1924					      NULL);
   1925	if (!subflow_ops->slab)
   1926		return -ENOMEM;
   1927
   1928	subflow_ops->destructor = subflow_req_destructor;
   1929
   1930	return 0;
   1931}
   1932
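        /* clone the stock TCP ops tables (request_sock ops, af ops, sk_prot)
         * and override only the hooks MPTCP needs, then register the "mptcp"
         * ULP
         */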
   1933void __init mptcp_subflow_init(void)
   1934{
   1935	mptcp_subflow_request_sock_ops = tcp_request_sock_ops;
   1936	if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0)
   1937		panic("MPTCP: failed to init subflow request sock ops\n");
   1938
   1939	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
   1940	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
   1941
   1942	subflow_specific = ipv4_specific;
   1943	subflow_specific.conn_request = subflow_v4_conn_request;
   1944	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
   1945	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
   1946	subflow_specific.rebuild_header = subflow_rebuild_header;
   1947
   1948	tcp_prot_override = tcp_prot;
   1949	tcp_prot_override.release_cb = tcp_release_cb_override;
   1950
   1951#if IS_ENABLED(CONFIG_MPTCP_IPV6)
   1952	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
   1953	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
   1954
   1955	subflow_v6_specific = ipv6_specific;
   1956	subflow_v6_specific.conn_request = subflow_v6_conn_request;
   1957	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
   1958	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
   1959	subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;
   1960
   1961	subflow_v6m_specific = subflow_v6_specific;
   1962	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
   1963	subflow_v6m_specific.send_check = ipv4_specific.send_check;
   1964	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
   1965	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
   1966	subflow_v6m_specific.net_frag_header_len = 0;
   1967	subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
   1968
   1969	tcpv6_prot_override = tcpv6_prot;
   1970	tcpv6_prot_override.release_cb = tcp_release_cb_override;
   1971#endif
   1972
   1973	mptcp_diag_subflow_init(&subflow_ulp_ops);
   1974
   1975	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
   1976		panic("MPTCP: failed to register subflows to ULP\n");
   1977}