From 0df48c26d8418c5c9fba63fac15b660d70ca2f1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:17 -0700 Subject: tcp: add tcpi_bytes_acked to tcp_info This patch tracks total number of bytes acked for a TCP socket. This is the sum of all changes done to tp->snd_una, and allows for precise tracking of delivered data. RFC4898 named this : tcpEStatsAppHCThruOctetsAcked This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_acked was placed near tp->snd_una for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 3b9718328d8b..6666e98a0af9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -189,6 +189,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3-71-gd317 From bdd1f9edacb5f5835d1e6276571bbbe5b88ded48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 15:28:18 -0700 Subject: tcp: add tcpi_bytes_received to tcp_info This patch tracks total number of payload bytes received on a TCP socket. This is the sum of all changes done to tp->rcv_nxt RFC4898 named this : tcpEStatsAppHCThruOctetsReceived This is a 64bit field, and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->bytes_received was placed near tp->rcv_nxt for best data locality and minimal performance impact. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Matt Mathis Cc: Eric Salo Cc: Martin Lau Cc: Chris Rapier Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 17 +++++++++++++---- 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0f73b43171da..3b2911502a8c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -145,6 +145,10 @@ struct tcp_sock { * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ + u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6666e98a0af9..a48f93f3207b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -190,6 +190,7 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bf0e8ca7b5b..99fcc0b22c92 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2666,6 +2666,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) spin_lock_bh(&sk->sk_lock.slock); info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3d87aca6be8..3c673d5e6cff 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 378d3f4d4dc3..7e6962bcfc30 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3289,6 +3289,15 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) tp->snd_una = ack; } +/* If we update tp->rcv_nxt, also update tp->bytes_received */ +static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) +{ + u32 delta = seq - tp->rcv_nxt; + + tp->bytes_received += delta; + tp->rcv_nxt = seq; +} + /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 @@ -4245,7 +4254,7 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4413,7 +4422,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; - tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); @@ -4506,7 +4515,7 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5254,7 +5263,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); - tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } -- cgit v1.2.3-71-gd317 From 64f40ff5bbdb1b679fb3c4dbc8230d6517d2b8dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 16:23:48 -0700 Subject: tcp: prepare CC get_info() access from getsockopt() We would like that optional info provided by Congestion Control modules using netlink can also be read using getsockopt() This patch changes get_info() to put this information in a buffer, instead of skb, like tcp_get_info(), so that following patch can reuse this common infrastructure. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Daniel Borkmann Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++++- include/uapi/linux/inet_diag.h | 4 ++++ net/ipv4/inet_diag.c | 8 +++++--- net/ipv4/tcp_dctcp.c | 20 ++++++++++---------- net/ipv4/tcp_illinois.c | 21 +++++++++++---------- net/ipv4/tcp_vegas.c | 19 ++++++++++--------- net/ipv4/tcp_vegas.h | 3 ++- 7 files changed, 46 insertions(+), 34 deletions(-) (limited to 'include/uapi') diff --git a/include/net/tcp.h b/include/net/tcp.h index dd7b4ea6a10c..6d204f3f9df8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -804,6 +804,8 @@ enum tcp_ca_ack_event_flags { /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 +union tcp_cc_info; + struct tcp_congestion_ops { struct list_head list; u32 key; @@ -829,7 +831,8 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us); /* get info for inet_diag (optional) */ - int (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); + size_t (*get_info)(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index d65c0a09efd3..c7093c75bdd6 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -143,4 +143,8 @@ struct tcp_dctcp_info { __u32 dctcp_ab_tot; }; +union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bb77ebdae3b3..4d32262c7502 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -224,14 +224,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT) { - int err = 0; + union tcp_cc_info info; + size_t sz = 0; + int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) - err = ca_ops->get_info(sk, ext, skb); + sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); - if (err < 0) + if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 4376016f7fa5..4c41c1287197 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) } } -static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); @@ -286,18 +287,17 @@ static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcp_dctcp_info info; - - memset(&info, 0, sizeof(info)); + memset(info, 0, sizeof(struct tcp_dctcp_info)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { - info.dctcp_enabled = 1; - info.dctcp_ce_state = (u16) ca->ce_state; - info.dctcp_alpha = ca->dctcp_alpha; - info.dctcp_ab_ecn = ca->acked_bytes_ecn; - info.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_enabled = 1; + info->dctcp.dctcp_ce_state = (u16) ca->ce_state; + info->dctcp.dctcp_alpha = ca->dctcp_alpha; + info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; + info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; } - return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + *attr = INET_DIAG_DCTCPINFO; + return sizeof(*info); } return 0; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 67476f085e48..f71002e4db0b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -300,24 +300,25 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) } /* Extract info for Tcp socket info provided via netlink. */ -static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) +static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct illinois *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = 1, - .tcpv_rttcnt = ca->cnt_rtt, - .tcpv_minrtt = ca->base_rtt, - }; + info->vegas.tcpv_enabled = 1; + info->vegas.tcpv_rttcnt = ca->cnt_rtt; + info->vegas.tcpv_minrtt = ca->base_rtt; + info->vegas.tcpv_rtt = 0; - if (info.tcpv_rttcnt > 0) { + if (info->vegas.tcpv_rttcnt > 0) { u64 t = ca->sum_rtt; - do_div(t, info.tcpv_rttcnt); - info.tcpv_rtt = t; + do_div(t, info->vegas.tcpv_rttcnt); + info->vegas.tcpv_rtt = t; } - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c71a1b8f7bde..a6cea1d5e20d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -286,18 +286,19 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } /* Extract info for Tcp socket info provided via netlink. */ -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) { const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcpvegas_info info = { - .tcpv_enabled = ca->doing_vegas_now, - .tcpv_rttcnt = ca->cntRTT, - .tcpv_rtt = ca->baseRTT, - .tcpv_minrtt = ca->minRTT, - }; - - return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); + info->vegas.tcpv_enabled = ca->doing_vegas_now, + info->vegas.tcpv_rttcnt = ca->cntRTT, + info->vegas.tcpv_rtt = ca->baseRTT, + info->vegas.tcpv_minrtt = ca->minRTT, + + *attr = INET_DIAG_VEGASINFO; + return sizeof(struct tcpvegas_info); } return 0; } diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index e8a6b33cc61d..ef9da5306c68 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); #endif /* __TCP_VEGAS_H */ -- cgit v1.2.3-71-gd317 From 6e9250f59ef9efb932c84850cd221f22c2a03c4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2015 16:23:49 -0700 Subject: tcp: add TCP_CC_INFO socket option Some Congestion Control modules can provide per flow information, but current way to get this information is to use netlink. Like TCP_INFO, let's add TCP_CC_INFO so that applications can issue a getsockopt() if they have a socket file descriptor, instead of playing complex netlink games. Sample usage would be : union tcp_cc_info info; socklen_t len = sizeof(info); if (getsockopt(fd, SOL_TCP, TCP_CC_INFO, &info, &len) == -1) Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Daniel Borkmann Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a48f93f3207b..faa72f4fa547 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -112,6 +112,7 @@ enum { #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 99fcc0b22c92..46efa03d2b11 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -252,6 +252,7 @@ #include #include #include +#include #include #include #include @@ -2739,6 +2740,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_CC_INFO: { + const struct tcp_congestion_ops *ca_ops; + union tcp_cc_info info; + size_t sz = 0; + int attr; + + if (get_user(len, optlen)) + return -EFAULT; + + ca_ops = icsk->icsk_ca_ops; + if (ca_ops && ca_ops->get_info) + sz = ca_ops->get_info(sk, ~0U, &attr, &info); + + len = min_t(unsigned int, len, sz); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; -- cgit v1.2.3-71-gd317 From c967a0873a7836c7a77bf611f1c7d3f47c554c45 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 5 May 2015 09:06:30 -0700 Subject: mpls: Move reserved label definitions Move to include/uapi/linux/mpls.h to be externally visibile. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 10 ++++++++++ net/mpls/af_mpls.c | 18 +++++++++--------- net/mpls/internal.h | 10 ---------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index bc9abfe88c9a..0fe6ea5a41d5 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -31,4 +31,14 @@ struct mpls_label { #define MPLS_LS_TTL_MASK 0x000000FF #define MPLS_LS_TTL_SHIFT 0 +/* Reserved labels */ +#define MPLS_LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ +#define MPLS_LABEL_ROUTER_ALERT 1 /* RFC3032 */ +#define MPLS_LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ +#define MPLS_LABEL_IMPLICIT_NULL 3 /* RFC3032 */ +#define MPLS_LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define MPLS_LABEL_GAL 13 /* RFC5586 */ +#define MPLS_LABEL_OAM_ALERT 14 /* RFC3429 */ +#define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ + #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 954810c76a86..b6eb7615960a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -647,7 +647,7 @@ int nla_get_labels(const struct nlattr *nla, return -EINVAL; switch (dec.label) { - case LABEL_IMPLICIT_NULL: + case MPLS_LABEL_IMPLICIT_NULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. @@ -935,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* In case the predefined labels need to be populated */ - if (limit > LABEL_IPV4_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(lo->addr_len); if (!rt0) @@ -945,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } - if (limit > LABEL_IPV6_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(lo->addr_len); if (!rt2) @@ -973,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit) memcpy(labels, old, cp_size); /* If needed set the predefined labels */ - if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) && - (limit > LABEL_IPV6_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV6_EXPLICIT_NULL], rt2); + if ((old_limit <= MPLS_LABEL_IPV6_EXPLICIT_NULL) && + (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6_EXPLICIT_NULL], rt2); rt2 = NULL; } - if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) && - (limit > LABEL_IPV4_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[LABEL_IPV4_EXPLICIT_NULL], rt0); + if ((old_limit <= MPLS_LABEL_IPV4_EXPLICIT_NULL) && + (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4_EXPLICIT_NULL], rt0); rt0 = NULL; } diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 693877d69606..b064c345042c 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,16 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H -#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ -#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */ -#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ -#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */ -#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ -#define LABEL_GAL 13 /* RFC5586 */ -#define LABEL_OAM_ALERT 14 /* RFC3429 */ -#define LABEL_EXTENSION 15 /* RFC7274 */ - - struct mpls_shim_hdr { __be32 label_stack_entry; }; -- cgit v1.2.3-71-gd317 From 78f5b899195019f71f7593c604d75ca61658eae3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 7 May 2015 08:08:51 -0700 Subject: mpls: Change reserved label names to be consistent with netbsd Since these are now visible to userspace it is nice to be consistent with BSD (sys/netmpls/mpls.h in netBSD). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 12 ++++++------ net/mpls/af_mpls.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index 0fe6ea5a41d5..139d4dd1cab8 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -32,13 +32,13 @@ struct mpls_label { #define MPLS_LS_TTL_SHIFT 0 /* Reserved labels */ -#define MPLS_LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ -#define MPLS_LABEL_ROUTER_ALERT 1 /* RFC3032 */ -#define MPLS_LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ -#define MPLS_LABEL_IMPLICIT_NULL 3 /* RFC3032 */ -#define MPLS_LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define MPLS_LABEL_IPV4NULL 0 /* RFC3032 */ +#define MPLS_LABEL_RTALERT 1 /* RFC3032 */ +#define MPLS_LABEL_IPV6NULL 2 /* RFC3032 */ +#define MPLS_LABEL_IMPLNULL 3 /* RFC3032 */ +#define MPLS_LABEL_ENTROPY 7 /* RFC6790 */ #define MPLS_LABEL_GAL 13 /* RFC5586 */ -#define MPLS_LABEL_OAM_ALERT 14 /* RFC3429 */ +#define MPLS_LABEL_OAMALERT 14 /* RFC3429 */ #define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b6eb7615960a..7b3f732269e4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -647,7 +647,7 @@ int nla_get_labels(const struct nlattr *nla, return -EINVAL; switch (dec.label) { - case MPLS_LABEL_IMPLICIT_NULL: + case MPLS_LABEL_IMPLNULL: /* RFC3032: This is a label that an LSR may * assign and distribute, but which never * actually appears in the encapsulation. @@ -935,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) } /* In case the predefined labels need to be populated */ - if (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; rt0 = mpls_rt_alloc(lo->addr_len); if (!rt0) @@ -945,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } - if (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL) { + if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; rt2 = mpls_rt_alloc(lo->addr_len); if (!rt2) @@ -973,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit) memcpy(labels, old, cp_size); /* If needed set the predefined labels */ - if ((old_limit <= MPLS_LABEL_IPV6_EXPLICIT_NULL) && - (limit > MPLS_LABEL_IPV6_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6_EXPLICIT_NULL], rt2); + if ((old_limit <= MPLS_LABEL_IPV6NULL) && + (limit > MPLS_LABEL_IPV6NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2); rt2 = NULL; } - if ((old_limit <= MPLS_LABEL_IPV4_EXPLICIT_NULL) && - (limit > MPLS_LABEL_IPV4_EXPLICIT_NULL)) { - RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4_EXPLICIT_NULL], rt0); + if ((old_limit <= MPLS_LABEL_IPV4NULL) && + (limit > MPLS_LABEL_IPV4NULL)) { + RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0); rt0 = NULL; } -- cgit v1.2.3-71-gd317 From eea39946a1f36e8a5a47c86e7ecfca6076868505 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 13 May 2015 21:17:41 -0700 Subject: rename RTNH_F_EXTERNAL to RTNH_F_OFFLOAD RTNH_F_EXTERNAL today is printed as "offload" in iproute2 output. This patch renames the flag to be consistent with what the user sees. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 +- net/ipv4/fib_trie.c | 2 +- net/switchdev/switchdev.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 974db03f7b1a..17fb02f488da 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -337,7 +337,7 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ -#define RTNH_F_EXTERNAL 8 /* Route installed externally */ +#define RTNH_F_OFFLOAD 8 /* offloaded route */ /* Macros to handle hexthops */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc602da2..64c2076ced54 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1764,7 +1764,7 @@ void fib_table_flush_external(struct fib_table *tb) /* record local slen */ slen = fa->fa_slen; - if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD)) continue; netdev_switch_fib_ipv4_del(n->key, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b85c333..055453d48668 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -338,7 +338,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, fi, tos, type, nlflags, tb_id); if (!err) - fi->fib_flags |= RTNH_F_EXTERNAL; + fi->fib_flags |= RTNH_F_OFFLOAD; } return err; @@ -364,7 +364,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, const struct swdev_ops *ops; int err = 0; - if (!(fi->fib_flags & RTNH_F_EXTERNAL)) + if (!(fi->fib_flags & RTNH_F_OFFLOAD)) return 0; dev = netdev_switch_get_dev_by_nhs(fi); @@ -376,7 +376,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len, fi, tos, type, tb_id); if (!err) - fi->fib_flags &= ~RTNH_F_EXTERNAL; + fi->fib_flags &= ~RTNH_F_OFFLOAD; } return err; -- cgit v1.2.3-71-gd317 From b3cad287d13b5f6695c6b4aab72969cd64bf0171 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 7 May 2015 14:54:16 +0200 Subject: conntrack: RFC5961 challenge ACK confuse conntrack LAST-ACK transition In compliance with RFC5961, the network stack send challenge ACK in response to spurious SYN packets, since commit 0c228e833c88 ("tcp: Restore RFC5961-compliant behavior for SYN packets"). This pose a problem for netfilter conntrack in state LAST_ACK, because this challenge ACK is (falsely) seen as ACKing last FIN, causing a false state transition (into TIME_WAIT). The challenge ACK is hard to distinguish from real last ACK. Thus, solution introduce a flag that tracks the potential for seeing a challenge ACK, in case a SYN packet is let through and current state is LAST_ACK. When conntrack transition LAST_ACK to TIME_WAIT happens, this flag is used for determining if we are expecting a challenge ACK. Scapy based reproducer script avail here: https://github.com/netoptimizer/network-testing/blob/master/scapy/tcp_hacks_3WHS_LAST_ACK.py Fixes: 0c228e833c88 ("tcp: Restore RFC5961-compliant behavior for SYN packets") Signed-off-by: Jesper Dangaard Brouer Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_tcp.h | 3 +++ net/netfilter/nf_conntrack_proto_tcp.c | 35 ++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_conntrack_tcp.h b/include/uapi/linux/netfilter/nf_conntrack_tcp.h index 9993a421201c..ef9f80f0f529 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_tcp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_tcp.h @@ -42,6 +42,9 @@ enum tcp_conntrack { /* The field td_maxack has been set */ #define IP_CT_TCP_FLAG_MAXACK_SET 0x20 +/* Marks possibility for expected RFC5961 challenge ACK */ +#define IP_CT_EXP_CHALLENGE_ACK 0x40 + struct nf_ct_tcp_flags { __u8 flags; __u8 mask; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 5caa0c41bf26..70383de72054 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ @@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW - * sLA -> sTW Last ACK detected. + * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ @@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct, 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; memset(&ct->proto.tcp.seen[dir], 0, @@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct, * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and - * then we'll get in sync. Otherwise, the server ignores it. */ + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. + */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; @@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } + /* Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK) + ct->proto.tcp.last_flags |= + IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); if (LOG_INVALID(net, IPPROTO_TCP)) @@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct, nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; + case TCP_CONNTRACK_TIME_WAIT: + /* RFC5961 compliance cause stack to send "challenge-ACK" + * e.g. in response to spurious SYNs. Conntrack MUST + * not believe this ACK is acking last FIN. + */ + if (old_state == TCP_CONNTRACK_LAST_ACK && + index == TCP_ACK_SET && + ct->proto.tcp.last_dir != dir && + ct->proto.tcp.last_index == TCP_SYN_SET && + (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: challenge-ACK ignored "); + return NF_ACCEPT; /* Don't change state */ + } + break; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) -- cgit v1.2.3-71-gd317