From e785fa0a164aa11001cba931367c7f94ffaff888 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: nl80211: check for the required netlink attributes presence nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Cc: # v3.1-rc1 Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..fbd5593e88cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10903,6 +10903,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) -- cgit v1.2.3-71-gd317 From b0ade85165b3caeb0cd908cffe5921a39f25c243 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Sep 2017 13:41:41 +0200 Subject: netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no spinlock debugging options (CONFIG_GENERIC_LOCKBREAK, CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_LOCK_ALLOC) are enabled on a UP platform (e.g. m68k defconfig), arch_spinlock_t is an empty struct, hence using ARRAY_SIZE(nf_nat_locks) causes a division by zero: net/netfilter/nf_nat_core.c: In function ‘nf_nat_setup_info’: net/netfilter/nf_nat_core.c:432: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘__nf_nat_cleanup_conntrack’: net/netfilter/nf_nat_core.c:535: warning: division by zero net/netfilter/nf_nat_core.c:537: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘nf_nat_init’: net/netfilter/nf_nat_core.c:810: warning: division by zero net/netfilter/nf_nat_core.c:811: warning: division by zero net/netfilter/nf_nat_core.c:824: warning: division by zero Fix this by using the CONNTRACK_LOCKS definition instead. Suggested-by: Florian Westphal Fixes: 8073e960a03bf7b5 ("netfilter: nat: use keyed locks") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); -- cgit v1.2.3-71-gd317 From 7f4f7dd4417d9efd038b14d39c70170db2e0baa0 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 11 Sep 2017 21:52:40 +0200 Subject: netfilter: ipset: ipset list may return wrong member count for set with timeout Simple testcase: $ ipset create test hash:ip timeout 5 $ ipset add test 1.2.3.4 $ ipset add test 1.2.2.2 $ sleep 5 $ ipset l Name: test Type: hash:ip Revision: 5 Header: family inet hashsize 1024 maxelem 65536 timeout 5 Size in memory: 296 References: 0 Number of entries: 2 Members: We return "Number of entries: 2" but no members are listed. That is because mtype_list runs "ip_set_timeout_expired" and does not list the expired entries, but set->elements is never upated (until mtype_gc cleans it up later). Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; -- cgit v1.2.3-71-gd317 From 63ecc3d9436f8012e49dc846d6cb0a85a3433517 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 13 Sep 2017 19:30:51 -0600 Subject: udpv6: Fix the checksum computation when HW checksum does not apply While trying an ESP transport mode encryption for UDPv6 packets of datagram size 1436 with MTU 1500, checksum error was observed in the secondary fragment. This error occurs due to the UDP payload checksum being missed out when computing the full checksum for these packets in udp6_hwcsum_outgoing(). Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3-71-gd317 From 265698d7e6132a2d41471135534f4f36ad15b09c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Sep 2017 22:46:36 +0200 Subject: nl80211: fix null-ptr dereference on invalid mesh configuration If TX rates are specified during mesh join, the channel must also be specified. Check the channel pointer to avoid a null pointer dereference if it isn't. Reported-by: Jouni Malinen Fixes: 8564e38206de ("cfg80211: add checks for beacon rate, extend to mesh") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fbd5593e88cb..690874293cfc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9987,6 +9987,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) -- cgit v1.2.3-71-gd317 From 76cc0d3282d4b933fa144fa41fbc5318e0fdca24 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 12:00:07 +0800 Subject: ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header Now in ip6gre_header before packing the ipv6 header, it skb_push t->hlen which only includes encap_hlen + tun_hlen. It means greh and inner header would be over written by ipv6 stuff and ipv6h might have no chance to set up. Jianlin found this issue when using remote any on ip6_gre, the packets he captured on gre dev are truncated: 22:50:26.210866 Out ethertype IPv6 (0x86dd), length 120: truncated-ip6 -\ 8128 bytes missing!(flowlabel 0x92f40, hlim 0, next-header Options (0) \ payload length: 8192) ::1:2000:0 > ::1:0:86dd: HBH [trunc] ip-proto-128 \ 8184 It should also skb_push ipv6hdr so that ipv6h points to the right position to set ipv6 stuff up. This patch is to skb_push hlen + sizeof(*ipv6h) and also fix some indents in ip6gre_header. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..20f66f4c9460 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. -- cgit v1.2.3-71-gd317 From 8c22dab03ad072e45060c299c70d02a4f6fc4aab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 15:58:33 +0800 Subject: ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline If ipv6 has been disabled from cmdline since kernel started, it makes no sense to allow users to create any ip6 tunnel. Otherwise, it could some potential problem. Jianlin found a kernel crash caused by this in ip6_gre when he set ipv6.disable=1 in grub: [ 209.588865] Unable to handle kernel paging request for data at address 0x00000080 [ 209.588872] Faulting instruction address: 0xc000000000a3aa6c [ 209.588879] Oops: Kernel access of bad area, sig: 11 [#1] [ 209.589062] NIP [c000000000a3aa6c] fib_rules_lookup+0x4c/0x260 [ 209.589071] LR [c000000000b9ad90] fib6_rule_lookup+0x50/0xb0 [ 209.589076] Call Trace: [ 209.589097] fib6_rule_lookup+0x50/0xb0 [ 209.589106] rt6_lookup+0xc4/0x110 [ 209.589116] ip6gre_tnl_link_config+0x214/0x2f0 [ip6_gre] [ 209.589125] ip6gre_newlink+0x138/0x3a0 [ip6_gre] [ 209.589134] rtnl_newlink+0x798/0xb80 [ 209.589142] rtnetlink_rcv_msg+0xec/0x390 [ 209.589151] netlink_rcv_skb+0x138/0x150 [ 209.589159] rtnetlink_rcv+0x48/0x70 [ 209.589169] netlink_unicast+0x538/0x640 [ 209.589175] netlink_sendmsg+0x40c/0x480 [ 209.589184] ___sys_sendmsg+0x384/0x4e0 [ 209.589194] SyS_sendmsg+0xd4/0x140 [ 209.589201] SyS_socketcall+0x3e0/0x4f0 [ 209.589209] system_call+0x38/0xe0 This patch is to return -EOPNOTSUPP in ip6_tunnel_init if ipv6 has been disabled from cmdline. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..f2f21c24915f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2259,6 +2259,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; -- cgit v1.2.3-71-gd317 From 3ff4cbec87da48b0ec1f7b6196607b034de0c680 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 16 Sep 2017 14:02:21 +0200 Subject: net/sched: cls_matchall: fix crash when used with classful qdisc this script, edited from Linux Advanced Routing and Traffic Control guide tc q a dev en0 root handle 1: htb default a tc c a dev en0 parent 1: classid 1:1 htb rate 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:a htb rate 5mbit ceil 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:b htb rate 1mbit ceil 6mbit burst 15k tc f a dev en0 parent 1:0 prio 1 $clsname $clsargs classid 1:b ping $address -c1 tc -s c s dev en0 classifies traffic to 1:b or 1:a, depending on whether the packet matches or not the pattern $clsargs of filter $clsname. However, when $clsname is 'matchall', a systematic crash can be observed in htb_classify(). HTB and classful qdiscs don't assign initial value to struct tcf_result, but then they expect it to contain valid values after filters have been run. Thus, current 'matchall' ignores the TCA_MATCHALL_CLASSID attribute, configured by user, and makes HTB (and classful qdiscs) dereference random pointers. By assigning head->res to *res in mall_classify(), before the actions are invoked, we fix this crash and enable TCA_MATCHALL_CLASSID functionality, that had no effect on 'matchall' classifier since its first introduction. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1460213 Reported-by: Jiri Benc Fixes: b87f7936a932 ("net/sched: introduce Match-all classifier") Signed-off-by: Davide Caratti Acked-by: Yotam Gigi Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } -- cgit v1.2.3-71-gd317 From 4c7124413aa759b8ea0b90cd39177e525396e662 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 Sep 2017 11:05:16 -0700 Subject: tcp: remove two unused functions remove tcp_may_send_now and tcp_snd_test that are no longer used Fixes: 840a3cbe8969 ("tcp: remove forward retransmit feature") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..3bc910a9bfc6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -544,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..517d737059d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1806,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions -- cgit v1.2.3-71-gd317 From 29a0cfbf91ba997591535a4f7246835ce8328141 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 18 Sep 2017 12:21:37 +0200 Subject: libceph: don't allow bidirectional swap of pg-upmap-items This reverts most of commit f53b7665c8ce ("libceph: upmap semantic changes"). We need to prevent duplicates in the final result. For example, we can currently take [1,2,3] and apply [(1,2)] and get [2,2,3] or [1,2,3] and apply [(3,2)] and get [1,2,2] The rest of the system is not prepared to handle duplicates in the result set like this. The reverted piece was intended to allow [1,2,3] and [(1,2),(2,1)] to get [2,1,3] to reorder primaries. First, this bidirectional swap is hard to implement in a way that also prevents dups. For example, [1,2,3] and [(1,4),(2,3),(3,4)] would give [4,3,4] but would we just drop the last step we'd have [4,3,3] which is also invalid, etc. Simpler to just not handle bidirectional swaps. In practice, they are not needed: if you just want to choose a different primary then use primary_affinity, or pg_upmap (not pg_upmap_items). Cc: stable@vger.kernel.org # 4.13 Link: http://tracker.ceph.com/issues/21410 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f358d0bfa76b..79d14d70b7ea 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2445,19 +2445,34 @@ static void apply_upmap(struct ceph_osdmap *osdmap, pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - for (i = 0; i < raw->size; i++) { - for (j = 0; j < pg->pg_upmap_items.len; j++) { - int from = pg->pg_upmap_items.from_to[j][0]; - int to = pg->pg_upmap_items.from_to[j][1]; - - if (from == raw->osds[i]) { - if (!(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) - raw->osds[i] = to; + /* + * Note: this approach does not allow a bidirectional swap, + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + */ + for (i = 0; i < pg->pg_upmap_items.len; i++) { + int from = pg->pg_upmap_items.from_to[i][0]; + int to = pg->pg_upmap_items.from_to[i][1]; + int pos = -1; + bool exists = false; + + /* make sure replacement doesn't already appear */ + for (j = 0; j < raw->size; j++) { + int osd = raw->osds[j]; + + if (osd == to) { + exists = true; break; } + /* ignore mapping if target is marked out */ + if (osd == from && pos < 0 && + !(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) { + pos = j; + } } + if (!exists && pos >= 0) + raw->osds[pos] = to; } } } -- cgit v1.2.3-71-gd317 From b5b7db8d680464b1d631fd016f5e093419f0bfd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 10:05:57 -0700 Subject: tcp: fastopen: fix on syn-data transmit failure Our recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. [1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff8800b35cb1d8 ffffffff81cad00d 0000000000000000 ffffffff828a4347 ffff88009f86c080 ffffffff8316eb20 0000000000000d7f ffff8800b35cb220 ffffffff812c33c2 ffff8800baad2440 00000009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 517d737059d1..0bc9e46a5369 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ -- cgit v1.2.3-71-gd317 From 7c30013133964aaa2f45c17d6e9782ac6cfd7f5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Sep 2017 00:44:21 +0200 Subject: bpf: fix ri->map_owner pointer on bpf_prog_realloc Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur that require slow path in bpf_prog_realloc() to alloc more memory, e.g. from patching inlining helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, which also works with prog clones as they share the same aux object. Instead of then fetching aux->prog during runtime, which could potentially incur cache misses due to false sharing, we are going to just use aux for comparison on the map owner. This will also keep the patchlet of the same size, and later check in xdp_map_invalid() only accesses read-only aux pointer from the prog, it's also in the same cacheline already from prior access when calling bpf_func. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 7 ++++++- net/core/filter.c | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, diff --git a/net/core/filter.c b/net/core/filter.c index 24dd33dd9f04..82edad58d066 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,7 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2500,11 +2500,17 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2512,9 +2518,9 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2574,7 +2580,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2583,10 +2589,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; if (map) { - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2632,7 +2638,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2646,7 +2652,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit v1.2.3-71-gd317 From 6819a14ecbe2e089e5c5bb74edecafdde2028a00 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 4 Sep 2017 15:52:55 +0100 Subject: net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") incorrectly assumes that no RTM_NEWADDR are sent for addresses in tentative state, as this does happen for the standard IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in addconf_dad_stop(). So as a result of this change, no RTM_DELADDR is sent after DAD failure for a link-local when strict DAD (accept_dad=2) is configured, or on the next admin down in other cases. The absence of this notification breaks backwards compatibility and causes problems after DAD failure if this notification was being relied on. The solution is to allow RTM_DELADDR to still be sent after DAD failure. Fixes: f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") Signed-off-by: Mike Manning Cc: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..d7dbcc8eda10 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4940,9 +4940,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); -- cgit v1.2.3-71-gd317 From 35e015e1f5773417952fe91ce8790baf9b4237a2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 12 Sep 2017 17:46:37 +0200 Subject: ipv6: fix net.ipv6.conf.all interface DAD handlers Currently, writing into net.ipv6.conf.all.{accept_dad,use_optimistic,optimistic_dad} has no effect. Fix handling of these flags by: - using the maximum of global and per-interface values for the accept_dad flag. That is, if at least one of the two values is non-zero, enable DAD on the interface. If at least one value is set to 2, enable DAD and disable IPv6 operation on the interface if MAC-based link-local address was found - using the logical OR of global and per-interface values for the optimistic_dad flag. If at least one of them is set to one, optimistic duplicate address detection (RFC 4429) is enabled on the interface - using the logical OR of global and per-interface values for the use_optimistic flag. If at least one of them is set to one, optimistic addresses won't be marked as deprecated during source address selection on the interface. While at it, as we're modifying the prototype for ipv6_use_optimistic_addr(), drop inline, and let the compiler decide. Fixes: 7fd2561e4ebd ("net: ipv6: Add a sysctl to make optimistic addresses useful candidates") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++---- net/ipv6/addrconf.c | 27 ++++++++++++++++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b3345d0fe0a6..77f4de59dc9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1680,6 +1680,9 @@ accept_dad - INTEGER 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate link-local address has been found. + DAD operation and mode on a given interface will be selected according + to the maximum value of conf/{all,interface}/accept_dad. + force_tllao - BOOLEAN Enable sending the target link-layer address option even when responding to a unicast neighbor solicitation. @@ -1727,16 +1730,23 @@ suppress_frag_ndisc - INTEGER optimistic_dad - BOOLEAN Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + Optimistic Duplicate Address Detection for the interface will be enabled + if at least one of conf/{all,interface}/optimistic_dad is set to 1, + it will be disabled otherwise. use_optimistic - BOOLEAN If enabled, do not classify optimistic addresses as deprecated during source address selection. Preferred addresses will still be chosen before optimistic addresses, subject to other ranking in the source address selection algorithm. - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + This will be enabled if at least one of + conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. stable_secret - IPv6 address This IPv6 address will be used as a secret to generate IPv6 diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d7dbcc8eda10..96861c702c06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3810,6 +3820,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; -- cgit v1.2.3-71-gd317 From 008ba2a13f2d04c947adc536d19debb8fe66f110 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 Sep 2017 17:14:41 -0400 Subject: packet: hold bind lock when rebinding to fanout hook Packet socket bind operations must hold the po->bind_lock. This keeps po->running consistent with whether the socket is actually on a ptype list to receive packets. fanout_add unbinds a socket and its packet_rcv/tpacket_rcv call, then binds the fanout object to receive through packet_rcv_fanout. Make it hold the po->bind_lock when testing po->running and rebinding. Else, it can race with other rebind operations, such as that in packet_set_ring from packet_rcv to tpacket_rcv. Concurrent updates can result in a socket being added to a fanout group twice, causing use-after-free KASAN bug reports, among others. Reported independently by both trinity and syzkaller. Verified that the syzkaller reproducer passes after this patch. Fixes: dc99f600698d ("packet: Add fanout support.") Reported-by: nixioaming Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..d288f52c53f7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1684,10 +1684,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1745,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); -- cgit v1.2.3-71-gd317 From c2a64bb9fcd31c39feddf30748b4ee8d82e53c6a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 13:19:13 -0400 Subject: net: compat: assert the size of cmsg copied in is as expected The actual length of cmsg fetched in during the second loop (i.e., kcmsg - kcmsg_base) could be different from what we get from the first loop (i.e., kcmlen). The main reason is that the two get_user() calls in the two loops (i.e., get_user(ucmlen, &ucmsg->cmsg_len) and __get_user(ucmlen, &ucmsg->cmsg_len)) could cause ucmlen to have different values even they fetch from the same userspace address, as user can race to change the memory content in &ucmsg->cmsg_len across fetches. Although in the second loop, the sanity check if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) is inplace, it only ensures that the cmsg fetched in during the second loop does not exceed the length of kcmlen, but not necessarily equal to kcmlen. But indicated by the assignment kmsg->msg_controllen = kcmlen, we should enforce that. This patch adds this additional sanity check and ensures that what is recorded in kmsg->msg_controllen is the actual cmsg length. Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; -- cgit v1.2.3-71-gd317 From 92dd5452c1be873a1193561f4f691763103d22ac Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 19 Sep 2017 18:45:56 +0100 Subject: net: change skb->mac_header when Generic XDP calls adjust_head Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..9a2254f9802f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: -- cgit v1.2.3-71-gd317 From c8e1812960eeae42e2183154927028511c4bc566 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:45:36 +0300 Subject: net_sched: always reset qdisc backlog in qdisc_reset() SKB stored in qdisc->gso_skb also counted into backlog. Some qdiscs don't reset backlog to zero in ->reset(), for example sfq just dequeue and free all queued skb. Signed-off-by: Konstantin Khlebnikov Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..bf8c81e07c70 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,6 +685,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); -- cgit v1.2.3-71-gd317 From 21f4d5cc25ec0e6e8eb8420dd2c399e6d2fc7d14 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:46:11 +0300 Subject: net_sched/hfsc: fix curve activation in hfsc_change_class() If real-time or fair-share curves are enabled in hfsc_change_class() class isn't inserted into rb-trees yet. Thus init_ed() and init_vf() must be called in place of update_ed() and update_vf(). Remove isn't required because for now curves cannot be disabled. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_hfsc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); -- cgit v1.2.3-71-gd317 From fe2502e49b58606580c77b3d84e42f946de182d8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Sep 2017 09:18:45 -0700 Subject: net_sched: remove cls_flower idr on failure Fixes: c15ab236d69d ("net/sched: Change cls_flower to use IDR") Cc: Chris Mi Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..d230cb4c8094 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -922,28 +922,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +952,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +981,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); -- cgit v1.2.3-71-gd317 From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- net/core/ethtool.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } -- cgit v1.2.3-71-gd317 From 09579ac803a3638344b8544b5940793d5358673e Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:26 +0200 Subject: net/smc: add missing dev_put In the infiniband part, SMC currently uses get_netdev which calls dev_hold on the returned net device. However, the SMC code never calls dev_put on that net device resulting in a wrong reference count. This patch adds a dev_put after the usage of the net device to fix the issue. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); -- cgit v1.2.3-71-gd317 From 846e344eb7229018457d6d6fc1ab0cc0a167692f Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:27 +0200 Subject: net/smc: add receive timeout check The SMC receive function currently lacks a timeout check under the condition that no data were received and no data are available. This patch adds such a check. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_rx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { -- cgit v1.2.3-71-gd317 From 731b008560e6dfaf5fb297543f17bbe9bb868f3c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:28 +0200 Subject: net/smc: take RCU read lock for routing cache lookup smc_netinfo_by_tcpsk() looks up the routing cache. Such a lookup requires protection by an RCU read lock. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..2e8d2dabac0c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); -- cgit v1.2.3-71-gd317 From a6832c3acdb2ceb099ec3c385777fbaa6d5a5fd6 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:29 +0200 Subject: net/smc: adjust net_device refcount smc_pnet_fill_entry() uses dev_get_by_name() adding a refcount to ndev. The following smc_pnet_enter() has to reduce the refcount if the entry to be added exists already in the pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; -- cgit v1.2.3-71-gd317 From 8301fa44b41b1ca46a0547809eb173112559dc51 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:30 +0200 Subject: net/smc: adapt send request completion notification The solicited flag is meaningful for the receive completion queue. Ask for next work completion of any type on the send queue. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); -- cgit v1.2.3-71-gd317 From 5bc11ddbdf7fc6681db5c3f9a92cdee0f19cee1e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:31 +0200 Subject: net/smc: longer delay for client link group removal Client link group creation always follows the server linkgroup creation. If peer creates a new server link group, client has to create a new client link group. If peer reuses a server link group for a new connection, client has to reuse its client link group as well. This patch introduces a longer delay for client link group removal to make sure this link group still exists, once the peer decides to reuse a server link group. This avoids out-of-sync conditions for link groups. If already scheduled, modify the delay. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) -- cgit v1.2.3-71-gd317 From bfbedfd38378c1ad9df84469403d69c17b074b66 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:32 +0200 Subject: net/smc: terminate link group if out-of-sync is received An out-of-sync condition can just be detected by the client. If the server receives a CLC DECLINE message indicating an out-of-sync condition for the link groups, the server must clean up the out-of-sync link group. There is no need for an extra third parameter in smc_clc_send_decline(). Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_clc.c | 10 +++++----- net/smc/smc_clc.h | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2e8d2dabac0c..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -513,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -808,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -903,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); -- cgit v1.2.3-71-gd317 From 18e537cd58e8d6932719bfa79cb96a1fbc639199 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:33 +0200 Subject: net/smc: introduce a delay The number of outstanding work requests is limited. If all work requests are in use, tx processing is postponed to another scheduling of the tx worker. Switch to a delayed worker to have a gap for tx completion queue events before the next retry. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 2 +- net/smc/smc_close.c | 12 +++++++----- net/smc/smc_tx.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..5201bc103bd8 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -425,7 +427,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +441,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..3866573288dd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -406,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +433,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -468,7 +471,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +491,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } -- cgit v1.2.3-71-gd317 From 8c96feeeb39ba0b89c6121f71e8f7aa2a4d46671 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:34 +0200 Subject: net/smc: no close wait in case of process shut down Usually socket closing is delayed if there is still data available in the send buffer to be transmitted. If a process is killed, the delay should be avoided. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_close.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 5201bc103bd8..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -413,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; -- cgit v1.2.3-71-gd317 From 581fe0ea61584d88072527ae9fb9dcb9d1f2783e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Sep 2017 19:42:37 -0400 Subject: net: orphan frags on stand-alone ptype in dev_queue_xmit_nit Zerocopy skbs frags are copied when the skb is looped to a local sock. Commit 1080e512d44d ("net: orphan frags on receive") introduced calls to skb_orphan_frags to deliver_skb and __netif_receive_skb for this. With msg_zerocopy, these skbs can also exist in the tx path and thus loop from dev_queue_xmit_nit. This already calls deliver_skb in its loop. But it does not orphan before a separate pt_prev->func(). Add the missing skb_orphan_frags_rx. Changes v1->v2: handle skb_orphan_frags_rx failure Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 9a2254f9802f..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); -- cgit v1.2.3-71-gd317 From cbb2fb5c72f48d3029c144be0f0e61da1c7bccf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:06 -0400 Subject: net: set tb->fast_sk_family We need to set the tb->fast_sk_family properly so we can use the proper comparison function for all subsequent reuseport bind requests. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..f87f4805e244 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -328,6 +328,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif -- cgit v1.2.3-71-gd317 From 7a56673b58f2414679e926bba80309a037a4fd35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:07 -0400 Subject: net: use inet6_rcv_saddr to compare sockets In ipv6_rcv_saddr_equal() we need to use inet6_rcv_saddr(sk) for the ipv6 compare with the fast socket information to make sure we're doing the proper comparisons. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f87f4805e244..a1bf30438bc5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, -- cgit v1.2.3-71-gd317 From fbed24bcc69d3e48c5402c371f19f5c7688871e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:08 -0400 Subject: inet: fix improper empty comparison When doing my reuseport rework I screwed up and changed a if (hlist_empty(&tb->owners)) to if (!hlist_empty(&tb->owners)) This is obviously bad as all of the reuseport/reuse logic was reversed, which caused weird problems like allowing an ipv4 bind conflict if we opened an ipv4 only socket on a port followed by an ipv6 only socket on the same port. Fixes: b9470c27607b ("inet: kill smallest_size and smallest_port") Reported-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a1bf30438bc5..c039c937ba90 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -321,7 +321,7 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; -- cgit v1.2.3-71-gd317 From edd31551148c09608feee6b8756ad148d550ee3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:31 +0300 Subject: IB: Correct MR length field to be 64-bit The ib_mr->length represents the length of the MR in bytes as per the IBTA spec 1.3 section 11.2.10.3 (REGISTER PHYSICAL MEMORY REGION). Currently ib_mr->length field is defined as only 32-bits field. This might result into truncation and failed WRs of consumers who registers more than 4GB bytes memory regions and whose WRs accessing such MRs. This patch makes the length 64-bit to avoid such truncation. Cc: Sagi Grimberg Cc: Chuck Lever Cc: Faisal Latif Fixes: 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Signed-off-by: Ilya Lesokhin Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- include/rdma/ib_verbs.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f0dc5f4aa177..442b9bdc0f03 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3232,7 +3232,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->ibmr.iova); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - mr->ibmr.length); + lower_32_bits(mr->ibmr.length)); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); set_wqe_32bit_value(wqe->wqe_words, @@ -3274,7 +3274,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->npages * 8); nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %d, rkey: %0x, pgl_paddr: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", (unsigned long long) mr->ibmr.iova, mr->ibmr.length, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9c3e9ab53a41..322209d5ff58 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -154,7 +154,7 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec npages %d data length %d\n", + iser_err("page vec npages %d data length %lld\n", page_vec->npages, page_vec->fake_mr.length); for (i = 0; i < page_vec->npages; i++) iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbb5f54db882..e8608b2dc844 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { u32 lkey; u32 rkey; u64 iova; - u32 length; + u64 length; unsigned int page_size; bool need_inval; union { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); -- cgit v1.2.3-71-gd317 From cdd10c9627496ad25c87ce6394e29752253c69d3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:23 +0200 Subject: l2tp: ensure sessions are freed after their PPPOL2TP socket If l2tp_tunnel_delete() or l2tp_tunnel_closeall() deletes a session right after pppol2tp_release() orphaned its socket, then the 'sock' variable of the pppol2tp_session_close() callback is NULL. Yet the session is still used by pppol2tp_release(). Therefore we need to take an extra reference in any case, to prevent l2tp_tunnel_delete() or l2tp_tunnel_closeall() from freeing the session. Since the pppol2tp_session_close() callback is only set if the session is associated to a PPPOL2TP socket and that both l2tp_tunnel_delete() and l2tp_tunnel_closeall() hold the PPPOL2TP socket before calling pppol2tp_session_close(), we're sure that pppol2tp_session_close() and pppol2tp_session_destruct() are paired and called in the right order. So the reference taken by the former will be released by the later. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if -- cgit v1.2.3-71-gd317 From b228a94066406b6c456321d69643b0d7ce11cfa6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:24 +0200 Subject: l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall() There are several ways to remove L2TP sessions: * deleting a session explicitly using the netlink interface (with L2TP_CMD_SESSION_DELETE), * deleting the session's parent tunnel (either by closing the tunnel's file descriptor or using the netlink interface), * closing the PPPOL2TP file descriptor of a PPP pseudo-wire. In some cases, when these methods are used concurrently on the same session, the session can be removed twice, leading to use-after-free bugs. This patch adds a 'dead' flag, used by l2tp_session_delete() and l2tp_tunnel_closeall() to prevent them from stepping on each other's toes. The session deletion path used when closing a PPPOL2TP file descriptor doesn't need to be adapted. It already has to ensure that a session remains valid for the lifetime of its PPPOL2TP file descriptor. So it takes an extra reference on the session in the ->session_close() callback (pppol2tp_session_close()), which is eventually dropped in the ->sk_destruct() callback of the PPPOL2TP socket (pppol2tp_session_destruct()). Still, __l2tp_session_unhash() and l2tp_session_queue_purge() can be called twice and even concurrently for a given session, but thanks to proper locking and re-initialisation of list fields, this is not an issue. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++++++ net/l2tp/l2tp_core.h | 1 + 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..d8c2a89a76e1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1750,6 +1753,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..70a12df40a5f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ -- cgit v1.2.3-71-gd317 From 36f6ee22d2d66046e369757ec6bbe1c482957ba6 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 26 Sep 2017 15:14:29 +0300 Subject: vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit When running LTP IPsec tests, KASan might report: BUG: KASAN: use-after-free in vti_tunnel_xmit+0xeee/0xff0 [ip_vti] Read of size 4 at addr ffff880dc6ad1980 by task swapper/0/0 ... Call Trace: dump_stack+0x63/0x89 print_address_description+0x7c/0x290 kasan_report+0x28d/0x370 ? vti_tunnel_xmit+0xeee/0xff0 [ip_vti] __asan_report_load4_noabort+0x19/0x20 vti_tunnel_xmit+0xeee/0xff0 [ip_vti] ? vti_init_net+0x190/0x190 [ip_vti] ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 dev_hard_start_xmit+0x147/0x510 ? icmp_echo.part.24+0x1f0/0x210 __dev_queue_xmit+0x1394/0x1c60 ... Freed by task 0: save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 kasan_slab_free+0x70/0xc0 kmem_cache_free+0x81/0x1e0 kfree_skbmem+0xb1/0xe0 kfree_skb+0x75/0x170 kfree_skb_list+0x3e/0x60 __dev_queue_xmit+0x1298/0x1c60 dev_queue_xmit+0x10/0x20 neigh_resolve_output+0x3a8/0x740 ip_finish_output2+0x5c0/0xe70 ip_finish_output+0x4ba/0x680 ip_output+0x1c1/0x3a0 xfrm_output_resume+0xc65/0x13d0 xfrm_output+0x1e4/0x380 xfrm4_output_finish+0x5c/0x70 Can be fixed if we get skb->len before dst_output(). Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code") Fixes: 22e1b23dafa8 ("vti6: Support inter address family tunneling.") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 3 ++- net/ipv6/ip6_vti.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..89453cf62158 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { -- cgit v1.2.3-71-gd317 From 62b982eeb4589b2e6d7c01a90590e3a4c2b2ca19 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Sep 2017 16:16:43 +0200 Subject: l2tp: fix race condition in l2tp_tunnel_delete If we try to delete the same tunnel twice, the first delete operation does a lookup (l2tp_tunnel_get), finds the tunnel, calls l2tp_tunnel_delete, which queues it for deletion by l2tp_tunnel_del_work. The second delete operation also finds the tunnel and calls l2tp_tunnel_delete. If the workqueue has already fired and started running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the same tunnel a second time, and try to free the socket again. Add a dead flag to prevent firing the workqueue twice. Then we can remove the check of queue_work's result that was meant to prevent that race but doesn't. Reproducer: ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000 ip link set l2tp1 up ip l2tp del tunnel tunnel_id 3000 ip l2tp del tunnel tunnel_id 3000 Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") Reported-by: Jianlin Shi Signed-off-by: Sabrina Dubroca Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 10 ++++------ net/l2tp/l2tp_core.h | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8c2a89a76e1..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1688,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 70a12df40a5f..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -161,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -255,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, -- cgit v1.2.3-71-gd317 From c2cc187e53011c1c4931055984657da9085c763b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Sep 2017 13:19:26 +0300 Subject: sctp: Fix a big endian bug in sctp_diag_dump() The sctp_for_each_transport() function takes an pointer to int. The cb->args[] array holds longs so it's only using the high 32 bits. It works on little endian system but will break on big endian 64 bit machines. Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Dan Carpenter Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; -- cgit v1.2.3-71-gd317 From 35f493b87ec072c5a2497ffbee243095ef725827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 08:40:02 -0700 Subject: inetpeer: fix RCU lookup() again My prior fix was not complete, as we were dereferencing a pointer three times per node, not twice as I initially thought. Fixes: 4cc5b44b29a9 ("inetpeer: fix RCU lookup()") Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..b20c8ac64081 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; -- cgit v1.2.3-71-gd317 From e804441cfe0b60f6c430901946a69c01eac09df1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 25 Sep 2017 15:55:53 -0700 Subject: net: dsa: Fix network device registration order We cannot be registering the network device first, then setting its carrier off and finally connecting it to a PHY, doing that leaves a window during which the carrier is at best inconsistent, and at worse the device is not usable without a down/up sequence since the network device is visible to user space with possibly no PHY device attached. Re-order steps so that they make logical sense. This fixes some devices where the port was not usable after e.g: an unbind then bind of the driver. Fixes: 0071f56e46da ("dsa: Register netdev before phy") Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..865e29e62bad 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1301,28 +1301,33 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) -- cgit v1.2.3-71-gd317 From 4971613c1639d8e5f102c4e797c3bf8f83a5a69e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:19:37 -0400 Subject: packet: in packet_do_bind, test fanout with bind_lock held Once a socket has po->fanout set, it remains a member of the group until it is destroyed. The prot_hook must be constant and identical across sockets in the group. If fanout_add races with packet_do_bind between the test of po->fanout and taking the lock, the bind call may make type or dev inconsistent with that of the fanout group. Hold po->bind_lock when testing po->fanout to avoid this race. I had to introduce artificial delay (local_bh_enable) to actually observe the race. Fixes: dc99f600698d ("packet: Add fanout support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..a10c2836465c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3069,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { -- cgit v1.2.3-71-gd317 From da7c9561015e93d10fe6aab73e9288e0d09d65a6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:20:17 -0400 Subject: packet: only test po->has_vnet_hdr once in packet_snd Packet socket option po->has_vnet_hdr can be updated concurrently with other operations if no ring is attached. Do not test the option twice in packet_snd, as the value may change in between calls. A race on setsockopt disable may cause a packet > mtu to be sent without having GSO options set. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a10c2836465c..bec01a3daf5b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2840,6 +2840,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2883,6 +2884,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2941,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; -- cgit v1.2.3-71-gd317 From 9d538fa60bad4f7b23193c89e843797a1cf71ef3 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 26 Sep 2017 17:38:50 -0700 Subject: net: Set sk_prot_creator when cloning sockets to the right proto sk->sk_prot and sk->sk_prot_creator can differ when the app uses IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one). Which is why sk_prot_creator is there to make sure that sk_prot_free() does the kmem_cache_free() on the right kmem_cache slab. Now, if such a socket gets transformed back to a listening socket (using connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through sk_clone_lock() when a new connection comes in. But sk_prot_creator will still point to the IPv6 kmem_cache (as everything got copied in sk_clone_lock()). When freeing, we will thus put this memory back into the IPv6 kmem_cache although it was allocated in the IPv4 cache. I have seen memory corruption happening because of this. With slub-debugging and MEMCG_KMEM enabled this gives the warning "cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP" A C-program to trigger this: void main(void) { int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); int new_fd, newest_fd, client_fd; struct sockaddr_in6 bind_addr; struct sockaddr_in bind_addr4, client_addr1, client_addr2; struct sockaddr unsp; int val; memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sin6_family = AF_INET6; bind_addr.sin6_port = ntohs(42424); memset(&client_addr1, 0, sizeof(client_addr1)); client_addr1.sin_family = AF_INET; client_addr1.sin_port = ntohs(42424); client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&client_addr2, 0, sizeof(client_addr2)); client_addr2.sin_family = AF_INET; client_addr2.sin_port = ntohs(42421); client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&unsp, 0, sizeof(unsp)); unsp.sa_family = AF_UNSPEC; bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)); listen(fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1)); new_fd = accept(fd, NULL, NULL); close(fd); val = AF_INET; setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val)); connect(new_fd, &unsp, sizeof(unsp)); memset(&bind_addr4, 0, sizeof(bind_addr4)); bind_addr4.sin_family = AF_INET; bind_addr4.sin_port = ntohs(42421); bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4)); listen(new_fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2)); newest_fd = accept(new_fd, NULL, NULL); close(new_fd); close(client_fd); close(new_fd); } As far as I can see, this bug has been there since the beginning of the git-days. Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..7d55c05f449d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); -- cgit v1.2.3-71-gd317 From e49aa15ef6c179f69e5578a271801f31a09e9a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Sep 2017 13:20:32 -0700 Subject: Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" This reverts commit dbbccdc4ced015cdd4051299bd87fbe0254ad351. It turns out that the "legacy" users aren't so legacy at all, and that turning off the legacy ioctl will break the current Qt bluetooth stack for bluetooth LE devices that were released just a couple of months ago. So it's simply not true that this was a legacy interface that hasn't been needed and is only limited to old legacy BT devices. Because I actually read Kconfig help messages, and actively try to turn off features that I don't need, I turned the option off. Then I spent _way_ too much time debugging BLE issues until I realized that it wasn't the Qt and subsurface development that had broken one of my dive computer BLE downloads, but simply my broken kernel config. Maybe in a decade it will be true that this is a legacy interface. And maybe with a better help-text and correct dependencies, this kind of legacy removal might be acceptable. But as things are right now both the commit message and the Kconfig help text were misleading, and the Kconfig option had the wrong dependenencies. There's no reason to keep that broken Kconfig option in the tree. Cc: Marcel Holtmann Cc: Johan Hedberg Signed-off-by: Linus Torvalds --- net/bluetooth/Kconfig | 10 ---------- net/bluetooth/hci_sock.c | 6 ------ 2 files changed, 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3-71-gd317 From fef0035c0f31322d417d1954bba5ab959bf91183 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Sep 2017 00:41:44 +0200 Subject: netlink: do not proceed if dump's start() errs Drivers that use the start method for netlink dumping rely on dumpit not being called if start fails. For example, ila_xlat.c allocates memory and assigns it to cb->args[0] in its start() function. It might fail to do that and return -ENOMEM instead. However, even when returning an error, dumpit will be called, which, in the example above, quickly dereferences the memory in cb->args[0], which will OOPS the kernel. This is but one example of how this goes wrong. Since start() has always been a function with an int return type, it therefore makes sense to use it properly, rather than ignoring it. This patch thus returns early and does not call dumpit() when start() fails. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) -- cgit v1.2.3-71-gd317 From d51711c0557d6dbd26c63144aef32c7b3ec264b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:31 +0800 Subject: ip_gre: ipgre_tap device should keep dst Without keeping dst, the tunnel will not update any mtu/pmtu info, since it does not have a dst on the skb. Reproducer: client(ipgre_tap1 - eth1) <-----> (eth1 - ipgre_tap1)server After reducing eth1's mtu on client, then perforamnce became 0. This patch is to netif_keep_dst in gre_tap_init, as ipgre does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..8b837f6f5532 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1223,6 +1223,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit v1.2.3-71-gd317 From 2d40557cc702ed8e5edd9bd422233f86652d932e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:50 +0800 Subject: ip6_gre: ip6gre_tap device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed a issue that ipgre_tap mtu couldn't be updated in tx path. The same fix is needed for ip6gre_tap as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 20f66f4c9460..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1311,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], -- cgit v1.2.3-71-gd317 From d41bb33ba33b8f8debe54ed36be6925eb496e354 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:24:07 +0800 Subject: ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path Now when updating mtu in tx path, it doesn't consider ARPHRD_ETHER tunnel device, like ip6gre_tap tunnel, for which it should also subtract ether header to get the correct mtu. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f2f21c24915f..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; -- cgit v1.2.3-71-gd317 From 7487449c86c65202b3b725c4524cb48dd65e4e6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:36 +0200 Subject: IPv4: early demux can return an error code Currently no error is emitted, but this infrastructure will used by the next patch to allow source address validation for mcast sockets. Since early demux can do a route lookup and an ipv4 route lookup can return an error code this is consistent with the current ipv4 route infrastructure. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/protocol.h | 4 ++-- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 25 +++++++++++++++---------- net/ipv4/tcp_ipv4.c | 9 +++++---- net/ipv4/udp.c | 11 ++++++----- 6 files changed, 30 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/protocol.h b/include/net/protocol.h index 65ba335b0e7e..4fc75f7ae23b 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -39,8 +39,8 @@ /* This is used to register protocols. */ struct net_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); + int (*early_demux)(struct sk_buff *skb); + int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3bc910a9bfc6..89974c5286d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -void tcp_v4_early_demux(struct sk_buff *skb); +int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); diff --git a/include/net/udp.h b/include/net/udp.h index 12dfbfe2e2d7..6c759c8594e2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); } -void udp_v4_early_demux(struct sk_buff *skb); +int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..9b30f821fe96 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb) struct in_device *in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb) */ skb_dst_set_noref(skb, dst); } + return 0; } int udp_rcv(struct sk_buff *skb) -- cgit v1.2.3-71-gd317 From bc044e8db7962e727a75b591b9851ff2ac5cf846 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:37 +0200 Subject: udp: perform source validation for mcast early demux The UDP early demux can leverate the rx dst cache even for multicast unconnected sockets. In such scenario the ipv4 source address is validated only on the first packet in the given flow. After that, when we fetch the dst entry from the socket rx cache, we stop enforcing the rp_filter and we even start accepting any kind of martian addresses. Disabling the dst cache for unconnected multicast socket will cause large performace regression, nearly reducing by half the max ingress tput. Instead we factor out a route helper to completely validate an skb source address for multicast packets and we call it from the UDP early demux for mcast packets landing on unconnected sockets, after successful fetching the related cached dst entry. This still gives a measurable, but limited performance regression: rp_filter = 0 rp_filter = 1 edmux disabled: 1182 Kpps 1127 Kpps edmux before: 2238 Kpps 2238 Kpps edmux after: 2037 Kpps 2019 Kpps The above figures are on top of current net tree. Applying the net-next commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") the delta with rp_filter == 0 will decrease even more. Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/route.h | 4 +++- net/ipv4/route.c | 46 ++++++++++++++++++++++++++-------------------- net/ipv4/udp.c | 13 ++++++++++++- 3 files changed, 41 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 57dfc6850d37..d538e6db1afe 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } - +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin); int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..ac6fde5d45f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9b30f821fe96..5676237d2b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; @@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } return 0; } -- cgit v1.2.3-71-gd317 From aad06212d36cf34859428a0a279e5c14ee5c9e26 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 29 Sep 2017 10:02:54 +0200 Subject: tipc: use only positive error codes in messages In commit e3a77561e7d32 ("tipc: split up function tipc_msg_eval()"), we have updated the function tipc_msg_lookup_dest() to set the error codes to negative values at destination lookup failures. Thus when the function sets the error code to -TIPC_ERR_NO_NAME, its inserted into the 4 bit error field of the message header as 0xf instead of TIPC_ERR_NO_NAME (1). The value 0xf is an unknown error code. In this commit, we set only positive error code. Fixes: e3a77561e7d32 ("tipc: split up function tipc_msg_eval()") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..121e59a1d0e7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); -- cgit v1.2.3-71-gd317 From 935a9749a36828af0e8be224a5cd4bc758112c34 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:53 +0800 Subject: ip_gre: get key from session_id correctly in erspan_rcv erspan only uses the first 10 bits of session_id as the key to look up the tunnel. But in erspan_rcv, it missed 'session_id & ID_MASK' when getting the key from session_id. If any other flag is also set in session_id in a packet, it would fail to find the tunnel due to incorrect key in erspan_rcv. This patch is to add 'session_id & ID_MASK' there and also remove the unnecessary variable session_id. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8b837f6f5532..b25b1e5112d0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, -- cgit v1.2.3-71-gd317 From 5513d08d29511c263c00933c00dd7a82fffda3c9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:54 +0800 Subject: ip_gre: check packet length and mtu correctly in erspan_xmit As a ARPHRD_ETHER device, skb->len in erspan_xmit is the length of the whole ether packet. So before checking if a packet size exceeds the mtu, skb->len should subtract dev->hard_header_len. Otherwise, all packets with max size according to mtu would be trimmed to be truncated packet. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b25b1e5112d0..2a4ef9dc48ff 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } -- cgit v1.2.3-71-gd317 From c122fda271717f4fc618e0a31e833941fd5f1efd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:55 +0800 Subject: ip_gre: set tunnel hlen properly in erspan_tunnel_init According to __gre_tunnel_init, tunnel->hlen should be set as the headers' length between inner packet and outer iphdr. It would be used especially to calculate a proper mtu when updating mtu in tnl_update_pmtu. Now without setting it, a bigger mtu value than expected would be updated, which hurts performance a lot. This patch is to fix it by setting tunnel->hlen with: tunnel->tun_hlen + tunnel->encap_hlen + sizeof(struct erspanhdr) Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2a4ef9dc48ff..fad0bb1e3e9a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1245,7 +1245,9 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; -- cgit v1.2.3-71-gd317 From c84bed440e4e11a973e8c0254d0dfaccfca41fb0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:56 +0800 Subject: ip_gre: erspan device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed the issue ipgre_tap dev mtu couldn't be updated in tx path. The same fix is needed for erspan as well. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fad0bb1e3e9a..467e44d7587d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1254,6 +1254,7 @@ static int erspan_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } -- cgit v1.2.3-71-gd317 From 9f775ead5e570e7e19015b9e4e2f3dd6e71a5935 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 15:44:38 +0200 Subject: l2tp: fix l2tp_eth module loading The l2tp_eth module crashes if its netlink callbacks are run when the pernet data aren't initialised. We should normally register_pernet_device() before the genl callbacks. However, the pernet data only maintain a list of l2tpeth interfaces, and this list is never used. So let's just drop pernet handling instead. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 51 ++------------------------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } -- cgit v1.2.3-71-gd317 From eefca20eb20c66b06cf5ed09b49b1a7caaa27b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2017 12:20:51 -0700 Subject: socket, bpf: fix possible use after free Starting from linux-4.4, 3WHS no longer takes the listener lock. Since this time, we might hit a use-after-free in sk_filter_charge(), if the filter we got in the memcpy() of the listener content just happened to be replaced by a thread changing listener BPF filter. To fix this, we need to make sure the filter refcount is not already zero before incrementing it again. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 12 ++++++++---- net/core/sock.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..74b8c91fb5f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) diff --git a/net/core/sock.c b/net/core/sock.c index 7d55c05f449d..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1684,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new -- cgit v1.2.3-71-gd317 From ce024f42c2e28b6bce4ecc1e891b42f57f753892 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Oct 2017 13:20:48 +0300 Subject: net: rtnetlink: fix info leak in RTM_GETSTATS call When RTM_GETSTATS was added the fields of its header struct were not all initialized when returning the result thus leaking 4 bytes of information to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks to Alexander Potapenko for the detailed report and bisection. Reported-by: Alexander Potapenko Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..d4bcdcc68e92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3854,6 +3854,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; -- cgit v1.2.3-71-gd317 From e769fcec6bc4bdd1b0e2cf817680148f9c40b1c4 Mon Sep 17 00:00:00 2001 From: Vishakha Narvekar Date: Tue, 3 Oct 2017 16:13:29 -0400 Subject: net: 8021q: skip packets if the vlan is down If the vlan is down, free the packet instead of proceeding with other processing, or counting it as received. If vlan interfaces are used as slaves for bonding, with arp monitoring for connectivity, if the rx counter is seen to be incrementing, then the bond device will not observe that the interface is down. CC: David S. Miller Signed-off-by: Vishakha Narvekar Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. -- cgit v1.2.3-71-gd317