From 9256645af09807bc52fa8b2e66ecd28ab25318c4 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 1 Feb 2016 18:51:04 -0500 Subject: net/core: relax BUILD_BUG_ON in netdev_stats_to_stats64 The netdev_stats_to_stats64 function copies the deprecated net_device_stats format stats into rtnl_link_stats64 for legacy support purposes, but with the BUILD_BUG_ON as it was, it wasn't possible to extend rtnl_link_stats64 without also extending net_device_stats. Relax the BUILD_BUG_ON to only require that rtnl_link_stats64 is larger, and zero out all the stat counters that aren't present in net_device_stats. CC: Eric Dumazet CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8cba3d852f25..65863e512227 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7253,24 +7253,31 @@ void netdev_run_todo(void) } } -/* Convert net_device_stats to rtnl_link_stats64. They have the same - * fields in the same order, with only the type differing. +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*stats64)); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + sizeof(*netdev_stats), 0, + sizeof(*stats64) - sizeof(*netdev_stats)); #else - size_t i, n = sizeof(*stats64) / sizeof(u64); + size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); -- cgit v1.2.3-71-gd317 From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 1 Feb 2016 18:51:05 -0500 Subject: net: add rx_nohandler stat counter This adds an rx_nohandler stat counter, along with a sysfs statistics node, and copies the counter out via netlink as well. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Daniel Borkmann CC: Tom Herbert CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/uapi/linux/if_link.h | 4 ++++ net/core/dev.c | 6 +++++- net/core/net-sysfs.c | 2 ++ net/core/rtnetlink.c | 2 ++ 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 289c2314d766..78a20cec2a0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1397,6 +1397,8 @@ enum netdev_priv_flags { * do not use this in drivers * @tx_dropped: Dropped packets by core network, * do not use this in drivers + * @rx_nohandler: nohandler dropped packets by core network on + * inactive devices, do not use this in drivers * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, @@ -1611,6 +1613,7 @@ struct net_device { atomic_long_t rx_dropped; atomic_long_t tx_dropped; + atomic_long_t rx_nohandler; #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def * wireless_handlers; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a30b78090594..d3e90b91e07e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -35,6 +35,8 @@ struct rtnl_link_stats { /* for cslip etc */ __u32 rx_compressed; __u32 tx_compressed; + + __u32 rx_nohandler; /* dropped, no handler found */ }; /* The main device statistics structure */ @@ -68,6 +70,8 @@ struct rtnl_link_stats64 { /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; + + __u64 rx_nohandler; /* dropped, no handler found */ }; /* The struct should be in sync with struct ifmap */ diff --git a/net/core/dev.c b/net/core/dev.c index 65863e512227..f1284835b8c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4154,7 +4154,10 @@ ncls: ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { drop: - atomic_long_inc(&skb->dev->rx_dropped); + if (!deliver_exact) + atomic_long_inc(&skb->dev->rx_dropped); + else + atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -7307,6 +7310,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } storage->rx_dropped += atomic_long_read(&dev->rx_dropped); storage->tx_dropped += atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b6c8a6629b39..da7dbc237a5f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -574,6 +574,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] = { &dev_attr_rx_packets.attr, @@ -599,6 +600,7 @@ static struct attribute *netstat_attrs[] = { &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, NULL }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e854f916..20d71358c143 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; + + a->rx_nohandler = b->rx_nohandler; } static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) -- cgit v1.2.3-71-gd317 From 795bb1c00dd338aa0d12f9a7f1f4776fb3160416 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Feb 2016 13:14:59 +0100 Subject: net: bulk free infrastructure for NAPI context, use napi_consume_skb Discovered that network stack were hitting the kmem_cache/SLUB slowpath when freeing SKBs. Doing bulk free with kmem_cache_free_bulk can speedup this slowpath. NAPI context is a bit special, lets take advantage of that for bulk free'ing SKBs. In NAPI context we are running in softirq, which gives us certain protection. A softirq can run on several CPUs at once. BUT the important part is a softirq will never preempt another softirq running on the same CPU. This gives us the opportunity to access per-cpu variables in softirq context. Extend napi_alloc_cache (before only contained page_frag_cache) to be a struct with a small array based stack for holding SKBs. Introduce a SKB defer and flush API for accessing this. Introduce napi_consume_skb() as replacement for e.g. dev_consume_skb_any() when running in NAPI context. A small trick to handle/detect if we are called from netpoll is to see if budget is 0. In that case, we need to invoke dev_consume_skb_irq(). Joint work with Alexander Duyck. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++ net/core/dev.c | 1 + net/core/skbuff.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 81 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a8fc2220e8ce..b56c0103fa15 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2404,6 +2404,9 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, { return __napi_alloc_skb(napi, length, GFP_ATOMIC); } +void napi_consume_skb(struct sk_buff *skb, int budget); + +void __kfree_skb_flush(void); /** * __dev_alloc_pages - allocate page for network Rx diff --git a/net/core/dev.c b/net/core/dev.c index f1284835b8c9..9b2c7a999e71 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5155,6 +5155,7 @@ static void net_rx_action(struct softirq_action *h) } } + __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b0cce744e2a0..b64187b87773 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -347,8 +347,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +#define NAPI_SKB_CACHE_SIZE 64 + +struct napi_alloc_cache { + struct page_frag_cache page; + size_t skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { @@ -378,9 +386,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(nc, fragsz, gfp_mask); + return __alloc_page_frag(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -474,7 +482,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; @@ -494,7 +502,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(nc, len, gfp_mask); + data = __alloc_page_frag(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; @@ -505,7 +513,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } /* use OR instead of assignment to avoid clearing of bits in mask */ - if (nc->pfmemalloc) + if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -747,6 +755,69 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +void __kfree_skb_flush(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* flush skb_cache if containing objects */ + if (nc->skb_count) { + kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, + nc->skb_cache); + nc->skb_count = 0; + } +} + +static void __kfree_skb_defer(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + /* drop skb->head and call any destructors for packet */ + skb_release_all(skb); + + /* record skb to CPU local list */ + nc->skb_cache[nc->skb_count++] = skb; + +#ifdef CONFIG_SLUB + /* SLUB writes into objects when freeing */ + prefetchw(skb); +#endif + + /* flush skb_cache if it is filled */ + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE, + nc->skb_cache); + nc->skb_count = 0; + } +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + if (unlikely(!skb)) + return; + + /* if budget is 0 assume netpoll w/ IRQs disabled */ + if (unlikely(!budget)) { + dev_consume_skb_irq(skb); + return; + } + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { + __kfree_skb(skb); + return; + } + + __kfree_skb_defer(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + /* Make sure a field is enclosed inside headers_start/headers_end section */ #define CHECK_SKB_FIELD(field) \ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ -- cgit v1.2.3-71-gd317 From 15fad714be86eab13e7568fecaf475b2a9730d3e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Feb 2016 13:15:04 +0100 Subject: net: bulk free SKBs that were delay free'ed due to IRQ context The network stack defers SKBs free, in-case free happens in IRQ or when IRQs are disabled. This happens in __dev_kfree_skb_irq() that writes SKBs that were free'ed during IRQ to the softirq completion queue (softnet_data.completion_queue). These SKBs are naturally delayed, and cleaned up during NET_TX_SOFTIRQ in function net_tx_action(). Take advantage of this a use the skb defer and flush API, as we are already in softirq context. For modern drivers this rarely happens. Although most drivers do call dev_kfree_skb_any(), which detects the situation and calls __dev_kfree_skb_irq() when needed. This due to netpoll can call from IRQ context. Signed-off-by: Alexander Duyck Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/dev.c | 8 +++++++- net/core/skbuff.c | 8 ++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b56c0103fa15..6ec86f1a2ed9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2407,6 +2407,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, void napi_consume_skb(struct sk_buff *skb, int budget); void __kfree_skb_flush(void); +void __kfree_skb_defer(struct sk_buff *skb); /** * __dev_alloc_pages - allocate page for network Rx diff --git a/net/core/dev.c b/net/core/dev.c index 9b2c7a999e71..3f4071a84a03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3829,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); - __kfree_skb(skb); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); } + + __kfree_skb_flush(); } if (sd->output_queue) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b64187b87773..a5bd067ec1a3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -767,7 +767,7 @@ void __kfree_skb_flush(void) } } -static void __kfree_skb_defer(struct sk_buff *skb) +static inline void _kfree_skb_defer(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -789,6 +789,10 @@ static void __kfree_skb_defer(struct sk_buff *skb) nc->skb_count = 0; } } +void __kfree_skb_defer(struct sk_buff *skb) +{ + _kfree_skb_defer(skb); +} void napi_consume_skb(struct sk_buff *skb, int budget) { @@ -814,7 +818,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - __kfree_skb_defer(skb); + _kfree_skb_defer(skb); } EXPORT_SYMBOL(napi_consume_skb); -- cgit v1.2.3-71-gd317