From 0cd3be6e9a46f84ef7a42e1a5645d32ad547b12e Mon Sep 17 00:00:00 2001 From: Sebastian Hesselbarth Date: Mon, 4 May 2015 23:04:16 +0200 Subject: clk: si5351: Do not pass struct clk in platform_data When registering clk-si5351 by platform_data, we should not pass struct clk for the reference clocks. Drop struct clk from platform_data and rework the driver to use devm_clk_get of named clock references. While at it, check for at least one valid input clock and properly prepare/enable valid reference clocks. Signed-off-by: Sebastian Hesselbarth Reported-by: Michael Welling Reported-by: Jean-Francois Moine Reported-by: Russell King Tested-by: Michael Welling Tested-by: Jean-Francois Moine Signed-off-by: Michael Turquette --- include/linux/platform_data/si5351.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h index a947ab8b441a..533d9807e543 100644 --- a/include/linux/platform_data/si5351.h +++ b/include/linux/platform_data/si5351.h @@ -5,8 +5,6 @@ #ifndef __LINUX_PLATFORM_DATA_SI5351_H__ #define __LINUX_PLATFORM_DATA_SI5351_H__ -struct clk; - /** * enum si5351_pll_src - Si5351 pll clock source * @SI5351_PLL_SRC_DEFAULT: default, do not change eeprom config @@ -107,8 +105,6 @@ struct si5351_clkout_config { * @clkout: array of clkout configuration */ struct si5351_platform_data { - struct clk *clk_xtal; - struct clk *clk_clkin; enum si5351_pll_src pll_src[2]; struct si5351_clkout_config clkout[8]; }; -- cgit v1.2.3-71-gd317 From 2d94e5224e81c58986a8cf44a3bf4830ce5cb96e Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 7 May 2015 14:26:34 -0700 Subject: HID: hid-sensor-hub: Fix debug lock warning When CONFIG_DEBUG_LOCK_ALLOC is defined, the mutex magic is compared and a warning is issued for (l->magic != l), where l is the address of the mutex. In hid-sensor-hub, as part of hsdev creation, a per-hsdev mutex is initialized during MFD cell creation. This hsdev, which contains the mutex, is part of the platform data for a cell. But platform_data is copied in platform_device_add_data() in platform.c. This copy copies the whole hsdev structure, including the mutex, and once copied the magic no longer matches. So when a client driver calls sensor_hub_input_attr_get_raw_value, this triggers the mutex warning. So to avoid this, allocate the mutex dynamically; the pointer stays valid even after the copy.
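The failure mode is easy to reproduce outside the kernel. A minimal userspace sketch (illustrative only: fake_mutex stands in for the lockdep magic check, and all type and function names here are invented):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for a debug mutex: init stores the lock's own address, and the
 * debug check warns when ->magic no longer equals that address. */
struct fake_mutex { void *magic; };
static void fake_mutex_init(struct fake_mutex *m) { m->magic = m; }
static int fake_mutex_magic_ok(struct fake_mutex *m) { return m->magic == m; }

struct hsdev_embedded { struct fake_mutex lock; };	/* old layout */
struct hsdev_pointer  { struct fake_mutex *lock; };	/* new layout */

int main(void)
{
	struct hsdev_embedded a, a_copy;
	struct hsdev_pointer b, b_copy;

	fake_mutex_init(&a.lock);
	memcpy(&a_copy, &a, sizeof(a));	/* what platform_device_add_data() does */
	assert(!fake_mutex_magic_ok(&a_copy.lock));	/* magic != self: warning fires */

	b.lock = malloc(sizeof(*b.lock));	/* devm_kzalloc() in the real patch */
	assert(b.lock);
	fake_mutex_init(b.lock);
	memcpy(&b_copy, &b, sizeof(b));
	assert(fake_mutex_magic_ok(b_copy.lock));	/* pointer still refers to the
						 * one initialized mutex: no warning */
	return 0;
}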
Signed-off-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/hid-sensor-hub.c | 13 ++++++++++--- include/linux/hid-sensor-hub.h | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index c3f6f1e311ea..090a1ba0abb6 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -294,7 +294,7 @@ int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, if (!report) return -EINVAL; - mutex_lock(&hsdev->mutex); + mutex_lock(hsdev->mutex_ptr); if (flag == SENSOR_HUB_SYNC) { memset(&hsdev->pending, 0, sizeof(hsdev->pending)); init_completion(&hsdev->pending.ready); @@ -328,7 +328,7 @@ int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, kfree(hsdev->pending.raw_data); hsdev->pending.status = false; } - mutex_unlock(&hsdev->mutex); + mutex_unlock(hsdev->mutex_ptr); return ret_val; } @@ -667,7 +667,14 @@ static int sensor_hub_probe(struct hid_device *hdev, hsdev->vendor_id = hdev->vendor; hsdev->product_id = hdev->product; hsdev->usage = collection->usage; - mutex_init(&hsdev->mutex); + hsdev->mutex_ptr = devm_kzalloc(&hdev->dev, + sizeof(struct mutex), + GFP_KERNEL); + if (!hsdev->mutex_ptr) { + ret = -ENOMEM; + goto err_stop_hw; + } + mutex_init(hsdev->mutex_ptr); hsdev->start_collection_index = i; if (last_hsdev) last_hsdev->end_collection_index = i; diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 0408421d885f..0042bf330b99 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -74,7 +74,7 @@ struct sensor_hub_pending { * @usage: Usage id for this hub device instance. * @start_collection_index: Starting index for a phy type collection * @end_collection_index: Last index for a phy type collection - * @mutex: synchronizing mutex. + * @mutex_ptr: synchronizing mutex pointer. * @pending: Holds information of pending sync read request. */ struct hid_sensor_hub_device { @@ -84,7 +84,7 @@ struct hid_sensor_hub_device { u32 usage; int start_collection_index; int end_collection_index; - struct mutex mutex; + struct mutex *mutex_ptr; struct sensor_hub_pending pending; }; -- cgit v1.2.3-71-gd317 From 336b7e1f230912cd8df2497be8dd7be4647d8fc8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 11 May 2015 14:06:32 -0400 Subject: block: remove export for blk_queue_bio With commit ff36ab345 ("dm: remove request-based logic from make_request_fn wrapper") DM no longer calls blk_queue_bio() directly, so remove its export. Doing so required a forward declaration in blk-core.c. 
Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- include/linux/blkdev.h | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 7871603f0a29..03b5f8d77f37 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -734,6 +734,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); +static void blk_queue_bio(struct request_queue *q, struct bio *bio); + struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) @@ -1578,7 +1580,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -void blk_queue_bio(struct request_queue *q, struct bio *bio) +static void blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1686,7 +1688,6 @@ out_unlock: spin_unlock_irq(q->queue_lock); } } -EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..5d93a6645e88 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -821,8 +821,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern void blk_queue_bio(struct request_queue *q, struct bio *bio); - /* * A queue has just exitted congestion. Note this in the global counter of * congested queues, and wake up anyone who was waiting for requests to be -- cgit v1.2.3-71-gd317 From f7bcb70ebae0dcdb5a2d859b09e4465784d99029 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 8 May 2015 13:47:23 -0700 Subject: ktime: Fix ktime_divns to do signed division It was noted that the 32bit implementation of ktime_divns() was doing unsigned division and didn't properly handle negative values. And when a ktime helper was changed to utilize ktime_divns, it caused a regression on some IR blasters. See the following bugzilla for details: https://bugzilla.redhat.com/show_bug.cgi?id=1200353 This patch fixes the problem in ktime_divns by checking and preserving the sign bit, and then reapplying it if appropriate after the division. It also changes the return type to s64 to make it obvious that signed values are expected. Nicolas also pointed out that negative divisors would cause infinite loops on 32bit systems. Negative divisors are unlikely for users of this function, but out of caution this patch adds checks for negative divisors to both the 32-bit (BUG_ON) and 64-bit (WARN_ON) versions to make sure no such use cases creep in.
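The bug is easy to demonstrate in userspace. A small sketch, where do_div_u64() is an assumed stand-in for the kernel's do_div() (which can only perform a u64-by-u32 division):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for do_div(): unsigned 64-by-32 division, like the kernel macro. */
static uint64_t do_div_u64(uint64_t n, uint32_t base) { return n / base; }

/* Old behaviour: a negative ns value is reinterpreted as a huge u64. */
static int64_t ktime_divns_old(int64_t ns, uint32_t div)
{
	return (int64_t)do_div_u64((uint64_t)ns, div);
}

/* Fixed behaviour: divide the magnitude, then reapply the sign. */
static int64_t ktime_divns_new(int64_t ns, uint32_t div)
{
	uint64_t tmp = ns < 0 ? -(uint64_t)ns : (uint64_t)ns;

	tmp = do_div_u64(tmp, div);
	return ns < 0 ? -(int64_t)tmp : (int64_t)tmp;
}

int main(void)
{
	int64_t ns = -1500;	/* e.g. a small negative ktime delta */

	printf("old: %lld\n", (long long)ktime_divns_old(ns, 1000));	/* huge bogus value */
	printf("new: %lld\n", (long long)ktime_divns_new(ns, 1000));	/* -1 */
	return 0;
}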
[ tglx: Hand a u64 to do_div() to avoid the compiler warning ] Fixes: 166afb64511e 'ktime: Sanitize ktime_to_us/ms conversion' Reported-and-tested-by: Trevor Cordes Signed-off-by: John Stultz Acked-by: Nicolas Pitre Cc: Ingo Molnar Cc: Josh Boyer Cc: One Thousand Gnomes Cc: Link: http://lkml.kernel.org/r/1431118043-23452-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 27 +++++++++++++++++++++------ kernel/time/hrtimer.c | 14 ++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 5fc3d1083071..2b6a204bd8d4 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -166,19 +166,34 @@ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) } #if BITS_PER_LONG < 64 -extern u64 __ktime_divns(const ktime_t kt, s64 div); -static inline u64 ktime_divns(const ktime_t kt, s64 div) +extern s64 __ktime_divns(const ktime_t kt, s64 div); +static inline s64 ktime_divns(const ktime_t kt, s64 div) { + /* + * Negative divisors could cause an inf loop, + * so bug out here. + */ + BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { - u64 ns = kt.tv64; - do_div(ns, div); - return ns; + s64 ns = kt.tv64; + u64 tmp = ns < 0 ? -ns : ns; + + do_div(tmp, div); + return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) +static inline s64 ktime_divns(const ktime_t kt, s64 div) +{ + /* + * 32-bit implementation cannot handle negative divisors, + * so catch them on 64bit as well. + */ + WARN_ON(div < 0); + return kt.tv64 / div; +} #endif static inline s64 ktime_to_us(const ktime_t kt) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..93ef7190bdea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,21 +266,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 __ktime_divns(const ktime_t kt, s64 div) +s64 __ktime_divns(const ktime_t kt, s64 div) { - u64 dclc; int sft = 0; + s64 dclc; + u64 tmp; dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } - dclc >>= sft; - do_div(dclc, (unsigned long) div); - - return dclc; + tmp >>= sft; + do_div(tmp, (unsigned long) div); + return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ -- cgit v1.2.3-71-gd317 From 07ee0722bf941960fb3888f9c9b5839473372fd1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 15 May 2015 11:30:47 +0800 Subject: rhashtable: Add cap on number of elements in hash table We currently have no limit on the number of elements in a hash table. This is a problem because some users (tipc) set a ceiling on the maximum table size and when that is reached the hash table may degenerate. Others may encounter OOM when growing, and if we allow insertions when that happens the hash table performance may also suffer. This patch adds a new parameter insecure_max_entries which becomes the cap on the table. If unset it defaults to max_size * 2. If it is also zero it means that there is no cap on the number of elements in the table. However, the table will grow whenever the utilisation hits 100% and if that growth fails, you will get ENOMEM on insertion.
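A sketch of how a user might opt in to the cap via rhashtable_params; the object type, offsets, and sizes below are hypothetical, chosen only to illustrate the new field:

#include <linux/rhashtable.h>

struct my_obj {
	u32 key;
	struct rhash_head node;
};

static const struct rhashtable_params my_params = {
	.key_len		= sizeof(u32),
	.key_offset		= offsetof(struct my_obj, key),
	.head_offset		= offsetof(struct my_obj, node),
	.max_size		= 1 << 16,
	/* Cap insertions well above max_size; leaving this 0 would
	 * default the cap to max_size * 2, per this patch. */
	.insecure_max_entries	= 1 << 17,
};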
As allowing oversubscription is potentially dangerous, the name contains the word insecure. Note that the cap is not a hard limit. This is done for performance reasons as enforcing a hard limit will result in use of atomic ops that are heavier than the ones we currently use. The reasoning is that we're only guarding against a gross over-subscription of the table, rather than a small breach of the limit. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 19 +++++++++++++++++++ lib/rhashtable.c | 11 +++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index dbcbcc59aa92..843ceca9a21e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -17,6 +17,7 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H +#include <linux/atomic.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/jhash.h> @@ -100,6 +101,7 @@ struct rhashtable; * @key_len: Length of key * @key_offset: Offset of key in struct to be hashed * @head_offset: Offset of rhash_head in struct to be hashed + * @insecure_max_entries: Maximum number of entries (may be exceeded) * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking * @nulls_base: Base value to generate nulls marker @@ -115,6 +117,7 @@ struct rhashtable_params { size_t key_len; size_t key_offset; size_t head_offset; + unsigned int insecure_max_entries; unsigned int max_size; unsigned int min_size; u32 nulls_base; @@ -286,6 +289,18 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, (!ht->p.max_size || tbl->size < ht->p.max_size); } +/** + * rht_grow_above_max - returns true if table is above maximum + * @ht: hash table + * @tbl: current table + */ +static inline bool rht_grow_above_max(const struct rhashtable *ht, + const struct bucket_table *tbl) +{ + return ht->p.insecure_max_entries && + atomic_read(&ht->nelems) >= ht->p.insecure_max_entries; +} + /* The bucket lock is selected based on the hash and protects mutations * on a group of hash buckets. * @@ -589,6 +604,10 @@ restart: goto out; } + err = -E2BIG; + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out; + if (unlikely(rht_grow_above_100(ht, tbl))) { slow_path: spin_unlock_bh(lock); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b28df4019ade..4396434e4715 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -14,6 +14,7 @@ * published by the Free Software Foundation. */ +#include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> @@ -446,6 +447,10 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, if (key && rhashtable_lookup_fast(ht, key, ht->p)) goto exit; + err = -E2BIG; + if (unlikely(rht_grow_above_max(ht, tbl))) + goto exit; + err = -EAGAIN; if (rhashtable_check_elasticity(ht, tbl, hash) || rht_grow_above_100(ht, tbl)) @@ -738,6 +743,12 @@ int rhashtable_init(struct rhashtable *ht, if (params->max_size) ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (params->insecure_max_entries) + ht->p.insecure_max_entries = + rounddown_pow_of_two(params->insecure_max_entries); + else + ht->p.insecure_max_entries = ht->p.max_size * 2; + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); /* The maximum (not average) chain length grows with the -- cgit v1.2.3-71-gd317 From faecbb45ebefb20260ad4a631e011e93c896cb73 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 May 2015 13:42:25 +0200 Subject: Revert "netfilter: bridge: query conntrack about skb dnat" This reverts commit c055d5b03bb4cb69d349d787c9787c0383abd8b2.
There are two issues: 'dnat_took_place' made me think that this is related to -j DNAT/MASQUERADE. But that's only one part of the story. This is also relevant for SNAT when we undo snat translation in reverse/reply direction. Furthermore, I originally wanted to do this mainly to avoid storing ipv6 addresses once we make DNAT/REDIRECT work for ipv6 on bridges. However, I forgot about SNPT/DNPT which is stateless. So we can't escape storing address for ipv6 anyway. Might as well do it for ipv4 too. Reported-and-tested-by: Bernhard Thaler Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/skbuff.h | 1 + net/bridge/br_netfilter.c | 27 +++++++++------------------ 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66e374d62f64..f15154a879c7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -176,6 +176,7 @@ struct nf_bridge_info { struct net_device *physindev; struct net_device *physoutdev; char neigh_header[8]; + __be32 ipv4_daddr; }; #endif diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index ab55e2472beb..60ddfbeb47f5 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -37,10 +37,6 @@ #include <net/ip.h> #include <net/ipv6.h> -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include <net/netfilter/nf_conntrack.h> -#endif - #include <asm/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL @@ -350,24 +346,15 @@ free_skb: return 0; } -static bool dnat_took_place(const struct sk_buff *skb) +static bool daddr_was_changed(const struct sk_buff *skb, + const struct nf_bridge_info *nf_bridge) { -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_untracked(ct)) - return false; - - return test_bit(IPS_DST_NAT_BIT, &ct->status); -#else - return false; -#endif + return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr; } /* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1.
The packet was DNAT'ed to a device in the same bridge @@ -421,7 +408,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) nf_bridge->pkt_otherhost = false; } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; - if (dnat_took_place(skb)) { + if (daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -632,6 +619,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { + struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); @@ -669,6 +657,9 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops, if (!setup_pre_routing(skb)) return NF_DROP; + nf_bridge = nf_bridge_info_get(skb); + nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; + skb->protocol = htons(ETH_P_IP); NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb, -- cgit v1.2.3-71-gd317 From d654976cbf852ee20612ee10dbe57cdacda9f452 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 May 2015 21:51:19 -0700 Subject: tcp: fix a potential deadlock in tcp_get_info() Taking the socket spinlock in tcp_get_info() can deadlock, as inet_diag_dump_icsk() holds the &hashinfo->ehash_locks[i], while packet processing can use the reverse locking order. We could avoid this locking for TCP_LISTEN states, but lockdep would certainly get confused as all TCP sockets share the same lockdep classes. [ 523.722504] ====================================================== [ 523.728706] [ INFO: possible circular locking dependency detected ] [ 523.734990] 4.1.0-dbg-DEV #1676 Not tainted [ 523.739202] ------------------------------------------------------- [ 523.745474] ss/18032 is trying to acquire lock: [ 523.750002] (slock-AF_INET){+.-...}, at: [] tcp_get_info+0x2c4/0x360 [ 523.758129] [ 523.758129] but task is already holding lock: [ 523.763968] (&(&hashinfo->ehash_locks[i])->rlock){+.-...}, at: [] inet_diag_dump_icsk+0x1d5/0x6c0 [ 523.774661] [ 523.774661] which lock already depends on the new lock. [ 523.774661] [ 523.782850] [ 523.782850] the existing dependency chain (in reverse order) is: [ 523.790326] -> #1 (&(&hashinfo->ehash_locks[i])->rlock){+.-...}: [ 523.796599] [] lock_acquire+0xbb/0x270 [ 523.802565] [] _raw_spin_lock+0x38/0x50 [ 523.808628] [] __inet_hash_nolisten+0x78/0x110 [ 523.815273] [] tcp_v4_syn_recv_sock+0x24b/0x350 [ 523.822067] [] tcp_check_req+0x3c1/0x500 [ 523.828199] [] tcp_v4_do_rcv+0x239/0x3d0 [ 523.834331] [] tcp_v4_rcv+0xa8e/0xc10 [ 523.840202] [] ip_local_deliver_finish+0x133/0x3e0 [ 523.847214] [] ip_local_deliver+0xaa/0xc0 [ 523.853440] [] ip_rcv_finish+0x168/0x5c0 [ 523.859624] [] ip_rcv+0x307/0x420 Let's use the u64_sync infrastructure instead. As a bonus, 64bit arches get optimized, as these are nops for them. Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 11 +++++++---- net/ipv4/tcp_fastopen.c | 4 ++++ net/ipv4/tcp_input.c | 4 ++++ 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3b2911502a8c..e8bbf403618f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -158,6 +158,8 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked.
*/ + struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */ + u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46efa03d2b11..f1377f2a0472 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -402,6 +402,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + u64_stats_init(&tp->syncp); tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); @@ -2598,6 +2599,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; + unsigned int start; u32 rate; memset(info, 0, sizeof(*info)); @@ -2665,10 +2667,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate = READ_ONCE(sk->sk_max_pacing_rate); info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL; - spin_lock_bh(&sk->sk_lock.slock); - info->tcpi_bytes_acked = tp->bytes_acked; - info->tcpi_bytes_received = tp->bytes_received; - spin_unlock_bh(&sk->sk_lock.slock); + do { + start = u64_stats_fetch_begin_irq(&tp->syncp); + info->tcpi_bytes_acked = tp->bytes_acked; + info->tcpi_bytes_received = tp->bytes_received; + } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3c673d5e6cff..46b087a27503 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -206,6 +206,10 @@ static bool tcp_fastopen_create_child(struct sock *sk, skb_set_owner_r(skb2, child); __skb_queue_tail(&child->sk_receive_queue, skb2); tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; } else { end_seq = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 243d674b3ef5..c9ab964189a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3286,7 +3286,9 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; + u64_stats_update_begin(&tp->syncp); tp->bytes_acked += delta; + u64_stats_update_end(&tp->syncp); tp->snd_una = ack; } @@ -3295,7 +3297,9 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; + u64_stats_update_begin(&tp->syncp); tp->bytes_received += delta; + u64_stats_update_end(&tp->syncp); tp->rcv_nxt = seq; } -- cgit v1.2.3-71-gd317 From cc4a84c3da6f9dd6a297dc81fe4437643a60fe03 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 May 2015 14:07:30 -0700 Subject: net: phy: bcm7xxx: Fix 7425 PHY ID and flags While adding support for 7425 PHY in the 7xxx PHY driver, the ID that was used was actually coming from an external PHY: a BCM5461x. Fix this by using the proper ID for the internal 7425 PHY and set the PHY_IS_INTERNAL flag, otherwise consumers of this PHY driver would not be able to properly identify it as such. Fixes: d068b02cfdfc2 ("net: phy: add BCM7425 and BCM7429 PHYs") Signed-off-by: Florian Fainelli Reviewed-by: Petri Gynther Signed-off-by: David S. 
Miller --- drivers/net/phy/bcm7xxx.c | 2 +- include/linux/brcmphy.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 64c74c6a4828..b5dc59de094e 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -404,7 +404,7 @@ static struct phy_driver bcm7xxx_driver[] = { .name = "Broadcom BCM7425", .features = PHY_GBIT_FEATURES | SUPPORTED_Pause | SUPPORTED_Asym_Pause, - .flags = 0, + .flags = PHY_IS_INTERNAL, .config_init = bcm7xxx_config_init, .config_aneg = genphy_config_aneg, .read_status = genphy_read_status, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index ae2982c0f7a6..656da2a12ffe 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -17,7 +17,7 @@ #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 -#define PHY_ID_BCM7425 0x03625e60 +#define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 -- cgit v1.2.3-71-gd317 From 194ec9368c0dbc421acdb2620d4dfb3cc3d022ff Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 14 May 2015 15:28:24 +0100 Subject: drivers: of/base: move of_init to driver_init Commit 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") adds the symlink `of_node` for each device pointing to its device tree node while creating/initialising it. However, the devicetree sysfs is created and set up in of_init, which is executed at core_initcall level. For all the devices created before of_init, the following error is thrown: "Error -2(-ENOENT) creating of_node link" Like many other components in the driver model, initialize the sysfs support for OF/devicetree from driver_init so that it's ready before any devices are created.
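The ordering problem, roughly (a simplified sketch of the boot sequence as described above, not exact kernel code):

start_kernel()
    driver_init()                 /* driver-model sysfs set up here */
        of_core_init()            /* after this patch: devicetree kset ready */
    ...
    device_add()                  /* early devices: of_node symlink needs the kset */
    ...
    do_initcalls()
        of_init()                 /* before this patch: kset created here, too late */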
Fixes: 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from devices with an OF node") Suggested-by: Rob Herring Cc: Grant Likely Cc: Pawel Moll Cc: Benjamin Herrenschmidt Signed-off-by: Sudeep Holla Tested-by: Robert Schwebel Acked-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/base/init.c | 2 ++ drivers/of/base.c | 8 +++----- include/linux/of.h | 6 ++++++ 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/init.c b/drivers/base/init.c index da033d3bab3c..48c0e220acc0 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -8,6 +8,7 @@ #include <linux/device.h> #include <linux/init.h> #include <linux/memory.h> +#include <linux/of.h> #include "base.h" @@ -34,4 +35,5 @@ void __init driver_init(void) cpu_dev_init(); memory_dev_init(); container_dev_init(); + of_core_init(); } diff --git a/drivers/of/base.c b/drivers/of/base.c index 99764db0875a..f0650265febf 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -189,7 +189,7 @@ int __of_attach_node_sysfs(struct device_node *np) return 0; } -static int __init of_init(void) +void __init of_core_init(void) { struct device_node *np; @@ -198,7 +198,8 @@ static int __init of_init(void) of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj); if (!of_kset) { mutex_unlock(&of_mutex); - return -ENOMEM; + pr_err("devicetree: failed to register existing nodes\n"); + return; } for_each_of_allnodes(np) __of_attach_node_sysfs(np); @@ -207,10 +208,7 @@ static int __init of_init(void) /* Symlink in /proc as required by userspace ABI */ if (of_root) proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base"); - - return 0; } -core_initcall(of_init); static struct property *__of_find_property(const struct device_node *np, const char *name, int *lenp) diff --git a/include/linux/of.h b/include/linux/of.h index ddeaae6d2083..b871ff9d81d7 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -121,6 +121,8 @@ extern struct device_node *of_stdout; extern raw_spinlock_t devtree_lock; #ifdef CONFIG_OF +void of_core_init(void); + static inline bool is_of_node(struct fwnode_handle *fwnode) { return fwnode && fwnode->type == FWNODE_OF; @@ -376,6 +378,10 @@ bool of_console_check(struct device_node *dn, char *name, int index); #else /* CONFIG_OF */ +static inline void of_core_init(void) +{ +} + static inline bool is_of_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3-71-gd317 From b371b594317869971af326adcf7cd65cabdb4087 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:13 +0200 Subject: perf/x86: Fix event/group validation Commit 43b4578071c0 ("perf/x86: Reduce stack usage of x86_schedule_events()") violated the rule that 'fake' scheduling, as used for event/group validation, should not change the event state. This went mostly unnoticed because repeated calls of x86_pmu::get_event_constraints() would give the same result. And x86_pmu::put_event_constraints() would mostly not do anything. Commit e979121b1b15 ("perf/x86/intel: Implement cross-HT corruption bug workaround") made the situation much worse by actually setting the event->hw.constraint value to NULL, so when validation and actual scheduling interact we get NULL ptr derefs. Fix it by removing the constraint pointer from the event and moving it back to an array, this time in cpuc instead of on the stack. validate_group() x86_schedule_events() event->hw.constraint = c; # store perf_task_event_sched_in() ... x86_schedule_events(); event->hw.constraint = c2; # store ...
put_event_constraints(event); # assume failure to schedule intel_put_event_constraints() event->hw.constraint = NULL; c = event->hw.constraint; # read -> NULL if (!test_bit(hwc->idx, c->idxmsk)) # <- *BOOM* NULL deref This in particular is possible when the event in question is a cpu-wide event and group-leader, where the validate_group() tries to add an event to the group. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Hunter Cc: Linus Torvalds Cc: Maria Dimakopoulou Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 43b4578071c0 ("perf/x86: Reduce stack usage of x86_schedule_events()") Fixes: e979121b1b15 ("perf/x86/intel: Implement cross-HT corruption bug workaround") Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 33 +++++++++++++++------------ arch/x86/kernel/cpu/perf_event.h | 9 ++++---- arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++-------- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 7 +++--- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 1 + include/linux/perf_event.h | 4 ---- 7 files changed, 33 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 87848ebe2bb7..1664eeea65e0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -620,7 +620,7 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct perf_event **events; + struct event_constraint **constraints; struct sched_state state; int saved_states; struct sched_state saved[SCHED_STATES_MAX]; @@ -629,7 +629,7 @@ struct perf_sched { /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax) { int idx; @@ -637,10 +637,10 @@ static void perf_sched_init(struct perf_sched *sched, struct perf_event **events memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->events = events; + sched->constraints = constraints; for (idx = 0; idx < num; idx++) { - if (events[idx]->hw.constraint->weight == wmin) + if (constraints[idx]->weight == wmin) break; } @@ -687,7 +687,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; @@ -745,7 +745,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -756,12 +756,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. 
*/ -int perf_assign_events(struct perf_event **events, int n, +int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, events, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax); do { if (!perf_sched_find_counter(&sched)) @@ -788,9 +788,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; + cpuc->event_constraint[i] = NULL; c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - hwc->constraint = c; + cpuc->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -801,7 +801,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = hwc->constraint; + c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -821,9 +821,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) } /* slow path */ - if (i != n) - unsched = perf_assign_events(cpuc->event_list, n, wmin, + if (i != n) { + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, assign); + } /* * In case of success (unsched = 0), mark events as committed, @@ -840,7 +841,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, e, assign[i]); + x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } @@ -1292,8 +1293,10 @@ static void x86_pmu_del(struct perf_event *event, int flags) x86_pmu.put_event_constraints(cpuc, event); /* Delete the array entry. 
*/ - while (++i < cpuc->n_events) + while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } --cpuc->n_events; perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6ac5cb7a9e14..fdfaab7c5e55 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -172,7 +172,10 @@ struct cpu_hw_events { added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + unsigned int group_flag; int is_fake; @@ -519,9 +522,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, - struct perf_event *event, - int cntr); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); void (*start_scheduling)(struct cpu_hw_events *cpuc); @@ -717,7 +718,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); -int perf_assign_events(struct perf_event **events, int n, +int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3998131d1a68..7a58fb5df15c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2106,7 +2106,7 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c1 = cpuc->event_constraint[idx]; struct event_constraint *c2; /* @@ -2188,8 +2188,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = event->hw.constraint; - intel_put_shared_regs_event_constraints(cpuc, event); /* @@ -2197,19 +2195,14 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, * all events are subject to and must call the * put_excl_constraints() routine */ - if (c && cpuc->excl_cntrs) + if (cpuc->excl_cntrs) intel_put_excl_constraints(cpuc, event); - - /* cleanup dynamic constraint */ - if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) - event->hw.constraint = NULL; } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, - struct perf_event *event, int cntr) +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = event->hw.constraint; + struct event_constraint *c = cpuc->event_constraint[idx]; struct intel_excl_states *xlo, *xl; int tid = cpuc->excl_thread_id; int o_tid = 1 - tid; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 813f75d71175..7f73b3553e2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -706,9 +706,9 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + if 
(event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); if (cpuc->enabled) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..ec2ba578d286 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; c = uncore_get_event_constraint(box, box->event_list[i]); - hwc->constraint = c; + box->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { hwc = &box->event_list[i]->hw; - c = hwc->constraint; + c = box->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -395,7 +394,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int } /* slow path */ if (i != n) - ret = perf_assign_events(box->event_list, n, + ret = perf_assign_events(box->event_constraint, n, wmin, wmax, assign); if (!assign || ret) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..f789ec9a0133 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,7 @@ struct intel_uncore_box { atomic_t refcnt; struct perf_event *events[UNCORE_PMC_IDX_MAX]; struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..d8a82a89f35a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -92,8 +92,6 @@ struct hw_perf_event_extra { int idx; /* index in shared_regs->regs[] */ }; -struct event_constraint; - /** * struct hw_perf_event - performance event hardware details: */ @@ -112,8 +110,6 @@ struct hw_perf_event { struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; - - struct event_constraint *constraint; }; struct { /* software */ struct hrtimer hrtimer; -- cgit v1.2.3-71-gd317 From f36963c9d3f6f415732710da3acdd8608a9fa0e5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 9 May 2015 03:14:13 +0930 Subject: cpumask_set_cpu_local_first => cpumask_local_spread, lament da91309e0a7e (cpumask: Utility function to set n'th cpu...) created a genuinely weird function. I never saw it before, it went through DaveM. (He only does this to make us other maintainers feel better about our own mistakes.) cpumask_set_cpu_local_first's purpose is to say "I need to spread things across N online cpus, choose the ones on this numa node first"; you call it in a loop. It can fail. One of the two callers ignores this, the other aborts and fails the device open. It can fail in two ways: allocating the off-stack cpumask, or through a convoluted codepath which AFAICT can only occur if cpu_online_mask changes.
Which shouldn't happen, because if cpu_online_mask can change while you call this, it could return a now-offline cpu anyway. It contains a nonsensical test "!cpumask_of_node(numa_node)". This was drawn to my attention by Geert, who said this causes a warning on Sparc. It sets a single bit in a cpumask instead of returning a cpu number, because that's what the callers want. It could be made more efficient by passing the previous cpu rather than an index, but that would be more invasive to the callers. Fixes: da91309e0a7e8966d916a74cce42ed170fde06bf Signed-off-by: Rusty Russell (then rebased) Tested-by: Amir Vadai Acked-by: Amir Vadai Acked-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 6 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 10 ++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 6 +-- include/linux/cpumask.h | 6 +-- lib/cpumask.c | 74 +++++++++----------------- 5 files changed, 37 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index a6dcbf850c1f..6f9ffb9026cd 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2358,11 +2358,11 @@ static int be_evt_queues_create(struct be_adapter *adapter) adapter->cfg_num_qs); for_all_evt_queues(adapter, eqo, i) { + int numa_node = dev_to_node(&adapter->pdev->dev); if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL)) return -ENOMEM; - cpumask_set_cpu_local_first(i, dev_to_node(&adapter->pdev->dev), - eqo->affinity_mask); - + cpumask_set_cpu(cpumask_local_spread(i, numa_node), + eqo->affinity_mask); netif_napi_add(adapter->netdev, &eqo->napi, be_poll, BE_NAPI_WEIGHT); napi_hash_add(&eqo->napi); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 32f5ec737472..cf467a9f6cc7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1501,17 +1501,13 @@ static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) { struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; int numa_node = priv->mdev->dev->numa_node; - int ret = 0; if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) return -ENOMEM; - ret = cpumask_set_cpu_local_first(ring_idx, numa_node, - ring->affinity_mask); - if (ret) - free_cpumask_var(ring->affinity_mask); - - return ret; + cpumask_set_cpu(cpumask_local_spread(ring_idx, numa_node), + ring->affinity_mask); + return 0; } static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index f7bf312fb443..7bed3a88579f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -144,9 +144,9 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->queue_index = queue_index; if (queue_index < priv->num_tx_rings_p_up) - cpumask_set_cpu_local_first(queue_index, - priv->mdev->dev->numa_node, - &ring->affinity_mask); + cpumask_set_cpu(cpumask_local_spread(queue_index, + priv->mdev->dev->numa_node), + &ring->affinity_mask); *pring = ring; return 0; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 27e285b92b5f..59915ea5373c 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -151,10 +151,8 @@ static inline unsigned int cpumask_any_but(const struct cpumask *mask, return 1; } -static inline int 
cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +static inline unsigned int cpumask_local_spread(unsigned int i, int node) { - set_bit(0, cpumask_bits(dstp)); - return 0; } @@ -208,7 +206,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); -int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp); +unsigned int cpumask_local_spread(unsigned int i, int node); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/lib/cpumask.c b/lib/cpumask.c index 830dd5dec40f..5f627084f2e9 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -139,64 +139,42 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) #endif /** - * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first - * + * cpumask_local_spread - select the i'th cpu with local numa cpu's first * @i: index number - * @numa_node: local numa_node - * @dstp: cpumask with the relevant cpu bit set according to the policy + * @node: local numa_node * - * This function sets the cpumask according to a numa aware policy. - * cpumask could be used as an affinity hint for the IRQ related to a - * queue. When the policy is to spread queues across cores - local cores - * first. + * This function selects an online CPU according to a numa aware policy; + * local cpus are returned first, followed by non-local ones, then it + * wraps around. * - * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set - * the cpu bit and need to re-call the function. + * It's not very efficient, but useful for setup. */ -int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +unsigned int cpumask_local_spread(unsigned int i, int node) { - cpumask_var_t mask; int cpu; - int ret = 0; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (numa_node == -1 || !cpumask_of_node(numa_node)) { - /* Use all online cpu's for non numa aware system */ - cpumask_copy(mask, cpu_online_mask); + if (node == -1) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; } else { - int n; - - cpumask_and(mask, - cpumask_of_node(numa_node), cpu_online_mask); - - n = cpumask_weight(mask); - if (i >= n) { - i -= n; - - /* If index > number of local cpu's, mask out local - * cpu's - */ - cpumask_andnot(mask, cpu_online_mask, mask); + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; } } - - for_each_cpu(cpu, mask) { - if (--i < 0) - goto out; - } - - ret = -EAGAIN; - -out: - free_cpumask_var(mask); - - if (!ret) - cpumask_set_cpu(cpu, dstp); - - return ret; + BUG(); } -EXPORT_SYMBOL(cpumask_set_cpu_local_first); +EXPORT_SYMBOL(cpumask_local_spread); -- cgit v1.2.3-71-gd317 From aad653a0bc09dd4ebcb5579f9f835bbae9ef2ba3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 19 May 2015 15:58:37 +1000 Subject: block: discard bdi_unregister() in favour of bdi_destroy() bdi_unregister() now contains very little functionality. It contains a "WARN_ON" if bdi->dev is NULL. 
This warning is of no real consequence as bdi->dev isn't needed by anything else in the function, and it triggers if blk_cleanup_queue() -> bdi_destroy() is called before bdi_unregister, which happens since Commit: 6cd18e711dd8 ("block: destroy bdi before blockdev is unregistered.") So this isn't wanted. It also calls bdi_set_min_ratio(). This needs to be called after writes through the bdi have all been flushed, and before the bdi is destroyed. Calling it early is better than calling it late as it frees up a global resource. Calling it immediately after bdi_wb_shutdown() in bdi_destroy() perfectly fits these requirements. So bdi_unregister() can be discarded with the important content moved to bdi_destroy(), as can the writeback_bdi_unregister event which is already not used. Reported-by: Mike Snitzer Cc: stable@vger.kernel.org (v4.0) Fixes: c4db59d31e39 ("fs: don't reassign dirty inodes to default_backing_dev_info") Fixes: 6cd18e711dd8 ("block: destroy bdi before blockdev is unregistered.") Acked-by: Peter Zijlstra (Intel) Acked-by: Dan Williams Tested-by: Nicholas Moulin Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 1 - include/linux/backing-dev.h | 1 - include/trace/events/writeback.h | 1 - mm/backing-dev.c | 18 +----------------- 4 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 0a536dc05f3b..666e11b83983 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -653,7 +653,6 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aff923ae8c4b..d87d8eced064 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -116,7 +116,6 @@ __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); -void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 880dd7437172..c178d13d6f4c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -250,7 +250,6 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); -DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6dc4580df2af..000e7b3b9896 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -359,23 +359,6 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) flush_delayed_work(&bdi->wb.dwork); } -/* - * Called when the device behind @bdi has been removed or ejected. - * - * We can't really do much here except for reducing the dirty ratio at - * the moment. In the future we should be able to set a flag so that - * the filesystem can handle errors at mark_inode_dirty time instead - * of only at writeback time. 
- */ -void bdi_unregister(struct backing_dev_info *bdi) -{ - if (WARN_ON_ONCE(!bdi->dev)) - return; - - bdi_set_min_ratio(bdi, 0); -} -EXPORT_SYMBOL(bdi_unregister); - static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) { memset(wb, 0, sizeof(*wb)); @@ -443,6 +426,7 @@ void bdi_destroy(struct backing_dev_info *bdi) int i; bdi_wb_shutdown(bdi); + bdi_set_min_ratio(bdi, 0); WARN_ON(!list_empty(&bdi->work_list)); WARN_ON(delayed_work_pending(&bdi->wb.dwork)); -- cgit v1.2.3-71-gd317 From 80188b0d77d7426b494af739ac129e0e684acb84 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:39:34 +1000 Subject: percpu_counter: batch size aware __percpu_counter_compare() XFS uses non-standard batch sizes to avoid frequent global counter updates on its allocated inode counters, as they increment or decrement in batches of 64 inodes. Hence the standard percpu counter batch of 32 means that the counter is effectively a global counter. Currently XFS uses a batch size of 128 so that it doesn't take the global lock on every single modification. However, XFS also needs to compare accurately against zero, which means we need to use percpu_counter_compare(). That has a hard-coded batch size of 32, and hence will spuriously fail to detect when it is supposed to use precise comparisons, so the accounting goes wrong. Add __percpu_counter_compare() to take a custom batch size so we can use it sanely in XFS, and factor percpu_counter_compare() to use it. Signed-off-by: Dave Chinner Acked-by: Tejun Heo Signed-off-by: Dave Chinner --- include/linux/percpu_counter.h | 13 ++++++++++++- lib/percpu_counter.c | 6 +++--- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 50e50095c8d1..84a109449610 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -41,7 +41,12 @@ void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); -int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs); +int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); + +static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +{ + return __percpu_counter_compare(fbc, rhs, percpu_counter_batch); +} static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -116,6 +121,12 @@ static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) return 0; } +static inline int +__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) +{ + return percpu_counter_compare(fbc, rhs); +} + static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 48144cdae819..f051d69f0910 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -197,13 +197,13 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, * Compare counter against given value.
* Return 1 if greater, 0 if equal and -1 if less */ -int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ - if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { + if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else @@ -218,7 +218,7 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) else return 0; } -EXPORT_SYMBOL(percpu_counter_compare); +EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { -- cgit v1.2.3-71-gd317
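A sketch of how a caller with a large modification batch might use the new helper; MY_BATCH and the wrapper below are hypothetical (XFS's real batch constants live in its own code):

#include <linux/percpu_counter.h>

#define MY_BATCH	128	/* matches the counter's per-cpu modification batch */

/* Returns true when the counter is known to be >= rhs. The comparison is
 * cheap while |count - rhs| > MY_BATCH * num_online_cpus(), and falls back
 * to a precise __percpu_counter_sum() inside that window. */
static inline bool counter_at_least(struct percpu_counter *fbc, s64 rhs)
{
	return __percpu_counter_compare(fbc, rhs, MY_BATCH) >= 0;
}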