From 1ed8d48c57bf7400eac7b8dc622ab0413715cafb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:52 -0700 Subject: writeback: bdi_for_each_wb() iteration is memcg ID based not blkcg wb's (bdi_writeback's) are currently keyed by memcg ID; however, in an earlier implementation, wb's were keyed by blkcg ID. bdi_for_each_wb() walks bdi->cgwb_tree in the ascending ID order and allows iterations to start from an arbitrary ID which is used to interrupt and resume iterations. Unfortunately, while changing wb to be keyed by memcg ID instead of blkcg, bdi_for_each_wb() was missed and is still assuming that wb's are keyed by blkcg ID. This doesn't affect iterations which don't get interrupted but bdi_split_work_to_wbs() makes use of iteration resuming on allocation failures and thus may incorrectly skip or repeat wb's. Fix it by changing bdi_for_each_wb() to take memcg IDs instead of blkcg IDs and updating bdi_split_work_to_wbs() accordingly. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..23ebb946e66f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) } struct wb_iter { - int start_blkcg_id; + int start_memcg_id; struct radix_tree_iter tree_iter; void **slot; }; @@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, WARN_ON_ONCE(!rcu_read_lock_held()); - if (iter->start_blkcg_id >= 0) { - iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); - iter->start_blkcg_id = -1; + if (iter->start_memcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); + iter->start_memcg_id = -1; } else { iter->slot = radix_tree_next_slot(iter->slot, titer, 0); } @@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, struct backing_dev_info *bdi, - int start_blkcg_id) + int start_memcg_id) { - iter->start_blkcg_id = start_blkcg_id; + iter->start_memcg_id = start_memcg_id; - if (start_blkcg_id) + if (start_memcg_id) return __wb_iter_next(iter, bdi); else return &bdi->wb; } /** - * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order * @wb_cur: cursor struct bdi_writeback pointer * @bdi: bdi to walk wb's of * @iter: pointer to struct wb_iter to be used as iteration buffer - * @start_blkcg_id: blkcg ID to start iteration from + * @start_memcg_id: memcg ID to start iteration from * * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending - * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter * to be used as temp storage during iteration. rcu_read_lock() must be * held throughout iteration. */ -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ - for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) #else /* CONFIG_CGROUP_WRITEBACK */ -- cgit v1.2.3-71-gd317 From 9acee9c551f045d2c5b5261aa587331423fd7d92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:55 -0700 Subject: kernfs: implement kernfs_path_len() Add a function to determine the path length of a kernfs node. This for now will be used by writeback tracepoint updates. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 23 +++++++++++++++++++++++ include/linux/kernfs.h | 4 ++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2d48d28e1640..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) return ret; } +/** + * kernfs_path_len - determine the length of the full path of a given node + * @kn: kernfs_node of interest + * + * The returned length doesn't include the space for the terminating '\0'. + */ +size_t kernfs_path_len(struct kernfs_node *kn) +{ + size_t len = 0; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + do { + len += strlen(kn->name) + 1; + kn = kn->parent; + } while (kn && kn->parent); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return len; +} + /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +size_t kernfs_path_len(struct kernfs_node *kn); char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } +static inline size_t kernfs_path_len(struct kernfs_node *kn) +{ return 0; } + static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } -- cgit v1.2.3-71-gd317 From 401efbf835040dd2ebca54f78d58fc8e3c51f91d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:06 -0700 Subject: blkcg: remove unnecessary request_list->blkg NULL test in blk_put_rl() Since ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg"), a request_list always has its blkg associated. Drop unnecessary rl->blkg NULL test from blk_put_rl(). Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1b62d768c7df..9711fc277c02 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -394,8 +394,7 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } -- cgit v1.2.3-71-gd317 From 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:09 -0700 Subject: blkcg: restructure blkg_policy_data allocation in blkcg_activate_policy() When a policy gets activated, it needs to allocate and install its policy data on all existing blkg's (blkcg_gq's). Because blkg iteration is protected by a spinlock, it currently counts the total number of blkg's in the system, allocates the matching number of policy data on a list and installs them during a single iteration. This can be simplified by using speculative GFP_NOWAIT allocations while iterating and falling back to a preallocated policy data on failure. If the preallocated one has already been consumed, it releases the lock, preallocate with GFP_KERNEL and then restarts the iteration. This can be a bit more expensive than before but policy activation is a very cold path and shouldn't matter. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 55 ++++++++++++++++++---------------------------- include/linux/blk-cgroup.h | 3 --- 2 files changed, 21 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f91a4e09e0c9..9e9b0df339ee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1047,65 +1047,52 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + kfree(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9711fc277c02..db822880242a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -80,9 +80,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* -- cgit v1.2.3-71-gd317 From 001bea73e70efdf48a9e00188cf302f6b6aed2bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:11 -0700 Subject: blkcg: replace blkcg_policy->pd_size with ->pd_alloc/free_fn() methods A blkg (blkcg_gq) represents the relationship between a cgroup and request_queue. Each active policy has a pd (blkg_policy_data) on each blkg. The pd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->pd_size larger than the size of pd. This is a bit unusual but was done this way mostly to simplify error handling and all the existing use cases could be handled this way; however, this is becoming too restrictive now that percpu memory can be allocated without blocking. This introduces two new mandatory blkcg_policy methods - pd_alloc_fn() and pd_free_fn() - which are used to allocate and release pd for a given policy. As pd allocation is now done from policy side, it can simply allocate a larger area which embeds pd at the beginning. This change makes ->pd_size pointless. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 21 +++++++++++---------- block/blk-throttle.c | 13 ++++++++++++- block/cfq-iosched.c | 13 ++++++++++++- include/linux/blk-cgroup.h | 18 +++++++++--------- 4 files changed, 44 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4defbbabc0ff..d1bc6099bd1e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -68,7 +68,8 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); @@ -114,7 +115,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -1057,7 +1058,7 @@ int blkcg_activate_policy(struct request_queue *q, blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { - pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); if (!pd_prealloc) { ret = -ENOMEM; goto out_bypass_end; @@ -1072,7 +1073,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1093,7 +1094,8 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: blk_queue_bypass_end(q); - kfree(pd_prealloc); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1128,8 +1130,10 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1151,9 +1155,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518ac7..f1dd691c5359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -403,6 +403,11 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq) del_timer_sync(&sq->pending_timer); } +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct throtl_grp), gfp, node); +} + static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -493,6 +498,11 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) throtl_service_queue_exit(&tg->service_queue); } +static void throtl_pd_free(struct blkg_policy_data *pd) +{ + kfree(pd); +} + static void throtl_pd_reset_stats(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -1468,12 +1478,13 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), .cftypes = throtl_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_exit_fn = throtl_pd_exit, + .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9c9ec7cc9f99..69ce2883099e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1582,6 +1582,11 @@ static void cfq_cpd_init(const struct blkcg *blkcg) } } +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct cfq_group), gfp, node); +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1618,6 +1623,11 @@ static void cfq_pd_offline(struct blkcg_gq *blkg) cfqg_stats_xfer_dead(cfqg); } +static void cfq_pd_free(struct blkg_policy_data *pd) +{ + return kfree(pd); +} + /* offset delta from cfqg->stats to cfqg->dead_stats */ static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - offsetof(struct cfq_group, stats); @@ -4633,13 +4643,14 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, .cpd_init_fn = cfq_cpd_init, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db822880242a..bd173ea360ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -68,13 +68,11 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ @@ -126,16 +124,16 @@ struct blkcg_gq { }; typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; /* policy specific per-blkcg data size */ size_t cpd_size; /* cgroup files for the policy */ @@ -143,10 +141,12 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3-71-gd317 From b2ce2643cc705aa9043642d7b6248ccfd8e20629 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:13 -0700 Subject: blk-throttle: clean up blkg_policy_data alloc/init/exit/free methods With the recent addition of alloc and free methods, things became messier. This patch reorganizes them according to the followings. * ->pd_alloc_fn() Responsible for allocation and static initializations - the ones which can be done independent of where the pd might be attached. * ->pd_init_fn() Initializations which require the knowledge of where the pd is attached. * ->pd_free_fn() The counter part of pd_alloc_fn(). Static de-init and freeing. This leaves ->pd_exit_fn() without any users. Removed. While at it, collapse an one liner function throtl_pd_exit(), which has only one user, into its user. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 --------- block/blk-throttle.c | 57 ++++++++++++++++------------------------------ block/cfq-iosched.c | 15 ++++++++---- include/linux/blk-cgroup.h | 2 -- 4 files changed, 31 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d1bc6099bd1e..acfb09af58a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -402,15 +402,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -1127,8 +1118,6 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_offline_fn) pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); if (blkg->pd[pol->plid]) { pol->pd_free_fn(blkg->pd[pol->plid]); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3c869768cfdd..c3a235b8ec7e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -330,26 +330,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) -{ - del_timer_sync(&sq->pending_timer); -} - static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int cpu; + int rw, cpu; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) @@ -361,6 +354,19 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return NULL; } + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + for_each_possible_cpu(cpu) { struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); @@ -375,8 +381,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -391,25 +396,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; } /* @@ -436,17 +426,11 @@ static void throtl_pd_online(struct blkcg_gq *blkg) tg_update_has_rules(blkg_to_tg(blkg)); } -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - - throtl_service_queue_exit(&tg->service_queue); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); + del_timer_sync(&tg->service_queue.pending_timer); free_percpu(tg->stats_cpu); kfree(tg); } @@ -1421,7 +1405,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; @@ -1616,7 +1599,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69ce2883099e..4b795c7250ef 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1584,7 +1584,17 @@ static void cfq_cpd_init(const struct blkcg *blkcg) static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { - return kzalloc_node(sizeof(struct cfq_group), gfp, node); + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; + + cfq_init_cfqg_base(cfqg); + cfqg_stats_init(&cfqg->stats); + cfqg_stats_init(&cfqg->dead_stats); + + return &cfqg->pd; } static void cfq_pd_init(struct blkcg_gq *blkg) @@ -1592,11 +1602,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg) struct cfq_group *cfqg = blkg_to_cfqg(blkg); struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); - cfq_init_cfqg_base(cfqg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } static void cfq_pd_offline(struct blkcg_gq *blkg) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bd173ea360ce..9879469b1b38 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -128,7 +128,6 @@ typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -145,7 +144,6 @@ struct blkcg_policy { blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; -- cgit v1.2.3-71-gd317 From a9520cd6f2ac1fbbf206b915946534c6dddbaae2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:14 -0700 Subject: blkcg: make blkcg_policy methods take a pointer to blkcg_policy_data The newly added ->pd_alloc_fn() and ->pd_free_fn() deal with pd (blkg_policy_data) while the older ones use blkg (blkcg_gq). As using blkg doesn't make sense for ->pd_alloc_fn() and after allocation pd can always be mapped to blkg and given that these are policy-specific methods, it makes sense to converge on pd. This patch makes all methods deal with pd instead of blkg. Most conversions are trivial. In blk-cgroup.c, a couple method invocation sites now test whether pd exists instead of policy state for consistency. This shouldn't cause any behavioral differences. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 18 ++++++++---------- block/blk-throttle.c | 13 +++++++------ block/cfq-iosched.c | 14 +++++++------- include/linux/blk-cgroup.h | 8 ++++---- 4 files changed, 26 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index acfb09af58a5..8343450cffe2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -242,7 +242,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -256,7 +256,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -347,7 +347,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } blkg->online = false; @@ -468,9 +468,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -1076,7 +1075,7 @@ pd_prealloc: pd->blkg = blkg; pd->plid = pol->plid; if (pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); @@ -1116,10 +1115,9 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c3a235b8ec7e..c2c75477a6b2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -377,9 +377,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; @@ -417,13 +418,13 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); + tg_update_has_rules(pd_to_tg(pd)); } static void throtl_pd_free(struct blkg_policy_data *pd) @@ -435,9 +436,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) kfree(tg); } -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) +static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); int cpu; for_each_possible_cpu(cpu) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4b795c7250ef..95e6b0cbaa84 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1597,18 +1597,18 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return &cfqg->pd; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_pd_init(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); int i; for (i = 0; i < IOPRIO_BE_NR; i++) { @@ -1661,9 +1661,9 @@ static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data * return a; } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); cfqg_stats_reset(&cfqg->dead_stats); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9879469b1b38..ddd4b8b252c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -125,11 +125,11 @@ struct blkcg_gq { typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; -- cgit v1.2.3-71-gd317 From 814376483e7d85b69a70634633f1f9d01c6ee0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:15 -0700 Subject: blkcg: minor updates around blkcg_policy_data * Rename blkcg->pd[] to blkcg->cpd[] so that cpd is consistently used for blkcg_policy_data. * Make blkcg_policy->cpd_init_fn() take blkcg_policy_data instead of blkcg. This makes it consistent with blkg_policy_data methods and to-be-added cpd alloc/free methods. * blkcg_policy_data->blkcg and cpd_to_blkcg() added so that cpd_init_fn() can determine the associated blkcg from blkcg_policy_data. v2: blkcg_policy_data->blkcg initializations were missing. Added. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 24 +++++++++++++----------- block/cfq-iosched.c | 11 +++++------ include/linux/blk-cgroup.h | 14 ++++++++++---- 3 files changed, 28 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8343450cffe2..247c42c8c83b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -821,7 +821,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); kfree(blkcg); } @@ -857,15 +857,16 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!pol || !pol->cpd_size) continue; - BUG_ON(blkcg->pd[i]); + BUG_ON(blkcg->cpd[i]); cpd = kzalloc(pol->cpd_size, GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -881,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1168,9 +1169,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } @@ -1186,8 +1188,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) err_free_cpds: if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; @@ -1222,8 +1224,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 95e6b0cbaa84..dd6ea9ee6245 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -220,7 +220,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -612,7 +612,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -1568,12 +1568,11 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } -static void cfq_cpd_init(const struct blkcg *blkcg) +static void cfq_cpd_init(struct blkcg_policy_data *cpd) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - if (blkcg == &blkcg_root) { + if (cpd_to_blkcg(cpd) == &blkcg_root) { cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; } else { diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index ddd4b8b252c7..7988d4749fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK @@ -88,7 +88,8 @@ struct blkg_policy_data { * each policy handle per-blkcg data. */ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,7 +124,7 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -243,7 +244,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -257,6 +258,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest -- cgit v1.2.3-71-gd317 From e4a9bde9589fdc51283755cdd75d47b27ca7c6fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:16 -0700 Subject: blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods Each active policy has a cpd (blkcg_policy_data) on each blkcg. The cpd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->cpd_size larger than the size of cpd. This is a bit unusual but blkg (blkcg_gq) policy data used to be handled this way too so it made sense to be consistent; however, blkg policy data switched to alloc/free callbacks. This patch makes similar changes to cpd handling. blkcg_policy->cpd_alloc/free_fn() are added to replace ->cpd_size. As cpd allocation is now done from policy side, it can simply allocate a larger area which embeds cpd at the beginning. As ->cpd_alloc_fn() may be able to perform all necessary initializations, this patch makes ->cpd_init_fn() optional. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 39 ++++++++++++++++++++++++--------------- block/cfq-iosched.c | 19 ++++++++++++++++++- include/linux/blk-cgroup.h | 17 ++++++++++------- 3 files changed, 52 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 247c42c8c83b..2b4354b6b5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -817,11 +817,15 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); + kfree(blkcg); } @@ -854,11 +858,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->cpd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; @@ -866,7 +869,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(cpd); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -882,7 +886,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1159,11 +1164,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; @@ -1186,10 +1191,12 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1222,10 +1229,12 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dd6ea9ee6245..a4429b366820 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1568,6 +1568,16 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) +{ + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); @@ -1581,6 +1591,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd) } } +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -4649,10 +4664,12 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7988d4749fff..15f2382bc723 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -81,11 +81,11 @@ struct blkg_policy_data { }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ @@ -124,7 +124,9 @@ struct blkcg_gq { struct rcu_head rcu_head; }; +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -134,13 +136,14 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ struct cftype *cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; -- cgit v1.2.3-71-gd317 From 24f290466f79a6497f1654f64b9a841872cba3ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:17 -0700 Subject: blkcg: inline [__]blkg_lookup() blkg_lookup() checks whether the target queue is bypassing and, if not, calls __blkg_lookup() which first checks the lookup hint and then performs radix tree walk. The operations upto hint checking are trivial and there are many users of this function. This patch inlines blkg_lookup() and the fast path part of __blkg_lookup(). The radix tree lookup and hint update are now in blkg_lookup_slowpath(). This will help consolidating blkg handling by easing moving root blkcg short-circuit to inlined lookup fast path. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 38 ++--------------------------------- include/linux/blk-cgroup.h | 49 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2b4354b6b5de..52e637ed9d7e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -131,26 +131,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -169,25 +154,6 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 15f2382bc723..d5b54aa50582 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -155,7 +155,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -231,6 +232,49 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. + */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -313,9 +357,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant -- cgit v1.2.3-71-gd317 From 85b6bc9db6d5ab6980b43c38b5cbd11d24414ce4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:18 -0700 Subject: blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() Currently, both throttle and cfq policies implement their own root blkg (blkcg_gq) lookup fast path. This patch moves root blkg optimization from throtl_lookup_tg() to __blkg_lookup(). cfq-iosched currently doesn't use blkg_lookup() but will be converted and drop the optimization too. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 ------- include/linux/blk-cgroup.h | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c2c75477a6b2..1f63fc834dc3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -452,13 +452,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkcg *blkcg) { - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d5b54aa50582..0609bce69f68 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -249,6 +249,9 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; + if (blkcg == &blkcg_root) + return q->root_blkg; + blkg = rcu_dereference(blkcg->blkg_hint); if (blkg && blkg->q == q) return blkg; -- cgit v1.2.3-71-gd317 From ae11889636111199dbcf47283b4167f578b69472 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:20 -0700 Subject: blkcg: consolidate blkg creation in blkcg_bio_issue_check() blkg (blkcg_gq) currently is created by blkcg policies invoking blkg_lookup_create() which ends up repeating about the same code in different policies. Theoretically, this can avoid the overhead of looking and/or creating blkg's if blkcg is enabled but no policy is in use; however, the cost of blkg lookup / creation is very low especially if only the root blkcg is in use which is highly likely if no blkcg policy is in active use - it boils down to a single very predictable conditional and surrounding RCU protection. This patch consolidates blkg creation to a new function blkcg_bio_issue_check() which is called during bio issue from generic_make_request_checks(). blkcg_bio_issue_check() is now the only function which tries to create missing blkg's. The subsequent policy and request_list operations just perform blkg_lookup() and if missing falls back to the root. * blk_get_rl() no longer tries to create blkg. It uses blkg_lookup() instead of blkg_lookup_create(). * blk_throtl_bio() is now called from blkcg_bio_issue_check() with rcu read locked and blkg already looked up. Both throtl_lookup_tg() and throtl_lookup_create_tg() are dropped. * cfq is similarly updated. cfq_lookup_create_cfqg() is replaced with cfq_lookup_cfqg()which uses blkg_lookup(). This consolidates blkg handling and avoids unnecessary blkg creation retries under memory pressure. In addition, this provides a common bio entry point into blkcg where things like common accounting can be performed. v2: Build fixes for !CONFIG_CFQ_GROUP_IOSCHED and !CONFIG_BLK_DEV_THROTTLING. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 +-- block/blk-throttle.c | 72 ++++------------------------------------------ block/blk.h | 5 ---- block/cfq-iosched.c | 33 +++++++-------------- include/linux/blk-cgroup.h | 40 ++++++++++++++++++++++++-- 6 files changed, 57 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52e637ed9d7e..097c4a670fa4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -153,6 +153,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, return NULL; } +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as @@ -295,7 +296,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { diff --git a/block/blk-core.c b/block/blk-core.c index 627ed0c593fb..140094f4eaca 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1889,8 +1889,8 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 900a777e01c2..29c22ed4b073 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -182,11 +182,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -449,39 +444,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) } } -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); -} - -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else - tg = td_root_tg(td); - } - - return tg; -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -1403,46 +1365,26 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_reset_stats_fn = throtl_pd_reset_stats, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_bypass(q))) goto out_unlock; - tg = throtl_lookup_create_tg(td, blkcg); sq = &tg->service_queue; while (true) { @@ -1507,8 +1449,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we diff --git a/block/blk.h b/block/blk.h index 026d9594142b..d905b26fbff5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,15 +266,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a4429b366820..0994f3b523a8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1683,28 +1683,15 @@ static void cfq_pd_reset_stats(struct blkg_policy_data *pd) cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. - */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) @@ -2108,8 +2095,8 @@ static struct cftype cfq_blkcg_files[] = { { } /* terminate */ }; #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -3716,7 +3703,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0609bce69f68..4d1659c7f84b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -421,8 +421,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -636,6 +636,39 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, u64_stats_update_end(&to->syncp); } +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + rcu_read_unlock(); + return !throtl; +} + #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -689,6 +722,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -- cgit v1.2.3-71-gd317 From e6269c44546755094979ab53609e6e203a68c8ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:21 -0700 Subject: blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it cgroup stats are local to each cgroup and doesn't propagate to ancestors by default. When recursive stats are necessary, the sum is calculated over all the descendants. This initially was for backward compatibility to support both group-local and recursive stats but this mode of operation makes general sense as stat update is much hotter thafn reporting those stats. This however ends up losing recursive stats when a child is removed. To work around this, cfq-iosched adds its stats to its parent cfq_group->dead_stats which is summed up together when calculating recursive stats. It's planned that the core stats will be moved to blkcg_gq, so we want to move the mechanism for keeping track of the stats of dead children from cfq to blkcg core. This patch adds blkg_[rw]stat->aux_cnt which are atomic64_t's keeping track of auxiliary counts which are excluded when reading local counts but included for recursive. blkg_[rw]stat_merge() which were used by cfq to implement dead_stats are replaced by blkg_[rw]stat_add_aux(), and cfq now forwards stats of a dead cgroup to the aux counts of parent->stats instead of separate ->dead_stats. This will also help making blkg_[rw]stats per-cpu. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++--- block/cfq-iosched.c | 67 +++++++++++++--------------------------------- include/linux/blk-cgroup.h | 46 ++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 097c4a670fa4..ff79b52d1a0e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -584,7 +584,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); * @off: offset to the blkg_stat in @pd * * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests. */ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) @@ -602,7 +602,8 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkg_stat *stat = (void *)pos_pd + off; if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -616,7 +617,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * @off: offset to the blkg_stat in @pd * * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests. */ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, @@ -642,7 +643,8 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + sum.cnt[i] += tmp.cnt[i] + + atomic64_read(&rwstat->aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0994f3b523a8..b272cfff7364 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -304,7 +304,6 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ /* async queue for each priority case */ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; @@ -736,28 +735,28 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); + blkg_rwstat_add_aux(&to->serviced, &from->serviced); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -770,10 +769,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -1606,7 +1603,6 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) cfq_init_cfqg_base(cfqg); cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); return &cfqg->pd; } @@ -1649,38 +1645,11 @@ static void cfq_pd_free(struct blkg_policy_data *pd) return kfree(pd); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; - - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; -} - static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, @@ -1883,7 +1852,7 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); + u64 sum = blkg_stat_recursive_sum(pd, off); return __blkg_prfill_u64(sf, pd, sum); } @@ -1891,7 +1860,7 @@ static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4d1659c7f84b..e8092276af58 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,14 +53,20 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ struct blkg_stat { struct u64_stats_sync syncp; uint64_t cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { struct u64_stats_sync syncp; uint64_t cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -483,6 +489,7 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, static inline void blkg_stat_init(struct blkg_stat *stat) { u64_stats_init(&stat->syncp); + atomic64_set(&stat->aux_cnt, 0); } /** @@ -504,8 +511,9 @@ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. + * Read the current value of @stat. The returned value doesn't include the + * aux count. This function can be called without synchroniztion and takes + * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { @@ -527,23 +535,31 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) static inline void blkg_stat_reset(struct blkg_stat *stat) { stat->cnt = 0; + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) { + int i; + u64_stats_init(&rwstat->syncp); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** @@ -614,26 +630,30 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { + int i; + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); } #ifdef CONFIG_BLK_DEV_THROTTLING -- cgit v1.2.3-71-gd317 From 24bdb8ef068ebdc2a57ce715f0ab22d5da32832a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:22 -0700 Subject: blkcg: make blkcg_[rw]stat per-cpu blkcg_[rw]stat are used as stat counters for blkcg policies. It isn't per-cpu by itself and blk-throttle makes it per-cpu by wrapping around it. This patch makes blkcg_[rw]stat per-cpu and drop the ad-hoc per-cpu wrapping in blk-throttle. * blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct percpu_counter. This makes syncp unnecessary as remote accesses are handled by percpu_counter itself. * blkg_[rw]stat_init() can now fail due to percpu allocation failure and thus are updated to return int. * percpu_counters need explicit freeing. blkg_[rw]stat_exit() added. * As blkg_rwstat->cpu_cnt[] can't be read directly anymore, reading and summing results are stored in ->aux_cnt[] instead. * Custom per-cpu stat implementation in blk-throttle is removed. This makes all blkcg stat counters per-cpu without complicating policy implmentations. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++-- block/blk-throttle.c | 89 +++++++++++---------------------- block/cfq-iosched.c | 70 +++++++++++++++++++------- include/linux/blk-cgroup.h | 120 +++++++++++++++++++++++++-------------------- 4 files changed, 153 insertions(+), 136 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff79b52d1a0e..02a2d029b5a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -539,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -643,8 +644,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i] + - atomic64_read(&rwstat->aux_cnt[i]); + atomic64_add(atomic64_read(&tmp.aux_cnt[i]) + + atomic64_read(&rwstat->aux_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 29c22ed4b073..c0b2263a222a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -142,8 +134,10 @@ struct throtl_grp { unsigned long slice_start[2]; unsigned long slice_end[2]; - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; }; struct throtl_data @@ -337,17 +331,15 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int rw, cpu; + int rw; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - return NULL; + goto err; - tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp); - if (!tg->stats_cpu) { - kfree(tg); - return NULL; - } + if (blkg_rwstat_init(&tg->service_bytes, gfp) || + blkg_rwstat_init(&tg->serviced, gfp)) + goto err_free_tg; throtl_service_queue_init(&tg->service_queue); @@ -362,14 +354,14 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[READ] = -1; tg->iops[WRITE] = -1; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_init(&stats_cpu->service_bytes); - blkg_rwstat_init(&stats_cpu->serviced); - } - return &tg->pd; + +err_free_tg: + blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); + kfree(tg); +err: + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -427,21 +419,17 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - free_percpu(tg->stats_cpu); + blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); - int cpu; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } + blkg_rwstat_reset(&tg->service_bytes); + blkg_rwstat_reset(&tg->serviced); } static struct throtl_grp * @@ -855,7 +843,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, int rw) { struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; unsigned long flags; /* @@ -865,10 +852,8 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, */ local_irq_save(flags); - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); + blkg_rwstat_add(&tg->serviced, rw, 1); + blkg_rwstat_add(&tg->service_bytes, rw, bytes); local_irq_restore(flags); } @@ -1176,27 +1161,9 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) +static int tg_print_rwstat(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1337,13 +1304,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, service_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, serviced), + .seq_show = tg_print_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b272cfff7364..71e55c91ee98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1542,27 +1542,55 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->service_bytes); + blkg_rwstat_exit(&stats->serviced); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); + blkg_stat_exit(&stats->sectors); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} + +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->service_bytes, gfp) || + blkg_rwstat_init(&stats->serviced, gfp) || + blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + + blkg_stat_init(&stats->sectors, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) @@ -1602,7 +1630,10 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return NULL; cfq_init_cfqg_base(cfqg); - cfqg_stats_init(&cfqg->stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } return &cfqg->pd; } @@ -1642,7 +1673,10 @@ static void cfq_pd_offline(struct blkg_policy_data *pd) static void cfq_pd_free(struct blkg_policy_data *pd) { - return kfree(pd); + struct cfq_group *cfqg = pd_to_cfqg(pd); + + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } static void cfq_pd_reset_stats(struct blkg_policy_data *pd) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e8092276af58..fdc7ac08b1ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include -#include +#include #include #include #include #include +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -55,17 +58,16 @@ struct blkcg { /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations. */ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; @@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. */ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. The returned value doesn't include the - * aux count. This function can be called without synchroniztion and takes - * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } @@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to, &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { - int i; + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} - u64_stats_init(&rwstat->syncp); +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&rwstat->aux_cnt[i], 0); + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. + * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + struct blkg_rwstat result; + int i; - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** @@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } -- cgit v1.2.3-71-gd317 From f12c74cab1635d67077ce8cc40da88b57980f637 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:23 -0700 Subject: blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq Currently, blkg_[rw]stat_recursive_sum() assume that the target counter is located in pd (blkg_policy_data); however, some counters are planned to be moved to blkg (blkcg_gq). This patch updates blkg_[rw]stat_recursive_sum() to take blkg and blkg_policy pointers instead of pd. If policy is NULL, it indexes into blkg. If non-NULL, into the blkg's pd of the policy. The existing usages are updated to maintain the current behaviors. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 69 ++++++++++++++++++++++++++++------------------ block/cfq-iosched.c | 8 +++--- include/linux/blk-cgroup.h | 7 +++-- 3 files changed, 50 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02a2d029b5a5..b26320720a3c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -581,30 +581,39 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; - if (pos_blkg->online) - sum += blkg_stat_read(stat) + - atomic64_read(&stat->aux_cnt); + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -614,33 +623,39 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat, tmp; if (!pos_blkg->online) continue; + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 71e55c91ee98..e861cc1ebd62 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1886,16 +1886,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_stat_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index fdc7ac08b1ce..4630ce8f9425 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -191,9 +191,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; -- cgit v1.2.3-71-gd317 From 77ea733884eb5520f22c36def1309fe2ab61633e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:24 -0700 Subject: blkcg: move io_service_bytes and io_serviced stats into blkcg_gq Currently, both cfq-iosched and blk-throttle keep track of io_service_bytes and io_serviced stats. While keeping track of them separately may be useful during development, it doesn't make much sense otherwise. Also, blk-throttle was counting bio's as IOs while cfq-iosched request's, which is more confusing than informative. This patch adds ->stat_bytes and ->stat_ios to blkg (blkcg_gq), removes the counterparts from cfq-iosched and blk-throttle and let them print from the common blkg counters. The common counters are incremented during bio issue in blkcg_bio_issue_check(). The outputs are still filtered by whether the policy has blkg_policy_data on a given blkg, so cfq's output won't show up if it has never been used for a given blkg. The only times when the outputs would differ significantly are when policies are attached on the fly or elevators are switched back and forth. Those are quite exceptional operations and I don't think they warrant keeping separate counters. v3: Update blkio-controller.txt accordingly. v2: Account IOs during bio issues instead of request completions so that bio-based drivers can be handled the same way. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 24 ++------ block/blk-cgroup.c | 98 ++++++++++++++++++++++++++++++ block/blk-throttle.c | 73 ++-------------------- block/cfq-iosched.c | 32 +++------- include/linux/blk-cgroup.h | 14 +++++ 5 files changed, 133 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 68b6a6a470b0..12686bec37b9 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -201,7 +201,7 @@ Proportional weight policy files specifies the number of bytes. - blkio.io_serviced - - Number of IOs completed to/from the disk by the group. These + - Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field @@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is subjected to both the constraints. - blkio.throttle.io_serviced - - Number of IOs (bio) completed to/from the disk by the group (as - seen by throttling policy). These are further divided by the type - of operation - read or write, sync or async. First two fields specify - the major and minor number of the device, third field specifies the - operation type and the fourth field specifies the number of IOs. - - blkio.io_serviced does accounting as seen by CFQ and counts are in - number of requests (struct request). On the other hand, - blkio.throttle.io_serviced counts number of IO in terms of number - of bios as seen by throttling policy. These bios can later be - merged by elevator and total number of requests completed can be - lesser. + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. - blkio.throttle.io_service_bytes - Number of bytes transferred to/from the disk by the group. These @@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is device, third field specifies the operation type and the fourth field specifies the number of bytes. - These numbers should roughly be same as blkio.io_service_bytes as - updated by CFQ. The difference between two is that - blkio.io_service_bytes will not be updated if CFQ is not operating - on request queue. - Common files among various policies ----------------------------------- - blkio.reset_stats diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b26320720a3c..a25263ca39ca 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -73,6 +73,9 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -95,6 +98,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -300,6 +307,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -315,6 +323,12 @@ static void blkg_destroy(struct blkcg_gq *blkg) if (blkg->pd[i] && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -431,6 +445,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -579,6 +596,87 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat * @blkg: blkg of interest diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c0b2263a222a..bd3e4b2c7a9d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -133,11 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; }; struct throtl_data @@ -335,11 +330,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - goto err; - - if (blkg_rwstat_init(&tg->service_bytes, gfp) || - blkg_rwstat_init(&tg->serviced, gfp)) - goto err_free_tg; + return NULL; throtl_service_queue_init(&tg->service_queue); @@ -355,13 +346,6 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[WRITE] = -1; return &tg->pd; - -err_free_tg: - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); - kfree(tg); -err: - return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -419,19 +403,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } -static void throtl_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - blkg_rwstat_reset(&tg->service_bytes); - blkg_rwstat_reset(&tg->serviced); -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -839,25 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - blkg_rwstat_add(&tg->serviced, rw, 1); - blkg_rwstat_add(&tg->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -871,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1161,13 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static int tg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1304,13 +1244,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct throtl_grp, service_bytes), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct throtl_grp, serviced), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, }, { } /* terminate */ }; @@ -1329,7 +1269,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_free_fn = throtl_pd_free, - .pd_reset_stats_fn = throtl_pd_reset_stats, }; bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e861cc1ebd62..a948d4df3fc3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -696,8 +692,6 @@ static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, uint64_t bytes, int rw) { blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, @@ -717,8 +711,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -738,8 +730,6 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); @@ -1544,8 +1534,6 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) #ifdef CONFIG_CFQ_GROUP_IOSCHED static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -1566,9 +1554,7 @@ static void cfqg_stats_exit(struct cfqg_stats *stats) static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || @@ -1994,13 +1980,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2036,13 +2022,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4630ce8f9425..286e1bde249f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -127,6 +127,9 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; @@ -190,6 +193,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); @@ -700,6 +707,13 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + rcu_read_unlock(); return !throtl; } -- cgit v1.2.3-71-gd317 From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:29 -0700 Subject: blkcg: rename subsystem name from blkio to io blkio interface has become messy over time and is currently the largest. In addition to the inconsistent naming scheme, it has multiple stat files which report more or less the same thing, a number of debug stat files which expose internal details which shouldn't have been part of the public interface in the first place, recursive and non-recursive stats and leaf and non-leaf knobs. Both recursive vs. non-recursive and leaf vs. non-leaf distinctions don't make any sense on the unified hierarchy as only leaf cgroups can contain processes. cgroups is going through a major interface revision with the unified hierarchy involving significant fundamental usage changes and given that a significant portion of the interface doesn't make sense anymore, it's a good time to reorganize the interface. As the first step, this patch renames the external visible subsystem name from "blkio" to "io". This is more concise, matches the other two major subsystem names, "cpu" and "memory", and better suited as blkcg will be involved in anything writeback related too whether an actual block device is involved or not. As the subsystem legacy_name is set to "blkio", the only userland visible change outside the unified hierarchy is that blkcg is reported as "io" instead of "blkio" in the subsystem initialized message during boot. On the unified hierarchy, blkcg now appears as "io". Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.c | 7 ++++--- include/linux/backing-dev.h | 2 +- include/linux/blk-cgroup.h | 4 ++-- include/linux/cgroup_subsys.h | 2 +- mm/backing-dev.c | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index d6e5ba3399f0..c52222c6c69b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2046,7 +2046,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0af3bff198ed..fc197ea4c992 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1089,12 +1089,13 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, .legacy_cftypes = blkcg_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1104,7 +1105,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1266,7 +1267,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 23ebb946e66f..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 286e1bde249f..db89acd2a864 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..86b5056104df 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -16,7 +16,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..d0ee90e72867 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); blkcg_cgwb_list = &blkcg->cgwb_list; @@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &blkio_cgrp_subsys); + &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; -- cgit v1.2.3-71-gd317 From 880f50e228f80626dff6327a6e281e40286f5228 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:30 -0700 Subject: blkcg: mark existing cftypes as legacy blkcg is about to grow interface for the unified hierarchy. Add legacy to existing cftypes. * blkcg_policy->cftypes -> blkcg_policy->legacy_cftypes * blk-cgroup.c:blkcg_files -> blkcg_legacy_files * cfq-iosched.c:cfq_blkcg_files -> cfq_blkcg_legacy_files * blk-throttle.c:throtl_files -> throtl_legacy_files Pure renames. No functional change. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++++------ block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 4 ++-- include/linux/blk-cgroup.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc197ea4c992..429726bed560 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -847,7 +847,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); -struct cftype blkcg_files[] = { +struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, @@ -1094,7 +1094,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* @@ -1266,9 +1266,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) + if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, - pol->cftypes)); + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; @@ -1305,8 +1305,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bd3e4b2c7a9d..8b4f6b81bb72 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1217,7 +1217,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1263,7 +1263,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 38277e3ce286..baa8459f7064 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1944,7 +1944,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -4654,7 +4654,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db89acd2a864..6e016e6fee87 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,7 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; -- cgit v1.2.3-71-gd317 From 36aa9e5f591e84d67aad2c5bff75e413d77660dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:31 -0700 Subject: blkcg: move body parsing from blkg_conf_prep() to its callers Currently, blkg_conf_prep() expects input to be of the following form MAJ:MIN NUM and reads the NUM part into blkg_conf_ctx->v. This is quite restrictive and gets in the way in implementing blkcg interface for the unified hierarchy. This patch updates blkg_conf_prep() so that it expects MAJ:MIN BODY_STR where BODY_STR is an arbitrary string. blkg_conf_ctx->v is replaced with ->body which is a char pointer pointing to the start of BODY_STR. Parsing of the body is moved to blkg_conf_prep()'s callers. To allow using, for example, strsep() on blkg_conf_ctx->val, it is a non-const pointer and to accommodate that const is dropped from @input too. This doesn't cause any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++++-------- block/blk-throttle.c | 18 ++++++++++++------ block/cfq-iosched.c | 17 +++++++++++------ include/linux/blk-cgroup.h | 4 ++-- 4 files changed, 39 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 429726bed560..8ee1ca4d4f2f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "blk.h" @@ -773,23 +774,28 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) return -ENODEV; @@ -826,7 +832,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8b4f6b81bb72..0e17c8f92d25 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1154,21 +1154,25 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + tg = blkg_to_tg(ctx.blkg); sq = &tg->service_queue; - if (!ctx.v) - ctx.v = -1; - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; + *(u64 *)((void *)tg + of_cft(of)->private) = v; else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1201,8 +1205,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, throtl_schedule_next_dispatch(sq->parent_sq, true); } + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baa8459f7064..ea88d8905bbd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1747,26 +1747,31 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6e016e6fee87..85a4d989ae43 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -206,11 +206,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -- cgit v1.2.3-71-gd317 From dd165eb3bb4ef16bcdb75417add40633f38c52b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:33 -0700 Subject: blkcg: misc preparations for unified hierarchy interface * Export blkg_dev_name() * Drop unnecessary @cft from __cfq_set_weight(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- block/cfq-iosched.c | 8 ++++---- include/linux/blk-cgroup.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8ee1ca4d4f2f..b5e72d756be1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -462,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea88d8905bbd..7a7230136e43 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1788,8 +1788,8 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, return __cfqg_set_weight_device(of, buf, nbytes, off, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; @@ -1834,13 +1834,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 85a4d989ae43..b270aef519c6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -182,6 +182,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), -- cgit v1.2.3-71-gd317 From 2ee867dcfa2eaef1063b686da55c35878b2da4a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:34 -0700 Subject: blkcg: implement interface for the unified hierarchy blkcg interface grew to be the biggest of all controllers and unfortunately most inconsistent too. The interface files are inconsistent with a number of cloes duplicates. Some files have recursive variants while others don't. There's distinction between normal and leaf weights which isn't intuitive and there are a lot of stat knobs which don't make much sense outside of debugging and expose too much implementation details to userland. In the unified hierarchy, everything is always hierarchical and internal nodes can't have tasks rendering the two structural issues twisting the current interface. The interface has to be updated in a significant anyway and this is a good chance to revamp it as a whole. This patch implements blkcg interface for the unified hierarchy. * (from a previous patch) blkcg is identified by "io" instead of "blkio" on the unified hierarchy. Given that the whole interface is updated anyway, the rename shouldn't carry noticeable conversion overhead. * The original interface consisted of 27 files is replaced with the following three files. blkio.stat : per-blkcg stats blkio.weight : per-cgroup and per-cgroup-queue weight settings blkio.max : per-cgroup-queue bps and iops max limits Documentation/cgroups/unified-hierarchy.txt updated accordingly. v2: blkcg_policy->dfl_cftypes wasn't removed on blkcg_policy_unregister() corrupting the cftypes list. Fixed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 61 ++++++++++++++- block/blk-cgroup.c | 53 +++++++++++++ block/blk-throttle.c | 112 ++++++++++++++++++++++++++++ block/cfq-iosched.c | 61 +++++++++++++-- include/linux/blk-cgroup.h | 1 + 5 files changed, 279 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 1ee9caf29e57..bd1ce15d5178 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -27,7 +27,7 @@ CONTENTS 5-3-1. Format 5-3-2. Control Knobs 5-4. Per-Controller Changes - 5-4-1. blkio + 5-4-1. io 5-4-2. cpuset 5-4-3. memory 6. Planned Changes @@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or universal, and there are various other knobs which simply aren't available for tasks. -The blkio controller implicitly creates a hidden leaf node for each +The io controller implicitly creates a hidden leaf node for each cgroup to host the tasks. The hidden leaf has its own copies of all the knobs with "leaf_" prefixed. While this allows equivalent control over internal tasks, it's with serious drawbacks. It always adds an @@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified. 5-4. Per-Controller Changes -5-4-1. blkio +5-4-1. io -- blk-throttle becomes properly hierarchical. +- blkio is renamed to io. The interface is overhauled anyway. The + new name is more in line with the other two major controllers, cpu + and memory, and better suited given that it may be used for cgroup + writeback without involving block layer. + +- Everything including stat is always hierarchical making separate + recursive stat files pointless and, as no internal node can have + tasks, leaf weights are meaningless. The operation model is + simplified and the interface is overhauled accordingly. + + io.stat + + The stat file. The reported stats are from the point where + bio's are issued to request_queue. The stats are counted + independent of which policies are enabled. Each line in the + file follows the following format. More fields may later be + added at the end. + + $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS + + io.weight + + The weight setting, currently only available and effective if + cfq-iosched is in use for the target device. The weight is + between 10 and 1000 and defaults to 500. The first line + always contains the default weight in the following format to + use when per-device setting is missing. + + default $WEIGHT + + Subsequent lines list per-device weights of the following + format. + + $MAJ:$MIN $WEIGHT + + Writing "$WEIGHT" or "default $WEIGHT" changes the default + setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight + while "$MAJ:$MIN default" clears it. + + This file is available only on non-root cgroups. + + io.max + + The maximum bandwidth and/or iops setting, only available if + blk-throttle is enabled. The file is of the following format. + + $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS + + ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are + read/write IOs per second. "max" indicates no limit. Writing + to the file follows the same format but the individual + settings may be ommitted or specified in any order. + + This file is available only on non-root cgroups. 5-4-2. cpuset diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b5e72d756be1..88bdb73bd5e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -854,6 +854,53 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + +struct cftype blkcg_files[] = { + { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", @@ -1101,6 +1148,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG @@ -1273,6 +1321,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); @@ -1312,6 +1363,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a8bb2fd8f523..c75a2636dd40 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1265,6 +1265,117 @@ static struct cftype throtl_legacy_files[] = { { } /* terminate */ }; +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, + }, + { } /* terminate */ +}; + static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1273,6 +1384,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { + .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7a7230136e43..97da5719c87a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1740,7 +1740,7 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; @@ -1753,9 +1753,17 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, if (ret) return ret; - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; goto out_finish; + } cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1779,13 +1787,13 @@ out_finish: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, @@ -2103,6 +2111,48 @@ static struct cftype cfq_blkcg_legacy_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, struct blkcg *blkcg) @@ -4659,6 +4709,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { + .dfl_cftypes = cfq_blkcg_files, .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b270aef519c6..9a7c4bd45fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ + struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ -- cgit v1.2.3-71-gd317 From 69d7fde5909b614114343974cfc52cb8ff30b544 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:36 -0700 Subject: blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy cgroup is trying to make interface consistent across different controllers. For weight based resource control, the knob should have the range [1, 10000] and default to 100. This patch updates cfq-iosched so that the weight range conforms. The internal calculations have enough range and the widening of the weight range shouldn't cause any problem. * blkcg_policy->cpd_bind_fn() is added. If present, this is invoked when blkcg is attached to a hierarchy. * cfq_cpd_init() is updated to use the new default value on the unified hierarchy. * cfq_cpd_bind() callback is implemented to clear per-blkg configs and apply the default config matching the hierarchy type. * cfqd->root_group->[leaf_]weight initialization in cfq_init_queue() is moved into !CONFIG_CFQ_GROUP_IOSCHED block. cfq_cpd_bind() is now responsible for initializing the initial weights when blkcg is enabled. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 2 +- block/blk-cgroup.c | 21 +++++++++++ block/cfq-iosched.c | 55 +++++++++++++++++++++-------- include/linux/blk-cgroup.h | 2 ++ 4 files changed, 64 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index bd1ce15d5178..e0975c2cf03d 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -464,7 +464,7 @@ may be specified in any order and not all pairs have to be specified. The weight setting, currently only available and effective if cfq-iosched is in use for the target device. The weight is - between 10 and 1000 and defaults to 500. The first line + between 1 and 10000 and defaults to 100. The first line always contains the default weight in the following format to use when per-device setting is missing. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 88bdb73bd5e0..ac8370cb2515 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1143,11 +1143,32 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0fe721eaff28..04de88463a98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1522,6 +1522,9 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + static void cfqg_stats_exit(struct cfqg_stats *stats) { blkg_rwstat_exit(&stats->merged); @@ -1578,14 +1581,14 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - if (cpd_to_blkcg(cpd) == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - } else { - cgd->weight = CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL; - } + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } static void cfq_cpd_free(struct blkcg_policy_data *cpd) @@ -1593,6 +1596,19 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_cfqgd(cpd)); } +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -1742,6 +1758,8 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; @@ -1769,7 +1787,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) { + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { cfqg->dev_weight = v; cfqg->new_weight = v ?: cfqgd->weight; @@ -1797,15 +1815,17 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool is_leaf_weight) + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1826,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1842,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -2135,7 +2159,7 @@ static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { - ret = __cfq_set_weight(of_css(of), v, false); + ret = __cfq_set_weight(of_css(of), v, true, false, false); return ret ?: nbytes; } @@ -4512,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); -#endif cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4715,6 +4739,7 @@ static struct blkcg_policy blkcg_policy_cfq = { .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9a7c4bd45fff..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -138,6 +138,7 @@ struct blkcg_gq { typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -155,6 +156,7 @@ struct blkcg_policy { blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; -- cgit v1.2.3-71-gd317