cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

rdma.c (15013B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * RDMA resource limiting controller for cgroups.
      4 *
      5 * Used to allow a cgroup hierarchy to stop processes from consuming
      6 * additional RDMA resources after a certain limit is reached.
      7 *
      8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
      9 */
     10
     11#include <linux/bitops.h>
     12#include <linux/slab.h>
     13#include <linux/seq_file.h>
     14#include <linux/cgroup.h>
     15#include <linux/parser.h>
     16#include <linux/cgroup_rdma.h>
     17
/*
 * Keyword used in the limit files in place of a number; it is stored
 * internally as S32_MAX.
 */
#define RDMACG_MAX_STR "max"

/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
/* Registered rdmacg devices, linked through rdmacg_device.dev_node. */
static LIST_HEAD(rdmacg_devices);
     26
/*
 * Stored in cftype.private to tell the shared seq_show handler which
 * per-resource value to print.
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,	/* print the configured limit */
	RDMACG_RESOURCE_TYPE_STAT,	/* print the current usage */
};
     31
     32/*
     33 * resource table definition as to be seen by the user.
     34 * Need to add entries to it when more resources are
     35 * added/defined at IB verb/core layer.
     36 */
     37static char const *rdmacg_resource_names[] = {
     38	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
     39	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
     40};
     41
/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX means "max" (no limit set) */
	int usage;	/* number of charges currently accounted */
};
     47
/*
 * resource pool object which represents per cgroup, per device
 * resources. There are multiple instances of this object per cgroup,
 * therefore it cannot be embedded within rdma_cgroup structure. It
 * is maintained as list.
 */
struct rdmacg_resource_pool {
	/* device this pool accounts for */
	struct rdmacg_device	*device;
	/* one tracker per resource type */
	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];

	/* link in the owning cgroup's rpools list */
	struct list_head	cg_node;
	/* link in the device's rpools list */
	struct list_head	dev_node;

	/*
	 * total number of outstanding charges in this pool, summed over
	 * all resource types (incremented on charge, decremented on
	 * uncharge)
	 */
	u64			usage_sum;
	/* number of entries in resources[] whose max is S32_MAX */
	int			num_max_cnt;
};
     66
     67static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
     68{
     69	return container_of(css, struct rdma_cgroup, css);
     70}
     71
     72static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
     73{
     74	return css_rdmacg(cg->css.parent);
     75}
     76
     77static inline struct rdma_cgroup *get_current_rdmacg(void)
     78{
     79	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
     80}
     81
     82static void set_resource_limit(struct rdmacg_resource_pool *rpool,
     83			       int index, int new_max)
     84{
     85	if (new_max == S32_MAX) {
     86		if (rpool->resources[index].max != S32_MAX)
     87			rpool->num_max_cnt++;
     88	} else {
     89		if (rpool->resources[index].max == S32_MAX)
     90			rpool->num_max_cnt--;
     91	}
     92	rpool->resources[index].max = new_max;
     93}
     94
     95static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
     96{
     97	int i;
     98
     99	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
    100		set_resource_limit(rpool, i, S32_MAX);
    101}
    102
    103static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
    104{
    105	lockdep_assert_held(&rdmacg_mutex);
    106
    107	list_del(&rpool->cg_node);
    108	list_del(&rpool->dev_node);
    109	kfree(rpool);
    110}
    111
    112static struct rdmacg_resource_pool *
    113find_cg_rpool_locked(struct rdma_cgroup *cg,
    114		     struct rdmacg_device *device)
    115
    116{
    117	struct rdmacg_resource_pool *pool;
    118
    119	lockdep_assert_held(&rdmacg_mutex);
    120
    121	list_for_each_entry(pool, &cg->rpools, cg_node)
    122		if (pool->device == device)
    123			return pool;
    124
    125	return NULL;
    126}
    127
    128static struct rdmacg_resource_pool *
    129get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
    130{
    131	struct rdmacg_resource_pool *rpool;
    132
    133	rpool = find_cg_rpool_locked(cg, device);
    134	if (rpool)
    135		return rpool;
    136
    137	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
    138	if (!rpool)
    139		return ERR_PTR(-ENOMEM);
    140
    141	rpool->device = device;
    142	set_all_resource_max_limit(rpool);
    143
    144	INIT_LIST_HEAD(&rpool->cg_node);
    145	INIT_LIST_HEAD(&rpool->dev_node);
    146	list_add_tail(&rpool->cg_node, &cg->rpools);
    147	list_add_tail(&rpool->dev_node, &device->rpools);
    148	return rpool;
    149}
    150
    151/**
    152 * uncharge_cg_locked - uncharge resource for rdma cgroup
    153 * @cg: pointer to cg to uncharge and all parents in hierarchy
    154 * @device: pointer to rdmacg device
    155 * @index: index of the resource to uncharge in cg (resource pool)
    156 *
    157 * It also frees the resource pool which was created as part of
    158 * charging operation when there are no resources attached to
    159 * resource pool.
    160 */
    161static void
    162uncharge_cg_locked(struct rdma_cgroup *cg,
    163		   struct rdmacg_device *device,
    164		   enum rdmacg_resource_type index)
    165{
    166	struct rdmacg_resource_pool *rpool;
    167
    168	rpool = find_cg_rpool_locked(cg, device);
    169
    170	/*
    171	 * rpool cannot be null at this stage. Let kernel operate in case
    172	 * if there a bug in IB stack or rdma controller, instead of crashing
    173	 * the system.
    174	 */
    175	if (unlikely(!rpool)) {
    176		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
    177		return;
    178	}
    179
    180	rpool->resources[index].usage--;
    181
    182	/*
    183	 * A negative count (or overflow) is invalid,
    184	 * it indicates a bug in the rdma controller.
    185	 */
    186	WARN_ON_ONCE(rpool->resources[index].usage < 0);
    187	rpool->usage_sum--;
    188	if (rpool->usage_sum == 0 &&
    189	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
    190		/*
    191		 * No user of the rpool and all entries are set to max, so
    192		 * safe to delete this rpool.
    193		 */
    194		free_cg_rpool_locked(rpool);
    195	}
    196}
    197
    198/**
    199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
    200 * @device: pointer to rdmacg device
    201 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
    202 *           stop uncharging
    203 * @index: index of the resource to uncharge in cg in given resource pool
    204 */
    205static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
    206				     struct rdmacg_device *device,
    207				     struct rdma_cgroup *stop_cg,
    208				     enum rdmacg_resource_type index)
    209{
    210	struct rdma_cgroup *p;
    211
    212	mutex_lock(&rdmacg_mutex);
    213
    214	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
    215		uncharge_cg_locked(p, device, index);
    216
    217	mutex_unlock(&rdmacg_mutex);
    218
    219	css_put(&cg->css);
    220}
    221
    222/**
    223 * rdmacg_uncharge - hierarchically uncharge rdma resource count
    224 * @device: pointer to rdmacg device
    225 * @index: index of the resource to uncharge in cgroup in given resource pool
    226 */
    227void rdmacg_uncharge(struct rdma_cgroup *cg,
    228		     struct rdmacg_device *device,
    229		     enum rdmacg_resource_type index)
    230{
    231	if (index >= RDMACG_RESOURCE_MAX)
    232		return;
    233
    234	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
    235}
    236EXPORT_SYMBOL(rdmacg_uncharge);
    237
    238/**
    239 * rdmacg_try_charge - hierarchically try to charge the rdma resource
    240 * @rdmacg: pointer to rdma cgroup which will own this resource
    241 * @device: pointer to rdmacg device
    242 * @index: index of the resource to charge in cgroup (resource pool)
    243 *
    244 * This function follows charging resource in hierarchical way.
    245 * It will fail if the charge would cause the new value to exceed the
    246 * hierarchical limit.
    247 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
    248 * Returns pointer to rdmacg for this resource when charging is successful.
    249 *
    250 * Charger needs to account resources on two criteria.
    251 * (a) per cgroup & (b) per device resource usage.
    252 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
    253 * the configured limits. Per device provides granular configuration
    254 * in multi device usage. It allocates resource pool in the hierarchy
    255 * for each parent it come across for first resource. Later on resource
    256 * pool will be available. Therefore it will be much faster thereon
    257 * to charge/uncharge.
    258 */
    259int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
    260		      struct rdmacg_device *device,
    261		      enum rdmacg_resource_type index)
    262{
    263	struct rdma_cgroup *cg, *p;
    264	struct rdmacg_resource_pool *rpool;
    265	s64 new;
    266	int ret = 0;
    267
    268	if (index >= RDMACG_RESOURCE_MAX)
    269		return -EINVAL;
    270
    271	/*
    272	 * hold on to css, as cgroup can be removed but resource
    273	 * accounting happens on css.
    274	 */
    275	cg = get_current_rdmacg();
    276
    277	mutex_lock(&rdmacg_mutex);
    278	for (p = cg; p; p = parent_rdmacg(p)) {
    279		rpool = get_cg_rpool_locked(p, device);
    280		if (IS_ERR(rpool)) {
    281			ret = PTR_ERR(rpool);
    282			goto err;
    283		} else {
    284			new = rpool->resources[index].usage + 1;
    285			if (new > rpool->resources[index].max) {
    286				ret = -EAGAIN;
    287				goto err;
    288			} else {
    289				rpool->resources[index].usage = new;
    290				rpool->usage_sum++;
    291			}
    292		}
    293	}
    294	mutex_unlock(&rdmacg_mutex);
    295
    296	*rdmacg = cg;
    297	return 0;
    298
    299err:
    300	mutex_unlock(&rdmacg_mutex);
    301	rdmacg_uncharge_hierarchy(cg, device, p, index);
    302	return ret;
    303}
    304EXPORT_SYMBOL(rdmacg_try_charge);
    305
    306/**
    307 * rdmacg_register_device - register rdmacg device to rdma controller.
    308 * @device: pointer to rdmacg device whose resources need to be accounted.
    309 *
    310 * If IB stack wish a device to participate in rdma cgroup resource
    311 * tracking, it must invoke this API to register with rdma cgroup before
    312 * any user space application can start using the RDMA resources.
    313 */
    314void rdmacg_register_device(struct rdmacg_device *device)
    315{
    316	INIT_LIST_HEAD(&device->dev_node);
    317	INIT_LIST_HEAD(&device->rpools);
    318
    319	mutex_lock(&rdmacg_mutex);
    320	list_add_tail(&device->dev_node, &rdmacg_devices);
    321	mutex_unlock(&rdmacg_mutex);
    322}
    323EXPORT_SYMBOL(rdmacg_register_device);
    324
    325/**
    326 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
    327 * @device: pointer to rdmacg device which was previously registered with rdma
    328 *          controller using rdmacg_register_device().
    329 *
    330 * IB stack must invoke this after all the resources of the IB device
    331 * are destroyed and after ensuring that no more resources will be created
    332 * when this API is invoked.
    333 */
    334void rdmacg_unregister_device(struct rdmacg_device *device)
    335{
    336	struct rdmacg_resource_pool *rpool, *tmp;
    337
    338	/*
    339	 * Synchronize with any active resource settings,
    340	 * usage query happening via configfs.
    341	 */
    342	mutex_lock(&rdmacg_mutex);
    343	list_del_init(&device->dev_node);
    344
    345	/*
    346	 * Now that this device is off the cgroup list, its safe to free
    347	 * all the rpool resources.
    348	 */
    349	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
    350		free_cg_rpool_locked(rpool);
    351
    352	mutex_unlock(&rdmacg_mutex);
    353}
    354EXPORT_SYMBOL(rdmacg_unregister_device);
    355
    356static int parse_resource(char *c, int *intval)
    357{
    358	substring_t argstr;
    359	char *name, *value = c;
    360	size_t len;
    361	int ret, i;
    362
    363	name = strsep(&value, "=");
    364	if (!name || !value)
    365		return -EINVAL;
    366
    367	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
    368	if (i < 0)
    369		return i;
    370
    371	len = strlen(value);
    372
    373	argstr.from = value;
    374	argstr.to = value + len;
    375
    376	ret = match_int(&argstr, intval);
    377	if (ret >= 0) {
    378		if (*intval < 0)
    379			return -EINVAL;
    380		return i;
    381	}
    382	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
    383		*intval = S32_MAX;
    384		return i;
    385	}
    386	return -EINVAL;
    387}
    388
    389static int rdmacg_parse_limits(char *options,
    390			       int *new_limits, unsigned long *enables)
    391{
    392	char *c;
    393	int err = -EINVAL;
    394
    395	/* parse resource options */
    396	while ((c = strsep(&options, " ")) != NULL) {
    397		int index, intval;
    398
    399		index = parse_resource(c, &intval);
    400		if (index < 0)
    401			goto err;
    402
    403		new_limits[index] = intval;
    404		*enables |= BIT(index);
    405	}
    406	return 0;
    407
    408err:
    409	return err;
    410}
    411
    412static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
    413{
    414	struct rdmacg_device *device;
    415
    416	lockdep_assert_held(&rdmacg_mutex);
    417
    418	list_for_each_entry(device, &rdmacg_devices, dev_node)
    419		if (!strcmp(name, device->name))
    420			return device;
    421
    422	return NULL;
    423}
    424
    425static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
    426				       char *buf, size_t nbytes, loff_t off)
    427{
    428	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
    429	const char *dev_name;
    430	struct rdmacg_resource_pool *rpool;
    431	struct rdmacg_device *device;
    432	char *options = strstrip(buf);
    433	int *new_limits;
    434	unsigned long enables = 0;
    435	int i = 0, ret = 0;
    436
    437	/* extract the device name first */
    438	dev_name = strsep(&options, " ");
    439	if (!dev_name) {
    440		ret = -EINVAL;
    441		goto err;
    442	}
    443
    444	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
    445	if (!new_limits) {
    446		ret = -ENOMEM;
    447		goto err;
    448	}
    449
    450	ret = rdmacg_parse_limits(options, new_limits, &enables);
    451	if (ret)
    452		goto parse_err;
    453
    454	/* acquire lock to synchronize with hot plug devices */
    455	mutex_lock(&rdmacg_mutex);
    456
    457	device = rdmacg_get_device_locked(dev_name);
    458	if (!device) {
    459		ret = -ENODEV;
    460		goto dev_err;
    461	}
    462
    463	rpool = get_cg_rpool_locked(cg, device);
    464	if (IS_ERR(rpool)) {
    465		ret = PTR_ERR(rpool);
    466		goto dev_err;
    467	}
    468
    469	/* now set the new limits of the rpool */
    470	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
    471		set_resource_limit(rpool, i, new_limits[i]);
    472
    473	if (rpool->usage_sum == 0 &&
    474	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
    475		/*
    476		 * No user of the rpool and all entries are set to max, so
    477		 * safe to delete this rpool.
    478		 */
    479		free_cg_rpool_locked(rpool);
    480	}
    481
    482dev_err:
    483	mutex_unlock(&rdmacg_mutex);
    484
    485parse_err:
    486	kfree(new_limits);
    487
    488err:
    489	return ret ?: nbytes;
    490}
    491
    492static void print_rpool_values(struct seq_file *sf,
    493			       struct rdmacg_resource_pool *rpool)
    494{
    495	enum rdmacg_file_type sf_type;
    496	int i;
    497	u32 value;
    498
    499	sf_type = seq_cft(sf)->private;
    500
    501	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
    502		seq_puts(sf, rdmacg_resource_names[i]);
    503		seq_putc(sf, '=');
    504		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
    505			if (rpool)
    506				value = rpool->resources[i].max;
    507			else
    508				value = S32_MAX;
    509		} else {
    510			if (rpool)
    511				value = rpool->resources[i].usage;
    512			else
    513				value = 0;
    514		}
    515
    516		if (value == S32_MAX)
    517			seq_puts(sf, RDMACG_MAX_STR);
    518		else
    519			seq_printf(sf, "%d", value);
    520		seq_putc(sf, ' ');
    521	}
    522}
    523
    524static int rdmacg_resource_read(struct seq_file *sf, void *v)
    525{
    526	struct rdmacg_device *device;
    527	struct rdmacg_resource_pool *rpool;
    528	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
    529
    530	mutex_lock(&rdmacg_mutex);
    531
    532	list_for_each_entry(device, &rdmacg_devices, dev_node) {
    533		seq_printf(sf, "%s ", device->name);
    534
    535		rpool = find_cg_rpool_locked(cg, device);
    536		print_rpool_values(sf, rpool);
    537
    538		seq_putc(sf, '\n');
    539	}
    540
    541	mutex_unlock(&rdmacg_mutex);
    542	return 0;
    543}
    544
/*
 * Control files of the rdma controller. Both entries share one seq_show
 * handler; .private selects whether limits or usage are printed. Both
 * are flagged CFTYPE_NOT_ON_ROOT.
 */
static struct cftype rdmacg_files[] = {
	{
		/* readable and writable: per device resource limits */
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		/* no write handler: per device resource usage */
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};
    561
    562static struct cgroup_subsys_state *
    563rdmacg_css_alloc(struct cgroup_subsys_state *parent)
    564{
    565	struct rdma_cgroup *cg;
    566
    567	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
    568	if (!cg)
    569		return ERR_PTR(-ENOMEM);
    570
    571	INIT_LIST_HEAD(&cg->rpools);
    572	return &cg->css;
    573}
    574
    575static void rdmacg_css_free(struct cgroup_subsys_state *css)
    576{
    577	struct rdma_cgroup *cg = css_rdmacg(css);
    578
    579	kfree(cg);
    580}
    581
    582/**
    583 * rdmacg_css_offline - cgroup css_offline callback
    584 * @css: css of interest
    585 *
    586 * This function is called when @css is about to go away and responsible
    587 * for shooting down all rdmacg associated with @css. As part of that it
    588 * marks all the resource pool entries to max value, so that when resources are
    589 * uncharged, associated resource pool can be freed as well.
    590 */
    591static void rdmacg_css_offline(struct cgroup_subsys_state *css)
    592{
    593	struct rdma_cgroup *cg = css_rdmacg(css);
    594	struct rdmacg_resource_pool *rpool;
    595
    596	mutex_lock(&rdmacg_mutex);
    597
    598	list_for_each_entry(rpool, &cg->rpools, cg_node)
    599		set_all_resource_max_limit(rpool);
    600
    601	mutex_unlock(&rdmacg_mutex);
    602}
    603
/*
 * rdma controller descriptor: wires up the css lifecycle callbacks and
 * registers the same file table for both the legacy and the default
 * cftype sets.
 */
struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};