brd.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
brd.c (12407B)
      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Ram backed block device driver.
      4 *
      5 * Copyright (C) 2007 Nick Piggin
      6 * Copyright (C) 2007 Novell Inc.
      7 *
      8 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
      9 * of their respective owners.
     10 */
     11
     12#include <linux/init.h>
     13#include <linux/initrd.h>
     14#include <linux/module.h>
     15#include <linux/moduleparam.h>
     16#include <linux/major.h>
     17#include <linux/blkdev.h>
     18#include <linux/bio.h>
     19#include <linux/highmem.h>
     20#include <linux/mutex.h>
     21#include <linux/pagemap.h>
     22#include <linux/radix-tree.h>
     23#include <linux/fs.h>
     24#include <linux/slab.h>
     25#include <linux/backing-dev.h>
     26#include <linux/debugfs.h>
     27
     28#include <linux/uaccess.h>
     29
/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
	/* Device index N; brd_alloc() names the disk "ramN" from it. */
	int			brd_number;
	/* Disk allocated by brd_alloc() and released by brd_cleanup(). */
	struct gendisk		*brd_disk;
	/* Link in the file-scope brd_devices list. */
	struct list_head	brd_list;

	/*
	 * Backing store of pages and lock to protect it. This is the contents
	 * of the block device.
	 */
	/* Held by brd_insert_page() around the radix-tree insertion. */
	spinlock_t		brd_lock;
	/* Maps a page index (sector >> PAGE_SECTORS_SHIFT) to its page. */
	struct radix_tree_root	brd_pages;
	/*
	 * Count of pages successfully inserted; exposed read-only through
	 * debugfs. It is only ever incremented in this file.
	 */
	u64			brd_nr_pages;
};
     50
     51/*
     52 * Look up and return a brd's page for a given sector.
     53 */
     54static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
     55{
     56	pgoff_t idx;
     57	struct page *page;
     58
     59	/*
     60	 * The page lifetime is protected by the fact that we have opened the
     61	 * device node -- brd pages will never be deleted under us, so we
     62	 * don't need any further locking or refcounting.
     63	 *
     64	 * This is strictly true for the radix-tree nodes as well (ie. we
     65	 * don't actually need the rcu_read_lock()), however that is not a
     66	 * documented feature of the radix-tree API so it is better to be
     67	 * safe here (we don't have total exclusion from radix tree updates
     68	 * here, only deletes).
     69	 */
     70	rcu_read_lock();
     71	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
     72	page = radix_tree_lookup(&brd->brd_pages, idx);
     73	rcu_read_unlock();
     74
     75	BUG_ON(page && page->index != idx);
     76
     77	return page;
     78}
     79
     80/*
     81 * Look up and return a brd's page for a given sector.
     82 * If one does not exist, allocate an empty page, and insert that. Then
     83 * return it.
     84 */
     85static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
     86{
     87	pgoff_t idx;
     88	struct page *page;
     89	gfp_t gfp_flags;
     90
     91	page = brd_lookup_page(brd, sector);
     92	if (page)
     93		return page;
     94
     95	/*
     96	 * Must use NOIO because we don't want to recurse back into the
     97	 * block or filesystem layers from page reclaim.
     98	 */
     99	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
    100	page = alloc_page(gfp_flags);
    101	if (!page)
    102		return NULL;
    103
    104	if (radix_tree_preload(GFP_NOIO)) {
    105		__free_page(page);
    106		return NULL;
    107	}
    108
    109	spin_lock(&brd->brd_lock);
    110	idx = sector >> PAGE_SECTORS_SHIFT;
    111	page->index = idx;
    112	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
    113		__free_page(page);
    114		page = radix_tree_lookup(&brd->brd_pages, idx);
    115		BUG_ON(!page);
    116		BUG_ON(page->index != idx);
    117	} else {
    118		brd->brd_nr_pages++;
    119	}
    120	spin_unlock(&brd->brd_lock);
    121
    122	radix_tree_preload_end();
    123
    124	return page;
    125}
    126
    127/*
    128 * Free all backing store pages and radix tree. This must only be called when
    129 * there are no other users of the device.
    130 */
    131#define FREE_BATCH 16
    132static void brd_free_pages(struct brd_device *brd)
    133{
    134	unsigned long pos = 0;
    135	struct page *pages[FREE_BATCH];
    136	int nr_pages;
    137
    138	do {
    139		int i;
    140
    141		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
    142				(void **)pages, pos, FREE_BATCH);
    143
    144		for (i = 0; i < nr_pages; i++) {
    145			void *ret;
    146
    147			BUG_ON(pages[i]->index < pos);
    148			pos = pages[i]->index;
    149			ret = radix_tree_delete(&brd->brd_pages, pos);
    150			BUG_ON(!ret || ret != pages[i]);
    151			__free_page(pages[i]);
    152		}
    153
    154		pos++;
    155
    156		/*
    157		 * It takes 3.4 seconds to remove 80GiB ramdisk.
    158		 * So, we need cond_resched to avoid stalling the CPU.
    159		 */
    160		cond_resched();
    161
    162		/*
    163		 * This assumes radix_tree_gang_lookup always returns as
    164		 * many pages as possible. If the radix-tree code changes,
    165		 * so will this have to.
    166		 */
    167	} while (nr_pages == FREE_BATCH);
    168}
    169
    170/*
    171 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
    172 */
    173static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
    174{
    175	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
    176	size_t copy;
    177
    178	copy = min_t(size_t, n, PAGE_SIZE - offset);
    179	if (!brd_insert_page(brd, sector))
    180		return -ENOSPC;
    181	if (copy < n) {
    182		sector += copy >> SECTOR_SHIFT;
    183		if (!brd_insert_page(brd, sector))
    184			return -ENOSPC;
    185	}
    186	return 0;
    187}
    188
    189/*
    190 * Copy n bytes from src to the brd starting at sector. Does not sleep.
    191 */
    192static void copy_to_brd(struct brd_device *brd, const void *src,
    193			sector_t sector, size_t n)
    194{
    195	struct page *page;
    196	void *dst;
    197	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
    198	size_t copy;
    199
    200	copy = min_t(size_t, n, PAGE_SIZE - offset);
    201	page = brd_lookup_page(brd, sector);
    202	BUG_ON(!page);
    203
    204	dst = kmap_atomic(page);
    205	memcpy(dst + offset, src, copy);
    206	kunmap_atomic(dst);
    207
    208	if (copy < n) {
    209		src += copy;
    210		sector += copy >> SECTOR_SHIFT;
    211		copy = n - copy;
    212		page = brd_lookup_page(brd, sector);
    213		BUG_ON(!page);
    214
    215		dst = kmap_atomic(page);
    216		memcpy(dst, src, copy);
    217		kunmap_atomic(dst);
    218	}
    219}
    220
    221/*
    222 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
    223 */
    224static void copy_from_brd(void *dst, struct brd_device *brd,
    225			sector_t sector, size_t n)
    226{
    227	struct page *page;
    228	void *src;
    229	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
    230	size_t copy;
    231
    232	copy = min_t(size_t, n, PAGE_SIZE - offset);
    233	page = brd_lookup_page(brd, sector);
    234	if (page) {
    235		src = kmap_atomic(page);
    236		memcpy(dst, src + offset, copy);
    237		kunmap_atomic(src);
    238	} else
    239		memset(dst, 0, copy);
    240
    241	if (copy < n) {
    242		dst += copy;
    243		sector += copy >> SECTOR_SHIFT;
    244		copy = n - copy;
    245		page = brd_lookup_page(brd, sector);
    246		if (page) {
    247			src = kmap_atomic(page);
    248			memcpy(dst, src, copy);
    249			kunmap_atomic(src);
    250		} else
    251			memset(dst, 0, copy);
    252	}
    253}
    254
    255/*
    256 * Process a single bvec of a bio.
    257 */
    258static int brd_do_bvec(struct brd_device *brd, struct page *page,
    259			unsigned int len, unsigned int off, unsigned int op,
    260			sector_t sector)
    261{
    262	void *mem;
    263	int err = 0;
    264
    265	if (op_is_write(op)) {
    266		err = copy_to_brd_setup(brd, sector, len);
    267		if (err)
    268			goto out;
    269	}
    270
    271	mem = kmap_atomic(page);
    272	if (!op_is_write(op)) {
    273		copy_from_brd(mem + off, brd, sector, len);
    274		flush_dcache_page(page);
    275	} else {
    276		flush_dcache_page(page);
    277		copy_to_brd(brd, mem + off, sector, len);
    278	}
    279	kunmap_atomic(mem);
    280
    281out:
    282	return err;
    283}
    284
    285static void brd_submit_bio(struct bio *bio)
    286{
    287	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
    288	sector_t sector = bio->bi_iter.bi_sector;
    289	struct bio_vec bvec;
    290	struct bvec_iter iter;
    291
    292	bio_for_each_segment(bvec, bio, iter) {
    293		unsigned int len = bvec.bv_len;
    294		int err;
    295
    296		/* Don't support un-aligned buffer */
    297		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
    298				(len & (SECTOR_SIZE - 1)));
    299
    300		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
    301				  bio_op(bio), sector);
    302		if (err) {
    303			bio_io_error(bio);
    304			return;
    305		}
    306		sector += len >> SECTOR_SHIFT;
    307	}
    308
    309	bio_endio(bio);
    310}
    311
    312static int brd_rw_page(struct block_device *bdev, sector_t sector,
    313		       struct page *page, unsigned int op)
    314{
    315	struct brd_device *brd = bdev->bd_disk->private_data;
    316	int err;
    317
    318	if (PageTransHuge(page))
    319		return -ENOTSUPP;
    320	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
    321	page_endio(page, op_is_write(op), err);
    322	return err;
    323}
    324
/*
 * Block device operations for every brd disk; installed as disk->fops in
 * brd_alloc().
 */
static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.submit_bio =		brd_submit_bio,	/* bio-based I/O path */
	.rw_page =		brd_rw_page,	/* single-page I/O path */
};
    330
/*
 * And now the modules code and kernel interface.
 */
/*
 * Number of ramdisks brd_init() creates up front. Further devices can
 * still be instantiated on demand through brd_probe().
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

/*
 * Size of each ramdisk in kbytes; brd_alloc() passes rd_size * 2 to
 * set_capacity(). Not static -- presumably referenced from outside this
 * file; TODO confirm.
 */
unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

/*
 * Minors reserved per device: device i gets first_minor = i * max_part.
 * Sanitised by brd_check_and_reset_par() before use.
 */
static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
    349
    350#ifndef MODULE
    351/* Legacy boot options - nonmodular */
    352static int __init ramdisk_size(char *str)
    353{
    354	rd_size = simple_strtol(str, NULL, 0);
    355	return 1;
    356}
    357__setup("ramdisk_size=", ramdisk_size);
    358#endif
    359
/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */
/* All brd devices allocated so far, linked through brd_device.brd_list. */
static LIST_HEAD(brd_devices);
/*
 * The "ramdisk_pages" debugfs directory created in brd_init(). Users check
 * it with IS_ERR_OR_NULL() before creating files in it.
 */
static struct dentry *brd_debugfs_dir;
    366
    367static int brd_alloc(int i)
    368{
    369	struct brd_device *brd;
    370	struct gendisk *disk;
    371	char buf[DISK_NAME_LEN];
    372	int err = -ENOMEM;
    373
    374	list_for_each_entry(brd, &brd_devices, brd_list)
    375		if (brd->brd_number == i)
    376			return -EEXIST;
    377	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
    378	if (!brd)
    379		return -ENOMEM;
    380	brd->brd_number		= i;
    381	list_add_tail(&brd->brd_list, &brd_devices);
    382
    383	spin_lock_init(&brd->brd_lock);
    384	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
    385
    386	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
    387	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
    388		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
    389				&brd->brd_nr_pages);
    390
    391	disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
    392	if (!disk)
    393		goto out_free_dev;
    394
    395	disk->major		= RAMDISK_MAJOR;
    396	disk->first_minor	= i * max_part;
    397	disk->minors		= max_part;
    398	disk->fops		= &brd_fops;
    399	disk->private_data	= brd;
    400	strlcpy(disk->disk_name, buf, DISK_NAME_LEN);
    401	set_capacity(disk, rd_size * 2);
    402	
    403	/*
    404	 * This is so fdisk will align partitions on 4k, because of
    405	 * direct_access API needing 4k alignment, returning a PFN
    406	 * (This is only a problem on very small devices <= 4M,
    407	 *  otherwise fdisk will align on 1M. Regardless this call
    408	 *  is harmless)
    409	 */
    410	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
    411
    412	/* Tell the block layer that this is not a rotational device */
    413	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
    414	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
    415	err = add_disk(disk);
    416	if (err)
    417		goto out_cleanup_disk;
    418
    419	return 0;
    420
    421out_cleanup_disk:
    422	blk_cleanup_disk(disk);
    423out_free_dev:
    424	list_del(&brd->brd_list);
    425	kfree(brd);
    426	return err;
    427}
    428
    429static void brd_probe(dev_t dev)
    430{
    431	brd_alloc(MINOR(dev) / max_part);
    432}
    433
    434static void brd_cleanup(void)
    435{
    436	struct brd_device *brd, *next;
    437
    438	debugfs_remove_recursive(brd_debugfs_dir);
    439
    440	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
    441		del_gendisk(brd->brd_disk);
    442		blk_cleanup_disk(brd->brd_disk);
    443		brd_free_pages(brd);
    444		list_del(&brd->brd_list);
    445		kfree(brd);
    446	}
    447}
    448
    449static inline void brd_check_and_reset_par(void)
    450{
    451	if (unlikely(!max_part))
    452		max_part = 1;
    453
    454	/*
    455	 * make sure 'max_part' can be divided exactly by (1U << MINORBITS),
    456	 * otherwise, it is possiable to get same dev_t when adding partitions.
    457	 */
    458	if ((1U << MINORBITS) % max_part != 0)
    459		max_part = 1UL << fls(max_part);
    460
    461	if (max_part > DISK_MAX_PARTS) {
    462		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
    463			DISK_MAX_PARTS, DISK_MAX_PARTS);
    464		max_part = DISK_MAX_PARTS;
    465	}
    466}
    467
    468static int __init brd_init(void)
    469{
    470	int err, i;
    471
    472	brd_check_and_reset_par();
    473
    474	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
    475
    476	for (i = 0; i < rd_nr; i++) {
    477		err = brd_alloc(i);
    478		if (err)
    479			goto out_free;
    480	}
    481
    482	/*
    483	 * brd module now has a feature to instantiate underlying device
    484	 * structure on-demand, provided that there is an access dev node.
    485	 *
    486	 * (1) if rd_nr is specified, create that many upfront. else
    487	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
    488	 * (2) User can further extend brd devices by create dev node themselves
    489	 *     and have kernel automatically instantiate actual device
    490	 *     on-demand. Example:
    491	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
    492	 *		fdisk -l /path/devnod_name
    493	 *	If (X / max_part) was not already created it will be created
    494	 *	dynamically.
    495	 */
    496
    497	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
    498		err = -EIO;
    499		goto out_free;
    500	}
    501
    502	pr_info("brd: module loaded\n");
    503	return 0;
    504
    505out_free:
    506	brd_cleanup();
    507
    508	pr_info("brd: module NOT loaded !!!\n");
    509	return err;
    510}
    511
/*
 * Module exit: unregister the ramdisk major, then free every device.
 */
static void __exit brd_exit(void)
{

	/* Drop the major (and with it the brd_probe hook) before teardown. */
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
	brd_cleanup();

	pr_info("brd: module unloaded\n");
}
    520
/* Register brd_init()/brd_exit() as the module's entry and exit points. */
module_init(brd_init);
module_exit(brd_exit);
    523