From 2dd209f00fc5a1caafa493066c7cd692fd2fd57c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 9 Mar 2020 21:26:16 -0700 Subject: blk-mq: Fix a comment in include/linux/blk-mq.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'hctx_list' member of struct blk_mq_hw_ctx is not a list head but instead an entry in q->unused_hctx_list. Fix the comment above this struct member. Fixes: d386732bc142 ("blk-mq: fill header with kernel-doc") Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Cc: AndrĂ© Almeida Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 11cfd6470b1a..31344d5f83e2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -162,7 +162,10 @@ struct blk_mq_hw_ctx { struct dentry *sched_debugfs_dir; #endif - /** @hctx_list: List of all hardware queues. */ + /** + * @hctx_list: if this hctx is not in use, this is an entry in + * q->unused_hctx_list. + */ struct list_head hctx_list; /** -- cgit v1.2.3-71-gd317 From 30a2da7b7e225ef6c87a660419ea04d3cef3f6a7 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 11 Mar 2020 16:07:50 +0530 Subject: block: Fix use-after-free issue accessing struct io_cq There is a potential race between ioc_release_fn() and ioc_clear_queue() as shown below, due to which below kernel crash is observed. It also can result into use-after-free issue. context#1: context#2: ioc_release_fn() __ioc_clear_queue() gets the same icq ->spin_lock(&ioc->lock); ->spin_lock(&ioc->lock); ->ioc_destroy_icq(icq); ->list_del_init(&icq->q_node); ->call_rcu(&icq->__rcu_head, icq_free_icq_rcu); ->spin_unlock(&ioc->lock); ->ioc_destroy_icq(icq); ->hlist_del_init(&icq->ioc_node); This results into below crash as this memory is now used by icq->__rcu_head in context#1. There is a chance that icq could be free'd as well. 22150.386550: <6> Unable to handle kernel write to read-only memory at virtual address ffffffaa8d31ca50 ... Call trace: 22150.607350: <2> ioc_destroy_icq+0x44/0x110 22150.611202: <2> ioc_clear_queue+0xac/0x148 22150.615056: <2> blk_cleanup_queue+0x11c/0x1a0 22150.619174: <2> __scsi_remove_device+0xdc/0x128 22150.623465: <2> scsi_forget_host+0x2c/0x78 22150.627315: <2> scsi_remove_host+0x7c/0x2a0 22150.631257: <2> usb_stor_disconnect+0x74/0xc8 22150.635371: <2> usb_unbind_interface+0xc8/0x278 22150.639665: <2> device_release_driver_internal+0x198/0x250 22150.644897: <2> device_release_driver+0x24/0x30 22150.649176: <2> bus_remove_device+0xec/0x140 22150.653204: <2> device_del+0x270/0x460 22150.656712: <2> usb_disable_device+0x120/0x390 22150.660918: <2> usb_disconnect+0xf4/0x2e0 22150.664684: <2> hub_event+0xd70/0x17e8 22150.668197: <2> process_one_work+0x210/0x480 22150.672222: <2> worker_thread+0x32c/0x4c8 Fix this by adding a new ICQ_DESTROYED flag in ioc_destroy_icq() to indicate this icq is once marked as destroyed. Also, ensure __ioc_clear_queue() is accessing icq within rcu_read_lock/unlock so that icq doesn't get free'd up while it is still using it. Signed-off-by: Sahitya Tummala Co-developed-by: Pradeep P V K Signed-off-by: Pradeep P V K Signed-off-by: Jens Axboe --- block/blk-ioc.c | 7 +++++++ include/linux/iocontext.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5ed59ac6ae58..9df50fb507ca 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq) * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; + icq->flags |= ICQ_DESTROYED; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } @@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; + rcu_read_lock(); while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); + if (icq->flags & ICQ_DESTROYED) { + spin_unlock_irqrestore(&ioc->lock, flags); + continue; + } ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } + rcu_read_unlock(); } /** diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index dba15ca8e60b..1dcd9198beb7 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -8,6 +8,7 @@ enum { ICQ_EXITED = 1 << 2, + ICQ_DESTROYED = 1 << 3, }; /* -- cgit v1.2.3-71-gd317 From 9243c6f3e012a92dd900d97ef45efaf8a8edc448 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Sat, 7 Mar 2020 15:56:59 +0100 Subject: block: Document genhd capability flags The kernel documentation includes a brief section about genhd capabilities, but it turns out that the only documented capability (GENHD_FL_MEDIA_CHANGE_NOTIFY) isn't used any more. This patch removes that flag, and documents the rest, based on my understanding of the current uses of these flags in the kernel. The documentation is kept in the header file, alongside the declarations, in the hope that it will be kept up-to-date in future; the kernel documentation is changed to include the documentation generated from the header file. Because the ultimate goal is to provide some end-user documentation (or end-administrator documentation), the comments are perhaps more user-oriented than might be expected. Since the values are shown to users in hexadecimal, the documentation lists them in hexadecimal, and the constant declarations are adjusted to match. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Stephen Kitt Signed-off-by: Jens Axboe --- Documentation/block/capability.rst | 16 +++------ include/linux/genhd.h | 69 ++++++++++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst index 2cf258d64bbe..160a5148b915 100644 --- a/Documentation/block/capability.rst +++ b/Documentation/block/capability.rst @@ -2,17 +2,9 @@ Generic Block Device Capability =============================== -This file documents the sysfs file block//capability +This file documents the sysfs file ``block//capability``. -capability is a hex word indicating which capabilities a specific disk -supports. For more information on bits not listed here, see -include/linux/genhd.h +``capability`` is a bitfield, printed in hexadecimal, indicating which +capabilities a specific block device supports: -GENHD_FL_MEDIA_CHANGE_NOTIFY ----------------------------- - -Value: 4 - -When this bit is set, the disk supports Asynchronous Notification -of media change events. These events will be broadcast to user -space via kernel uevent. +.. kernel-doc:: include/linux/genhd.h diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fbe58538ad6..e7244fb6b6ad 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -133,17 +133,64 @@ struct hd_struct { struct rcu_work rcu_work; }; -#define GENHD_FL_REMOVABLE 1 -/* 2 is unused */ -#define GENHD_FL_MEDIA_CHANGE_NOTIFY 4 -#define GENHD_FL_CD 8 -#define GENHD_FL_UP 16 -#define GENHD_FL_SUPPRESS_PARTITION_INFO 32 -#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ -#define GENHD_FL_NATIVE_CAPACITY 128 -#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 -#define GENHD_FL_NO_PART_SCAN 512 -#define GENHD_FL_HIDDEN 1024 +/** + * DOC: genhd capability flags + * + * ``GENHD_FL_REMOVABLE`` (0x0001): indicates that the block device + * gives access to removable media. + * When set, the device remains present even when media is not + * inserted. + * Must not be set for devices which are removed entirely when the + * media is removed. + * + * ``GENHD_FL_CD`` (0x0008): the block device is a CD-ROM-style + * device. + * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. + * + * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up", + * with a similar meaning to network interfaces. + * + * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include + * partition information in ``/proc/partitions`` or in the output of + * printk_all_partitions(). + * Used for the null block device and some MMC devices. + * + * ``GENHD_FL_EXT_DEVT`` (0x0040): the driver supports extended + * dynamic ``dev_t``, i.e. it wants extended device numbers + * (``BLOCK_EXT_MAJOR``). + * This affects the maximum number of partitions. + * + * ``GENHD_FL_NATIVE_CAPACITY`` (0x0080): based on information in the + * partition table, the device's capacity has been extended to its + * native capacity; i.e. the device has hidden capacity used by one + * of the partitions (this is a flag used so that native capacity is + * only ever unlocked once). + * + * ``GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE`` (0x0100): event polling is + * blocked whenever a writer holds an exclusive lock. + * + * ``GENHD_FL_NO_PART_SCAN`` (0x0200): partition scanning is disabled. + * Used for loop devices in their default settings and some MMC + * devices. + * + * ``GENHD_FL_HIDDEN`` (0x0400): the block device is hidden; it + * doesn't produce events, doesn't appear in sysfs, and doesn't have + * an associated ``bdev``. + * Implies ``GENHD_FL_SUPPRESS_PARTITION_INFO`` and + * ``GENHD_FL_NO_PART_SCAN``. + * Used for multipath devices. + */ +#define GENHD_FL_REMOVABLE 0x0001 +/* 2 is unused (used to be GENHD_FL_DRIVERFS) */ +/* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ +#define GENHD_FL_CD 0x0008 +#define GENHD_FL_UP 0x0010 +#define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 +#define GENHD_FL_EXT_DEVT 0x0040 +#define GENHD_FL_NATIVE_CAPACITY 0x0080 +#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 0x0100 +#define GENHD_FL_NO_PART_SCAN 0x0200 +#define GENHD_FL_HIDDEN 0x0400 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3-71-gd317 From e598a72faeb543599bdf0d930df3a71906404e6f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 13 Mar 2020 05:30:05 +0000 Subject: block/genhd: Notify udev about capacity change Allow block/genhd to notify user space (via udev) about disk size changes using a new helper set_capacity_revalidate_and_notify(), which is a wrapper on top of set_capacity(). set_capacity_revalidate_and_notify() will only notify via udev if the current capacity or the target capacity is not zero and iff the capacity changes. Suggested-by: Christoph Hellwig Signed-off-by: Someswarudu Sangaraju Signed-off-by: Balbir Singh Reviewed-by: Bob Liu Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 24 ++++++++++++++++++++++++ include/linux/genhd.h | 2 ++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index ff6268970ddc..6a60131baffa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -46,6 +46,30 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool revalidate) +{ + sector_t capacity = get_capacity(disk); + + set_capacity(disk, size); + + if (revalidate) + revalidate_disk(disk); + + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; + + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + } +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e7244fb6b6ad..3c1a816785c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -508,6 +508,8 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); +extern void set_capacity_revalidate_and_notify(struct gendisk *disk, + sector_t size, bool revalidate); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -- cgit v1.2.3-71-gd317 From ea3edd4dc23027083fbb4a73b65114d08fe73a76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:11 +0100 Subject: block: remove __bdevname There is no good reason for __bdevname to exist. Just open code printing the string in the callers. For three of them the format string can be trivially merged into existing printk statements, and in init/do_mounts.c we can at least do the scnprintf once at the start of the function, and unconditional of CONFIG_BLOCK to make the output for tiny configfs a little more helpful. Acked-by: Theodore Ts'o # for ext4 Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 14 -------------- drivers/md/md.c | 4 ++-- fs/ext4/super.c | 6 +++--- fs/reiserfs/journal.c | 5 ++--- include/linux/fs.h | 1 - init/do_mounts.c | 12 ++---------- 6 files changed, 9 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index 564fae77711d..98256e6beabb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -57,20 +57,6 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..8238fb64f87f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2491,12 +2491,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - pr_warn("md: could not open %s.\n", __bdevname(dev, b)); + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); return PTR_ERR(bdev); } rdev->bdev = bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c7c4adb664e..4cbd1cc34dfa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -927,7 +927,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) @@ -935,8 +934,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); return NULL; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 072156c4f895..5c766330e493 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super, int result; dev_t jdev; fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; - char b[BDEVNAME_SIZE]; result = 0; @@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super, result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; reiserfs_warning(super, "sh-458", - "cannot init journal device '%s': %i", - __bdevname(jdev, b), result); + "cannot init journal device unknown-block(%u,%u): %i", + MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3cd4fe6b845e..561b35e3b95b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2699,7 +2699,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 -extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern void blkdev_show(struct seq_file *,off_t); diff --git a/init/do_mounts.c b/init/do_mounts.c index 0ae9cc22f2ae..29d326b6c29d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -429,12 +429,10 @@ void __init mount_block_root(char *name, int flags) struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; -#ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; -#else - const char *b = name; -#endif + scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); get_fs_names(fs_names); retry: for (p = fs_names; *p; p += strlen(p)+1) { @@ -451,9 +449,6 @@ retry: * and bad superblock on root device. * and give them a list of the available devices */ -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", root_device_name, b, err); printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); @@ -476,9 +471,6 @@ retry: for (p = fs_names; *p; p += strlen(p)+1) printk(" %s", p); printk("\n"); -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif panic("VFS: Unable to mount root fs on %s", b); out: put_page(page); -- cgit v1.2.3-71-gd317 From 3ad5cee5cd000dc05e6c2410b06fc1d818e7b1e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:13 +0100 Subject: block: move sysfs methods shared by disks and partitions to genhd.c Move the sysfs _show methods that are used both on the full disk and partition nodes to genhd.c instead of hiding them in the partitioning code. Also move the declaration for these methods to block/blk.h so that we don't expose them to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 11 +++++++ block/genhd.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++- block/partition-generic.c | 76 +------------------------------------------- include/linux/genhd.h | 14 --------- 4 files changed, 91 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index 670337b7cfa0..43df9dcb3d4e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -214,6 +214,17 @@ static inline void elevator_exit(struct request_queue *q, struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +ssize_t part_size_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/genhd.c b/block/genhd.c index 2484348d1850..f7d60b620b97 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -1199,6 +1200,60 @@ static ssize_t disk_ro_show(struct device *dev, return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight; + + inflight = part_in_flight(q, p); + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + part_stat_read(p, ios[STAT_READ]), + part_stat_read(p, merges[STAT_READ]), + (unsigned long long)part_stat_read(p, sectors[STAT_READ]), + (unsigned int)part_stat_read_msecs(p, STAT_READ), + part_stat_read(p, ios[STAT_WRITE]), + part_stat_read(p, merges[STAT_WRITE]), + (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + inflight, + jiffies_to_msecs(part_stat_read(p, io_ticks)), + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + part_stat_read(p, ios[STAT_DISCARD]), + part_stat_read(p, merges[STAT_DISCARD]), + (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), + part_stat_read(p, ios[STAT_FLUSH]), + (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1237,10 +1292,33 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + #ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); -#endif +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); diff --git a/block/partition-generic.c b/block/partition-generic.c index 6bf5aec2a0dc..e6fd2226a639 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,6 +18,7 @@ #include #include #include +#include "blk.h" #include "partitions/check.h" @@ -41,13 +42,6 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); -} - static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -69,74 +63,6 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", p->discard_alignment); } -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight; - - inflight = part_in_flight(q, p); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u " - "%8lu %8lu %8llu %8u " - "%8lu %8u" - "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); -} - -ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - - part_in_flight_rw(q, p, inflight); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; - - return count; -} -#endif - static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3c1a816785c8..612d38fce55c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -706,20 +706,6 @@ extern void blk_register_region(dev_t devt, unsigned long range, void *data); extern void blk_unregister_region(dev_t devt, unsigned long range); -extern ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf); -#ifdef CONFIG_FAIL_MAKE_REQUEST -extern ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf); -extern ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#endif /* CONFIG_FAIL_MAKE_REQUEST */ - #define alloc_disk_node(minors, node_id) \ ({ \ static struct lock_class_key __key; \ -- cgit v1.2.3-71-gd317 From f17c21c1ecb80e957bafa07d6454836854be7cf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:14 +0100 Subject: block: remove alloc_part_info and free_part_info There isn't any good reason not to simply open code the allocation and freeing of the partition_meta_info structure. Especially as one of the branches in alloc_part_info is entirely dead code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 6 ++++-- include/linux/genhd.h | 15 +-------------- 2 files changed, 5 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index e6fd2226a639..f2004f3bd6f7 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -249,7 +249,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); if (!pinfo) { err = -ENOMEM; goto out_free_stats; @@ -308,7 +310,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, return p; out_free_info: - free_part_info(p); + kfree(p->info); out_free_stats: free_part_stats(p); out_free: diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 612d38fce55c..77f8bb8edfcd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -465,19 +465,6 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw); -static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -{ - if (disk) - return kzalloc_node(sizeof(struct partition_meta_info), - GFP_KERNEL, disk->node_id); - return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL); -} - -static inline void free_part_info(struct hd_struct *part) -{ - kfree(part->info); -} - void update_io_ticks(struct hd_struct *part, unsigned long now); /* block/genhd.c */ @@ -755,7 +742,7 @@ static inline void hd_struct_kill(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { free_part_stats(part); - free_part_info(part); + kfree(part->info); percpu_ref_exit(&part->ref); } -- cgit v1.2.3-71-gd317 From 1a9fba3a77a5b39d1c9e1611758303f2649474e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:18 +0100 Subject: block: unexport read_dev_sector and put_dev_sector read_dev_sector and put_dev_sector are now only used by the partition parsing code. Remove the export for read_dev_sector and merge it into the only caller. Clean the mess up a bit by using goto labels and the SECTOR_SHIFT constant. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 31 +++++++++++++++++++------------ block/partitions/check.h | 14 +++++++------- include/linux/blkdev.h | 9 --------- 3 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index f2004f3bd6f7..fef6bacb2bbb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -484,22 +484,29 @@ out_free_state: return ret; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = state->bdev->bd_inode->i_mapping; struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9); -fail: - put_page(page); + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: p->v = NULL; return NULL; } - -EXPORT_SYMBOL(read_dev_sector); diff --git a/block/partitions/check.h b/block/partitions/check.h index 6042f769471a..0fcf80117887 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -28,14 +28,14 @@ void free_partitions(struct parsed_partitions *state); struct parsed_partitions * check_partition(struct gendisk *, struct block_device *); -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) +typedef struct { + struct page *v; +} Sector; + +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); +static inline void put_dev_sector(Sector p) { - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); + put_page(p.v); } static inline void diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f629d40c645c..53a1325efbc3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1484,15 +1484,6 @@ static inline unsigned int block_size(struct block_device *bdev) return bdev->bd_block_size; } -typedef struct {struct page *v;} Sector; - -unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); - -static inline void put_dev_sector(Sector p) -{ - put_page(p.v); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); -- cgit v1.2.3-71-gd317 From 74cc979c3c7f8328b24651daf15280f07533e735 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:19 +0100 Subject: block: cleanup how md_autodetect_dev is called Add a new include/linux/raid/detect.h header to declare the md_autodetect_dev prototype which can be shared between md and the partition code. Then use IS_BUILTIN to call it instead of the ifdef magic. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 11 ++++------- drivers/md/md.c | 1 + include/linux/raid/detect.h | 3 +++ 3 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 include/linux/raid/detect.h (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index fef6bacb2bbb..4d771ae835ed 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -18,14 +18,11 @@ #include #include #include +#include #include "blk.h" #include "partitions/check.h" -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); -#endif - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -407,10 +404,10 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part_to_dev(part)->devt); -#endif + return true; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 8238fb64f87f..1c7193715f07 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/raid/detect.h b/include/linux/raid/detect.h new file mode 100644 index 000000000000..37dd3f40cd31 --- /dev/null +++ b/include/linux/raid/detect.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +void md_autodetect_dev(dev_t dev); -- cgit v1.2.3-71-gd317 From 1442f76d4317b420580e11238d20789708c742a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:26 +0100 Subject: block: move struct partition out of genhd.h struct partition is the on-disk format of a MSDOS partition table entry. Move it out of genhd.h into a new msdos_partition.h header and give it a msdos_ prefix to avoid confusion. Also move the magic number from block/partitions/msdos.h to the new header so that it can be used by the SCSI drivers looking at the DOS partition tables. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 6 +++--- block/partitions/msdos.c | 28 ++++++++++++++-------------- block/partitions/msdos.h | 8 -------- drivers/scsi/BusLogic.c | 8 +++++--- drivers/scsi/aacraid/linit.c | 7 ++++--- drivers/scsi/scsi_debug.c | 5 +++-- drivers/scsi/scsicam.c | 3 ++- include/linux/genhd.h | 13 ------------- include/linux/msdos_partition.h | 20 ++++++++++++++++++++ 9 files changed, 51 insertions(+), 47 deletions(-) delete mode 100644 block/partitions/msdos.h create mode 100644 include/linux/msdos_partition.h (limited to 'include/linux') diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index a2d97ee1908c..6fdfcb40c537 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -14,10 +14,10 @@ #include #include #include +#include #include "ldm.h" #include "check.h" -#include "msdos.h" /* * ldm_debug/info/error/crit - Output an error message @@ -493,7 +493,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; - struct partition *p; + struct msdos_partition *p; int i; bool result = false; @@ -508,7 +508,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; - p = (struct partition*)(data + 0x01BE); + p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (SYS_IND (p) == LDM_PARTITION) { result = true; diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index c572022f3781..88ee5ee7f442 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -20,9 +20,9 @@ * Re-organised Feb 1998 Russell King */ #include +#include #include "check.h" -#include "msdos.h" #include "efi.h" /* @@ -34,17 +34,17 @@ #define SYS_IND(p) get_unaligned(&p->sys_ind) -static inline sector_t nr_sects(struct partition *p) +static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } -static inline sector_t start_sect(struct partition *p) +static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } -static inline int is_extended_partition(struct partition *p) +static inline int is_extended_partition(struct msdos_partition *p) { return (SYS_IND(p) == DOS_EXTENDED_PARTITION || SYS_IND(p) == WIN98_EXTENDED_PARTITION || @@ -67,7 +67,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { - struct partition *pt = (struct partition *) (p + 0x1be); + struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; @@ -121,7 +121,7 @@ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { - struct partition *p; + struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; @@ -145,7 +145,7 @@ static void parse_extended(struct parsed_partitions *state, if (!msdos_magic_present(data + 510)) goto done; - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, @@ -402,14 +402,14 @@ static void parse_minix(struct parsed_partitions *state, #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; int i; data = read_part_sector(state, offset, §); if (!data) return; - p = (struct partition *)(data + 0x1be); + p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or @@ -453,7 +453,7 @@ int msdos_partition(struct parsed_partitions *state) sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; - struct partition *p; + struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; @@ -487,7 +487,7 @@ int msdos_partition(struct parsed_partitions *state) * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* @@ -509,7 +509,7 @@ int msdos_partition(struct parsed_partitions *state) } #ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { @@ -518,7 +518,7 @@ int msdos_partition(struct parsed_partitions *state) } } #endif - p = (struct partition *) (data + 0x1be); + p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); @@ -565,7 +565,7 @@ int msdos_partition(struct parsed_partitions *state) strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); + p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = SYS_IND(p); int n; diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h deleted file mode 100644 index 123e666bb932..000000000000 --- a/block/partitions/msdos.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - - diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 3170b295a5da..186259417449 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -3410,9 +3411,10 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, a partition table entry whose end_head matches one of the standard BusLogic geometry translations (64/32, 128/32, or 255/63). */ - if (*(unsigned short *) (buf + 64) == 0xAA55) { - struct partition *part1_entry = (struct partition *) buf; - struct partition *part_entry = part1_entry; + if (*(unsigned short *) (buf + 64) == MSDOS_LABEL_MAGIC) { + struct msdos_partition *part1_entry = + (struct msdos_partition *)buf; + struct msdos_partition *part_entry = part1_entry; int saved_cyl = diskparam->cylinders, part_no; unsigned char part_end_head = 0, part_end_sector = 0; diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index ee6bc2f9b80a..0443b74390cf 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -328,9 +329,9 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, buf = scsi_bios_ptable(bdev); if (!buf) return 0; - if(*(__le16 *)(buf + 0x40) == cpu_to_le16(0xaa55)) { - struct partition *first = (struct partition * )buf; - struct partition *entry = first; + if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { + struct msdos_partition *first = (struct msdos_partition *)buf; + struct msdos_partition *entry = first; int saved_cylinders = param->cylinders; int num; unsigned char end_head, end_sec; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 44cb054d5e66..4c6c448dc2df 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -4146,7 +4147,7 @@ static int scsi_debug_host_reset(struct scsi_cmnd *SCpnt) static void __init sdebug_build_parts(unsigned char *ramp, unsigned long store_size) { - struct partition *pp; + struct msdos_partition *pp; int starts[SDEBUG_MAX_PARTS + 2]; int sectors_per_part, num_sectors, k; int heads_by_sects, start_sec, end_sec; @@ -4171,7 +4172,7 @@ static void __init sdebug_build_parts(unsigned char *ramp, ramp[510] = 0x55; /* magic partition markings */ ramp[511] = 0xAA; - pp = (struct partition *)(ramp + 0x1be); + pp = (struct msdos_partition *)(ramp + 0x1be); for (k = 0; starts[k + 1]; ++k, ++pp) { start_sec = starts[k]; end_sec = starts[k + 1] - 1; diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 665d7db2f94c..682cf08ab041 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -61,7 +62,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) { int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; - struct partition *p, *largest = NULL; + struct msdos_partition *p, *largest = NULL; void *buf; int ret = false; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 77f8bb8edfcd..3d84da9ec0fa 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -70,19 +70,6 @@ enum { #include #include -struct partition { - unsigned char boot_ind; /* 0x80 - active */ - unsigned char head; /* starting head */ - unsigned char sector; /* starting sector */ - unsigned char cyl; /* starting cylinder */ - unsigned char sys_ind; /* What partition type */ - unsigned char end_head; /* end head */ - unsigned char end_sector; /* end sector */ - unsigned char end_cyl; /* end cylinder */ - __le32 start_sect; /* starting sector counting from 0 */ - __le32 nr_sects; /* nr of sectors in partition */ -} __attribute__((packed)); - struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h new file mode 100644 index 000000000000..a8e2c1b4bc66 --- /dev/null +++ b/include/linux/msdos_partition.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MSDOS_PARTITION_H +#define _LINUX_MSDOS_PARTITION_H + +#define MSDOS_LABEL_MAGIC 0xAA55 + +struct msdos_partition { + u8 boot_ind; /* 0x80 - active */ + u8 head; /* starting head */ + u8 sector; /* starting sector */ + u8 cyl; /* starting cylinder */ + u8 sys_ind; /* What partition type */ + u8 end_head; /* end head */ + u8 end_sector; /* end sector */ + u8 end_cyl; /* end cylinder */ + __le32 start_sect; /* starting sector counting from 0 */ + __le32 nr_sects; /* nr of sectors in partition */ +} __packed; + +#endif /* LINUX_MSDOS_PARTITION_H */ -- cgit v1.2.3-71-gd317 From 0226e9ead44b2eb8f2471d24e0b0c5ff60bc322c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:27 +0100 Subject: block: move the *_PARTITION enum out of genhd.h The enum containing the *_PARTITION symbolic names is only relevant for the partition parser. More specifically most values are MSDOS partition table system indicators and thus should go straight into msdos.c. One value is only used by the sun partition parser, and the sun and sgi partition parsers use the same value as the x86 Linux RAID indicator to also indicate RAID autodetection. Duplicate them in sun.c and sgi.c given that the different partition types use entirely different values otherwise. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/sgi.c | 4 ++++ block/partitions/sun.c | 5 +++++ include/linux/genhd.h | 30 ------------------------------ include/linux/msdos_partition.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 927cf501603e..4273f1bb0515 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -9,6 +9,10 @@ #define SGI_LABEL_MAGIC 0x0be5a941 +enum { + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 28b44100f2b1..47dc53eccf77 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -13,6 +13,11 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE +enum { + SUN_WHOLE_DISK = 5, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + int sun_partition(struct parsed_partitions *state) { int i; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3d84da9ec0fa..3050b0ee9cc7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -30,36 +30,6 @@ extern struct device_type part_type; extern struct kobject *block_depr; extern struct class block_class; -enum { -/* These three have identical behaviour; use the second one if DOS FDISK gets - confused about extended/logical partitions starting past cylinder 1023. */ - DOS_EXTENDED_PARTITION = 5, - LINUX_EXTENDED_PARTITION = 0x85, - WIN98_EXTENDED_PARTITION = 0x0f, - - SUN_WHOLE_DISK = DOS_EXTENDED_PARTITION, - - LINUX_SWAP_PARTITION = 0x82, - LINUX_DATA_PARTITION = 0x83, - LINUX_LVM_PARTITION = 0x8e, - LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ - - SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, - NEW_SOLARIS_X86_PARTITION = 0xbf, - - DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ - DM6_AUX3PARTITION = 0x53, /* no DDO: use xlated geom */ - DM6_PARTITION = 0x54, /* has DDO: use xlated geom & offset */ - EZD_PARTITION = 0x55, /* EZ-DRIVE */ - - FREEBSD_PARTITION = 0xa5, /* FreeBSD Partition ID */ - OPENBSD_PARTITION = 0xa6, /* OpenBSD Partition ID */ - NETBSD_PARTITION = 0xa9, /* NetBSD Partition ID */ - BSDI_PARTITION = 0xb7, /* BSDI Partition ID */ - MINIX_PARTITION = 0x81, /* Minix Partition ID */ - UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */ -}; - #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h index a8e2c1b4bc66..e151af072cd1 100644 --- a/include/linux/msdos_partition.h +++ b/include/linux/msdos_partition.h @@ -17,4 +17,35 @@ struct msdos_partition { __le32 nr_sects; /* nr of sectors in partition */ } __packed; +enum msdos_sys_ind { + /* + * These three have identical behaviour; use the second one if DOS FDISK + * gets confused about extended/logical partitions starting past + * cylinder 1023. + */ + DOS_EXTENDED_PARTITION = 5, + LINUX_EXTENDED_PARTITION = 0x85, + WIN98_EXTENDED_PARTITION = 0x0f, + + LINUX_SWAP_PARTITION = 0x82, + LINUX_DATA_PARTITION = 0x83, + LINUX_LVM_PARTITION = 0x8e, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ + + SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, + NEW_SOLARIS_X86_PARTITION = 0xbf, + + DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ + DM6_AUX3PARTITION = 0x53, /* no DDO: use xlated geom */ + DM6_PARTITION = 0x54, /* has DDO: use xlated geom & offset */ + EZD_PARTITION = 0x55, /* EZ-DRIVE */ + + FREEBSD_PARTITION = 0xa5, /* FreeBSD Partition ID */ + OPENBSD_PARTITION = 0xa6, /* OpenBSD Partition ID */ + NETBSD_PARTITION = 0xa9, /* NetBSD Partition ID */ + BSDI_PARTITION = 0xb7, /* BSDI Partition ID */ + MINIX_PARTITION = 0x81, /* Minix Partition ID */ + UNIXWARE_PARTITION = 0x63, /* Same as GNU_HURD and SCO Unix */ +}; + #endif /* LINUX_MSDOS_PARTITION_H */ -- cgit v1.2.3-71-gd317 From cb0ab52652123c47cb72e665ce9fdd3029dcb175 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:28 +0100 Subject: partitions/msdos: remove LINUX_SWAP_PARTITION Just always use NEW_SOLARIS_X86_PARTITION and explain the situation, as that is less confusing than two names for a single value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 18 ++++++++++++------ include/linux/msdos_partition.h | 3 +-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 88ee5ee7f442..e44e2f0a02cc 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -77,13 +77,19 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; - /* Assume the partition table is valid if Linux partitions exists */ + + /* + * Assume the partition table is valid if Linux partitions exists. + * Note that old Solaris/x86 partitions use the same indicator as + * Linux swap partitions, so we consider that a Linux partition as + * well. + */ for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) + if (pt->sys_ind == SOLARIS_X86_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, §); diff --git a/include/linux/msdos_partition.h b/include/linux/msdos_partition.h index e151af072cd1..2cb82db2a43c 100644 --- a/include/linux/msdos_partition.h +++ b/include/linux/msdos_partition.h @@ -27,12 +27,11 @@ enum msdos_sys_ind { LINUX_EXTENDED_PARTITION = 0x85, WIN98_EXTENDED_PARTITION = 0x0f, - LINUX_SWAP_PARTITION = 0x82, LINUX_DATA_PARTITION = 0x83, LINUX_LVM_PARTITION = 0x8e, LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ - SOLARIS_X86_PARTITION = LINUX_SWAP_PARTITION, + SOLARIS_X86_PARTITION = 0x82, /* also Linux swap partitions */ NEW_SOLARIS_X86_PARTITION = 0xbf, DM6_AUX1PARTITION = 0x51, /* no DDO: use xlated geom */ -- cgit v1.2.3-71-gd317 From 3f4fc59c1321b74df27dcd5d77b37989ed93265b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:29 +0100 Subject: block: move the various x86 Unix label formats out of genhd.h All these are just used in block/partitions/msdos.c, so move them out of the genhd.h driver included by every driver. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 125 +++++++++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 143 ----------------------------------------------- 2 files changed, 125 insertions(+), 143 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index e44e2f0a02cc..8f2fcc080264 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -18,6 +18,12 @@ * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King + * + * BSD disklabel support by Yossi Gottlieb + * updated by Marc Espie + * + * Unixware slices support by Andrzej Krzysztofowicz + * and Krzysztof G. Baranowski */ #include #include @@ -215,6 +221,30 @@ done: put_dev_sector(sect); } +#define SOLARIS_X86_NUMSLICE 16 +#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) + +struct solaris_x86_slice { + __le16 s_tag; /* ID tag of partition */ + __le16 s_flag; /* permission flags */ + __le32 s_start; /* start sector no of partition */ + __le32 s_size; /* # of blocks in partition */ +}; + +struct solaris_x86_vtoc { + unsigned int v_bootinfo[3]; /* info needed by mboot */ + __le32 v_sanity; /* to verify vtoc sanity */ + __le32 v_version; /* layout version */ + char v_volume[8]; /* volume name */ + __le16 v_sectorsz; /* sector size in bytes */ + __le16 v_nparts; /* number of partitions */ + unsigned int v_reserved[10]; /* free space */ + struct solaris_x86_slice + v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ + unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ + char v_asciilabel[128]; /* for compatibility */ +}; + /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ @@ -270,6 +300,54 @@ static void parse_solaris_x86(struct parsed_partitions *state, #endif } +/* check against BSD src/sys/sys/disklabel.h for consistency */ +#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ +#define BSD_MAXPARTITIONS 16 +#define OPENBSD_MAXPARTITIONS 16 +#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ +struct bsd_disklabel { + __le32 d_magic; /* the magic number */ + __s16 d_type; /* drive type */ + __s16 d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + char d_packname[16]; /* pack identifier */ + __u32 d_secsize; /* # of bytes per sector */ + __u32 d_nsectors; /* # of data sectors per track */ + __u32 d_ntracks; /* # of tracks per cylinder */ + __u32 d_ncylinders; /* # of data cylinders per unit */ + __u32 d_secpercyl; /* # of data sectors per cylinder */ + __u32 d_secperunit; /* # of data sectors per unit */ + __u16 d_sparespertrack; /* # of spare sectors per track */ + __u16 d_sparespercyl; /* # of spare sectors per cylinder */ + __u32 d_acylinders; /* # of alt. cylinders per unit */ + __u16 d_rpm; /* rotational speed */ + __u16 d_interleave; /* hardware sector interleave */ + __u16 d_trackskew; /* sector 0 skew, per track */ + __u16 d_cylskew; /* sector 0 skew, per cylinder */ + __u32 d_headswitch; /* head switch time, usec */ + __u32 d_trkseek; /* track-to-track seek, usec */ + __u32 d_flags; /* generic flags */ +#define NDDATA 5 + __u32 d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + __u32 d_spare[NSPARE]; /* reserved for future use */ + __le32 d_magic2; /* the magic number (again) */ + __le16 d_checksum; /* xor of data incl. partitions */ + + /* filesystem and partition information: */ + __le16 d_npartitions; /* number of partitions in following */ + __le32 d_bbsize; /* size of boot area at sn0, bytes */ + __le32 d_sbsize; /* max size of fs superblock, bytes */ + struct bsd_partition { /* the partition table */ + __le32 p_size; /* number of sectors in partition */ + __le32 p_offset; /* starting sector */ + __le32 p_fsize; /* filesystem basic fragment size */ + __u8 p_fstype; /* filesystem type, see below */ + __u8 p_frag; /* filesystem fragments per block */ + __le16 p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ +}; + #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a @@ -354,6 +432,51 @@ static void parse_openbsd(struct parsed_partitions *state, #endif } +#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ +#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ +#define UNIXWARE_NUMSLICE 16 +#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ + +struct unixware_slice { + __le16 s_label; /* label */ + __le16 s_flags; /* permission flags */ + __le32 start_sect; /* starting sector */ + __le32 nr_sects; /* number of sectors in slice */ +}; + +struct unixware_disklabel { + __le32 d_type; /* drive type */ + __le32 d_magic; /* the magic number */ + __le32 d_version; /* version number */ + char d_serial[12]; /* serial number of the device */ + __le32 d_ncylinders; /* # of data cylinders per device */ + __le32 d_ntracks; /* # of tracks per cylinder */ + __le32 d_nsectors; /* # of data sectors per track */ + __le32 d_secsize; /* # of bytes per sector */ + __le32 d_part_start; /* # of first sector of this partition*/ + __le32 d_unknown1[12]; /* ? */ + __le32 d_alt_tbl; /* byte offset of alternate table */ + __le32 d_alt_len; /* byte length of alternate table */ + __le32 d_phys_cyl; /* # of physical cylinders per device */ + __le32 d_phys_trk; /* # of physical tracks per cylinder */ + __le32 d_phys_sec; /* # of physical sectors per track */ + __le32 d_phys_bytes; /* # of physical bytes per sector */ + __le32 d_unknown2; /* ? */ + __le32 d_unknown3; /* ? */ + __le32 d_pad[8]; /* pad */ + + struct unixware_vtoc { + __le32 v_magic; /* the magic number */ + __le32 v_version; /* version number */ + char v_name[8]; /* volume name */ + __le16 v_nslices; /* # of slices */ + __le16 v_unknown1; /* ? */ + __le32 v_reserved[10]; /* reserved */ + struct unixware_slice + v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ + } vtoc; +}; /* 408 */ + /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. @@ -397,6 +520,8 @@ static void parse_unixware(struct parsed_partitions *state, #endif } +#define MINIX_NR_SUBPARTITIONS 4 + /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3050b0ee9cc7..da62b44b15be 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -473,149 +473,6 @@ static inline void set_capacity(struct gendisk *disk, sector_t size) disk->part0.nr_sects = size; } -#ifdef CONFIG_SOLARIS_X86_PARTITION - -#define SOLARIS_X86_NUMSLICE 16 -#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) - -struct solaris_x86_slice { - __le16 s_tag; /* ID tag of partition */ - __le16 s_flag; /* permission flags */ - __le32 s_start; /* start sector no of partition */ - __le32 s_size; /* # of blocks in partition */ -}; - -struct solaris_x86_vtoc { - unsigned int v_bootinfo[3]; /* info needed by mboot (unsupported) */ - __le32 v_sanity; /* to verify vtoc sanity */ - __le32 v_version; /* layout version */ - char v_volume[8]; /* volume name */ - __le16 v_sectorsz; /* sector size in bytes */ - __le16 v_nparts; /* number of partitions */ - unsigned int v_reserved[10]; /* free space */ - struct solaris_x86_slice - v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ - unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp (unsupported) */ - char v_asciilabel[128]; /* for compatibility */ -}; - -#endif /* CONFIG_SOLARIS_X86_PARTITION */ - -#ifdef CONFIG_BSD_DISKLABEL -/* - * BSD disklabel support by Yossi Gottlieb - * updated by Marc Espie - */ - -/* check against BSD src/sys/sys/disklabel.h for consistency */ - -#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ -#define BSD_MAXPARTITIONS 16 -#define OPENBSD_MAXPARTITIONS 16 -#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ -struct bsd_disklabel { - __le32 d_magic; /* the magic number */ - __s16 d_type; /* drive type */ - __s16 d_subtype; /* controller/d_type specific */ - char d_typename[16]; /* type name, e.g. "eagle" */ - char d_packname[16]; /* pack identifier */ - __u32 d_secsize; /* # of bytes per sector */ - __u32 d_nsectors; /* # of data sectors per track */ - __u32 d_ntracks; /* # of tracks per cylinder */ - __u32 d_ncylinders; /* # of data cylinders per unit */ - __u32 d_secpercyl; /* # of data sectors per cylinder */ - __u32 d_secperunit; /* # of data sectors per unit */ - __u16 d_sparespertrack; /* # of spare sectors per track */ - __u16 d_sparespercyl; /* # of spare sectors per cylinder */ - __u32 d_acylinders; /* # of alt. cylinders per unit */ - __u16 d_rpm; /* rotational speed */ - __u16 d_interleave; /* hardware sector interleave */ - __u16 d_trackskew; /* sector 0 skew, per track */ - __u16 d_cylskew; /* sector 0 skew, per cylinder */ - __u32 d_headswitch; /* head switch time, usec */ - __u32 d_trkseek; /* track-to-track seek, usec */ - __u32 d_flags; /* generic flags */ -#define NDDATA 5 - __u32 d_drivedata[NDDATA]; /* drive-type specific information */ -#define NSPARE 5 - __u32 d_spare[NSPARE]; /* reserved for future use */ - __le32 d_magic2; /* the magic number (again) */ - __le16 d_checksum; /* xor of data incl. partitions */ - - /* filesystem and partition information: */ - __le16 d_npartitions; /* number of partitions in following */ - __le32 d_bbsize; /* size of boot area at sn0, bytes */ - __le32 d_sbsize; /* max size of fs superblock, bytes */ - struct bsd_partition { /* the partition table */ - __le32 p_size; /* number of sectors in partition */ - __le32 p_offset; /* starting sector */ - __le32 p_fsize; /* filesystem basic fragment size */ - __u8 p_fstype; /* filesystem type, see below */ - __u8 p_frag; /* filesystem fragments per block */ - __le16 p_cpg; /* filesystem cylinders per group */ - } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ -}; - -#endif /* CONFIG_BSD_DISKLABEL */ - -#ifdef CONFIG_UNIXWARE_DISKLABEL -/* - * Unixware slices support by Andrzej Krzysztofowicz - * and Krzysztof G. Baranowski - */ - -#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ -#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ -#define UNIXWARE_NUMSLICE 16 -#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ - -struct unixware_slice { - __le16 s_label; /* label */ - __le16 s_flags; /* permission flags */ - __le32 start_sect; /* starting sector */ - __le32 nr_sects; /* number of sectors in slice */ -}; - -struct unixware_disklabel { - __le32 d_type; /* drive type */ - __le32 d_magic; /* the magic number */ - __le32 d_version; /* version number */ - char d_serial[12]; /* serial number of the device */ - __le32 d_ncylinders; /* # of data cylinders per device */ - __le32 d_ntracks; /* # of tracks per cylinder */ - __le32 d_nsectors; /* # of data sectors per track */ - __le32 d_secsize; /* # of bytes per sector */ - __le32 d_part_start; /* # of first sector of this partition */ - __le32 d_unknown1[12]; /* ? */ - __le32 d_alt_tbl; /* byte offset of alternate table */ - __le32 d_alt_len; /* byte length of alternate table */ - __le32 d_phys_cyl; /* # of physical cylinders per device */ - __le32 d_phys_trk; /* # of physical tracks per cylinder */ - __le32 d_phys_sec; /* # of physical sectors per track */ - __le32 d_phys_bytes; /* # of physical bytes per sector */ - __le32 d_unknown2; /* ? */ - __le32 d_unknown3; /* ? */ - __le32 d_pad[8]; /* pad */ - - struct unixware_vtoc { - __le32 v_magic; /* the magic number */ - __le32 v_version; /* version number */ - char v_name[8]; /* volume name */ - __le16 v_nslices; /* # of slices */ - __le16 v_unknown1; /* ? */ - __le32 v_reserved[10]; /* reserved */ - struct unixware_slice - v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ - } vtoc; - -}; /* 408 */ - -#endif /* CONFIG_UNIXWARE_DISKLABEL */ - -#ifdef CONFIG_MINIX_SUBPARTITION -# define MINIX_NR_SUBPARTITIONS 4 -#endif /* CONFIG_MINIX_SUBPARTITION */ - #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -- cgit v1.2.3-71-gd317 From 2b8bd423614c595540eaadcfbc702afe8e155e50 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:04 +0300 Subject: block/diskstats: more accurate approximation of io_ticks for slow disks Currently io_ticks is approximated by adding one at each start and end of requests if jiffies counter has changed. This works perfectly for requests shorter than a jiffy or if one of requests starts/ends at each jiffy. If disk executes just one request at a time and they are longer than two jiffies then only first and last jiffies will be accounted. Fix is simple: at the end of request add up into io_ticks jiffies passed since last update rather than just one jiffy. Example: common HDD executes random read 4k requests around 12ms. fio --name=test --filename=/dev/sdb --rw=randread --direct=1 --runtime=30 & iostat -x 10 sdb Note changes of iostat's "%util" 8,43% -> 99,99% before/after patch: Before: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,60 0,00 330,40 0,00 8,00 0,96 12,09 12,09 0,00 1,02 8,43 After: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,50 0,00 330,00 0,00 8,00 1,00 12,10 12,10 0,00 12,12 99,99 Now io_ticks does not loose time between start and end of requests, but for queue-depth > 1 some I/O time between adjacent starts might be lost. For load estimation "%util" is not as useful as average queue length, but it clearly shows how often disk queue is completely empty. Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Konstantin Khlebnikov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/admin-guide/iostats.rst | 5 ++++- block/bio.c | 8 ++++---- block/blk-core.c | 4 ++-- include/linux/genhd.h | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index df5b8345c41d..9b14b0c2c9c4 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -100,7 +100,7 @@ Field 10 -- # of milliseconds spent doing I/Os (unsigned int) Since 5.0 this field counts jiffies when at least one request was started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. + I/O time might be not accounted in case of concurrent requests. Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int) This field is incremented at each I/O start, I/O completion, I/O @@ -143,6 +143,9 @@ are summed (possibly overflowing the unsigned long variable they are summed to) and the result given to the user. There is no convenient user interface for accessing the per-CPU counters themselves. +Since 4.19 request times are measured with nanoseconds precision and +truncated to milliseconds before showing in this interface. + Disks vs Partitions ------------------- diff --git a/block/bio.c b/block/bio.c index 209715765a7a..68f65ef2ceba 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1768,14 +1768,14 @@ defer: schedule_work(&bio_dirty_work); } -void update_io_ticks(struct hd_struct *part, unsigned long now) +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { - __part_stat_add(part, io_ticks, 1); + __part_stat_add(part, io_ticks, end ? now - stamp : 1); } } if (part->partno) { @@ -1791,7 +1791,7 @@ void generic_start_io_acct(struct request_queue *q, int op, part_stat_lock(); - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); @@ -1809,7 +1809,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, part_stat_lock(); - update_io_ticks(part, now); + update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index abfdcf81a228..4401b30a1751 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1337,7 +1337,7 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_lock(); part = req->part; - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); @@ -1379,7 +1379,7 @@ void blk_account_io_start(struct request *rq, bool new_io) rq->part = part; } - update_io_ticks(part, jiffies); + update_io_ticks(part, jiffies, false); part_stat_unlock(); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index da62b44b15be..13bb51f37b3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -422,7 +422,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw); -void update_io_ticks(struct hd_struct *part, unsigned long now); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, -- cgit v1.2.3-71-gd317 From ea18e0f0a63af9064db3d4065d90fa743ae0991b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:06 +0300 Subject: block/diskstats: accumulate all per-cpu counters in one pass Reading /proc/diskstats iterates over all cpus for summing each field. It's faster to sum all fields in one pass. Hammering /proc/diskstats with fio shows 2x performance improvement: fio --name=test --numjobs=$JOBS --filename=/proc/diskstats \ --size=1k --bs=1k --fallocate=none --create_on_open=1 \ --time_based=1 --runtime=10 --invalidate=0 --group_report JOBS=1 JOBS=10 Before: 7k iops 64k iops After: 18k iops 120k iops Also this way code is more compact: add/remove: 1/0 grow/shrink: 0/2 up/down: 194/-1540 (-1346) Function old new delta part_stat_read_all - 194 +194 diskstats_show 1344 631 -713 part_stat_show 1219 392 -827 Total: Before=14966947, After=14965601, chg -0.01% Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/genhd.c | 102 ++++++++++++++++++++++++++++++++++---------------- include/linux/genhd.h | 3 -- 2 files changed, 70 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index f7d60b620b97..9eb981f7e5a4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -92,6 +92,34 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); +#ifdef CONFIG_SMP +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; + stat->time_in_queue += ptr->time_in_queue; + } +} +#else /* CONFIG_SMP */ +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); +} +#endif /* CONFIG_SMP */ + void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { if (queue_is_mq(q)) @@ -1214,9 +1242,12 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; unsigned int inflight; + part_stat_read_all(p, &stat); inflight = part_in_flight(q, p); + return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -1224,23 +1255,23 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", - part_stat_read(p, ios[STAT_READ]), - part_stat_read(p, merges[STAT_READ]), - (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(p, STAT_READ), - part_stat_read(p, ios[STAT_WRITE]), - part_stat_read(p, merges[STAT_WRITE]), - (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(p, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue)), - part_stat_read(p, ios[STAT_DISCARD]), - part_stat_read(p, merges[STAT_DISCARD]), - (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(p, STAT_DISCARD), - part_stat_read(p, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(p, STAT_FLUSH)); + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, @@ -1492,6 +1523,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct hd_struct *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; + struct disk_stats stat; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1503,7 +1535,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { + part_stat_read_all(hd, &stat); inflight = part_in_flight(gp->queue, hd); + seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1513,23 +1547,27 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[STAT_READ]), - part_stat_read(hd, merges[STAT_READ]), - part_stat_read(hd, sectors[STAT_READ]), - (unsigned int)part_stat_read_msecs(hd, STAT_READ), - part_stat_read(hd, ios[STAT_WRITE]), - part_stat_read(hd, merges[STAT_WRITE]), - part_stat_read(hd, sectors[STAT_WRITE]), - (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), inflight, - jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)), - part_stat_read(hd, ios[STAT_DISCARD]), - part_stat_read(hd, merges[STAT_DISCARD]), - part_stat_read(hd, sectors[STAT_DISCARD]), - (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD), - part_stat_read(hd, ios[STAT_FLUSH]), - (unsigned int)part_stat_read_msecs(hd, STAT_FLUSH) + jiffies_to_msecs(stat.io_ticks), + jiffies_to_msecs(stat.time_in_queue), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) ); } disk_part_iter_exit(&piter); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13bb51f37b3f..b0c588d1aa29 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -380,9 +380,6 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ -#define part_stat_read_msecs(part, which) \ - div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) - #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ -- cgit v1.2.3-71-gd317 From 8cd5b8fc00716fb71f6b32d594b38a8f286d6c20 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:08 +0300 Subject: block/diskstats: replace time_in_queue with sum of request times Column "time_in_queue" in diskstats is supposed to show total waiting time of all requests. I.e. value should be equal to the sum of times from other columns. But this is not true, because column "time_in_queue" is counted separately in jiffies rather than in nanoseconds as other times. This patch removes redundant counter for "time_in_queue" and shows total time of read, write, discard and flush requests. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/bio.c | 1 - block/blk-core.c | 1 - block/genhd.c | 13 ++++++++++--- include/linux/genhd.h | 1 - 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 68f65ef2ceba..bc9152977bf0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1811,7 +1811,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op, update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 4401b30a1751..eaf6cb3887e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1340,7 +1340,6 @@ void blk_account_io_done(struct request *req, u64 now) update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); - part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); diff --git a/block/genhd.c b/block/genhd.c index 9eb981f7e5a4..792356e922a1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -110,7 +110,6 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } stat->io_ticks += ptr->io_ticks; - stat->time_in_queue += ptr->time_in_queue; } } #else /* CONFIG_SMP */ @@ -1265,7 +1264,11 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], @@ -1559,7 +1562,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), - jiffies_to_msecs(stat.time_in_queue), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], stat.sectors[STAT_DISCARD], diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b0c588d1aa29..790fdc3e0b3d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -46,7 +46,6 @@ struct disk_stats { unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; - unsigned long time_in_queue; local_t in_flight[2]; }; -- cgit v1.2.3-71-gd317 From 31eb6186797c149665fd44f3847bdf8d539efa59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:35 +0100 Subject: block: mark block_depr static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/genhd.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 792356e922a1..7dafd7504493 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,7 +27,7 @@ #include "blk.h" static DEFINE_MUTEX(block_class_lock); -struct kobject *block_depr; +static struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 790fdc3e0b3d..b322e805a916 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -27,7 +27,6 @@ #define part_to_dev(part) (&((part)->__dev)) extern struct device_type part_type; -extern struct kobject *block_depr; extern struct class block_class; #define DISK_MAX_PARTS 256 -- cgit v1.2.3-71-gd317 From 6005771c17db56b6a9acc12bd084134191560e18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:36 +0100 Subject: block: mark part_in_flight and part_in_flight_rw static Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ++++--- include/linux/genhd.h | 3 --- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 7dafd7504493..5f9df331822a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -139,7 +139,8 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) +static unsigned int part_in_flight(struct request_queue *q, + struct hd_struct *part) { int cpu; unsigned int inflight; @@ -159,8 +160,8 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) return inflight; } -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) { int cpu; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b322e805a916..c0c5bb51fa56 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -409,9 +409,6 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_local_read_cpu(gendiskp, field, cpu) \ local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) -unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part); -void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, -- cgit v1.2.3-71-gd317 From 29125ed624eeb3ac2eb7bca313a8de29c1c84dcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:40 +0100 Subject: block: move guard_bio_eod to bio.c This is bio layer functionality and not related to buffer heads. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/buffer.c | 43 ------------------------------------------- fs/internal.h | 1 - include/linux/bio.h | 1 + 4 files changed, 44 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index bc9152977bf0..11e6aac35092 100644 --- a/block/bio.c +++ b/block/bio.c @@ -588,6 +588,49 @@ void bio_truncate(struct bio *bio, unsigned new_size) bio->bi_iter.bi_size = new_size; } +/** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. + */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to diff --git a/fs/buffer.c b/fs/buffer.c index b8d28370cfd7..3f5758e01e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3019,49 +3019,6 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -/* - * This allows us to do IO even on the odd last sectors - * of a device, even if the block size is some multiple - * of the physical sector size. - * - * We'll just truncate the bio to the size of the device, - * and clear the end of the buffer head manually. - * - * Truly out-of-range accesses will turn into actual IO - * errors, this only handles the "we need to be able to - * do IO at the final sector" case. - */ -void guard_bio_eod(struct bio *bio) -{ - sector_t maxsector; - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = part_nr_sects_read(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); - - if (!maxsector) - return; - - /* - * If the *whole* IO is past the end of the device, - * let it through, and the IO layer will turn it into - * an EIO. - */ - if (unlikely(bio->bi_iter.bi_sector >= maxsector)) - return; - - maxsector -= bio->bi_iter.bi_sector; - if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) - return; - - bio_truncate(bio, maxsector << 9); -} - static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) { diff --git a/fs/internal.h b/fs/internal.h index f3f280b952a3..4d37912a5587 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -38,7 +38,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) /* * buffer.c */ -extern void guard_bio_eod(struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); diff --git a/include/linux/bio.h b/include/linux/bio.h index 853d92ceee64..a430e9c1c2d2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -471,6 +471,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); void bio_truncate(struct bio *bio, unsigned new_size); +void guard_bio_eod(struct bio *bio); static inline void zero_fill_bio(struct bio *bio) { -- cgit v1.2.3-71-gd317 From 581e26004a09c50e5017caadc850ea17e374a5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:41 +0100 Subject: block: move block layer internals out of include/linux/genhd.h None of this needs to be exposed to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 116 +++++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 1 + block/partitions/check.h | 1 + block/partitions/core.c | 1 - include/linux/genhd.h | 120 ----------------------------------------------- 5 files changed, 118 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index 43df9dcb3d4e..ac20f972842e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -149,6 +149,9 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } + +void blk_integrity_add(struct gendisk *); +void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) @@ -171,6 +174,12 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } +static inline void blk_integrity_add(struct gendisk *disk) +{ +} +static inline void blk_integrity_del(struct gendisk *disk) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); @@ -365,4 +374,111 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + +int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +void blk_free_devt(dev_t devt); +void blk_invalidate_devt(dev_t devt); +char *disk_name(struct gendisk *hd, int partno, char *buf); +#define ADDPART_FLAG_NONE 0 +#define ADDPART_FLAG_RAID 1 +#define ADDPART_FLAG_WHOLEDISK 2 +struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info); +void __delete_partition(struct percpu_ref *ref); +void delete_partition(struct gendisk *disk, int partno); +int disk_expand_part_tbl(struct gendisk *disk, int target); + +static inline int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, __delete_partition, 0, + GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +static inline void hd_struct_get(struct hd_struct *part) +{ + percpu_ref_get(&part->ref); +} + +static inline int hd_struct_try_get(struct hd_struct *part) +{ + return percpu_ref_tryget_live(&part->ref); +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + percpu_ref_put(&part->ref); +} + +static inline void hd_struct_kill(struct hd_struct *part) +{ + percpu_ref_kill(&part->ref); +} + +static inline void hd_free_part(struct hd_struct *part) +{ + free_part_stats(part); + kfree(part->info); + percpu_ref_exit(&part->ref); +} + +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/ioctl.c b/block/ioctl.c index 127194b9f9bd..6e827de1a4c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,6 +11,7 @@ #include #include #include +#include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) diff --git a/block/partitions/check.h b/block/partitions/check.h index f845355489ec..c577e9ee67f0 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -2,6 +2,7 @@ #include #include #include +#include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition diff --git a/block/partitions/core.c b/block/partitions/core.c index b442bc209b86..b79c4513629b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -10,7 +10,6 @@ #include #include #include -#include "../blk.h" #include "check.h" static int (*check_part[])(struct parsed_partitions *) = { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c0c5bb51fa56..14354a6e89c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -298,9 +298,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, - sector_t sector); - /* * Macros to operate on percpu disk statistics: * @@ -409,13 +406,6 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_local_read_cpu(gendiskp, field, cpu) \ local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) -void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, - int rw); -void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, - int rw); - -void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); @@ -465,27 +455,11 @@ static inline void set_capacity(struct gendisk *disk, sector_t size) disk->part0.nr_sects = size; } -#define ADDPART_FLAG_NONE 0 -#define ADDPART_FLAG_RAID 1 -#define ADDPART_FLAG_WHOLEDISK 2 - -extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt); -extern void blk_free_devt(dev_t devt); -extern void blk_invalidate_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); -extern char *disk_name (struct gendisk *hd, int partno, char *buf); int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); -extern int disk_expand_part_tbl(struct gendisk *disk, int target); -extern struct hd_struct * __must_check add_partition(struct gendisk *disk, - int partno, sector_t start, - sector_t len, int flags, - struct partition_meta_info - *info); -extern void __delete_partition(struct percpu_ref *); -extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); @@ -517,100 +491,6 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) -static inline int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, __delete_partition, 0, - GFP_KERNEL)) - return -ENOMEM; - return 0; -} - -static inline void hd_struct_get(struct hd_struct *part) -{ - percpu_ref_get(&part->ref); -} - -static inline int hd_struct_try_get(struct hd_struct *part) -{ - return percpu_ref_tryget_live(&part->ref); -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - percpu_ref_put(&part->ref); -} - -static inline void hd_struct_kill(struct hd_struct *part) -{ - percpu_ref_kill(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - free_part_stats(part); - kfree(part->info); - percpu_ref_exit(&part->ref); -} - -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. - */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - -#if defined(CONFIG_BLK_DEV_INTEGRITY) -extern void blk_integrity_add(struct gendisk *); -extern void blk_integrity_del(struct gendisk *); -#else /* CONFIG_BLK_DEV_INTEGRITY */ -static inline void blk_integrity_add(struct gendisk *disk) { } -static inline void blk_integrity_del(struct gendisk *disk) { } -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -- cgit v1.2.3-71-gd317 From c6a564ffadc9105880329710164ee493f0de103c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 16:48:42 +0100 Subject: block: move the part_stat* helpers from genhd.h to a new header These macros are just used by a few files. Move them out of genhd.h, which is included everywhere into a new standalone header. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 1 + drivers/block/drbd/drbd_receiver.c | 1 + drivers/block/drbd/drbd_worker.c | 1 + drivers/block/zram/zram_drv.c | 1 + drivers/md/dm.c | 1 + drivers/md/md.c | 1 + drivers/nvme/target/admin-cmd.c | 1 + fs/ext4/super.c | 2 +- fs/ext4/sysfs.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + include/linux/genhd.h | 108 ---------------------------------- include/linux/part_stat.h | 115 +++++++++++++++++++++++++++++++++++++ 13 files changed, 126 insertions(+), 109 deletions(-) create mode 100644 include/linux/part_stat.h (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index ac20f972842e..d9673164a145 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "blk-mq.h" #include "blk-mq-sched.h" diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 79e216446030..c15e7083b13a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index b7f605c6e231..0dc019da1f8d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1bdb5793842b..76e75097e9ab 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "zram_drv.h" diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0413018c8305..0d881cfa160b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -25,6 +25,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c7193715f07..f6cf3b53f6c1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include "md.h" diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 72a7e41f3018..cca759c918a4 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4cbd1cc34dfa..c8dff4c68141 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -43,7 +43,7 @@ #include #include #include - +#include #include #include diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index d218ebdafa4a..04bfaf63752c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5355be6b6755..088c3e7a1080 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 65a7a432dfee..d398b2d90c6c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 14354a6e89c2..927eed0be179 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -298,114 +298,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -/* - * Macros to operate on percpu disk statistics: - * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters - * and should be called between disk_stat_lock() and - * disk_stat_unlock(). - * - * part_stat_read() can be called at any time. - * - * part_stat_{add|set_all}() and {init|free}_part_stats are for - * internal use only. - */ -#ifdef CONFIG_SMP -#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) - -#define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) - -#define part_stat_get(part, field) \ - part_stat_get_cpu(part, field, smp_processor_id()) - -#define part_stat_read(part, field) \ -({ \ - typeof((part)->dkstats->field) res = 0; \ - unsigned int _cpu; \ - for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ - res; \ -}) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - int i; - - for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, - sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ - free_percpu(part->dkstats); -} - -#else /* !CONFIG_SMP */ -#define part_stat_lock() ({ rcu_read_lock(); 0; }) -#define part_stat_unlock() rcu_read_unlock() - -#define part_stat_get(part, field) ((part)->dkstats.field) -#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) -#define part_stat_read(part, field) part_stat_get(part, field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - memset(&part->dkstats, value, sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ -} - -#endif /* CONFIG_SMP */ - -#define part_stat_read_accum(part, field) \ - (part_stat_read(part, field[STAT_READ]) + \ - part_stat_read(part, field[STAT_WRITE]) + \ - part_stat_read(part, field[STAT_DISCARD])) - -#define __part_stat_add(part, field, addnd) \ - (part_stat_get(part, field) += (addnd)) - -#define part_stat_add(part, field, addnd) do { \ - __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ -} while (0) - -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) - -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) - /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h new file mode 100644 index 000000000000..ece607607a86 --- /dev/null +++ b/include/linux/part_stat.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PART_STAT_H +#define _LINUX_PART_STAT_H + +#include + +/* + * Macros to operate on percpu disk statistics: + * + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters + * and should be called between disk_stat_lock() and + * disk_stat_unlock(). + * + * part_stat_read() can be called at any time. + * + * part_stat_{add|set_all}() and {init|free}_part_stats are for + * internal use only. + */ +#ifdef CONFIG_SMP +#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) +#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) + +#define part_stat_get_cpu(part, field, cpu) \ + (per_cpu_ptr((part)->dkstats, (cpu))->field) + +#define part_stat_get(part, field) \ + part_stat_get_cpu(part, field, smp_processor_id()) + +#define part_stat_read(part, field) \ +({ \ + typeof((part)->dkstats->field) res = 0; \ + unsigned int _cpu; \ + for_each_possible_cpu(_cpu) \ + res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res; \ +}) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + int i; + + for_each_possible_cpu(i) + memset(per_cpu_ptr(part->dkstats, i), value, + sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + part->dkstats = alloc_percpu(struct disk_stats); + if (!part->dkstats) + return 0; + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ + free_percpu(part->dkstats); +} + +#else /* !CONFIG_SMP */ +#define part_stat_lock() ({ rcu_read_lock(); 0; }) +#define part_stat_unlock() rcu_read_unlock() + +#define part_stat_get(part, field) ((part)->dkstats.field) +#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) +#define part_stat_read(part, field) part_stat_get(part, field) + +static inline void part_stat_set_all(struct hd_struct *part, int value) +{ + memset(&part->dkstats, value, sizeof(struct disk_stats)); +} + +static inline int init_part_stats(struct hd_struct *part) +{ + return 1; +} + +static inline void free_part_stats(struct hd_struct *part) +{ +} + +#endif /* CONFIG_SMP */ + +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) + +#define __part_stat_add(part, field, addnd) \ + (part_stat_get(part, field) += (addnd)) + +#define part_stat_add(part, field, addnd) do { \ + __part_stat_add((part), field, addnd); \ + if ((part)->partno) \ + __part_stat_add(&part_to_disk((part))->part0, \ + field, addnd); \ +} while (0) + +#define part_stat_dec(gendiskp, field) \ + part_stat_add(gendiskp, field, -1) +#define part_stat_inc(gendiskp, field) \ + part_stat_add(gendiskp, field, 1) +#define part_stat_sub(gendiskp, field, subnd) \ + part_stat_add(gendiskp, field, -subnd) + +#define part_stat_local_dec(gendiskp, field) \ + local_dec(&(part_stat_get(gendiskp, field))) +#define part_stat_local_inc(gendiskp, field) \ + local_inc(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read(gendiskp, field) \ + local_read(&(part_stat_get(gendiskp, field))) +#define part_stat_local_read_cpu(gendiskp, field, cpu) \ + local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) + +#endif /* _LINUX_PART_STAT_H */ -- cgit v1.2.3-71-gd317 From 348e114bbd4dce430eae70f01a04c8fc259b4cf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:07:17 +0100 Subject: block: move the ->devnode callback to struct block_device_operations There really isn't any good reason to stash a method directly into struct gendisk. Move it together with the other block device operations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- drivers/block/pktcdvd.c | 12 ++++++------ include/linux/blkdev.h | 1 + include/linux/genhd.h | 1 - 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 6323cc789efa..14cf395a1479 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1497,8 +1497,8 @@ static char *block_devnode(struct device *dev, umode_t *mode, { struct gendisk *disk = dev_to_disk(dev); - if (disk->devnode) - return disk->devnode(disk, mode); + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); return NULL; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 5f970a7d32c0..0d286a87e647 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2679,6 +2679,11 @@ static unsigned int pkt_check_events(struct gendisk *disk, return attached_disk->fops->check_events(attached_disk, clearing); } +static char *pkt_devnode(struct gendisk *disk, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); +} + static const struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, .open = pkt_open, @@ -2686,13 +2691,9 @@ static const struct block_device_operations pktcdvd_ops = { .ioctl = pkt_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = pkt_check_events, + .devnode = pkt_devnode, }; -static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); -} - /* * Set up mapping from pktcdvd device to CD-ROM device. */ @@ -2748,7 +2749,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); - disk->devnode = pktcdvd_devnode; disk->private_data = pd; disk->queue = blk_alloc_queue(GFP_KERNEL); if (!disk->queue) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53a1325efbc3..e8defd718d62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1697,6 +1697,7 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); + char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; }; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 927eed0be179..85b9e253cd39 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -191,7 +191,6 @@ struct gendisk { * disks that can't be partitioned. */ char disk_name[DISK_NAME_LEN]; /* name of major driver */ - char *(*devnode)(struct gendisk *gd, umode_t *mode); unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ -- cgit v1.2.3-71-gd317 From 2f227bb99934a4faa6dfe2cda2594bce8897a323 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:08 +0100 Subject: block: add a blk_mq_init_queue_data helper This allows a driver to pass a queuedata member before ->init_hctx is called. null_blk currently open codes this logic, but I'd rather have it in the core to ease future maintainance. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++++++++- include/linux/blk-mq.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 745ec592a513..216bf62e88b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2724,13 +2724,15 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) { struct request_queue *uninit_q, *q; uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; /* * Initialize the queue without an elevator. device_add_disk() will do @@ -2742,6 +2744,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; } +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} EXPORT_SYMBOL(blk_mq_init_queue); /* diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 31344d5f83e2..f389d7c724bd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -412,6 +412,8 @@ enum { << BLK_MQ_F_ALLOC_POLICY_START_BIT) struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q, bool elevator_init); -- cgit v1.2.3-71-gd317 From 3d745ea5b095a3985129e162900b7e6c22518a9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 09:30:11 +0100 Subject: block: simplify queue allocation Current make_request based drivers use either blk_alloc_queue_node or blk_alloc_queue to allocate a queue, and then set up the make_request_fn function pointer and a few parameters using the blk_queue_make_request helper. Simplify this by passing the make_request pointer to blk_alloc_queue, and while at it merge the _node variant into the main helper by always passing a node_id, and remove the superfluous gfp_mask parameter. A lower-level __blk_alloc_queue is kept for the blk-mq case. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 3 +-- arch/xtensa/platforms/iss/simdisk.c | 3 +-- block/blk-cgroup.c | 2 +- block/blk-core.c | 39 ++++++++++++++++++++++--------------- block/blk-mq.c | 8 ++------ block/blk-settings.c | 36 ---------------------------------- block/blk.h | 2 ++ drivers/block/brd.c | 4 +--- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/null_blk_main.c | 3 +-- drivers/block/pktcdvd.c | 3 +-- drivers/block/ps3vram.c | 3 +-- drivers/block/rsxx/dev.c | 3 +-- drivers/block/umem.c | 4 +--- drivers/block/zram/zram_drv.c | 4 +--- drivers/lightnvm/core.c | 3 +-- drivers/md/bcache/super.c | 3 +-- drivers/md/dm.c | 9 ++++----- drivers/md/md.c | 3 +-- drivers/nvdimm/blk.c | 3 +-- drivers/nvdimm/btt.c | 3 +-- drivers/nvdimm/pmem.c | 3 +-- drivers/nvme/host/multipath.c | 3 +-- drivers/s390/block/dcssblk.c | 4 ++-- drivers/s390/block/xpram.c | 4 ++-- include/linux/blkdev.h | 4 +--- 26 files changed, 54 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 40712e49381b..c3a630440512 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -118,12 +118,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(nfhd_make_request, NUMA_NO_NODE); if (dev->queue == NULL) goto free_dev; dev->queue->queuedata = dev; - blk_queue_make_request(dev->queue, nfhd_make_request); blk_queue_logical_block_size(dev->queue, bsize); dev->disk = alloc_disk(16); diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 833109880165..49322b66cda9 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -267,13 +267,12 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->queue = blk_alloc_queue(GFP_KERNEL); + dev->queue = blk_alloc_queue(simdisk_make_request, NUMA_NO_NODE); if (dev->queue == NULL) { pr_err("blk_alloc_queue failed\n"); goto out_alloc_queue; } - blk_queue_make_request(dev->queue, simdisk_make_request); dev->queue->queuedata = dev; dev->gd = alloc_disk(SIMDISK_MINORS); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a229b94d5390..c15a26096038 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1010,7 +1010,7 @@ unlock: * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from blk_alloc_queue_node(). Responsible for initializing blkcg + * Called from __blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index eaf6cb3887e6..18b8c09d093e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -388,12 +388,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -struct request_queue *blk_alloc_queue(gfp_t gfp_mask) -{ - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_alloc_queue); - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -470,24 +464,19 @@ static void blk_timeout_work(struct work_struct *work) { } -/** - * blk_alloc_queue_node - allocate a request queue - * @gfp_mask: memory allocation flags - * @node_id: NUMA node to allocate memory from - */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +struct request_queue *__blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, - gfp_mask | __GFP_ZERO, node_id); + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; @@ -495,7 +484,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); if (!q->backing_dev_info) goto fail_split; @@ -541,6 +530,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + return q; fail_ref: @@ -557,7 +549,22 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue_node); + +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) +{ + struct request_queue *q; + + if (WARN_ON_ONCE(!make_request)) + return -EINVAL; + + q = __blk_alloc_queue(node_id); + if (!q) + return NULL; + q->make_request_fn = make_request; + q->nr_requests = BLKDEV_MAX_RQ; + return q; +} +EXPORT_SYMBOL(blk_alloc_queue); bool blk_get_queue(struct request_queue *q) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 216bf62e88b6..f6291ceedee4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2729,7 +2729,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = __blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; @@ -2939,11 +2939,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - blk_queue_make_request(q, blk_mq_make_request); - - /* - * Do this after blk_queue_make_request() overrides it... - */ + q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index c8eda2e7b91e..126d216a2db6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -86,42 +86,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - - q->make_request_fn = mfn; - blk_queue_dma_alignment(q, 511); - - blk_set_default_limits(&q->limits); -} -EXPORT_SYMBOL(blk_queue_make_request); - /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index d9673164a145..491e52fc0aa6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -482,4 +482,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } +struct request_queue *__blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 220c5e18aba0..2fb25c348d53 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -381,12 +381,10 @@ static struct brd_device *brd_alloc(int i) spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); - brd->brd_queue = blk_alloc_queue(GFP_KERNEL); + brd->brd_queue = blk_alloc_queue(brd_make_request, NUMA_NO_NODE); if (!brd->brd_queue) goto out_free_dev; - blk_queue_make_request(brd->brd_queue, brd_make_request); - /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a18155cdce41..72a7c3ea2ce3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2801,7 +2801,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + q = blk_alloc_queue(drbd_make_request, NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; @@ -2828,7 +2828,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig q->backing_dev_info->congested_fn = drbd_congested; q->backing_dev_info->congested_data = device; - blk_queue_make_request(q, drbd_make_request); blk_queue_write_cache(q, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 2f864782550d..5f13793d35ee 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1743,12 +1743,11 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); + nullb->q = blk_alloc_queue(null_queue_bio, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; } - blk_queue_make_request(nullb->q, null_queue_bio); rv = init_driver_queues(nullb); if (rv) goto out_cleanup_blk_queue; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 0d286a87e647..0b944ac96d6b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2493,7 +2493,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd) { struct request_queue *q = pd->disk->queue; - blk_queue_make_request(q, pkt_make_request); blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); q->queuedata = pd; @@ -2750,7 +2749,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); disk->private_data = pd; - disk->queue = blk_alloc_queue(GFP_KERNEL); + disk->queue = blk_alloc_queue(pkt_make_request, NUMA_NO_NODE); if (!disk->queue) goto out_mem2; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 4628e1a27a2b..821d4d8b1d76 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -737,7 +737,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(ps3vram_make_request, NUMA_NO_NODE); if (!queue) { dev_err(&dev->core, "blk_alloc_queue failed\n"); error = -ENOMEM; @@ -746,7 +746,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) priv->queue = queue; queue->queuedata = dev; - blk_queue_make_request(queue, ps3vram_make_request); blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index c47d28b2ce44..8ffa8260dcaf 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -248,7 +248,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - card->queue = blk_alloc_queue(GFP_KERNEL); + card->queue = blk_alloc_queue(rsxx_make_request, NUMA_NO_NODE); if (!card->queue) { dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); unregister_blkdev(card->major, DRIVER_NAME); @@ -269,7 +269,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) blk_queue_logical_block_size(card->queue, blk_size); } - blk_queue_make_request(card->queue, rsxx_make_request); blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 4eaf97d7a170..d84e8a878df2 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -885,11 +885,9 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + card->queue = blk_alloc_queue(mm_make_request, NUMA_NO_NODE); if (!card->queue) goto failed_alloc; - - blk_queue_make_request(card->queue, mm_make_request); card->queue->queuedata = card; tasklet_init(&card->tasklet, process_page, (unsigned long)card); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 76e75097e9ab..ebb234f36909 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1895,7 +1895,7 @@ static int zram_add(void) #ifdef CONFIG_ZRAM_WRITEBACK spin_lock_init(&zram->wb_limit_lock); #endif - queue = blk_alloc_queue(GFP_KERNEL); + queue = blk_alloc_queue(zram_make_request, NUMA_NO_NODE); if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); @@ -1903,8 +1903,6 @@ static int zram_add(void) goto out_free_idr; } - blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 7543e395a2c6..db38a68abb6c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -380,12 +380,11 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + tqueue = blk_alloc_queue(tt->make_rq, dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; } - blk_queue_make_request(tqueue, tt->make_rq); strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); tdisk->flags = GENHD_FL_EXT_DEVT; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5e38a167c85e..d98354fa28e3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -866,11 +866,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); if (!q) return -ENOMEM; - blk_queue_make_request(q, make_request_fn); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0d881cfa160b..753302e83910 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1939,16 +1939,15 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); - if (!md->queue) - goto bad; - md->queue->queuedata = md; /* * default to bio-based required ->make_request_fn until DM * table is loaded and md->type established. If request-based * table is loaded: blk-mq will override accordingly. */ - blk_queue_make_request(md->queue, dm_make_request); + md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + if (!md->queue) + goto bad; + md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) diff --git a/drivers/md/md.c b/drivers/md/md.c index f6cf3b53f6c1..cd1210a0d957 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5623,12 +5623,11 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); + mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 677d6f45b5c4..43751fab9d36 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -249,13 +249,12 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); available_disk_size = internal_nlba * nsblk_sector_size(nsblk); - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(nd_blk_make_request, NUMA_NO_NODE); if (!q) return -ENOMEM; if (devm_add_action_or_reset(dev, nd_blk_release_queue, q)) return -ENOMEM; - blk_queue_make_request(q, nd_blk_make_request); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0d04ea3d9fd7..3b09419218d6 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1521,7 +1521,7 @@ static int btt_blk_init(struct btt *btt) struct nd_namespace_common *ndns = nd_btt->ndns; /* create a new disk and request queue for btt */ - btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + btt->btt_queue = blk_alloc_queue(btt_make_request, NUMA_NO_NODE); if (!btt->btt_queue) return -ENOMEM; @@ -1540,7 +1540,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; - blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4eae441f86c9..4ffc6f7ca131 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -395,7 +395,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); + q = blk_alloc_queue(pmem_make_request, dev_to_node(dev)); if (!q) return -ENOMEM; @@ -433,7 +433,6 @@ static int pmem_attach_disk(struct device *dev, pmem->virt_addr = addr; blk_queue_write_cache(q, true, fua); - blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..a38d7f196aba 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -377,11 +377,10 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); + q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); if (!q) goto out; q->queuedata = head; - blk_queue_make_request(q, nvme_ns_head_make_request); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 63502ca537eb..80d22290f268 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -636,10 +636,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } dev_info->gd->major = dcssblk_major; dev_info->gd->fops = &dcssblk_devops; - dev_info->dcssblk_queue = blk_alloc_queue(GFP_KERNEL); + dev_info->dcssblk_queue = + blk_alloc_queue(dcssblk_make_request, NUMA_NO_NODE); dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; - blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 3df5d68d09f0..45a04daec89e 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -343,14 +343,14 @@ static int __init xpram_setup_blkdev(void) xpram_disks[i] = alloc_disk(1); if (!xpram_disks[i]) goto out; - xpram_queues[i] = blk_alloc_queue(GFP_KERNEL); + xpram_queues[i] = blk_alloc_queue(xpram_make_request, + NUMA_NO_NODE); if (!xpram_queues[i]) { put_disk(xpram_disks[i]); goto out; } blk_queue_flag_set(QUEUE_FLAG_NONROT, xpram_queues[i]); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, xpram_queues[i]); - blk_queue_make_request(xpram_queues[i], xpram_make_request); blk_queue_logical_block_size(xpram_queues[i], 4096); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e8defd718d62..3f27ff08483e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1063,7 +1063,6 @@ extern void blk_abort_request(struct request *); * Access functions for manipulating queue properties */ extern void blk_cleanup_queue(struct request_queue *); -extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); @@ -1140,8 +1139,7 @@ extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(gfp_t); -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id); +struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); -- cgit v1.2.3-71-gd317 From 130879f1ee0e25b0391b8c78b3baac6fe41f4d38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2020 18:48:37 +0100 Subject: block: move bio_map_* to blk-map.c The bio_map_* helpers are just the low-level helpers for the blk_rq_map_* APIs. Move them together for better logical grouping, as no there isn't much overlap with other code in bio.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 510 +--------------------------------------------------- block/blk-map.c | 508 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 4 + include/linux/bio.h | 14 -- 4 files changed, 513 insertions(+), 523 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 11e6aac35092..21cbaa6a1c20 100644 --- a/block/bio.c +++ b/block/bio.c @@ -780,7 +780,7 @@ static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, * * This should only be used by passthrough bios. */ -static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, bool *same_page) { @@ -1194,90 +1194,6 @@ void bio_list_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_list_copy_data); -struct bio_map_data { - int is_our_pages; - struct iov_iter iter; - struct iovec iov[]; -}; - -static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - if (data->nr_segs > UIO_MAXIOV) - return NULL; - - bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); - if (!bmd) - return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); - bmd->iter = *data; - bmd->iter.iov = bmd->iov; - return bmd; -} - -/** - * bio_copy_from_iter - copy all pages from iov_iter to bio - * @bio: The &struct bio which describes the I/O as destination - * @iter: iov_iter as source - * - * Copy all pages from iov_iter to bio. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_from_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - iter); - - if (!iov_iter_count(iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - -/** - * bio_copy_to_iter - copy all pages from bio to iov_iter - * @bio: The &struct bio which describes the I/O as source - * @iter: iov_iter as destination - * - * Copy all pages from bio to iov_iter. - * Returns 0 on success, or error on failure. - */ -static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) -{ - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - ssize_t ret; - - ret = copy_page_to_iter(bvec->bv_page, - bvec->bv_offset, - bvec->bv_len, - &iter); - - if (!iov_iter_count(&iter)) - break; - - if (ret < bvec->bv_len) - return -EFAULT; - } - - return 0; -} - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1288,430 +1204,6 @@ void bio_free_pages(struct bio *bio) } EXPORT_SYMBOL(bio_free_pages); -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user_iov() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - int ret = 0; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free - * and return -EINTR so user space doesn't expect any data. - */ - if (!current->mm) - ret = -EINTR; - else if (bio_data_dir(bio) == READ) - ret = bio_copy_to_iter(bio, bmd->iter); - if (bmd->is_our_pages) - bio_free_pages(bio); - } - kfree(bmd); - bio_put(bio); - return ret; -} - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct page *page; - struct bio *bio; - int i = 0, ret; - int nr_pages; - unsigned int len = iter->count; - unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; - - bmd = bio_alloc_map_data(iter, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - /* - * We need to do a deep copy of the iov_iter including the iovecs. - * The caller provided iov might point to an on-stack or otherwise - * shortlived one. - */ - bmd->is_our_pages = map_data ? 0 : 1; - - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { - if (!map_data) - __free_page(page); - break; - } - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - if (map_data) - map_data->offset += bio->bi_iter.bi_size; - - /* - * success - */ - if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = bio_copy_from_iter(bio, iter); - if (ret) - goto cleanup; - } else { - if (bmd->is_our_pages) - zero_fill_bio(bio); - iov_iter_advance(iter, bio->bi_iter.bi_size); - } - - bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; -cleanup: - if (!map_data) - bio_free_pages(bio); - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, - gfp_t gfp_mask) -{ - int j; - struct bio *bio; - int ret; - - if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (iov_iter_count(iter)) { - struct page **pages; - ssize_t bytes; - size_t offs, added = 0; - int npages; - - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; - j = 0; - } else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!__bio_add_pc_page(q, bio, page, n, offs, - &same_page)) { - if (same_page) - put_page(page); - break; - } - - added += n; - bytes -= n; - offs = 0; - } - iov_iter_advance(iter, added); - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - put_page(pages[j++]); - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) - break; - } - - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; - - out_unmap: - bio_release_pages(bio, false); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - -static void bio_invalidate_vmalloc_pages(struct bio *bio) -{ -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE - if (bio->bi_private && !op_is_write(bio_op(bio))) { - unsigned long i, len = 0; - - for (i = 0; i < bio->bi_vcnt; i++) - len += bio->bi_io_vec[i].bv_len; - invalidate_kernel_vmap_range(bio->bi_private, len); - } -#endif -} - -static void bio_map_kern_endio(struct bio *bio) -{ - bio_invalidate_vmalloc_pages(bio); - bio_put(bio); -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); - bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { - /* we don't support partial mappings */ - bio_put(bio); - return ERR_PTR(-EINVAL); - } - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -static void bio_copy_kern_endio(struct bio *bio) -{ - bio_free_pages(bio); - bio_put(bio); -} - -static void bio_copy_kern_endio_read(struct bio *bio) -{ - char *p = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); - p += bvec->bv_len; - } - - bio_copy_kern_endio(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - struct bio *bio; - void *p = data; - int nr_pages = 0; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) - goto cleanup; - - if (!reading) - memcpy(page_address(page), p, bytes); - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) - break; - - len -= bytes; - p += bytes; - } - - if (reading) { - bio->bi_end_io = bio_copy_kern_endio_read; - bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; - } - - return bio; - -cleanup: - bio_free_pages(bio); - bio_put(bio); - return ERR_PTR(-ENOMEM); -} - /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. diff --git a/block/blk-map.c b/block/blk-map.c index b0790268ed9d..b72c361911a4 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -11,6 +11,514 @@ #include "blk.h" +struct bio_map_data { + int is_our_pages; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (data->nr_segs > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); + if (!bmd) + return NULL; + memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); + bmd->iter = *data; + bmd->iter.iov = bmd->iov; + return bmd; +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + iter); + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +static int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. + */ + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} + +/** + * bio_copy_user_iov - copy user data to bio + * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +static struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio; + int i = 0, ret; + int nr_pages; + unsigned int len = iter->count; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; + + bmd = bio_alloc_map_data(iter, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = map_data ? 0 : 1; + + nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + if (nr_pages > BIO_MAX_PAGES) + nr_pages = BIO_MAX_PAGES; + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + + ret = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + break; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); + break; + } + + len -= bytes; + offset = 0; + } + + if (ret) + goto cleanup; + + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + /* + * success + */ + if ((iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, iter); + if (ret) + goto cleanup; + } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); + } + + bio->bi_private = bmd; + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + return bio; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ERR_PTR(ret); +} + +/** + * bio_map_user_iov - map user iovec into bio + * @q: the struct request_queue for the bio + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Map the user space address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_user_iov(struct request_queue *q, + struct iov_iter *iter, gfp_t gfp_mask) +{ + int j; + struct bio *bio; + int ret; + + if (!iov_iter_count(iter)) + return ERR_PTR(-EINVAL); + + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (iov_iter_count(iter)) { + struct page **pages; + ssize_t bytes; + size_t offs, added = 0; + int npages; + + bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (unlikely(bytes <= 0)) { + ret = bytes ? bytes : -EFAULT; + goto out_unmap; + } + + npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); + + if (unlikely(offs & queue_dma_alignment(q))) { + ret = -EINVAL; + j = 0; + } else { + for (j = 0; j < npages; j++) { + struct page *page = pages[j]; + unsigned int n = PAGE_SIZE - offs; + bool same_page = false; + + if (n > bytes) + n = bytes; + + if (!__bio_add_pc_page(q, bio, page, n, offs, + &same_page)) { + if (same_page) + put_page(page); + break; + } + + added += n; + bytes -= n; + offs = 0; + } + iov_iter_advance(iter, added); + } + /* + * release the pages we didn't map into the bio, if any + */ + while (j < npages) + put_page(pages[j++]); + kvfree(pages); + /* couldn't stuff something into bio? */ + if (bytes) + break; + } + + bio_set_flag(bio, BIO_USER_MAPPED); + + /* + * subtle -- if bio_map_user_iov() ended up bouncing a bio, + * it would normally disappear when its bi_end_io is run. + * however, we need it for the unmap, so grab an extra + * reference to it + */ + bio_get(bio); + return bio; + + out_unmap: + bio_release_pages(bio, false); + bio_put(bio); + return ERR_PTR(ret); +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. + * + * bio_unmap_user() may sleep. + */ +static void bio_unmap_user(struct bio *bio) +{ + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); + bio_put(bio); +} + +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_invalidate_vmalloc_pages(bio); + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_copy_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} + /* * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. diff --git a/block/blk.h b/block/blk.h index 491e52fc0aa6..0a94ec68af32 100644 --- a/block/blk.h +++ b/block/blk.h @@ -484,4 +484,8 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) struct request_queue *__blk_alloc_queue(int node_id); +int __bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + bool *same_page); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index a430e9c1c2d2..c1c0f9ea4e63 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -441,14 +441,6 @@ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_release_pages(struct bio *bio, bool mark_dirty); -struct rq_map_data; -extern struct bio *bio_map_user_iov(struct request_queue *, - struct iov_iter *, gfp_t); -extern void bio_unmap_user(struct bio *); -extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, - gfp_t); -extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, - gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); @@ -463,12 +455,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); - -extern struct bio *bio_copy_user_iov(struct request_queue *, - struct rq_map_data *, - struct iov_iter *, - gfp_t); -extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); -- cgit v1.2.3-71-gd317