From 37fb3a30b46237f23cfdf7ee09d49f9888dd13bf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 8 Aug 2011 16:08:08 +0200 Subject: fuse: fix flock Commit a9ff4f87 "fuse: support BSD locking semantics" overlooked a number of issues with supporing flock locks over existing POSIX locking infrastructure: - it's not backward compatible, passing flock(2) calls to userspace unconditionally (if userspace sets FUSE_POSIX_LOCKS) - it doesn't cater for the fact that flock locks are automatically unlocked on file release - it doesn't take into account the fact that flock exclusive locks (write locks) don't need an fd opened for write. The last one invalidates the original premise of the patch that flock locks can be emulated with POSIX locks. This patch fixes the first two issues. The last one needs to be fixed in userspace if the filesystem assumed that a write lock will happen only on a file operned for write (as in the case of the current fuse library). Reported-by: Sebastian Pipping Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index d464de53db43..464cff526860 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -47,6 +47,9 @@ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' * - add FUSE_IOCTL_32BIT flag + * + * 7.17 + * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK */ #ifndef _LINUX_FUSE_H @@ -78,7 +81,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 16 +#define FUSE_KERNEL_MINOR_VERSION 17 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -153,8 +156,10 @@ struct fuse_file_lock { /** * INIT request/reply flags * + * FUSE_POSIX_LOCKS: remote locking for POSIX file locks * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -163,6 +168,7 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) +#define FUSE_FLOCK_LOCKS (1 << 10) /** * CUSE INIT request/reply flags @@ -175,6 +181,7 @@ struct fuse_file_lock { * Release flags */ #define FUSE_RELEASE_FLUSH (1 << 0) +#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) /** * Getattr flags -- cgit v1.2.3-71-gd317 From bb0822954aab7d23a3f902c2a103ee0242f6046e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Aug 2011 13:37:14 -0600 Subject: squeeze max-pause area and drop pass-good area Revert the pass-good area introduced in ffd1f609ab10 ("writeback: introduce max-pause and pass-good dirty limits") and make the max-pause area smaller and safe. This fixes ~30% performance regression in the ext3 data=writeback fio_mmap_randwrite_64k/fio_mmap_randrw_64k test cases, where there are 12 JBOD disks, on each disk runs 8 concurrent tasks doing reads+writes. Using deadline scheduler also has a regression, but not that big as CFQ, so this suggests we have some write starvation. The test logs show that - the disks are sometimes under utilized - global dirty pages sometimes rush high to the pass-good area for several hundred seconds, while in the mean time some bdi dirty pages drop to very low value (bdi_dirty << bdi_thresh). Then suddenly the global dirty pages dropped under global dirty threshold and bdi_dirty rush very high (for example, 2 times higher than bdi_thresh). During which time balance_dirty_pages() is not called at all. So the problems are 1) The random writes progress so slow that they break the assumption of the max-pause logic that "8 pages per 200ms is typically more than enough to curb heavy dirtiers". 2) The max-pause logic ignored task_bdi_thresh and thus opens the possibility for some bdi's to over dirty pages, leading to (bdi_dirty >> bdi_thresh) and then (bdi_thresh >> bdi_dirty) for others. 3) The higher max-pause/pass-good thresholds somehow leads to the bad swing of dirty pages. The fix is to allow the task to slightly dirty over task_bdi_thresh, but no way to exceed bdi_dirty and/or global dirty_thresh. Tests show that it fixed the JBOD regression completely (both behavior and performance), while still being able to cut down large pause times in balance_dirty_pages() for single-disk cases. Reported-by: Li Shaohua Tested-by: Li Shaohua Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 11 ----------- mm/page-writeback.c | 15 ++------------- 2 files changed, 2 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f1bfa12ea246..2b8963ff0f35 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -12,15 +12,6 @@ * * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) * - * The 1/16 region above the global dirty limit will be put to maximum pauses: - * - * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) - * - * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put - * to loops: - * - * (limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA) - * * Further beyond, all dirtier tasks will enter a loop waiting (possibly long * time) for the dirty pages to drop, unless written enough pages. * @@ -31,8 +22,6 @@ */ #define DIRTY_SCOPE 8 #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) -#define DIRTY_MAXPAUSE_AREA 16 -#define DIRTY_PASSGOOD_AREA 8 /* * 4MB minimal write chunk size diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1960744f881..0e309cd1b5b9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -754,21 +754,10 @@ static void balance_dirty_pages(struct address_space *mapping, * 200ms is typically more than enough to curb heavy dirtiers; * (b) the pause time limit makes the dirtiers more responsive. */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_MAXPAUSE_AREA && + if (nr_dirty < dirty_thresh && + bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && time_after(jiffies, start_time + MAX_PAUSE)) break; - /* - * pass-good area. When some bdi gets blocked (eg. NFS server - * not responding), or write bandwidth dropped dramatically due - * to concurrent reads, or dirty threshold suddenly dropped and - * the dirty pages cannot be brought down anytime soon (eg. on - * slow USB stick), at least let go of the good bdi's. - */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_PASSGOOD_AREA && - bdi_dirty < bdi_thresh) - break; /* * Increase the delay for each loop, up to our previous -- cgit v1.2.3-71-gd317 From 0d7c5f2572ccfa7bf83292b1506926663f2d164a Mon Sep 17 00:00:00 2001 From: Pavan Savoy Date: Wed, 10 Aug 2011 10:18:31 -0500 Subject: drivers:misc:ti-st: platform hooks for chip states Certain platform specific or Host-WiLink Interface specific actions would be required to be taken when the chip is being enabled and after the chip is disabled such as configuration of the mux modes for the GPIO of host connected to the nshutdown of the chip or relinquishing UART after the chip is disabled. Similar actions can also be taken when the chip is in deep sleep or when the chip is awake. Performance enhancements such as configuring the host to run faster when chip is awake and slower when chip is asleep can also be made here. Signed-off-by: Pavan Savoy Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/st_kim.c | 12 ++++++++++++ drivers/misc/ti-st/st_ll.c | 19 +++++++++++++++++++ include/linux/ti_wilink_st.h | 27 ++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index 38fd2f04c07e..6884dd1c997b 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -434,11 +434,17 @@ long st_kim_start(void *kim_data) { long err = 0; long retry = POR_RETRY_COUNT; + struct ti_st_plat_data *pdata; struct kim_data_s *kim_gdata = (struct kim_data_s *)kim_data; pr_info(" %s", __func__); + pdata = kim_gdata->kim_pdev->dev.platform_data; do { + /* platform specific enabling code here */ + if (pdata->chip_enable) + pdata->chip_enable(kim_gdata); + /* Configure BT nShutdown to HIGH state */ gpio_set_value(kim_gdata->nshutdown, GPIO_LOW); mdelay(5); /* FIXME: a proper toggle */ @@ -489,6 +495,8 @@ long st_kim_stop(void *kim_data) { long err = 0; struct kim_data_s *kim_gdata = (struct kim_data_s *)kim_data; + struct ti_st_plat_data *pdata = + kim_gdata->kim_pdev->dev.platform_data; INIT_COMPLETION(kim_gdata->ldisc_installed); @@ -515,6 +523,10 @@ long st_kim_stop(void *kim_data) gpio_set_value(kim_gdata->nshutdown, GPIO_HIGH); mdelay(1); gpio_set_value(kim_gdata->nshutdown, GPIO_LOW); + + /* platform specific disable */ + if (pdata->chip_disable) + pdata->chip_disable(kim_gdata); return err; } diff --git a/drivers/misc/ti-st/st_ll.c b/drivers/misc/ti-st/st_ll.c index 3f2495138855..1ff460a8e9c7 100644 --- a/drivers/misc/ti-st/st_ll.c +++ b/drivers/misc/ti-st/st_ll.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) "(stll) :" fmt #include #include +#include #include /**********************************************************************/ @@ -37,6 +38,9 @@ static void send_ll_cmd(struct st_data_s *st_data, static void ll_device_want_to_sleep(struct st_data_s *st_data) { + struct kim_data_s *kim_data; + struct ti_st_plat_data *pdata; + pr_debug("%s", __func__); /* sanity check */ if (st_data->ll_state != ST_LL_AWAKE) @@ -46,10 +50,19 @@ static void ll_device_want_to_sleep(struct st_data_s *st_data) send_ll_cmd(st_data, LL_SLEEP_ACK); /* update state */ st_data->ll_state = ST_LL_ASLEEP; + + /* communicate to platform about chip asleep */ + kim_data = st_data->kim_data; + pdata = kim_data->kim_pdev->dev.platform_data; + if (pdata->chip_asleep) + pdata->chip_asleep(NULL); } static void ll_device_want_to_wakeup(struct st_data_s *st_data) { + struct kim_data_s *kim_data; + struct ti_st_plat_data *pdata; + /* diff actions in diff states */ switch (st_data->ll_state) { case ST_LL_ASLEEP: @@ -70,6 +83,12 @@ static void ll_device_want_to_wakeup(struct st_data_s *st_data) } /* update state */ st_data->ll_state = ST_LL_AWAKE; + + /* communicate to platform about chip wakeup */ + kim_data = st_data->kim_data; + pdata = kim_data->kim_pdev->dev.platform_data; + if (pdata->chip_asleep) + pdata->chip_awake(NULL); } /**********************************************************************/ diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h index b004e557caa9..2ef4385da6bf 100644 --- a/include/linux/ti_wilink_st.h +++ b/include/linux/ti_wilink_st.h @@ -410,7 +410,28 @@ struct gps_event_hdr { u16 plen; } __attribute__ ((packed)); -/* platform data */ +/** + * struct ti_st_plat_data - platform data shared between ST driver and + * platform specific board file which adds the ST device. + * @nshutdown_gpio: Host's GPIO line to which chip's BT_EN is connected. + * @dev_name: The UART/TTY name to which chip is interfaced. (eg: /dev/ttyS1) + * @flow_cntrl: Should always be 1, since UART's CTS/RTS is used for PM + * purposes. + * @baud_rate: The baud rate supported by the Host UART controller, this will + * be shared across with the chip via a HCI VS command from User-Space Init + * Mgr application. + * @suspend: + * @resume: legacy PM routines hooked to platform specific board file, so as + * to take chip-host interface specific action. + * @chip_enable: + * @chip_disable: Platform/Interface specific mux mode setting, GPIO + * configuring, Host side PM disabling etc.. can be done here. + * @chip_asleep: + * @chip_awake: Chip specific deep sleep states is communicated to Host + * specific board-xx.c to take actions such as cut UART clocks when chip + * asleep or run host faster when chip awake etc.. + * + */ struct ti_st_plat_data { long nshutdown_gpio; unsigned char dev_name[UART_DEV_NAME_LEN]; /* uart name */ @@ -418,6 +439,10 @@ struct ti_st_plat_data { unsigned long baud_rate; int (*suspend)(struct platform_device *, pm_message_t); int (*resume)(struct platform_device *); + int (*chip_enable) (struct kim_data_s *); + int (*chip_disable) (struct kim_data_s *); + int (*chip_asleep) (struct kim_data_s *); + int (*chip_awake) (struct kim_data_s *); }; #endif /* TI_WILINK_ST_H */ -- cgit v1.2.3-71-gd317 From 5dc06c5a70b79a323152bec7e55783e705767e63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 14:49:55 +0200 Subject: block: remove READ_META and WRITE_META Replace all occurnanced of the undocumented READ_META with READ | REQ_META and remove the unused WRITE_META define. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/ext3/inode.c | 4 ++-- fs/ext3/namei.c | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/namei.c | 2 +- fs/gfs2/quota.c | 2 +- include/linux/fs.h | 2 -- 6 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 04da6acde85d..bba7b96e0d9b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -2807,7 +2807,7 @@ make_io: trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 5571708b6a58..c7d4032aa82a 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -922,7 +922,7 @@ restart: bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c4da98a959ae..1dfa18feeb3e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -650,7 +650,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -3301,7 +3301,7 @@ make_io: trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8068c7bae9f..d36315ae629e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -922,7 +922,7 @@ restart: bh = ext4_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 42e8d23bc047..053434049dbb 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -709,7 +709,7 @@ get_a_page: set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 178cdb4f1d4a..eae44c981173 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,10 +162,8 @@ struct inodes_stat_t { #define READA RWA_MASK #define READ_SYNC (READ | REQ_SYNC) -#define READ_META (READ | REQ_META) #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) #define WRITE_ODIRECT (WRITE | REQ_SYNC) -#define WRITE_META (WRITE | REQ_META) #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) -- cgit v1.2.3-71-gd317 From 65299a3b788bd274bed92f9fa3232082c9f3ea70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Aug 2011 14:50:29 +0200 Subject: block: separate priority boosting from REQ_META Add a new REQ_PRIO to let requests preempt others in the cfq I/O schedule, and lave REQ_META purely for marking requests as metadata in blktrace. All existing callers of REQ_META except for XFS are updated to also set REQ_PRIO for now. Signed-off-by: Christoph Hellwig Reviewed-by: Namhyung Kim Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++---------- drivers/mmc/card/block.c | 3 +++ fs/ext3/inode.c | 4 ++-- fs/ext3/namei.c | 3 ++- fs/ext4/inode.c | 4 ++-- fs/ext4/namei.c | 3 ++- fs/gfs2/log.c | 4 ++-- fs/gfs2/meta_io.c | 6 +++--- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/quota.c | 2 +- include/linux/blk_types.h | 6 ++++-- 11 files changed, 32 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a33bd4377c61..16ace89613bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -130,8 +130,8 @@ struct cfq_queue { unsigned long slice_end; long slice_resid; - /* pending metadata requests */ - int meta_pending; + /* pending priority requests */ + int prio_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -684,8 +684,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, if (rq_is_sync(rq1) != rq_is_sync(rq2)) return rq_is_sync(rq1) ? rq1 : rq2; - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) - return rq1->cmd_flags & REQ_META ? rq1 : rq2; + if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) + return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); @@ -1612,9 +1612,9 @@ static void cfq_remove_request(struct request *rq) cfqq->cfqd->rq_queued--; cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), rq_is_sync(rq)); - if (rq->cmd_flags & REQ_META) { - WARN_ON(!cfqq->meta_pending); - cfqq->meta_pending--; + if (rq->cmd_flags & REQ_PRIO) { + WARN_ON(!cfqq->prio_pending); + cfqq->prio_pending--; } } @@ -3372,7 +3372,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. */ - if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; /* @@ -3439,8 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq->cmd_flags & REQ_META) - cfqq->meta_pending++; + if (rq->cmd_flags & REQ_PRIO) + cfqq->prio_pending++; cfq_update_io_thinktime(cfqd, cfqq, cic); cfq_update_io_seektime(cfqd, cfqq, rq); diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 1ff5486213fb..4c1a648d00fc 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -926,6 +926,9 @@ static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, /* * Reliable writes are used to implement Forced Unit Access and * REQ_META accesses, and are supported only on MMCs. + * + * XXX: this really needs a good explanation of why REQ_META + * is treated special. */ bool do_rel_wr = ((req->cmd_flags & REQ_FUA) || (req->cmd_flags & REQ_META)) && diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index bba7b96e0d9b..12661e1deedd 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -2807,7 +2807,7 @@ make_io: trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index c7d4032aa82a..0629e09f6511 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -922,7 +922,8 @@ restart: bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1dfa18feeb3e..c7cbb3d85d9e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -650,7 +650,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -3301,7 +3301,7 @@ make_io: trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d36315ae629e..1c924faeb6c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -922,7 +922,8 @@ restart: bh = ext4_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 85c62923ee29..598646434362 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) bh->b_end_io = end_buffer_write_sync; get_bh(bh); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - submit_bh(WRITE_SYNC | REQ_META, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); else - submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 747238cd9f96..be29858900f6 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | + int write_op = REQ_META | REQ_PRIO | (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); @@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ_SYNC | REQ_META, bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); if (!(flags & DIO_WAIT)) return 0; @@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3bc073a4cf82..079587e53849 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META, bio); + submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 053434049dbb..0e8bb13381e4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -709,7 +709,7 @@ get_a_page: set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 32f0076e844b..71fc53bb8f1c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -124,6 +124,7 @@ enum rq_flag_bits { __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ + __REQ_PRIO, /* boost priority in cfq */ __REQ_DISCARD, /* request to discard sectors */ __REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */ @@ -161,14 +162,15 @@ enum rq_flag_bits { #define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER) #define REQ_SYNC (1 << __REQ_SYNC) #define REQ_META (1 << __REQ_META) +#define REQ_PRIO (1 << __REQ_PRIO) #define REQ_DISCARD (1 << __REQ_DISCARD) #define REQ_NOIDLE (1 << __REQ_NOIDLE) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ - (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_DISCARD | \ - REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) + (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \ + REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK #define REQ_RAHEAD (1 << __REQ_RAHEAD) -- cgit v1.2.3-71-gd317 From 24d406a6bf736f7aebdc8fa0f0ec86e0890c6d24 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Aug 2011 14:59:28 +0200 Subject: TTY: pty, fix pty counting tty_operations->remove is normally called like: queue_release_one_tty ->tty_shutdown ->tty_driver_remove_tty ->tty_operations->remove However tty_shutdown() is called from queue_release_one_tty() only if tty_operations->shutdown is NULL. But for pty, it is not. pty_unix98_shutdown() is used there as ->shutdown. So tty_operations->remove of pty (i.e. pty_unix98_remove()) is never called. This results in invalid pty_count. I.e. what can be seen in /proc/sys/kernel/pty/nr. I see this was already reported at: https://lkml.org/lkml/2009/11/5/370 But it was not fixed since then. This patch is kind of a hackish way. The problem lies in ->install. We allocate there another tty (so-called tty->link). So ->install is called once, but ->remove twice, for both tty and tty->link. The fix here is to count both tty and tty->link and divide the count by 2 for user. And to have ->remove called, let's make tty_driver_remove_tty() global and call that from pty_unix98_shutdown() (tty_operations->shutdown). While at it, let's document that when ->shutdown is defined, tty_shutdown() is not called. Signed-off-by: Jiri Slaby Cc: Alan Cox Cc: "H. Peter Anvin" Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 17 +++++++++++++++-- drivers/tty/tty_io.c | 3 +-- include/linux/tty.h | 2 ++ include/linux/tty_driver.h | 3 +++ 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 98b6e3bdb000..e809e9d4683c 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -446,8 +446,19 @@ static inline void legacy_pty_init(void) { } int pty_limit = NR_UNIX98_PTY_DEFAULT; static int pty_limit_min; static int pty_limit_max = NR_UNIX98_PTY_MAX; +static int tty_count; static int pty_count; +static inline void pty_inc_count(void) +{ + pty_count = (++tty_count) / 2; +} + +static inline void pty_dec_count(void) +{ + pty_count = (--tty_count) / 2; +} + static struct cdev ptmx_cdev; static struct ctl_table pty_table[] = { @@ -542,6 +553,7 @@ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, static void pty_unix98_shutdown(struct tty_struct *tty) { + tty_driver_remove_tty(tty->driver, tty); /* We have our own method as we don't use the tty index */ kfree(tty->termios); } @@ -588,7 +600,8 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) */ tty_driver_kref_get(driver); tty->count++; - pty_count++; + pty_inc_count(); /* tty */ + pty_inc_count(); /* tty->link */ return 0; err_free_mem: deinitialize_tty_struct(o_tty); @@ -602,7 +615,7 @@ err_free_tty: static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { - pty_count--; + pty_dec_count(); } static const struct tty_operations ptm_unix98_ops = { diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 150e4f747c7d..4f1fc81112e6 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1295,8 +1295,7 @@ static int tty_driver_install_tty(struct tty_driver *driver, * * Locking: tty_mutex for now */ -static void tty_driver_remove_tty(struct tty_driver *driver, - struct tty_struct *tty) +void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); diff --git a/include/linux/tty.h b/include/linux/tty.h index 44bc0c5617e1..5f2ede82b3d6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -421,6 +421,8 @@ extern void tty_driver_flush_buffer(struct tty_struct *tty); extern void tty_throttle(struct tty_struct *tty); extern void tty_unthrottle(struct tty_struct *tty); extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); +extern void tty_driver_remove_tty(struct tty_driver *driver, + struct tty_struct *tty); extern void tty_shutdown(struct tty_struct *tty); extern void tty_free_termios(struct tty_struct *tty); extern int is_current_pgrp_orphaned(void); diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 9deeac855240..ecdaeb98b293 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -47,6 +47,9 @@ * * This routine is called synchronously when a particular tty device * is closed for the last time freeing up the resources. + * Note that tty_shutdown() is not called if ops->shutdown is defined. + * This means one is responsible to take care of calling ops->remove (e.g. + * via tty_driver_remove_tty) and releasing tty->termios. * * * void (*cleanup)(struct tty_struct * tty); -- cgit v1.2.3-71-gd317 From 56ebdaf2fa3c5276be201c5d1aff1490b682ecf2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 24 Aug 2011 16:04:34 +0200 Subject: block: simplify force plug flush code a little bit Cleaning up the code a little bit. attempt_plug_merge() traverses the plug list anyway, we can do the request counting there, so stack size is reduced a little bit. The motivation here is I suspect if we should count the requests for each queue (task could handle multiple disks in the meantime), but my test doesn't show it's worthy doing. If somebody proves we should do it, below change will make that more easier. Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++------ include/linux/blkdev.h | 1 - 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 67dba6941194..b2ed78afd9f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1167,7 +1167,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, * true if merge was successful, otherwise false. */ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, - struct bio *bio) + struct bio *bio, unsigned int *request_count) { struct blk_plug *plug; struct request *rq; @@ -1176,10 +1176,13 @@ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, plug = tsk->plug; if (!plug) goto out; + *request_count = 0; list_for_each_entry_reverse(rq, &plug->list, queuelist) { int el_ret; + (*request_count)++; + if (rq->q != q) continue; @@ -1219,6 +1222,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; struct request *req; + unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a @@ -1237,7 +1241,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(current, q, bio)) + if (attempt_plug_merge(current, q, bio, &request_count)) goto out; spin_lock_irq(q->queue_lock); @@ -1302,9 +1306,8 @@ get_rq: if (__rq->q != q) plug->should_sort = 1; } - if (plug->count >= BLK_MAX_REQUEST_COUNT) + if (request_count >= BLK_MAX_REQUEST_COUNT) blk_flush_plug_list(plug, false); - plug->count++; list_add_tail(&req->queuelist, &plug->list); drive_stat_acct(req, 1); } else { @@ -2634,7 +2637,6 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); plug->should_sort = 0; - plug->count = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2718,7 +2720,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; list_splice_init(&plug->list, &list); - plug->count = 0; if (plug->should_sort) { list_sort(NULL, &list, plug_rq_cmp); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b15d54f8c2..7fbaa9103344 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -873,7 +873,6 @@ struct blk_plug { struct list_head list; struct list_head cb_list; unsigned int should_sort; - unsigned int count; }; #define BLK_MAX_REQUEST_COUNT 16 -- cgit v1.2.3-71-gd317 From be27425dcc516fd08245b047ea57f83b8f6f0903 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Aug 2011 16:15:10 -0700 Subject: Add a personality to report 2.6.x version numbers I ran into a couple of programs which broke with the new Linux 3.0 version. Some of those were binary only. I tried to use LD_PRELOAD to work around it, but it was quite difficult and in one case impossible because of a mix of 32bit and 64bit executables. For example, all kind of management software from HP doesnt work, unless we pretend to run a 2.6 kernel. $ uname -a Linux svivoipvnx001 3.0.0-08107-g97cd98f #1062 SMP Fri Aug 12 18:11:45 CEST 2011 i686 i686 i386 GNU/Linux $ hpacucli ctrl all show Error: No controllers detected. $ rpm -qf /usr/sbin/hpacucli hpacucli-8.75-12.0 Another notable case is that Python now reports "linux3" from sys.platform(); which in turn can break things that were checking sys.platform() == "linux2": https://bugzilla.mozilla.org/show_bug.cgi?id=664564 It seems pretty clear to me though it's a bug in the apps that are using '==' instead of .startswith(), but this allows us to unbreak broken programs. This patch adds a UNAME26 personality that makes the kernel report a 2.6.40+x version number instead. The x is the x in 3.x. I know this is somewhat ugly, but I didn't find a better workaround, and compatibility to existing programs is important. Some programs also read /proc/sys/kernel/osrelease. This can be worked around in user space with mount --bind (and a mount namespace) To use: wget ftp://ftp.kernel.org/pub/linux/kernel/people/ak/uname26/uname26.c gcc -o uname26 uname26.c ./uname26 program Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- include/linux/personality.h | 1 + kernel/sys.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/personality.h b/include/linux/personality.h index eec3bae164d4..8fc7dd1a57ff 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -22,6 +22,7 @@ extern int __set_personality(unsigned int); * These occupy the top three bytes. */ enum { + UNAME26 = 0x0020000, ADDR_NO_RANDOMIZE = 0x0040000, /* disable randomization of VA space */ FDPIC_FUNCPTRS = 0x0080000, /* userspace function ptrs point to descriptors * (signal handling) diff --git a/kernel/sys.c b/kernel/sys.c index dd948a1fca4c..18ee1d2f6474 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,8 @@ #include #include +/* Move somewhere else to avoid recompiling? */ +#include #include #include @@ -1161,6 +1165,34 @@ DECLARE_RWSEM(uts_sem); #define override_architecture(name) 0 #endif +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, int len) +{ + int ret = 0; + char buf[len]; + + if (current->personality & UNAME26) { + char *rest = UTS_RELEASE; + int ndots = 0; + unsigned v; + + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + snprintf(buf, len, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, len); + } + return ret; +} + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1170,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) errno = -EFAULT; up_read(&uts_sem); + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; if (!errno && override_architecture(name)) errno = -EFAULT; return errno; @@ -1191,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) error = -EFAULT; up_read(&uts_sem); + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; if (!error && override_architecture(name)) error = -EFAULT; return error; @@ -1225,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) if (!error && override_architecture(name)) error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; return error ? -EFAULT : 0; } #endif -- cgit v1.2.3-71-gd317 From e096d0c7e2e4e5893792db865dd065ac73cf1f00 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Thu, 25 Aug 2011 07:48:12 -0400 Subject: lockdep: Add helper function for dir vs file i_mutex annotation Purely in-memory filesystems do not use the inode hash as the dcache tells us if an entry already exists. As a result, they do not call unlock_new_inode, and thus directory inodes do not get put into a different lockdep class for i_sem. We need the different lockdep classes, because the locking order for i_mutex is different for directory inodes and regular inodes. Directory inodes can do "readdir()", which takes i_mutex *before* possibly taking mm->mmap_sem (due to a page fault while copying the directory entry to user space). In contrast, regular inodes can be mmap'ed, which takes mm->mmap_sem before accessing i_mutex. The two cases can never happen for the same inode, so no real deadlock can occur, but without the different lockdep classes, lockdep cannot understand that. As a result, if CONFIG_DEBUG_LOCK_ALLOC is set, this can lead to false positives from lockdep like below: find/645 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x5c/0xac but task is already holding lock: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [] vfs_readdir+0x5b/0xb4 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sb->s_type->i_mutex_key#15){+.+.+.}: [] lock_acquire+0xbf/0x103 [] __mutex_lock_common+0x4c/0x361 [] mutex_lock_nested+0x40/0x45 [] hugetlbfs_file_mmap+0x82/0x110 [] mmap_region+0x258/0x432 [] do_mmap_pgoff+0x2ac/0x306 [] sys_mmap_pgoff+0x118/0x16a [] sys_mmap+0x22/0x24 [] system_call_fastpath+0x16/0x1b -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0xa1a/0xcf7 [] lock_acquire+0xbf/0x103 [] might_fault+0x89/0xac [] filldir+0x6f/0xc7 [] dcache_readdir+0x67/0x205 [] vfs_readdir+0x7b/0xb4 [] sys_getdents+0x7e/0xd1 [] system_call_fastpath+0x16/0x1b This patch moves the directory vs file lockdep annotation into a helper function that can be called by in-memory filesystems and has hugetlbfs call it. Signed-off-by: Josh Boyer Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 + fs/inode.c | 24 +++++++++++++++--------- include/linux/fs.h | 5 +++++ 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87b6e0421c12..ec889538e5a6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_op = &page_symlink_inode_operations; break; } + lockdep_annotate_inode_mutex_key(inode); } return inode; } diff --git a/fs/inode.c b/fs/inode.c index 73920d555c88..ec7924696a13 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -848,16 +848,9 @@ struct inode *new_inode(struct super_block *sb) } EXPORT_SYMBOL(new_inode); -/** - * unlock_new_inode - clear the I_NEW state and wake up any waiters - * @inode: new inode to unlock - * - * Called when the inode is fully initialised to clear the new state of the - * inode and wake up anyone waiting for the inode to finish initialisation. - */ -void unlock_new_inode(struct inode *inode) -{ #ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; @@ -873,7 +866,20 @@ void unlock_new_inode(struct inode *inode) &type->i_mutex_dir_key); } } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; diff --git a/include/linux/fs.h b/include/linux/fs.h index 178cdb4f1d4a..c2bd68f2277a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2318,6 +2318,11 @@ extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*te extern struct inode * iget_locked(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +#else +static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; +#endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); -- cgit v1.2.3-71-gd317 From a801876638c5ce650223476c4eb8f37cea32dc1c Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Thu, 25 Aug 2011 15:59:06 -0700 Subject: MAINTAINERS: Evgeniy has moved Signed-off-by: Evgeniy Polyakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- drivers/w1/masters/ds2490.c | 4 ++-- drivers/w1/masters/matrox_w1.c | 4 ++-- drivers/w1/slaves/w1_smem.c | 4 ++-- drivers/w1/slaves/w1_therm.c | 4 ++-- drivers/w1/w1.c | 4 ++-- drivers/w1/w1.h | 2 +- drivers/w1/w1_family.c | 2 +- drivers/w1/w1_family.h | 2 +- drivers/w1/w1_int.c | 2 +- drivers/w1/w1_int.h | 2 +- drivers/w1/w1_io.c | 2 +- drivers/w1/w1_log.h | 2 +- drivers/w1/w1_netlink.c | 2 +- drivers/w1/w1_netlink.h | 2 +- include/linux/connector.h | 2 +- 16 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 069ee3b5c651..ee0ac2c98bef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7087,7 +7087,7 @@ S: Supported F: drivers/mmc/host/vub300.c W1 DALLAS'S 1-WIRE BUS -M: Evgeniy Polyakov +M: Evgeniy Polyakov S: Maintained F: Documentation/w1/ F: drivers/w1/ diff --git a/drivers/w1/masters/ds2490.c b/drivers/w1/masters/ds2490.c index 02bf7bf7160b..b5abaae38e97 100644 --- a/drivers/w1/masters/ds2490.c +++ b/drivers/w1/masters/ds2490.c @@ -1,7 +1,7 @@ /* * dscore.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify @@ -1024,5 +1024,5 @@ module_init(ds_init); module_exit(ds_fini); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("DS2490 USB <-> W1 bus master driver (DS9490*)"); diff --git a/drivers/w1/masters/matrox_w1.c b/drivers/w1/masters/matrox_w1.c index 334d1ccf9c92..f667c26b2195 100644 --- a/drivers/w1/masters/matrox_w1.c +++ b/drivers/w1/masters/matrox_w1.c @@ -1,7 +1,7 @@ /* * matrox_w1.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify @@ -39,7 +39,7 @@ #include "../w1_log.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Driver for transport(Dallas 1-wire prtocol) over VGA DDC(matrox gpio)."); static struct pci_device_id matrox_w1_tbl[] = { diff --git a/drivers/w1/slaves/w1_smem.c b/drivers/w1/slaves/w1_smem.c index cc8c02e92593..84655625c870 100644 --- a/drivers/w1/slaves/w1_smem.c +++ b/drivers/w1/slaves/w1_smem.c @@ -1,7 +1,7 @@ /* * w1_smem.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify @@ -32,7 +32,7 @@ #include "../w1_family.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Driver for 1-wire Dallas network protocol, 64bit memory family."); static struct w1_family w1_smem_family_01 = { diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c index 402928b135d1..a1ef9b5b38cf 100644 --- a/drivers/w1/slaves/w1_therm.c +++ b/drivers/w1/slaves/w1_therm.c @@ -1,7 +1,7 @@ /* * w1_therm.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify @@ -34,7 +34,7 @@ #include "../w1_family.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Driver for 1-wire Dallas network protocol, temperature family."); /* Allow the strong pullup to be disabled, but default to enabled. diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index 6c136c19e982..c37497823851 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -1,7 +1,7 @@ /* * w1.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify @@ -42,7 +42,7 @@ #include "w1_netlink.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); +MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Driver for 1-wire Dallas network protocol."); static int w1_timeout = 10; diff --git a/drivers/w1/w1.h b/drivers/w1/w1.h index 1ce23fc6186c..4d012ca3f32c 100644 --- a/drivers/w1/w1.h +++ b/drivers/w1/w1.h @@ -1,7 +1,7 @@ /* * w1.h * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c index 4a099041f28a..63359797c8b1 100644 --- a/drivers/w1/w1_family.c +++ b/drivers/w1/w1_family.c @@ -1,7 +1,7 @@ /* * w1_family.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_family.h b/drivers/w1/w1_family.h index 98a1ac0f4693..490cda2281bc 100644 --- a/drivers/w1/w1_family.h +++ b/drivers/w1/w1_family.h @@ -1,7 +1,7 @@ /* * w1_family.h * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index b50be3f1073d..d220bce2cee4 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -1,7 +1,7 @@ /* * w1_int.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_int.h b/drivers/w1/w1_int.h index 4274082d2262..2ad7d4414bed 100644 --- a/drivers/w1/w1_int.h +++ b/drivers/w1/w1_int.h @@ -1,7 +1,7 @@ /* * w1_int.h * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c index 8e8b64cfafb6..765b37b62a4f 100644 --- a/drivers/w1/w1_io.c +++ b/drivers/w1/w1_io.c @@ -1,7 +1,7 @@ /* * w1_io.c * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_log.h b/drivers/w1/w1_log.h index e6ab7cf08f88..9c7bd62e6bdc 100644 --- a/drivers/w1/w1_log.h +++ b/drivers/w1/w1_log.h @@ -1,7 +1,7 @@ /* * w1_log.h * - * Copyright (c) 2004 Evgeniy Polyakov + * Copyright (c) 2004 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c index 55aabd927c60..40788c925d1c 100644 --- a/drivers/w1/w1_netlink.c +++ b/drivers/w1/w1_netlink.c @@ -1,7 +1,7 @@ /* * w1_netlink.c * - * Copyright (c) 2003 Evgeniy Polyakov + * Copyright (c) 2003 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h index 27e950f935b1..b0922dc29658 100644 --- a/drivers/w1/w1_netlink.h +++ b/drivers/w1/w1_netlink.h @@ -1,7 +1,7 @@ /* * w1_netlink.h * - * Copyright (c) 2003 Evgeniy Polyakov + * Copyright (c) 2003 Evgeniy Polyakov * * * This program is free software; you can redistribute it and/or modify diff --git a/include/linux/connector.h b/include/linux/connector.h index 0c69ad825b39..3c9c54fd5690 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -1,7 +1,7 @@ /* * connector.h * - * 2004-2005 Copyright (c) Evgeniy Polyakov + * 2004-2005 Copyright (c) Evgeniy Polyakov * All rights reserved. * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3-71-gd317 From 284fb68d00c56e971ed01e0b4bac5ddd4d1b74ab Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Thu, 25 Aug 2011 15:59:13 -0700 Subject: rapidio: fix use of non-compatible registers Replace/remove use of RIO v.1.2 registers/bits that are not forward-compatible with newer versions of RapidIO specification. RapidIO specification v.1.3 removed Write Port CSR, Doorbell CSR, Mailbox CSR and Mailbox and Doorbell bits of the PEF CAR. Use of removed (since RIO v.1.3) register bits affects users of currently available 1.3 and 2.x compliant devices who may use not so recent kernel versions. Removing checks for unsupported bits makes corresponding routines compatible with all versions of RapidIO specification. Therefore, backporting makes stable kernel versions compliant with RIO v.1.3 and later as well. Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: Chul Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 23 ++++++++--------------- drivers/rapidio/rio-scan.c | 3 +-- include/linux/rio_regs.h | 18 +++++++++--------- 3 files changed, 18 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 86ac38c96bcf..3bb131137033 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -80,13 +80,13 @@ static int rionet_capable = 1; */ static struct rio_dev **rionet_active; -#define is_rionet_capable(pef, src_ops, dst_ops) \ - ((pef & RIO_PEF_INB_MBOX) && \ - (pef & RIO_PEF_INB_DOORBELL) && \ +#define is_rionet_capable(src_ops, dst_ops) \ + ((src_ops & RIO_SRC_OPS_DATA_MSG) && \ + (dst_ops & RIO_DST_OPS_DATA_MSG) && \ (src_ops & RIO_SRC_OPS_DOORBELL) && \ (dst_ops & RIO_DST_OPS_DOORBELL)) #define dev_rionet_capable(dev) \ - is_rionet_capable(dev->pef, dev->src_ops, dev->dst_ops) + is_rionet_capable(dev->src_ops, dev->dst_ops) #define RIONET_MAC_MATCH(x) (*(u32 *)x == 0x00010001) #define RIONET_GET_DESTID(x) (*(u16 *)(x + 4)) @@ -282,7 +282,6 @@ static int rionet_open(struct net_device *ndev) { int i, rc = 0; struct rionet_peer *peer, *tmp; - u32 pwdcsr; struct rionet_private *rnet = netdev_priv(ndev); if (netif_msg_ifup(rnet)) @@ -332,13 +331,8 @@ static int rionet_open(struct net_device *ndev) continue; } - /* - * If device has initialized inbound doorbells, - * send a join message - */ - rio_read_config_32(peer->rdev, RIO_WRITE_PORT_CSR, &pwdcsr); - if (pwdcsr & RIO_DOORBELL_AVAIL) - rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN); + /* Send a join message */ + rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN); } out: @@ -492,7 +486,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) static int rionet_probe(struct rio_dev *rdev, const struct rio_device_id *id) { int rc = -ENODEV; - u32 lpef, lsrc_ops, ldst_ops; + u32 lsrc_ops, ldst_ops; struct rionet_peer *peer; struct net_device *ndev = NULL; @@ -515,12 +509,11 @@ static int rionet_probe(struct rio_dev *rdev, const struct rio_device_id *id) * on later probes */ if (!rionet_check) { - rio_local_read_config_32(rdev->net->hport, RIO_PEF_CAR, &lpef); rio_local_read_config_32(rdev->net->hport, RIO_SRC_OPS_CAR, &lsrc_ops); rio_local_read_config_32(rdev->net->hport, RIO_DST_OPS_CAR, &ldst_ops); - if (!is_rionet_capable(lpef, lsrc_ops, ldst_ops)) { + if (!is_rionet_capable(lsrc_ops, ldst_ops)) { printk(KERN_ERR "%s: local device is not network capable\n", DRV_NAME); diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index ee893581d4b7..ebe77dd87daf 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -505,8 +505,7 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net, rdev->dev.dma_mask = &rdev->dma_mask; rdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); - if ((rdev->pef & RIO_PEF_INB_DOORBELL) && - (rdev->dst_ops & RIO_DST_OPS_DOORBELL)) + if (rdev->dst_ops & RIO_DST_OPS_DOORBELL) rio_init_dbell_res(&rdev->riores[RIO_DOORBELL_RESOURCE], 0, 0xffff); diff --git a/include/linux/rio_regs.h b/include/linux/rio_regs.h index 9026b30238f3..218168a2b5e9 100644 --- a/include/linux/rio_regs.h +++ b/include/linux/rio_regs.h @@ -36,12 +36,12 @@ #define RIO_PEF_PROCESSOR 0x20000000 /* [I] Processor */ #define RIO_PEF_SWITCH 0x10000000 /* [I] Switch */ #define RIO_PEF_MULTIPORT 0x08000000 /* [VI, 2.1] Multiport */ -#define RIO_PEF_INB_MBOX 0x00f00000 /* [II] Mailboxes */ -#define RIO_PEF_INB_MBOX0 0x00800000 /* [II] Mailbox 0 */ -#define RIO_PEF_INB_MBOX1 0x00400000 /* [II] Mailbox 1 */ -#define RIO_PEF_INB_MBOX2 0x00200000 /* [II] Mailbox 2 */ -#define RIO_PEF_INB_MBOX3 0x00100000 /* [II] Mailbox 3 */ -#define RIO_PEF_INB_DOORBELL 0x00080000 /* [II] Doorbells */ +#define RIO_PEF_INB_MBOX 0x00f00000 /* [II, <= 1.2] Mailboxes */ +#define RIO_PEF_INB_MBOX0 0x00800000 /* [II, <= 1.2] Mailbox 0 */ +#define RIO_PEF_INB_MBOX1 0x00400000 /* [II, <= 1.2] Mailbox 1 */ +#define RIO_PEF_INB_MBOX2 0x00200000 /* [II, <= 1.2] Mailbox 2 */ +#define RIO_PEF_INB_MBOX3 0x00100000 /* [II, <= 1.2] Mailbox 3 */ +#define RIO_PEF_INB_DOORBELL 0x00080000 /* [II, <= 1.2] Doorbells */ #define RIO_PEF_EXT_RT 0x00000200 /* [III, 1.3] Extended route table support */ #define RIO_PEF_STD_RT 0x00000100 /* [III, 1.3] Standard route table support */ #define RIO_PEF_CTLS 0x00000010 /* [III] CTLS */ @@ -102,7 +102,7 @@ #define RIO_SWITCH_RT_LIMIT 0x34 /* [III, 1.3] Switch Route Table Destination ID Limit CAR */ #define RIO_RT_MAX_DESTID 0x0000ffff -#define RIO_MBOX_CSR 0x40 /* [II] Mailbox CSR */ +#define RIO_MBOX_CSR 0x40 /* [II, <= 1.2] Mailbox CSR */ #define RIO_MBOX0_AVAIL 0x80000000 /* [II] Mbox 0 avail */ #define RIO_MBOX0_FULL 0x40000000 /* [II] Mbox 0 full */ #define RIO_MBOX0_EMPTY 0x20000000 /* [II] Mbox 0 empty */ @@ -128,8 +128,8 @@ #define RIO_MBOX3_FAIL 0x00000008 /* [II] Mbox 3 fail */ #define RIO_MBOX3_ERROR 0x00000004 /* [II] Mbox 3 error */ -#define RIO_WRITE_PORT_CSR 0x44 /* [I] Write Port CSR */ -#define RIO_DOORBELL_CSR 0x44 /* [II] Doorbell CSR */ +#define RIO_WRITE_PORT_CSR 0x44 /* [I, <= 1.2] Write Port CSR */ +#define RIO_DOORBELL_CSR 0x44 /* [II, <= 1.2] Doorbell CSR */ #define RIO_DOORBELL_AVAIL 0x80000000 /* [II] Doorbell avail */ #define RIO_DOORBELL_FULL 0x40000000 /* [II] Doorbell full */ #define RIO_DOORBELL_EMPTY 0x20000000 /* [II] Doorbell empty */ -- cgit v1.2.3-71-gd317 From cc7993f6439b49909a8792660c4d0741fec9d584 Mon Sep 17 00:00:00 2001 From: Dilan Lee Date: Thu, 25 Aug 2011 15:59:17 -0700 Subject: backlight: add a callback 'notify_after' for backlight control We need a callback to do some things after pwm_enable, pwm_disable and pwm_config. Signed-off-by: Dilan Lee Reviewed-by: Robert Morell Reviewed-by: Arun Murthy Cc: Richard Purdie Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/pwm_bl.c | 9 +++++++++ include/linux/pwm_backlight.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index b8f38ec6eb18..8b5b2a4124c7 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -28,6 +28,8 @@ struct pwm_bl_data { unsigned int lth_brightness; int (*notify)(struct device *, int brightness); + void (*notify_after)(struct device *, + int brightness); int (*check_fb)(struct device *, struct fb_info *); }; @@ -55,6 +57,10 @@ static int pwm_backlight_update_status(struct backlight_device *bl) pwm_config(pb->pwm, brightness, pb->period); pwm_enable(pb->pwm); } + + if (pb->notify_after) + pb->notify_after(pb->dev, brightness); + return 0; } @@ -105,6 +111,7 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->period = data->pwm_period_ns; pb->notify = data->notify; + pb->notify_after = data->notify_after; pb->check_fb = data->check_fb; pb->lth_brightness = data->lth_brightness * (data->pwm_period_ns / data->max_brightness); @@ -172,6 +179,8 @@ static int pwm_backlight_suspend(struct platform_device *pdev, pb->notify(pb->dev, 0); pwm_config(pb->pwm, 0, pb->period); pwm_disable(pb->pwm); + if (pb->notify_after) + pb->notify_after(pb->dev, 0); return 0; } diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h index 5e3e25a3c9c3..63d2df43e61a 100644 --- a/include/linux/pwm_backlight.h +++ b/include/linux/pwm_backlight.h @@ -14,6 +14,7 @@ struct platform_pwm_backlight_data { unsigned int pwm_period_ns; int (*init)(struct device *dev); int (*notify)(struct device *dev, int brightness); + void (*notify_after)(struct device *dev, int brightness); void (*exit)(struct device *dev); int (*check_fb)(struct device *dev, struct fb_info *info); }; -- cgit v1.2.3-71-gd317 From f5b940997397229975ea073679b03967932a541b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 26 Aug 2011 18:03:11 -0400 Subject: All Arch: remove linkage for sys_nfsservctl system call The nfsservctl system call is now gone, so we should remove all linkage for it. Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- arch/alpha/kernel/systbls.S | 2 +- arch/arm/kernel/calls.S | 2 +- arch/avr32/kernel/syscall_table.S | 2 +- arch/blackfin/mach-common/entry.S | 2 +- arch/cris/arch-v10/kernel/entry.S | 2 +- arch/cris/arch-v32/kernel/entry.S | 2 +- arch/frv/kernel/entry.S | 2 +- arch/h8300/kernel/syscalls.S | 2 +- arch/ia64/kernel/entry.S | 2 +- arch/m32r/kernel/syscall_table.S | 2 +- arch/m68k/kernel/syscalltable.S | 2 +- arch/microblaze/kernel/syscall_table.S | 2 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/s390/kernel/compat_wrapper.S | 6 ------ arch/s390/kernel/syscalls.S | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sparc/kernel/sys32.S | 1 - arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- arch/xtensa/include/asm/unistd.h | 2 +- fs/compat.c | 5 ----- include/asm-generic/unistd.h | 2 +- include/linux/compat.h | 1 - include/linux/syscalls.h | 3 --- kernel/sys_ni.c | 1 - 33 files changed, 27 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index b9c28f3f1956..6acea1f96de3 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -360,7 +360,7 @@ sys_call_table: .quad sys_newuname .quad sys_nanosleep /* 340 */ .quad sys_mremap - .quad sys_nfsservctl + .quad sys_ni_syscall /* old nfsservctl */ .quad sys_setresuid .quad sys_getresuid .quad sys_pciconfig_read /* 345 */ diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 80f7896cc016..9943e9e74a1b 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -178,7 +178,7 @@ CALL(sys_ni_syscall) /* vm86 */ CALL(sys_ni_syscall) /* was sys_query_module */ CALL(sys_poll) - CALL(sys_nfsservctl) + CALL(sys_ni_syscall) /* was nfsservctl */ /* 170 */ CALL(sys_setresgid16) CALL(sys_getresgid16) CALL(sys_prctl) diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S index c7fd394d28a4..6eba53530d1c 100644 --- a/arch/avr32/kernel/syscall_table.S +++ b/arch/avr32/kernel/syscall_table.S @@ -158,7 +158,7 @@ sys_call_table: .long sys_sched_rr_get_interval .long sys_nanosleep .long sys_poll - .long sys_nfsservctl /* 145 */ + .long sys_ni_syscall /* 145 was nfsservctl */ .long sys_setresgid .long sys_getresgid .long sys_prctl diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 225d311c9701..e4137297b790 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1543,7 +1543,7 @@ ENTRY(_sys_call_table) .long _sys_ni_syscall /* for vm86 */ .long _sys_ni_syscall /* old "query_module" */ .long _sys_ni_syscall /* sys_poll */ - .long _sys_nfsservctl + .long _sys_ni_syscall /* old nfsservctl */ .long _sys_setresgid /* setresgid16 */ /* 170 */ .long _sys_getresgid /* getresgid16 */ .long _sys_prctl diff --git a/arch/cris/arch-v10/kernel/entry.S b/arch/cris/arch-v10/kernel/entry.S index 1161883eb582..592fbe9dfb62 100644 --- a/arch/cris/arch-v10/kernel/entry.S +++ b/arch/cris/arch-v10/kernel/entry.S @@ -771,7 +771,7 @@ sys_call_table: .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S index 84fed7e91ada..c3ea4694fbaf 100644 --- a/arch/cris/arch-v32/kernel/entry.S +++ b/arch/cris/arch-v32/kernel/entry.S @@ -714,7 +714,7 @@ sys_call_table: .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S index 017d6d7b784f..5ba23f715ea5 100644 --- a/arch/frv/kernel/entry.S +++ b/arch/frv/kernel/entry.S @@ -1358,7 +1358,7 @@ sys_call_table: .long sys_ni_syscall /* for vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/h8300/kernel/syscalls.S b/arch/h8300/kernel/syscalls.S index f4b2e67bcc34..4be2ea2fbe26 100644 --- a/arch/h8300/kernel/syscalls.S +++ b/arch/h8300/kernel/syscalls.S @@ -183,7 +183,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* for vm86 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_query_module */ .long SYMBOL_NAME(sys_poll) - .long SYMBOL_NAME(sys_nfsservctl) + .long SYMBOL_NAME(sys_ni_syscall) /* old nfsservctl */ .long SYMBOL_NAME(sys_setresgid16) /* 170 */ .long SYMBOL_NAME(sys_getresgid16) .long SYMBOL_NAME(sys_prctl) diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 97dd2abdeb1a..198c753d1006 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1614,7 +1614,7 @@ sys_call_table: data8 sys_sched_get_priority_min data8 sys_sched_rr_get_interval data8 sys_nanosleep - data8 sys_nfsservctl + data8 sys_ni_syscall // old nfsservctl data8 sys_prctl // 1170 data8 sys_getpagesize data8 sys_mmap2 diff --git a/arch/m32r/kernel/syscall_table.S b/arch/m32r/kernel/syscall_table.S index 528f2e6ad064..f365c19795ef 100644 --- a/arch/m32r/kernel/syscall_table.S +++ b/arch/m32r/kernel/syscall_table.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long sys_tas /* vm86 syscall holder */ .long sys_ni_syscall /* query_module syscall holder */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid /* 170 */ .long sys_getresgid .long sys_prctl diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S index 00d1452f9571..c468f2edaa85 100644 --- a/arch/m68k/kernel/syscalltable.S +++ b/arch/m68k/kernel/syscalltable.S @@ -189,7 +189,7 @@ ENTRY(sys_call_table) .long sys_getpagesize .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S index d915a122c865..8789daa2a346 100644 --- a/arch/microblaze/kernel/syscall_table.S +++ b/arch/microblaze/kernel/syscall_table.S @@ -173,7 +173,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* sys_vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* old nfsservctl */ .long sys_setresgid /* 170 */ .long sys_getresgid .long sys_prctl diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index e521420a45a5..865bc7a6f5a1 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -424,7 +424,7 @@ einval: li v0, -ENOSYS sys sys_getresuid 3 sys sys_ni_syscall 0 /* was sys_query_module */ sys sys_poll 3 - sys sys_nfsservctl 3 + sys sys_ni_syscall 0 /* was nfsservctl */ sys sys_setresgid 3 /* 4190 */ sys sys_getresgid 3 sys sys_prctl 5 diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 85874d6a8a70..fb7334bea731 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -299,7 +299,7 @@ sys_call_table: PTR sys_ni_syscall /* 5170, was get_kernel_syms */ PTR sys_ni_syscall /* was query_module */ PTR sys_quotactl - PTR sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_ni_syscall /* res. for getpmsg */ PTR sys_ni_syscall /* 5175 for putpmsg */ PTR sys_ni_syscall /* res. for afs_syscall */ diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index b85842fc87ae..f9296e894e46 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -294,7 +294,7 @@ EXPORT(sysn32_call_table) PTR sys_ni_syscall /* 6170, was get_kernel_syms */ PTR sys_ni_syscall /* was query_module */ PTR sys_quotactl - PTR compat_sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_ni_syscall /* res. for getpmsg */ PTR sys_ni_syscall /* 6175 for putpmsg */ PTR sys_ni_syscall /* res. for afs_syscall */ diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 46c4763edf21..4d7c9827706f 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -392,7 +392,7 @@ sys_call_table: PTR sys_getresuid PTR sys_ni_syscall /* was query_module */ PTR sys_poll - PTR compat_sys_nfsservctl + PTR sys_ni_syscall /* was nfsservctl */ PTR sys_setresgid /* 4190 */ PTR sys_getresgid PTR sys_prctl diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index ae435e1d5669..3e3620d9fc45 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -589,7 +589,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 08ab9aa6a0d5..7526db6bf501 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -665,12 +665,6 @@ ENTRY(sys32_poll_wrapper) lgfr %r4,%r4 # long jg sys_poll # branch to system call -ENTRY(compat_sys_nfsservctl_wrapper) - lgfr %r2,%r2 # int - llgtr %r3,%r3 # struct compat_nfsctl_arg* - llgtr %r4,%r4 # union compat_nfsctl_res* - jg compat_sys_nfsservctl # branch to system call - ENTRY(sys32_setresgid16_wrapper) llgfr %r2,%r2 # __kernel_old_gid_emu31_t llgfr %r3,%r3 # __kernel_old_gid_emu31_t diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 6ee39ef8fe4a..73eb08c874fb 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -177,7 +177,7 @@ SYSCALL(sys_getresuid16,sys_ni_syscall,sys32_getresuid16_wrapper) /* 165 old get NI_SYSCALL /* for vm86 */ NI_SYSCALL /* old sys_query_module */ SYSCALL(sys_poll,sys_poll,sys32_poll_wrapper) -SYSCALL(sys_nfsservctl,sys_nfsservctl,compat_sys_nfsservctl_wrapper) +NI_SYSCALL /* old nfsservctl */ SYSCALL(sys_setresgid16,sys_ni_syscall,sys32_setresgid16_wrapper) /* 170 old setresgid16 syscall */ SYSCALL(sys_getresgid16,sys_ni_syscall,sys32_getresgid16_wrapper) /* old getresgid16 syscall */ SYSCALL(sys_prctl,sys_prctl,sys32_prctl_wrapper) diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 39b051de4c7c..293e39c59c00 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -185,7 +185,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index 089c4d825d08..ceb34b94afa9 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -189,7 +189,7 @@ sys_call_table: .long sys_ni_syscall /* vm86 */ .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* was nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 44e5faf1ad5f..d97f3eb72e06 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -81,7 +81,6 @@ SIGN2(sys32_fadvise64, compat_sys_fadvise64, %o0, %o4) SIGN2(sys32_fadvise64_64, compat_sys_fadvise64_64, %o0, %o5) SIGN2(sys32_bdflush, sys_bdflush, %o0, %o1) SIGN1(sys32_mlockall, sys_mlockall, %o0) -SIGN1(sys32_nfsservctl, compat_sys_nfsservctl, %o0) SIGN1(sys32_clock_nanosleep, compat_sys_clock_nanosleep, %o1) SIGN1(sys32_timer_settime, compat_sys_timer_settime, %o1) SIGN1(sys32_io_submit, compat_sys_io_submit, %o1) diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 6e492d59f6b1..09d8ec454450 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -67,7 +67,7 @@ sys_call_table: /*235*/ .long sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .long sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler /*245*/ .long sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_ni_syscall /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun /*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index f566518483b5..c9296ab0b1f4 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -145,7 +145,7 @@ sys_call_table: .word sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .word sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler .word sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .word sys_64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nis_syscall .word sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .word sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun .word sys_timer_delete, sys_timer_create, sys_ni_syscall, sys_io_setup, sys_io_destroy diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a0e866d233ee..54edb207ff3a 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -672,7 +672,7 @@ ia32_sys_call_table: .quad sys32_vm86_warning /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll - .quad compat_sys_nfsservctl + .quad quiet_ni_syscall /* old nfsservctl */ .quad sys_setresgid16 /* 170 */ .quad sys_getresgid16 .quad sys_prctl diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d92641cc7acc..201040573444 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -414,7 +414,7 @@ __SYSCALL(__NR_query_module, sys_ni_syscall) __SYSCALL(__NR_quotactl, sys_quotactl) #define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* reserved for LiS/STREAMS */ #define __NR_getpmsg 181 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index fbb0a045a1a2..bc19be332bc9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -168,7 +168,7 @@ ENTRY(sys_call_table) .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll - .long sys_nfsservctl + .long sys_ni_syscall /* Old nfsservctl */ .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index a6f934f37f1a..798ee6d285a1 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -455,7 +455,7 @@ __SYSCALL(203, sys_reboot, 3) #define __NR_quotactl 204 __SYSCALL(204, sys_quotactl, 4) #define __NR_nfsservctl 205 -__SYSCALL(205, sys_nfsservctl, 3) +__SYSCALL(205, sys_ni_syscall, 0) #define __NR__sysctl 206 __SYSCALL(206, sys_sysctl, 1) #define __NR_bdflush 207 diff --git a/fs/compat.c b/fs/compat.c index 0b48d018e38a..58b1da459893 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1675,11 +1675,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} - #ifdef CONFIG_EPOLL #ifdef HAVE_SET_RESTORE_SIGMASK diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 4f76959397fa..f4c38d8c6674 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -143,7 +143,7 @@ __SYSCALL(__NR_pivot_root, sys_pivot_root) /* fs/nfsctl.c */ #define __NR_nfsservctl 42 -__SC_COMP(__NR_nfsservctl, sys_nfsservctl, compat_sys_nfsservctl) +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) /* fs/open.c */ #define __NR3264_statfs 43 diff --git a/include/linux/compat.h b/include/linux/compat.h index 8779405e15a8..c6e7523bf765 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -438,7 +438,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, struct compat_timespec __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); -asmlinkage long compat_sys_nfsservctl(int cmd, void *notused, void *notused2); asmlinkage long compat_sys_signalfd4(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize, int flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 8c03b98df5f9..1ff0ec2a5e8d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -702,9 +702,6 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args); asmlinkage long sys_sysinfo(struct sysinfo __user *info); asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2); -asmlinkage long sys_nfsservctl(int cmd, - struct nfsctl_arg __user *arg, - void __user *res); asmlinkage long sys_syslog(int type, char __user *buf, int len); asmlinkage long sys_uselib(const char __user *library); asmlinkage long sys_ni_syscall(void); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fef..a9a5de07c4f1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } -cond_syscall(sys_nfsservctl); cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); -- cgit v1.2.3-71-gd317 From a8d757ef076f0f95f13a918808824058de25b3eb Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 25 Aug 2011 15:58:03 +0200 Subject: perf events: Fix slow and broken cgroup context switch code The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++++++------- kernel/events/core.c | 63 ++++++++++++++++++++++++++++++++++++++-------- kernel/sched.c | 2 +- 3 files changed, 69 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 245bafdafd5e..c816075c01ce 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -944,8 +944,10 @@ extern void perf_pmu_unregister(struct pmu *pmu); extern int perf_num_counters(void); extern const char *perf_pmu_name(void); -extern void __perf_event_task_sched_in(struct task_struct *task); -extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); +extern void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task); +extern void __perf_event_task_sched_out(struct task_struct *prev, + struct task_struct *next); extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); @@ -1059,17 +1061,20 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) extern struct jump_label_key perf_sched_events; -static inline void perf_event_task_sched_in(struct task_struct *task) +static inline void perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { if (static_branch(&perf_sched_events)) - __perf_event_task_sched_in(task); + __perf_event_task_sched_in(prev, task); } -static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +static inline void perf_event_task_sched_out(struct task_struct *prev, + struct task_struct *next) { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - __perf_event_task_sched_out(task, next); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); @@ -1139,10 +1144,11 @@ extern void perf_event_disable(struct perf_event *event); extern void perf_event_task_tick(void); #else static inline void -perf_event_task_sched_in(struct task_struct *task) { } +perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { } static inline void -perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) { } +perf_event_task_sched_out(struct task_struct *prev, + struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index b8785e26ee1c..45847fbb599a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* + * next is NULL when called from perf_event_enable_on_exec() + * that will systematically cause a cgroup_switch() + */ + if (next) + cgrp2 = perf_cgroup_from_task(next); + + /* + * only schedule out current cgroup events if we know + * that we are switching to a different cgroup. Otherwise, + * do no touch the cgroup events. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* prev can never be NULL */ + cgrp2 = perf_cgroup_from_task(prev); + + /* + * only need to schedule in cgroup events if we are changing + * cgroup during ctxsw. Cgroup events were not scheduled + * out of ctxsw out if that was not the case. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWIN); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { } @@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); + perf_cgroup_sched_out(task, next); } static void task_ctx_sched_out(struct perf_event_context *ctx) @@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void __perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task) * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); + perf_cgroup_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * ctxswin cgroup events which are already scheduled * in. */ - perf_cgroup_sched_out(current); + perf_cgroup_sched_out(current, NULL); raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0408cdc6d572 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - perf_event_task_sched_in(current); + perf_event_task_sched_in(prev, current); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -- cgit v1.2.3-71-gd317 From 8efcc57dedfebc99c3cd39564e3fc47cd1a24b75 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 3 Aug 2011 18:04:29 +0900 Subject: mfd: Fix value of WM8994_CONFIGURE_GPIO This needs to be an out of band value for the register and on this device registers are 16 bit so we must shift left one to the 17th bit. Signed-off-by: Mark Brown Cc: stable@kernel.org Signed-off-by: Samuel Ortiz --- include/linux/mfd/wm8994/pdata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/pdata.h b/include/linux/mfd/wm8994/pdata.h index d12f8d635a81..97cf4f27d647 100644 --- a/include/linux/mfd/wm8994/pdata.h +++ b/include/linux/mfd/wm8994/pdata.h @@ -26,7 +26,7 @@ struct wm8994_ldo_pdata { struct regulator_init_data *init_data; }; -#define WM8994_CONFIGURE_GPIO 0x8000 +#define WM8994_CONFIGURE_GPIO 0x10000 #define WM8994_DRC_REGS 5 #define WM8994_EQ_REGS 20 -- cgit v1.2.3-71-gd317 From bff747c58cf97bf4fc8b499ee0f419b59d6b226d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 8 Sep 2011 10:16:47 -0700 Subject: regulator: fix kernel-doc warning in consumer.h Fix kernel-doc warning about internal/private data by marking it as "private:" so that kernel-doc will ignore it. Warning(include/linux/regulator/consumer.h:128): No description found for parameter 'ret' Signed-off-by: Randy Dunlap Acked-by: Mark Brown Signed-off-by: Linus Torvalds --- include/linux/regulator/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 26f6ea4444e3..b47771aa5718 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -123,7 +123,7 @@ struct regulator_bulk_data { const char *supply; struct regulator *consumer; - /* Internal use */ + /* private: Internal use */ int ret; }; -- cgit v1.2.3-71-gd317 From 185efc0f9a1f2d6ad6d4782c5d9e529f3290567f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 14 Sep 2011 16:21:58 -0700 Subject: memcg: Revert "memcg: add memory.vmscan_stat" Revert the post-3.0 commit 82f9d486e59f5 ("memcg: add memory.vmscan_stat"). The implementation of per-memcg reclaim statistics violates how memcg hierarchies usually behave: hierarchically. The reclaim statistics are accounted to child memcgs and the parent hitting the limit, but not to hierarchy levels in between. Usually, hierarchical statistics are perfectly recursive, with each level representing the sum of itself and all its children. Since this exports statistics to userspace, this may lead to confusion and problems with changing things after the release, so revert it now, we can try again later. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 85 +------------------ include/linux/memcontrol.h | 19 ----- include/linux/swap.h | 6 ++ mm/memcontrol.c | 172 ++------------------------------------- mm/vmscan.c | 39 ++------- 5 files changed, 18 insertions(+), 303 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 6f3c598971fc..06eb6d957c83 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -380,7 +380,7 @@ will be charged as a new owner of it. 5.2 stat file -5.2.1 memory.stat file includes following statistics +memory.stat file includes following statistics # per-memory cgroup local status cache - # of bytes of page cache memory. @@ -438,89 +438,6 @@ Note: file_mapped is accounted only when the memory cgroup is owner of page cache.) -5.2.2 memory.vmscan_stat - -memory.vmscan_stat includes statistics information for memory scanning and -freeing, reclaiming. The statistics shows memory scanning information since -memory cgroup creation and can be reset to 0 by writing 0 as - - #echo 0 > ../memory.vmscan_stat - -This file contains following statistics. - -[param]_[file_or_anon]_pages_by_[reason]_[under_heararchy] -[param]_elapsed_ns_by_[reason]_[under_hierarchy] - -For example, - - scanned_file_pages_by_limit indicates the number of scanned - file pages at vmscan. - -Now, 3 parameters are supported - - scanned - the number of pages scanned by vmscan - rotated - the number of pages activated at vmscan - freed - the number of pages freed by vmscan - -If "rotated" is high against scanned/freed, the memcg seems busy. - -Now, 2 reason are supported - - limit - the memory cgroup's limit - system - global memory pressure + softlimit - (global memory pressure not under softlimit is not handled now) - -When under_hierarchy is added in the tail, the number indicates the -total memcg scan of its children and itself. - -elapsed_ns is a elapsed time in nanosecond. This may include sleep time -and not indicates CPU usage. So, please take this as just showing -latency. - -Here is an example. - -# cat /cgroup/memory/A/memory.vmscan_stat -scanned_pages_by_limit 9471864 -scanned_anon_pages_by_limit 6640629 -scanned_file_pages_by_limit 2831235 -rotated_pages_by_limit 4243974 -rotated_anon_pages_by_limit 3971968 -rotated_file_pages_by_limit 272006 -freed_pages_by_limit 2318492 -freed_anon_pages_by_limit 962052 -freed_file_pages_by_limit 1356440 -elapsed_ns_by_limit 351386416101 -scanned_pages_by_system 0 -scanned_anon_pages_by_system 0 -scanned_file_pages_by_system 0 -rotated_pages_by_system 0 -rotated_anon_pages_by_system 0 -rotated_file_pages_by_system 0 -freed_pages_by_system 0 -freed_anon_pages_by_system 0 -freed_file_pages_by_system 0 -elapsed_ns_by_system 0 -scanned_pages_by_limit_under_hierarchy 9471864 -scanned_anon_pages_by_limit_under_hierarchy 6640629 -scanned_file_pages_by_limit_under_hierarchy 2831235 -rotated_pages_by_limit_under_hierarchy 4243974 -rotated_anon_pages_by_limit_under_hierarchy 3971968 -rotated_file_pages_by_limit_under_hierarchy 272006 -freed_pages_by_limit_under_hierarchy 2318492 -freed_anon_pages_by_limit_under_hierarchy 962052 -freed_file_pages_by_limit_under_hierarchy 1356440 -elapsed_ns_by_limit_under_hierarchy 351386416101 -scanned_pages_by_system_under_hierarchy 0 -scanned_anon_pages_by_system_under_hierarchy 0 -scanned_file_pages_by_system_under_hierarchy 0 -rotated_pages_by_system_under_hierarchy 0 -rotated_anon_pages_by_system_under_hierarchy 0 -rotated_file_pages_by_system_under_hierarchy 0 -freed_pages_by_system_under_hierarchy 0 -freed_anon_pages_by_system_under_hierarchy 0 -freed_file_pages_by_system_under_hierarchy 0 -elapsed_ns_by_system_under_hierarchy 0 - 5.3 swappiness Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3b535db00a94..343bd7661f2a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -39,16 +39,6 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct mem_cgroup *mem_cont, int active, int file); -struct memcg_scanrecord { - struct mem_cgroup *mem; /* scanend memory cgroup */ - struct mem_cgroup *root; /* scan target hierarchy root */ - int context; /* scanning context (see memcontrol.c) */ - unsigned long nr_scanned[2]; /* the number of scanned pages */ - unsigned long nr_rotated[2]; /* the number of rotated pages */ - unsigned long nr_freed[2]; /* the number of freed pages */ - unsigned long elapsed; /* nsec of time elapsed while scanning */ -}; - #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -127,15 +117,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); -extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct memcg_scanrecord *rec); -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - struct memcg_scanrecord *rec, - unsigned long *nr_scanned); - #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 14d62490922e..c71f84bb62ec 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,6 +252,12 @@ static inline void lru_cache_add_file(struct page *page) extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, int mode, int file); +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap); +extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ebd1e86bef1c..3508777837c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -204,50 +204,6 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); -enum { - SCAN_BY_LIMIT, - SCAN_BY_SYSTEM, - NR_SCAN_CONTEXT, - SCAN_BY_SHRINK, /* not recorded now */ -}; - -enum { - SCAN, - SCAN_ANON, - SCAN_FILE, - ROTATE, - ROTATE_ANON, - ROTATE_FILE, - FREED, - FREED_ANON, - FREED_FILE, - ELAPSED, - NR_SCANSTATS, -}; - -struct scanstat { - spinlock_t lock; - unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; - unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; -}; - -const char *scanstat_string[NR_SCANSTATS] = { - "scanned_pages", - "scanned_anon_pages", - "scanned_file_pages", - "rotated_pages", - "rotated_anon_pages", - "rotated_file_pages", - "freed_pages", - "freed_anon_pages", - "freed_file_pages", - "elapsed_ns", -}; -#define SCANSTAT_WORD_LIMIT "_by_limit" -#define SCANSTAT_WORD_SYSTEM "_by_system" -#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" - - /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -313,8 +269,7 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* For recording LRU-scan statistics */ - struct scanstat scanstat; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1678,44 +1633,6 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif -static void __mem_cgroup_record_scanstat(unsigned long *stats, - struct memcg_scanrecord *rec) -{ - - stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; - stats[SCAN_ANON] += rec->nr_scanned[0]; - stats[SCAN_FILE] += rec->nr_scanned[1]; - - stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; - stats[ROTATE_ANON] += rec->nr_rotated[0]; - stats[ROTATE_FILE] += rec->nr_rotated[1]; - - stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; - stats[FREED_ANON] += rec->nr_freed[0]; - stats[FREED_FILE] += rec->nr_freed[1]; - - stats[ELAPSED] += rec->elapsed; -} - -static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) -{ - struct mem_cgroup *mem; - int context = rec->context; - - if (context >= NR_SCAN_CONTEXT) - return; - - mem = rec->mem; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); - spin_unlock(&mem->scanstat.lock); - - mem = rec->root; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); - spin_unlock(&mem->scanstat.lock); -} - /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1740,9 +1657,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - struct memcg_scanrecord rec; unsigned long excess; - unsigned long scanned; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1750,15 +1666,6 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; - if (shrink) - rec.context = SCAN_BY_SHRINK; - else if (check_soft) - rec.context = SCAN_BY_SYSTEM; - else - rec.context = SCAN_BY_LIMIT; - - rec.root = root_mem; - while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1799,23 +1706,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } - rec.mem = victim; - rec.nr_scanned[0] = 0; - rec.nr_scanned[1] = 0; - rec.nr_rotated[0] = 0; - rec.nr_rotated[1] = 0; - rec.nr_freed[0] = 0; - rec.nr_freed[1] = 0; - rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &rec, &scanned); - *total_scanned += scanned; + noswap, zone, &nr_scanned); + *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, &rec); - mem_cgroup_record_scanstat(&rec); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3854,18 +3752,14 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { - struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } - rec.context = SCAN_BY_SHRINK; - rec.mem = mem; - rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, &rec); + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4709,54 +4603,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ -static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, - struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - char string[64]; - int i; - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); - } - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); - } - return 0; -} - -static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, - unsigned int event) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - - spin_lock(&mem->scanstat.lock); - memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); - memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); - spin_unlock(&mem->scanstat.lock); - return 0; -} - - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4827,11 +4673,6 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif - { - .name = "vmscan_stat", - .read_map = mem_cgroup_vmscan_stat_read, - .trigger = mem_cgroup_reset_vmscan_stat, - }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -5095,7 +4936,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); - spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); diff --git a/mm/vmscan.c b/mm/vmscan.c index e49bcb6d4948..b55699cd9067 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -105,7 +105,6 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; - struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; - if (!scanning_global_lru(sc)) { - sc->memcg_record->nr_scanned[0] += *nr_anon; - sc->memcg_record->nr_scanned[1] += *nr_file; - } } /* @@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_freed[file] += nr_reclaimed; - local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. */ reclaim_stat->recent_rotated[file] += nr_rotated; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -2265,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - struct memcg_scanrecord *rec, - unsigned long *scanned) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { .nr_scanned = 0, @@ -2278,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_swap = !noswap, .order = 0, .mem_cgroup = mem, - .memcg_record = rec, }; - ktime_t start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2289,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = ktime_get(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2298,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); - end = ktime_get(); - - if (rec) - rec->elapsed += ktime_to_ns(ktime_sub(end, start)); - *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap, - struct memcg_scanrecord *rec) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; - ktime_t start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2325,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, .mem_cgroup = mem_cont, - .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2334,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = ktime_get(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2349,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = ktime_get(); - if (rec) - rec->elapsed += ktime_to_ns(ktime_sub(end, start)); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); -- cgit v1.2.3-71-gd317 From 4f5b04800a224aadb6cffcbbc3d3fa26e2367c7f Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 14 Sep 2011 16:22:29 -0700 Subject: drivers/gpio/gpio-generic.c: fix build errors Building a kernel with hotplug disabled results in a link failure: `bgpio_remove' referenced in section `___ksymtab_gpl+bgpio_remove' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o This is because of bgpio_remove() is exported. It is illegal to export symbols which are discarded either at link time or as part of an init/exit section. Fix this by dropping the __devexit attributation from bgpio_remove(). Also drop the __devinit attributation from bgpio_init(). Signed-off-by: Russell King Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpio/gpio-generic.c | 15 +++++---------- include/linux/basic_mmio_gpio.h | 15 +++++---------- 2 files changed, 10 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-generic.c b/drivers/gpio/gpio-generic.c index 231714def4d2..4e24436b0f82 100644 --- a/drivers/gpio/gpio-generic.c +++ b/drivers/gpio/gpio-generic.c @@ -351,7 +351,7 @@ static int bgpio_setup_direction(struct bgpio_chip *bgc, return 0; } -int __devexit bgpio_remove(struct bgpio_chip *bgc) +int bgpio_remove(struct bgpio_chip *bgc) { int err = gpiochip_remove(&bgc->gc); @@ -361,15 +361,10 @@ int __devexit bgpio_remove(struct bgpio_chip *bgc) } EXPORT_SYMBOL_GPL(bgpio_remove); -int __devinit bgpio_init(struct bgpio_chip *bgc, - struct device *dev, - unsigned long sz, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - void __iomem *dirout, - void __iomem *dirin, - bool big_endian) +int bgpio_init(struct bgpio_chip *bgc, struct device *dev, + unsigned long sz, void __iomem *dat, void __iomem *set, + void __iomem *clr, void __iomem *dirout, void __iomem *dirin, + bool big_endian) { int ret; diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h index 98999cf107ce..feb912196745 100644 --- a/include/linux/basic_mmio_gpio.h +++ b/include/linux/basic_mmio_gpio.h @@ -63,15 +63,10 @@ static inline struct bgpio_chip *to_bgpio_chip(struct gpio_chip *gc) return container_of(gc, struct bgpio_chip, gc); } -int __devexit bgpio_remove(struct bgpio_chip *bgc); -int __devinit bgpio_init(struct bgpio_chip *bgc, - struct device *dev, - unsigned long sz, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - void __iomem *dirout, - void __iomem *dirin, - bool big_endian); +int bgpio_remove(struct bgpio_chip *bgc); +int bgpio_init(struct bgpio_chip *bgc, struct device *dev, + unsigned long sz, void __iomem *dat, void __iomem *set, + void __iomem *clr, void __iomem *dirout, void __iomem *dirin, + bool big_endian); #endif /* __BASIC_MMIO_GPIO_H */ -- cgit v1.2.3-71-gd317 From 946cedccbd7387488d2cee5da92cdfeb28d2e670 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2011 03:21:44 +0000 Subject: tcp: Change possible SYN flooding messages "Possible SYN flooding on port xxxx " messages can fill logs on servers. Change logic to log the message only once per listener, and add two new SNMP counters to track : TCPReqQFullDoCookies : number of times a SYNCOOKIE was replied to client TCPReqQFullDrop : number of times a SYN request was dropped because syncookies were not enabled. Based on a prior patch from Tom Herbert, and suggestions from David. Signed-off-by: Eric Dumazet CC: Tom Herbert Signed-off-by: David S. Miller --- include/linux/snmp.h | 2 ++ include/net/request_sock.h | 3 ++- include/net/tcp.h | 3 +++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp_ipv4.c | 49 ++++++++++++++++++++++++++-------------------- net/ipv6/tcp_ipv6.c | 31 +++-------------------------- 6 files changed, 40 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 12b2b18e50c1..e16557a357e5 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,8 @@ enum LINUX_MIB_TCPDEFERACCEPTDROP, LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */ LINUX_MIB_TCPTIMEWAITOVERFLOW, /* TCPTimeWaitOverflow */ + LINUX_MIB_TCPREQQFULLDOCOOKIES, /* TCPReqQFullDoCookies */ + LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */ __LINUX_MIB_MAX }; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 99e6e19b57c2..4c0766e201e3 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -96,7 +96,8 @@ extern int sysctl_max_syn_backlog; */ struct listen_sock { u8 max_qlen_log; - /* 3 bytes hole, try to use */ + u8 synflood_warned; + /* 2 bytes hole, try to use */ int qlen; int qlen_young; int clock_hand; diff --git a/include/net/tcp.h b/include/net/tcp.h index 149a415d1e0a..e9b48b094683 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -460,6 +460,9 @@ extern int tcp_write_wakeup(struct sock *); extern void tcp_send_fin(struct sock *sk); extern void tcp_send_active_reset(struct sock *sk, gfp_t priority); extern int tcp_send_synack(struct sock *); +extern int tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto); extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b14ec7d03b6e..4bfad5da94f4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -254,6 +254,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), + SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), + SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c12b8ec849d..c34f01513945 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -808,20 +808,38 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static void syn_flood_warning(const struct sk_buff *skb) +/* + * Return 1 if a syncookie should be sent + */ +int tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto) { - const char *msg; + const char *msg = "Dropping request"; + int want_cookie = 0; + struct listen_sock *lopt; + + #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) + if (sysctl_tcp_syncookies) { msg = "Sending cookies"; - else + want_cookie = 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + } else #endif - msg = "Dropping request"; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - pr_info("TCP: Possible SYN flooding on port %d. %s.\n", - ntohs(tcp_hdr(skb)->dest), msg); + lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; + if (!lopt->synflood_warned) { + lopt->synflood_warned = 1; + pr_info("%s: Possible SYN flooding on port %d. %s. " + " Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); + } + return want_cookie; } +EXPORT_SYMBOL(tcp_syn_flood_action); /* * Save and compile IPv4 options into the request_sock if needed. @@ -1235,11 +1253,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; -#ifdef CONFIG_SYN_COOKIES int want_cookie = 0; -#else -#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ -#endif /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1250,14 +1264,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * evidently real one. */ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - want_cookie = 1; - } else -#endif - goto drop; + want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); + if (!want_cookie) + goto drop; } /* Accept backlog is full. If we have already queued enough @@ -1303,9 +1312,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; -#ifdef CONFIG_SYN_COOKIES want_cookie = 0; /* not our kind of cookie */ -#endif tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1fb63f4aeb7..3c9fa618b69d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -531,20 +531,6 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, return tcp_v6_send_synack(sk, req, rvp); } -static inline void syn_flood_warning(struct sk_buff *skb) -{ -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest)); - else -#endif - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Dropping request.\n", ntohs(tcp_hdr(skb)->dest)); -} - static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree_skb(inet6_rsk(req)->pktopts); @@ -1179,11 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; struct dst_entry *dst = NULL; -#ifdef CONFIG_SYN_COOKIES int want_cookie = 0; -#else -#define want_cookie 0 -#endif if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1192,14 +1174,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop; if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - want_cookie = 1; - else -#endif - goto drop; + want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); + if (!want_cookie) + goto drop; } if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) @@ -1249,9 +1226,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) while (l-- > 0) *c++ ^= *hash_location++; -#ifdef CONFIG_SYN_COOKIES want_cookie = 0; /* not our kind of cookie */ -#endif tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { -- cgit v1.2.3-71-gd317 From 48c830120f2a20b44220aa26feda9ed15f49eaab Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 31 Aug 2011 08:03:29 +0000 Subject: net: copy userspace buffers on device forwarding dev_forward_skb loops an skb back into host networking stack which might hang on the memory indefinitely. In particular, this can happen in macvtap in bridged mode. Copy the userspace fragments to avoid blocking the sender in that case. As this patch makes skb_copy_ubufs extern now, I also added some documentation and made it clear the SKBTX_DEV_ZEROCOPY flag automatically instead of doing it in all callers. This can be made into a separate patch if people feel it's worth it. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/dev.c | 8 ++++++++ net/core/skbuff.c | 22 +++++++++++++++++----- 3 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7b996ed86d5b..8bd383caa363 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -524,6 +524,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, extern bool skb_recycle_check(struct sk_buff *skb, int skb_size); extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); extern struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); extern struct sk_buff *skb_copy(const struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 17d67b579beb..b10ff0a71855 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1515,6 +1515,14 @@ static inline bool is_skb_forwardable(struct net_device *dev, */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + skb_orphan(skb); nf_reset(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 27002dffe7ed..387703f56fce 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -611,8 +611,21 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -/* skb frags copy userspace buffers to kernel */ -static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +/* skb_copy_ubufs - copy userspace skb frags buffers to kernel + * @skb: the skb to modify + * @gfp_mask: allocation priority + * + * This must be called on SKBTX_DEV_ZEROCOPY skb. + * It will copy all frags into kernel and drop the reference + * to userspace pages. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + * + * Returns 0 on success or a negative error code on failure + * to allocate kernel memory to copy to. + */ +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { int i; int num_frags = skb_shinfo(skb)->nr_frags; @@ -652,6 +665,8 @@ static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(skb)->frags[i - 1].page = head; head = (struct page *)head->private; } + + skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; } @@ -677,7 +692,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { if (skb_copy_ubufs(skb, gfp_mask)) return NULL; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; } n = skb + 1; @@ -803,7 +817,6 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) n = NULL; goto out; } - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; @@ -896,7 +909,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { if (skb_copy_ubufs(skb, gfp_mask)) goto nofrags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); -- cgit v1.2.3-71-gd317 From 5bd078dda4d4fbdb4bd138a6bd5b6e274c019ed2 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 14 Sep 2011 11:31:36 -0500 Subject: irq: Add declaration of irq_domain_simple_ops to irqdomain.h irq_domain_simple_ops is exported, but is not declared in irqdomain.h, so add it. Signed-off-by: Rob Herring Cc: Grant Likely Cc: marc.zyngier@arm.com Cc: thomas.abraham@linaro.org Cc: jamie@jamieiles.com Cc: b-cousson@ti.com Cc: shawn.guo@linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: devicetree-discuss@lists.ozlabs.org Link: http://lkml.kernel.org/r/1316017900-19918-2-git-send-email-robherring2@gmail.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e807ad687a07..3ad553e8eae2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -80,6 +80,7 @@ extern void irq_domain_del(struct irq_domain *domain); #endif /* CONFIG_IRQ_DOMAIN */ #if defined(CONFIG_IRQ_DOMAIN) && defined(CONFIG_OF_IRQ) +extern struct irq_domain_ops irq_domain_simple_ops; extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); -- cgit v1.2.3-71-gd317 From b6cf8788a3382c2000743a0e393bcc8aeb0601cb Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 20 Sep 2011 17:07:29 +0200 Subject: [S390] kvm: extension capability for new address space layout 598841ca9919d008b520114d8a4378c4ce4e40a1 ([S390] use gmap address spaces for kvm guest images) changed kvm on s390 to use a separate address space for kvm guests. We can now put KVM guests anywhere in the user address mode with a size up to 8PB - as long as the memory is 1MB-aligned. This change was done without KVM extension capability bit. The change was added after 3.0, but we still have a chance to add a feature bit before 3.1 (keeping the releases in a sane state). We use number 71 to avoid collisions with other pending kvm patches as requested by Alexander Graf. Signed-off-by: Christian Borntraeger Acked-by: Avi Kivity Cc: Alexander Graf Signed-off-by: Heiko Carstens --- arch/s390/kvm/kvm-s390.c | 1 + include/linux/kvm.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b4eced131e5c..dc2b580e27bc 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -123,6 +123,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { case KVM_CAP_S390_PSW: + case KVM_CAP_S390_GMAP: r = 1; break; default: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2c366b52f505..aace6b8691a2 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -553,6 +553,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_SPAPR_TCE 63 #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 +#define KVM_CAP_S390_GMAP 71 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From d94c177beeb4469cd4f6e83354ab0223353e98ed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 26 Sep 2011 17:44:55 -0700 Subject: vfs pathname lookup: Add LOOKUP_AUTOMOUNT flag Since we've now turned around and made LOOKUP_FOLLOW *not* force an automount, we want to add the ability to force an automount event on lookup even if we don't happen to have one of the other flags that force it implicitly (LOOKUP_OPEN, LOOKUP_DIRECTORY, LOOKUP_PARENT..) Most cases will never want to use this, since you'd normally want to delay automounting as long as possible, which usually implies LOOKUP_OPEN (when we open a file or directory, we really cannot avoid the automount any more). But Trond argued sufficiently forcefully that at a minimum bind mounting a file and quotactl will want to force the automount lookup. Some other cases (like nfs_follow_remote_path()) could use it too, although LOOKUP_DIRECTORY would work there as well. This commit just adds the flag and logic, no users yet, though. It also doesn't actually touch the LOOKUP_NO_AUTOMOUNT flag that is related, and was made irrelevant by the same change that made us not follow on LOOKUP_FOLLOW. Cc: Trond Myklebust Cc: Ian Kent Cc: Jeff Layton Cc: Miklos Szeredi Cc: David Howells Cc: Al Viro Cc: Greg KH Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- include/linux/namei.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index f4788365ea22..09606fd83d57 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -739,7 +739,7 @@ static int follow_automount(struct path *path, unsigned flags, * of the daemon to instantiate them before they can be used. */ if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE)) && + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; diff --git a/include/linux/namei.h b/include/linux/namei.h index 76fe2c62ae71..e13dac7caab2 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -48,6 +48,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; */ #define LOOKUP_FOLLOW 0x0001 #define LOOKUP_DIRECTORY 0x0002 +#define LOOKUP_AUTOMOUNT 0x0004 #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 -- cgit v1.2.3-71-gd317 From b6c8069d3577481390b3f24a8434ad72a3235594 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 27 Sep 2011 08:12:33 -0700 Subject: vfs: remove LOOKUP_NO_AUTOMOUNT flag That flag no longer makes sense, since we don't look up automount points as eagerly any more. Additionally, it turns out that the NO_AUTOMOUNT handling was buggy to begin with: it would avoid automounting even for cases where we really *needed* to do the automount handling, and could return ENOENT for autofs entries that hadn't been instantiated yet. With our new non-eager automount semantics, one discussion has been about adding a AT_AUTOMOUNT flag to vfs_fstatat (and thus the newfstatat() and fstatat64() system calls), but it's probably not worth it: you can always force at least directory automounting by simply adding the final '/' to the filename, which works for *all* of the stat family system calls, old and new. So AT_NO_AUTOMOUNT (and thus LOOKUP_NO_AUTOMOUNT) really were just a result of our bad default behavior. Acked-by: Ian Kent Acked-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/namei.c | 6 ------ fs/stat.c | 2 -- include/linux/namei.h | 2 +- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 09606fd83d57..0b3138de2a3b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -721,12 +721,6 @@ static int follow_automount(struct path *path, unsigned flags, if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; - /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT - * and this is the terminal part of the path. - */ - if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) - return -EISDIR; /* we actually want to stop here */ - /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. diff --git a/fs/stat.c b/fs/stat.c index ba5316ffac61..78a3aa83c7ea 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - if (flag & AT_NO_AUTOMOUNT) - lookup_flags |= LOOKUP_NO_AUTOMOUNT; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; diff --git a/include/linux/namei.h b/include/linux/namei.h index e13dac7caab2..409328d1cbbb 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -53,7 +53,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 #define LOOKUP_RCU 0x0040 -#define LOOKUP_NO_AUTOMOUNT 0x0080 + /* * Intent data */ -- cgit v1.2.3-71-gd317 From f75159e9936143177b442afc78150b7a7ad8aa07 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 20 Sep 2011 01:25:41 +0000 Subject: ptp: fix L2 event message recognition The IEEE 1588 standard defines two kinds of messages, event and general messages. Event messages require time stamping, and general do not. When using UDP transport, two separate ports are used for the two message types. The BPF designed to recognize event messages incorrectly classifies L2 general messages as event messages. This commit fixes the issue by extending the filter to check the message type field for L2 PTP packets. Event messages are be distinguished from general messages by testing the "general" bit. Signed-off-by: Richard Cochran Cc: Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index e07e2742a865..1dc420ba213a 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -51,6 +51,7 @@ #define PTP_CLASS_V2_VLAN (PTP_CLASS_V2 | PTP_CLASS_VLAN) #define PTP_EV_PORT 319 +#define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */ #define OFF_ETYPE 12 #define OFF_IHL 14 @@ -116,14 +117,20 @@ static inline int ptp_filter_init(struct sock_filter *f, int len) {OP_OR, 0, 0, PTP_CLASS_IPV6 }, /* */ \ {OP_RETA, 0, 0, 0 }, /* */ \ /*L3x*/ {OP_RETK, 0, 0, PTP_CLASS_NONE }, /* */ \ -/*L40*/ {OP_JEQ, 0, 6, ETH_P_8021Q }, /* f goto L50 */ \ +/*L40*/ {OP_JEQ, 0, 9, ETH_P_8021Q }, /* f goto L50 */ \ {OP_LDH, 0, 0, OFF_ETYPE + 4 }, /* */ \ - {OP_JEQ, 0, 9, ETH_P_1588 }, /* f goto L60 */ \ + {OP_JEQ, 0, 15, ETH_P_1588 }, /* f goto L60 */ \ + {OP_LDB, 0, 0, ETH_HLEN + VLAN_HLEN }, /* */ \ + {OP_AND, 0, 0, PTP_GEN_BIT }, /* */ \ + {OP_JEQ, 0, 12, 0 }, /* f goto L6x */ \ {OP_LDH, 0, 0, ETH_HLEN + VLAN_HLEN }, /* */ \ {OP_AND, 0, 0, PTP_CLASS_VMASK }, /* */ \ {OP_OR, 0, 0, PTP_CLASS_VLAN }, /* */ \ {OP_RETA, 0, 0, 0 }, /* */ \ -/*L50*/ {OP_JEQ, 0, 4, ETH_P_1588 }, /* f goto L61 */ \ +/*L50*/ {OP_JEQ, 0, 7, ETH_P_1588 }, /* f goto L61 */ \ + {OP_LDB, 0, 0, ETH_HLEN }, /* */ \ + {OP_AND, 0, 0, PTP_GEN_BIT }, /* */ \ + {OP_JEQ, 0, 4, 0 }, /* f goto L6x */ \ {OP_LDH, 0, 0, ETH_HLEN }, /* */ \ {OP_AND, 0, 0, PTP_CLASS_VMASK }, /* */ \ {OP_OR, 0, 0, PTP_CLASS_L2 }, /* */ \ -- cgit v1.2.3-71-gd317 From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Sep 2011 12:42:04 +0200 Subject: posix-cpu-timers: Cure SMP wobbles David reported: Attached below is a watered-down version of rt/tst-cpuclock2.c from GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or similar. Run it several times, and you will see cases where the main thread will measure a process clock difference before and after the nanosleep which is smaller than the cpu-burner thread's individual thread clock difference. This doesn't make any sense since the cpu-burner thread is part of the top-level process's thread group. I've reproduced this on both x86-64 and sparc64 (using both 32-bit and 64-bit binaries). For example: [davem@boricha build-x86_64-linux]$ ./test process: before(0.001221967) after(0.498624371) diff(497402404) thread: before(0.000081692) after(0.498316431) diff(498234739) self: before(0.001223521) after(0.001240219) diff(16698) [davem@boricha build-x86_64-linux]$ The diff of 'process' should always be >= the diff of 'thread'. I make sure to wrap the 'thread' clock measurements the most tightly around the nanosleep() call, and that the 'process' clock measurements are the outer-most ones. --- #include #include #include #include #include #include #include #include static pthread_barrier_t barrier; static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) __asm__ __volatile__("" : : : "memory"); return NULL; } int main(void) { clockid_t process_clock, my_thread_clock, th_clock; struct timespec process_before, process_after; struct timespec me_before, me_after; struct timespec th_before, th_after; struct timespec sleeptime; unsigned long diff; pthread_t th; int err; err = clock_getcpuclockid(0, &process_clock); if (err) return 1; err = pthread_getcpuclockid(pthread_self(), &my_thread_clock); if (err) return 1; pthread_barrier_init(&barrier, NULL, 2); err = pthread_create(&th, NULL, chew_cpu, NULL); if (err) return 1; err = pthread_getcpuclockid(th, &th_clock); if (err) return 1; pthread_barrier_wait(&barrier); err = clock_gettime(process_clock, &process_before); if (err) return 1; err = clock_gettime(my_thread_clock, &me_before); if (err) return 1; err = clock_gettime(th_clock, &th_before); if (err) return 1; sleeptime.tv_sec = 0; sleeptime.tv_nsec = 500000000; nanosleep(&sleeptime, NULL); err = clock_gettime(th_clock, &th_after); if (err) return 1; err = clock_gettime(my_thread_clock, &me_after); if (err) return 1; err = clock_gettime(process_clock, &process_after); if (err) return 1; diff = process_after.tv_nsec - process_before.tv_nsec; printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", process_before.tv_sec, process_before.tv_nsec, process_after.tv_sec, process_after.tv_nsec, diff); diff = th_after.tv_nsec - th_before.tv_nsec; printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", th_before.tv_sec, th_before.tv_nsec, th_after.tv_sec, th_after.tv_nsec, diff); diff = me_after.tv_nsec - me_before.tv_nsec; printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n", me_before.tv_sec, me_before.tv_nsec, me_after.tv_sec, me_after.tv_nsec, diff); return 0; } This is due to us using p->se.sum_exec_runtime in thread_group_cputime() where we iterate the thread group and sum all data. This does not take time since the last schedule operation (tick or otherwise) into account. We can cure this by using task_sched_runtime() at the cost of having to take locks. This also means we can (and must) do away with thread_group_sched_runtime() since the modified thread_group_cputime() is now more accurate and would deadlock when called from thread_group_sched_runtime(). Aside of that it makes the function safe on 32 bit systems. The old code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a 64bit value and could be changed on another cpu at the same time. Reported-by: David Miller Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins Tested-by: David Miller Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 - kernel/posix-cpu-timers.c | 5 +++-- kernel/sched.c | 24 ------------------------ 3 files changed, 3 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ac2c0578e0f..41d0237fd449 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1956,7 +1956,6 @@ static inline void disable_sched_clock_irqtime(void) {} extern unsigned long long task_sched_runtime(struct task_struct *task); -extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..c8008dd58ef2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += t->se.sum_exec_runtime; + times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + thread_group_cputime(p, &cputime); + cpu->sched = cputime.sum_exec_runtime; break; } return 0; diff --git a/kernel/sched.c b/kernel/sched.c index d249ea88428c..b50b0f0c9aa9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3724,30 +3724,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -/* - * Return sum_exec_runtime for the thread group. - * In case the task is currently running, return the sum plus current's - * pending runtime that have not been accounted yet. - * - * Note that the thread group might have other running tasks as well, - * so the return value not includes other pending runtime that other - * running tasks might have. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - struct task_cputime totals; - unsigned long flags; - struct rq *rq; - u64 ns; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to -- cgit v1.2.3-71-gd317 From 5f39e6705faade2e89d119958a8c51b9b6e2c53c Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Mon, 3 Oct 2011 09:50:20 -0500 Subject: PCI: Disable MPS configuration by default Add the ability to disable PCI-E MPS turning and using the BIOS configured MPS defaults. Due to the number of issues recently discovered on some x86 chipsets, make this the default behavior. Also, add the option for peer to peer DMA MPS configuration. Peer to peer DMA is outside the scope of this patch, but MPS configuration could prevent it from working by having the MPS on one root port different than the MPS on another. To work around this, simply make the system wide MPS the smallest possible value (128B). Signed-off-by: Jon Mason Acked-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- drivers/pci/pci.c | 6 +++++- drivers/pci/probe.c | 14 +++++++++++++- include/linux/pci.h | 3 ++- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 4e84fd4a4312..e9651f0a8817 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -77,7 +77,7 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; -enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_SAFE; +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_TUNE_OFF; /* * The default CLS is used if arch didn't set CLS explicitly and not @@ -3568,10 +3568,14 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "pcie_bus_tune_off", 17)) { + pcie_bus_config = PCIE_BUS_TUNE_OFF; } else if (!strncmp(str, "pcie_bus_safe", 13)) { pcie_bus_config = PCIE_BUS_SAFE; } else if (!strncmp(str, "pcie_bus_perf", 13)) { pcie_bus_config = PCIE_BUS_PERFORMANCE; + } else if (!strncmp(str, "pcie_bus_peer2peer", 18)) { + pcie_bus_config = PCIE_BUS_PEER2PEER; } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f3f94a5c068f..6ab6bd3df4b2 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1458,12 +1458,24 @@ static int pcie_bus_configure_set(struct pci_dev *dev, void *data) */ void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss) { - u8 smpss = mpss; + u8 smpss; if (!pci_is_pcie(bus->self)) return; + if (pcie_bus_config == PCIE_BUS_TUNE_OFF) + return; + + /* FIXME - Peer to peer DMA is possible, though the endpoint would need + * to be aware to the MPS of the destination. To work around this, + * simply force the MPS of the entire system to the smallest possible. + */ + if (pcie_bus_config == PCIE_BUS_PEER2PEER) + smpss = 0; + if (pcie_bus_config == PCIE_BUS_SAFE) { + smpss = mpss; + pcie_find_smpss(bus->self, &smpss); pci_walk_bus(bus, pcie_find_smpss, &smpss); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 8c230cbcbb48..9fc01226055b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -621,8 +621,9 @@ struct pci_driver { extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss); enum pcie_bus_config_types { - PCIE_BUS_PERFORMANCE, + PCIE_BUS_TUNE_OFF, PCIE_BUS_SAFE, + PCIE_BUS_PERFORMANCE, PCIE_BUS_PEER2PEER, }; -- cgit v1.2.3-71-gd317