From e92251762d02a46177d4105d1744041e3f8bc465 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 2 Feb 2006 12:23:12 +0000 Subject: [MMC] Add MMC command type flags Some hosts need to know the command type, so pass it via a set of flags in cmd->flags. Signed-off-by: Russell King --- include/linux/mmc/mmc.h | 35 +++++++++++++++++++++++------------ include/linux/mmc/protocol.h | 2 +- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index ccd3e13de1e8..f38872abc126 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -21,24 +21,35 @@ struct mmc_command { u32 arg; u32 resp[4]; unsigned int flags; /* expected response type */ -#define MMC_RSP_NONE (0 << 0) -#define MMC_RSP_SHORT (1 << 0) -#define MMC_RSP_LONG (2 << 0) -#define MMC_RSP_MASK (3 << 0) -#define MMC_RSP_CRC (1 << 3) /* expect valid crc */ -#define MMC_RSP_BUSY (1 << 4) /* card may send busy */ -#define MMC_RSP_OPCODE (1 << 5) /* response contains opcode */ +#define MMC_RSP_PRESENT (1 << 0) +#define MMC_RSP_136 (1 << 1) /* 136 bit response */ +#define MMC_RSP_CRC (1 << 2) /* expect valid crc */ +#define MMC_RSP_BUSY (1 << 3) /* card may send busy */ +#define MMC_RSP_OPCODE (1 << 4) /* response contains opcode */ +#define MMC_CMD_MASK (3 << 5) /* command type */ +#define MMC_CMD_AC (0 << 5) +#define MMC_CMD_ADTC (1 << 5) +#define MMC_CMD_BC (2 << 5) +#define MMC_CMD_BCR (3 << 5) /* * These are the response types, and correspond to valid bit * patterns of the above flags. One additional valid pattern * is all zeros, which means we don't expect a response. 
*/ -#define MMC_RSP_R1 (MMC_RSP_SHORT|MMC_RSP_CRC|MMC_RSP_OPCODE) -#define MMC_RSP_R1B (MMC_RSP_SHORT|MMC_RSP_CRC|MMC_RSP_OPCODE|MMC_RSP_BUSY) -#define MMC_RSP_R2 (MMC_RSP_LONG|MMC_RSP_CRC) -#define MMC_RSP_R3 (MMC_RSP_SHORT) -#define MMC_RSP_R6 (MMC_RSP_SHORT|MMC_RSP_CRC) +#define MMC_RSP_NONE (0) +#define MMC_RSP_R1 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) +#define MMC_RSP_R1B (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE|MMC_RSP_BUSY) +#define MMC_RSP_R2 (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC) +#define MMC_RSP_R3 (MMC_RSP_PRESENT) +#define MMC_RSP_R6 (MMC_RSP_PRESENT|MMC_RSP_CRC) + +#define mmc_resp_type(cmd) ((cmd)->flags & (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC|MMC_RSP_BUSY|MMC_RSP_OPCODE)) + +/* + * These are the command types. + */ +#define mmc_cmd_type(cmd) ((cmd)->flags & MMC_CMD_TYPE) unsigned int retries; /* max number of retries */ unsigned int error; /* command error */ diff --git a/include/linux/mmc/protocol.h b/include/linux/mmc/protocol.h index a14dc306545b..81c3f77f652c 100644 --- a/include/linux/mmc/protocol.h +++ b/include/linux/mmc/protocol.h @@ -79,7 +79,7 @@ /* SD commands type argument response */ /* class 8 */ /* This is basically the same command as for MMC with some quirks. */ -#define SD_SEND_RELATIVE_ADDR 3 /* ac R6 */ +#define SD_SEND_RELATIVE_ADDR 3 /* bcr R6 */ /* Application commands */ #define SD_APP_SET_BUS_WIDTH 6 /* ac [1:0] bus width R1 */ -- cgit v1.2.3-71-gd317 From 3ec9c59449744dcc390d593a017d30671546fd9e Mon Sep 17 00:00:00 2001 From: Andrey Panin Date: Thu, 2 Feb 2006 20:15:09 +0000 Subject: [SERIAL] SIIG 8-port serial boards support This patch adds support for SIIG 8-port boards. These boards have 4 ports in separate BARs and another 4 ports in a single BAR. Because of this strange port arrangement these cards need a special setup function. Fortunately no other SIIG cards have more than 4 ports, so this setup function can be used for them too.
Signed-off-by: Andrey Panin Signed-off-by: Russell King --- drivers/serial/8250_pci.c | 25 ++++++++++++++++++++++++- include/linux/pci_ids.h | 3 +++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index 2a912153321e..bb9ec28ccc2b 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -439,6 +439,20 @@ static int pci_siig_init(struct pci_dev *dev) return -ENODEV; } +static int pci_siig_setup(struct serial_private *priv, + struct pciserial_board *board, + struct uart_port *port, int idx) +{ + unsigned int bar = FL_GET_BASE(board->flags) + idx, offset = 0; + + if (idx > 3) { + bar = 4; + offset = (idx - 4) * 8; + } + + return setup_port(priv, port, bar, offset, 0); +} + /* * Timedia has an explosion of boards, and to avoid the PCI table from * growing *huge*, we use this function to collapse some 70 entries @@ -748,7 +762,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = { .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, .init = pci_siig_init, - .setup = pci_default_setup, + .setup = pci_siig_setup, }, /* * Titan cards @@ -2141,6 +2155,15 @@ static struct pci_device_id serial_pci_tbl[] = { { PCI_VENDOR_ID_SIIG, PCI_DEVICE_ID_SIIG_4S_20x_850, PCI_ANY_ID, PCI_ANY_ID, 0, 0, pbn_b0_bt_4_921600 }, + { PCI_VENDOR_ID_SIIG, PCI_DEVICE_ID_SIIG_8S_20x_550, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b0_bt_8_921600 }, + { PCI_VENDOR_ID_SIIG, PCI_DEVICE_ID_SIIG_8S_20x_650, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b0_bt_8_921600 }, + { PCI_VENDOR_ID_SIIG, PCI_DEVICE_ID_SIIG_8S_20x_850, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, + pbn_b0_bt_8_921600 }, /* * Computone devices submitted by Doug McNash dmcnash@computone.com diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index b0b908f583c5..92a619ba163f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1670,6 +1670,9 @@ #define PCI_DEVICE_ID_SIIG_2S1P_20x_550 0x2060 #define 
PCI_DEVICE_ID_SIIG_2S1P_20x_650 0x2061 #define PCI_DEVICE_ID_SIIG_2S1P_20x_850 0x2062 +#define PCI_DEVICE_ID_SIIG_8S_20x_550 0x2080 +#define PCI_DEVICE_ID_SIIG_8S_20x_650 0x2081 +#define PCI_DEVICE_ID_SIIG_8S_20x_850 0x2082 #define PCI_SUBDEVICE_ID_SIIG_QUARTET_SERIAL 0x2050 #define PCI_VENDOR_ID_RADISYS 0x1331 -- cgit v1.2.3-71-gd317 From 3d0f89bb169482d26d5aa4e82e763077e7e9bc4d Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 25 Jan 2006 13:31:07 -0800 Subject: configfs: Add permission and ownership to configfs objects. configfs always made item and attribute ownership root.root and permissions based on a umask of 022. Add ->setattr() to allow chown(2)/chmod(2), and persist the changes for the lifetime of the items and attributes. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- .../filesystems/configfs/configfs_example.c | 2 + fs/configfs/configfs_internal.h | 11 +- fs/configfs/dir.c | 36 +++++-- fs/configfs/file.c | 19 ++-- fs/configfs/inode.c | 117 +++++++++++++++++++-- fs/configfs/mount.c | 28 ++++- fs/configfs/symlink.c | 1 + include/linux/configfs.h | 2 +- 8 files changed, 179 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index f3c6e4946f98..3d4713a6c207 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -320,6 +320,7 @@ static struct config_item_type simple_children_type = { .ct_item_ops = &simple_children_item_ops, .ct_group_ops = &simple_children_group_ops, .ct_attrs = simple_children_attrs, + .ct_owner = THIS_MODULE, }; static struct configfs_subsystem simple_children_subsys = { @@ -403,6 +404,7 @@ static struct config_item_type group_children_type = { .ct_item_ops = &group_children_item_ops, .ct_group_ops = &group_children_group_ops, .ct_attrs = group_children_attrs, + .ct_owner = THIS_MODULE, }; static struct 
configfs_subsystem group_children_subsys = { diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 8899d9c5f6bf..f70e46951b37 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -36,6 +36,7 @@ struct configfs_dirent { int s_type; umode_t s_mode; struct dentry * s_dentry; + struct iattr * s_iattr; }; #define CONFIGFS_ROOT 0x0001 @@ -48,10 +49,11 @@ struct configfs_dirent { #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern struct vfsmount * configfs_mount; +extern kmem_cache_t *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode); +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); @@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); extern int configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry static inline void release_configfs_dirent(struct configfs_dirent * sd) { - if (!(sd->s_type & CONFIGFS_ROOT)) - kfree(sd); + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b668ec61527e..ca60e3abef45 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare 
{ struct configfs_dirent * sd; - sd = kmalloc(sizeof(*sd), GFP_KERNEL); + sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; @@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - error = configfs_create(d, mode, init_dir); + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); if (!error) { - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR); + error = configfs_create(d, mode, init_dir); if (!error) { p->d_inode->i_nlink++; (d)->d_op = &configfs_dentry_ops; + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } } } return error; @@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl, int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; - err = configfs_create(dentry, mode, init_symlink); + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); if (!err) { - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, - mode, CONFIGFS_ITEM_LINK); + err = configfs_create(dentry, mode, init_symlink); if (!err) dentry->d_op = &configfs_dentry_ops; + else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } + } } return err; } @@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); - if (error) + if (error) { + configfs_put(sd); return error; + } dentry->d_op = &configfs_dentry_ops; - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); return 0; @@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = { .symlink = configfs_symlink, 
.unlink = configfs_unlink, .lookup = configfs_lookup, + .setattr = configfs_setattr, }; #if 0 diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c26cd61f13af..3921920d8716 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -150,7 +149,7 @@ out: /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. - * @userbuf: data from user. + * @buf: data from user. * @count: number of bytes in @userbuf. * * Allocate @buffer->page if it hasn't been already, then @@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size /** * flush_write_buffer - push buffer to config_item. - * @file: file pointer. + * @dentry: dentry to the attribute * @buffer: data buffer for file. + * @count: number of bytes * * Get the correct pointers for the config_item and the attribute we're * dealing with, then call the store() method for the attribute, @@ -217,15 +217,16 @@ static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer * buffer = file->private_data; + ssize_t len; down(&buffer->sem); - count = fill_write_buffer(buffer,buf,count); - if (count > 0) - count = flush_write_buffer(file->f_dentry,buffer,count); - if (count > 0) - *ppos += count; + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_dentry, buffer, count); + if (len > 0) + *ppos += len; up(&buffer->sem); - return count; + return len; } static int check_perm(struct inode * inode, struct file * file) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 6577c588de9d..737842f2764b 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "configfs_internal.h" @@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | 
BDI_CAP_NO_WRITEBACK, }; -struct inode * configfs_new_inode(mode_t mode) +static struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + if (error) + return error; + + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + memset(sd_iattr, 0, sizeof(struct iattr)); + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + + /* attributes were changed atleast once in past */ + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = 
CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); } return inode; } @@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - if ((inode = configfs_new_inode(mode))) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; @@ -103,7 +194,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { - struct attribute * attr; + struct configfs_attribute *attr; if (!sd || !sd->s_element) BUG(); @@ -114,7 +205,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & CONFIGFS_ITEM_ATTR) { attr = sd->s_element; - return attr->name; + return attr->ca_name; } return NULL; } @@ -130,13 
+221,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else + } else { + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + } } } @@ -145,6 +240,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 1a2f6f6a4d91..f920d30478e5 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -38,6 +38,7 @@ struct vfsmount * configfs_mount = NULL; struct super_block * configfs_sb = NULL; +kmem_cache_t *configfs_dir_cachep; static int configfs_mnt_count = 0; static struct super_operations configfs_ops = { @@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = { .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, void *data, int silent) @@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; + sb->s_time_gran = 1; configfs_sb = sb; - inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); if (inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; @@ -128,19 +132,31 @@ static 
decl_subsys(config, NULL, NULL); static int __init configfs_init(void) { - int err; + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL, NULL); + if (!configfs_dir_cachep) + goto out; kset_set_kset_s(&config_subsys, kernel_subsys); err = subsystem_register(&config_subsys); - if (err) - return err; + if (err) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } err = register_filesystem(&configfs_fs_type); if (err) { printk(KERN_ERR "configfs: Unable to register filesystem!\n"); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } +out: return err; } @@ -148,11 +164,13 @@ static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.0.1"); +MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); module_init(configfs_init); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 50f5840521a9..99137026b409 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -277,5 +277,6 @@ struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, .put_link = configfs_put_link, + .setattr = configfs_setattr, }; diff --git a/include/linux/configfs.h b/include/linux/configfs.h index acffb8c9073a..a7f015027535 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -126,7 +126,7 @@ extern struct config_item *config_group_find_obj(struct config_group *, const ch struct configfs_attribute { - char *ca_name; + const char *ca_name; struct module *ca_owner; mode_t ca_mode; }; -- cgit v1.2.3-71-gd317 From 53ea68ecea11bcbb3451c2758ce181bd97b569a9 Mon 
Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 3 Feb 2006 08:21:12 -0500 Subject: [PATCH] SELinux: fix size-128 slab leak Remove private inode tests from security_inode_alloc and security_inode_free, as we otherwise end up leaking inode security structures for private inodes. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Linus Torvalds --- include/linux/security.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index bb1da86747c7..7cbef482e13a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1499,15 +1499,11 @@ static inline void security_sb_post_pivotroot (struct nameidata *old_nd, static inline int security_inode_alloc (struct inode *inode) { - if (unlikely (IS_PRIVATE (inode))) - return 0; return security_ops->inode_alloc_security (inode); } static inline void security_inode_free (struct inode *inode) { - if (unlikely (IS_PRIVATE (inode))) - return; security_ops->inode_free_security (inode); } -- cgit v1.2.3-71-gd317 From 19ea7302df2eb4f2ad7f29af814d8cf55fc8b9c9 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Sat, 4 Feb 2006 02:15:36 -0800 Subject: [NETFILTER]: iptables: fix typos in ipt_connbytes.h Fix some typos that make iptables userspace compilation fail. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netfilter_ipv4/ipt_connbytes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h index b04dfa3083c9..f63e6ee91113 100644 --- a/include/linux/netfilter_ipv4/ipt_connbytes.h +++ b/include/linux/netfilter_ipv4/ipt_connbytes.h @@ -1,10 +1,10 @@ #ifndef _IPT_CONNBYTES_H #define _IPT_CONNBYTES_H -#include +#include #define ipt_connbytes_what xt_connbytes_what -#define IPT_CONNBYTES_PKTS XT_CONNBYTES_PACKETS +#define IPT_CONNBYTES_PKTS XT_CONNBYTES_PKTS #define IPT_CONNBYTES_BYTES XT_CONNBYTES_BYTES #define IPT_CONNBYTES_AVGPKT XT_CONNBYTES_AVGPKT -- cgit v1.2.3-71-gd317 From 0047c65a60fa3b6607b55e058ea6a89f39cb3f28 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 4 Feb 2006 02:19:09 -0800 Subject: [NETFILTER]: Prepare {ipt,ip6t}_policy match for x_tables unification The IPv4 and IPv6 versions of the policy match are identical besides address comparison and the data structure used for userspace communication. Unify the data structures to break compatibility now (before it is released), so we can port it to x_tables in 2.6.17. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/linux/netfilter_ipv4/ipt_policy.h | 22 ++++++++++++++-------- include/linux/netfilter_ipv6/ip6t_policy.h | 22 ++++++++++++++-------- net/ipv4/netfilter/ipt_policy.c | 9 ++++++--- net/ipv6/netfilter/ip6t_policy.c | 4 ++-- 4 files changed, 36 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h index 7fd1bec453f1..a3f6eff39d33 100644 --- a/include/linux/netfilter_ipv4/ipt_policy.h +++ b/include/linux/netfilter_ipv4/ipt_policy.h @@ -27,16 +27,22 @@ struct ipt_policy_spec reqid:1; }; +union ipt_policy_addr +{ + struct in_addr a4; + struct in6_addr a6; +}; + struct ipt_policy_elem { - u_int32_t saddr; - u_int32_t smask; - u_int32_t daddr; - u_int32_t dmask; - u_int32_t spi; - u_int32_t reqid; - u_int8_t proto; - u_int8_t mode; + union ipt_policy_addr saddr; + union ipt_policy_addr smask; + union ipt_policy_addr daddr; + union ipt_policy_addr dmask; + u_int32_t spi; + u_int32_t reqid; + u_int8_t proto; + u_int8_t mode; struct ipt_policy_spec match; struct ipt_policy_spec invert; diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h index 5a93afcd2ff1..671bd818300f 100644 --- a/include/linux/netfilter_ipv6/ip6t_policy.h +++ b/include/linux/netfilter_ipv6/ip6t_policy.h @@ -27,16 +27,22 @@ struct ip6t_policy_spec reqid:1; }; +union ip6t_policy_addr +{ + struct in_addr a4; + struct in6_addr a6; +}; + struct ip6t_policy_elem { - struct in6_addr saddr; - struct in6_addr smask; - struct in6_addr daddr; - struct in6_addr dmask; - u_int32_t spi; - u_int32_t reqid; - u_int8_t proto; - u_int8_t mode; + union ip6t_policy_addr saddr; + union ip6t_policy_addr smask; + union ip6t_policy_addr daddr; + union ip6t_policy_addr dmask; + u_int32_t spi; + u_int32_t reqid; + u_int8_t proto; + u_int8_t mode; struct ip6t_policy_spec match; struct ip6t_policy_spec invert; diff --git a/net/ipv4/netfilter/ipt_policy.c 
b/net/ipv4/netfilter/ipt_policy.c index a48949a3a750..5a7a265280f9 100644 --- a/net/ipv4/netfilter/ipt_policy.c +++ b/net/ipv4/netfilter/ipt_policy.c @@ -26,10 +26,13 @@ MODULE_LICENSE("GPL"); static inline int match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e) { -#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + ((e->x.a4.s_addr == (e->y.a4.s_addr & (z))) \ + ^ e->invert.x)) +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) - return MATCH(saddr, x->props.saddr.a4 & e->smask) && - MATCH(daddr, x->id.daddr.a4 & e->dmask) && + return MATCH_ADDR(saddr, smask, x->props.saddr.a4) && + MATCH_ADDR(daddr, dmask, x->id.daddr.a4) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c index 1d0f48276123..3d39ec924041 100644 --- a/net/ipv6/netfilter/ip6t_policy.c +++ b/net/ipv6/netfilter/ip6t_policy.c @@ -26,8 +26,8 @@ MODULE_LICENSE("GPL"); static inline int match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e) { -#define MATCH_ADDR(x,y,z) (!e->match.x || \ - ((!ip6_masked_addrcmp(&e->x, &e->y, z)) \ +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + ((!ip6_masked_addrcmp(&e->x.a6, &e->y.a6, z)) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) -- cgit v1.2.3-71-gd317 From a460ad62260def15c42130de253d6cfc32528a2f Mon Sep 17 00:00:00 2001 From: Phillip Susi Date: Sat, 4 Feb 2006 23:27:44 -0800 Subject: [PATCH] pktcdvd: Fix overflow for discs with large packets The pktcdvd driver was using an 8 bit field to store the packet length obtained from the disc track info. This causes it to overflow packet length values of 128KB or more. I changed the field to 32 bits to fix this. The pktcdvd driver defaulted to its maximum allowed packet length when it detected a 0 in the track info field. 
I changed this to fail the operation and refuse to access the media. This seems more sane than attempting to access it with a value that almost certainly will not work. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 2 +- include/linux/pktcdvd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 93affeeef7bd..d95e7e1ac355 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1639,7 +1639,7 @@ static int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { printk("pktcdvd: detected zero packet size!\n"); - pd->settings.size = 128; + return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { printk("pktcdvd: packet size is too big\n"); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 2c177e4c8f22..d1c9c4a86e52 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -114,7 +114,7 @@ struct pkt_ctrl_command { struct packet_settings { - __u8 size; /* packet size in (512 byte) sectors */ + __u32 size; /* packet size in (512 byte) sectors */ __u8 fp; /* fixed packets */ __u8 link_loss; /* the rest is specified * as per Mt Fuji */ -- cgit v1.2.3-71-gd317 From e1bc89bc9991e994f2b3c60d9ad2fdb5ad9b10fc Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Sat, 4 Feb 2006 23:27:47 -0800 Subject: [PATCH] pktcdvd: Don't waste kernel memory Allocate memory for read-gathering at open time, when it is known just how much memory is needed. This avoids wasting kernel memory when the real packet size is smaller than the maximum packet size supported by the driver. This is always the case when using DVD discs. 
Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/Kconfig | 4 ++-- drivers/block/pktcdvd.c | 53 ++++++++++++++++++++++++++----------------------- include/linux/pktcdvd.h | 4 ++-- 3 files changed, 32 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index db6818fdf15d..8b1331677407 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -433,8 +433,8 @@ config CDROM_PKTCDVD_BUFFERS This controls the maximum number of active concurrent packets. More concurrent packets can increase write performance, but also require more memory. Each concurrent packet will require approximately 64Kb - of non-swappable kernel memory, memory which will be allocated at - pktsetup time. + of non-swappable kernel memory, memory which will be allocated when + a disc is opened for writing. config CDROM_PKTCDVD_WCACHE bool "Enable write caching (EXPERIMENTAL)" diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index cd16813effc5..4e7dbcc425ff 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -129,7 +129,7 @@ static struct bio *pkt_bio_alloc(int nr_iovecs) /* * Allocate a packet_data struct */ -static struct packet_data *pkt_alloc_packet_data(void) +static struct packet_data *pkt_alloc_packet_data(int frames) { int i; struct packet_data *pkt; @@ -138,11 +138,12 @@ static struct packet_data *pkt_alloc_packet_data(void) if (!pkt) goto no_pkt; - pkt->w_bio = pkt_bio_alloc(PACKET_MAX_SIZE); + pkt->frames = frames; + pkt->w_bio = pkt_bio_alloc(frames); if (!pkt->w_bio) goto no_bio; - for (i = 0; i < PAGES_PER_PACKET; i++) { + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!pkt->pages[i]) goto no_page; @@ -150,7 +151,7 @@ static struct packet_data *pkt_alloc_packet_data(void) spin_lock_init(&pkt->lock); - for (i = 0; i < PACKET_MAX_SIZE; i++) { + for (i = 0; i < frames; i++) { 
struct bio *bio = pkt_bio_alloc(1); if (!bio) goto no_rd_bio; @@ -160,14 +161,14 @@ static struct packet_data *pkt_alloc_packet_data(void) return pkt; no_rd_bio: - for (i = 0; i < PACKET_MAX_SIZE; i++) { + for (i = 0; i < frames; i++) { struct bio *bio = pkt->r_bios[i]; if (bio) bio_put(bio); } no_page: - for (i = 0; i < PAGES_PER_PACKET; i++) + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) if (pkt->pages[i]) __free_page(pkt->pages[i]); bio_put(pkt->w_bio); @@ -184,12 +185,12 @@ static void pkt_free_packet_data(struct packet_data *pkt) { int i; - for (i = 0; i < PACKET_MAX_SIZE; i++) { + for (i = 0; i < pkt->frames; i++) { struct bio *bio = pkt->r_bios[i]; if (bio) bio_put(bio); } - for (i = 0; i < PAGES_PER_PACKET; i++) + for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) __free_page(pkt->pages[i]); bio_put(pkt->w_bio); kfree(pkt); @@ -204,17 +205,17 @@ static void pkt_shrink_pktlist(struct pktcdvd_device *pd) list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { pkt_free_packet_data(pkt); } + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); } static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) { struct packet_data *pkt; - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); - INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); - spin_lock_init(&pd->cdrw.active_list_lock); + BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); + while (nr_packets > 0) { - pkt = pkt_alloc_packet_data(); + pkt = pkt_alloc_packet_data(pd->settings.size >> 2); if (!pkt) { pkt_shrink_pktlist(pd); return 0; @@ -949,7 +950,7 @@ try_next_bio: pd->current_sector = zone + pd->settings.size; pkt->sector = zone; - pkt->frames = pd->settings.size >> 2; + BUG_ON(pkt->frames != pd->settings.size >> 2); pkt->write_size = 0; /* @@ -1985,8 +1986,14 @@ static int pkt_open_dev(struct pktcdvd_device *pd, int write) if ((ret = pkt_set_segment_merging(pd, q))) goto out_unclaim; - if (write) + if (write) { + if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { + printk("pktcdvd: not enough memory 
for buffers\n"); + ret = -ENOMEM; + goto out_unclaim; + } printk("pktcdvd: %lukB available on disc\n", lba << 1); + } return 0; @@ -2012,6 +2019,8 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); bd_release(pd->bdev); blkdev_put(pd->bdev); + + pkt_shrink_pktlist(pd); } static struct pktcdvd_device *pkt_find_dev_from_minor(int dev_minor) @@ -2377,12 +2386,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - printk("pktcdvd: not enough memory for buffers\n"); - ret = -ENOMEM; - goto out_mem; - } - pd->bdev = bdev; set_blocksize(bdev, CD_FRAMESIZE); @@ -2393,7 +2396,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (IS_ERR(pd->cdrw.thread)) { printk("pktcdvd: can't start kernel thread\n"); ret = -ENOMEM; - goto out_thread; + goto out_mem; } proc = create_proc_entry(pd->name, 0, pkt_proc); @@ -2404,8 +2407,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) DPRINTK("pktcdvd: writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); return 0; -out_thread: - pkt_shrink_pktlist(pd); out_mem: blkdev_put(bdev); /* This is safe: open() is still holding a reference. 
*/ @@ -2501,6 +2502,10 @@ static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd) goto out_mem; pd->disk = disk; + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); + INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); + spin_lock_init(&pd->cdrw.active_list_lock); + spin_lock_init(&pd->lock); spin_lock_init(&pd->iosched.lock); sprintf(pd->name, "pktcdvd%d", idx); @@ -2565,8 +2570,6 @@ static int pkt_remove_dev(struct pkt_ctrl_command *ctrl_cmd) blkdev_put(pd->bdev); - pkt_shrink_pktlist(pd); - remove_proc_entry(pd->name, pkt_proc); DPRINTK("pktcdvd: writer %s unmapped\n", pd->name); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index d1c9c4a86e52..1623da88d6fe 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -170,7 +170,7 @@ struct packet_iosched #error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" #endif #define PACKET_MAX_SIZE 32 -#define PAGES_PER_PACKET (PACKET_MAX_SIZE * CD_FRAMESIZE / PAGE_SIZE) +#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) #define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) enum packet_data_state { @@ -219,7 +219,7 @@ struct packet_data atomic_t io_errors; /* Number of read/write errors during IO */ struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ - struct page *pages[PAGES_PER_PACKET]; + struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE]; int cache_valid; /* If non-zero, the data for the zone defined */ /* by the sector variable is completely cached */ -- cgit v1.2.3-71-gd317 From 5c55ac9bbca22ee134408f83de5f2bda3b1b2a53 Mon Sep 17 00:00:00 2001 From: Phillip Susi Date: Sat, 4 Feb 2006 23:27:48 -0800 Subject: [PATCH] pktcdvd: Allow larger packets The pktcdvd driver uses a compile time macro constant to define the maximum supported packet length. I changed this from 32 sectors to 128 sectors because that allows over 100 MB of additional usable space on a 700 MB cdrw, and increases throughput. 
Note that you need a modified cdrwtool program that can format a CDRW disc with larger packets to benefit from this change. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pktcdvd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 1623da88d6fe..8a94c717c266 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -169,7 +169,7 @@ struct packet_iosched #if (PAGE_SIZE % CD_FRAMESIZE) != 0 #error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" #endif -#define PACKET_MAX_SIZE 32 +#define PACKET_MAX_SIZE 128 #define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) #define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) -- cgit v1.2.3-71-gd317 From bc5e483da61eb5ab8d24b4a919fb512e5886d02c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 Feb 2006 23:27:51 -0800 Subject: [PATCH] reiserfs_get_acl() build fix With CONFIG_REISERFS_FS_XATTR=y, CONFIG_REISERFS_FS_POSIX_ACL=n: fs/reiserfs/xattr.c: In function `reiserfs_check_acl': fs/reiserfs/xattr.c:1330: called object is not a function Cc: Chris Mason Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reiserfs_acl.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 0a3605099c44..806ec5b06707 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -58,9 +58,13 @@ extern struct reiserfs_xattr_handler posix_acl_default_handler; extern struct reiserfs_xattr_handler posix_acl_access_handler; #else -#define reiserfs_get_acl NULL #define reiserfs_cache_default_acl(inode) 0 +static inline struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} + static inline int reiserfs_xattr_posix_acl_init(void) { return 0; -- cgit v1.2.3-71-gd317 From 
fe1dcbc4f311c2e6c23b33c0fa8572461618ab3e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 Feb 2006 23:27:54 -0800 Subject: [PATCH] jbd: fix transaction batching Ben points out that: When writing files out using O_SYNC, jbd's 1 jiffy delay results in a significant drop in throughput as the disk sits idle. The patch below results in a 4-5x performance improvement (from 6.5MB/s to ~24-30MB/s on my IDE test box) when writing out files using O_SYNC. So optimise the batching code by omitting it entirely if the process which is doing a sync write is the same as the one which did the most recent sync write. If that's true, we're unlikely to get any other processes joining the transaction. (Has been in -mm for ages - it took me a long time to get on to performance testing it) Numbers, on write-cache-disabled IDE: /usr/bin/time -p synctest -n 10 -uf -t 1 -p 1 dir-name Unpatched: 40 seconds Patched: 35 seconds Batching disabled: 35 seconds This is the problematic single-process-doing-fsync case. With multiple fsyncing processes the numbers are AFAICT unaltered by the patch. Aside: performance testing and instrumentation shows that the transaction batching almost doesn't help (testing with synctest -n 1 -uf -t 100 -p 10 dir-name on non-writeback-caching IDE). This is because by the time one process is running a synchronous commit, a bunch of other processes already have a transaction handle open, so they're all going to batch into the same transaction anyway. The batching seems to offer maybe 5-10% speedup with this workload, but I'm pretty sure it was more important than that when it was first developed 4-odd years ago... Cc: "Stephen C. 
Tweedie" Cc: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/transaction.c | 10 +++++++++- include/linux/jbd.h | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 429f4b263cf1..ca917973c2c0 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int old_handle_count, err; + pid_t pid; J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); @@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + journal->j_last_sync_writer = pid; do { old_handle_count = transaction->t_handle_count; schedule_timeout_uninterruptible(1); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 558cb4c26ec9..751bb3849467 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -23,6 +23,7 @@ #define jfs_debug jbd_debug #else +#include #include #include #include @@ -618,6 +619,7 @@ struct transaction_s * @j_wbuf: array of buffer_heads for journal_commit_transaction * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize + * @j_last_sync_writer: most recent pid which did a synchronous write * @j_private: An opaque pointer to fs-private information. 
*/ @@ -807,6 +809,8 @@ struct journal_s struct buffer_head **j_wbuf; int j_wbufsize; + pid_t j_last_sync_writer; + /* * An opaque pointer to fs-private information. ext3 puts its * superblock pointer here -- cgit v1.2.3-71-gd317 From 21bbd691827e3610ef975a88863859381ac8d8e0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 9 Jan 2006 15:19:18 +1100 Subject: [PATCH] I2C: Resurrect i2c_smbus_write_i2c_block_data. Signed-off-by: Jean Delvare --- drivers/i2c/i2c-core.c | 15 +++++++++++++++ include/linux/i2c.h | 3 +++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 0ce58b506046..1a2c9ab5d9e3 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -946,6 +946,20 @@ s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, u8 *val } } +s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, u8 command, + u8 length, u8 *values) +{ + union i2c_smbus_data data; + + if (length > I2C_SMBUS_BLOCK_MAX) + length = I2C_SMBUS_BLOCK_MAX; + data.block[0] = length; + memcpy(data.block + 1, values, length); + return i2c_smbus_xfer(client->adapter, client->addr, client->flags, + I2C_SMBUS_WRITE, command, + I2C_SMBUS_I2C_BLOCK_DATA, &data); +} + /* Simulate a SMBus command using the i2c protocol No checking of parameters is done! */ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter * adapter, u16 addr, @@ -1150,6 +1164,7 @@ EXPORT_SYMBOL(i2c_smbus_read_word_data); EXPORT_SYMBOL(i2c_smbus_write_word_data); EXPORT_SYMBOL(i2c_smbus_write_block_data); EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data); +EXPORT_SYMBOL(i2c_smbus_write_i2c_block_data); MODULE_AUTHOR("Simon G. 
Vogl "); MODULE_DESCRIPTION("I2C-Bus main module"); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7863a59bd598..63f1d63cc1d8 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -100,6 +100,9 @@ extern s32 i2c_smbus_write_block_data(struct i2c_client * client, /* Returns the number of read bytes */ extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client, u8 command, u8 *values); +extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, + u8 command, u8 length, + u8 *values); /* * A driver is capable of handling one or more physical devices present on -- cgit v1.2.3-71-gd317 From 0dfd812d4b2afc797310943b451608d347854e76 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 7 Feb 2006 06:45:34 -0200 Subject: V4L/DVB (3300): Add standard for South Korean NTSC-M using A2 audio. South Korea uses NTSC-M but with A2 audio instead of BTSC. Several audio chips need this information in order to set the correct audio processing registers. Acked-by: Mauro Carvalho Chehab Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/bttv-driver.c | 2 +- drivers/media/video/cx25840/cx25840-core.c | 50 ++++++++++++------------------ drivers/media/video/tda9887.c | 7 ++++- drivers/media/video/tuner-core.c | 5 +++ include/linux/videodev2.h | 4 ++- 5 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c index aa4c4c521880..578b20085082 100644 --- a/drivers/media/video/bttv-driver.c +++ b/drivers/media/video/bttv-driver.c @@ -214,7 +214,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = { we can capture, of the first and second field. 
*/ .vbistart = { 7,320 }, },{ - .v4l2_id = V4L2_STD_NTSC_M, + .v4l2_id = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR, .name = "NTSC", .Fsc = 28636363, .swidth = 768, diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c index c66c2c1f4809..08ffd1f325fc 100644 --- a/drivers/media/video/cx25840/cx25840-core.c +++ b/drivers/media/video/cx25840/cx25840-core.c @@ -220,33 +220,23 @@ static void input_change(struct i2c_client *client) cx25840_write(client, 0x808, 0xff); cx25840_write(client, 0x80b, 0x10); } else if (std & V4L2_STD_NTSC) { - /* NTSC */ - if (state->pvr150_workaround) { - /* Certain Hauppauge PVR150 models have a hardware bug - that causes audio to drop out. For these models the - audio standard must be set explicitly. - To be precise: it affects cards with tuner models - 85, 99 and 112 (model numbers from tveeprom). */ - if (std == V4L2_STD_NTSC_M_JP) { - /* Japan uses EIAJ audio standard */ - cx25840_write(client, 0x808, 0x2f); - } else { - /* Others use the BTSC audio standard */ - cx25840_write(client, 0x808, 0x1f); - } - /* South Korea uses the A2-M (aka Zweiton M) audio - standard, and should set 0x808 to 0x3f, but I don't - know how to detect this. */ - } else if (std == V4L2_STD_NTSC_M_JP) { + /* Certain Hauppauge PVR150 models have a hardware bug + that causes audio to drop out. For these models the + audio standard must be set explicitly. + To be precise: it affects cards with tuner models + 85, 99 and 112 (model numbers from tveeprom). */ + int hw_fix = state->pvr150_workaround; + + if (std == V4L2_STD_NTSC_M_JP) { /* Japan uses EIAJ audio standard */ - cx25840_write(client, 0x808, 0xf7); + cx25840_write(client, 0x808, hw_fix ? 0x2f : 0xf7); + } else if (std == V4L2_STD_NTSC_M_KR) { + /* South Korea uses A2 audio standard */ + cx25840_write(client, 0x808, hw_fix ? 0x3f : 0xf8); } else { /* Others use the BTSC audio standard */ - cx25840_write(client, 0x808, 0xf6); + cx25840_write(client, 0x808, hw_fix ? 
0x1f : 0xf6); } - /* South Korea uses the A2-M (aka Zweiton M) audio standard, - and should set 0x808 to 0xf8, but I don't know how to - detect this. */ cx25840_write(client, 0x80b, 0x00); } @@ -330,17 +320,17 @@ static int set_v4lstd(struct i2c_client *client, v4l2_std_id std) u8 fmt=0; /* zero is autodetect */ /* First tests should be against specific std */ - if (std & V4L2_STD_NTSC_M_JP) { + if (std == V4L2_STD_NTSC_M_JP) { fmt=0x2; - } else if (std & V4L2_STD_NTSC_443) { + } else if (std == V4L2_STD_NTSC_443) { fmt=0x3; - } else if (std & V4L2_STD_PAL_M) { + } else if (std == V4L2_STD_PAL_M) { fmt=0x5; - } else if (std & V4L2_STD_PAL_N) { + } else if (std == V4L2_STD_PAL_N) { fmt=0x6; - } else if (std & V4L2_STD_PAL_Nc) { + } else if (std == V4L2_STD_PAL_Nc) { fmt=0x7; - } else if (std & V4L2_STD_PAL_60) { + } else if (std == V4L2_STD_PAL_60) { fmt=0x8; } else { /* Then, test against generic ones */ @@ -369,7 +359,7 @@ v4l2_std_id cx25840_get_v4lstd(struct i2c_client * client) } switch (fmt) { - case 0x1: return V4L2_STD_NTSC_M; + case 0x1: return V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR; case 0x2: return V4L2_STD_NTSC_M_JP; case 0x3: return V4L2_STD_NTSC_443; case 0x4: return V4L2_STD_PAL; diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c index 7c71422f5d3f..0d54f6c1982b 100644 --- a/drivers/media/video/tda9887.c +++ b/drivers/media/video/tda9887.c @@ -231,7 +231,7 @@ static struct tvnorm tvnorms[] = { cAudioIF_6_5 | cVideoIF_38_90 ), },{ - .std = V4L2_STD_NTSC_M, + .std = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR, .name = "NTSC-M", .b = ( cNegativeFmTV | cQSS ), @@ -619,6 +619,11 @@ static int tda9887_fixup_std(struct tda9887 *t) tda9887_dbg("insmod fixup: NTSC => NTSC_M_JP\n"); t->std = V4L2_STD_NTSC_M_JP; break; + case 'k': + case 'K': + tda9887_dbg("insmod fixup: NTSC => NTSC_M_KR\n"); + t->std = V4L2_STD_NTSC_M_KR; + break; case '-': /* default parameter, do nothing */ break; diff --git a/drivers/media/video/tuner-core.c 
b/drivers/media/video/tuner-core.c index 873bf3d9679c..e7ee619d62c5 100644 --- a/drivers/media/video/tuner-core.c +++ b/drivers/media/video/tuner-core.c @@ -366,6 +366,11 @@ static int tuner_fixup_std(struct tuner *t) tuner_dbg("insmod fixup: NTSC => NTSC_M_JP\n"); t->std = V4L2_STD_NTSC_M_JP; break; + case 'k': + case 'K': + tuner_dbg("insmod fixup: NTSC => NTSC_M_KR\n"); + t->std = V4L2_STD_NTSC_M_KR; + break; case '-': /* default parameter, do nothing */ break; diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index ce40675324bd..839ccc70698e 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -628,6 +628,7 @@ typedef __u64 v4l2_std_id; #define V4L2_STD_NTSC_M ((v4l2_std_id)0x00001000) #define V4L2_STD_NTSC_M_JP ((v4l2_std_id)0x00002000) #define V4L2_STD_NTSC_443 ((v4l2_std_id)0x00004000) +#define V4L2_STD_NTSC_M_KR ((v4l2_std_id)0x00008000) #define V4L2_STD_SECAM_B ((v4l2_std_id)0x00010000) #define V4L2_STD_SECAM_D ((v4l2_std_id)0x00020000) @@ -660,7 +661,8 @@ typedef __u64 v4l2_std_id; V4L2_STD_PAL_H |\ V4L2_STD_PAL_I) #define V4L2_STD_NTSC (V4L2_STD_NTSC_M |\ - V4L2_STD_NTSC_M_JP) + V4L2_STD_NTSC_M_JP |\ + V4L2_STD_NTSC_M_KR) #define V4L2_STD_SECAM_DK (V4L2_STD_SECAM_D |\ V4L2_STD_SECAM_K |\ V4L2_STD_SECAM_K1) -- cgit v1.2.3-71-gd317 From 46cd2f32baf181b74b16cceb123bab6fe1f61f85 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Feb 2006 12:58:50 -0800 Subject: [PATCH] Fix build failure in recent pm_prepare_* changes. Fix compilation problem in PM headers. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 10 +++++++++- kernel/power/console.c | 4 +++- kernel/power/power.h | 16 ---------------- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 43bcd13eb1ec..37c1c76fd547 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -42,13 +42,21 @@ extern void mark_free_pages(struct zone *zone); #ifdef CONFIG_PM /* kernel/power/swsusp.c */ extern int software_suspend(void); + +#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +extern int pm_prepare_console(void); +extern void pm_restore_console(void); +#else +static inline int pm_prepare_console(void) { return 0; } +static inline void pm_restore_console(void) {} +#endif /* defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) */ #else static inline int software_suspend(void) { printk("Warning: fake suspend called\n"); return -EPERM; } -#endif +#endif /* CONFIG_PM */ #ifdef CONFIG_SUSPEND_SMP extern void disable_nonboot_cpus(void); diff --git a/kernel/power/console.c b/kernel/power/console.c index 579d239d129f..623786d44159 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,7 +9,9 @@ #include #include "power.h" -#ifdef SUSPEND_CONSOLE +#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) + static int orig_fgconsole, orig_kmsg; int pm_prepare_console(void) diff --git a/kernel/power/power.h b/kernel/power/power.h index d8f0d1a76bae..388dba680841 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,14 +1,6 @@ #include #include -/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but - we probably do not take enough locks for switching consoles, etc, - so bad things might happen. 
-*/ -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) -#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) -#endif - struct swsusp_info { struct new_utsname uts; u32 version_code; @@ -42,14 +34,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -#ifdef SUSPEND_CONSOLE -extern int pm_prepare_console(void); -extern void pm_restore_console(void); -#else -static int pm_prepare_console(void) { return 0; } -static void pm_restore_console(void) {} -#endif - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -- cgit v1.2.3-71-gd317 From 8519fb30e438f8088b71a94a7d5a660a814d3872 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 7 Feb 2006 12:58:52 -0800 Subject: [PATCH] mm: compound release fix Compound pages on SMP systems can now often be freed from pagetables via the release_pages path. This uses put_page_testzero which does not handle compound pages at all. Releasing constituent pages from process mappings decrements their count to a large negative number and leaks the reference at the head page - net result is a memory leak. The problem was hidden because the debug check in put_page_testzero itself actually did take compound pages into consideration. Fix the bug and the debug check. 
Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/swap.c | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 85854b867463..75e9f0724997 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -303,7 +303,7 @@ struct page { */ #define put_page_testzero(p) \ ({ \ - BUG_ON(page_count(p) == 0); \ + BUG_ON(atomic_read(&(p)->_count) == -1);\ atomic_add_negative(-1, &(p)->_count); \ }) diff --git a/mm/swap.c b/mm/swap.c index bc2442a7b0ee..76247424dea1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,19 +34,22 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -void put_page(struct page *page) +static void put_compound_page(struct page *page) { - if (unlikely(PageCompound(page))) { - page = (struct page *)page_private(page); - if (put_page_testzero(page)) { - void (*dtor)(struct page *page); + page = (struct page *)page_private(page); + if (put_page_testzero(page)) { + void (*dtor)(struct page *page); - dtor = (void (*)(struct page *))page[1].mapping; - (*dtor)(page); - } - return; + dtor = (void (*)(struct page *))page[1].mapping; + (*dtor)(page); } - if (put_page_testzero(page)) +} + +void put_page(struct page *page) +{ + if (unlikely(PageCompound(page))) + put_compound_page(page); + else if (put_page_testzero(page)) __page_cache_release(page); } EXPORT_SYMBOL(put_page); @@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold) struct page *page = pages[i]; struct zone *pagezone; + if (unlikely(PageCompound(page))) { + if (zone) { + spin_unlock_irq(&zone->lru_lock); + zone = NULL; + } + put_compound_page(page); + continue; + } + if (!put_page_testzero(page)) continue; -- cgit v1.2.3-71-gd317 From 741a295130606143edbf9fc740f633dbc1e6225f Mon Sep 17 00:00:00 2001 From: JANAK DESAI Date: Tue, 7 Feb 2006 
12:59:00 -0800 Subject: [PATCH] unshare system call -v5: unshare namespace If the namespace structure is being shared, allocate a new one and copy information from the current, shared, structure. Signed-off-by: Janak Desai Cc: Al Viro Cc: Christoph Hellwig Cc: Michael Kerrisk Cc: Andi Kleen Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 56 ++++++++++++++++++++++++++++++----------------- include/linux/namespace.h | 1 + kernel/fork.c | 17 +++++++++----- 3 files changed, 48 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index ce97becff461..a2bef5c81033 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1325,27 +1325,17 @@ dput_out: return retval; } -int copy_namespace(int flags, struct task_struct *tsk) +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. + */ +struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { struct namespace *namespace = tsk->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct fs_struct *fs = tsk->fs; struct vfsmount *p, *q; - if (!namespace) - return 0; - - get_namespace(namespace); - - if (!(flags & CLONE_NEWNS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - put_namespace(namespace); - return -EPERM; - } - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); if (!new_ns) goto out; @@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk) } up_write(&namespace_sem); - tsk->namespace = new_ns; - if (rootmnt) mntput(rootmnt); if (pwdmnt) @@ -1405,12 +1393,40 @@ int copy_namespace(int flags, struct task_struct *tsk) if (altrootmnt) mntput(altrootmnt); - put_namespace(namespace); - return 0; +out: + return new_ns; +} + +int copy_namespace(int flags, struct task_struct *tsk) +{ + struct namespace *namespace = tsk->namespace; + struct namespace 
*new_ns; + int err = 0; + + if (!namespace) + return 0; + + get_namespace(namespace); + + if (!(flags & CLONE_NEWNS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = dup_namespace(tsk, tsk->fs); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->namespace = new_ns; out: put_namespace(namespace); - return -ENOMEM; + return err; } asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, diff --git a/include/linux/namespace.h b/include/linux/namespace.h index 6731977c4c13..3abc8e3b4879 100644 --- a/include/linux/namespace.h +++ b/include/linux/namespace.h @@ -15,6 +15,7 @@ struct namespace { extern int copy_namespace(int, struct task_struct *); extern void __put_namespace(struct namespace *namespace); +extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *); static inline void put_namespace(struct namespace *namespace) { diff --git a/kernel/fork.c b/kernel/fork.c index 598e5c27242c..07dd241aa1e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1388,16 +1388,21 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unsharing of namespace for tasks created without CLONE_NEWNS is not - * supported yet + * Unshare the namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp) +static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { struct namespace *ns = current->namespace; if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) - return -EINVAL; + (ns && atomic_read(&ns->count) > 1)) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_nsp = dup_namespace(current, new_fs ? 
new_fs : current->fs); + if (!*new_nsp) + return -ENOMEM; + } return 0; } @@ -1482,7 +1487,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns))) + if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; -- cgit v1.2.3-71-gd317 From 1b8623545b42c03eb92e51b28c84acf4b8ba00a3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Dec 2005 01:07:03 -0500 Subject: [PATCH] remove bogus asm/bug.h includes. A bunch of asm/bug.h includes are both not needed (since it will get pulled anyway) and bogus (since they are done too early). Removed. Signed-off-by: Al Viro --- crypto/scatterwalk.c | 1 - drivers/cdrom/viocd.c | 2 -- drivers/net/hamradio/baycom_par.c | 1 - drivers/tc/tc.c | 1 - drivers/video/backlight/backlight.c | 1 - drivers/video/backlight/lcd.c | 1 - drivers/video/pmag-ba-fb.c | 1 - drivers/video/pmagb-b-fb.c | 1 - fs/reiserfs/hashes.c | 1 - include/asm-mips/io.h | 1 - include/asm-powerpc/dma-mapping.h | 1 - include/linux/cpumask.h | 1 - include/linux/dcache.h | 1 - include/linux/jbd.h | 1 - include/linux/mtd/map.h | 1 - include/linux/nodemask.h | 1 - include/linux/smp.h | 1 - kernel/compat.c | 1 - net/dccp/ccids/lib/tfrc_equation.c | 1 - net/ipv4/xfrm4_policy.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/xfrm6_policy.c | 1 - net/xfrm/xfrm_policy.c | 1 - 23 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c index 47ac90e615f4..2953e2cc56f0 100644 --- a/crypto/scatterwalk.c +++ b/crypto/scatterwalk.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "internal.h" #include "scatterwalk.h" diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 193446e6a08a..e27617259552 100644 --- 
a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -42,8 +42,6 @@ #include #include -#include - #include #include #include diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index 3b1bef1ee215..77411a00d1ee 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -86,7 +86,6 @@ #include #include -#include #include #include diff --git a/drivers/tc/tc.c b/drivers/tc/tc.c index a0e5af638e0e..4a51e56f85b6 100644 --- a/drivers/tc/tc.c +++ b/drivers/tc/tc.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 9d5015e99372..bd39bbd88d41 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -13,7 +13,6 @@ #include #include #include -#include static ssize_t backlight_show_power(struct class_device *cdev, char *buf) { diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index 68c690605aa7..9e32485ee7bb 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -13,7 +13,6 @@ #include #include #include -#include static ssize_t lcd_show_power(struct class_device *cdev, char *buf) { diff --git a/drivers/video/pmag-ba-fb.c b/drivers/video/pmag-ba-fb.c index f3927b6cda9d..f5361cd8ccce 100644 --- a/drivers/video/pmag-ba-fb.c +++ b/drivers/video/pmag-ba-fb.c @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/drivers/video/pmagb-b-fb.c b/drivers/video/pmagb-b-fb.c index 25148de5fe67..eeeac924b500 100644 --- a/drivers/video/pmagb-b-fb.c +++ b/drivers/video/pmagb-b-fb.c @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index a3ec238fd9e0..e664ac16fad9 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -21,7 +21,6 @@ #include #include #include -#include #define DELTA 0x9E3779B9 #define FULLROUNDS 10 /* 32 is 
overkill, 16 is strong crypto */ diff --git a/include/asm-mips/io.h b/include/asm-mips/io.h index d42685747e7d..a9fa1254894a 100644 --- a/include/asm-mips/io.h +++ b/include/asm-mips/io.h @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h index 837756ab7dc7..2ac63f569592 100644 --- a/include/asm-powerpc/dma-mapping.h +++ b/include/asm-powerpc/dma-mapping.h @@ -15,7 +15,6 @@ #include #include #include -#include #define DMA_ERROR_CODE (~(dma_addr_t)0x0) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 13e9f4a3ab26..20b446f26ecd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -84,7 +84,6 @@ #include #include #include -#include typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern cpumask_t _unused_cpumask_arg_; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a3f09947940e..4361f3789975 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -8,7 +8,6 @@ #include #include #include -#include struct nameidata; struct vfsmount; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 751bb3849467..0fe4aa891ddc 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -239,7 +239,6 @@ typedef struct journal_superblock_s #include #include -#include #define JBD_ASSERTIONS #ifdef JBD_ASSERTIONS diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index fedfbc8a287f..7dfd6e1fcde7 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -15,7 +15,6 @@ #include #include #include -#include #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1 #define map_bankwidth(map) 1 diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 4726ef7ba8e8..b959a4525cbd 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -84,7 +84,6 @@ #include #include #include -#include typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t 
_unused_nodemask_arg_; diff --git a/include/linux/smp.h b/include/linux/smp.h index 9dfa3ee769ae..44153fdf73fc 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -17,7 +17,6 @@ extern void cpu_idle(void); #include #include #include -#include /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. diff --git a/kernel/compat.c b/kernel/compat.c index 1867290c37e3..8c9cd88b6785 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,7 +23,6 @@ #include #include -#include int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index d2b5933b4510..add3cae65e2d 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -15,7 +15,6 @@ #include #include -#include #include #include "tfrc.h" diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 42196ba3b0b9..45f7ae58f2c0 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -8,7 +8,6 @@ * */ -#include #include #include #include diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 66f1d12ea578..738376cf0c51 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 69bd957380e7..91cce8b2d7a5 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -11,7 +11,6 @@ * */ -#include #include #include #include diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 077bbf9fb9b7..dbf4620768d6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -13,7 +13,6 @@ * */ -#include #include #include #include -- cgit v1.2.3-71-gd317 From bee14e1f8ae2d5fd3f324e0c8562f791537160b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 07:33:44 -0500 Subject: [PATCH] __user annotations of video_spu_palette Signed-off-by: Al Viro --- include/linux/dvb/video.h | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dvb/video.h b/include/linux/dvb/video.h index b1999bfeaa56..b81e58b2ebf8 100644 --- a/include/linux/dvb/video.h +++ b/include/linux/dvb/video.h @@ -135,7 +135,7 @@ typedef struct video_spu { typedef struct video_spu_palette { /* SPU Palette information */ int length; - uint8_t *palette; + uint8_t __user *palette; } video_spu_palette_t; -- cgit v1.2.3-71-gd317 From 5b1a43d7df65689b4c3b5a1c5c8158f1d4f74fbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 05:24:20 -0500 Subject: [PATCH] drivers/media/video __user annotations and fixes * compat_alloc_user_space() returns __user pointer * copying between two userland areas is copy_in_user(), not copy_from_user() * dereferencing userland pointers is bad * so's get_user() from local variables ... plus usual __user annotations Signed-off-by: Al Viro --- drivers/media/video/compat_ioctl32.c | 89 +++++++++++++++++------------------- include/linux/videodev2.h | 2 +- 2 files changed, 42 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/video/compat_ioctl32.c b/drivers/media/video/compat_ioctl32.c index 297c32ab51e3..840fe0177121 100644 --- a/drivers/media/video/compat_ioctl32.c +++ b/drivers/media/video/compat_ioctl32.c @@ -167,29 +167,32 @@ static int get_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user if (kp->clipcount > 2048) return -EINVAL; if (kp->clipcount) { - struct v4l2_clip32 *uclips = compat_ptr(up->clips); - struct v4l2_clip *kclips; + struct v4l2_clip32 __user *uclips; + struct v4l2_clip __user *kclips; int n = kp->clipcount; + compat_caddr_t p; + if (get_user(p, &up->clips)) + return -EFAULT; + uclips = compat_ptr(p); kclips = compat_alloc_user_space(n * sizeof(struct v4l2_clip)); kp->clips = kclips; while (--n >= 0) { - if (!access_ok(VERIFY_READ, &uclips->c, sizeof(uclips->c)) || - copy_from_user(&kclips->c, &uclips->c, sizeof(uclips->c))) + if 
(copy_in_user(&kclips->c, &uclips->c, sizeof(uclips->c))) + return -EFAULT; + if (put_user(n ? kclips + 1 : NULL, &kclips->next)) return -EFAULT; - kclips->next = n ? kclips + 1 : 0; uclips += 1; kclips += 1; } } else - kp->clips = 0; + kp->clips = NULL; return 0; } static int put_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_window32)) || - copy_to_user(&up->w, &kp->w, sizeof(up->w)) || + if (copy_to_user(&up->w, &kp->w, sizeof(up->w)) || put_user(kp->field, &up->field) || put_user(kp->chromakey, &up->chromakey) || put_user(kp->clipcount, &up->clipcount)) @@ -199,33 +202,29 @@ static int put_v4l2_window32(struct v4l2_window *kp, struct v4l2_window32 __user static inline int get_v4l2_pix_format(struct v4l2_pix_format *kp, struct v4l2_pix_format __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_pix_format)) || - copy_from_user(kp, up, sizeof(struct v4l2_pix_format))) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_pix_format))) + return -EFAULT; return 0; } static inline int put_v4l2_pix_format(struct v4l2_pix_format *kp, struct v4l2_pix_format __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_pix_format)) || - copy_to_user(up, kp, sizeof(struct v4l2_pix_format))) - return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_pix_format))) + return -EFAULT; return 0; } static inline int get_v4l2_vbi_format(struct v4l2_vbi_format *kp, struct v4l2_vbi_format __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_vbi_format)) || - copy_from_user(kp, up, sizeof(struct v4l2_vbi_format))) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_vbi_format))) + return -EFAULT; return 0; } static inline int put_v4l2_vbi_format(struct v4l2_vbi_format *kp, struct v4l2_vbi_format __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_vbi_format)) || - copy_to_user(up, kp, sizeof(struct v4l2_vbi_format))) - 
return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_vbi_format))) + return -EFAULT; return 0; } @@ -279,18 +278,16 @@ static int put_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __user static inline int get_v4l2_standard(struct v4l2_standard *kp, struct v4l2_standard __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_standard)) || - copy_from_user(kp, up, sizeof(struct v4l2_standard))) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_standard))) + return -EFAULT; return 0; } static inline int put_v4l2_standard(struct v4l2_standard *kp, struct v4l2_standard __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_standard)) || - copy_to_user(up, kp, sizeof(struct v4l2_standard))) - return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_standard))) + return -EFAULT; return 0; } @@ -328,18 +325,16 @@ static int put_v4l2_standard32(struct v4l2_standard *kp, struct v4l2_standard32 static inline int get_v4l2_tuner(struct v4l2_tuner *kp, struct v4l2_tuner __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_tuner)) || - copy_from_user(kp, up, sizeof(struct v4l2_tuner))) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_tuner))) + return -EFAULT; return 0; } static inline int put_v4l2_tuner(struct v4l2_tuner *kp, struct v4l2_tuner __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_tuner)) || - copy_to_user(up, kp, sizeof(struct v4l2_tuner))) - return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_tuner))) + return -EFAULT; return 0; } @@ -380,11 +375,13 @@ static int get_v4l2_buffer32(struct v4l2_buffer *kp, struct v4l2_buffer32 __user break; case V4L2_MEMORY_USERPTR: { - unsigned long tmp = (unsigned long)compat_ptr(up->m.userptr); + compat_long_t tmp; - if(get_user(kp->length, &up->length) || - get_user(kp->m.userptr, &tmp)) - return -EFAULT; + if (get_user(kp->length, &up->length) || + get_user(tmp, &up->m.userptr)) + return -EFAULT; + + 
kp->m.userptr = (unsigned long)compat_ptr(tmp); } break; case V4L2_MEMORY_OVERLAY: @@ -468,33 +465,29 @@ static int put_v4l2_framebuffer32(struct v4l2_framebuffer *kp, struct v4l2_frame static inline int get_v4l2_input32(struct v4l2_input *kp, struct v4l2_input __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_input) - 4) || - copy_from_user(kp, up, sizeof(struct v4l2_input) - 4)) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_input) - 4)) + return -EFAULT; return 0; } static inline int put_v4l2_input32(struct v4l2_input *kp, struct v4l2_input __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_input) - 4) || - copy_to_user(up, kp, sizeof(struct v4l2_input) - 4)) - return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_input) - 4)) + return -EFAULT; return 0; } static inline int get_v4l2_input(struct v4l2_input *kp, struct v4l2_input __user *up) { - if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_input)) || - copy_from_user(kp, up, sizeof(struct v4l2_input))) - return -EFAULT; + if (copy_from_user(kp, up, sizeof(struct v4l2_input))) + return -EFAULT; return 0; } static inline int put_v4l2_input(struct v4l2_input *kp, struct v4l2_input __user *up) { - if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_input)) || - copy_to_user(up, kp, sizeof(struct v4l2_input))) - return -EFAULT; + if (copy_to_user(up, kp, sizeof(struct v4l2_input))) + return -EFAULT; return 0; } diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index b23be44cbea8..5208b12d5550 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -549,7 +549,7 @@ struct v4l2_framebuffer struct v4l2_clip { struct v4l2_rect c; - struct v4l2_clip *next; + struct v4l2_clip __user *next; }; struct v4l2_window -- cgit v1.2.3-71-gd317 From d656101009d76000b8fc0998a33d592100334d52 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Feb 2006 05:59:06 -0500 Subject: [PATCH] sn3 iomem annotations and fixes Signed-off-by: Al 
Viro --- drivers/sn/ioc3.c | 18 +++++++++--------- include/linux/ioc3.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c index c70ae81b5d98..12357e1fa558 100644 --- a/drivers/sn/ioc3.c +++ b/drivers/sn/ioc3.c @@ -38,10 +38,10 @@ static inline unsigned mcr_pack(unsigned pulse, unsigned sample) static int nic_wait(struct ioc3_driver_data *idd) { - volatile unsigned mcr; + unsigned mcr; do { - mcr = (volatile unsigned)idd->vma->mcr; + mcr = readl(&idd->vma->mcr); } while (!(mcr & 2)); return mcr & 1; @@ -53,7 +53,7 @@ static int nic_reset(struct ioc3_driver_data *idd) unsigned long flags; local_irq_save(flags); - idd->vma->mcr = mcr_pack(500, 65); + writel(mcr_pack(500, 65), &idd->vma->mcr); presence = nic_wait(idd); local_irq_restore(flags); @@ -68,7 +68,7 @@ static inline int nic_read_bit(struct ioc3_driver_data *idd) unsigned long flags; local_irq_save(flags); - idd->vma->mcr = mcr_pack(6, 13); + writel(mcr_pack(6, 13), &idd->vma->mcr); result = nic_wait(idd); local_irq_restore(flags); @@ -80,9 +80,9 @@ static inline int nic_read_bit(struct ioc3_driver_data *idd) static inline void nic_write_bit(struct ioc3_driver_data *idd, int bit) { if (bit) - idd->vma->mcr = mcr_pack(6, 110); + writel(mcr_pack(6, 110), &idd->vma->mcr); else - idd->vma->mcr = mcr_pack(80, 30); + writel(mcr_pack(80, 30), &idd->vma->mcr); nic_wait(idd); } @@ -337,7 +337,7 @@ static void probe_nic(struct ioc3_driver_data *idd) int save = 0, loops = 3; unsigned long first, addr; - idd->vma->gpcr_s = GPCR_MLAN_EN; + writel(GPCR_MLAN_EN, &idd->vma->gpcr_s); while(loops>0) { idd->nic_part[0] = 0; @@ -408,7 +408,7 @@ static irqreturn_t ioc3_intr_io(int irq, void *arg, struct pt_regs *regs) read_lock_irqsave(&ioc3_submodules_lock, flags); - if(idd->dual_irq && idd->vma->eisr) { + if(idd->dual_irq && readb(&idd->vma->eisr)) { /* send Ethernet IRQ to the driver */ if(ioc3_ethernet && idd->active[ioc3_ethernet->id] && 
ioc3_ethernet->intr) { @@ -682,7 +682,7 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) idd->id = ioc3_counter++; up_write(&ioc3_devices_rwsem); - idd->gpdr_shadow = idd->vma->gpdr; + idd->gpdr_shadow = readl(&idd->vma->gpdr); /* Read IOC3 NIC contents */ probe_nic(idd); diff --git a/include/linux/ioc3.h b/include/linux/ioc3.h index e7906a72a4f1..da7c09e4ede6 100644 --- a/include/linux/ioc3.h +++ b/include/linux/ioc3.h @@ -27,7 +27,7 @@ struct ioc3_driver_data { int id; /* IOC3 sequence number */ /* PCI mapping */ unsigned long pma; /* physical address */ - struct __iomem ioc3 *vma; /* pointer to registers */ + struct ioc3 __iomem *vma; /* pointer to registers */ struct pci_dev *pdev; /* PCI device */ /* IRQ stuff */ int dual_irq; /* set if separate IRQs are used */ -- cgit v1.2.3-71-gd317 From 30e9656cc340035e102fea46e1908689494b042d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2006 01:01:31 -0800 Subject: [PATCH] block: implement elv_insert and use it (fix ordcolor flipping bug) q->ordcolor must only be flipped on initial queueing of a hardbarrier request. Constructing ordered sequence and requeueing used to pass through __elv_add_request() which flips q->ordcolor when it sees a barrier request. This patch separates out elv_insert() from __elv_add_request() and uses elv_insert() when constructing ordered sequence and requeueing. elv_insert() inserts the given request at the specified position and does nothing else. 
Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/elevator.c | 70 ++++++++++++++++++++++++++---------------------- block/ll_rw_blk.c | 4 +-- include/linux/elevator.h | 1 + 3 files changed, 41 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/elevator.c b/block/elevator.c index 2fc269f69726..24b702d649a9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -293,7 +293,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq) rq->flags &= ~REQ_STARTED; - __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0); + elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); } static void elv_drain_elevator(request_queue_t *q) @@ -310,41 +310,11 @@ static void elv_drain_elevator(request_queue_t *q) } } -void __elv_add_request(request_queue_t *q, struct request *rq, int where, - int plug) +void elv_insert(request_queue_t *q, struct request *rq, int where) { struct list_head *pos; unsigned ordseq; - if (q->ordcolor) - rq->flags |= REQ_ORDERED_COLOR; - - if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { - /* - * toggle ordered color - */ - if (blk_barrier_rq(rq)) - q->ordcolor ^= 1; - - /* - * barriers implicitly indicate back insertion - */ - if (where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - /* - * this request is scheduling boundary, update end_sector - */ - if (blk_fs_request(rq)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - if (plug) - blk_plug_device(q); - rq->q = q; switch (where) { @@ -425,6 +395,42 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where, } } +void __elv_add_request(request_queue_t *q, struct request *rq, int where, + int plug) +{ + if (q->ordcolor) + rq->flags |= REQ_ORDERED_COLOR; + + if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + /* + * toggle ordered color + */ + if 
(blk_barrier_rq(rq)) + q->ordcolor ^= 1; + + /* + * barriers implicitly indicate back insertion + */ + if (where == ELEVATOR_INSERT_SORT) + where = ELEVATOR_INSERT_BACK; + + /* + * this request is scheduling boundary, update + * end_sector + */ + if (blk_fs_request(rq)) { + q->end_sector = rq_end_sector(rq); + q->boundary_rq = rq; + } + } else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) + where = ELEVATOR_INSERT_BACK; + + if (plug) + blk_plug_device(q); + + elv_insert(q, rq, where); +} + void elv_add_request(request_queue_t *q, struct request *rq, int where, int plug) { diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index ee5ed98db4cd..03d9c82b0fe7 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -454,7 +454,7 @@ static void queue_flush(request_queue_t *q, unsigned which) rq->end_io = end_io; q->prepare_flush_fn(q, rq); - __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); } static inline struct request *start_ordered(request_queue_t *q, @@ -490,7 +490,7 @@ static inline struct request *start_ordered(request_queue_t *q, else q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; - __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0); + elv_insert(q, rq, ELEVATOR_INSERT_FRONT); if (q->ordered & QUEUE_ORDERED_PREFLUSH) { queue_flush(q, QUEUE_ORDERED_PREFLUSH); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 23fe746a1d51..18cf1f3e1184 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -82,6 +82,7 @@ struct elevator_queue extern void elv_dispatch_sort(request_queue_t *, struct request *); extern void elv_add_request(request_queue_t *, struct request *, int, int); extern void __elv_add_request(request_queue_t *, struct request *, int, int); +extern void elv_insert(request_queue_t *, struct request *, int); extern int elv_merge(request_queue_t *, struct request **, struct bio *); extern void elv_merge_requests(request_queue_t *, struct request *, struct request *); -- 
cgit v1.2.3-71-gd317 From 85d1494e5ff8e20a52ce514584ffda4f0265025e Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Wed, 8 Feb 2006 21:46:24 +0000 Subject: [SERIAL] 8250_pci: add new PCI serial card support This patch adds new PCI serial card support. Signed-off-by: Yoichi Yuasa Signed-off-by: Russell King --- drivers/serial/8250_pci.c | 4 ++++ include/linux/pci_ids.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c index bb9ec28ccc2b..94886c000d2a 100644 --- a/drivers/serial/8250_pci.c +++ b/drivers/serial/8250_pci.c @@ -1882,6 +1882,10 @@ static struct pci_device_id serial_pci_tbl[] = { PCI_SUBVENDOR_ID_CONNECT_TECH, PCI_SUBDEVICE_ID_CONNECT_TECH_TITAN_4, 0, 0, pbn_b0_4_1843200 }, + { PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954, + PCI_VENDOR_ID_AFAVLAB, + PCI_SUBDEVICE_ID_AFAVLAB_P061, 0, 0, + pbn_b0_4_1152000 }, { PCI_VENDOR_ID_EXAR, PCI_DEVICE_ID_EXAR_XR17C152, PCI_SUBVENDOR_ID_CONNECT_TECH, PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_2_232, 0, 0, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 92a619ba163f..7a61ccdcbc4b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1832,6 +1832,7 @@ #define PCI_VENDOR_ID_AFAVLAB 0x14db #define PCI_DEVICE_ID_AFAVLAB_P028 0x2180 #define PCI_DEVICE_ID_AFAVLAB_P030 0x2182 +#define PCI_SUBDEVICE_ID_AFAVLAB_P061 0x2150 #define PCI_VENDOR_ID_BROADCOM 0x14e4 #define PCI_DEVICE_ID_TIGON3_5752 0x1600 -- cgit v1.2.3-71-gd317 From 9ac95f2f90e022c16d293d7978faddf7e779a1a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Feb 2006 22:41:50 +0300 Subject: [PATCH] do_sigaction: cleanup ->sa_mask manipulation Clear unblockable signals beforehand. 
Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/signal.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0cfcd1c7865e..9c1da0269a18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1098,7 +1098,7 @@ extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); -extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); /* These can be the second arg to send_sig_info/send_group_sig_info. */ diff --git a/kernel/signal.c b/kernel/signal.c index 01a1e7f7acf7..ea154104a00b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2430,7 +2430,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) } int -do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) +do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; sigset_t mask; @@ -2454,6 +2454,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) *oact = *k; if (act) { + sigdelsetmask(&act->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is @@ -2479,8 +2481,6 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) read_lock(&tasklist_lock); spin_lock_irq(&t->sighand->siglock); *k = *act; - sigdelsetmask(&k->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); @@ -2495,8 +2495,6 @@ 
do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) } *k = *act; - sigdelsetmask(&k->sa.sa_mask, - sigmask(SIGKILL) | sigmask(SIGSTOP)); } spin_unlock_irq(&current->sighand->siglock); -- cgit v1.2.3-71-gd317 From a70ea994a0d83fd0151a070be72b87d014ef0a7e Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Thu, 9 Feb 2006 16:40:11 -0800 Subject: [NETLINK]: Fix a severe bug netlink overrun was broken while improvement of netlink. Destination socket is used in the place where it was meant to be source socket, so that now overrun is never sent to user netlink sockets, when it should be, and it even can be set on kernel socket, which results in complete deadlock of rtnetlink. Suggested fix is to restore status quo passing source socket as additional argument to netlink_attachskb(). A little explanation: overrun is set on a socket, when it failed to receive some message and sender of this messages does not or even have no way to handle this error. This happens in two cases: 1. when kernel sends something. Kernel never retransmits and cannot wait for buffer space. 2. when user sends a broadcast and the message was not delivered to some recipients. Signed-off-by: Alexey Kuznetsov Signed-off-by: David S.
Miller --- include/linux/netlink.h | 3 ++- ipc/mqueue.c | 3 ++- net/netlink/af_netlink.c | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6a2ccf78a356..c256ebe2a7b4 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -160,7 +160,8 @@ extern int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfilp(struct file *filp); -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo); +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 59302fc3643b..fd2e26b6f966 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1018,7 +1018,8 @@ retry: goto out; } - ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT); + ret = netlink_attachskb(sock, nc, 0, + MAX_SCHEDULE_TIMEOUT, NULL); if (ret == 1) goto retry; if (ret) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2101b45d2ec6..6b9772d95872 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -702,7 +702,8 @@ struct sock *netlink_getsockbyfilp(struct file *filp) * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. 
*/ -int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long timeo, struct sock *ssk) { struct netlink_sock *nlk; @@ -712,7 +713,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long t test_bit(0, &nlk->state)) { DECLARE_WAITQUEUE(wait, current); if (!timeo) { - if (!nlk->pid) + if (!ssk || nlk_sk(ssk)->pid == 0) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); @@ -797,7 +798,7 @@ retry: kfree_skb(skb); return PTR_ERR(sk); } - err = netlink_attachskb(sk, skb, nonblock, timeo); + err = netlink_attachskb(sk, skb, nonblock, timeo, ssk); if (err == 1) goto retry; if (err) -- cgit v1.2.3-71-gd317 From 9c15e852a524d55ab768cf48c97f5c684f876af2 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Fri, 10 Feb 2006 01:51:05 -0800 Subject: [PATCH] kexec: fix in free initrd when overlapped with crashkernel region It is possible that the reserved crashkernel region can be overlapped with initrd since the bootloader sets the initrd location. When the initrd region is freed, the second kernel memory will not be contiguous. The Kexec_load can cause an oops since there is no contiguous memory to write the second kernel or this memory could be used in the first kernel itself and may not be part of the dump. For example, on powerpc, the initrd is located at 36MB and the crashkernel starts at 32MB. The kexec_load caused panic since writing into non-allocated memory (after 36MB). We could see the similar issue even on other archs. One possibility is to move the initrd outside of crashkernel region. But, the initrd region will be freed anyway before the system is up. This patch fixes this issue and frees only regions that are not part of crashkernel memory in case overlaps. Signed-off-by: Haren Myneni Acked-by: "Eric W. 
Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 1 + init/initramfs.c | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a311f58c8a7c..cfb3410e32b1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -6,6 +6,7 @@ #include #include #include +#include #include /* Verify architecture specific macros are defined */ diff --git a/init/initramfs.c b/init/initramfs.c index 0c5d9a3f951b..637344b05981 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -466,10 +466,32 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) extern char __initramfs_start[], __initramfs_end[]; #ifdef CONFIG_BLK_DEV_INITRD #include +#include static void __init free_initrd(void) { - free_initrd_mem(initrd_start, initrd_end); +#ifdef CONFIG_KEXEC + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); + + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start < crashk_end && initrd_end > crashk_start) { + /* + * Initialize initrd memory region since the kexec boot does + * not do. 
+ */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + } else +#endif + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; initrd_end = 0; } -- cgit v1.2.3-71-gd317 From 7a8ef1cb774e5438d292365626f9b96616283706 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 10 Feb 2006 01:51:08 -0800 Subject: [PATCH] x86: don't initialise cpu_possible_map to all ones Initialising cpu_possible_map to all-ones with CONFIG_HOTPLUG_CPU means that a) All for_each_cpu() loops will iterate across all NR_CPUS CPUs, rather than over possible ones. That can be quite expensive. b) Soon we'll be allocating per-cpu areas only for possible CPUs. So with CPU_MASK_ALL, we'll be wasting memory. I also switched voyager over to not use CPU_MASK_ALL in the non-CPU-hotplug case. Should be OK.. I note that parisc is also using CPU_MASK_ALL. Suggest that it stop doing that. 
Cc: James Bottomley Cc: Kyle McMartin Cc: Paul Jackson Cc: Ashok Raj Cc: Zwane Mwaikambo Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smpboot.c | 4 ---- arch/i386/mach-voyager/voyager_smp.c | 2 +- include/linux/cpumask.h | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 255adb498268..fb00ab7b7612 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -87,11 +87,7 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); -#ifdef CONFIG_HOTPLUG_CPU -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#else cpumask_t cpu_possible_map; -#endif EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index 72a1b9cae2e4..6e4c3baef6cc 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -240,7 +240,7 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_callout_map); -cpumask_t cpu_possible_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 20b446f26ecd..60e56c6e03dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -328,7 +328,7 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp, * bitmap of size NR_CPUS. 
* * #ifdef CONFIG_HOTPLUG_CPU - * cpu_possible_map - all NR_CPUS bits set + * cpu_possible_map - has bit 'cpu' set iff cpu is populatable * cpu_present_map - has bit 'cpu' set iff cpu is populated * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler * #else -- cgit v1.2.3-71-gd317 From 8977d929e49021d9a6e031310aab01fa72f849c2 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Fri, 10 Feb 2006 01:51:14 -0800 Subject: [PATCH] tty buffering stall fix Prevent stalled processing of received data when a driver allocates tty buffer space but does not immediately follow the allocation with more data and a call to schedule receive tty processing. (example: hvc_console) This bug was introduced by the first locking patch for the new tty buffering. Signed-off-by: Paul Fulghum Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 30 ++++++++++++++++++++++-------- include/linux/kbd_kern.h | 4 +++- include/linux/tty.h | 2 ++ include/linux/tty_flip.h | 4 +++- 4 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 076e07c1da38..a23816d3e9a1 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -268,6 +268,8 @@ static struct tty_buffer *tty_buffer_alloc(size_t size) p->size = size; p->next = NULL; p->active = 0; + p->commit = 0; + p->read = 0; p->char_buf_ptr = (char *)(p->data); p->flag_buf_ptr = (unsigned char *)p->char_buf_ptr + size; /* printk("Flip create %p\n", p); */ @@ -298,6 +300,8 @@ static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size) *tbh = t->next; t->next = NULL; t->used = 0; + t->commit = 0; + t->read = 0; /* DEBUG ONLY */ memset(t->data, '*', size); /* printk("Flip recycle %p\n", t); */ @@ -335,6 +339,7 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size) if (b != NULL) { b->next = n; b->active = 0; + b->commit = b->used; } else tty->buf.head = n; tty->buf.tail = n; 
@@ -2752,6 +2757,9 @@ static void flush_to_ldisc(void *private_) unsigned long flags; struct tty_ldisc *disc; struct tty_buffer *tbuf; + int count; + char *char_buf; + unsigned char *flag_buf; disc = tty_ldisc_ref(tty); if (disc == NULL) /* !TTY_LDISC */ @@ -2765,16 +2773,20 @@ static void flush_to_ldisc(void *private_) goto out; } spin_lock_irqsave(&tty->buf.lock, flags); - while((tbuf = tty->buf.head) != NULL && !tbuf->active) { + while((tbuf = tty->buf.head) != NULL) { + while ((count = tbuf->commit - tbuf->read) != 0) { + char_buf = tbuf->char_buf_ptr + tbuf->read; + flag_buf = tbuf->flag_buf_ptr + tbuf->read; + tbuf->read += count; + spin_unlock_irqrestore(&tty->buf.lock, flags); + disc->receive_buf(tty, char_buf, flag_buf, count); + spin_lock_irqsave(&tty->buf.lock, flags); + } + if (tbuf->active) + break; tty->buf.head = tbuf->next; if (tty->buf.head == NULL) tty->buf.tail = NULL; - spin_unlock_irqrestore(&tty->buf.lock, flags); - /* printk("Process buffer %p for %d\n", tbuf, tbuf->used); */ - disc->receive_buf(tty, tbuf->char_buf_ptr, - tbuf->flag_buf_ptr, - tbuf->used); - spin_lock_irqsave(&tty->buf.lock, flags); tty_buffer_free(tty, tbuf); } spin_unlock_irqrestore(&tty->buf.lock, flags); @@ -2871,8 +2883,10 @@ void tty_flip_buffer_push(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->buf.lock, flags); - if (tty->buf.tail != NULL) + if (tty->buf.tail != NULL) { tty->buf.tail->active = 0; + tty->buf.tail->commit = tty->buf.tail->used; + } spin_unlock_irqrestore(&tty->buf.lock, flags); if (tty->low_latency) diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index 3aed37314ab8..e87c32a5c86a 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -153,8 +153,10 @@ static inline void con_schedule_flip(struct tty_struct *t) { unsigned long flags; spin_lock_irqsave(&t->buf.lock, flags); - if (t->buf.tail != NULL) + if (t->buf.tail != NULL) { t->buf.tail->active = 0; + t->buf.tail->commit = t->buf.tail->used; + } 
spin_unlock_irqrestore(&t->buf.lock, flags); schedule_work(&t->buf.work); } diff --git a/include/linux/tty.h b/include/linux/tty.h index a7bd3b4558d2..f45cd74e6f24 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -58,6 +58,8 @@ struct tty_buffer { int used; int size; int active; + int commit; + int read; /* Data points here */ unsigned long data[0]; }; diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 82961eb19888..222faf97d5f9 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -29,8 +29,10 @@ _INLINE_ void tty_schedule_flip(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->buf.lock, flags); - if (tty->buf.tail != NULL) + if (tty->buf.tail != NULL) { tty->buf.tail->active = 0; + tty->buf.tail->commit = tty->buf.tail->used; + } spin_unlock_irqrestore(&tty->buf.lock, flags); schedule_delayed_work(&tty->buf.work, 1); } -- cgit v1.2.3-71-gd317 From cff2b760096d1e6feaa31948e7af4abbefe47822 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 11 Feb 2006 17:55:47 -0800 Subject: [PATCH] fstatat64 support The *at patches introduced fstatat and, due to insufficient research, I used the newfstat functions generally as the guideline. The result is that on 32-bit platforms we don't have all the information needed to implement fstatat64. This patch modifies the code to pass up 64-bit information if __ARCH_WANT_STAT64 is defined. I renamed the syscall entry point to make this clear. Other archs will continue to use the existing code. On x86-64 the compat code is implemented using a new sys32_ function. This is what is done for the other stat syscalls as well. This patch might break some other archs (those which define __ARCH_WANT_STAT64 and which already wired up the syscall). Yet others might need changes to accommodate the compatibility mode. I really don't want to do that work because all this stat handling is a mess (more so in glibc, but the kernel is also affected).
It should be done by the arch maintainers. I'll provide some stand-alone test shortly. Those who are eager could compile glibc and run 'make check' (no installation needed). The patch below has been tested on x86 and x86-64. Signed-off-by: Ulrich Drepper Cc: Christoph Hellwig Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 2 +- arch/x86_64/ia32/ia32entry.S | 2 +- arch/x86_64/ia32/sys_ia32.c | 22 ++++++++++++++++++++++ fs/stat.c | 22 ++++++++++++++++++++++ include/asm-i386/unistd.h | 2 +- include/asm-x86_64/ia32_unistd.h | 2 +- include/linux/syscalls.h | 2 ++ 7 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 5a8b3fb6d27b..ac687d00a1ce 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -299,7 +299,7 @@ ENTRY(sys_call_table) .long sys_mknodat .long sys_fchownat .long sys_futimesat - .long sys_newfstatat /* 300 */ + .long sys_fstatat64 /* 300 */ .long sys_unlinkat .long sys_renameat .long sys_linkat diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index ada4535d0161..00dee176c08e 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -677,7 +677,7 @@ ia32_sys_call_table: .quad sys_mknodat .quad sys_fchownat .quad compat_sys_futimesat - .quad compat_sys_newfstatat /* 300 */ + .quad sys32_fstatat /* 300 */ .quad sys_unlinkat .quad sys_renameat .quad sys_linkat diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 54481af5344a..2bc55af95419 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -180,6 +180,28 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } +asmlinkage long +sys32_fstatat(unsigned int dfd, char __user *filename, + struct stat64 __user* statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & 
~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_stat64(statbuf, &stat); + +out: + return error; +} + /* * Linux/i386 didn't use to be able to handle more than * 4 system call parameters, so these system calls used a memory diff --git a/fs/stat.c b/fs/stat.c index 24211b030f39..9948cc1685a4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } +#ifndef __ARCH_WANT_STAT64 asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { @@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename, out: return error; } +#endif asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) { @@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf) return error; } +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + +out: + return error; +} #endif /* __ARCH_WANT_STAT64 */ void inode_add_bytes(struct inode *inode, loff_t bytes) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index cf6f2cd9c514..dc81a55dd94d 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -305,7 +305,7 @@ #define __NR_mknodat 297 #define __NR_fchownat 298 #define __NR_futimesat 299 -#define __NR_newfstatat 300 +#define __NR_fstatat64 300 #define __NR_unlinkat 301 #define __NR_renameat 302 #define __NR_linkat 303 diff --git 
a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h index 20468983d453..eeb2bcd635de 100644 --- a/include/asm-x86_64/ia32_unistd.h +++ b/include/asm-x86_64/ia32_unistd.h @@ -305,7 +305,7 @@ #define __NR_ia32_mknodat 297 #define __NR_ia32_fchownat 298 #define __NR_ia32_futimesat 299 -#define __NR_ia32_newfstatat 300 +#define __NR_ia32_fstatat64 300 #define __NR_ia32_unlinkat 301 #define __NR_ia32_renameat 302 #define __NR_ia32_linkat 303 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3877209d23c3..d73501ba7e44 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -557,6 +557,8 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode); asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag); +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, -- cgit v1.2.3-71-gd317 From 643a654540579b0dcc7a206a4a7475276a41aff0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 11 Feb 2006 17:55:52 -0800 Subject: [PATCH] select: fix returned timeval With David Woodhouse select() presently has a habit of increasing the value of the user's `timeout' argument on return. We were writing back a timeout larger than the original. We _deliberately_ round up, since we know we must wait at _least_ as long as the caller asks us to. The patch adds a couple of helper functions for magnitude comparison of timespecs and of timevals, and uses them to prevent the various poll and select functions from returning a timeout which is larger than the one which was passed in. The patch also fixes a bug in compat_sys_pselect7(): it was adding the new timeout value to the old one and was returning that. 
It should just return the new timeout value. (We have various handy timespec/timeval-to-from-nsec conversion functions in time.h. But this code open-codes it all). Cc: "David S. Miller" Cc: Andi Kleen Cc: Ulrich Drepper Cc: Thomas Gleixner Cc: george anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 37 +++++++++++++++++++++++++------------ fs/select.c | 32 +++++++++++++++++++++++--------- include/linux/compat.h | 20 ++++++++++++++++++++ include/linux/time.h | 25 ++++++++++++++++++++++++- 4 files changed, 92 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 70c5af4cc270..a2ba78bdf7f7 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct compat_timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (compat_timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); if (tsp && !(current->personality & STICKY_TIMEOUTS)) { - ts.tv_sec += timeout / HZ; - ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; + struct compat_timespec rts; + + rts.tv_sec = timeout / HZ; + rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); + if (rts.tv_nsec >= NSEC_PER_SEC) { + rts.tv_sec++; + rts.tv_nsec -= NSEC_PER_SEC; } - (void)copy_to_user(tsp, &ts, sizeof(ts)); + if 
(compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + copy_to_user(tsp, &rts, sizeof(rts)); } if (ret == -ERESTARTNOHAND) { @@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (compat_timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/select.c b/fs/select.c index bc60a3e14ef3..6ce68a9c8976 100644 --- a/fs/select.c +++ b/fs/select.c @@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (timeval_compare(&rtv, &tv) < 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tsp) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = 
jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only @@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) < 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/include/linux/compat.h b/include/linux/compat.h index f9ca534787e2..c9ab2a26348c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -161,5 +161,25 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +static inline int compat_timeval_compare(struct compat_timeval *lhs, + struct compat_timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + +static inline int compat_timespec_compare(struct compat_timespec *lhs, + struct compat_timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 7b4dc36532bb..d9cdba54b789 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -33,11 
+33,34 @@ struct timezone { #define NSEC_PER_SEC 1000000000L #define NSEC_PER_USEC 1000L -static __inline__ int timespec_equal(struct timespec *a, struct timespec *b) +static inline int timespec_equal(struct timespec *a, struct timespec *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +/* + * lhs < rhs: return <0 + * lhs == rhs: return 0 + * lhs > rhs: return >0 + */ +static inline int timespec_compare(struct timespec *lhs, struct timespec *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_nsec - rhs->tv_nsec; +} + +static inline int timeval_compare(struct timeval *lhs, struct timeval *rhs) +{ + if (lhs->tv_sec < rhs->tv_sec) + return -1; + if (lhs->tv_sec > rhs->tv_sec) + return 1; + return lhs->tv_usec - rhs->tv_usec; +} + extern unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -- cgit v1.2.3-71-gd317 From bc7fc0601b3eb2254f080492f3fd69e319ed32d0 Mon Sep 17 00:00:00 2001 From: "Antonino A. 
Daplas" Date: Sat, 11 Feb 2006 17:56:07 -0800 Subject: [PATCH] nvidiafb: Add support for Geforce4 MX 4000 Add support for Geforce4 MX 4000 (0x185) Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/nvidia/nvidia.c | 2 ++ include/linux/pci_ids.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index dbcb8962e57d..a7c4e5e8ead6 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -138,6 +138,8 @@ static struct pci_device_id nvidiafb_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 7a61ccdcbc4b..82b83da25d77 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1087,6 +1087,7 @@ #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440_8X 0x0181 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440SE_8X 0x0182 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X 0x0183 +#define PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000 0x0185 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO 0x0186 #define PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO 0x0187 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_580_XGL 0x0188 -- cgit v1.2.3-71-gd317 From 7c8903f6373f9abecf060bad53ca36bc4ac037f2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 14 Feb 2006 13:53:03 -0800 Subject: [PATCH] jbd: revert checkpoint list changes This patch reverts commit f93ea411b73594f7d144855fd34278bcf34a9afc: [PATCH] jbd: split checkpoint lists This broke journal_flush() for OCFS2, which is its method of being sure that metadata is sent to disk for another 
node. And two related commits 8d3c7fce2d20ecc3264c8d8c91ae3beacdeaed1b and 43c3e6f5abdf6acac9b90c86bf03f995bf7d3d92 with the subjects: [PATCH] jbd: log_do_checkpoint fix [PATCH] jbd: remove_transaction fix These seem to be incremental bugfixes on the original patch and as such are no longer needed. Signed-off-by: Mark Fasheh Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++------------------------------ fs/jbd/commit.c | 3 +- include/linux/jbd.h | 8 +- 3 files changed, 179 insertions(+), 250 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e6265a0b56b8..543ed543d1e5 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,75 +24,29 @@ #include /* - * Unlink a buffer from a transaction checkpoint list. + * Unlink a buffer from a transaction. * * Called with j_list_lock held. */ -static void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { + if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. 
- */ - -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ - -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * + * Returns 1 if we released it. * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; + __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); + ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. 
Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. */ - -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh; + struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; - tid_t this_tid; - int released = 0; - - this_tid = transaction->t_tid; -restart: - /* Didn't somebody clean up the transaction in the meanwhile */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; + int ret = 0; + + assert_spin_locked(&journal->j_list_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; + goto out_return_1; } + /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list + * This is foul */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - } + if (!jbd_trylock_bh_state(bh)) { 
+ jbd_sync_bh(journal, bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * AKPM: I think the buffer_jbddirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + jbd_unlock_bh_state(bh); + } + } while (jh != last_jh); + + return ret; +out_return_1: + spin_lock(&journal->j_list_lock); + return 1; } #define NR_BATCH 64 @@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); + spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held and drops it if 1 is returned + * Called with j_list_lock held. 
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) +static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - put_bh(bh); - ret = 1; - } - else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } - else if (!buffer_dirty(bh)) { - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - ret = 1; - } - else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; - __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. 
+ */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } } return ret; } /* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. __log_wait_for_space() will retry if we didn't free enough. * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; int result; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. */ - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. 
*/ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0; - - while (!retry && transaction->t_checkpoint_list) { + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions) { + transaction_t *transaction; + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { struct buffer_head *bh; - jh = transaction->t_checkpoint_list; + jh = next_jh; + next_jh = jh->b_cpnext; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, - &batch_count); - if (!retry && - lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); + retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); + if (cond_resched_lock(&journal->j_list_lock)) { retry = 1; break; } - } + } while (jh != last_jh && !retry); if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } __flush_batch(journal, bhs, &batch_count); + retry = 1; } - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one. 
+ * If someone cleaned up this transaction while we slept, we're + * done + */ + if (journal->j_checkpoint_transactions != transaction) + break; + if (retry) + continue; + /* + * Maybe it's a new transaction, but it fell at the same + * address */ - __wait_cp_io(journal, transaction); + if (transaction->t_tid != this_tid) + continue; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + if (journal->j_checkpoint_transactions != transaction) + break; } -out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; + return 0; } @@ -471,53 +455,6 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. 
Bail out if - * preemption requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - /* * journal_clean_checkpoint_list * @@ -525,38 +462,46 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns number of bufers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0, released; + int ret = 0; transaction = journal->j_checkpoint_transactions; - if (!transaction) + if (transaction == 0) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { + struct journal_head *jh; + transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranknig */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. 
Bail out if preemption + * requested: + */ + if (need_resched()) + goto out; + } while (jh != last_jh); + } } while (transaction != last_transaction); out: return ret; @@ -571,22 +516,18 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is + * list until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. + * checkpoint list. * * This function is called with the journal locked. * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -int __journal_remove_checkpoint(struct journal_head *jh) +void __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) + if (transaction->t_checkpoint_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... 
*/ wake_up(&journal->j_wait_logspace); - ret = 1; out: JBUFFER_TRACE(jh, "exit"); - return ret; } /* @@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 29e62d98bae6..002ad2bbc769 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -829,8 +829,7 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 0fe4aa891ddc..41ee79962bb2 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -497,12 +497,6 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; - /* - * Doubly-linked circular list of all buffers submitted for IO while - * checkpointing. 
[j_list_lock] - */ - struct journal_head *t_checkpoint_io_list; - /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] @@ -852,7 +846,7 @@ extern void journal_commit_transaction(journal_t *); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); -int __journal_remove_checkpoint(struct journal_head *); +void __journal_remove_checkpoint(struct journal_head *); void __journal_insert_checkpoint(struct journal_head *, transaction_t *); /* Buffer IO */ -- cgit v1.2.3-71-gd317 From 5ac5f9d1ce8492163dbde5d357dc5d03becf7e36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2006 13:53:04 -0800 Subject: [PATCH] NLM: Fix the NLM_GRANTED callback checks If 2 threads attached to the same process are blocking on different locks on different files (maybe even on different servers) but have the same lock arguments (i.e. same offset+length - actually quite common, since most processes try to lock the entire file) then the first GRANTED call that wakes one up will also wake the other. Currently when the NLM_GRANTED callback comes in, lockd walks the list of blocked locks in search of a match to the lock that the NLM server has granted. Although it checks the lock pid, start and end, it fails to check the filehandle and the server address. By checking the filehandle and server IP address, we ensure that this only happens if the locks truly are referencing the same file. 
Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntlock.c | 27 +++++++++++++++++---------- fs/lockd/svc4proc.c | 2 +- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 6 +++--- 4 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 3eaf6e701087..da6354baa0b8 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -u32 -nlmclnt_grant(struct nlm_lock *lock) +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) { + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; u32 res = nlm_lck_denied; @@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock) * Warning: must not use cookie to match it! */ list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) { - /* Alright, we found a lock. Set the return status - * and wake up the caller - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - res = nlm_granted; - } + struct file_lock *fl_blocked = block->b_lock; + + if (!nlm_compare_locks(fl_blocked, fl)) + continue; + if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. 
Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; } return res; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4063095d849e..b10f913aa06a 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3bc437e0cf5b..35681d9cf1fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 920766cea79c..ef21ed296039 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -149,7 +149,7 @@ struct nlm_rqst * nlmclnt_alloc_call(void); int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl); void nlmclnt_finish_block(struct nlm_rqst *req); long nlmclnt_block(struct nlm_rqst *req, long timeout); -u32 nlmclnt_grant(struct nlm_lock *); +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *); void nlmclnt_recovery(struct nlm_host *, u32); int nlmclnt_reclaim(struct nlm_host *, struct file_lock *); int nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *); @@ -204,7 +204,7 @@ nlmsvc_file_inode(struct nlm_file *file) * Compare two host addresses (needs modifying for ipv6) */ static __inline__ int 
-nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) +nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2) { return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } @@ -214,7 +214,7 @@ nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2) * When the second lock is of type F_UNLCK, this acts like a wildcard. */ static __inline__ int -nlm_compare_locks(struct file_lock *fl1, struct file_lock *fl2) +nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { return fl1->fl_pid == fl2->fl_pid && fl1->fl_start == fl2->fl_start -- cgit v1.2.3-71-gd317 From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 14 Feb 2006 13:53:10 -0800 Subject: [PATCH] sched: revert "filter affine wakeups" Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6: [PATCH] sched: filter affine wakeups Apparently caused more than 10% performance regression for aim7 benchmark. The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144 disks. Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who supplied benchmark results). The problem is, for aim7, the wake-up pattern is random, but it still needs load balancing action in the wake-up path to achieve best performance. With the above commit, lack of load balancing hurts that workload. However, for workloads like database transaction processing, the requirement is exactly opposite. In the wake up path, best performance is achieved with absolutely zero load balancing. We simply wake up the process on the CPU that it previously ran on. Worst performance is obtained when we do load balancing at wake up. There isn't an easy way to auto detect the workload characteristics. Ingo's earlier patch that detects an idle CPU and decides whether to load balance or not doesn't perform with aim7 either since all CPUs are busy (it causes even bigger perf. regression).
Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more than 10% performance regression with aim7. Signed-off-by: Ken Chen Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +---- kernel/sched.c | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c1da0269a18..b6f51e3a38ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -697,11 +697,8 @@ struct task_struct { int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) - int last_waker_cpu; /* CPU that last woke this task up */ -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) int oncpu; -#endif #endif int prio, static_prio; struct list_head run_list; diff --git a/kernel/sched.c b/kernel/sched.c index 87d93be336a1..66d957227de9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) } } - if (p->last_waker_cpu != this_cpu) - goto out_set_cpu; - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; @@ -1277,8 +1274,6 @@ out_set_cpu: cpu = task_cpu(p); } - p->last_waker_cpu = this_cpu; - out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { @@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) - p->last_waker_cpu = cpu; -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; #endif -#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. 
*/ task_thread_info(p)->preempt_count = 1; -- cgit v1.2.3-71-gd317 From ee68cea2c26b7a8222f9020f54d22c6067011e8b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 01:34:23 -0800 Subject: [NETFILTER]: Fix xfrm lookup after SNAT To find out if a packet needs to be handled by IPsec after SNAT, packets are currently rerouted in POST_ROUTING and a new xfrm lookup is done. This breaks SNAT of non-unicast packets to non-local addresses because the packet is routed as incoming packet and no neighbour entry is bound to the dst_entry. In general, it seems to be a bad idea to replace the dst_entry after the packet was already sent to the output routine because its state might not match what's expected. This patch changes the xfrm lookup in POST_ROUTING to re-use the original dst_entry without routing the packet again. This means no policy routing can be used for transport mode transforms (which keep the original route) when packets are SNATed to match the policy, but it looks like the best we can do for now. Signed-off-by: Patrick McHardy Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/netfilter_ipv4.h | 2 +- net/ipv4/netfilter.c | 41 ++++++++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_nat_standalone.c | 6 ++--- 3 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index fdc4a9527343..43c09d790b83 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -79,7 +79,7 @@ enum nf_ip_hook_priorities { #ifdef __KERNEL__ extern int ip_route_me_harder(struct sk_buff **pskb); - +extern int ip_xfrm_me_harder(struct sk_buff **pskb); #endif /*__KERNEL__*/ #endif /*__LINUX_IP_NETFILTER_H*/ diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 52a3d7c57907..ed42cdc57cd9 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -78,6 +78,47 @@ int ip_route_me_harder(struct sk_buff **pskb) } EXPORT_SYMBOL(ip_route_me_harder); +#ifdef CONFIG_XFRM +int ip_xfrm_me_harder(struct sk_buff **pskb) +{ + struct flowi fl; + unsigned int hh_len; + struct dst_entry *dst; + + if (IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) + return 0; + if (xfrm_decode_session(*pskb, &fl, AF_INET) < 0) + return -1; + + dst = (*pskb)->dst; + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + dst_hold(dst); + + if (xfrm_lookup(&dst, &fl, (*pskb)->sk, 0) < 0) + return -1; + + dst_release((*pskb)->dst); + (*pskb)->dst = dst; + + /* Change in oif may mean change in hh_len. 
*/ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + return 0; +} +EXPORT_SYMBOL(ip_xfrm_me_harder); +#endif + void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(ip_nat_decode_session); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 92c54999a19d..7c3f7d380240 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -235,19 +235,19 @@ ip_nat_out(unsigned int hooknum, return NF_ACCEPT; ret = ip_nat_fn(hooknum, pskb, in, out, okfn); +#ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.ip != ct->tuplehash[!dir].tuple.dst.ip -#ifdef CONFIG_XFRM || ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all -#endif ) - return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + return ip_xfrm_me_harder(pskb) == 0 ? ret : NF_DROP; } +#endif return ret; } -- cgit v1.2.3-71-gd317 From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:50:10 +0300 Subject: [PATCH] fix zap_thread's ptrace related problems 1. The tracee can go from ptrace_stop() to do_signal_stop() after __ptrace_unlink(p). 2. It is unsafe to __ptrace_unlink(p) while p->parent may wait for tasklist_lock in ptrace_detach(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Eric W. 
Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/ptrace.h | 1 + kernel/ptrace.c | 25 +++++++++++++++---------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 055378d2513e..0e1c95074d42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm) do_each_thread(g,p) { if (mm == p->mm && p != tsk && p->ptrace && p->parent->mm == mm) { - __ptrace_unlink(p); + __ptrace_detach(p, 0); } } while_each_thread(g,p); write_unlock_irq(&tasklist_lock); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 9d5cd106b344..0d36750fc0f1 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -84,6 +84,7 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern int ptrace_attach(struct task_struct *tsk); extern int ptrace_detach(struct task_struct *, unsigned int); +extern void __ptrace_detach(struct task_struct *, unsigned int); extern void ptrace_disable(struct task_struct *); extern int ptrace_check_attach(struct task_struct *task, int kill); extern int ptrace_request(struct task_struct *child, long request, long addr, long data); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d2cf144d0af5..d95a72c9279d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child) */ void __ptrace_unlink(task_t *child) { - if (!child->ptrace) - BUG(); + BUG_ON(!child->ptrace); + child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); @@ -184,22 +184,27 @@ bad: return retval; } +void __ptrace_detach(struct task_struct *child, unsigned int data) +{ + child->exit_code = data; + /* .. re-parent .. */ + __ptrace_unlink(child); + /* .. and wake it up. 
*/ + if (child->exit_state != EXIT_ZOMBIE) + wake_up_process(child); +} + int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) - return -EIO; + return -EIO; /* Architecture-specific hardware disable .. */ ptrace_disable(child); - /* .. re-parent .. */ - child->exit_code = data; - write_lock_irq(&tasklist_lock); - __ptrace_unlink(child); - /* .. and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_process(child); + if (child->ptrace) + __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); return 0; -- cgit v1.2.3-71-gd317 From 48d5cad87c3a4998d0bda16ccfb5c60dfe4de5fb Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 15:10:22 -0800 Subject: [XFRM]: Fix SNAT-related crash in xfrm4_output_finish When a packet matching an IPsec policy is SNATed so it doesn't match any policy anymore it loses its xfrm bundle, which makes xfrm4_output_finish crash because of a NULL pointer dereference. This patch directs these packets to the original output path instead. Since the packets have already passed the POST_ROUTING hook, but need to start at the beginning of the original output path which includes another POST_ROUTING invocation, a flag is added to the IPCB to indicate that the packet was rerouted and doesn't need to pass the POST_ROUTING hook again. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/linux/netfilter.h | 19 +++++++++++++++---- include/net/ip.h | 1 + include/net/xfrm.h | 1 - net/ipv4/ip_gre.c | 3 ++- net/ipv4/ip_output.c | 16 ++++++++++------ net/ipv4/ipip.c | 3 ++- net/ipv4/xfrm4_output.c | 13 ++++++++++--- 7 files changed, 40 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 4cf6088625c1..3ca3d9ee78a9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -184,8 +184,11 @@ static inline int nf_hook_thresh(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *), int thresh) + int (*okfn)(struct sk_buff *), int thresh, + int cond) { + if (!cond) + return 1; #ifndef CONFIG_NETFILTER_DEBUG if (list_empty(&nf_hooks[pf][hook])) return 1; @@ -197,7 +200,7 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { - return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN); + return nf_hook_thresh(pf, hook, pskb, indev, outdev, okfn, INT_MIN, 1); } /* Activate hook; either okfn or kfree_skb called, unless a hook @@ -224,7 +227,13 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, #define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \ ({int __ret; \ -if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh)) == 1)\ +if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, thresh, 1)) == 1)\ + __ret = (okfn)(skb); \ +__ret;}) + +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) \ +({int __ret; \ +if ((__ret=nf_hook_thresh(pf, hook, &(skb), indev, outdev, okfn, INT_MIN, cond)) == 1)\ __ret = (okfn)(skb); \ __ret;}) @@ -295,11 +304,13 @@ extern struct proc_dir_entry *proc_net_netfilter; #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) 
(okfn)(skb) +#define NF_HOOK_COND(pf, hook, skb, indev, outdev, okfn, cond) (okfn)(skb) static inline int nf_hook_thresh(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *), int thresh) + int (*okfn)(struct sk_buff *), int thresh, + int cond) { return okfn(*pskb); } diff --git a/include/net/ip.h b/include/net/ip.h index 8de0697b364c..fab3d5b3ab1c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -41,6 +41,7 @@ struct inet_skb_parm #define IPSKB_XFRM_TUNNEL_SIZE 2 #define IPSKB_XFRM_TRANSFORMED 4 #define IPSKB_FRAG_COMPLETE 8 +#define IPSKB_REROUTED 16 }; struct ipcm_cookie diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d09ca0e7d139..d6111a2f0a23 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -866,7 +866,6 @@ extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); extern int xfrm_init_state(struct xfrm_state *x); extern int xfrm4_rcv(struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); -extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler); extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index abe23923e4e7..9981dcd68f11 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -830,7 +830,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, gre_hlen); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3324fbfe528a..57d290d89ec2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -207,8 
+207,10 @@ static inline int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) - return xfrm4_output_finish(skb); + if (skb->dst->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } #endif if (skb->len > dst_mtu(skb->dst) && !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) @@ -271,8 +273,9 @@ int ip_mc_output(struct sk_buff *skb) newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, - ip_finish_output); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct sk_buff *skb) @@ -284,8 +287,9 @@ int ip_output(struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e5cbe72c6b80..03d13742a4b8 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -622,7 +622,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d4df0ddd424b..32ad229b4fed 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -152,10 +152,16 @@ error_nolock: goto out_exit; } -int xfrm4_output_finish(struct sk_buff *skb) 
+static int xfrm4_output_finish(struct sk_buff *skb) { int err; +#ifdef CONFIG_NETFILTER + if (!skb->dst->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif while (likely((err = xfrm4_output_one(skb)) == 0)) { nf_reset(skb); @@ -178,6 +184,7 @@ int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, - xfrm4_output_finish); + return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm4_output_finish, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -- cgit v1.2.3-71-gd317 From 9c92d3486434e7310cb288587953e2dae4a79701 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 15 Feb 2006 15:18:19 -0800 Subject: [NETFILTER]: Don't invoke okfn in CONFIG_NETFILTER=n variant of nf_hook() nf_hook() is supposed to call the netfilter hook and return control of the packet back to the caller in case it may pass, the okfn is only used for queueing. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 3ca3d9ee78a9..468896939843 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -318,7 +318,7 @@ static inline int nf_hook(int pf, unsigned int hook, struct sk_buff **pskb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { - return okfn(*pskb); + return 1; } static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} struct flowi; -- cgit v1.2.3-71-gd317 From b2ee9dbfad14ba8e34a589d552ddc67300a26bec Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Wed, 15 Feb 2006 15:17:40 -0800 Subject: [PATCH] hrtimer: fix multiple macro argument expansion For two macros the arguments were expanded twice, change them to inline functions to avoid it. 
Signed-off-by: Roman Zippel Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 6aca67a569a2..f3dec45ef874 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -96,10 +96,16 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) /* convert a timespec to ktime_t format: */ -#define timespec_to_ktime(ts) ktime_set((ts).tv_sec, (ts).tv_nsec) +static inline ktime_t timespec_to_ktime(struct timespec ts) +{ + return ktime_set(ts.tv_sec, ts.tv_nsec); +} /* convert a timeval to ktime_t format: */ -#define timeval_to_ktime(tv) ktime_set((tv).tv_sec, (tv).tv_usec * 1000) +static inline ktime_t timeval_to_ktime(struct timeval tv) +{ + return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); +} /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec(kt) ns_to_timespec((kt).tv64) -- cgit v1.2.3-71-gd317 From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Feb 2006 23:41:58 +0100 Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution installation it's easiest if it's a boot time option. Also I moved the variable to a more appropriate place, made it independent of sysctl, and marked it __read_mostly, which it is.
Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ arch/i386/kernel/cpu/transmeta.c | 1 + include/linux/kernel.h | 6 ------ include/linux/mm.h | 2 ++ kernel/sysctl.c | 2 -- mm/memory.c | 10 ++++++++++ 6 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ac75b57edf2e..b874771385cd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1638,6 +1638,9 @@ running once the system is up. Format: ,,,,,[,[,[,]]] + norandmaps Don't use address space randomization + Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + ______________________________________________________________________ Changelog: diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index bdbeb77f4e22..7214c9b577ab 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b49affa0ac5a..3b507bf05d09 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -326,12 +326,6 @@ struct sysinfo { /* Force a compilation error if condition is true */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#ifdef CONFIG_SYSCTL -extern int randomize_va_space; -#else -#define randomize_va_space 1 -#endif - /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/mm.h b/include/linux/mm.h index 75e9f0724997..26e1663a5cbe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,5 +1051,7 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask, void drop_pagecache(void); void drop_slab(void); +extern int randomize_va_space; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 
71dd6f62efec..7654d55c47f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,8 +126,6 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif -int randomize_va_space = 1; - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, diff --git a/mm/memory.c b/mm/memory.c index 2bee1f21aa8a..9abc6008544b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); +int randomize_va_space __read_mostly = 1; + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 0; +} +__setup("norandmaps", disable_randmaps); + + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but -- cgit v1.2.3-71-gd317 From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 17 Feb 2006 10:30:23 +1100 Subject: [PATCH] Provide an interface for getting the current tick length This provides an interface for arch code to find out how many nanoseconds are going to be added on to xtime by the next call to do_timer. The value returned is a fixed-point number in 52.12 format in nanoseconds. The reason for this format is that it gives the full precision that the timekeeping code is using internally. The motivation for this is to fix a problem that has arisen on 32-bit powerpc in that the value returned by do_gettimeofday drifts apart from xtime if NTP is being used. PowerPC is now using a lockless do_gettimeofday based on reading the timebase register and performing some simple arithmetic. (This method of getting the time is also exported to userspace via the VDSO.) However, the factor and offset it uses were calculated based on the nominal tick length and weren't being adjusted when NTP varied the tick length. 
Note that 64-bit powerpc has had the lockless do_gettimeofday for a long time now. It also had an extremely hairy routine that got called from the 32-bit compat routine for adjtimex, which adjusted the factor and offset according to what it thought the timekeeping code was going to do. Not only was this only called if a 32-bit task did adjtimex (i.e. not if a 64-bit task did adjtimex), it was also duplicating computations from kernel/timer.c and it wasn't clear that it was (still) correct. The simple solution is to ask the timekeeping code how long the current jiffy will be on each timer interrupt, after calling do_timer. If this jiffy will be a different length from the last one, we then need to compute new values for the factor and offset used in the lockless do_gettimeofday. In this way we can keep xtime and do_gettimeofday in sync, even when NTP is varying the tick length. Note that when adjtimex varies the tick length, it almost always introduces the variation from the next tick on. The only case I could see where adjtimex would vary the length of the current tick is when an old-style adjtime adjustment is being cancelled. (It's not clear to me why the adjustment has to be cancelled immediately rather than from the next tick on.) Thus I don't see any real need for a hook in adjtimex; the rare case of an old-style adjustment being cancelled can be fixed up at the next tick. Signed-off-by: Paul Mackerras Acked-by: john stultz Signed-off-by: Linus Torvalds --- include/linux/timex.h | 3 +++ kernel/timer.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 04a4a8cb4ed3..b7ca1204e42a 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -345,6 +345,9 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ +/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). 
*/ +extern u64 current_tick_length(void); + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ diff --git a/kernel/timer.c b/kernel/timer.c index b9dad3994676..fe3a9a9f8328 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,12 +717,16 @@ static void second_overflow(void) #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; - if ((time_adjust_step = time_adjust) != 0 ) { + time_adjust_step = time_adjust; + if (time_adjust_step) { /* * We are doing an adjtime thing. Prepare time_adjust_step to * be within bounds. Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void) } } +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. 
+ */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, -- cgit v1.2.3-71-gd317 From cc1887f3d8ae8ea61efa1a75af8ec0467b9dd546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Feb 2006 23:48:38 +0900 Subject: [PATCH] libata: fix qc->n_elem == 0 case handling in ata_qc_next_sg This patch makes ata_for_each_sg() start with pad_sgent when qc->n_elem is zero. Previously, ata_for_each_sg() unconditionally started with qc->__sg, handling the first sg to fill_sg() routines even when the entry was invalid. And while at it, unwind ?: in ata_qc_next_sg() into if statement. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 9e5db2949c58..c91be5e64ede 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -556,6 +556,16 @@ ata_sg_is_last(struct scatterlist *sg, struct ata_queued_cmd *qc) return 0; } +static inline struct scatterlist * +ata_qc_first_sg(struct ata_queued_cmd *qc) +{ + if (qc->n_elem) + return qc->__sg; + if (qc->pad_len) + return &qc->pad_sgent; + return NULL; +} + static inline struct scatterlist * ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc) { @@ -563,11 +573,13 @@ ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc) return NULL; if (++sg - qc->__sg < qc->n_elem) return sg; - return qc->pad_len ? 
&qc->pad_sgent : NULL; + if (qc->pad_len) + return &qc->pad_sgent; + return NULL; } #define ata_for_each_sg(sg, qc) \ - for (sg = qc->__sg; sg; sg = ata_qc_next_sg(sg, qc)) + for (sg = ata_qc_first_sg(qc); sg; sg = ata_qc_next_sg(sg, qc)) static inline unsigned int ata_tag_valid(unsigned int tag) { -- cgit v1.2.3-71-gd317 From 9b0f8b040acd8dfd23860754c0d09ff4f44e2cbc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 20 Feb 2006 18:27:52 -0800 Subject: [PATCH] Terminate process that fails on a constrained allocation Some allocations are restricted to a limited set of nodes (due to memory policies or cpuset constraints). If the page allocator is not able to find enough memory then that does not mean that overall system memory is low. In particular going postal and more or less randomly shooting at processes is not likely going to help the situation but may just lead to suicide (the whole system coming down). It is better to signal to the process that no memory exists given the constraints that the process (or the configuration of the process) has placed on the allocation behavior. The process may be killed but then the sysadmin or developer can investigate the situation. The solution is similar to what we do when running out of hugepages. This patch adds a check before we kill processes. At that point performance considerations do not matter much so we just scan the zonelist and reconstruct a list of nodes. If the list of nodes does not contain all online nodes then this is a constrained allocation and we should kill the current process. 
Signed-off-by: Christoph Lameter Cc: Nick Piggin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sysrq.c | 2 +- include/linux/swap.h | 2 +- mm/oom_kill.c | 103 ++++++++++++++++++++++++++++++++++++++------------- mm/page_alloc.c | 2 +- 4 files changed, 81 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5765f672e853..d58f82318853 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -243,7 +243,7 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(void *ignored) { - out_of_memory(GFP_KERNEL, 0); + out_of_memory(&NODE_DATA(0)->node_zonelists[ZONE_NORMAL], GFP_KERNEL, 0); } static DECLARE_WORK(moom_work, moom_callback, NULL); diff --git a/include/linux/swap.h b/include/linux/swap.h index f3e17d5963c3..d572b19afb7d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -147,7 +147,7 @@ struct swap_list_t { #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) /* linux/mm/oom_kill.c */ -extern void out_of_memory(gfp_t gfp_mask, int order); +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); /* linux/mm/memory.c */ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 949eba1d5ba3..8123fad5a485 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -132,6 +132,36 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) return points; } +/* + * Types of limitations to the nodes from which allocations may occur + */ +#define CONSTRAINT_NONE 1 +#define CONSTRAINT_MEMORY_POLICY 2 +#define CONSTRAINT_CPUSET 3 + +/* + * Determine the type of allocation constraint. 
+ */ +static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) +{ +#ifdef CONFIG_NUMA + struct zone **z; + nodemask_t nodes = node_online_map; + + for (z = zonelist->zones; *z; z++) + if (cpuset_zone_allowed(*z, gfp_mask)) + node_clear((*z)->zone_pgdat->node_id, + nodes); + else + return CONSTRAINT_CPUSET; + + if (!nodes_empty(nodes)) + return CONSTRAINT_MEMORY_POLICY; +#endif + + return CONSTRAINT_NONE; +} + /* * Simple selection loop. We chose the process with the highest * number of 'points'. We expect the caller will lock the tasklist. @@ -184,7 +214,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that * we select a process with CAP_SYS_RAW_IO set). */ -static void __oom_kill_task(task_t *p) +static void __oom_kill_task(task_t *p, const char *message) { if (p->pid == 1) { WARN_ON(1); @@ -200,8 +230,8 @@ static void __oom_kill_task(task_t *p) return; } task_unlock(p); - printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", - p->pid, p->comm); + printk(KERN_ERR "%s: Killed process %d (%s).\n", + message, p->pid, p->comm); /* * We give our sacrificial lamb high priority and access to @@ -214,7 +244,7 @@ static void __oom_kill_task(task_t *p) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p) +static struct mm_struct *oom_kill_task(task_t *p, const char *message) { struct mm_struct *mm = get_task_mm(p); task_t * g, * q; @@ -226,21 +256,21 @@ static struct mm_struct *oom_kill_task(task_t *p) return NULL; } - __oom_kill_task(p); + __oom_kill_task(p, message); /* * kill all processes that share the ->mm (i.e. 
all threads), * but are in a different thread group */ do_each_thread(g, q) if (q->mm == mm && q->tgid != p->tgid) - __oom_kill_task(q); + __oom_kill_task(q, message); while_each_thread(g, q); return mm; } static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points) + unsigned long points, const char *message) { struct mm_struct *mm; struct task_struct *c; @@ -253,11 +283,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c); + mm = oom_kill_task(c, message); if (mm) return mm; } - return oom_kill_task(p); + return oom_kill_task(p, message); } /** @@ -268,10 +298,10 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(gfp_t gfp_mask, int order) +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct mm_struct *mm = NULL; - task_t * p; + task_t *p; unsigned long points; if (printk_ratelimit()) { @@ -283,25 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order) cpuset_lock(); read_lock(&tasklist_lock); + + /* + * Check if there were limitations on the allocation (only relevant for + * NUMA) that may require different handling. + */ + switch (constrained_alloc(zonelist, gfp_mask)) { + case CONSTRAINT_MEMORY_POLICY: + mm = oom_kill_process(current, points, + "No available memory (MPOL_BIND)"); + break; + + case CONSTRAINT_CPUSET: + mm = oom_kill_process(current, points, + "No available memory in cpuset"); + break; + + case CONSTRAINT_NONE: retry: - p = select_bad_process(&points); + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points); - if (PTR_ERR(p) == -1UL) - goto out; + if (PTR_ERR(p) == -1UL) + goto out; - /* Found nothing?!?! 
Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - cpuset_unlock(); - panic("Out of memory and no killable processes...\n"); - } + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + cpuset_unlock(); + panic("Out of memory and no killable processes...\n"); + } - mm = oom_kill_process(p, points); - if (!mm) - goto retry; + mm = oom_kill_process(p, points, "Out of memory"); + if (!mm) + goto retry; + + break; + } - out: - read_unlock(&tasklist_lock); +out: cpuset_unlock(); if (mm) mmput(mm); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 208812b25597..791690d7d3fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1015,7 +1015,7 @@ rebalance: if (page) goto got_pg; - out_of_memory(gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order); goto restart; } -- cgit v1.2.3-71-gd317 From c255d844dd73616f23e4b4733edcc2e5fa4042b2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 20 Feb 2006 18:27:58 -0800 Subject: [PATCH] suspend-to-ram: allow video options to be set at runtime Currently, acpi video options can only be set on kernel command line. That's little inflexible; I'd like userland s2ram application that just works, and modifying kernel command line according to whitelist is not fun. It is better to just allow s2ram application to set video options just before suspend (according to the whitelist). This implements sysctl to allow setting suspend video options without reboot. (akpm: Documentation updates for this new sysctl are pending..) Signed-off-by: Pavel Machek Cc: "Brown, Len" Cc: "Antonino A. 
Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 10 ++++++++++ include/linux/acpi.h | 3 ++- include/linux/sysctl.h | 1 + kernel/sysctl.c | 16 ++++++++++++---- 4 files changed, 25 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9f11d36a8c10..b0c7ab93dcb9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -16,6 +16,7 @@ before actually making adjustments. Currently, these files might (depending on your configuration) show up in /proc/sys/kernel: +- acpi_video_flags - acct - core_pattern - core_uses_pid @@ -57,6 +58,15 @@ show up in /proc/sys/kernel: ============================================================== +acpi_video_flags: + +flags + +See Doc*/kernel/power/video.txt, it allows mode of video boot to be +set during run time. + +============================================================== + acct: highwater lowwater frequency diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 84d3d9f034ce..d3bc25e6d27d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -427,7 +427,8 @@ extern int acpi_mp_config; extern struct acpi_table_mcfg_config *pci_mmcfg_config; extern int pci_mmcfg_config_num; -extern int sbf_port ; +extern int sbf_port; +extern unsigned long acpi_video_flags; #else /* !CONFIG_ACPI */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 32a4139c4ad8..0e92bf7ec28e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -146,6 +146,7 @@ enum KERN_RANDOMIZE=68, /* int: randomize virtual address space */ KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ + KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7654d55c47f5..ebc41bf22f1e 100644 --- a/kernel/sysctl.c +++ 
b/kernel/sysctl.c @@ -44,14 +44,12 @@ #include #include #include +#include +#include #include #include -#ifdef CONFIG_ROOT_NFS -#include -#endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -655,6 +653,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_ACPI_SLEEP + { + .ctl_name = KERN_ACPI_VIDEO_FLAGS, + .procname = "acpi_video_flags", + .data = &acpi_video_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3-71-gd317 From 7a9166e3b037296366cea6f3c97f705d33e209e6 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 20 Feb 2006 18:28:07 -0800 Subject: [PATCH] Fix undefined symbols for nommu architecture Signed-off-by: Luke Yang Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ kernel/sysctl.c | 2 ++ mm/nommu.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 26e1663a5cbe..498ff8778fb6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,7 +1051,11 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask, void drop_pagecache(void); void drop_slab(void); +#ifndef CONFIG_MMU +#define randomize_va_space 0 +#else extern int randomize_va_space; +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ebc41bf22f1e..c05a2b7125e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -636,6 +636,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if defined(CONFIG_MMU) { .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", @@ -644,6 +645,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .ctl_name = KERN_SPIN_RETRY, diff --git a/mm/nommu.c b/mm/nommu.c index 
c10262d68232..99d21020ec9d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc); EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(vmalloc_to_page); EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmap); +EXPORT_SYMBOL(vunmap); /* * Handle all mappings that got truncated by a "truncate()" -- cgit v1.2.3-71-gd317 From 7fd105e758c8d746d57ab7e77f100e096bf153c8 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 20 Feb 2006 18:28:08 -0800 Subject: [PATCH] Fix compile for CONFIG_SYSVIPC=n or CONFIG_SYSCTL=n The compat syscalls are added to sys_ni.c since they are not defined if the above CONFIG options are off. Also, nfs would not build with CONFIG_SYSCTL off. Noticed by Arthur Othieno. Signed-off-by: Stephen Rothwell Cc: "David S. Miller" Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nfs_fs.h | 2 +- kernel/sys_ni.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 547d649b274e..b4dc6e2e10c9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -398,7 +398,7 @@ extern struct inode_operations nfs_symlink_inode_operations; extern int nfs_register_sysctl(void); extern void nfs_unregister_sysctl(void); #else -#define nfs_register_sysctl() do { } while(0) +#define nfs_register_sysctl() 0 #define nfs_unregister_sysctl() do { } while(0) #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 17313b99e53d..1067090db6b1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -104,6 +104,8 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(compat_sys_ipc); +cond_syscall(compat_sys_sysctl); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3-71-gd317 From 5bd546aa78b5d74f3162815e41940f862215d9e3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 17 Feb 2006 20:23:29 +0000 Subject: 
[MMC] Fix mmc_cmd_type() mask It's MMC_CMD_MASK not MMC_CMD_TYPE. Signed-off-by: Russell King --- include/linux/mmc/mmc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index f38872abc126..bdc556d88498 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -49,7 +49,7 @@ struct mmc_command { /* * These are the command types. */ -#define mmc_cmd_type(cmd) ((cmd)->flags & MMC_CMD_TYPE) +#define mmc_cmd_type(cmd) ((cmd)->flags & MMC_CMD_MASK) unsigned int retries; /* max number of retries */ unsigned int error; /* command error */ -- cgit v1.2.3-71-gd317 From fa675765afed59bb89adba3369094ebd428b930b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Feb 2006 09:39:02 -0800 Subject: Revert mount/umount uevent removal This change reverts the 033b96fd30db52a710d97b06f87d16fc59fee0f1 commit from Kay Sievers that removed the mount/umount uevents from the kernel. Some older versions of HAL still depend on these events to detect when a new device has been mounted. These events are not correctly emitted, and are broken by design, and so, should not be relied upon by any future program. Instead, the /proc/mounts file should be polled to properly detect this kind of event. A feature-removal-schedule.txt entry has been added, noting when this interface will be removed from the kernel. 
Signed-off-by: Greg Kroah-Hartman --- Documentation/feature-removal-schedule.txt | 9 +++++++++ fs/super.c | 15 ++++++++++++++- include/linux/kobject.h | 6 ++++-- lib/kobject_uevent.c | 4 ++++ 4 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b730d765b525..be5ae600f533 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -171,3 +171,12 @@ Why: The ISA interface is faster and should be always available. The I2C probing is also known to cause trouble in at least one case (see bug #5889.) Who: Jean Delvare + +--------------------------- + +What: mount/umount uevents +When: February 2007 +Why: These events are not correct, and do not properly let userspace know + when a file system has been mounted or unmounted. Userspace should + poll the /proc/mounts file instead to detect this properly. +Who: Greg Kroah-Hartman diff --git a/fs/super.c b/fs/super.c index 30294218fa63..e20b5580afd5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -666,6 +666,16 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } +static void bdev_uevent(struct block_device *bdev, enum kobject_action action) +{ + if (bdev->bd_disk) { + if (bdev->bd_part) + kobject_uevent(&bdev->bd_part->kobj, action); + else + kobject_uevent(&bdev->bd_disk->kobj, action); + } +} + struct super_block *get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) @@ -707,8 +717,10 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, up_write(&s->s_umount); deactivate_super(s); s = ERR_PTR(error); - } else + } else { s->s_flags |= MS_ACTIVE; + bdev_uevent(bdev, KOBJ_MOUNT); + } } return s; @@ -724,6 +736,7 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; 
+ bdev_uevent(bdev, KOBJ_UMOUNT); generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_excl(bdev); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 2a8d8da70961..c374b5fa8d3b 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -41,8 +41,10 @@ enum kobject_action { KOBJ_ADD = (__force kobject_action_t) 0x01, /* exclusive to core */ KOBJ_REMOVE = (__force kobject_action_t) 0x02, /* exclusive to core */ KOBJ_CHANGE = (__force kobject_action_t) 0x03, /* device state change */ - KOBJ_OFFLINE = (__force kobject_action_t) 0x04, /* device offline */ - KOBJ_ONLINE = (__force kobject_action_t) 0x05, /* device online */ + KOBJ_MOUNT = (__force kobject_action_t) 0x04, /* mount event for block devices (broken) */ + KOBJ_UMOUNT = (__force kobject_action_t) 0x05, /* umount event for block devices (broken) */ + KOBJ_OFFLINE = (__force kobject_action_t) 0x06, /* device offline */ + KOBJ_ONLINE = (__force kobject_action_t) 0x07, /* device online */ }; struct kobject { diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1b1985c136ec..086a0c6e888e 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -38,6 +38,10 @@ static char *action_to_string(enum kobject_action action) return "remove"; case KOBJ_CHANGE: return "change"; + case KOBJ_MOUNT: + return "mount"; + case KOBJ_UMOUNT: + return "umount"; case KOBJ_OFFLINE: return "offline"; case KOBJ_ONLINE: -- cgit v1.2.3-71-gd317 From 85edae14e4ee5e68cf037e9e4bca7498ea16874d Mon Sep 17 00:00:00 2001 From: Michal Janusz Miroslaw Date: Thu, 23 Feb 2006 09:49:35 +0000 Subject: [SERIAL] Trivial comment fix: include/linux/serial_reg.h Trivial comment fix for include/linux/serial_reg.h Signed-off-by: Russell King --- include/linux/serial_reg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_reg.h b/include/linux/serial_reg.h index 6a2bb955844b..3c8a6aa77415 100644 --- a/include/linux/serial_reg.h +++ 
b/include/linux/serial_reg.h @@ -247,10 +247,10 @@ #define UART_CTR 0xFF /* - * The 16C950 Additional Control Reigster + * The 16C950 Additional Control Register */ #define UART_ACR_RXDIS 0x01 /* Receiver disable */ -#define UART_ACR_TXDIS 0x02 /* Receiver disable */ +#define UART_ACR_TXDIS 0x02 /* Transmitter disable */ #define UART_ACR_DSRFC 0x04 /* DSR Flow Control */ #define UART_ACR_TLENB 0x20 /* 950 trigger levels enable */ #define UART_ACR_ICRRD 0x40 /* ICR Read enable */ -- cgit v1.2.3-71-gd317 From c04030e16dbea2f7581f82cc6688695927f6ac5b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 24 Feb 2006 13:04:21 -0800 Subject: [PATCH] flags parameter for linkat I'm currently at the POSIX meeting and one thing covered was the incompatibility of Linux's link() with the POSIX definition. The name. Linux does not follow symlinks, POSIX requires it does. Even if somebody thinks this is a good default behavior we cannot change this because it would break the ABI. But the fact remains that some application might want this behavior. We have one chance to help implementing this without breaking the behavior. For this we could use the new linkat interface which would need a new flags parameter. If the new parameter is AT_SYMLINK_FOLLOW the new behavior could be invoked. I do not want to introduce such a patch now. But we could add the parameter now, just don't use it. The patch below would do this. Can we get this late patch applied before the release more or less fixes the syscall API? 
Signed-off-by: Ulrich Drepper Signed-off-by: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/scall32-o32.S | 2 +- arch/s390/kernel/compat_wrapper.S | 1 + fs/namei.c | 8 ++++++-- include/linux/syscalls.h | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index d83e033dbc87..2f2dc54b2e26 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -626,7 +626,7 @@ einval: li v0, -EINVAL sys sys_fstatat64 4 sys sys_unlinkat 3 sys sys_renameat 4 /* 4295 */ - sys sys_linkat 4 + sys sys_linkat 5 sys sys_symlinkat 3 sys sys_readlinkat 4 sys sys_fchmodat 3 diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 615964cca15f..50e80138e7ad 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1552,6 +1552,7 @@ sys_linkat_wrapper: llgtr %r3,%r3 # const char * lgfr %r4,%r4 # int llgtr %r5,%r5 # const char * + lgfr %r6,%r6 # int jg sys_linkat .globl sys_symlinkat_wrapper diff --git a/fs/namei.c b/fs/namei.c index e28de846c591..557dcf395ca1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2224,13 +2224,17 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * and other special files. 
--ADM */ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname) + int newdfd, const char __user *newname, + int flags) { struct dentry *new_dentry; struct nameidata nd, old_nd; int error; char * to; + if (flags != 0) + return -EINVAL; + to = getname(newname); if (IS_ERR(to)) return PTR_ERR(to); @@ -2263,7 +2267,7 @@ exit: asmlinkage long sys_link(const char __user *oldname, const char __user *newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d73501ba7e44..b9ea44ac0ddb 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -543,7 +543,7 @@ asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag); asmlinkage long sys_symlinkat(const char __user * oldname, int newdfd, const char __user * newname); asmlinkage long sys_linkat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname); + int newdfd, const char __user *newname, int flags); asmlinkage long sys_renameat(int olddfd, const char __user * oldname, int newdfd, const char __user * newname); asmlinkage long sys_futimesat(int dfd, char __user *filename, -- cgit v1.2.3-71-gd317 From bafac2a512bf4fd2ce7520f3976ce8aab4435f74 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 27 Feb 2006 13:04:17 -0800 Subject: [NETFILTER]: Restore {ipt,ip6t,ebt}_LOG compatibility The nfnetlink_log infrastructure changes broke compatibility of the LOG targets. They currently use whatever log backend was registered first, which means that if ipt_ULOG was loaded first, no messages will be printed to the ring buffer anymore. Restore compatibility by using the old log functions by default and only use the nf_log backend if the user explicitly said so. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- include/linux/netfilter_bridge/ebt_log.h | 1 + include/linux/netfilter_ipv4/ipt_LOG.h | 3 ++- include/linux/netfilter_ipv6/ip6t_LOG.h | 3 ++- net/bridge/netfilter/ebt_log.c | 7 ++++++- net/ipv4/netfilter/ipt_LOG.c | 7 ++++++- net/ipv6/netfilter/ip6t_LOG.c | 7 ++++++- 6 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebt_log.h b/include/linux/netfilter_bridge/ebt_log.h index 358fbc84fb59..96e231ae7554 100644 --- a/include/linux/netfilter_bridge/ebt_log.h +++ b/include/linux/netfilter_bridge/ebt_log.h @@ -3,6 +3,7 @@ #define EBT_LOG_IP 0x01 /* if the frame is made by ip, log the ip information */ #define EBT_LOG_ARP 0x02 +#define EBT_LOG_NFLOG 0x04 #define EBT_LOG_MASK (EBT_LOG_IP | EBT_LOG_ARP) #define EBT_LOG_PREFIX_SIZE 30 #define EBT_LOG_WATCHER "log" diff --git a/include/linux/netfilter_ipv4/ipt_LOG.h b/include/linux/netfilter_ipv4/ipt_LOG.h index 22d16177319b..892f9a33fea8 100644 --- a/include/linux/netfilter_ipv4/ipt_LOG.h +++ b/include/linux/netfilter_ipv4/ipt_LOG.h @@ -6,7 +6,8 @@ #define IPT_LOG_TCPOPT 0x02 /* Log TCP options */ #define IPT_LOG_IPOPT 0x04 /* Log IP options */ #define IPT_LOG_UID 0x08 /* Log UID owning local socket */ -#define IPT_LOG_MASK 0x0f +#define IPT_LOG_NFLOG 0x10 /* Log using nf_log backend */ +#define IPT_LOG_MASK 0x1f struct ipt_log_info { unsigned char level; diff --git a/include/linux/netfilter_ipv6/ip6t_LOG.h b/include/linux/netfilter_ipv6/ip6t_LOG.h index 9008ff5c40ae..060c1a1c6c60 100644 --- a/include/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/linux/netfilter_ipv6/ip6t_LOG.h @@ -6,7 +6,8 @@ #define IP6T_LOG_TCPOPT 0x02 /* Log TCP options */ #define IP6T_LOG_IPOPT 0x04 /* Log IP options */ #define IP6T_LOG_UID 0x08 /* Log UID owning local socket */ -#define IP6T_LOG_MASK 0x0f +#define IP6T_LOG_NFLOG 0x10 /* Log using nf_log backend */ +#define IP6T_LOG_MASK 0x1f struct ip6t_log_info { unsigned char level; diff --git 
a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 0128fbbe2328..288ff1d4ccc4 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -166,7 +166,12 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, li.u.log.level = info->loglevel; li.u.log.logflags = info->bitmask; - nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix); + if (info->bitmask & EBT_LOG_NFLOG) + nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, + info->prefix); + else + ebt_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, + info->prefix); } static struct ebt_watcher log = diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 6606ddb66a29..cc27545ff97f 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -425,7 +425,12 @@ ipt_log_target(struct sk_buff **pskb, li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); + if (loginfo->logflags & IPT_LOG_NFLOG) + nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, + loginfo->prefix); + else + ipt_log_packet(PF_INET, hooknum, *pskb, in, out, &li, + loginfo->prefix); return IPT_CONTINUE; } diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 77c725832dec..6b930efa9fb9 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -436,7 +436,12 @@ ip6t_log_target(struct sk_buff **pskb, li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix); + if (loginfo->logflags & IP6T_LOG_NFLOG) + nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, + loginfo->prefix); + else + ip6t_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, + loginfo->prefix); return IP6T_CONTINUE; } -- cgit v1.2.3-71-gd317 From d2b176ed878d4d5fcc0bd35656dfd373f3702af9 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Tue, 28 Feb 2006 
09:42:23 -0800 Subject: [IA64] sysctl option to silence unaligned trap warnings Allow sysadmin to disable all warnings about userland apps making unaligned accesses by using: # echo 1 > /proc/sys/kernel/ignore-unaligned-usertrap Rather than having to use prctl on a process by process basis. Default behaviour leaves the warnings enabled. Signed-off-by: Jes Sorensen Signed-off-by: Tony Luck --- arch/ia64/kernel/unaligned.c | 31 ++++++++++++++++++++++++++++--- include/linux/sysctl.h | 1 + kernel/sysctl.c | 14 ++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 112913896844..1e357550c776 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -52,6 +52,15 @@ dump (const char *str, void *vp, size_t len) #define IA64_FIRST_ROTATING_FR 32 #define SIGN_EXT9 0xffffffffffffff00ul +/* + * sysctl settable hook which tells the kernel whether to honor the + * IA64_THREAD_UAC_NOPRINT prctl. Because this is user settable, we want + * to allow the super user to enable/disable this for security reasons + * (i.e. don't allow attacker to fill up logs with unaligned accesses). + */ +int no_unaligned_warning; +static int noprint_warning; + /* * For M-unit: * @@ -1324,8 +1333,9 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0) goto force_sigbus; - if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT) - && within_logging_rate_limit()) + if (!no_unaligned_warning && + !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) && + within_logging_rate_limit()) { char buf[200]; /* comm[] is at most 16 bytes...
*/ size_t len; @@ -1340,7 +1350,22 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) if (user_mode(regs)) tty_write_message(current->signal->tty, buf); buf[len-1] = '\0'; /* drop '\r' */ - printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ + /* watch for command names containing %s */ + printk(KERN_WARNING "%s", buf); + } else { + if (no_unaligned_warning && !noprint_warning) { + noprint_warning = 1; + printk(KERN_WARNING "%s(%d) encountered an " + "unaligned exception which required\n" + "kernel assistance, which degrades " + "the performance of the application.\n" + "Unaligned exception warnings have " + "been disabled by the system " + "administrator\n" + "echo 0 > /proc/sys/kernel/ignore-" + "unaligned-usertrap to re-enable\n", + current->comm, current->pid); + } } } else { if (within_logging_rate_limit()) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 0e92bf7ec28e..bac61db26456 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -147,6 +147,7 @@ enum KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ + KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c05a2b7125e1..acf6c1550f27 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,10 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif +#ifdef CONFIG_IA64 +extern int no_unaligned_warning; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -665,6 +669,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 + { + .ctl_name = KERN_IA64_UNALIGNED, + .procname = 
"ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3-71-gd317 From 0551fbd29e16fccd46e41b7d01bf0f8f39b14212 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Feb 2006 16:59:19 -0800 Subject: [PATCH] Add mm->task_size and fix powerpc vdso This patch adds mm->task_size to keep track of the task size of a given mm and uses that to fix the powerpc vdso so that it uses the mm task size to decide what pages to fault in instead of the current thread flags (which broke when ptracing). (akpm: I expect that mm_struct.task_size will become the way in which we finally sort out the confusion between 32-bit processes and 32-bit mm's. It may need tweaks, but at this stage this patch is powerpc-only.) Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/vdso.c | 4 ++-- fs/exec.c | 6 ++++++ include/linux/sched.h | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index f0c47dab0903..04f7df39ffbb 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -182,8 +182,8 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma, unsigned long offset = address - vma->vm_start; struct page *pg; #ifdef CONFIG_PPC64 - void *vbase = test_thread_flag(TIF_32BIT) ? - vdso32_kbase : vdso64_kbase; + void *vbase = (vma->vm_mm->task_size > TASK_SIZE_USER32) ? + vdso64_kbase : vdso32_kbase; #else void *vbase = vdso32_kbase; #endif diff --git a/fs/exec.c b/fs/exec.c index 0e1c95074d42..0b515ac53134 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm) current->flags &= ~PF_RANDOMIZE; flush_thread(); + /* Set the new mm task size. 
We have to do that late because it may + * depend on TIF_32BIT which is only updated in flush_thread() on + * some architectures like powerpc + */ + current->mm->task_size = TASK_SIZE; + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { diff --git a/include/linux/sched.h b/include/linux/sched.h index b6f51e3a38ec..ff2e09c953b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -298,8 +298,9 @@ struct mm_struct { unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void (*unmap_area) (struct mm_struct *mm, unsigned long addr); - unsigned long mmap_base; /* base of mmap area */ - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ + unsigned long mmap_base; /* base of mmap area */ + unsigned long task_size; /* size of task vm space */ + unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ -- cgit v1.2.3-71-gd317 From 3af1efe8a301f5b1c813f5f761cb1e10d6175605 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 2 Mar 2006 13:25:26 -0500 Subject: [PATCH] reiserfs: fix unaligned bitmap usage The bitmaps associated with generation numbers for directory entries are declared as an array of ints. On some platforms, this causes alignment exceptions. The following patch uses the standard bitmap declaration macros to declare the bitmaps, fixing the problem. Originally from Takashi Iwai. 
Signed-off-by: Takashi Iwai Acked-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/namei.c | 8 ++++---- include/linux/reiserfs_fs.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index c8123308e060..284f7852de8b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -247,7 +247,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, /* mark, that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), - (unsigned long *)de->de_gen_number_bit_string); + de->de_gen_number_bit_string); // calculate pointer to name and namelen de->de_entry_num = i; @@ -431,7 +431,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, struct reiserfs_de_head *deh; INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); int gen_number; char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc if we create file with short name */ @@ -486,7 +486,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* find the proper place for the new entry */ memset(bit_string, 0, sizeof(bit_string)); - de.de_gen_number_bit_string = (char *)bit_string; + de.de_gen_number_bit_string = bit_string; retval = reiserfs_find_entry(dir, name, namelen, &path, &de); if (retval != NAME_NOT_FOUND) { if (buffer != small_buf) @@ -508,7 +508,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } gen_number = - find_first_zero_bit((unsigned long *)bit_string, + find_first_zero_bit(bit_string, MAX_GENERATION_NUMBER + 1); if (gen_number > MAX_GENERATION_NUMBER) { /* there is no free generation number */ diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 7d51149bd793..dad78cecfd20 100644 --- a/include/linux/reiserfs_fs.h +++ 
b/include/linux/reiserfs_fs.h @@ -1052,7 +1052,7 @@ struct reiserfs_dir_entry { int de_entrylen; int de_namelen; char *de_name; - char *de_gen_number_bit_string; + unsigned long *de_gen_number_bit_string; __u32 de_dir_id; __u32 de_objectid; -- cgit v1.2.3-71-gd317 From 1e4b27df55166ce3b276f55bab223fa4ae8c5525 Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Mon, 6 Mar 2006 15:42:37 -0800 Subject: [PATCH] i4l: add new PCI IDs for HFC-S PCI Add new PCI IDs for HFC-S PCI based ISDN TA 'Primux II S0' and 'Primux II S0 NT' from Gerdes AG Signed-off-by: Martin Bachem Signed-off-by: Karsten Keil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/hisax/config.c | 2 ++ drivers/isdn/hisax/hfc_pci.c | 2 ++ include/linux/pci_ids.h | 2 ++ 3 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c index 8159bcecd0c2..df9d65201819 100644 --- a/drivers/isdn/hisax/config.c +++ b/drivers/isdn/hisax/config.c @@ -1929,6 +1929,8 @@ static struct pci_device_id hisax_pci_tbl[] __initdata = { {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B00B, PCI_ANY_ID, PCI_ANY_ID}, {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B00C, PCI_ANY_ID, PCI_ANY_ID}, {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B100, PCI_ANY_ID, PCI_ANY_ID}, + {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B700, PCI_ANY_ID, PCI_ANY_ID}, + {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B701, PCI_ANY_ID, PCI_ANY_ID}, {PCI_VENDOR_ID_ABOCOM, PCI_DEVICE_ID_ABOCOM_2BD1, PCI_ANY_ID, PCI_ANY_ID}, {PCI_VENDOR_ID_ASUSTEK, PCI_DEVICE_ID_ASUSTEK_0675, PCI_ANY_ID, PCI_ANY_ID}, {PCI_VENDOR_ID_BERKOM, PCI_DEVICE_ID_BERKOM_T_CONCEPT, PCI_ANY_ID, PCI_ANY_ID}, diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c index 4866fc32d8d9..91d25acb5ede 100644 --- a/drivers/isdn/hisax/hfc_pci.c +++ b/drivers/isdn/hisax/hfc_pci.c @@ -51,6 +51,8 @@ static const PCI_ENTRY id_list[] = {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B00B, "Billion", "B00B"}, {PCI_VENDOR_ID_CCD,
PCI_DEVICE_ID_CCD_B00C, "Billion", "B00C"}, {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B100, "Seyeon", "B100"}, + {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B700, "Primux II S0", "B700"}, + {PCI_VENDOR_ID_CCD, PCI_DEVICE_ID_CCD_B701, "Primux II S0 NT", "B701"}, {PCI_VENDOR_ID_ABOCOM, PCI_DEVICE_ID_ABOCOM_2BD1, "Abocom/Magitek", "2BD1"}, {PCI_VENDOR_ID_ASUSTEK, PCI_DEVICE_ID_ASUSTEK_0675, "Asuscom/Askey", "675"}, {PCI_VENDOR_ID_BERKOM, PCI_DEVICE_ID_BERKOM_T_CONCEPT, "German telekom", "T-Concept"}, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 82b83da25d77..1709b5009d2e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1752,6 +1752,8 @@ #define PCI_DEVICE_ID_CCD_B00B 0xb00b #define PCI_DEVICE_ID_CCD_B00C 0xb00c #define PCI_DEVICE_ID_CCD_B100 0xb100 +#define PCI_DEVICE_ID_CCD_B700 0xb700 +#define PCI_DEVICE_ID_CCD_B701 0xb701 #define PCI_VENDOR_ID_EXAR 0x13a8 #define PCI_DEVICE_ID_EXAR_XR17C152 0x0152 -- cgit v1.2.3-71-gd317 From 69239749e1ac4f3496906aa4267cb9f61ce52c9c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 6 Mar 2006 15:42:45 -0800 Subject: [PATCH] fix next_timer_interrupt() for hrtimer Also from Thomas Gleixner Function next_timer_interrupt() got broken with a recent patch 6ba1b91213e81aa92b5cf7539f7d2a94ff54947c as sys_nanosleep() was moved to hrtimer. This broke things as next_timer_interrupt() did not check hrtimer tree for next event. Function next_timer_interrupt() is needed with dyntick (CONFIG_NO_IDLE_HZ, VST) implementations, as the system can be in idle when next hrtimer event was supposed to happen. At least ARM and S390 currently use next_timer_interrupt(). 
Signed-off-by: Thomas Gleixner Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/time.c | 10 ++++++---- include/linux/hrtimer.h | 4 ++++ kernel/hrtimer.c | 35 +++++++++++++++++++++++++++++++++++ kernel/timer.c | 16 ++++++++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index d7d932c02866..d6bd435a6857 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -422,12 +422,14 @@ static int timer_dyn_tick_disable(void) void timer_dyn_reprogram(void) { struct dyn_tick_timer *dyn_tick = system_timer->dyn_tick; + unsigned long next, seq; - if (dyn_tick) { - write_seqlock(&xtime_lock); - if (dyn_tick->state & DYN_TICK_ENABLED) + if (dyn_tick && (dyn_tick->state & DYN_TICK_ENABLED)) { + next = next_timer_interrupt(); + do { + seq = read_seqbegin(&xtime_lock); dyn_tick->reprogram(next_timer_interrupt() - jiffies); - write_sequnlock(&xtime_lock); + } while (read_seqretry(&xtime_lock, seq)); } } diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 6361544bb6ae..6401c31d6add 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -116,6 +116,10 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); +#ifdef CONFIG_NO_IDLE_HZ +extern ktime_t hrtimer_get_next_event(void); +#endif + static inline int hrtimer_active(const struct hrtimer *timer) { return timer->state == HRTIMER_PENDING; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5ae51f1bc7c8..14bc9cfa6399 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -505,6 +505,41 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return rem; } +#ifdef CONFIG_NO_IDLE_HZ +/** + * hrtimer_get_next_event - get the time until next expiry event + * 
+ * Returns the delta to the next expiry event or KTIME_MAX if no timer + * is pending. + */ +ktime_t hrtimer_get_next_event(void) +{ + struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + unsigned long flags; + int i; + + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { + struct hrtimer *timer; + + spin_lock_irqsave(&base->lock, flags); + if (!base->first) { + spin_unlock_irqrestore(&base->lock, flags); + continue; + } + timer = rb_entry(base->first, struct hrtimer, node); + delta.tv64 = timer->expires.tv64; + spin_unlock_irqrestore(&base->lock, flags); + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; + } + if (mindelta.tv64 < 0) + mindelta.tv64 = 0; + return mindelta; +} +#endif + /** * hrtimer_init - initialize a timer to the given clock * diff --git a/kernel/timer.c b/kernel/timer.c index fc6646fd5aab..8256f3f5ec0d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -489,9 +489,21 @@ unsigned long next_timer_interrupt(void) struct list_head *list; struct timer_list *nte; unsigned long expires; + unsigned long hr_expires = MAX_JIFFY_OFFSET; + ktime_t hr_delta; tvec_t *varray[4]; int i, j; + hr_delta = hrtimer_get_next_event(); + if (hr_delta.tv64 != KTIME_MAX) { + struct timespec tsdelta; + tsdelta = ktime_to_timespec(hr_delta); + hr_expires = timespec_to_jiffies(&tsdelta); + if (hr_expires < 3) + return hr_expires + jiffies; + } + hr_expires += jiffies; + base = &__get_cpu_var(tvec_bases); spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); @@ -542,6 +554,10 @@ found: } } spin_unlock(&base->t_base.lock); + + if (time_before(hr_expires, expires)) + return hr_expires; + return expires; } #endif -- cgit v1.2.3-71-gd317 From 78679302fe428f4f3dc853a51ee24f306010d874 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 6 Mar 2006 15:42:49 -0800 Subject: [PATCH] memory-hotplug compile fix 
include/linux/memory_hotplug.h:53: warning: 'struct page' declared inside parameter list (akpm: I tossed in a couple more possibly-needed-sometime struct decls too) Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 01f03bc06eff..968b1aa3732c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -6,6 +6,10 @@ #include #include +struct page; +struct zone; +struct pglist_data; + #ifdef CONFIG_MEMORY_HOTPLUG /* * pgdat resizing functions -- cgit v1.2.3-71-gd317 From a615fa83959896f8eac76c235953fb164cd1a9b9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 6 Mar 2006 15:42:50 -0800 Subject: [PATCH] Increase max kmalloc size for very large systems Systems with extremely large numbers of nodes or cpus need to kmalloc structures larger than is currently supported. This patch increases the maximum supported size for very large systems. This patch should have no effect on current systems. (akpm: why not just use alloc_pages() for sysfs_cpus?)
Signed-off-by: Jack Steiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmalloc_sizes.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmalloc_sizes.h b/include/linux/kmalloc_sizes.h index d82d4c05c12d..bda23e00ed71 100644 --- a/include/linux/kmalloc_sizes.h +++ b/include/linux/kmalloc_sizes.h @@ -19,8 +19,10 @@ CACHE(32768) CACHE(65536) CACHE(131072) -#ifndef CONFIG_MMU +#if (NR_CPUS > 512) || (MAX_NUMNODES > 256) || !defined(CONFIG_MMU) CACHE(262144) +#endif +#ifndef CONFIG_MMU CACHE(524288) CACHE(1048576) #ifdef CONFIG_LARGE_ALLOCS -- cgit v1.2.3-71-gd317 From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Mar 2006 14:03:09 -0800 Subject: Mark the pipe file operations static They aren't used (nor even really usable) outside of pipe.c anyway Signed-off-by: Linus Torvalds --- fs/pipe.c | 6 +++--- include/linux/fs.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index d722579df79a..8aada8e426f4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = { .fasync = pipe_rdwr_fasync, }; -struct file_operations read_pipe_fops = { +static struct file_operations read_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, @@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -struct file_operations write_pipe_fops = { +static struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = pipe_write, @@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -struct file_operations rdwr_pipe_fops = { +static struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, diff --git a/include/linux/fs.h b/include/linux/fs.h index 
e059da947007..0cc34b1c42c9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1418,9 +1418,6 @@ extern int is_bad_inode(struct inode *); extern struct file_operations read_fifo_fops; extern struct file_operations write_fifo_fops; extern struct file_operations rdwr_fifo_fops; -extern struct file_operations read_pipe_fops; -extern struct file_operations write_pipe_fops; -extern struct file_operations rdwr_pipe_fops; extern int fs_may_remount_ro(struct super_block *); -- cgit v1.2.3-71-gd317 From e2bab3d92486fb781f4d06f56339264ed1492392 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 7 Mar 2006 21:55:31 -0800 Subject: [PATCH] percpu_counter_sum() Implement percpu_counter_sum(). This is a more accurate but slower version of percpu_counter_read_positive(). We need this for Alex's speedup-ext3_statfs patch and for the nr_file accounting fix. Otherwise these things would be too inaccurate on large CPU counts. Cc: Ravikiran G Thirumalai Cc: Alex Tomas Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu_counter.h | 6 ++++++ mm/swap.c | 25 +++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index bd6708e2c027..682525511c9e 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -39,6 +39,7 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc) } void percpu_counter_mod(struct percpu_counter *fbc, long amount); +long percpu_counter_sum(struct percpu_counter *fbc); static inline long percpu_counter_read(struct percpu_counter *fbc) { @@ -92,6 +93,11 @@ static inline long percpu_counter_read_positive(struct percpu_counter *fbc) return fbc->count; } +static inline long percpu_counter_sum(struct percpu_counter *fbc) +{ + return percpu_counter_read_positive(fbc); +} + #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct 
percpu_counter *fbc) diff --git a/mm/swap.c b/mm/swap.c index cce3dda59c59..e9ec06d845e8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount) if (count >= FBC_BATCH || count <= -FBC_BATCH) { spin_lock(&fbc->lock); fbc->count += count; + *pcount = 0; spin_unlock(&fbc->lock); - count = 0; + } else { + *pcount = count; } - *pcount = count; put_cpu(); } EXPORT_SYMBOL(percpu_counter_mod); + +/* + * Add up all the per-cpu counts, return the result. This is a more accurate + * but much slower version of percpu_counter_read_positive() + */ +long percpu_counter_sum(struct percpu_counter *fbc) +{ + long ret; + int cpu; + + spin_lock(&fbc->lock); + ret = fbc->count; + for_each_cpu(cpu) { + long *pcount = per_cpu_ptr(fbc->counters, cpu); + ret += *pcount; + } + spin_unlock(&fbc->lock); + return ret < 0 ? 0 : ret; +} +EXPORT_SYMBOL(percpu_counter_sum); #endif /* -- cgit v1.2.3-71-gd317 From 21a1ea9eb40411d4ee29448c53b9e4c0654d6ceb Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:33 -0800 Subject: [PATCH] rcu batch tuning This patch adds new tunables for RCU queue and finished batches. There are two types of controls - number of completed RCU updates invoked in a batch (blimit) and monitoring for high rate of incoming RCUs on a cpu (qhimark, qlowmark). By default, the per-cpu batch limit is set to a small value. If the input RCU rate exceeds the high watermark, we do two things - force quiescent state on all cpus and set the batch limit of the CPU to INTMAX. Setting batch limit to INTMAX forces all finished RCUs to be processed in one shot. If we have more than INTMAX RCUs queued up, then we have bigger problems anyway. Once the incoming queued RCUs fall below the low watermark, the batch limit is set to the default. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 13 +++++++ include/linux/rcupdate.h | 6 ++- kernel/rcupdate.c | 76 ++++++++++++++++++++++++++++--------- 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 75205391b335..bad5987c4727 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1284,6 +1284,19 @@ running once the system is up. New name for the ramdisk parameter. See Documentation/ramdisk.txt. + rcu.blimit= [KNL,BOOT] Set maximum number of finished + RCU callbacks to process in one batch. + + rcu.qhimark= [KNL,BOOT] Set threshold of queued + RCU callbacks over which batch limiting is disabled. + + rcu.qlowmark= [KNL,BOOT] Set threshold of queued + RCU callbacks below which batch limiting is re-enabled. + + rcu.rsinterval= [KNL,BOOT,SMP] Set the number of additional + RCU callbacks to queued before forcing reschedule + on all cpus. 
+ rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b87aefa082e2..c2ec6c77874e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -98,13 +98,17 @@ struct rcu_data { long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; struct rcu_head **nxttail; - long count; /* # of queued items */ + long qlen; /* # of queued callbacks */ struct rcu_head *curlist; struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ int cpu; struct rcu_head barrier; +#ifdef CONFIG_SMP + long last_rs_qlen; /* qlen during the last resched */ +#endif }; DECLARE_PER_CPU(struct rcu_data, rcu_data); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0cf8146bd585..8cf15a569fcd 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10000; +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; +#ifdef CONFIG_SMP +static int rsinterval = 1000; +#endif + +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { + rdp->last_rs_qlen = rdp->qlen; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. 
+ */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif /** * call_rcu - Queue an RCU callback for invocation after a grace period. @@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *head, rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - - if (unlikely(++rdp->count > 10000)) - set_need_resched(); - + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } local_irq_restore(flags); } -static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; -static struct completion rcu_barrier_completion; - /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_head *head, rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - rdp->count++; -/* - * Should we directly call rcu_do_batch() here ? 
- * if (unlikely(rdp->count > 10000)) - * rcu_do_batch(rdp); - */ + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + local_irq_restore(flags); } @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data *rdp) next = rdp->donelist = list->next; list->func(list); list = next; - rdp->count--; - if (++count >= maxbatch) + rdp->qlen--; + if (++count >= rdp->blimit) break; } + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; if (!rdp->donelist) rdp->donetail = &rdp->donelist; else @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; + rdp->blimit = blimit; } static void __devinit rcu_online_cpu(int cpu) @@ -567,7 +602,12 @@ void synchronize_kernel(void) synchronize_rcu(); } -module_param(maxbatch, int, 0); +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); +#ifdef CONFIG_SMP +module_param(rsinterval, int, 0); +#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ -- cgit v1.2.3-71-gd317 From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:35 -0800 Subject: [PATCH] fix file counting I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. 
With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only a 2000+ objects in filp cache. The following patch I fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as an when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/file_table.c | 87 +++++++++++++++++++++++++++++++++------------------- include/linux/file.h | 2 -- include/linux/fs.h | 1 + kernel/sysctl.c | 5 ++- net/unix/af_unix.c | 2 +- 6 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index a173bba32666..11dc83092d4a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/file_table.c b/fs/file_table.c index 768b58167543..44fabeaa9415 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include #include #include #include @@ -19,52 +20,67 @@ #include #include #include +#include +#include + +#include /* sysctl tunables... 
*/ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +static inline void file_free_rcu(struct rcu_head *head) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +static inline void file_free(struct file *f) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } -static inline void file_free_rcu(struct rcu_head *head) +/* + * Return the total number of open files in the system + */ +static int get_nr_files(void) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + return percpu_counter_read_positive(&nr_files); } -static inline void file_free(struct file *f) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return files_stat.max_files; } +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && 
defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,14 +94,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +123,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } diff --git a/include/linux/file.h b/include/linux/file.h index 418b6101b59a..9901b850f2e4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -60,8 +60,6 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); struct kmem_cache; 
-extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags); -extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0cc34b1c42c9..51c0c93bdf93 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ struct files_stat_struct { int max_files; /* tunable */ }; extern struct files_stat_struct files_stat; +extern int get_max_files(void); struct inodes_stat_t { int nr_inodes; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index de2d9109194e..32b48e8ee36e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,9 @@ #include #include +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -943,7 +946,7 @@ static ctl_table fs_table[] = { .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_files, }, { .ctl_name = FS_MAXFILE, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1b5989b1b670..c323cc6a28b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -547,7 +547,7 @@ static struct sock * unix_create1(struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + if (atomic_read(&unix_nr_socks) >= 2*get_max_files()) goto out; sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); -- cgit v1.2.3-71-gd317 From 0ef675d491bd65028fa838015ebc6ce8abefab6f Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Thu, 9 Mar 2006 17:33:38 -0800 Subject: [PATCH] mtd: 64 bit fixes Fix some bugs in mtd/jffs2 on 64bit platform. The MEMGETBADBLOCK/MEMSETBADBLOCK ioctl are not listed in compat_ioctl.h. 
And some variables in jffs2 are declared as uint32_t but used to hold size_t values. Signed-off-by: Atsushi Nemoto Cc: Thomas Gleixner Acked-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/nodelist.c | 3 ++- fs/jffs2/readinode.c | 2 +- include/linux/compat_ioctl.h | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index b635e167a3fa..d4d0c41490cd 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info int err = 0, pointed = 0; struct jffs2_eraseblock *jeb; unsigned char *buffer; - uint32_t crc, ofs, retlen, len; + uint32_t crc, ofs, len; + size_t retlen; BUG_ON(tn->csize == 0); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 5f0652df5d47..f1695642d0f7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r * negative error code on failure. 
*/ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, - struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp, + struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, uint32_t *latest_mctime, uint32_t *mctime_ver) { struct jffs2_full_dirent *fd; diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 8fad50f8e389..ae7dfb790df3 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -696,6 +696,8 @@ COMPATIBLE_IOCTL(MEMLOCK) COMPATIBLE_IOCTL(MEMUNLOCK) COMPATIBLE_IOCTL(MEMGETREGIONCOUNT) COMPATIBLE_IOCTL(MEMGETREGIONINFO) +COMPATIBLE_IOCTL(MEMGETBADBLOCK) +COMPATIBLE_IOCTL(MEMSETBADBLOCK) /* NBD */ ULONG_IOCTL(NBD_SET_SOCK) ULONG_IOCTL(NBD_SET_BLKSIZE) -- cgit v1.2.3-71-gd317 From 8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 9 Mar 2006 17:33:54 -0800 Subject: [PATCH] slab: Node rotor for freeing alien caches and remote per cpu pages. The cache reaper currently tries to free all alien caches and all remote per cpu pages in each pass of cache_reap. For machines with a large number of nodes (such as Altix) this may lead to sporadic delays of around ~10ms. Interrupts are disabled while reclaiming, creating unacceptable delays. This patch changes that behavior by adding a per cpu reap_node variable. Instead of attempting to free all caches, we free only one alien cache and the per cpu pages from one remote node. That reduces the time spent in cache_reap. However, doing so will lengthen the time it takes to completely drain all remote per cpu pagesets and all alien caches. The time needed will grow with the number of nodes in the system. All caches are drained when they overflow their respective capacity. So the drawback here is only that a bit of memory may be wasted for a while longer. Details: 1.
Rename drain_remote_pages to drain_node_pages to allow the specification of the node to drain of pcp pages. 2. Add additional functions init_reap_node, next_reap_node for NUMA that manage a per cpu reap_node counter. 3. Add a reap_alien function that reaps only from the current reap_node. For us this seems to be a critical issue. Holdoffs of an average of ~7ms cause some HPC benchmarks to slow down significantly. F.e. NAS parallel slows down dramatically. NAS parallel has a 12-16 seconds runtime w/o rotor compared to 5.8 secs with the rotor patches. It gets down to 5.05 secs with the additional interrupt holdoff reductions. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- mm/page_alloc.c | 17 +++++++------- mm/slab.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 72 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 20f9148e38d9..7851e6b520cf 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -157,9 +157,9 @@ extern void FASTCALL(free_cold_page(struct page *page)); void page_alloc_init(void); #ifdef CONFIG_NUMA -void drain_remote_pages(void); +void drain_node_pages(int node); #else -static inline void drain_remote_pages(void) { }; +static inline void drain_node_pages(int node) { }; #endif #endif /* __LINUX_GFP_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 791690d7d3fa..234bd4895d14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } #ifdef CONFIG_NUMA -/* Called from the slab reaper to drain remote pagesets */ -void drain_remote_pages(void) +/* + * Called from the slab reaper to drain pagesets on a particular node that + * belong to the currently executing processor. 
+ */ +void drain_node_pages(int nodeid) { - struct zone *zone; - int i; + int i, z; unsigned long flags; local_irq_save(flags); - for_each_zone(zone) { + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; - /* Do not drain local pagesets */ - if (zone->zone_pgdat->node_id == numa_node_id()) - continue; - pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; diff --git a/mm/slab.c b/mm/slab.c index 61800b88e241..d0bd7f07ab04 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char * dump_stack(); } +#ifdef CONFIG_NUMA +/* + * Special reaping functions for NUMA systems called from cache_reap(). + * These take care of doing round robin flushing of alien caches (containing + * objects freed on different nodes from which they were allocated) and the + * flushing of remote pcps by calling drain_node_pages. + */ +static DEFINE_PER_CPU(unsigned long, reap_node); + +static void init_reap_node(int cpu) +{ + int node; + + node = next_node(cpu_to_node(cpu), node_online_map); + if (node == MAX_NUMNODES) + node = 0; + + __get_cpu_var(reap_node) = node; +} + +static void next_reap_node(void) +{ + int node = __get_cpu_var(reap_node); + + /* + * Also drain per cpu pages on remote zones + */ + if (node != numa_node_id()) + drain_node_pages(node); + + node = next_node(node, node_online_map); + if (unlikely(node >= MAX_NUMNODES)) + node = first_node(node_online_map); + __get_cpu_var(reap_node) = node; +} + +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + /* * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz * via the workqueue/eventd. @@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu) * at that time. 
*/ if (keventd_up() && reap_work->func == NULL) { + init_reap_node(cpu); INIT_WORK(reap_work, cache_reap, NULL); schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); } @@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep, } } +/* + * Called from cache_reap() to regularly drain alien caches round robin. + */ +static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +{ + int node = __get_cpu_var(reap_node); + + if (l3->alien) { + struct array_cache *ac = l3->alien[node]; + if (ac && ac->avail) { + spin_lock_irq(&ac->lock); + __drain_alien_cache(cachep, ac, node); + spin_unlock_irq(&ac->lock); + } + } +} + static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { int i = 0; @@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al #else #define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, l3) do { } while (0) static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -3497,8 +3557,7 @@ static void cache_reap(void *unused) check_irq_on(); l3 = searchp->nodelists[numa_node_id()]; - if (l3->alien) - drain_alien_cache(searchp, l3->alien); + reap_alien(searchp, l3); spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, cpu_cache_get(searchp), 0, @@ -3548,7 +3607,7 @@ static void cache_reap(void *unused) } check_irq_on(); mutex_unlock(&cache_chain_mutex); - drain_remote_pages(); + next_reap_node(); /* Setup the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } -- cgit v1.2.3-71-gd317 From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 11 Mar 2006 03:27:13 -0800 Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside This patch fixes illegal __GFP_FS allocation inside ext3 transaction in ext3_symlink(). Such allocation may re-enter ext3 code from try_to_free_pages. 
But JBD/ext3 code keeps a pointer to the current journal handle in task_struct and, hence, is not reentrant. This bug led to "Assertion failure in journal_dirty_metadata()" messages. http://bugzilla.openvz.org/show_bug.cgi?id=115 Signed-off-by: Andrey Savochkin Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 3 ++- fs/namei.c | 13 +++++++++++-- include/linux/fs.h | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8bd8ac077704..b8f5cd1e540d 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2141,7 +2141,8 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = page_symlink(inode, symname, l); + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { ext3_dec_count(handle, inode); ext3_mark_inode_dirty(handle, inode); diff --git a/fs/namei.c b/fs/namei.c index 557dcf395ca1..8dc2b038d5d9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int page_symlink(struct inode *inode, const char *symname, int len) +int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); @@ -2654,6 +2656,12 @@ fail: return err; } +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + mapping_gfp_mask(inode->i_mapping)); +} + struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -2672,6 +2680,7
@@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); diff --git a/include/linux/fs.h b/include/linux/fs.h index 51c0c93bdf93..128d0082522c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1664,6 +1664,8 @@ extern int vfs_follow_link(struct nameidata *, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern void *page_follow_link_light(struct dentry *, struct nameidata *); extern void page_put_link(struct dentry *, struct nameidata *, void *); +extern int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask); extern int page_symlink(struct inode *inode, const char *symname, int len); extern struct inode_operations page_symlink_inode_operations; extern int generic_readlink(struct dentry *, char __user *, int); -- cgit v1.2.3-71-gd317 From 7cd9013be6c22f3ff6f777354f766c8c0b955e17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 11 Mar 2006 03:27:18 -0800 Subject: [PATCH] remove __put_task_struct_cb export again The patch '[PATCH] RCU signal handling' [1] added an export for __put_task_struct_cb, a put_task_struct helper newly introduced in that patch. But put_task_struct couldn't be used from modules previously as __put_task_struct wasn't exported. There are no callers of it in modular code, and it shouldn't be exported because we don't want drivers to hold references to task_structs. This patch removes the export and folds __put_task_struct into __put_task_struct_cb as there's no other caller. [1] http://www2.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e56d090310d7625ecb43a1eeebd479f04affb48b Signed-off-by: Christoph Hellwig Acked-by: Paul E.
McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/fork.c | 4 +++- kernel/sched.c | 7 ------- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff2e09c953b9..62e6314382f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -892,7 +892,6 @@ static inline int pid_alive(struct task_struct *p) } extern void free_task(struct task_struct *tsk); -extern void __put_task_struct(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) extern void __put_task_struct_cb(struct rcu_head *rhp); diff --git a/kernel/fork.c b/kernel/fork.c index fbea12d7a943..a8eab86de7f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -108,8 +108,10 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); diff --git a/kernel/sched.c b/kernel/sched.c index e82c99f1db64..4d46e90f59c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -178,13 +178,6 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) -void __put_task_struct_cb(struct rcu_head *rhp) -{ - __put_task_struct(container_of(rhp, struct task_struct, rcu)); -} - -EXPORT_SYMBOL_GPL(__put_task_struct_cb); - /* * These are the runqueue data structures: */ -- cgit v1.2.3-71-gd317 From 4a29cc2e503b33a1e96db4c3f9a94165f153f259 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 19 Mar 2006 13:21:12 -0800 Subject: [TG3]: 40-bit DMA workaround part 2 The 40-bit DMA workaround recently implemented for 5714, 5715, and 5780 needs to be expanded 
because there may be other tg3 devices behind the EPB Express to PCIX bridge in the 5780 class device. For example, some 4-port card or mother board designs have 5704 behind the 5714. All devices behind the EPB require the 40-bit DMA workaround. Thanks to Chris Elmquist again for reporting the problem and testing the patch. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 52 ++++++++++++++++++++++++++++++++++++++++--------- drivers/net/tg3.h | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 45 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b8f1524da557..caf4102b54ce 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -9552,12 +9552,36 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) } } - /* Find msi capability. */ + /* The EPB bridge inside 5714, 5715, and 5780 cannot support + * DMA addresses > 40-bit. This bridge may have other additional + * 57xx devices behind it in some 4-port NIC designs for example. + * Any tg3 device found behind the bridge will also need the 40-bit + * DMA workaround. + */ if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780 || GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5714) { tp->tg3_flags2 |= TG3_FLG2_5780_CLASS; + tp->tg3_flags |= TG3_FLAG_40BIT_DMA_BUG; tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI); } + else { + struct pci_dev *bridge = NULL; + + do { + bridge = pci_get_device(PCI_VENDOR_ID_SERVERWORKS, + PCI_DEVICE_ID_SERVERWORKS_EPB, + bridge); + if (bridge && bridge->subordinate && + (bridge->subordinate->number <= + tp->pdev->bus->number) && + (bridge->subordinate->subordinate >= + tp->pdev->bus->number)) { + tp->tg3_flags |= TG3_FLAG_40BIT_DMA_BUG; + pci_dev_put(bridge); + break; + } + } while (bridge); + } /* Initialize misc host control in PCI block. 
*/ tp->misc_host_ctrl |= (misc_ctrl_reg & @@ -10303,7 +10327,14 @@ static int __devinit tg3_test_dma(struct tg3 *tp) GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704) { u32 ccval = (tr32(TG3PCI_CLOCK_CTRL) & 0x1f); - if (ccval == 0x6 || ccval == 0x7) + /* If the 5704 is behind the EPB bridge, we can + * do the less restrictive ONE_DMA workaround for + * better performance. + */ + if ((tp->tg3_flags & TG3_FLAG_40BIT_DMA_BUG) && + GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704) + tp->dma_rwctrl |= 0x8000; + else if (ccval == 0x6 || ccval == 0x7) tp->dma_rwctrl |= DMA_RWCTRL_ONE_DMA; /* Set bit 23 to enable PCIX hw bug fix */ @@ -10759,19 +10790,20 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, goto err_out_iounmap; } - /* 5714, 5715 and 5780 cannot support DMA addresses > 40-bit. + /* The EPB bridge inside 5714, 5715, and 5780 and any + * device behind the EPB cannot support DMA addresses > 40-bit. * On 64-bit systems with IOMMU, use 40-bit dma_mask. * On 64-bit systems without IOMMU, use 64-bit dma_mask and * do DMA address check in tg3_start_xmit(). */ - if (tp->tg3_flags2 & TG3_FLG2_5780_CLASS) { + if (tp->tg3_flags2 & TG3_FLG2_IS_5788) + persist_dma_mask = dma_mask = DMA_32BIT_MASK; + else if (tp->tg3_flags & TG3_FLAG_40BIT_DMA_BUG) { persist_dma_mask = dma_mask = DMA_40BIT_MASK; #ifdef CONFIG_HIGHMEM dma_mask = DMA_64BIT_MASK; #endif - } else if (tp->tg3_flags2 & TG3_FLG2_IS_5788) - persist_dma_mask = dma_mask = DMA_32BIT_MASK; - else + } else persist_dma_mask = dma_mask = DMA_64BIT_MASK; /* Configure DMA attributes. */ @@ -10908,8 +10940,10 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, (tp->tg3_flags & TG3_FLAG_SPLIT_MODE) != 0, (tp->tg3_flags2 & TG3_FLG2_NO_ETH_WIRE_SPEED) == 0, (tp->tg3_flags2 & TG3_FLG2_TSO_CAPABLE) != 0); - printk(KERN_INFO "%s: dma_rwctrl[%08x]\n", - dev->name, tp->dma_rwctrl); + printk(KERN_INFO "%s: dma_rwctrl[%08x] dma_mask[%d-bit]\n", + dev->name, tp->dma_rwctrl, + (pdev->dma_mask == DMA_32BIT_MASK) ? 
32 : + (((u64) pdev->dma_mask == DMA_40BIT_MASK) ? 40 : 64)); return 0; diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 7f4b7f6ac40d..7e3b613afb29 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2163,6 +2163,7 @@ struct tg3 { #define TG3_FLAG_10_100_ONLY 0x01000000 #define TG3_FLAG_PAUSE_AUTONEG 0x02000000 #define TG3_FLAG_IN_RESET_TASK 0x04000000 +#define TG3_FLAG_40BIT_DMA_BUG 0x08000000 #define TG3_FLAG_BROKEN_CHECKSUMS 0x10000000 #define TG3_FLAG_GOT_SERDES_FLOWCTL 0x20000000 #define TG3_FLAG_SPLIT_MODE 0x40000000 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1709b5009d2e..751eea58bde8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1365,6 +1365,7 @@ #define PCI_DEVICE_ID_SERVERWORKS_HE 0x0008 #define PCI_DEVICE_ID_SERVERWORKS_LE 0x0009 #define PCI_DEVICE_ID_SERVERWORKS_GCNB_LE 0x0017 +#define PCI_DEVICE_ID_SERVERWORKS_EPB 0x0103 #define PCI_DEVICE_ID_SERVERWORKS_OSB4 0x0200 #define PCI_DEVICE_ID_SERVERWORKS_CSB5 0x0201 #define PCI_DEVICE_ID_SERVERWORKS_CSB6 0x0203 -- cgit v1.2.3-71-gd317