From c12d2da56d0e07d230968ee2305aaa86b93a6832 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Apr 2016 10:24:58 +0200 Subject: mm/gup: Remove the macro overload API migration helpers from the get_user*() APIs The pkeys changes brought about a truly hideous set of macros in: cde70140fed8 ("mm/gup: Overload get_user_pages() functions") ... which macros are (ab-)using the fact that __VA_ARGS__ can be used to shift parameter positions in macro arguments without breaking the build and so can be used to call separate C functions depending on the number of arguments of the macro. This allowed easy migration of these 3 GUP APIs, as both these variants worked at the C level: old: ret = get_user_pages(current, current->mm, address, 1, 1, 0, &page, NULL); new: ret = get_user_pages(address, 1, 1, 0, &page, NULL); ... while we also generated a (functionally harmless but noticeable) build time warning if the old API was used. As there are over 300 uses of these APIs, this trick eased the migration of the API and avoided excessive migration pain in linux-next. Now, with its work done, get rid of all of that complication and ugliness: 3 files changed, 16 insertions(+), 140 deletions(-) ... where the linecount of the migration hack was further inflated by the fact that there are NOMMU variants of these GUP APIs as well. Much of the conversion was done in linux-next over the past couple of months, and Linus recently removed all remaining old API uses from the upstream tree in the following upstrea commit: cb107161df3c ("Convert straggling drivers to new six-argument get_user_pages()") There was one more old-API usage in mm/gup.c, in the CONFIG_HAVE_GENERIC_RCU_GUP code path that ARM, ARM64 and PowerPC uses. After this commit any old API usage will break the build. [ Also fixed a PowerPC/HAVE_GENERIC_RCU_GUP warning reported by Stephen Rothwell. ] Cc: Andrew Morton Cc: Dave Hansen Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Rothwell Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- include/linux/mm.h | 64 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ed6407d1b7b5..d6508a025a31 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1250,78 +1250,20 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages6(unsigned long start, unsigned long nr_pages, +long get_user_pages(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, unsigned int gup_flags); -long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); -/* suppress warnings from use in EXPORT_SYMBOL() */ -#ifndef __DISABLE_GUP_DEPRECATED -#define __gup_deprecated __deprecated -#else -#define __gup_deprecated -#endif -/* - * These macros provide backward-compatibility with the old - * get_user_pages() variants which took tsk/mm. These - * functions/macros provide both compile-time __deprecated so we - * can catch old-style use and not break the build. The actual - * functions also have WARN_ON()s to let us know at runtime if - * the get_user_pages() should have been the "remote" variant. - * - * These are hideous, but temporary. - * - * If you run into one of these __deprecated warnings, look - * at how you are calling get_user_pages(). If you are calling - * it with current/current->mm as the first two arguments, - * simply remove those arguments. The behavior will be the same - * as it is now. If you are calling it on another task, use - * get_user_pages_remote() instead. - * - * Any questions? Ask Dave Hansen - */ -long -__gup_deprecated -get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - struct vm_area_struct **vmas); -#define GUP_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages, ...) \ - get_user_pages -#define get_user_pages(...) GUP_MACRO(__VA_ARGS__, \ - get_user_pages8, x, \ - get_user_pages6, x, x, x, x, x)(__VA_ARGS__) - -__gup_deprecated -long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked); -#define GUPL_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages_locked, ...) \ - get_user_pages_locked -#define get_user_pages_locked(...) GUPL_MACRO(__VA_ARGS__, \ - get_user_pages_locked8, x, \ - get_user_pages_locked6, x, x, x, x)(__VA_ARGS__) - -__gup_deprecated -long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages); -#define GUPU_MACRO(_1, _2, _3, _4, _5, _6, _7, get_user_pages_unlocked, ...) \ - get_user_pages_unlocked -#define get_user_pages_unlocked(...) GUPU_MACRO(__VA_ARGS__, \ - get_user_pages_unlocked7, x, \ - get_user_pages_unlocked5, x, x, x, x)(__VA_ARGS__) - /* Container for pinned pfns / pages */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ -- cgit v1.2.3-71-gd317 From b32e4482aadfd1322357f46d4ed8a990603664d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 15:51:57 -0700 Subject: fscrypto: don't let data integrity writebacks fail with ENOMEM This patch fixes the issue introduced by the ext4 crypto fix in a same manner. For F2FS, however, we flush the pending IOs and wait for a while to acquire free memory. Fixes: c9af28fdd4492 ("ext4 crypto: don't let data integrity writebacks fail with ENOMEM") Cc: Theodore Ts'o Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 36 ++++++++++++++++++++---------------- fs/f2fs/data.c | 16 +++++++++++++--- include/linux/fscrypto.h | 9 +++++---- 3 files changed, 38 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 58ae0ba91ca2..da70520f3ab4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -81,13 +81,14 @@ EXPORT_SYMBOL(fscrypt_release_ctx); /** * fscrypt_get_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation * * Allocates and initializes an encryption context. * * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -113,7 +114,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); if (!ctx) { - ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); if (!ctx) return ERR_PTR(-ENOMEM); ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; @@ -147,7 +148,8 @@ typedef enum { static int do_page_crypto(struct inode *inode, fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page) + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) { u8 xts_tweak[FS_XTS_TWEAK_SIZE]; struct skcipher_request *req = NULL; @@ -157,7 +159,7 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -199,10 +201,9 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) { - ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, - GFP_NOWAIT); + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); ctx->flags |= FS_WRITE_PATH_FL; @@ -213,6 +214,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * fscypt_encrypt_page() - Encrypts a page * @inode: The inode for which the encryption should take place * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation * * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx * encryption context. @@ -225,7 +227,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * error value or NULL. */ struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page) + struct page *plaintext_page, gfp_t gfp_flags) { struct fscrypt_ctx *ctx; struct page *ciphertext_page = NULL; @@ -233,18 +235,19 @@ struct page *fscrypt_encrypt_page(struct inode *inode, BUG_ON(!PageLocked(plaintext_page)); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); + plaintext_page, ciphertext_page, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -275,7 +278,7 @@ int fscrypt_decrypt_page(struct page *page) BUG_ON(!PageLocked(page)); return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page); + FS_DECRYPT, page->index, page, page, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); @@ -289,11 +292,11 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); if (IS_ERR(ciphertext_page)) { err = PTR_ERR(ciphertext_page); goto errout; @@ -301,11 +304,12 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page); + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); if (err) goto errout; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOWAIT, 1); if (!bio) { err = -ENOMEM; goto errout; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 53fec0872e60..5dafb9cef12e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -992,7 +992,7 @@ submit_and_realloc: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; @@ -1092,14 +1092,24 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), fio->old_blkaddr); - - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index cd91f75de49b..6027f6bbb061 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -263,9 +263,9 @@ static inline void fscrypt_set_d_op(struct dentry *dentry) extern struct kmem_cache *fscrypt_info_cachep; int fscrypt_initialize(void); -extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *); +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); -extern struct page *fscrypt_encrypt_page(struct inode *, struct page *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); extern int fscrypt_decrypt_page(struct page *); extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); @@ -299,7 +299,8 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, #endif /* crypto.c */ -static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i) +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) { return ERR_PTR(-EOPNOTSUPP); } @@ -310,7 +311,7 @@ static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) } static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, - struct page *p) + struct page *p, gfp_t f) { return ERR_PTR(-EOPNOTSUPP); } -- cgit v1.2.3-71-gd317 From 1363074667a6b7d0507527742ccd7bbed5e3ceaa Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 12 Apr 2016 12:27:09 +0200 Subject: USB: uas: Add a new NO_REPORT_LUNS quirk Add a new NO_REPORT_LUNS quirk and set it for Seagate drives with an usb-id of: 0bc2:331a, as these will fail to respond to a REPORT_LUNS command. Cc: stable@vger.kernel.org Reported-and-tested-by: David Webb Signed-off-by: Hans de Goede Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 2 ++ drivers/usb/storage/uas.c | 14 +++++++++++++- drivers/usb/storage/unusual_uas.h | 7 +++++++ drivers/usb/storage/usb.c | 5 ++++- include/linux/usb_usual.h | 2 ++ 5 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ecc74fa4bfde..0b3de80ec8f6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4077,6 +4077,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sector if the number is odd); i = IGNORE_DEVICE (don't bind to this device); + j = NO_REPORT_LUNS (don't use report luns + command, uas only); l = NOT_LOCKABLE (don't try to lock and unlock ejectable media); m = MAX_SECTORS_64 (don't transfer more diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index b1ec7499166d..16bc679dc2fc 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -2,7 +2,7 @@ * USB Attached SCSI * Note that this is not the same as the USB Mass Storage driver * - * Copyright Hans de Goede for Red Hat, Inc. 2013 - 2014 + * Copyright Hans de Goede for Red Hat, Inc. 2013 - 2016 * Copyright Matthew Wilcox for Intel Corp, 2010 * Copyright Sarah Sharp for Intel Corp, 2010 * @@ -781,6 +781,17 @@ static int uas_eh_bus_reset_handler(struct scsi_cmnd *cmnd) return SUCCESS; } +static int uas_target_alloc(struct scsi_target *starget) +{ + struct uas_dev_info *devinfo = (struct uas_dev_info *) + dev_to_shost(starget->dev.parent)->hostdata; + + if (devinfo->flags & US_FL_NO_REPORT_LUNS) + starget->no_report_luns = 1; + + return 0; +} + static int uas_slave_alloc(struct scsi_device *sdev) { struct uas_dev_info *devinfo = @@ -831,6 +842,7 @@ static struct scsi_host_template uas_host_template = { .module = THIS_MODULE, .name = "uas", .queuecommand = uas_queuecommand, + .target_alloc = uas_target_alloc, .slave_alloc = uas_slave_alloc, .slave_configure = uas_slave_configure, .eh_abort_handler = uas_eh_abort_handler, diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index ccc113e83d88..53341a77d89f 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -64,6 +64,13 @@ UNUSUAL_DEV(0x0bc2, 0x3312, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NO_ATA_1X), +/* Reported-by: David Webb */ +UNUSUAL_DEV(0x0bc2, 0x331a, 0x0000, 0x9999, + "Seagate", + "Expansion Desk", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_REPORT_LUNS), + /* Reported-by: Hans de Goede */ UNUSUAL_DEV(0x0bc2, 0x3320, 0x0000, 0x9999, "Seagate", diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index 43576ed31ccd..9de988a0f856 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -482,7 +482,7 @@ void usb_stor_adjust_quirks(struct usb_device *udev, unsigned long *fflags) US_FL_NO_READ_DISC_INFO | US_FL_NO_READ_CAPACITY_16 | US_FL_INITIAL_READ10 | US_FL_WRITE_CACHE | US_FL_NO_ATA_1X | US_FL_NO_REPORT_OPCODES | - US_FL_MAX_SECTORS_240); + US_FL_MAX_SECTORS_240 | US_FL_NO_REPORT_LUNS); p = quirks; while (*p) { @@ -532,6 +532,9 @@ void usb_stor_adjust_quirks(struct usb_device *udev, unsigned long *fflags) case 'i': f |= US_FL_IGNORE_DEVICE; break; + case 'j': + f |= US_FL_NO_REPORT_LUNS; + break; case 'l': f |= US_FL_NOT_LOCKABLE; break; diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h index 7f5f78bd15ad..245f57dbbb61 100644 --- a/include/linux/usb_usual.h +++ b/include/linux/usb_usual.h @@ -79,6 +79,8 @@ /* Cannot handle MI_REPORT_SUPPORTED_OPERATION_CODES */ \ US_FLAG(MAX_SECTORS_240, 0x08000000) \ /* Sets max_sectors to 240 */ \ + US_FLAG(NO_REPORT_LUNS, 0x10000000) \ + /* Cannot handle REPORT_LUNS */ \ #define US_FLAG(name, value) US_FL_##name = value , enum { US_DO_ALL_FLAGS }; -- cgit v1.2.3-71-gd317 From cba2e47abcbd80e3f46f460899290402f98090ec Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Apr 2016 18:10:52 -0600 Subject: pmem: fix BUG() error in pmem.h:48 on X86_32 After 'commit fc0c2028135c ("x86, pmem: use memcpy_mcsafe() for memcpy_from_pmem()")', probing a PMEM device hits the BUG() error below on X86_32 kernel. kernel BUG at include/linux/pmem.h:48! memcpy_from_pmem() calls arch_memcpy_from_pmem(), which is unimplemented since CONFIG_ARCH_HAS_PMEM_API is undefined on X86_32. Fix the BUG() error by adding default_memcpy_from_pmem(). Acked-by: Dan Williams Signed-off-by: Toshi Kani Signed-off-by: Ross Zwisler Cc: Dan Williams Cc: Ross Zwisler --- include/linux/pmem.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pmem.h b/include/linux/pmem.h index ac6d872ce067..57d146fe44dd 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -72,6 +72,18 @@ static inline void arch_invalidate_pmem(void __pmem *addr, size_t size) } #endif +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); +} + +static inline int default_memcpy_from_pmem(void *dst, void __pmem const *src, + size_t size) +{ + memcpy(dst, (void __force *) src, size); + return 0; +} + /* * memcpy_from_pmem - read from persistent memory with error handling * @dst: destination buffer @@ -83,12 +95,10 @@ static inline void arch_invalidate_pmem(void __pmem *addr, size_t size) static inline int memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { - return arch_memcpy_from_pmem(dst, src, size); -} - -static inline bool arch_has_pmem_api(void) -{ - return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); + if (arch_has_pmem_api()) + return arch_memcpy_from_pmem(dst, src, size); + else + return default_memcpy_from_pmem(dst, src, size); } /** -- cgit v1.2.3-71-gd317 From 34dbbcdbf63360661ff7bda6c5f52f99ac515f92 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Apr 2016 11:22:00 -0700 Subject: Make file credentials available to the seqfile interfaces A lot of seqfile users seem to be using things like %pK that uses the credentials of the current process, but that is actually completely wrong for filesystem interfaces. The unix semantics for permission checking files is to check permissions at _open_ time, not at read or write time, and that is not just a small detail: passing off stdin/stdout/stderr to a suid application and making the actual IO happen in privileged context is a classic exploit technique. So if we want to be able to look at permissions at read time, we need to use the file open credentials, not the current ones. Normal file accesses can just use "f_cred" (or any of the helper functions that do that, like file_ns_capable()), but the seqfile interfaces do not have any such options. It turns out that seq_file _does_ save away the user_ns information of the file, though. Since user_ns is just part of the full credential information, replace that special case with saving off the cred pointer instead, and suddenly seq_file has all the permission information it needs. Signed-off-by: Linus Torvalds --- fs/seq_file.c | 7 ++++--- include/linux/seq_file.h | 13 ++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/seq_file.c b/fs/seq_file.c index e85664b7c7d9..19f532e7d35e 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -72,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op) mutex_init(&p->lock); p->op = op; -#ifdef CONFIG_USER_NS - p->user_ns = file->f_cred->user_ns; -#endif + + // No refcounting: the lifetime of 'p' is constrained + // to the lifetime of the file. + p->file = file; /* * Wrappers around seq_open(e.g. swaps_open) need to be diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dde00defbaa5..f3d45dd42695 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -7,13 +7,10 @@ #include #include #include +#include +#include struct seq_operations; -struct file; -struct path; -struct inode; -struct dentry; -struct user_namespace; struct seq_file { char *buf; @@ -27,9 +24,7 @@ struct seq_file { struct mutex lock; const struct seq_operations *op; int poll_event; -#ifdef CONFIG_USER_NS - struct user_namespace *user_ns; -#endif + const struct file *file; void *private; }; @@ -147,7 +142,7 @@ int seq_release_private(struct inode *, struct file *); static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS - return seq->user_ns; + return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; -- cgit v1.2.3-71-gd317 From d894ba18d4e449b3a7f6eb491f16c9e02933736e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 12 Apr 2016 13:11:25 -0400 Subject: soreuseport: fix ordering for mixed v4/v6 sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the SO_REUSEPORT socket option, it is possible to create sockets in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address. This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on the AF_INET6 sockets. Prior to the commits referenced below, an incoming IPv4 packet would always be routed to a socket of type AF_INET when this mixed-mode was used. After those changes, the same packet would be routed to the most recently bound socket (if this happened to be an AF_INET6 socket, it would have an IPv4 mapped IPv6 address). The change in behavior occurred because the recent SO_REUSEPORT optimizations short-circuit the socket scoring logic as soon as they find a match. They did not take into account the scoring logic that favors AF_INET sockets over AF_INET6 sockets in the event of a tie. To fix this problem, this patch changes the insertion order of AF_INET and AF_INET6 addresses in the TCP and UDP socket lists when the sockets have SO_REUSEPORT set. AF_INET sockets will be inserted at the head of the list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at the tail of the list. This will force AF_INET sockets to always be considered first. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Maciej Żenczykowski Signed-off-by: Craig Gallek Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 39 +++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 6 +++++- net/ipv4/udp.c | 9 +++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 1c33dd7da4a7..4ae95f7e8597 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, if (!is_a_nulls(first)) first->pprev = &n->next; } + +/** + * hlist_nulls_add_tail_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the end of the specified hlist_nulls, + * while permitting racing traversals. NOTE: tail insertion requires + * list traversal. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *i, *last = NULL; + + for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); + i = hlist_nulls_next_rcu(i)) + last = i; + + if (last) { + n->next = last->next; + n->pprev = &last->next; + rcu_assign_pointer(hlist_nulls_next_rcu(last), n); + } else { + hlist_nulls_add_head_rcu(n, h); + } +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/sock.h b/include/net/sock.h index 255d3e03727b..121ffc115c4f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); + else + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..a2e7f55a1f61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -339,8 +339,13 @@ found: hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } -- cgit v1.2.3-71-gd317 From cb92148b58a49455f3a7204eba3aee09a8b7683c Mon Sep 17 00:00:00 2001 From: Hariprasad Shenai Date: Fri, 15 Apr 2016 13:00:11 -0500 Subject: PCI: Add pci_set_vpd_size() to set VPD size After 104daa71b396 ("PCI: Determine actual VPD size on first access"), the PCI core computes the valid VPD size by parsing the VPD starting at offset 0x0. We don't attempt to read past that valid size because that causes some devices to crash. However, some devices do have data past that valid size. For example, Chelsio adapters contain two VPD structures, and the driver needs both of them. Add pci_set_vpd_size(). If a driver knows it is safe to read past the end of the VPD data structure at offset 0, it can use pci_set_vpd_size() to allow access to as much data as it needs. [bhelgaas: changelog, split patches, rename to pci_set_vpd_size() and return int (not ssize_t)] Fixes: 104daa71b396 ("PCI: Determine actual VPD size on first access") Tested-by: Steve Wise Signed-off-by: Casey Leedom Signed-off-by: Hariprasad Shenai Signed-off-by: Bjorn Helgaas --- drivers/pci/access.c | 42 ++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 1 + include/linux/pci.h | 1 + 3 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 01b9d0a00abc..d11cdbb8fba3 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -275,6 +275,19 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void } EXPORT_SYMBOL(pci_write_vpd); +/** + * pci_set_vpd_size - Set size of Vital Product Data space + * @dev: pci device struct + * @len: size of vpd space + */ +int pci_set_vpd_size(struct pci_dev *dev, size_t len) +{ + if (!dev->vpd || !dev->vpd->ops) + return -ENODEV; + return dev->vpd->ops->set_size(dev, len); +} +EXPORT_SYMBOL(pci_set_vpd_size); + #define PCI_VPD_MAX_SIZE (PCI_VPD_ADDR_MASK + 1) /** @@ -498,9 +511,23 @@ out: return ret ? ret : count; } +static int pci_vpd_set_size(struct pci_dev *dev, size_t len) +{ + struct pci_vpd *vpd = dev->vpd; + + if (len == 0 || len > PCI_VPD_MAX_SIZE) + return -EIO; + + vpd->valid = 1; + vpd->len = len; + + return 0; +} + static const struct pci_vpd_ops pci_vpd_ops = { .read = pci_vpd_read, .write = pci_vpd_write, + .set_size = pci_vpd_set_size, }; static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, @@ -533,9 +560,24 @@ static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count, return ret; } +static int pci_vpd_f0_set_size(struct pci_dev *dev, size_t len) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, + PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); + int ret; + + if (!tdev) + return -ENODEV; + + ret = pci_set_vpd_size(tdev, len); + pci_dev_put(tdev); + return ret; +} + static const struct pci_vpd_ops pci_vpd_f0_ops = { .read = pci_vpd_f0_read, .write = pci_vpd_f0_write, + .set_size = pci_vpd_f0_set_size, }; int pci_vpd_init(struct pci_dev *dev) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d0fb93481573..a814bbb80fcb 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -97,6 +97,7 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev) struct pci_vpd_ops { ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); + int (*set_size)(struct pci_dev *dev, size_t len); }; struct pci_vpd { diff --git a/include/linux/pci.h b/include/linux/pci.h index 004b8133417d..932ec74909c6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1111,6 +1111,7 @@ void pci_unlock_rescan_remove(void); /* Vital product data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); +int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); -- cgit v1.2.3-71-gd317 From 67245ff332064c01b760afa7a384ccda024bfd24 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2016 15:16:07 -0700 Subject: devpts: clean up interface to pty drivers This gets rid of the horrible notion of having that struct inode *ptmx_inode be the linchpin of the interface between the pty code and devpts. By de-emphasizing the ptmx inode, a lot of things actually get cleaner, and we will have a much saner way forward. In particular, this will allow us to associate with any particular devpts instance at open-time, and not be artificially tied to one particular ptmx inode. The patch itself is actually fairly straightforward, and apart from some locking and return path cleanups it's pretty mechanical: - the interfaces that devpts exposes all take "struct pts_fs_info *" instead of "struct inode *ptmx_inode" now. NOTE! The "struct pts_fs_info" thing is a completely opaque structure as far as the pty driver is concerned: it's still declared entirely internally to devpts. So the pty code can't actually access it in any way, just pass it as a "cookie" to the devpts code. - the "look up the pts fs info" is now a single clear operation, that also does the reference count increment on the pts superblock. So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get ref" operation (devpts_get_ref(inode)), along with a "put ref" op (devpts_put_ref()). - the pty master "tty->driver_data" field now contains the pts_fs_info, not the ptmx inode. - because we don't care about the ptmx inode any more as some kind of base index, the ref counting can now drop the inode games - it just gets the ref on the superblock. - the pts_fs_info now has a back-pointer to the super_block. That's so that we can easily look up the information we actually need. Although quite often, the pts fs info was actually all we wanted, and not having to look it up based on some magical inode makes things more straightforward. In particular, now that "devpts_get_ref(inode)" operation should really be the *only* place we need to look up what devpts instance we're associated with, and we do it exactly once, at ptmx_open() time. The other side of this is that one ptmx node could now be associated with multiple different devpts instances - you could have a single /dev/ptmx node, and then have multiple mount namespaces with their own instances of devpts mounted on /dev/pts/. And that's all perfectly sane in a model where we just look up the pts instance at open time. This will eventually allow us to get rid of our odd single-vs-multiple pts instance model, but this patch in itself changes no semantics, only an internal binding model. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 63 ++++++++++++++++++++++------------------------- fs/devpts/inode.c | 49 ++++++++++++++++++------------------ include/linux/devpts_fs.h | 34 ++++++++----------------- 3 files changed, 64 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index e16a49b507ef..0058d9fbf931 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -663,14 +663,14 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { - struct inode *ptmx_inode; + struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) - ptmx_inode = tty->driver_data; + fsi = tty->driver_data; else - ptmx_inode = tty->link->driver_data; - devpts_kill_index(ptmx_inode, tty->index); - devpts_del_ref(ptmx_inode); + fsi = tty->link->driver_data; + devpts_kill_index(fsi, tty->index); + devpts_put_ref(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -720,6 +720,7 @@ static const struct tty_operations pty_unix98_ops = { static int ptmx_open(struct inode *inode, struct file *filp) { + struct pts_fs_info *fsi; struct tty_struct *tty; struct inode *slave_inode; int retval; @@ -734,47 +735,41 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; + fsi = devpts_get_ref(inode, filp); + retval = -ENODEV; + if (!fsi) + goto out_free_file; + /* find a device that is not in use. */ mutex_lock(&devpts_mutex); - index = devpts_new_index(inode); - if (index < 0) { - retval = index; - mutex_unlock(&devpts_mutex); - goto err_file; - } - + index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); - mutex_lock(&tty_mutex); - tty = tty_init_dev(ptm_driver, index); + retval = index; + if (index < 0) + goto out_put_ref; - if (IS_ERR(tty)) { - retval = PTR_ERR(tty); - goto out; - } + mutex_lock(&tty_mutex); + tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - tty->driver_data = inode; + retval = PTR_ERR(tty); + if (IS_ERR(tty)) + goto out; /* - * In the case where all references to ptmx inode are dropped and we - * still have /dev/tty opened pointing to the master/slave pair (ptmx - * is closed/released before /dev/tty), we must make sure that the inode - * is still valid when we call the final pty_unix98_shutdown, thus we - * hold an additional reference to the ptmx inode. For the same /dev/tty - * last close case, we also need to make sure the super_block isn't - * destroyed (devpts instance unmounted), before /dev/tty is closed and - * on its release devpts_kill_index is called. + * From here on out, the tty is "live", and the index and + * fsi will be killed/put by the tty_release() */ - devpts_add_ref(inode); + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + tty->driver_data = fsi; tty_add_file(tty, filp); - slave_inode = devpts_pty_new(inode, + slave_inode = devpts_pty_new(fsi, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index, tty->link); if (IS_ERR(slave_inode)) { @@ -793,12 +788,14 @@ static int ptmx_open(struct inode *inode, struct file *filp) return 0; err_release: tty_unlock(tty); + // This will also put-ref the fsi tty_release(inode, filp); return retval; out: - mutex_unlock(&tty_mutex); - devpts_kill_index(inode, index); -err_file: + devpts_kill_index(fsi, index); +out_put_ref: + devpts_put_ref(fsi); +out_free_file: tty_free_file(filp); return retval; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 655f21f99160..0af8e7d70d27 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ retry: return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index e0ee0b3000b2..358a4db72a27 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,38 +15,24 @@ #include +struct pts_fs_info; + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); -void devpts_kill_index(struct inode *ptmx_inode, int idx); -void devpts_add_ref(struct inode *ptmx_inode); -void devpts_del_ref(struct inode *ptmx_inode); +/* Look up a pts fs info and get a ref to it */ +struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); +void devpts_put_ref(struct pts_fs_info *); + +int devpts_new_index(struct pts_fs_info *); +void devpts_kill_index(struct pts_fs_info *, int); + /* mknod in devpts */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv); +struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); /* get private structure */ void *devpts_get_priv(struct inode *pts_inode); /* unlink */ void devpts_pty_kill(struct inode *inode); -#else - -/* Dummy stubs in the no-pty case */ -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } -static inline void devpts_add_ref(struct inode *ptmx_inode) { } -static inline void devpts_del_ref(struct inode *ptmx_inode) { } -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, - dev_t device, int index, void *priv) -{ - return ERR_PTR(-EINVAL); -} -static inline void *devpts_get_priv(struct inode *pts_inode) -{ - return NULL; -} -static inline void devpts_pty_kill(struct inode *inode) { } - #endif -- cgit v1.2.3-71-gd317 From 1d0fd42fa31d18ba0a3e0dd008c9e93e1cebe451 Mon Sep 17 00:00:00 2001 From: Wei Ni Date: Thu, 3 Mar 2016 17:33:46 +0800 Subject: thermal: consistently use int for trip temp The commit 17e8351a7739 consistently use int for temperature, however it missed a few in trip temperature and thermal_core. In current codes, the trip->temperature used "unsigned long" and zone->temperature used"int", if the temperature is negative value, it will get wrong result when compare temperature with trip temperature. This patch can fix it. Signed-off-by: Wei Ni Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 8 ++++---- include/linux/thermal.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index d4b54653ecf8..f1db49625555 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -688,7 +688,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, { struct thermal_zone_device *tz = to_thermal_zone(dev); int trip, ret; - unsigned long temperature; + int temperature; if (!tz->ops->set_trip_temp) return -EPERM; @@ -696,7 +696,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip)) return -EINVAL; - if (kstrtoul(buf, 10, &temperature)) + if (kstrtoint(buf, 10, &temperature)) return -EINVAL; ret = tz->ops->set_trip_temp(tz, trip, temperature); @@ -899,9 +899,9 @@ emul_temp_store(struct device *dev, struct device_attribute *attr, { struct thermal_zone_device *tz = to_thermal_zone(dev); int ret = 0; - unsigned long temperature; + int temperature; - if (kstrtoul(buf, 10, &temperature)) + if (kstrtoint(buf, 10, &temperature)) return -EINVAL; if (!tz->ops->set_emul_temp) { diff --git a/include/linux/thermal.h b/include/linux/thermal.h index a55d0523f75d..1b8a5a7876ce 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -352,8 +352,8 @@ struct thermal_zone_of_device_ops { struct thermal_trip { struct device_node *np; - unsigned long int temperature; - unsigned long int hysteresis; + int temperature; + int hysteresis; enum thermal_trip_type type; }; -- cgit v1.2.3-71-gd317 From 4bfd2e6e53435a214888fd35e230157a38ffc6a0 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 20 Apr 2016 16:01:16 +0300 Subject: net/mlx4_core: Avoid repeated calls to pci enable/disable Maintain the PCI status and provide wrappers for enabling and disabling the PCI device. Performing the actions more than once without doing its opposite results in warning logs. This occurred when EEH hotplugged the device causing a warning for disabling an already disabled device. Fixes: 2ba5fbd62b25 ('net/mlx4_core: Handle AER flow properly') Signed-off-by: Daniel Jurgens Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 39 +++++++++++++++++++++++++++---- include/linux/mlx4/device.h | 7 ++++++ 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 5d45aa34f6a5..12c77a70abdb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3172,6 +3172,34 @@ static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap return 0; } +static int mlx4_pci_enable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + int err = 0; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->persist->pci_status = MLX4_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); + + return err; +} + +static void mlx4_pci_disable_device(struct mlx4_dev *dev) +{ + struct pci_dev *pdev = dev->persist->pdev; + + mutex_lock(&dev->persist->pci_status_mutex); + if (dev->persist->pci_status == MLX4_PCI_STATUS_ENABLED) { + pci_disable_device(pdev); + dev->persist->pci_status = MLX4_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->persist->pci_status_mutex); +} + static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, int total_vfs, int *nvfs, struct mlx4_priv *priv, int reset_flow) @@ -3582,7 +3610,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); - err = pci_enable_device(pdev); + err = mlx4_pci_enable_device(&priv->dev); if (err) { dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n"); return err; @@ -3715,7 +3743,7 @@ err_release_regions: pci_release_regions(pdev); err_disable_pdev: - pci_disable_device(pdev); + mlx4_pci_disable_device(&priv->dev); pci_set_drvdata(pdev, NULL); return err; } @@ -3775,6 +3803,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) priv->pci_dev_data = id->driver_data; mutex_init(&dev->persist->device_state_mutex); mutex_init(&dev->persist->interface_state_mutex); + mutex_init(&dev->persist->pci_status_mutex); ret = devlink_register(devlink, &pdev->dev); if (ret) @@ -3923,7 +3952,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) } pci_release_regions(pdev); - pci_disable_device(pdev); + mlx4_pci_disable_device(dev); devlink_unregister(devlink); kfree(dev->persist); devlink_free(devlink); @@ -4042,7 +4071,7 @@ static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, if (state == pci_channel_io_perm_failure) return PCI_ERS_RESULT_DISCONNECT; - pci_disable_device(pdev); + mlx4_pci_disable_device(persist->dev); return PCI_ERS_RESULT_NEED_RESET; } @@ -4053,7 +4082,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) int err; mlx4_err(dev, "mlx4_pci_slot_reset was called\n"); - err = pci_enable_device(pdev); + err = mlx4_pci_enable_device(dev); if (err) { mlx4_err(dev, "Can not re-enable device, err=%d\n", err); return PCI_ERS_RESULT_DISCONNECT; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8541a913f6a3..d1f904c8b2cb 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -828,6 +828,11 @@ struct mlx4_vf_dev { u8 n_ports; }; +enum mlx4_pci_status { + MLX4_PCI_STATUS_DISABLED, + MLX4_PCI_STATUS_ENABLED, +}; + struct mlx4_dev_persistent { struct pci_dev *pdev; struct mlx4_dev *dev; @@ -841,6 +846,8 @@ struct mlx4_dev_persistent { u8 state; struct mutex interface_state_mutex; /* protect SW state */ u8 interface_state; + struct mutex pci_status_mutex; /* sync pci state */ + enum mlx4_pci_status pci_status; }; struct mlx4_dev { -- cgit v1.2.3-71-gd317 From 75dd602a5198a6e5f75534db52b6e6fbaabb33d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Mar 2016 11:36:59 +0200 Subject: lockdep: Fix lock_chain::base size lock_chain::base is used to store an index into the chain_hlocks[] array, however that array contains more elements than can be indexed using the u16. Change the lock_chain structure to use a bitfield to encode the data it needs and add BUILD_BUG_ON() assertions to check the fields are wide enough. Also, for DEBUG_LOCKDEP, assert that we don't run out of elements of that array; as that would wreck the collision detectoring. Signed-off-by: Peter Zijlstra (Intel) Cc: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sedat Dilek Cc: Theodore Ts'o Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160330093659.GS3408@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 +++++--- kernel/locking/lockdep.c | 24 +++++++++++++++++++++++- kernel/locking/lockdep_proc.c | 2 ++ 3 files changed, 30 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d026b190c530..d10ef06971b5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -196,9 +196,11 @@ struct lock_list { * We record lock dependency chains, so that we can cache them: */ struct lock_chain { - u8 irq_context; - u8 depth; - u16 base; + /* see BUILD_BUG_ON()s in lookup_chain_cache() */ + unsigned int irq_context : 2, + depth : 6, + base : 24; + /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index beb06f604420..78c1c0ee6dc1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2176,15 +2176,37 @@ cache_hit: chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; + + BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks)); + BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); + BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; } + + if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) + nr_chain_hlocks += chain->depth; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * Important for check_no_collision(). + */ + if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + if (debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; + } +#endif + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dbb61a302548..a0f61effad25 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) { + if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } -- cgit v1.2.3-71-gd317 From 046339eaab26804f52f6604877f5674f70815b26 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 22 Apr 2016 00:33:03 +0300 Subject: net/mlx5e: Device's mtu field is u16 and not int For set/query MTU port firmware commands the MTU field is 16 bits, here I changed all the "int mtu" parameters of the functions wrapping those firmware commands to be u16. Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/port.c | 10 +++++----- include/linux/mlx5/port.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5acf346e048e..99eb1c1a3b7b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -671,8 +671,8 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *rep; - int max_mtu; - int oper_mtu; + u16 max_mtu; + u16 oper_mtu; int err; u8 ib_link_width_oper; u8 vl_hw_cap; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e0adb604f461..2fbbc625f106 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1408,7 +1408,7 @@ static int mlx5e_set_dev_port_mtu(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; - int hw_mtu; + u16 hw_mtu; int err; err = mlx5_set_port_mtu(mdev, MLX5E_SW2HW_MTU(netdev->mtu), 1); @@ -2004,7 +2004,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu) struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; bool was_opened; - int max_mtu; + u16 max_mtu; int err = 0; mlx5_query_port_max_mtu(mdev, &max_mtu, 1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index ae378c575deb..53cc1e2c693b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -247,8 +247,8 @@ int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status); -static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu, - int *max_mtu, int *oper_mtu, u8 port) +static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu, + u16 *max_mtu, u16 *oper_mtu, u8 port) { u32 in[MLX5_ST_SZ_DW(pmtu_reg)]; u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; @@ -268,7 +268,7 @@ static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu, *admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu); } -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port) +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port) { u32 in[MLX5_ST_SZ_DW(pmtu_reg)]; u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; @@ -283,14 +283,14 @@ int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port) } EXPORT_SYMBOL_GPL(mlx5_set_port_mtu); -void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port) { mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port); } EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu); -void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, u8 port) { mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port); diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index a1d145abd4eb..b30250ab7604 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -54,9 +54,9 @@ int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status *status); -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); -void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); -void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port); +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port); +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, u8 port); int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, -- cgit v1.2.3-71-gd317 From cd255efff9baadd654d6160e52d17ae7c568c9d3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 22 Apr 2016 00:33:05 +0300 Subject: net/mlx5e: Use vport MTU rather than physical port MTU Set and report vport MTU rather than physical MTU, Driver will set both vport and physical port mtu and will rely on the query of vport mtu. SRIOV VFs have to report their MTU to their vport manager (PF), and this will allow them to work with any MTU they need without failing the request. Also for some cases where the PF is not a port owner, PF can work with MTU less than the physical port mtu if set physical port mtu didn't take effect. Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 44 ++++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 40 +++++++++++++++++++++ include/linux/mlx5/vport.h | 2 ++ 3 files changed, 77 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 93e4ef43fa9f..85773f8573e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1404,24 +1404,50 @@ static int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5e_priv *priv) return 0; } -static int mlx5e_set_dev_port_mtu(struct net_device *netdev) +static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu) { - struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; - u16 hw_mtu; + u16 hw_mtu = MLX5E_SW2HW_MTU(mtu); int err; - err = mlx5_set_port_mtu(mdev, MLX5E_SW2HW_MTU(netdev->mtu), 1); + err = mlx5_set_port_mtu(mdev, hw_mtu, 1); if (err) return err; - mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1); + /* Update vport context MTU */ + mlx5_modify_nic_vport_mtu(mdev, hw_mtu); + return 0; +} + +static void mlx5e_query_mtu(struct mlx5e_priv *priv, u16 *mtu) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u16 hw_mtu = 0; + int err; + + err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu); + if (err || !hw_mtu) /* fallback to port oper mtu */ + mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1); + + *mtu = MLX5E_HW2SW_MTU(hw_mtu); +} + +static int mlx5e_set_dev_port_mtu(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + u16 mtu; + int err; + + err = mlx5e_set_mtu(priv, netdev->mtu); + if (err) + return err; - if (MLX5E_HW2SW_MTU(hw_mtu) != netdev->mtu) - netdev_warn(netdev, "%s: Port MTU %d is different than netdev mtu %d\n", - __func__, MLX5E_HW2SW_MTU(hw_mtu), netdev->mtu); + mlx5e_query_mtu(priv, &mtu); + if (mtu != netdev->mtu) + netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n", + __func__, mtu, netdev->mtu); - netdev->mtu = MLX5E_HW2SW_MTU(hw_mtu); + netdev->mtu = mtu; return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index bd518405859e..b69dadcfb897 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -196,6 +196,46 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address); +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int err; + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (!err) + *mtu = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.mtu); + + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mtu); + +int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mtu); + int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, u32 vport, enum mlx5_list_type list_type, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index bd93e6323603..301da4a5e6bf 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,8 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); +int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); -- cgit v1.2.3-71-gd317 From 5fc7197d3a256d9c5de3134870304b24892a4908 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Fri, 22 Apr 2016 00:33:07 +0300 Subject: net/mlx5: Add pci shutdown callback This patch introduces kexec support for mlx5. When switching kernels, kexec() calls shutdown, which unloads the driver and cleans its resources. In addition, remove unregister netdev from shutdown flow. This will allow a clean shutdown, even if some netdev clients did not release their reference from this netdev. Releasing The HW resources only is enough as the kernel is shutting down Signed-off-by: Majd Dibbiny Signed-off-by: Tariq Toukan Signed-off-by: Haggai Abramovsky Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 +++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 23 +++++++++++++++++++---- include/linux/mlx5/driver.h | 7 ++++--- 3 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 85773f8573e5..67d548b70e14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2633,7 +2633,16 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) schedule_work(&priv->set_rx_mode_work); mlx5e_disable_async_events(priv); flush_scheduled_work(); - unregister_netdev(netdev); + if (test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) { + netif_device_detach(netdev); + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + mlx5e_close_locked(netdev); + mutex_unlock(&priv->state_lock); + } else { + unregister_netdev(netdev); + } + mlx5e_tc_cleanup(priv); mlx5e_vxlan_cleanup(priv); mlx5e_destroy_flow_tables(priv); @@ -2646,7 +2655,9 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); - free_netdev(netdev); + + if (!test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) + free_netdev(netdev); } static void *mlx5e_get_netdev(void *vpriv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ddd352ae3dbd..6892746fd10d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -966,7 +966,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) int err; mutex_lock(&dev->intf_state_mutex); - if (dev->interface_state == MLX5_INTERFACE_STATE_UP) { + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n", __func__); goto out; @@ -1133,7 +1133,8 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) if (err) pr_info("failed request module on %s\n", MLX5_IB_MOD); - dev->interface_state = MLX5_INTERFACE_STATE_UP; + clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); out: mutex_unlock(&dev->intf_state_mutex); @@ -1207,7 +1208,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) } mutex_lock(&dev->intf_state_mutex); - if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) { + if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", __func__); goto out; @@ -1241,7 +1242,8 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_cmd_cleanup(dev); out: - dev->interface_state = MLX5_INTERFACE_STATE_DOWN; + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); mutex_unlock(&dev->intf_state_mutex); return err; } @@ -1452,6 +1454,18 @@ static const struct pci_error_handlers mlx5_err_handler = { .resume = mlx5_pci_resume }; +static void shutdown(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_priv *priv = &dev->priv; + + dev_info(&pdev->dev, "Shutdown was called\n"); + /* Notify mlx5 clients that the kernel is being shut down */ + set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state); + mlx5_unload_one(dev, priv); + mlx5_pci_disable_device(dev); +} + static const struct pci_device_id mlx5_core_pci_table[] = { { PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */ { PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF}, /* Connect-IB VF */ @@ -1471,6 +1485,7 @@ static struct pci_driver mlx5_core_driver = { .id_table = mlx5_core_pci_table, .probe = init_one, .remove = remove_one, + .shutdown = shutdown, .err_handler = &mlx5_err_handler, .sriov_configure = mlx5_core_sriov_configure, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dcd5ac8d3b14..369c837d40f5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -519,8 +519,9 @@ enum mlx5_device_state { }; enum mlx5_interface_state { - MLX5_INTERFACE_STATE_DOWN, - MLX5_INTERFACE_STATE_UP, + MLX5_INTERFACE_STATE_DOWN = BIT(0), + MLX5_INTERFACE_STATE_UP = BIT(1), + MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2), }; enum mlx5_pci_status { @@ -544,7 +545,7 @@ struct mlx5_core_dev { enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; - enum mlx5_interface_state interface_state; + unsigned long intf_state; void (*event) (struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); -- cgit v1.2.3-71-gd317 From 6c1ea260f89709e0021d2c59f8fd2a104b5b1123 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 11 Apr 2016 19:34:49 +0200 Subject: libceph: make authorizer destruction independent of ceph_auth_client Starting the kernel client with cephx disabled and then enabling cephx and restarting userspace daemons can result in a crash: [262671.478162] BUG: unable to handle kernel paging request at ffffebe000000000 [262671.531460] IP: [] kfree+0x5a/0x130 [262671.584334] PGD 0 [262671.635847] Oops: 0000 [#1] SMP [262672.055841] CPU: 22 PID: 2961272 Comm: kworker/22:2 Not tainted 4.2.0-34-generic #39~14.04.1-Ubuntu [262672.162338] Hardware name: Dell Inc. PowerEdge R720/068CDY, BIOS 2.4.3 07/09/2014 [262672.268937] Workqueue: ceph-msgr con_work [libceph] [262672.322290] task: ffff88081c2d0dc0 ti: ffff880149ae8000 task.ti: ffff880149ae8000 [262672.428330] RIP: 0010:[] [] kfree+0x5a/0x130 [262672.535880] RSP: 0018:ffff880149aeba58 EFLAGS: 00010286 [262672.589486] RAX: 000001e000000000 RBX: 0000000000000012 RCX: ffff8807e7461018 [262672.695980] RDX: 000077ff80000000 RSI: ffff88081af2be04 RDI: 0000000000000012 [262672.803668] RBP: ffff880149aeba78 R08: 0000000000000000 R09: 0000000000000000 [262672.912299] R10: ffffebe000000000 R11: ffff880819a60e78 R12: ffff8800aec8df40 [262673.021769] R13: ffffffffc035f70f R14: ffff8807e5b138e0 R15: ffff880da9785840 [262673.131722] FS: 0000000000000000(0000) GS:ffff88081fac0000(0000) knlGS:0000000000000000 [262673.245377] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [262673.303281] CR2: ffffebe000000000 CR3: 0000000001c0d000 CR4: 00000000001406e0 [262673.417556] Stack: [262673.472943] ffff880149aeba88 ffff88081af2be04 ffff8800aec8df40 ffff88081af2be04 [262673.583767] ffff880149aeba98 ffffffffc035f70f ffff880149aebac8 ffff8800aec8df00 [262673.694546] ffff880149aebac8 ffffffffc035c89e ffff8807e5b138e0 ffff8805b047f800 [262673.805230] Call Trace: [262673.859116] [] ceph_x_destroy_authorizer+0x1f/0x50 [libceph] [262673.968705] [] ceph_auth_destroy_authorizer+0x3e/0x60 [libceph] [262674.078852] [] put_osd+0x45/0x80 [libceph] [262674.134249] [] remove_osd+0xae/0x140 [libceph] [262674.189124] [] __reset_osd+0x103/0x150 [libceph] [262674.243749] [] kick_requests+0x223/0x460 [libceph] [262674.297485] [] ceph_osdc_handle_map+0x282/0x5e0 [libceph] [262674.350813] [] dispatch+0x4e/0x720 [libceph] [262674.403312] [] try_read+0x3d1/0x1090 [libceph] [262674.454712] [] ? dequeue_entity+0x152/0x690 [262674.505096] [] con_work+0xcb/0x1300 [libceph] [262674.555104] [] process_one_work+0x14e/0x3d0 [262674.604072] [] worker_thread+0x11a/0x470 [262674.652187] [] ? rescuer_thread+0x310/0x310 [262674.699022] [] kthread+0xd2/0xf0 [262674.744494] [] ? kthread_create_on_node+0x1c0/0x1c0 [262674.789543] [] ret_from_fork+0x3f/0x70 [262674.834094] [] ? kthread_create_on_node+0x1c0/0x1c0 What happens is the following: (1) new MON session is established (2) old "none" ac is destroyed (3) new "cephx" ac is constructed ... (4) old OSD session (w/ "none" authorizer) is put ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer) osd->o_auth.authorizer in the "none" case is just a bare pointer into ac, which contains a single static copy for all services. By the time we get to (4), "none" ac, freed in (2), is long gone. On top of that, a new vtable installed in (3) points us at ceph_x_destroy_authorizer(), so we end up trying to destroy a "none" authorizer with a "cephx" destructor operating on invalid memory! To fix this, decouple authorizer destruction from ac and do away with a single static "none" authorizer by making a copy for each OSD or MDS session. Authorizers themselves are independent of ac and so there is no reason for destroy_authorizer() to be an ac op. Make it an op on the authorizer itself by turning ceph_authorizer into a real struct. Fixes: http://tracker.ceph.com/issues/15447 Reported-by: Alan Zhang Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 6 ++-- include/linux/ceph/auth.h | 10 +++--- include/linux/ceph/osd_client.h | 1 - net/ceph/auth.c | 8 ++--- net/ceph/auth_none.c | 71 ++++++++++++++++++++++------------------- net/ceph/auth_none.h | 3 +- net/ceph/auth_x.c | 21 ++++++------ net/ceph/auth_x.h | 1 + net/ceph/osd_client.c | 6 ++-- 9 files changed, 62 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 541ead4d8965..85b8517f17a0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -386,9 +386,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - ceph_auth_destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer(s->s_auth.authorizer); kfree(s); } } @@ -3900,7 +3898,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(auth->authorizer); auth->authorizer = NULL; } if (!auth->authorizer) { diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 260d78b587c4..1563265d2097 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -12,9 +12,12 @@ */ struct ceph_auth_client; -struct ceph_authorizer; struct ceph_msg; +struct ceph_authorizer { + void (*destroy)(struct ceph_authorizer *); +}; + struct ceph_auth_handshake { struct ceph_authorizer *authorizer; void *authorizer_buf; @@ -62,8 +65,6 @@ struct ceph_auth_client_ops { struct ceph_auth_handshake *auth); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len); - void (*destroy_authorizer)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -112,8 +113,7 @@ extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); -extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *a); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4343df806710..cbf460927c42 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -16,7 +16,6 @@ struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; -struct ceph_authorizer; /* * completion callback for async writepages diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 6b923bcaa2a4..2bc5965fdd1e 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,13 +293,9 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac, } EXPORT_SYMBOL(ceph_auth_create_authorizer); -void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, a); - mutex_unlock(&ac->mutex); + a->destroy(a); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 8c93fa8d81bc..5f836f02ae36 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -16,7 +16,6 @@ static void reset(struct ceph_auth_client *ac) struct ceph_auth_none_info *xi = ac->private; xi->starting = true; - xi->built_authorizer = false; } static void destroy(struct ceph_auth_client *ac) @@ -39,6 +38,27 @@ static int should_authenticate(struct ceph_auth_client *ac) return xi->starting; } +static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac, + struct ceph_none_authorizer *au) +{ + void *p = au->buf; + void *const end = p + sizeof(au->buf); + int ret; + + ceph_encode_8_safe(&p, end, 1, e_range); + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + au->buf_len = p - (void *)au->buf; + dout("%s built authorizer len %d\n", __func__, au->buf_len); + return 0; + +e_range: + return -ERANGE; +} + static int build_request(struct ceph_auth_client *ac, void *buf, void *end) { return 0; @@ -57,32 +77,32 @@ static int handle_reply(struct ceph_auth_client *ac, int result, return result; } +static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a) +{ + kfree(a); +} + /* - * build an 'authorizer' with our entity_name and global_id. we can - * reuse a single static copy since it is identical for all services - * we connect to. + * build an 'authorizer' with our entity_name and global_id. it is + * identical for all services we connect to. */ static int ceph_auth_none_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) { - struct ceph_auth_none_info *ai = ac->private; - struct ceph_none_authorizer *au = &ai->au; - void *p, *end; + struct ceph_none_authorizer *au; int ret; - if (!ai->built_authorizer) { - p = au->buf; - end = p + sizeof(au->buf); - ceph_encode_8(&p, 1); - ret = ceph_entity_name_encode(ac->name, &p, end - 8); - if (ret < 0) - goto bad; - ceph_decode_need(&p, end, sizeof(u64), bad2); - ceph_encode_64(&p, ac->global_id); - au->buf_len = p - (void *)au->buf; - ai->built_authorizer = true; - dout("built authorizer len %d\n", au->buf_len); + au = kmalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + au->base.destroy = ceph_auth_none_destroy_authorizer; + + ret = ceph_auth_none_build_authorizer(ac, au); + if (ret) { + kfree(au); + return ret; } auth->authorizer = (struct ceph_authorizer *) au; @@ -92,17 +112,6 @@ static int ceph_auth_none_create_authorizer( auth->authorizer_reply_buf_len = sizeof (au->reply_buf); return 0; - -bad2: - ret = -ERANGE; -bad: - return ret; -} - -static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - /* nothing to do */ } static const struct ceph_auth_client_ops ceph_auth_none_ops = { @@ -114,7 +123,6 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = { .build_request = build_request, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, - .destroy_authorizer = ceph_auth_none_destroy_authorizer, }; int ceph_auth_none_init(struct ceph_auth_client *ac) @@ -127,7 +135,6 @@ int ceph_auth_none_init(struct ceph_auth_client *ac) return -ENOMEM; xi->starting = true; - xi->built_authorizer = false; ac->protocol = CEPH_AUTH_NONE; ac->private = xi; diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index 059a3ce4b53f..62021535ae4a 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -12,6 +12,7 @@ */ struct ceph_none_authorizer { + struct ceph_authorizer base; char buf[128]; int buf_len; char reply_buf[0]; @@ -19,8 +20,6 @@ struct ceph_none_authorizer { struct ceph_auth_none_info { bool starting; - bool built_authorizer; - struct ceph_none_authorizer au; /* we only need one; it's static */ }; int ceph_auth_none_init(struct ceph_auth_client *ac); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 9e43a315e662..a0905f04bd13 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -565,6 +565,14 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } +static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_x_authorizer_cleanup(au); + kfree(au); +} + static int ceph_x_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -581,6 +589,8 @@ static int ceph_x_create_authorizer( if (!au) return -ENOMEM; + au->base.destroy = ceph_x_destroy_authorizer; + ret = ceph_x_build_authorizer(ac, th, au); if (ret) { kfree(au); @@ -643,16 +653,6 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, return ret; } -static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - struct ceph_x_authorizer *au = (void *)a; - - ceph_x_authorizer_cleanup(au); - kfree(au); -} - - static void ceph_x_reset(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; @@ -770,7 +770,6 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .create_authorizer = ceph_x_create_authorizer, .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, - .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, .reset = ceph_x_reset, .destroy = ceph_x_destroy, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index 40b1a3cf7397..21a5af904bae 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -26,6 +26,7 @@ struct ceph_x_ticket_handler { struct ceph_x_authorizer { + struct ceph_authorizer base; struct ceph_crypto_key session_key; struct ceph_buffer *buf; unsigned int service; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 32355d9d0103..40a53a70efdf 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1087,10 +1087,8 @@ static void put_osd(struct ceph_osd *osd) dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); if (atomic_dec_and_test(&osd->o_ref)) { - struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_auth.authorizer) - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(osd->o_auth.authorizer); kfree(osd); } } @@ -2984,7 +2982,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(auth->authorizer); auth->authorizer = NULL; } if (!auth->authorizer) { -- cgit v1.2.3-71-gd317 From 841645b5f2dfceac69b78fcd0c9050868d41ea61 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 25 Apr 2016 15:33:55 -0400 Subject: ipv6: Revert optional address flusing on ifdown. This reverts the following three commits: 70af921db6f8835f4b11c65731116560adb00c14 799977d9aafbf0ca0b9c39b04cbfb16db71302c9 f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac The feature was ill conceived, has terrible semantics, and has added nothing but regressions to the already fragile ipv6 stack. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 -- include/linux/ipv6.h | 1 - include/uapi/linux/ipv6.h | 1 - net/ipv6/addrconf.c | 162 +++------------------------------ 4 files changed, 12 insertions(+), 161 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b183e2b606c8..e0ac25210f7c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1568,15 +1568,6 @@ temp_prefered_lft - INTEGER Preferred lifetime (in seconds) for temporary addresses. Default: 86400 (1 day) -keep_addr_on_down - INTEGER - Keep all IPv6 addresses on an interface down event. If set static - global addresses with no expiration time are not flushed. - >0 : enabled - 0 : system default - <0 : disabled - - Default: 0 (addresses are removed) - max_desync_factor - INTEGER Maximum value for DESYNC_FACTOR, which is a random value that ensures that clients don't synchronize with each diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7edc14fb66b6..4b2267e1b7c3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -62,7 +62,6 @@ struct ipv6_devconf { struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; - __s32 keep_addr_on_down; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 395876060f50..ec117b65d5a5 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -176,7 +176,6 @@ enum { DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_DROP_UNSOLICITED_NA, - DEVCONF_KEEP_ADDR_ON_DOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 23cec53b568a..d77ba395d593 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -216,7 +216,6 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, - .keep_addr_on_down = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -261,7 +260,6 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, - .keep_addr_on_down = 0, }; /* Check if a valid qdisc is available */ @@ -3176,81 +3174,6 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) -/* If the host route is cached on the addr struct make sure it is associated - * with the proper table. e.g., enslavement can change and if so the cached - * host route needs to move to the new table. - */ -static void l3mdev_check_host_rt(struct inet6_dev *idev, - struct inet6_ifaddr *ifp) -{ - if (ifp->rt) { - u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; - - if (tb_id != ifp->rt->rt6i_table->tb6_id) { - ip6_del_rt(ifp->rt); - ifp->rt = NULL; - } - } -} -#else -static void l3mdev_check_host_rt(struct inet6_dev *idev, - struct inet6_ifaddr *ifp) -{ -} -#endif - -static int fixup_permanent_addr(struct inet6_dev *idev, - struct inet6_ifaddr *ifp) -{ - l3mdev_check_host_rt(idev, ifp); - - if (!ifp->rt) { - struct rt6_info *rt; - - rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) - return PTR_ERR(rt); - - ifp->rt = rt; - } - - if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0); - } - - addrconf_dad_start(ifp); - - return 0; -} - -static void addrconf_permanent_addr(struct net_device *dev) -{ - struct inet6_ifaddr *ifp, *tmp; - struct inet6_dev *idev; - - idev = __in6_dev_get(dev); - if (!idev) - return; - - write_lock_bh(&idev->lock); - - list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { - if ((ifp->flags & IFA_F_PERMANENT) && - fixup_permanent_addr(idev, ifp) < 0) { - write_unlock_bh(&idev->lock); - ipv6_del_addr(ifp); - write_lock_bh(&idev->lock); - - net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", - idev->dev->name, &ifp->addr); - } - } - - write_unlock_bh(&idev->lock); -} - static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -3337,9 +3260,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); - switch (dev->type) { #if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: @@ -3448,20 +3368,11 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event) ipv6_mc_unmap(idev); } -static bool addr_is_local(const struct in6_addr *addr) -{ - return ipv6_addr_type(addr) & - (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); -} - static int addrconf_ifdown(struct net_device *dev, int how) { struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa, *tmp; - struct list_head del_list; - int _keep_addr; - bool keep_addr; + struct inet6_ifaddr *ifa; int state, i; ASSERT_RTNL(); @@ -3488,16 +3399,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) } - /* aggregate the system setting and interface setting */ - _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; - if (!_keep_addr) - _keep_addr = idev->cnf.keep_addr_on_down; - - /* combine the user config with event to determine if permanent - * addresses are to be removed from address hash table - */ - keep_addr = !(how || _keep_addr <= 0); - /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; @@ -3506,16 +3407,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { + hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_dad_work(ifa); - /* combined flag + permanent flag decide if - * address is retained on a down event - */ - if (!keep_addr || - !(ifa->flags & IFA_F_PERMANENT) || - addr_is_local(&ifa->addr)) { - hlist_del_init_rcu(&ifa->addr_lst); - goto restart; - } + goto restart; } } spin_unlock_bh(&addrconf_hash_lock); @@ -3549,54 +3443,31 @@ restart: write_lock_bh(&idev->lock); } - /* re-combine the user config with event to determine if permanent - * addresses are to be removed from the interface list - */ - keep_addr = (!how && _keep_addr > 0); - - INIT_LIST_HEAD(&del_list); - list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + while (!list_empty(&idev->addr_list)) { + ifa = list_first_entry(&idev->addr_list, + struct inet6_ifaddr, if_list); addrconf_del_dad_work(ifa); - write_unlock_bh(&idev->lock); - spin_lock_bh(&ifa->lock); - - if (keep_addr && (ifa->flags & IFA_F_PERMANENT) && - !addr_is_local(&ifa->addr)) { - /* set state to skip the notifier below */ - state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; - if (!(ifa->flags & IFA_F_NODAD)) - ifa->flags |= IFA_F_TENTATIVE; - } else { - state = ifa->state; - ifa->state = INET6_IFADDR_STATE_DEAD; + list_del(&ifa->if_list); - list_del(&ifa->if_list); - list_add(&ifa->if_list, &del_list); - } + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; spin_unlock_bh(&ifa->lock); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } + in6_ifa_put(ifa); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); - /* now clean up addresses to be removed */ - while (!list_empty(&del_list)) { - ifa = list_first_entry(&del_list, - struct inet6_ifaddr, if_list); - list_del(&ifa->if_list); - - in6_ifa_put(ifa); - } - /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -4861,7 +4732,6 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; - array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; } static inline size_t inet6_ifla6_size(void) @@ -5949,14 +5819,6 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "keep_addr_on_down", - .data = &ipv6_devconf.keep_addr_on_down, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - - }, { /* sentinel */ } -- cgit v1.2.3-71-gd317 From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:06:48 -0400 Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset kicks off asynchronous NUMA node migration if necessary during task migration and flushes it from cpuset_post_attach_flush() which is called at the end of __cgroup_procs_write(). This is to avoid performing migration with cgroup_threadgroup_rwsem write-locked which can lead to deadlock through dependency on kworker creation. memcg has a similar issue with charge moving, so let's convert it to an official callback rather than the current one-off cpuset specific function. This patch adds cgroup_subsys->post_attach callback and makes cpuset register cpuset_post_attach_flush() as its ->post_attach. The conversion is mostly one-to-one except that the new callback is called under cgroup_mutex. This is to guarantee that no other migration operations are started before ->post_attach callbacks are finished. cgroup_mutex is one of the outermost mutex in the system and has never been and shouldn't be a problem. We can add specialized synchronization around __cgroup_procs_write() but I don't think there's any noticeable benefit. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: # 4.4+ prerequisite for the next patch --- include/linux/cgroup-defs.h | 1 + include/linux/cpuset.h | 6 ------ kernel/cgroup.c | 7 +++++-- kernel/cpuset.c | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3e39ae5bc799..5b17de62c962 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -444,6 +444,7 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); + void (*post_attach)(void); int (*can_fork)(struct task_struct *task); void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index fea160ee5803..85a868ccb493 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern void cpuset_post_attach_flush(void); - #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline void cpuset_post_attach_flush(void) -{ -} - #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671dc05c0b0f..909a7d31ffd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2875,8 +2876,10 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00ab5c2b7c5b..1902956baba1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = true, -- cgit v1.2.3-71-gd317 From 6a923934c33c750a595868af6bef5f1a1fa90054 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 Apr 2016 11:47:41 -0400 Subject: Revert "ipv6: Revert optional address flusing on ifdown." This reverts commit 841645b5f2dfceac69b78fcd0c9050868d41ea61. Ok, this puts the feature back. I've decided to apply David A.'s bug fix and run with that rather than make everyone wait another whole release for this feature. Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 ++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 162 ++++++++++++++++++++++++++++++--- 4 files changed, 161 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e0ac25210f7c..b183e2b606c8 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1568,6 +1568,15 @@ temp_prefered_lft - INTEGER Preferred lifetime (in seconds) for temporary addresses. Default: 86400 (1 day) +keep_addr_on_down - INTEGER + Keep all IPv6 addresses on an interface down event. If set static + global addresses with no expiration time are not flushed. + >0 : enabled + 0 : system default + <0 : disabled + + Default: 0 (addresses are removed) + max_desync_factor - INTEGER Maximum value for DESYNC_FACTOR, which is a random value that ensures that clients don't synchronize with each diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4b2267e1b7c3..7edc14fb66b6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -62,6 +62,7 @@ struct ipv6_devconf { struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; + __s32 keep_addr_on_down; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index ec117b65d5a5..395876060f50 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -176,6 +176,7 @@ enum { DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_DROP_UNSOLICITED_NA, + DEVCONF_KEEP_ADDR_ON_DOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d77ba395d593..23cec53b568a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { }, .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, + .keep_addr_on_down = 0, }; /* Check if a valid qdisc is available */ @@ -3174,6 +3176,81 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) +/* If the host route is cached on the addr struct make sure it is associated + * with the proper table. e.g., enslavement can change and if so the cached + * host route needs to move to the new table. + */ +static void l3mdev_check_host_rt(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + if (ifp->rt) { + u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; + + if (tb_id != ifp->rt->rt6i_table->tb6_id) { + ip6_del_rt(ifp->rt); + ifp->rt = NULL; + } + } +} +#else +static void l3mdev_check_host_rt(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ +} +#endif + +static int fixup_permanent_addr(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + l3mdev_check_host_rt(idev, ifp); + + if (!ifp->rt) { + struct rt6_info *rt; + + rt = addrconf_dst_alloc(idev, &ifp->addr, false); + if (unlikely(IS_ERR(rt))) + return PTR_ERR(rt); + + ifp->rt = rt; + } + + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + idev->dev, 0, 0); + } + + addrconf_dad_start(ifp); + + return 0; +} + +static void addrconf_permanent_addr(struct net_device *dev) +{ + struct inet6_ifaddr *ifp, *tmp; + struct inet6_dev *idev; + + idev = __in6_dev_get(dev); + if (!idev) + return; + + write_lock_bh(&idev->lock); + + list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { + if ((ifp->flags & IFA_F_PERMANENT) && + fixup_permanent_addr(idev, ifp) < 0) { + write_unlock_bh(&idev->lock); + ipv6_del_addr(ifp); + write_lock_bh(&idev->lock); + + net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n", + idev->dev->name, &ifp->addr); + } + } + + write_unlock_bh(&idev->lock); +} + static int addrconf_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -3260,6 +3337,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + switch (dev->type) { #if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: @@ -3368,11 +3448,20 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event) ipv6_mc_unmap(idev); } +static bool addr_is_local(const struct in6_addr *addr) +{ + return ipv6_addr_type(addr) & + (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); +} + static int addrconf_ifdown(struct net_device *dev, int how) { struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + struct inet6_ifaddr *ifa, *tmp; + struct list_head del_list; + int _keep_addr; + bool keep_addr; int state, i; ASSERT_RTNL(); @@ -3399,6 +3488,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) } + /* aggregate the system setting and interface setting */ + _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + if (!_keep_addr) + _keep_addr = idev->cnf.keep_addr_on_down; + + /* combine the user config with event to determine if permanent + * addresses are to be removed from address hash table + */ + keep_addr = !(how || _keep_addr <= 0); + /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { struct hlist_head *h = &inet6_addr_lst[i]; @@ -3407,9 +3506,16 @@ static int addrconf_ifdown(struct net_device *dev, int how) restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - hlist_del_init_rcu(&ifa->addr_lst); addrconf_del_dad_work(ifa); - goto restart; + /* combined flag + permanent flag decide if + * address is retained on a down event + */ + if (!keep_addr || + !(ifa->flags & IFA_F_PERMANENT) || + addr_is_local(&ifa->addr)) { + hlist_del_init_rcu(&ifa->addr_lst); + goto restart; + } } } spin_unlock_bh(&addrconf_hash_lock); @@ -3443,31 +3549,54 @@ restart: write_lock_bh(&idev->lock); } - while (!list_empty(&idev->addr_list)) { - ifa = list_first_entry(&idev->addr_list, - struct inet6_ifaddr, if_list); - addrconf_del_dad_work(ifa); + /* re-combine the user config with event to determine if permanent + * addresses are to be removed from the interface list + */ + keep_addr = (!how && _keep_addr > 0); - list_del(&ifa->if_list); + INIT_LIST_HEAD(&del_list); + list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + addrconf_del_dad_work(ifa); write_unlock_bh(&idev->lock); - spin_lock_bh(&ifa->lock); - state = ifa->state; - ifa->state = INET6_IFADDR_STATE_DEAD; + + if (keep_addr && (ifa->flags & IFA_F_PERMANENT) && + !addr_is_local(&ifa->addr)) { + /* set state to skip the notifier below */ + state = INET6_IFADDR_STATE_DEAD; + ifa->state = 0; + if (!(ifa->flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; + } else { + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + + list_del(&ifa->if_list); + list_add(&ifa->if_list, &del_list); + } + spin_unlock_bh(&ifa->lock); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); } - in6_ifa_put(ifa); write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); + /* now clean up addresses to be removed */ + while (!list_empty(&del_list)) { + ifa = list_first_entry(&del_list, + struct inet6_ifaddr, if_list); + list_del(&ifa->if_list); + + in6_ifa_put(ifa); + } + /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -4732,6 +4861,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; + array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; } static inline size_t inet6_ifla6_size(void) @@ -5819,6 +5949,14 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "keep_addr_on_down", + .data = &ipv6_devconf.keep_addr_on_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, { /* sentinel */ } -- cgit v1.2.3-71-gd317 From 8ead9dd54716d1e05e129959f702fcc1786f82b4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Apr 2016 20:04:08 -0700 Subject: devpts: more pty driver interface cleanups This is more prep-work for the upcoming pty changes. Still just code cleanup with no actual semantic changes. This removes a bunch pointless complexity by just having the slave pty side remember the dentry associated with the devpts slave rather than the inode. That allows us to remove all the "look up the dentry" code for when we want to remove it again. Together with moving the tty pointer from "inode->i_private" to "dentry->d_fsdata" and getting rid of pointless inode locking, this removes about 30 lines of code. Not only is the end result smaller, it's simpler and easier to understand. The old code, for example, depended on the d_find_alias() to not just find the dentry, but also to check that it is still hashed, which in turn validated the tty pointer in the inode. That is a _very_ roundabout way to say "invalidate the cached tty pointer when the dentry is removed". The new code just does dentry->d_fsdata = NULL; in devpts_pty_kill() instead, invalidating the tty pointer rather more directly and obviously. Don't do something complex and subtle when the obvious straightforward approach will do. The rest of the patch (ie apart from code deletion and the above tty pointer clearing) is just switching the calling convention to pass the dentry or file pointer around instead of the inode. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 18 +++++++--------- drivers/tty/tty_io.c | 6 +++--- fs/devpts/inode.c | 53 +++++++++++----------------------------------- include/linux/devpts_fs.h | 6 +++--- include/linux/tty_driver.h | 4 ++-- 5 files changed, 28 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 0058d9fbf931..cf0dc51a2690 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -626,7 +626,7 @@ static int pty_unix98_ioctl(struct tty_struct *tty, */ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, - struct inode *ptm_inode, int idx) + struct file *file, int idx) { /* Master must be open via /dev/ptmx */ return ERR_PTR(-EIO); @@ -642,12 +642,12 @@ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, */ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, - struct inode *pts_inode, int idx) + struct file *file, int idx) { struct tty_struct *tty; mutex_lock(&devpts_mutex); - tty = devpts_get_priv(pts_inode); + tty = devpts_get_priv(file->f_path.dentry); mutex_unlock(&devpts_mutex); /* Master must be open before slave */ if (!tty) @@ -722,7 +722,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; - struct inode *slave_inode; + struct dentry *dentry; int retval; int index; @@ -769,14 +769,12 @@ static int ptmx_open(struct inode *inode, struct file *filp) tty_add_file(tty, filp); - slave_inode = devpts_pty_new(fsi, - MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index, - tty->link); - if (IS_ERR(slave_inode)) { - retval = PTR_ERR(slave_inode); + dentry = devpts_pty_new(fsi, index, tty->link); + if (IS_ERR(dentry)) { + retval = PTR_ERR(dentry); goto err_release; } - tty->link->driver_data = slave_inode; + tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 9b04d72e752e..24d5491ef0da 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1367,12 +1367,12 @@ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, - struct inode *inode, int idx) + struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) - tty = driver->ops->lookup(driver, inode, idx); + tty = driver->ops->lookup(driver, file, idx); else tty = driver->ttys[idx]; @@ -2040,7 +2040,7 @@ static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, } /* check whether we're reopening an existing tty */ - tty = tty_driver_lookup_tty(driver, inode, index); + tty = tty_driver_lookup_tty(driver, filp, index); if (IS_ERR(tty)) { mutex_unlock(&tty_mutex); goto out; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0af8e7d70d27..0b2954d7172d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -604,8 +604,7 @@ void devpts_put_ref(struct pts_fs_info *fsi) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, - void *priv) +struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; struct super_block *sb; @@ -629,25 +628,21 @@ struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - init_special_inode(inode, S_IFCHR|opts->mode, device); - inode->i_private = priv; + init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); - inode_lock(d_inode(root)); - dentry = d_alloc_name(root, s); if (dentry) { + dentry->d_fsdata = priv; d_add(dentry, inode); fsnotify_create(d_inode(root), dentry); } else { iput(inode); - inode = ERR_PTR(-ENOMEM); + dentry = ERR_PTR(-ENOMEM); } - inode_unlock(d_inode(root)); - - return inode; + return dentry; } /** @@ -656,24 +651,10 @@ struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ -void *devpts_get_priv(struct inode *pts_inode) +void *devpts_get_priv(struct dentry *dentry) { - struct dentry *dentry; - void *priv = NULL; - - BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - - /* Ensure dentry has not been deleted by devpts_pty_kill() */ - dentry = d_find_alias(pts_inode); - if (!dentry) - return NULL; - - if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - priv = pts_inode->i_private; - - dput(dentry); - - return priv; + WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); + return dentry->d_fsdata; } /** @@ -682,24 +663,14 @@ void *devpts_get_priv(struct inode *pts_inode) * * This is an inverse operation of devpts_pty_new. */ -void devpts_pty_kill(struct inode *inode) +void devpts_pty_kill(struct dentry *dentry) { - struct super_block *sb = pts_sb_from_inode(inode); - struct dentry *root = sb->s_root; - struct dentry *dentry; + WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); - BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - - inode_lock(d_inode(root)); - - dentry = d_find_alias(inode); - - drop_nlink(inode); + dentry->d_fsdata = NULL; + drop_nlink(dentry->d_inode); d_delete(dentry); dput(dentry); /* d_alloc_name() in devpts_pty_new() */ - dput(dentry); /* d_find_alias above */ - - inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 358a4db72a27..5871f292b596 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -27,11 +27,11 @@ int devpts_new_index(struct pts_fs_info *); void devpts_kill_index(struct pts_fs_info *, int); /* mknod in devpts */ -struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); +struct dentry *devpts_pty_new(struct pts_fs_info *, int, void *); /* get private structure */ -void *devpts_get_priv(struct inode *pts_inode); +void *devpts_get_priv(struct dentry *); /* unlink */ -void devpts_pty_kill(struct inode *inode); +void devpts_pty_kill(struct dentry *); #endif diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 161052477f77..b742b5e47cc2 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -7,7 +7,7 @@ * defined; unless noted otherwise, they are optional, and can be * filled in with a null pointer. * - * struct tty_struct * (*lookup)(struct tty_driver *self, int idx) + * struct tty_struct * (*lookup)(struct tty_driver *self, struct file *, int idx) * * Return the tty device corresponding to idx, NULL if there is not * one currently in use and an ERR_PTR value on error. Called under @@ -250,7 +250,7 @@ struct serial_icounter_struct; struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, - struct inode *inode, int idx); + struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); -- cgit v1.2.3-71-gd317 From 0224a4a30b57385a60065aa598181868881d8fc6 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 27 Apr 2016 14:04:20 +0300 Subject: device property: Avoid potential dereferences of invalid pointers Since fwnode may hold ERR_PTR(-ENODEV) or it may be NULL, the fwnode type checks is_of_node(), is_acpi_node() and is is_pset_node() need to consider it. Using IS_ERR_OR_NULL() to check it. Fixes: 0d67e0fa1664 (device property: fix for a case of use-after-free) Reported-by: Dan Carpenter Signed-off-by: Heikki Krogerus [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 2 +- include/acpi/acpi_bus.h | 4 ++-- include/linux/of.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 9b1a65debd49..7f692accdc90 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -21,7 +21,7 @@ static inline bool is_pset_node(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_PDATA; + return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_PDATA; } static inline struct property_set *to_pset_node(struct fwnode_handle *fwnode) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 14362a84c78e..3a932501d690 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -394,13 +394,13 @@ struct acpi_data_node { static inline bool is_acpi_node(struct fwnode_handle *fwnode) { - return fwnode && (fwnode->type == FWNODE_ACPI + return !IS_ERR_OR_NULL(fwnode) && (fwnode->type == FWNODE_ACPI || fwnode->type == FWNODE_ACPI_DATA); } static inline bool is_acpi_device_node(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_ACPI; + return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_ACPI; } static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwnode) diff --git a/include/linux/of.h b/include/linux/of.h index 7fcb681baadf..31758036787c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -133,7 +133,7 @@ void of_core_init(void); static inline bool is_of_node(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_OF; + return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_OF; } static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) -- cgit v1.2.3-71-gd317 From 986ef95ecdd3eb6fa29433e68faa94c7624083be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 31 Mar 2016 19:03:25 +0300 Subject: IB/mlx5: Expose correct max_sge_rd limit mlx5 devices (Connect-IB, ConnectX-4, ConnectX-4-LX) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Cc: linux-stable@vger.kernel.org Reported-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/device.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5acf346e048e..049754f6e638 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -530,7 +530,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, sizeof(struct mlx5_wqe_ctrl_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); - props->max_sge_rd = props->max_sge; + props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8156e3c9239c..b3575f392492 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -392,6 +392,17 @@ enum { MLX5_CAP_OFF_CMDIF_CSUM = 46, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX5_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; -- cgit v1.2.3-71-gd317 From 7b7483409f09c15f30ac43242ead1ab3061e1f59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 25 Apr 2016 15:13:17 -0300 Subject: net: fix net_gso_ok for new GSO types. Fix casting in net_gso_ok. Otherwise the shift on gso_type << NETIF_F_GSO_SHIFT may hit the 32th bit and make it look like a INT_MIN, which is then promoted from signed to uint64 which is 0xffffffff80000000, resulting in wrong behavior when it is and'ed with the feature itself, as in: This test app: #include #include int main(int argc, char **argv) { uint64_t feature1; uint64_t feature2; int gso_type = 1 << 15; feature1 = gso_type << 16; feature2 = (uint64_t)gso_type << 16; printf("%lx %lx\n", feature1, feature2); return 0; } Gives: ffffffff80000000 80000000 So that this: return (features & feature) == feature; Actually works on more bits than expected and invalid ones. Fix is to promote it earlier. Issue noted while rebasing SCTP GSO patch but posting separetely as someone else may experience this meanwhile. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8395308a2445..b3c46b019ac1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4004,7 +4004,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); -- cgit v1.2.3-71-gd317 From 92117d8443bc5afacc8d5ba82e541946310f106e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Apr 2016 18:56:20 -0700 Subject: bpf: fix refcnt overflow On a system with >32Gbyte of phyiscal memory and infinite RLIMIT_MEMLOCK, the malicious application may overflow 32-bit bpf program refcnt. It's also possible to overflow map refcnt on 1Tb system. Impose 32k hard limit which means that the same bpf program or map cannot be shared by more than 32k processes. Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 ++- kernel/bpf/inode.c | 7 ++++--- kernel/bpf/syscall.c | 24 ++++++++++++++++++++---- kernel/bpf/verifier.c | 11 +++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 21ee41b92e8a..f1d5c5acc8dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -171,12 +171,13 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_put_rcu(struct bpf_prog *prog); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -void bpf_map_inc(struct bpf_map *map, bool uref); +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..8f94ca1860cf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adc5e4bd74f8..cf5e9f7ad13a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 ufd) @@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db2574e7b8b0..89bcaa0966da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2049,15 +2049,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; -- cgit v1.2.3-71-gd317 From 66ee95d16a7f1b7b4f1dd74a2d81c6e19dc29a14 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 28 Apr 2016 16:18:24 -0700 Subject: mm: exclude HugeTLB pages from THP page_mapped() logic HugeTLB pages cannot be split, so we use the compound_mapcount to track rmaps. Currently page_mapped() will check the compound_mapcount, but will also go through the constituent pages of a THP compound page and query the individual _mapcount's too. Unfortunately, page_mapped() does not distinguish between HugeTLB and THP compound pages and assumes that a compound page always needs to have HPAGE_PMD_NR pages querying. For most cases when dealing with HugeTLB this is just inefficient, but for scenarios where the HugeTLB page size is less than the pmd block size (e.g. when using contiguous bit on ARM) this can lead to crashes. This patch adjusts the page_mapped function such that we skip the unnecessary THP reference checks for HugeTLB pages. Fixes: e1534ae95004 ("mm: differentiate page_mapped() from page_mapcount() for compound pages") Signed-off-by: Steve Capper Acked-by: Kirill A. Shutemov Cc: Will Deacon Cc: Catalin Marinas Cc: Michal Hocko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a55e5be0894f..79b6c18d0a38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1031,6 +1031,8 @@ static inline bool page_mapped(struct page *page) page = compound_head(page); if (atomic_read(compound_mapcount_ptr(page)) >= 0) return true; + if (PageHuge(page)) + return false; for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; -- cgit v1.2.3-71-gd317 From aa88b68c3b1dce8bc3fd54c8a7372a777ff265cd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Apr 2016 16:18:27 -0700 Subject: thp: keep huge zero page pinned until tlb flush Andrea has found[1] a race condition on MMU-gather based TLB flush vs split_huge_page() or shrinker which frees huge zero under us (patch 1/2 and 2/2 respectively). With new THP refcounting, we don't need patch 1/2: mmu_gather keeps the page pinned until flush is complete and the pin prevents the page from being split under us. We still need patch 2/2. This is simplified version of Andrea's patch. We don't need fancy encoding. [1] http://lkml.kernel.org/r/1447938052-22165-1-git-send-email-aarcange@redhat.com Signed-off-by: Kirill A. Shutemov Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 6 +++--- mm/swap.c | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623e24b1..d7b9e5346fba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -152,6 +152,7 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) } struct page *get_huge_zero_page(void); +void put_huge_zero_page(void); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) @@ -208,6 +209,10 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline void put_huge_zero_page(void) +{ + BUILD_BUG(); +} static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86f9f8b82f8e..5346de05f471 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -232,7 +232,7 @@ retry: return READ_ONCE(huge_zero_page); } -static void put_huge_zero_page(void) +void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -1684,12 +1684,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); diff --git a/mm/swap.c b/mm/swap.c index a0bc206b4ac6..03aacbcb013f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + if (is_huge_zero_page(page)) { + put_huge_zero_page(); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; -- cgit v1.2.3-71-gd317 From 28093f9f34cedeaea0f481c58446d9dac6dd620f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 28 Apr 2016 16:18:35 -0700 Subject: numa: fix /proc//numa_maps for THP In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the layouts may differ depending on the architecture. On s390 this will lead to inaccurate numa_maps accounting in /proc because of misguided pte_present() and pte_dirty() checks on the fake pte. On other architectures pte_present() and pte_dirty() may work by chance, but there may be an issue with direct-access (dax) mappings w/o underlying struct pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page() the fake pte will be checked with pte_special() and because there is no "special" bit in a pmd, this will always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an invalid struct page pointer would then be returned that can crash the kernel. This patch fixes the numa_maps THP handling by introducing new "_pmd" variants of the can_gather_numa_stats() and vm_normal_page() functions. Signed-off-by: Gerald Schaefer Cc: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Cc: Dan Williams Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michael Holzheu Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 2 ++ mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 229cb546bee0..541583510cfb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, vma, addr); + page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) - gather_stats(page, md, pte_dirty(huge_pte), + gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; @@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; +#endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, vma, addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index 79b6c18d0a38..864d7221de84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1140,6 +1140,8 @@ struct zap_details { struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 93897f23cc11..305537fc8640 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -789,6 +789,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range -- cgit v1.2.3-71-gd317 From 2c94b53738549d81dc7464a32117d1f5112c64d3 Mon Sep 17 00:00:00 2001 From: Tim Bingham Date: Fri, 29 Apr 2016 13:30:23 -0400 Subject: net: Implement net_dbg_ratelimited() for CONFIG_DYNAMIC_DEBUG case Prior to commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") the implementation of net_dbg_ratelimited() was buggy for both the DEBUG and CONFIG_DYNAMIC_DEBUG cases. The bug was that net_ratelimit() was being called and, despite returning true, nothing was being printed to the console. This resulted in messages like the following - "net_ratelimit: %d callbacks suppressed" with no other output nearby. After commit d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") the bug is fixed for the DEBUG case. However, there's no output at all for CONFIG_DYNAMIC_DEBUG case. This patch restores debug output (if enabled) for the CONFIG_DYNAMIC_DEBUG case. Add a definition of net_dbg_ratelimited() for the CONFIG_DYNAMIC_DEBUG case. The implementation takes care to check that dynamic debugging is enabled before calling net_ratelimit(). Fixes: d92cff89a0c8 ("net_dbg_ratelimited: turn into no-op when !DEBUG") Signed-off-by: Tim Bingham Signed-off-by: David S. Miller --- include/linux/net.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 49175e4ced11..f840d77c6c31 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -246,7 +246,15 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) +#define net_dbg_ratelimited(fmt, ...) \ +do { \ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ + if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) && \ + net_ratelimit()) \ + __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__); \ +} while (0) +#elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else -- cgit v1.2.3-71-gd317 From 689de1d6ca95b3b5bd8ee446863bf81a4883ea25 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 May 2016 12:46:42 -0700 Subject: Minimal fix-up of bad hashing behavior of hash_64() This is a fairly minimal fixup to the horribly bad behavior of hash_64() with certain input patterns. In particular, because the multiplicative value used for the 64-bit hash was intentionally bit-sparse (so that the multiply could be done with shifts and adds on architectures without hardware multipliers), some bits did not get spread out very much. In particular, certain fairly common bit ranges in the input (roughly bits 12-20: commonly with the most information in them when you hash things like byte offsets in files or memory that have block factors that mean that the low bits are often zero) would not necessarily show up much in the result. There's a bigger patch-series brewing to fix up things more completely, but this is the fairly minimal fix for the 64-bit hashing problem. It simply picks a much better constant multiplier, spreading the bits out a lot better. NOTE! For 32-bit architectures, the bad old hash_64() remains the same for now, since 64-bit multiplies are expensive. The bigger hashing cleanup will replace the 32-bit case with something better. The new constants were picked by George Spelvin who wrote that bigger cleanup series. I just picked out the constants and part of the comment from that series. Cc: stable@vger.kernel.org Cc: George Spelvin Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/hash.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hash.h b/include/linux/hash.h index 1afde47e1528..79c52fa81cac 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -32,12 +32,28 @@ #error Wordsize not 32 or 64 #endif +/* + * The above primes are actively bad for hashing, since they are + * too sparse. The 32-bit one is mostly ok, the 64-bit one causes + * real problems. Besides, the "prime" part is pointless for the + * multiplicative hash. + * + * Although a random odd number will do, it turns out that the golden + * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice + * properties. + * + * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. + * (See Knuth vol 3, section 6.4, exercise 9.) + */ +#define GOLDEN_RATIO_32 0x61C88647 +#define GOLDEN_RATIO_64 0x61C8864680B583EBull + static __always_inline u64 hash_64(u64 val, unsigned int bits) { u64 hash = val; -#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 - hash = hash * GOLDEN_RATIO_PRIME_64; +#if BITS_PER_LONG == 64 + hash = hash * GOLDEN_RATIO_64; #else /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ u64 n = hash; -- cgit v1.2.3-71-gd317 From af67eb9e7e1ab37880459f83153d34b3c42b0075 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 2 May 2016 09:25:16 -0700 Subject: vxlan: Add checksum check to the features check function We need to perform an additional check on the inner headers to determine if we can offload the checksum for them. Previously this check didn't occur so we would generate an invalid frame in the case of an IPv6 header encapsulated inside of an IPv4 tunnel. To fix this I added a secondary check to vxlan_features_check so that we can verify that we can offload the inner checksum. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_ether.h | 5 +++++ include/net/vxlan.h | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index d5569734f672..548fd535fd02 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -28,6 +28,11 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) return (struct ethhdr *)skb_mac_header(skb); } +static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb_inner_mac_header(skb); +} + int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 73ed2e951c02..35437c779da8 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -252,7 +252,9 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != - sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || + (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; -- cgit v1.2.3-71-gd317 From 4550c4e157ca3da929593bb6c64080a59141af35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 5 May 2016 16:22:03 -0700 Subject: mm: memcontrol: let v2 cgroups follow changes in system swappiness Cgroup2 currently doesn't have a per-cgroup swappiness setting. We might want to add one later - that's a different discussion - but until we do, the cgroups should always follow the system setting. Otherwise it will be unchangeably set to whatever the ancestor inherited from the system setting at the time of cgroup creation. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: [4.5] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2b83359c19ca..0a4cd4703f40 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -533,6 +533,10 @@ static inline swp_entry_t get_swap_page(void) #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { + /* Cgroup2 doesn't have per-cgroup swappiness */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return vm_swappiness; + /* root ? */ if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; -- cgit v1.2.3-71-gd317 From 4e1016dac1ccce6d8a960775526cdc3a5baa690b Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Thu, 5 May 2016 16:22:06 -0700 Subject: rapidio/mport_cdev: fix uapi type definitions Fix problems in uapi definitions reported by Gabriel Laskar: (see https://lkml.org/lkml/2016/4/5/205 for details) - move public header file rio_mport_cdev.h to include/uapi/linux directory - change types in data structures passed as IOCTL parameters - improve parameter checking in some IOCTL service routines Signed-off-by: Alexandre Bounine Reported-by: Gabriel Laskar Tested-by: Barry Wood Cc: Gabriel Laskar Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 115 +++++++------ include/linux/rio_mport_cdev.h | 271 ------------------------------ include/uapi/linux/rio_mport_cdev.h | 277 +++++++++++++++++++++++++++++++ 3 files changed, 341 insertions(+), 322 deletions(-) delete mode 100644 include/linux/rio_mport_cdev.h create mode 100644 include/uapi/linux/rio_mport_cdev.h (limited to 'include/linux') diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 96168b819044..e165b7ce29d7 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -126,7 +126,7 @@ struct rio_mport_mapping { struct list_head node; struct mport_dev *md; enum rio_mport_map_dir dir; - u32 rioid; + u16 rioid; u64 rio_addr; dma_addr_t phys_addr; /* for mmap */ void *virt_addr; /* kernel address, for dma_free_coherent */ @@ -137,7 +137,7 @@ struct rio_mport_mapping { struct rio_mport_dma_map { int valid; - uint64_t length; + u64 length; void *vaddr; dma_addr_t paddr; }; @@ -208,7 +208,7 @@ struct mport_cdev_priv { struct kfifo event_fifo; wait_queue_head_t event_rx_wait; spinlock_t fifo_lock; - unsigned int event_mask; /* RIO_DOORBELL, RIO_PORTWRITE */ + u32 event_mask; /* RIO_DOORBELL, RIO_PORTWRITE */ #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_chan *dmach; struct list_head async_list; @@ -276,7 +276,8 @@ static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, return -EFAULT; if ((maint_io.offset % 4) || - (maint_io.length == 0) || (maint_io.length % 4)) + (maint_io.length == 0) || (maint_io.length % 4) || + (maint_io.length + maint_io.offset) > RIO_MAINT_SPACE_SZ) return -EINVAL; buffer = vmalloc(maint_io.length); @@ -298,7 +299,8 @@ static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, offset += 4; } - if (unlikely(copy_to_user(maint_io.buffer, buffer, maint_io.length))) + if (unlikely(copy_to_user((void __user *)(uintptr_t)maint_io.buffer, + buffer, maint_io.length))) ret = -EFAULT; out: vfree(buffer); @@ -319,7 +321,8 @@ static int rio_mport_maint_wr(struct mport_cdev_priv *priv, void __user *arg, return -EFAULT; if ((maint_io.offset % 4) || - (maint_io.length == 0) || (maint_io.length % 4)) + (maint_io.length == 0) || (maint_io.length % 4) || + (maint_io.length + maint_io.offset) > RIO_MAINT_SPACE_SZ) return -EINVAL; buffer = vmalloc(maint_io.length); @@ -327,7 +330,8 @@ static int rio_mport_maint_wr(struct mport_cdev_priv *priv, void __user *arg, return -ENOMEM; length = maint_io.length; - if (unlikely(copy_from_user(buffer, maint_io.buffer, length))) { + if (unlikely(copy_from_user(buffer, + (void __user *)(uintptr_t)maint_io.buffer, length))) { ret = -EFAULT; goto out; } @@ -360,7 +364,7 @@ out: */ static int rio_mport_create_outbound_mapping(struct mport_dev *md, struct file *filp, - u32 rioid, u64 raddr, u32 size, + u16 rioid, u64 raddr, u32 size, dma_addr_t *paddr) { struct rio_mport *mport = md->mport; @@ -369,7 +373,7 @@ rio_mport_create_outbound_mapping(struct mport_dev *md, struct file *filp, rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%x", rioid, raddr, size); - map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) return -ENOMEM; @@ -394,7 +398,7 @@ err_map_outb: static int rio_mport_get_outbound_mapping(struct mport_dev *md, struct file *filp, - u32 rioid, u64 raddr, u32 size, + u16 rioid, u64 raddr, u32 size, dma_addr_t *paddr) { struct rio_mport_mapping *map; @@ -433,7 +437,7 @@ static int rio_mport_obw_map(struct file *filp, void __user *arg) dma_addr_t paddr; int ret; - if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap)))) + if (unlikely(copy_from_user(&map, arg, sizeof(map)))) return -EFAULT; rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%llx", @@ -448,7 +452,7 @@ static int rio_mport_obw_map(struct file *filp, void __user *arg) map.handle = paddr; - if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) + if (unlikely(copy_to_user(arg, &map, sizeof(map)))) return -EFAULT; return 0; } @@ -469,7 +473,7 @@ static int rio_mport_obw_free(struct file *filp, void __user *arg) if (!md->mport->ops->unmap_outb) return -EPROTONOSUPPORT; - if (copy_from_user(&handle, arg, sizeof(u64))) + if (copy_from_user(&handle, arg, sizeof(handle))) return -EFAULT; rmcd_debug(OBW, "h=0x%llx", handle); @@ -498,9 +502,9 @@ static int rio_mport_obw_free(struct file *filp, void __user *arg) static int maint_hdid_set(struct mport_cdev_priv *priv, void __user *arg) { struct mport_dev *md = priv->md; - uint16_t hdid; + u16 hdid; - if (copy_from_user(&hdid, arg, sizeof(uint16_t))) + if (copy_from_user(&hdid, arg, sizeof(hdid))) return -EFAULT; md->mport->host_deviceid = hdid; @@ -520,9 +524,9 @@ static int maint_hdid_set(struct mport_cdev_priv *priv, void __user *arg) static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg) { struct mport_dev *md = priv->md; - uint32_t comptag; + u32 comptag; - if (copy_from_user(&comptag, arg, sizeof(uint32_t))) + if (copy_from_user(&comptag, arg, sizeof(comptag))) return -EFAULT; rio_local_write_config_32(md->mport, RIO_COMPONENT_TAG_CSR, comptag); @@ -837,7 +841,7 @@ err_out: * @xfer: data transfer descriptor structure */ static int -rio_dma_transfer(struct file *filp, uint32_t transfer_mode, +rio_dma_transfer(struct file *filp, u32 transfer_mode, enum rio_transfer_sync sync, enum dma_data_direction dir, struct rio_transfer_io *xfer) { @@ -875,7 +879,7 @@ rio_dma_transfer(struct file *filp, uint32_t transfer_mode, unsigned long offset; long pinned; - offset = (unsigned long)xfer->loc_addr & ~PAGE_MASK; + offset = (unsigned long)(uintptr_t)xfer->loc_addr & ~PAGE_MASK; nr_pages = PAGE_ALIGN(xfer->length + offset) >> PAGE_SHIFT; page_list = kmalloc_array(nr_pages, @@ -1015,19 +1019,20 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction)))) return -EFAULT; - if (transaction.count != 1) + if (transaction.count != 1) /* only single transfer for now */ return -EINVAL; if ((transaction.transfer_mode & priv->md->properties.transfer_mode) == 0) return -ENODEV; - transfer = vmalloc(transaction.count * sizeof(struct rio_transfer_io)); + transfer = vmalloc(transaction.count * sizeof(*transfer)); if (!transfer) return -ENOMEM; - if (unlikely(copy_from_user(transfer, transaction.block, - transaction.count * sizeof(struct rio_transfer_io)))) { + if (unlikely(copy_from_user(transfer, + (void __user *)(uintptr_t)transaction.block, + transaction.count * sizeof(*transfer)))) { ret = -EFAULT; goto out_free; } @@ -1038,8 +1043,9 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) ret = rio_dma_transfer(filp, transaction.transfer_mode, transaction.sync, dir, &transfer[i]); - if (unlikely(copy_to_user(transaction.block, transfer, - transaction.count * sizeof(struct rio_transfer_io)))) + if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block, + transfer, + transaction.count * sizeof(*transfer)))) ret = -EFAULT; out_free: @@ -1129,11 +1135,11 @@ err_tmo: } static int rio_mport_create_dma_mapping(struct mport_dev *md, struct file *filp, - uint64_t size, struct rio_mport_mapping **mapping) + u64 size, struct rio_mport_mapping **mapping) { struct rio_mport_mapping *map; - map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) return -ENOMEM; @@ -1165,7 +1171,7 @@ static int rio_mport_alloc_dma(struct file *filp, void __user *arg) struct rio_mport_mapping *mapping = NULL; int ret; - if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_dma_mem)))) + if (unlikely(copy_from_user(&map, arg, sizeof(map)))) return -EFAULT; ret = rio_mport_create_dma_mapping(md, filp, map.length, &mapping); @@ -1174,7 +1180,7 @@ static int rio_mport_alloc_dma(struct file *filp, void __user *arg) map.dma_handle = mapping->phys_addr; - if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_dma_mem)))) { + if (unlikely(copy_to_user(arg, &map, sizeof(map)))) { mutex_lock(&md->buf_mutex); kref_put(&mapping->ref, mport_release_mapping); mutex_unlock(&md->buf_mutex); @@ -1192,7 +1198,7 @@ static int rio_mport_free_dma(struct file *filp, void __user *arg) int ret = -EFAULT; struct rio_mport_mapping *map, *_map; - if (copy_from_user(&handle, arg, sizeof(u64))) + if (copy_from_user(&handle, arg, sizeof(handle))) return -EFAULT; rmcd_debug(EXIT, "filp=%p", filp); @@ -1242,14 +1248,18 @@ static int rio_mport_free_dma(struct file *filp, void __user *arg) static int rio_mport_create_inbound_mapping(struct mport_dev *md, struct file *filp, - u64 raddr, u32 size, + u64 raddr, u64 size, struct rio_mport_mapping **mapping) { struct rio_mport *mport = md->mport; struct rio_mport_mapping *map; int ret; - map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + /* rio_map_inb_region() accepts u32 size */ + if (size > 0xffffffff) + return -EINVAL; + + map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) return -ENOMEM; @@ -1262,7 +1272,7 @@ rio_mport_create_inbound_mapping(struct mport_dev *md, struct file *filp, if (raddr == RIO_MAP_ANY_ADDR) raddr = map->phys_addr; - ret = rio_map_inb_region(mport, map->phys_addr, raddr, size, 0); + ret = rio_map_inb_region(mport, map->phys_addr, raddr, (u32)size, 0); if (ret < 0) goto err_map_inb; @@ -1288,7 +1298,7 @@ err_dma_alloc: static int rio_mport_get_inbound_mapping(struct mport_dev *md, struct file *filp, - u64 raddr, u32 size, + u64 raddr, u64 size, struct rio_mport_mapping **mapping) { struct rio_mport_mapping *map; @@ -1331,7 +1341,7 @@ static int rio_mport_map_inbound(struct file *filp, void __user *arg) if (!md->mport->ops->map_inb) return -EPROTONOSUPPORT; - if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap)))) + if (unlikely(copy_from_user(&map, arg, sizeof(map)))) return -EFAULT; rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp); @@ -1344,7 +1354,7 @@ static int rio_mport_map_inbound(struct file *filp, void __user *arg) map.handle = mapping->phys_addr; map.rio_addr = mapping->rio_addr; - if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) { + if (unlikely(copy_to_user(arg, &map, sizeof(map)))) { /* Delete mapping if it was created by this request */ if (ret == 0 && mapping->filp == filp) { mutex_lock(&md->buf_mutex); @@ -1375,7 +1385,7 @@ static int rio_mport_inbound_free(struct file *filp, void __user *arg) if (!md->mport->ops->unmap_inb) return -EPROTONOSUPPORT; - if (copy_from_user(&handle, arg, sizeof(u64))) + if (copy_from_user(&handle, arg, sizeof(handle))) return -EFAULT; mutex_lock(&md->buf_mutex); @@ -1401,7 +1411,7 @@ static int rio_mport_inbound_free(struct file *filp, void __user *arg) static int maint_port_idx_get(struct mport_cdev_priv *priv, void __user *arg) { struct mport_dev *md = priv->md; - uint32_t port_idx = md->mport->index; + u32 port_idx = md->mport->index; rmcd_debug(MPORT, "port_index=%d", port_idx); @@ -1451,7 +1461,7 @@ static void rio_mport_doorbell_handler(struct rio_mport *mport, void *dev_id, handled = 0; spin_lock(&data->db_lock); list_for_each_entry(db_filter, &data->doorbells, data_node) { - if (((db_filter->filter.rioid == 0xffffffff || + if (((db_filter->filter.rioid == RIO_INVALID_DESTID || db_filter->filter.rioid == src)) && info >= db_filter->filter.low && info <= db_filter->filter.high) { @@ -1525,6 +1535,9 @@ static int rio_mport_remove_db_filter(struct mport_cdev_priv *priv, if (copy_from_user(&filter, arg, sizeof(filter))) return -EFAULT; + if (filter.low > filter.high) + return -EINVAL; + spin_lock_irqsave(&priv->md->db_lock, flags); list_for_each_entry(db_filter, &priv->db_filters, priv_node) { if (db_filter->filter.rioid == filter.rioid && @@ -1737,10 +1750,10 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, return -EEXIST; } - size = sizeof(struct rio_dev); + size = sizeof(*rdev); mport = md->mport; - destid = (u16)dev_info.destid; - hopcount = (u8)dev_info.hopcount; + destid = dev_info.destid; + hopcount = dev_info.hopcount; if (rio_mport_read_config_32(mport, destid, hopcount, RIO_PEF_CAR, &rval)) @@ -1872,8 +1885,8 @@ static int rio_mport_del_riodev(struct mport_cdev_priv *priv, void __user *arg) do { rdev = rio_get_comptag(dev_info.comptag, rdev); if (rdev && rdev->dev.parent == &mport->net->dev && - rdev->destid == (u16)dev_info.destid && - rdev->hopcount == (u8)dev_info.hopcount) + rdev->destid == dev_info.destid && + rdev->hopcount == dev_info.hopcount) break; } while (rdev); } @@ -2146,8 +2159,8 @@ static long mport_cdev_ioctl(struct file *filp, return maint_port_idx_get(data, (void __user *)arg); case RIO_MPORT_GET_PROPERTIES: md->properties.hdid = md->mport->host_deviceid; - if (copy_to_user((void __user *)arg, &(data->md->properties), - sizeof(data->md->properties))) + if (copy_to_user((void __user *)arg, &(md->properties), + sizeof(md->properties))) return -EFAULT; return 0; case RIO_ENABLE_DOORBELL_RANGE: @@ -2159,11 +2172,11 @@ static long mport_cdev_ioctl(struct file *filp, case RIO_DISABLE_PORTWRITE_RANGE: return rio_mport_remove_pw_filter(data, (void __user *)arg); case RIO_SET_EVENT_MASK: - data->event_mask = arg; + data->event_mask = (u32)arg; return 0; case RIO_GET_EVENT_MASK: if (copy_to_user((void __user *)arg, &data->event_mask, - sizeof(data->event_mask))) + sizeof(u32))) return -EFAULT; return 0; case RIO_MAP_OUTBOUND: @@ -2374,7 +2387,7 @@ static ssize_t mport_write(struct file *filp, const char __user *buf, return -EINVAL; ret = rio_mport_send_doorbell(mport, - (u16)event.u.doorbell.rioid, + event.u.doorbell.rioid, event.u.doorbell.payload); if (ret < 0) return ret; @@ -2421,7 +2434,7 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) struct mport_dev *md; struct rio_mport_attr attr; - md = kzalloc(sizeof(struct mport_dev), GFP_KERNEL); + md = kzalloc(sizeof(*md), GFP_KERNEL); if (!md) { rmcd_error("Unable allocate a device object"); return NULL; @@ -2470,7 +2483,7 @@ static struct mport_dev *mport_cdev_add(struct rio_mport *mport) /* The transfer_mode property will be returned through mport query * interface */ -#ifdef CONFIG_PPC /* for now: only on Freescale's SoCs */ +#ifdef CONFIG_FSL_RIO /* for now: only on Freescale's SoCs */ md->properties.transfer_mode |= RIO_TRANSFER_MODE_MAPPED; #else md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER; diff --git a/include/linux/rio_mport_cdev.h b/include/linux/rio_mport_cdev.h deleted file mode 100644 index b65d19df76d2..000000000000 --- a/include/linux/rio_mport_cdev.h +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2015-2016, Integrated Device Technology Inc. - * Copyright (c) 2015, Prodrive Technologies - * Copyright (c) 2015, Texas Instruments Incorporated - * Copyright (c) 2015, RapidIO Trade Association - * All rights reserved. - * - * This software is available to you under a choice of one of two licenses. - * You may choose to be licensed under the terms of the GNU General Public - * License(GPL) Version 2, or the BSD-3 Clause license below: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _RIO_MPORT_CDEV_H_ -#define _RIO_MPORT_CDEV_H_ - -#ifndef __user -#define __user -#endif - -struct rio_mport_maint_io { - uint32_t rioid; /* destID of remote device */ - uint32_t hopcount; /* hopcount to remote device */ - uint32_t offset; /* offset in register space */ - size_t length; /* length in bytes */ - void __user *buffer; /* data buffer */ -}; - -/* - * Definitions for RapidIO data transfers: - * - memory mapped (MAPPED) - * - packet generation from memory (TRANSFER) - */ -#define RIO_TRANSFER_MODE_MAPPED (1 << 0) -#define RIO_TRANSFER_MODE_TRANSFER (1 << 1) -#define RIO_CAP_DBL_SEND (1 << 2) -#define RIO_CAP_DBL_RECV (1 << 3) -#define RIO_CAP_PW_SEND (1 << 4) -#define RIO_CAP_PW_RECV (1 << 5) -#define RIO_CAP_MAP_OUTB (1 << 6) -#define RIO_CAP_MAP_INB (1 << 7) - -struct rio_mport_properties { - uint16_t hdid; - uint8_t id; /* Physical port ID */ - uint8_t index; - uint32_t flags; - uint32_t sys_size; /* Default addressing size */ - uint8_t port_ok; - uint8_t link_speed; - uint8_t link_width; - uint32_t dma_max_sge; - uint32_t dma_max_size; - uint32_t dma_align; - uint32_t transfer_mode; /* Default transfer mode */ - uint32_t cap_sys_size; /* Capable system sizes */ - uint32_t cap_addr_size; /* Capable addressing sizes */ - uint32_t cap_transfer_mode; /* Capable transfer modes */ - uint32_t cap_mport; /* Mport capabilities */ -}; - -/* - * Definitions for RapidIO events; - * - incoming port-writes - * - incoming doorbells - */ -#define RIO_DOORBELL (1 << 0) -#define RIO_PORTWRITE (1 << 1) - -struct rio_doorbell { - uint32_t rioid; - uint16_t payload; -}; - -struct rio_doorbell_filter { - uint32_t rioid; /* 0xffffffff to match all ids */ - uint16_t low; - uint16_t high; -}; - - -struct rio_portwrite { - uint32_t payload[16]; -}; - -struct rio_pw_filter { - uint32_t mask; - uint32_t low; - uint32_t high; -}; - -/* RapidIO base address for inbound requests set to value defined below - * indicates that no specific RIO-to-local address translation is requested - * and driver should use direct (one-to-one) address mapping. -*/ -#define RIO_MAP_ANY_ADDR (uint64_t)(~((uint64_t) 0)) - -struct rio_mmap { - uint32_t rioid; - uint64_t rio_addr; - uint64_t length; - uint64_t handle; - void *address; -}; - -struct rio_dma_mem { - uint64_t length; /* length of DMA memory */ - uint64_t dma_handle; /* handle associated with this memory */ - void *buffer; /* pointer to this memory */ -}; - - -struct rio_event { - unsigned int header; /* event type RIO_DOORBELL or RIO_PORTWRITE */ - union { - struct rio_doorbell doorbell; /* header for RIO_DOORBELL */ - struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */ - } u; -}; - -enum rio_transfer_sync { - RIO_TRANSFER_SYNC, /* synchronous transfer */ - RIO_TRANSFER_ASYNC, /* asynchronous transfer */ - RIO_TRANSFER_FAF, /* fire-and-forget transfer */ -}; - -enum rio_transfer_dir { - RIO_TRANSFER_DIR_READ, /* Read operation */ - RIO_TRANSFER_DIR_WRITE, /* Write operation */ -}; - -/* - * RapidIO data exchange transactions are lists of individual transfers. Each - * transfer exchanges data between two RapidIO devices by remote direct memory - * access and has its own completion code. - * - * The RapidIO specification defines four types of data exchange requests: - * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows - * to specify the required type of write operation or combination of them when - * only the last data packet requires response. - * - * NREAD: read up to 256 bytes from remote device memory into local memory - * NWRITE: write up to 256 bytes from local memory to remote device memory - * without confirmation - * SWRITE: as NWRITE, but all addresses and payloads must be 64-bit aligned - * NWRITE_R: as NWRITE, but expect acknowledgment from remote device. - * - * The default exchange is chosen from NREAD and any of the WRITE modes as the - * driver sees fit. For write requests the user can explicitly choose between - * any of the write modes for each transaction. - */ -enum rio_exchange { - RIO_EXCHANGE_DEFAULT, /* Default method */ - RIO_EXCHANGE_NWRITE, /* All packets using NWRITE */ - RIO_EXCHANGE_SWRITE, /* All packets using SWRITE */ - RIO_EXCHANGE_NWRITE_R, /* Last packet NWRITE_R, others NWRITE */ - RIO_EXCHANGE_SWRITE_R, /* Last packet NWRITE_R, others SWRITE */ - RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */ -}; - -struct rio_transfer_io { - uint32_t rioid; /* Target destID */ - uint64_t rio_addr; /* Address in target's RIO mem space */ - enum rio_exchange method; /* Data exchange method */ - void __user *loc_addr; - uint64_t handle; - uint64_t offset; /* Offset in buffer */ - uint64_t length; /* Length in bytes */ - uint32_t completion_code; /* Completion code for this transfer */ -}; - -struct rio_transaction { - uint32_t transfer_mode; /* Data transfer mode */ - enum rio_transfer_sync sync; /* Synchronization method */ - enum rio_transfer_dir dir; /* Transfer direction */ - size_t count; /* Number of transfers */ - struct rio_transfer_io __user *block; /* Array of transfers */ -}; - -struct rio_async_tx_wait { - uint32_t token; /* DMA transaction ID token */ - uint32_t timeout; /* Wait timeout in msec, if 0 use default TO */ -}; - -#define RIO_MAX_DEVNAME_SZ 20 - -struct rio_rdev_info { - uint32_t destid; - uint8_t hopcount; - uint32_t comptag; - char name[RIO_MAX_DEVNAME_SZ + 1]; -}; - -/* Driver IOCTL codes */ -#define RIO_MPORT_DRV_MAGIC 'm' - -#define RIO_MPORT_MAINT_HDID_SET \ - _IOW(RIO_MPORT_DRV_MAGIC, 1, uint16_t) -#define RIO_MPORT_MAINT_COMPTAG_SET \ - _IOW(RIO_MPORT_DRV_MAGIC, 2, uint32_t) -#define RIO_MPORT_MAINT_PORT_IDX_GET \ - _IOR(RIO_MPORT_DRV_MAGIC, 3, uint32_t) -#define RIO_MPORT_GET_PROPERTIES \ - _IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties) -#define RIO_MPORT_MAINT_READ_LOCAL \ - _IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_WRITE_LOCAL \ - _IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_READ_REMOTE \ - _IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io) -#define RIO_MPORT_MAINT_WRITE_REMOTE \ - _IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io) -#define RIO_ENABLE_DOORBELL_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter) -#define RIO_DISABLE_DOORBELL_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter) -#define RIO_ENABLE_PORTWRITE_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter) -#define RIO_DISABLE_PORTWRITE_RANGE \ - _IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter) -#define RIO_SET_EVENT_MASK \ - _IOW(RIO_MPORT_DRV_MAGIC, 13, unsigned int) -#define RIO_GET_EVENT_MASK \ - _IOR(RIO_MPORT_DRV_MAGIC, 14, unsigned int) -#define RIO_MAP_OUTBOUND \ - _IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap) -#define RIO_UNMAP_OUTBOUND \ - _IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap) -#define RIO_MAP_INBOUND \ - _IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap) -#define RIO_UNMAP_INBOUND \ - _IOW(RIO_MPORT_DRV_MAGIC, 18, uint64_t) -#define RIO_ALLOC_DMA \ - _IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem) -#define RIO_FREE_DMA \ - _IOW(RIO_MPORT_DRV_MAGIC, 20, uint64_t) -#define RIO_TRANSFER \ - _IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction) -#define RIO_WAIT_FOR_ASYNC \ - _IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait) -#define RIO_DEV_ADD \ - _IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info) -#define RIO_DEV_DEL \ - _IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info) - -#endif /* _RIO_MPORT_CDEV_H_ */ diff --git a/include/uapi/linux/rio_mport_cdev.h b/include/uapi/linux/rio_mport_cdev.h new file mode 100644 index 000000000000..5796bf1d06ad --- /dev/null +++ b/include/uapi/linux/rio_mport_cdev.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2015-2016, Integrated Device Technology Inc. + * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, Texas Instruments Incorporated + * Copyright (c) 2015, RapidIO Trade Association + * All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License(GPL) Version 2, or the BSD-3 Clause license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RIO_MPORT_CDEV_H_ +#define _RIO_MPORT_CDEV_H_ + +#include +#include + +struct rio_mport_maint_io { + __u16 rioid; /* destID of remote device */ + __u8 hopcount; /* hopcount to remote device */ + __u8 pad0[5]; + __u32 offset; /* offset in register space */ + __u32 length; /* length in bytes */ + __u64 buffer; /* pointer to data buffer */ +}; + +/* + * Definitions for RapidIO data transfers: + * - memory mapped (MAPPED) + * - packet generation from memory (TRANSFER) + */ +#define RIO_TRANSFER_MODE_MAPPED (1 << 0) +#define RIO_TRANSFER_MODE_TRANSFER (1 << 1) +#define RIO_CAP_DBL_SEND (1 << 2) +#define RIO_CAP_DBL_RECV (1 << 3) +#define RIO_CAP_PW_SEND (1 << 4) +#define RIO_CAP_PW_RECV (1 << 5) +#define RIO_CAP_MAP_OUTB (1 << 6) +#define RIO_CAP_MAP_INB (1 << 7) + +struct rio_mport_properties { + __u16 hdid; + __u8 id; /* Physical port ID */ + __u8 index; + __u32 flags; + __u32 sys_size; /* Default addressing size */ + __u8 port_ok; + __u8 link_speed; + __u8 link_width; + __u8 pad0; + __u32 dma_max_sge; + __u32 dma_max_size; + __u32 dma_align; + __u32 transfer_mode; /* Default transfer mode */ + __u32 cap_sys_size; /* Capable system sizes */ + __u32 cap_addr_size; /* Capable addressing sizes */ + __u32 cap_transfer_mode; /* Capable transfer modes */ + __u32 cap_mport; /* Mport capabilities */ +}; + +/* + * Definitions for RapidIO events; + * - incoming port-writes + * - incoming doorbells + */ +#define RIO_DOORBELL (1 << 0) +#define RIO_PORTWRITE (1 << 1) + +struct rio_doorbell { + __u16 rioid; + __u16 payload; +}; + +struct rio_doorbell_filter { + __u16 rioid; /* Use RIO_INVALID_DESTID to match all ids */ + __u16 low; + __u16 high; + __u16 pad0; +}; + + +struct rio_portwrite { + __u32 payload[16]; +}; + +struct rio_pw_filter { + __u32 mask; + __u32 low; + __u32 high; + __u32 pad0; +}; + +/* RapidIO base address for inbound requests set to value defined below + * indicates that no specific RIO-to-local address translation is requested + * and driver should use direct (one-to-one) address mapping. +*/ +#define RIO_MAP_ANY_ADDR (__u64)(~((__u64) 0)) + +struct rio_mmap { + __u16 rioid; + __u16 pad0[3]; + __u64 rio_addr; + __u64 length; + __u64 handle; + __u64 address; +}; + +struct rio_dma_mem { + __u64 length; /* length of DMA memory */ + __u64 dma_handle; /* handle associated with this memory */ + __u64 address; +}; + +struct rio_event { + __u32 header; /* event type RIO_DOORBELL or RIO_PORTWRITE */ + union { + struct rio_doorbell doorbell; /* header for RIO_DOORBELL */ + struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */ + } u; + __u32 pad0; +}; + +enum rio_transfer_sync { + RIO_TRANSFER_SYNC, /* synchronous transfer */ + RIO_TRANSFER_ASYNC, /* asynchronous transfer */ + RIO_TRANSFER_FAF, /* fire-and-forget transfer */ +}; + +enum rio_transfer_dir { + RIO_TRANSFER_DIR_READ, /* Read operation */ + RIO_TRANSFER_DIR_WRITE, /* Write operation */ +}; + +/* + * RapidIO data exchange transactions are lists of individual transfers. Each + * transfer exchanges data between two RapidIO devices by remote direct memory + * access and has its own completion code. + * + * The RapidIO specification defines four types of data exchange requests: + * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows + * to specify the required type of write operation or combination of them when + * only the last data packet requires response. + * + * NREAD: read up to 256 bytes from remote device memory into local memory + * NWRITE: write up to 256 bytes from local memory to remote device memory + * without confirmation + * SWRITE: as NWRITE, but all addresses and payloads must be 64-bit aligned + * NWRITE_R: as NWRITE, but expect acknowledgment from remote device. + * + * The default exchange is chosen from NREAD and any of the WRITE modes as the + * driver sees fit. For write requests the user can explicitly choose between + * any of the write modes for each transaction. + */ +enum rio_exchange { + RIO_EXCHANGE_DEFAULT, /* Default method */ + RIO_EXCHANGE_NWRITE, /* All packets using NWRITE */ + RIO_EXCHANGE_SWRITE, /* All packets using SWRITE */ + RIO_EXCHANGE_NWRITE_R, /* Last packet NWRITE_R, others NWRITE */ + RIO_EXCHANGE_SWRITE_R, /* Last packet NWRITE_R, others SWRITE */ + RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */ +}; + +struct rio_transfer_io { + __u64 rio_addr; /* Address in target's RIO mem space */ + __u64 loc_addr; + __u64 handle; + __u64 offset; /* Offset in buffer */ + __u64 length; /* Length in bytes */ + __u16 rioid; /* Target destID */ + __u16 method; /* Data exchange method, one of rio_exchange enum */ + __u32 completion_code; /* Completion code for this transfer */ +}; + +struct rio_transaction { + __u64 block; /* Pointer to array of transfers */ + __u32 count; /* Number of transfers */ + __u32 transfer_mode; /* Data transfer mode */ + __u16 sync; /* Synch method, one of rio_transfer_sync enum */ + __u16 dir; /* Transfer direction, one of rio_transfer_dir enum */ + __u32 pad0; +}; + +struct rio_async_tx_wait { + __u32 token; /* DMA transaction ID token */ + __u32 timeout; /* Wait timeout in msec, if 0 use default TO */ +}; + +#define RIO_MAX_DEVNAME_SZ 20 + +struct rio_rdev_info { + __u16 destid; + __u8 hopcount; + __u8 pad0; + __u32 comptag; + char name[RIO_MAX_DEVNAME_SZ + 1]; +}; + +/* Driver IOCTL codes */ +#define RIO_MPORT_DRV_MAGIC 'm' + +#define RIO_MPORT_MAINT_HDID_SET \ + _IOW(RIO_MPORT_DRV_MAGIC, 1, __u16) +#define RIO_MPORT_MAINT_COMPTAG_SET \ + _IOW(RIO_MPORT_DRV_MAGIC, 2, __u32) +#define RIO_MPORT_MAINT_PORT_IDX_GET \ + _IOR(RIO_MPORT_DRV_MAGIC, 3, __u32) +#define RIO_MPORT_GET_PROPERTIES \ + _IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties) +#define RIO_MPORT_MAINT_READ_LOCAL \ + _IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_WRITE_LOCAL \ + _IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_READ_REMOTE \ + _IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_WRITE_REMOTE \ + _IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io) +#define RIO_ENABLE_DOORBELL_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter) +#define RIO_DISABLE_DOORBELL_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter) +#define RIO_ENABLE_PORTWRITE_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter) +#define RIO_DISABLE_PORTWRITE_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter) +#define RIO_SET_EVENT_MASK \ + _IOW(RIO_MPORT_DRV_MAGIC, 13, __u32) +#define RIO_GET_EVENT_MASK \ + _IOR(RIO_MPORT_DRV_MAGIC, 14, __u32) +#define RIO_MAP_OUTBOUND \ + _IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap) +#define RIO_UNMAP_OUTBOUND \ + _IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap) +#define RIO_MAP_INBOUND \ + _IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap) +#define RIO_UNMAP_INBOUND \ + _IOW(RIO_MPORT_DRV_MAGIC, 18, __u64) +#define RIO_ALLOC_DMA \ + _IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem) +#define RIO_FREE_DMA \ + _IOW(RIO_MPORT_DRV_MAGIC, 20, __u64) +#define RIO_TRANSFER \ + _IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction) +#define RIO_WAIT_FOR_ASYNC \ + _IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait) +#define RIO_DEV_ADD \ + _IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info) +#define RIO_DEV_DEL \ + _IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info) + +#endif /* _RIO_MPORT_CDEV_H_ */ -- cgit v1.2.3-71-gd317 From 127393fbe597dd85863a9bdccaa11007e7d4948f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 5 May 2016 16:22:20 -0700 Subject: mm: thp: kvm: fix memory corruption in KVM with THP enabled After the THP refcounting change, obtaining a compound pages from get_user_pages() no longer allows us to assume the entire compound page is immediately mappable from a secondary MMU. A secondary MMU doesn't want to call get_user_pages() more than once for each compound page, in order to know if it can map the whole compound page. So a secondary MMU needs to know from a single get_user_pages() invocation when it can map immediately the entire compound page to avoid a flood of unnecessary secondary MMU faults and spurious atomic_inc()/atomic_dec() (pages don't have to be pinned by MMU notifier users). Ideally instead of the page->_mapcount < 1 check, get_user_pages() should return the granularity of the "page" mapping in the "mm" passed to get_user_pages(). However it's non trivial change to pass the "pmd" status belonging to the "mm" walked by get_user_pages up the stack (up to the caller of get_user_pages). So the fix just checks if there is not a single pte mapping on the page returned by get_user_pages, and in turn if the caller can assume that the whole compound page is mapped in the current "mm" (in a pmd_trans_huge()). In such case the entire compound page is safe to map into the secondary MMU without additional get_user_pages() calls on the surrounding tail/head pages. In addition of being faster, not having to run other get_user_pages() calls also reduces the memory footprint of the secondary MMU fault in case the pmd split happened as result of memory pressure. Without this fix after a MADV_DONTNEED (like invoked by QEMU during postcopy live migration or balloning) or after generic swapping (with a failure in split_huge_page() that would only result in pmd splitting and not a physical page split), KVM would map the whole compound page into the shadow pagetables, despite regular faults or userfaults (like UFFDIO_COPY) may map regular pages into the primary MMU as result of the pte faults, leading to the guest mode and userland mode going out of sync and not working on the same memory at all times. Any other secondary MMU notifier manager (KVM is just one of the many MMU notifier users) will need the same information if it doesn't want to run a flood of get_user_pages_fast and it can support multiple granularity in the secondary MMU mappings, so I think it is justified to be exposed not just to KVM. The other option would be to move transparent_hugepage_adjust to mm/huge_memory.c but that currently has all kind of KVM data structures in it, so it's definitely not a cut-and-paste work, so I couldn't do a fix as cleaner as this one for 4.6. Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: "Kirill A. Shutemov" Cc: "Li, Liang Z" Cc: Amit Shah Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kvm/mmu.c | 2 +- arch/x86/kvm/mmu.c | 4 ++-- include/linux/page-flags.h | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 58dbd5c439df..d6d4191e68f2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1004,7 +1004,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; - if (PageTransCompound(pfn_to_page(pfn))) { + if (PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ff4dbb73fb7..b6f50e8b0a39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2823,7 +2823,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompound(pfn_to_page(pfn)) && + PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* @@ -4785,7 +4785,7 @@ restart: */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - PageTransCompound(pfn_to_page(pfn))) { + PageTransCompoundMap(pfn_to_page(pfn))) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f4ed4f1b0c77..6b052aa7b5b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -516,6 +516,27 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } +/* + * PageTransCompoundMap is the same as PageTransCompound, but it also + * guarantees the primary MMU has the entire compound page mapped + * through pmd_trans_huge, which in turn guarantees the secondary MMUs + * can also map the entire compound page. This allows the secondary + * MMUs to call get_user_pages() only once for each compound page and + * to immediately map the entire compound page with a single secondary + * MMU fault. If there will be a pmd split later, the secondary MMUs + * will get an update through the MMU notifier invalidation through + * split_huge_pmd(). + * + * Unlike PageTransCompound, this is safe to be called only while + * split_huge_pmd() cannot run from under us, like if protected by the + * MMU notifier, otherwise it may result in page->_mapcount < 0 false + * positives. + */ +static inline int PageTransCompoundMap(struct page *page) +{ + return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; +} + /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -559,6 +580,7 @@ static inline int TestClearPageDoubleMap(struct page *page) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) TESTPAGEFLAG_FALSE(DoubleMap) TESTSETFLAG_FALSE(DoubleMap) -- cgit v1.2.3-71-gd317 From 54d5ca871e72f2bb172ec9323497f01cd5091ec7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 May 2016 01:16:37 +0200 Subject: vfs: add vfs_select_inode() helper Signed-off-by: Miklos Szeredi Cc: # v4.2+ --- fs/open.c | 12 ++++-------- include/linux/dcache.h | 12 ++++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/open.c b/fs/open.c index 17cb6b1dab75..081d3d6df74b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4bb4de8d95ea..7e9422cb5989 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -565,4 +565,16 @@ static inline struct dentry *d_real(struct dentry *dentry) return dentry; } +static inline struct inode *vfs_select_inode(struct dentry *dentry, + unsigned open_flags) +{ + struct inode *inode = d_inode(dentry); + + if (inode && unlikely(dentry->d_flags & DCACHE_OP_SELECT_INODE)) + inode = dentry->d_op->d_select_inode(dentry, open_flags); + + return inode; +} + + #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3-71-gd317 From 3c9fe8cdff1b889a059a30d22f130372f2b3885f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 May 2016 01:16:37 +0200 Subject: vfs: add lookup_hash() helper Overlayfs needs lookup without inode_permission() and already has the name hash (in form of dentry->d_name on overlayfs dentry). It also doesn't support filesystems with d_op->d_hash() so basically it only needs the actual hashed lookup from lookup_one_len_unlocked() So add a new helper that does unlocked lookup of a hashed name. Signed-off-by: Miklos Szeredi --- fs/namei.c | 33 ++++++++++++++++++++++++++++----- include/linux/namei.h | 2 ++ 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 3ef87d673bbe..1a1ea79a7ba0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2266,6 +2266,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); +/** + * lookup_hash - lookup single pathname component on already hashed name + * @name: name and hash to lookup + * @base: base directory to lookup from + * + * The name must have been verified and hashed (see lookup_one_len()). Using + * this after just full_name_hash() is unsafe. + * + * This function also doesn't check for search permission on base directory. + * + * Use lookup_one_len_unlocked() instead, unless you really know what you are + * doing. + * + * Do not hold i_mutex; this helper takes i_mutex if necessary. + */ +struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + + return ret; +} +EXPORT_SYMBOL(lookup_hash); + /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -2337,7 +2364,6 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; - struct dentry *ret; this.name = name; this.len = len; @@ -2369,10 +2395,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); - if (!ret) - ret = lookup_slow(&this, base, 0); - return ret; + return lookup_hash(&this, base); } EXPORT_SYMBOL(lookup_one_len_unlocked); diff --git a/include/linux/namei.h b/include/linux/namei.h index 77d01700daf7..ec5ec2818a28 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,6 +79,8 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); +struct qstr; +extern struct dentry *lookup_hash(const struct qstr *, struct dentry *); extern int follow_down_one(struct path *); extern int follow_down(struct path *); -- cgit v1.2.3-71-gd317