From ffc79664d15841025d90afdd902c4112ffe168d6 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 25 Sep 2013 11:18:14 -0400 Subject: ceph: hung on ceph fscache invalidate in some cases In some cases I'm on my ceph client cluster I'm seeing hunk kernel tasks in the invalidate page code path. This is due to the fact that we don't check if the page is marked as cache before calling fscache_wait_on_page_write(). This is the log from the hang INFO: task XXXXXX:12034 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ... Call Trace: [] schedule+0x29/0x70 [] __fscache_wait_on_page_write+0x6d/0xb0 [fscache] [] ? add_wait_queue+0x60/0x60 [] ceph_invalidate_fscache_page+0x29/0x50 [ceph] [] ceph_invalidatepage+0x70/0x190 [ceph] [] ? delete_from_page_cache+0x5f/0x70 [] truncate_inode_page+0x8b/0x90 [] truncate_inode_pages_range.part.12+0x13d/0x620 [] truncate_inode_pages_range+0x4d/0x60 [] truncate_inode_pages+0x15/0x20 [] evict+0x1a6/0x1b0 [] iput+0x103/0x190 ... Signed-off-by: Milosz Tanski Reviewed-by: Sage Weil --- fs/ceph/cache.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 6bfe65e0b038..360b622b0be0 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -324,6 +324,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) { struct ceph_inode_info *ci = ceph_inode(inode); + if (!PageFsCache(page)) + return; + fscache_wait_on_page_write(ci->fscache, page); fscache_uncache_page(ci->fscache, page); } -- cgit v1.2.3-71-gd317 From 53e879a485f9def0e55c404dbc7187470a01602d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 Sep 2013 09:29:07 +0800 Subject: ceph: remove outdated frag information If directory fragments change, fill_inode() inserts new frags into the fragtree, but it does not remove outdated frags from the fragtree. This patch fixes it. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/inode.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8549a48115f7..1c9fea0c9834 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,6 +577,8 @@ static int fill_inode(struct inode *inode, int issued = 0, implemented; struct timespec mtime, atime, ctime; u32 nsplits; + struct ceph_inode_frag *frag; + struct rb_node *rb_node; struct ceph_buffer *xattr_blob = NULL; int err = 0; int queue_trunc = 0; @@ -751,15 +753,38 @@ no_change: /* FIXME: move me up, if/when version reflects fragtree changes */ nsplits = le32_to_cpu(info->fragtree.nsplits); mutex_lock(&ci->i_fragtree_mutex); + rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); dout(" frag %x split by %d\n", frag->frag, frag->split_by); } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ -- cgit v1.2.3-71-gd317 From 81c6aea5275eae453719d7f3924da07e668265c5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 Sep 2013 09:44:13 +0800 Subject: ceph: handle frag mismatch between readdir request and reply If client has outdated directory fragments information, it may request readdir an non-existent directory fragment. In this case, the MDS finds an approximate directory fragment and sends its contents back to the client. When receiving a reply with fragment that is different than the requested one, the client need to reset the 'readdir offset'. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 11 ++++++++++- fs/ceph/inode.c | 16 ++++++++++++++-- fs/ceph/mds_client.c | 3 +-- 3 files changed, 25 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 868b61d56cac..2a0bcaeb189a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -352,8 +352,18 @@ more: } /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } fi->offset = fi->next_offset; fi->last_readdir = req; + fi->frag = frag; if (req->r_reply_info.dir_end) { kfree(fi->last_name); @@ -363,7 +373,6 @@ more: else fi->next_offset = 0; } else { - rinfo = &req->r_reply_info; err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname_len[rinfo->dir_nr-1]); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1c9fea0c9834..9a8e396aed89 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1275,8 +1275,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); @@ -1340,7 +1352,7 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); /* inode */ if (dn->d_inode) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b7bda5d9611d..f51ab2627b41 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2238,8 +2238,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || - req->r_op == CEPH_MDS_OP_LSSNAP) && - rinfo->dir_nr) + req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } -- cgit v1.2.3-71-gd317 From e34ecee2ae791df674dfb466ce40692ca6218e43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 10 Oct 2013 19:31:47 -0700 Subject: aio: Fix a trinity splat aio kiocb refcounting was broken - it was relying on keeping track of the number of available ring buffer entries, which it needs to do anyways; then at shutdown time it'd wait for completions to be delivered until the # of available ring buffer entries equalled what it was initialized to. Problem with that is that the ring buffer is mapped writable into userspace, so userspace could futz with the head and tail pointers to cause the kernel to see extra completions, and cause free_ioctx() to return while there were still outstanding kiocbs. Which would be bad. Fix is just to directly refcount the kiocbs - which is more straightforward, and with the new percpu refcounting code doesn't cost us any cacheline bouncing which was the whole point of the original scheme. Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't subtracting from aio_nr if ioctx_add_table() failed. Signed-off-by: Kent Overstreet --- fs/aio.c | 129 ++++++++++++++++++++++++--------------------------------------- 1 file changed, 48 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 067e3d340c35..ee77dc13d5b2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -80,6 +80,8 @@ struct kioctx { struct percpu_ref users; atomic_t dead; + struct percpu_ref reqs; + unsigned long user_id; struct __percpu kioctx_cpu *cpu; @@ -107,7 +109,6 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct rcu_head rcu_head; struct work_struct free_work; struct { @@ -412,26 +413,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) return cancel(kiocb); } -static void free_ioctx_rcu(struct rcu_head *head) +static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + struct kioctx *ctx = container_of(work, struct kioctx, free_work); + + pr_debug("freeing %p\n", ctx); + aio_free_ring(ctx); free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_reqs(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct work_struct *work) +static void free_ioctx_users(struct percpu_ref *ref) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - struct aio_ring *ring; + struct kioctx *ctx = container_of(ref, struct kioctx, users); struct kiocb *req; - unsigned cpu, avail; - DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -445,54 +454,8 @@ static void free_ioctx(struct work_struct *work) spin_unlock_irq(&ctx->ctx_lock); - for_each_possible_cpu(cpu) { - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - - atomic_add(kcpu->reqs_available, &ctx->reqs_available); - kcpu->reqs_available = 0; - } - - while (1) { - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - - ring = kmap_atomic(ctx->ring_pages[0]); - avail = (ring->head <= ring->tail) - ? ring->tail - ring->head - : ctx->nr_events - ring->head + ring->tail; - - atomic_add(avail, &ctx->reqs_available); - ring->head = ring->tail; - kunmap_atomic(ring); - - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) - break; - - schedule(); - } - finish_wait(&ctx->wait, &wait); - - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); - - aio_free_ring(ctx); - - pr_debug("freeing %p\n", ctx); - - /* - * Here the call_rcu() is between the wait_event() for reqs_active to - * hit 0, and freeing the ioctx. - * - * aio_complete() decrements reqs_active, but it has to touch the ioctx - * after to issue a wakeup so we use rcu. - */ - call_rcu(&ctx->rcu_head, free_ioctx_rcu); -} - -static void free_ioctx_ref(struct percpu_ref *ref) -{ - struct kioctx *ctx = container_of(ref, struct kioctx, users); - - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + percpu_ref_kill(&ctx->reqs); + percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) @@ -551,6 +514,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } } +static void aio_nr_sub(unsigned nr) +{ + spin_lock(&aio_nr_lock); + if (WARN_ON(aio_nr - nr > aio_nr)) + aio_nr = 0; + else + aio_nr -= nr; + spin_unlock(&aio_nr_lock); +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -588,8 +561,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_ref)) - goto out_freectx; + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); @@ -600,10 +576,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freeref; + goto err; if (aio_setup_ring(ctx) < 0) - goto out_freepcpu; + goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); @@ -615,7 +591,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); - goto out_cleanup; + err = -EAGAIN; + goto err; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); @@ -624,23 +601,19 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err = ioctx_add_table(ctx, mm); if (err) - goto out_cleanup_put; + goto err_cleanup; pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; -out_cleanup_put: - percpu_ref_put(&ctx->users); -out_cleanup: - err = -EAGAIN; +err_cleanup: + aio_nr_sub(ctx->max_reqs); +err: aio_free_ring(ctx); -out_freepcpu: free_percpu(ctx->cpu); -out_freeref: + free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); -out_freectx: - put_aio_ring_file(ctx); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -675,10 +648,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). */ - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); + aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); @@ -810,6 +780,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; return req; out_put: @@ -879,12 +851,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) return; } - /* - * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after incrementing reqs_available. - */ - rcu_read_lock(); - if (iocb->ki_list.next) { unsigned long flags; @@ -959,7 +925,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - rcu_read_unlock(); + percpu_ref_put(&ctx->reqs); } EXPORT_SYMBOL(aio_complete); @@ -1370,6 +1336,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); kiocb_free(req); return ret; } -- cgit v1.2.3-71-gd317 From 81407c84ace88368ff23abb81caaeacf050c8450 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 24 May 2013 09:49:14 -0400 Subject: audit: allow unsetting the loginuid (with priv) If a task has CAP_AUDIT_CONTROL allow that task to unset their loginuid. This would allow a child of that task to set their loginuid without CAP_AUDIT_CONTROL. Thus when launching a new login daemon, a priviledged helper would be able to unset the loginuid and then the daemon, which may be malicious user facing, do not need priv to function correctly. Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- fs/proc/base.c | 14 ++++++++++---- kernel/auditsc.c | 4 +++- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1485e38daaa3..03c8d747be48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } } length = audit_set_loginuid(kloginuid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b55788bf1607..c75d7813aef2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2019,7 +2019,9 @@ int audit_set_loginuid(kuid_t loginuid) if (rc) goto out; - sessionid = atomic_inc_return(&session_id); + /* are we setting or clearing? */ + if (uid_valid(loginuid)) + sessionid = atomic_inc_return(&session_id); task->sessionid = sessionid; task->loginuid = loginuid; -- cgit v1.2.3-71-gd317 From 14e972b4517128ac8e30e3de2ee4fbd995084223 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 May 2013 10:25:58 -0400 Subject: audit: add child record before the create to handle case where create fails Historically, when a syscall that creates a dentry fails, you get an audit record that looks something like this (when trying to create a file named "new" in "/tmp/tmp.SxiLnCcv63"): type=PATH msg=audit(1366128956.279:965): item=0 name="/tmp/tmp.SxiLnCcv63/new" inode=2138308 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023 This record makes no sense since it's associating the inode information for "/tmp/tmp.SxiLnCcv63" with the path "/tmp/tmp.SxiLnCcv63/new". The recent patch I posted to fix the audit_inode call in do_last fixes this, by making it look more like this: type=PATH msg=audit(1366128765.989:13875): item=0 name="/tmp/tmp.DJ1O8V3e4f/" inode=141 dev=fd:02 mode=040700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmp_t:s15:c0.c1023 While this is more correct, if the creation of the file fails, then we have no record of the filename that the user tried to create. This patch adds a call to audit_inode_child to may_create. This creates an AUDIT_TYPE_CHILD_CREATE record that will sit in place until the create succeeds. When and if the create does succeed, then this record will be updated with the correct inode info from the create. This fixes what was broken in commit bfcec708. Commit 79f6530c should also be backported to stable v3.7+. Signed-off-by: Jeff Layton Signed-off-by: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7720fbd5277b..df9946e83db4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2262,6 +2262,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) -- cgit v1.2.3-71-gd317 From 9410d228a4cf434305306746bb799fb7acdd8648 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 30 Oct 2013 18:05:24 -0400 Subject: audit: call audit_bprm() only once to add AUDIT_EXECVE information Move the audit_bprm() call from search_binary_handler() to exec_binprm(). This allows us to get rid of the mm member of struct audit_aux_data_execve since bprm->mm will equal current->mm. This also mitigates the issue that ->argc could be modified by the load_binary() call in search_binary_handler(). audit_bprm() was being called to add an AUDIT_EXECVE record to the audit context every time search_binary_handler() was recursively called. Only one reference is necessary. Reported-by: Oleg Nesterov Cc: Eric Paris Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- This patch is against 3.11, but was developed on Oleg's post-3.11 patches that introduce exec_binprm(). --- fs/exec.c | 5 +---- include/linux/audit.h | 9 +++------ kernel/audit.h | 1 - kernel/auditsc.c | 4 ---- 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..c5c24f2fc44a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1383,10 +1383,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = audit_bprm(bprm); - if (retval) - return retval; - /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); @@ -1408,6 +1404,7 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth = depth; if (retval >= 0) { if (depth == 0) { + audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); } diff --git a/include/linux/audit.h b/include/linux/audit.h index 08b38bf13eb9..a40641954c29 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -238,11 +238,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } -static inline int audit_bprm(struct linux_binprm *bprm) +static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); - return 0; } static inline int audit_socketcall(int nargs, unsigned long *args) { @@ -369,10 +368,8 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } -static inline int audit_bprm(struct linux_binprm *bprm) -{ - return 0; -} +static inline void audit_bprm(struct linux_binprm *bprm) +{ } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; diff --git a/kernel/audit.h b/kernel/audit.h index e7b94ab66c49..b779642b29af 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,7 +199,6 @@ struct audit_context { } mmap; struct { int argc; - struct mm_struct *mm; } execve; }; int fds[2]; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 425a8939be1a..dfc5d6745ee5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1145,9 +1145,6 @@ static void audit_log_execve_info(struct audit_context *context, const char __user *p; char *buf; - if (context->execve.mm != current->mm) - return; /* execve failed, no additional info */ - p = (const char __user *)current->mm->arg_start; audit_log_format(*ab, "argc=%d", context->execve.argc); @@ -2144,7 +2141,6 @@ void __audit_bprm(struct linux_binprm *bprm) context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; - context->execve.mm = bprm->mm; } -- cgit v1.2.3-71-gd317 From 2000f5beabc9c6baf084de5f7879975408e3652c Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 13 Nov 2013 10:09:42 -0800 Subject: eCryptfs: file->private_data is always valid When accessing the lower_file pointer located in private_data of eCryptfs files, there is no need to check to see if the private_data pointer has been initialized to a non-NULL value. The file->private_data and file->private_data->lower_file pointers are always initialized to non-NULL values in ecryptfs_open(). This change quiets a Smatch warning: CHECK /var/scm/kernel/linux/fs/ecryptfs/file.c fs/ecryptfs/file.c:321 ecryptfs_unlocked_ioctl() error: potential NULL dereference 'lower_file'. fs/ecryptfs/file.c:335 ecryptfs_compat_ioctl() error: potential NULL dereference 'lower_file'. Signed-off-by: Tyler Hicks Reported-by: Dan Carpenter Reviewed-by: Geyslan G. Bem Cc: Al Viro --- fs/ecryptfs/file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2229a74aeeed..b1eaa7a1f82c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -313,11 +313,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) static long ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op->unlocked_ioctl) rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); return rc; @@ -327,11 +325,9 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static long ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct file *lower_file = NULL; + struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op && lower_file->f_op->compat_ioctl) rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); return rc; -- cgit v1.2.3-71-gd317 From db51242d89b3059a46a3cf2f3339f8cd975cb954 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2013 21:55:52 -0500 Subject: dump_align(): fix the dumb braino Mea culpa - original variant used 64-by-32-bit division, which got caught very late. Getting rid of that wasn't hard, but I'd managed to botch the calling conventions in process ;-/ Signed-off-by: Al Viro --- fs/coredump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 62406b6959b6..a2856f7bb613 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->written & (align - 1); if (align & (align - 1)) - return -EINVAL; - return mod ? dump_skip(cprm, align - mod) : 0; + return 0; + return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); -- cgit v1.2.3-71-gd317 From 52da40ae67f6192a3bf70a98cb560e1423554953 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2013 21:58:33 -0500 Subject: dump_emit(): use __kernel_write(), not vfs_write() the caller has already done file_start_write()... Signed-off-by: Al Viro --- fs/coredump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index a2856f7bb613..bc3fbcd32558 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) while (nr) { if (dump_interrupted()) return 0; - n = vfs_write(file, addr, nr, &pos); + n = __kernel_write(file, addr, nr, &pos); if (n <= 0) return 0; file->f_pos = pos; -- cgit v1.2.3-71-gd317 From 951b4bd553e35a291e6b5732ab0124619e81da05 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2013 19:53:40 -0400 Subject: gfs2: endianness misannotations Signed-off-by: Al Viro --- fs/gfs2/lock_dlm.c | 8 ++++---- fs/gfs2/quota.c | 23 ++++++++++------------- fs/gfs2/rgrp.c | 4 ++-- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c8423d6de6c3..2a6ba06bee6f 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl) static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); - memcpy(&gen, lvb_bits, sizeof(uint32_t)); + memcpy(&gen, lvb_bits, sizeof(__le32)); *lvb_gen = le32_to_cpu(gen); } static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); gen = cpu_to_le32(lvb_gen); - memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); } static int all_jid_bits_clear(char *lvb) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 453b50eaddec..98236d0df3ca 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct buffer_head *bh; struct page *page; void *kaddr, *ptr; - struct gfs2_quota q, *qp; + struct gfs2_quota q; int err, nbytes; u64 size; @@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; err = -EIO; - qp = &q; - qp->qu_value = be64_to_cpu(qp->qu_value); - qp->qu_value += change; - qp->qu_value = cpu_to_be64(qp->qu_value); - qd->qd_qb.qb_value = qp->qu_value; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_warn = qp->qu_warn; + q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_limit = qp->qu_limit; + q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_value = qp->qu_value; + q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = q.qu_value; } } /* Write the quota into the quota file on disk */ - ptr = qp; + ptr = &q; nbytes = sizeof(struct gfs2_quota); get_a_page: page = find_or_create_page(mapping, index, GFP_NOFS); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 4d83abdd5635..c8d6161bd682 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; } - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd) if (rgd->rd_flags & GFS2_RDF_UPTODATE) return 0; - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) return gfs2_rgrp_bh_get(rgd); rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); -- cgit v1.2.3-71-gd317 From b26d4cd385fc51e8844e2cdf9ba2051f5bba11a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 18:47:37 -0400 Subject: consolidate simple ->d_delete() instances Rename simple_delete_dentry() to always_delete_dentry() and export it. Export simple_dentry_operations, while we are at it, and get rid of their duplicates Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 8 +------- fs/9p/vfs_dentry.c | 19 +------------------ fs/configfs/dir.c | 12 +----------- fs/efivarfs/super.c | 11 +---------- fs/hostfs/hostfs_kern.c | 11 +---------- fs/libfs.c | 12 +++++++----- fs/proc/generic.c | 18 +----------------- fs/proc/namespaces.c | 8 +------- include/linux/fs.h | 2 ++ kernel/cgroup.c | 7 +------ net/sunrpc/rpc_pipe.c | 11 +---------- 11 files changed, 18 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 5a9ff1c3c3e9..cb592773c78b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2166,12 +2166,6 @@ static const struct file_operations pfm_file_ops = { .flush = pfm_flush }; -static int -pfmfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]", @@ -2179,7 +2173,7 @@ static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = pfmfs_dname, }; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f039b104a98e..b03dd23feda8 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -42,23 +42,6 @@ #include "v9fs_vfs.h" #include "fid.h" -/** - * v9fs_dentry_delete - called when dentry refcount equals 0 - * @dentry: dentry in question - * - * By returning 1 here we should remove cacheing of unused - * dentry components. - * - */ - -static int v9fs_dentry_delete(const struct dentry *dentry) -{ - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); - - return 1; -} - /** * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question @@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = v9fs_dentry_delete, + .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 277bd1be21fd..4522e0755773 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -66,19 +66,9 @@ static void configfs_d_iput(struct dentry * dentry, iput(inode); } -/* - * We _must_ delete our dentries on last dput, as the chain-to-parent - * behavior is required to clear the parents of default_groups. - */ -static int configfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - /* simple_delete_dentry() isn't exported */ - .d_delete = configfs_d_delete, + .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a8766b880c07..becc725a1953 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -/* - * Retaining negative dentries for an in-memory filesystem just wastes - * memory and lookup time: arrange for them to be deleted immediately. - */ -static int efivarfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = efivarfs_delete_dentry, + .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 25437280a207..db23ce1bd903 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) -static int hostfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations hostfs_dentry_ops = { - .d_delete = hostfs_d_delete, -}; - /* Changed in hostfs_args before the kernel starts running */ static char *root_ino = ""; static int append = 0; @@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &hostfs_dentry_ops; + sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as by sprintf: avoid that. */ diff --git a/fs/libfs.c b/fs/libfs.c index 5de06947ba5e..a1844244246f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs); * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(const struct dentry *dentry) +int always_delete_dentry(const struct dentry *dentry) { return 1; } +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already @@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry) */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - static const struct dentry_operations simple_dentry_operations = { - .d_delete = simple_delete_dentry, - }; - if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 737e15615b04..cca93b6fb9a9 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -174,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = { .follow_link = proc_follow_link, }; -/* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(const struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_dentry_operations); + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 49a7fff2e83a..9ae46b87470d 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = { .setattr = proc_setattr, }; -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; @@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) const struct dentry_operations ns_dentry_operations = { - .d_delete = ns_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = ns_dname, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index bf5d574ebdf4..121f11f001c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2622,7 +2622,9 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); +extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0839bcd48c8..4c62513fe19f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -895,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1486,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d0d14a04dce1..bf04b30a788a 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -471,15 +471,6 @@ struct rpc_filelist { umode_t mode; }; -static int rpc_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations rpc_dentry_operations = { - .d_delete = rpc_delete_dentry, -}; - static struct inode * rpc_get_inode(struct super_block *sb, umode_t mode) { @@ -1266,7 +1257,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &rpc_dentry_operations; + sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); -- cgit v1.2.3-71-gd317 From 2bc74feba12fbf052ec97aee8624c9f13593a9ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 16:39:14 -0400 Subject: take read_seqbegin_or_lock() and friends to seqlock.h Signed-off-by: Al Viro --- fs/dcache.c | 29 ----------------------------- include/linux/seqlock.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 0a38ef8d7f00..667e23ab0b4c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; -/** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked - * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. - */ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1e8a8b6e837d..cf87a24c0f92 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -354,6 +354,35 @@ static inline void read_sequnlock_excl(seqlock_t *sl) spin_unlock(&sl->lock); } +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block + * @lock: sequence lock + * @seq : sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl(lock); +} + +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + read_sequnlock_excl(lock); +} + static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); -- cgit v1.2.3-71-gd317 From 482db9066199813d6b999b65a3171afdbec040b6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 16:41:01 -0400 Subject: dcache.c: get rid of pointless macros D_HASH{MASK,BITS} are used once each, both in the same function (d_hash()). At this point they are actively misguiding - they imply that values are compiler constants, which is no longer true. Signed-off-by: Al Viro --- fs/dcache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 667e23ab0b4c..7f079d00475a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,8 +96,6 @@ static struct kmem_cache *dentry_cache __read_mostly; * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ -#define D_HASHBITS d_hash_shift -#define D_HASHMASK d_hash_mask static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; @@ -108,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); + hash = hash + (hash >> d_hash_shift); + return dentry_hashtable + (hash & d_hash_mask); } /* Statistics gathering. */ -- cgit v1.2.3-71-gd317 From 31dec1327e377b6d91a8a6c92b5cd8513939a233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Oct 2013 17:04:27 -0400 Subject: fold try_to_ascend() into the sole remaining caller There used to be a bunch of tree-walkers in dcache.c, all alike. try_to_ascend() had been introduced to abstract a piece of logics duplicated in all of them. These days all these tree-walkers are implemented via the same iterator (d_walk()), which is the only remaining caller of try_to_ascend(), so let's fold it back... Signed-off-by: Al Viro --- fs/dcache.c | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 7f079d00475a..4bdb300b16e2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -438,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) { list_del(&dentry->d_u.d_child); /* - * Inform try_to_ascend() that we are no longer attached to the + * Inform d_walk() that we are no longer attached to the * dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; @@ -1038,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb) } EXPORT_SYMBOL(shrink_dcache_sb); -/* - * This tries to ascend one level of parenthood, but - * we can race with renaming, so we need to re-check - * the parenthood after dropping the lock and check - * that the sequence number still matches. - */ -static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) -{ - struct dentry *new = old->d_parent; - - rcu_read_lock(); - spin_unlock(&old->d_lock); - spin_lock(&new->d_lock); - - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (new != old->d_parent || - (old->d_flags & DCACHE_DENTRY_KILLED) || - need_seqretry(&rename_lock, seq)) { - spin_unlock(&new->d_lock); - new = NULL; - } - rcu_read_unlock(); - return new; -} - /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk @@ -1154,9 +1126,24 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, seq); - if (!this_parent) + this_parent = child->d_parent; + + rcu_read_lock(); + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (this_parent != child->d_parent || + (child->d_flags & DCACHE_DENTRY_KILLED) || + need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); goto rename_retry; + } + rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } -- cgit v1.2.3-71-gd317 From 9e3908e342eba6684621e616529669c17e271e7e Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 7 Nov 2013 15:43:28 -0600 Subject: xfs: fix unlock in xfs_bmap_add_attrfork xfs_trans_ijoin() activates the inode in a transaction and also can specify which lock to free when the transaction is committed or canceled. xfs_bmap_add_attrfork call locks and adds the lock to the transaction but also manually removes the lock. Change the routine to not add the lock to the transaction and manually remove lock on completion. While here, clean up the xfs_trans_cancel flags and goto names. Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 1c02da8bb7df..3ef11b22e750 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ + int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork( if (rsvd) tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) - goto error1; + goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork( default: ASSERT(0); error = XFS_ERROR(EINVAL); - goto error1; + goto trans_cancel; } ASSERT(ip->i_afp == NULL); @@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto error2; + goto bmap_cancel; if (!xfs_sb_version_hasattr(&mp->m_sb) || (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { __int64_t sbfields = 0; @@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: xfs_bmap_cancel(&flist); -error1: +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } -- cgit v1.2.3-71-gd317 From 8f80587bacb6eb893df0ee4e35fefa0dfcfdf9f4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 1 Nov 2013 15:27:20 +1100 Subject: xfs: increase inode cluster size for v5 filesystems v5 filesystems use 512 byte inodes as a minimum, so read inodes in clusters that are effectively half the size of a v4 filesystem with 256 byte inodes. For v5 fielsystems, scale the inode cluster size with the size of the inode so that we keep a constant 32 inodes per cluster ratio for all inode IO. This only works if mkfs.xfs sets the inode alignment appropriately for larger inode clusters, so this functionality is made conditional on mkfs doing the right thing. xfs_repair needs to know about the inode alignment changes, too. Wall time: create bulkstat find+stat ls -R unlink v4 237s 161s 173s 201s 299s v5 235s 163s 205s 31s 356s patched 234s 160s 182s 29s 317s System time: create bulkstat find+stat ls -R unlink v4 2601s 2490s 1653s 1656s 2960s v5 2637s 2497s 1681s 20s 3216s patched 2613s 2451s 1658s 20s 3007s So, wall time same or down across the board, system time same or down across the board, and cache hit rates all improve except for the ls -R case which is a pure cold cache directory read workload on v5 filesystems... So, this patch removes most of the performance and CPU usage differential between v4 and v5 filesystems on traversal related workloads. Note: while this patch is currently for v5 filesystems only, there is no reason it can't be ported back to v4 filesystems. This hasn't been done here because bringing the code back to v4 requires forwards and backwards kernel compatibility testing. i.e. to deterine if older kernels(*) do the right thing with larger inode alignments but still only using 8k inode cluster sizes. None of this testing and validation on v4 filesystems has been done, so for the moment larger inode clusters is limited to v5 superblocks. (*) a current default config v4 filesystem should mount just fine on 2.6.23 (when lazy-count support was introduced), and so if we change the alignment emitted by mkfs without a feature bit then we have to make sure it works properly on all kernels since 2.6.23. And if we allow it to be changed when the lazy-count bit is not set, then it's all kernels since v2 logs were introduced that need to be tested for compatibility... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 15 +++++++++++++++ fs/xfs/xfs_mount.h | 2 +- fs/xfs/xfs_trans_resv.c | 3 +-- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da88f167af78..02df7b408a26 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -41,6 +41,7 @@ #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB @@ -718,8 +719,22 @@ xfs_mountfs( * Set the inode cluster size. * This may still be overridden by the file system * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. */ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + xfs_info(mp, "Using inode cluster size of %d bytes", + mp->m_inode_cluster_size); + } /* * Set inode alignment fields diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d8101a10d8e..a466c5e5826e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -112,7 +112,7 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index d53d9f0627a7..2fd59c0dae66 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -385,8 +385,7 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels, 0) + -- cgit v1.2.3-71-gd317 From 2fe8c1c08b3fbd87dd2641c8f032ff6e965d5803 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 1 Nov 2013 15:27:17 +1100 Subject: xfs: open code inc_inode_iversion when logging an inode Michael L Semon reported that generic/069 runtime increased on v5 superblocks by 100% compared to v4 superblocks. his perf-based analysis pointed directly at the timestamp updates being done by the write path in this workload. The append writers are doing 4-byte writes, so there are lots of timestamp updates occurring. The thing is, they aren't being triggered by timestamp changes - they are being triggered by the inode change counter needing to be updated. That is, every write(2) system call needs to bump the inode version count, and it does that through the timestamp update mechanism. Hence for v5 filesystems, test generic/069 is running 3 orders of magnitude more timestmap update transactions on v5 filesystems due to the fact it does a huge number of *4 byte* write(2) calls. This isn't a real world scenario we really need to address - anyone doing such sequential IO should be using fwrite(3), not write(2). i.e. fwrite(3) buffers the writes in userspace to minimise the number of write(2) syscalls, and the problem goes away. However, there is a small change we can make to improve the situation - removing the expensive lock operation on the change counter update. All inode version counter changes in XFS occur under the ip->i_ilock during a transaction, and therefore we don't actually need the spin lock that provides exclusive access to it through inc_inode_iversion(). Hence avoid the lock and just open code the increment ourselves when logging the inode. Reported-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 1bba7f60d94c..50c3f5614288 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,12 +111,14 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. */ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - inode_inc_iversion(VFS_I(ip)); - ip->i_d.di_changecount = VFS_I(ip)->i_version; + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; flags |= XFS_ILOG_CORE; } -- cgit v1.2.3-71-gd317 From 818e5a22e907fbae75e9c1fd78233baec9fa64b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:07:30 -0800 Subject: nfsd: split up nfsd_setattr Split out two helpers to make the code more readable and easier to verify for correctness. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 144 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 94b5f5d2bfed..c3f57cf14a63 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp) } /* - * Set various file attributes. - * N.B. After this call fhp needs an fh_put + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. */ -__be32 -nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - int check_guard, time_t guardtime) +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - struct dentry *dentry; - struct inode *inode; - int accmode = NFSD_MAY_SATTR; - umode_t ftype = 0; - __be32 err; - int host_err; - int size_change = 0; - - if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; - if (iap->ia_valid & ATTR_SIZE) - ftype = S_IFREG; - - /* Get inode */ - err = fh_verify(rqstp, fhp, ftype, accmode); - if (err) - goto out; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - goto out; - /* * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which @@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * convert to "set to now" instead of "set to explicit time" * * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. It is only - * valid if the filesystem does not define it's own i_op->setattr. + * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) @@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~BOTH_TIME_SET; } } - - /* - * The size case is special. - * It changes the file as well as the attributes. - */ - if (iap->ia_valid & ATTR_SIZE) { - if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, - NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserr; - - size_change = 1; - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) { - put_write_access(inode); - goto out_nfserr; - } - } /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { @@ -415,8 +361,86 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } } +} - /* Change the attributes. */ +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + int host_err; + + if (iap->ia_size < inode->i_size) { + __be32 err; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } + + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; + + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; + +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); +} + +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + nfsd_sanitize_attrs(inode, iap); + + /* + * The size case is special, it changes the file in addition to the + * attributes. + */ + if (iap->ia_valid & ATTR_SIZE) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + goto out; + size_change = 1; + } iap->ia_valid |= ATTR_CTIME; -- cgit v1.2.3-71-gd317 From 987da4791052fa298b7cfcde4dea9f6f2bbc786b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Nov 2013 05:07:47 -0800 Subject: nfsd: make sure to balance get/put_write_access Use a straight goto error label style in nfsd_setattr to make sure we always do the put_write_access call after we got it earlier. Note that the we have been failing to do that in the case nfsd_break_lease() returns an error, a bug introduced into 2.6.38 with 6a76bebefe15d9a08864f824d7f8d5beaf37c997 "nfsd4: break lease on nfsd setattr". Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c3f57cf14a63..7eea63cada1d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -444,27 +444,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= ATTR_CTIME; - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime.tv_sec) { - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_nfserr; - fh_lock(fhp); - - host_err = notify_change(dentry, iap, NULL); - err = nfserrno(host_err); - fh_unlock(fhp); + if (check_guard && guardtime != inode->i_ctime.tv_sec) { + err = nfserr_notsync; + goto out_put_write_access; } + + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_put_write_access_nfserror; + + fh_lock(fhp); + host_err = notify_change(dentry, iap, NULL); + fh_unlock(fhp); + +out_put_write_access_nfserror: + err = nfserrno(host_err); +out_put_write_access: if (size_change) put_write_access(inode); if (!err) commit_metadata(fhp); out: return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } #if defined(CONFIG_NFSD_V2_ACL) || \ -- cgit v1.2.3-71-gd317 From 34f2fd8dfe6185b0eaaf7d661281713a6170b077 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 18 Nov 2013 22:11:42 +0900 Subject: bio: fix argument of __bio_add_page() for max_sectors > 0xffff The data type of max_sectors and max_hw_sectors in queue settings are unsigned int. But these values are passed to __bio_add_page() as an argument whose data type is unsigned short. In the worst case such as max_sectors is 0x10000, bio_add_page() can't add a page and IOs can't proceed. Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 2bdb4e25ee77..33d79a4eb92d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, - unsigned short max_sectors) + unsigned int max_sectors) { int retried_segments = 0; struct bio_vec *bvec; -- cgit v1.2.3-71-gd317 From 9bf0c9cd431440a831e60c0a0fd0bc4f0e083e7f Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 16 Nov 2013 18:05:28 -0600 Subject: CIFS: Fix SMB2/SMB3 Copy offload support (refcopy) for large files This third version of the patch, incorparating feedback from David Disseldorp extends the ability of copychunk (refcopy) over smb2/smb3 mounts to handle servers with smaller than usual maximum chunk sizes and also fixes it to handle files bigger than the maximum chunk sizes In the future this can be extended further to handle sending multiple chunk requests in on SMB2 ioctl request which will further improve performance, but even with one 1MB chunk per request the speedup on cp is quite large. Reviewed-by: David Disseldorp Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++-------- fs/cifs/smb2pdu.c | 9 ++++- 2 files changed, 93 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 11dde4b24f8a..a3968eeb6fac 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -532,7 +532,10 @@ smb2_clone_range(const unsigned int xid, int rc; unsigned int ret_data_len; struct copychunk_ioctl *pcchunk; - char *retbuf = NULL; + struct copychunk_ioctl_rsp *retbuf = NULL; + struct cifs_tcon *tcon; + int chunks_copied = 0; + bool chunk_sizes_updated = false; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); @@ -547,27 +550,96 @@ smb2_clone_range(const unsigned int xid, /* Note: request_res_key sets res_key null only if rc !=0 */ if (rc) - return rc; + goto cchunk_out; /* For now array only one chunk long, will make more flexible later */ pcchunk->ChunkCount = __constant_cpu_to_le32(1); pcchunk->Reserved = 0; - pcchunk->SourceOffset = cpu_to_le64(src_off); - pcchunk->TargetOffset = cpu_to_le64(dest_off); - pcchunk->Length = cpu_to_le32(len); pcchunk->Reserved2 = 0; - /* Request that server copy to target from src file identified by key */ - rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink), - trgtfile->fid.persistent_fid, - trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), &retbuf, &ret_data_len); + tcon = tlink_tcon(trgtfile->tlink); - /* BB need to special case rc = EINVAL to alter chunk size */ + while (len > 0) { + pcchunk->SourceOffset = cpu_to_le64(src_off); + pcchunk->TargetOffset = cpu_to_le64(dest_off); + pcchunk->Length = + cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); - cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len); + /* Request server copy to target from src identified by key */ + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, + true /* is_fsctl */, (char *)pcchunk, + sizeof(struct copychunk_ioctl), (char **)&retbuf, + &ret_data_len); + if (rc == 0) { + if (ret_data_len != + sizeof(struct copychunk_ioctl_rsp)) { + cifs_dbg(VFS, "invalid cchunk response size\n"); + rc = -EIO; + goto cchunk_out; + } + if (retbuf->TotalBytesWritten == 0) { + cifs_dbg(FYI, "no bytes copied\n"); + rc = -EIO; + goto cchunk_out; + } + /* + * Check if server claimed to write more than we asked + */ + if (le32_to_cpu(retbuf->TotalBytesWritten) > + le32_to_cpu(pcchunk->Length)) { + cifs_dbg(VFS, "invalid copy chunk response\n"); + rc = -EIO; + goto cchunk_out; + } + if (le32_to_cpu(retbuf->ChunksWritten) != 1) { + cifs_dbg(VFS, "invalid num chunks written\n"); + rc = -EIO; + goto cchunk_out; + } + chunks_copied++; + + src_off += le32_to_cpu(retbuf->TotalBytesWritten); + dest_off += le32_to_cpu(retbuf->TotalBytesWritten); + len -= le32_to_cpu(retbuf->TotalBytesWritten); + + cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + } else if (rc == -EINVAL) { + if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) + goto cchunk_out; + + cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n", + le32_to_cpu(retbuf->ChunksWritten), + le32_to_cpu(retbuf->ChunkBytesWritten), + le32_to_cpu(retbuf->TotalBytesWritten)); + + /* + * Check if this is the first request using these sizes, + * (ie check if copy succeed once with original sizes + * and check if the server gave us different sizes after + * we already updated max sizes on previous request). + * if not then why is the server returning an error now + */ + if ((chunks_copied != 0) || chunk_sizes_updated) + goto cchunk_out; + + /* Check that server is not asking us to grow size */ + if (le32_to_cpu(retbuf->ChunkBytesWritten) < + tcon->max_bytes_chunk) + tcon->max_bytes_chunk = + le32_to_cpu(retbuf->ChunkBytesWritten); + else + goto cchunk_out; /* server gave us bogus size */ + + /* No need to change MaxChunks since already set to 1 */ + chunk_sizes_updated = true; + } + } +cchunk_out: kfree(pcchunk); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d65270c290a1..87f430ecb85f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1214,10 +1214,17 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; - if (rc != 0) { + if ((rc != 0) && (rc != -EINVAL)) { if (tcon) cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; + } else if (rc == -EINVAL) { + if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && + (opcode != FSCTL_SRV_COPYCHUNK)) { + if (tcon) + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } } /* check if caller wants to look at return data or just return rc */ -- cgit v1.2.3-71-gd317 From 7d3fb24bce87a240ee5a5f99cdd72b1f336d5c3b Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 18 Nov 2013 09:56:28 -0600 Subject: Removed duplicated (and unneeded) goto Remove an unneeded goto (and also was duplicated goto target name). Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87f430ecb85f..1e136eee3ea6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2161,11 +2161,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; - if (rc != 0) { + if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); - goto out; - } -out: + free_rsp_buf(resp_buftype, rsp); kfree(iov); return rc; -- cgit v1.2.3-71-gd317 From 694e096fd7c2a7d40760fc9c9dfc826bdd495ea4 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 13 Nov 2013 12:29:08 -0500 Subject: NFS: Enabling v4.2 should not recompile nfsd and lockd When CONFIG_NFS_V4_2 is toggled nfsd and lockd will be recompiled, instead of only the nfs client. This patch moves a small amount of code into the client directory to avoid unnecessary recompiles. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 1 + fs/nfs/dns_resolve.c | 2 ++ fs/nfs/internal.h | 15 +++++++++++++++ fs/nfs/nfs4_fs.h | 8 ++++++++ include/linux/nfs4.h | 10 ---------- include/linux/nfs_fs.h | 18 ------------------ 6 files changed, 26 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 8485978993e8..9838fb020473 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -36,6 +36,7 @@ #include #include +#include "../nfs4_fs.h" #include "../pnfs.h" #include "../netns.h" diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index fc0f95ec7358..d25f10fb4926 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include #include #include +#include +#include "nfs4_fs.h" #include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bca6a3e3c49c..8b5cc04a8611 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -269,6 +269,21 @@ extern const u32 nfs41_maxgetdevinfo_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ + if (label) { + kfree(label->label); + kfree(label); + } + return; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3ce79b04522e..5609edc742a0 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,6 +9,14 @@ #ifndef __LINUX_FS_NFS_NFS4_FS_H #define __LINUX_FS_NFS_NFS4_FS_H +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif + #if IS_ENABLED(CONFIG_NFS_V4) #define NFS4_MAX_LOOP_ON_RECOVER (10) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c6f41b616965..ddd84871c0ad 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -410,16 +410,6 @@ enum lock_type4 { #define NFS4_VERSION 4 #define NFS4_MINOR_VERSION 0 -#if defined(CONFIG_NFS_V4_2) -#define NFS4_MAX_MINOR_VERSION 2 -#else -#if defined(CONFIG_NFS_V4_1) -#define NFS4_MAX_MINOR_VERSION 1 -#else -#define NFS4_MAX_MINOR_VERSION 0 -#endif /* CONFIG_NFS_V4_1 */ -#endif /* CONFIG_NFS_V4_2 */ - #define NFS4_DEBUG 1 /* Index of predefined Linux client operations */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 14a48207a304..48997374eaf0 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -506,24 +506,6 @@ extern const struct inode_operations nfs_referral_inode_operations; extern int nfs_mountpoint_expiry_timeout; extern void nfs_release_automount_timer(void); -/* - * linux/fs/nfs/nfs4proc.c - */ -#ifdef CONFIG_NFS_V4_SECURITY_LABEL -extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); -static inline void nfs4_label_free(struct nfs4_label *label) -{ - if (label) { - kfree(label->label); - kfree(label); - } - return; -} -#else -static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } -static inline void nfs4_label_free(void *label) {} -#endif - /* * linux/fs/nfs/unlink.c */ -- cgit v1.2.3-71-gd317 From 829e57d7c590832caad04355b044667902194f6a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Nov 2013 14:33:50 -0500 Subject: NFS: Fix a warning in nfs_setsecurity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warning: linux-nfs/fs/nfs/inode.c:315:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18ab2da4eeb6..00ad1c2b217d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -312,7 +312,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else -void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { } -- cgit v1.2.3-71-gd317 From d558023207e008a4476a3b7bb8706b2a2bf5d84f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 19 Nov 2013 17:33:02 -0500 Subject: aio: prevent double free in ioctx_alloc ioctx_alloc() calls aio_setup_ring() to allocate a ring. If aio_setup_ring() fails to do so it would call aio_free_ring() before returning, but ioctx_alloc() would call aio_free_ring() again causing a double free of the ring. This is easily reproducible from userspace. Signed-off-by: Sasha Levin Signed-off-by: Benjamin LaHaise --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index ee77dc13d5b2..63135331cec2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -610,7 +610,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); err: - aio_free_ring(ctx); free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); -- cgit v1.2.3-71-gd317 From ddb8c45ba15149ebd41d7586261c05f7ca37f9a1 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 19 Nov 2013 17:33:03 -0500 Subject: aio: nullify aio->ring_pages after freeing it After freeing ring_pages we leave it as is causing a dangling pointer. This has already caused an issue so to help catching any issues in the future NULL it out. Signed-off-by: Sasha Levin Signed-off-by: Benjamin LaHaise --- fs/aio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 63135331cec2..ad460d78d6c5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -196,8 +196,10 @@ static void aio_free_ring(struct kioctx *ctx) put_aio_ring_file(ctx); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); + ctx->ring_pages = NULL; + } } static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3-71-gd317 From 365da4adebb1c012febf81019ad3dc5bb52e2a13 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 19 Nov 2013 17:32:43 -0500 Subject: nfsd4: fix xdr decoding of large non-write compounds This fixes a regression from 247500820ebd02ad87525db5d9b199e5b66f6636 "nfsd4: fix decoding of compounds across page boundaries". The previous code was correct: argp->pagelist is initialized in nfs4svc_deocde_compoundargs to rqstp->rq_arg.pages, and is therefore a pointer to the page *after* the page we are currently decoding. The reason that patch nevertheless fixed a problem with decoding compounds containing write was a bug in the write decoding introduced by 5a80a54d21c96590d013378d8c5f65f879451ab4 "nfsd4: reorganize write decoding", after which write decoding no longer adhered to the rule that argp->pagelist point to the next page. Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 088de1355e93..ee7237f99f54 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -141,8 +141,8 @@ xdr_error: \ static void next_decode_page(struct nfsd4_compoundargs *argp) { - argp->pagelist++; argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; @@ -1229,6 +1229,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) len -= pages * PAGE_SIZE; argp->p = (__be32 *)page_address(argp->pagelist[0]); + argp->pagelist++; argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } argp->p += XDR_QUADLEN(len); -- cgit v1.2.3-71-gd317 From 9508c6b90b3f57ecea4e7a896cf8325400fc0c6e Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 13 Nov 2013 02:56:26 +0000 Subject: Squashfs: Refactor decompressor interface and code The decompressor interface and code was written from the point of view of single-threaded operation. In doing so it mixed a lot of single-threaded implementation specific aspects into the decompressor code and elsewhere which makes it difficult to seamlessly support multiple different decompressor implementations. This patch does the following: 1. It removes compressor_options parsing from the decompressor init() function. This allows the decompressor init() function to be dynamically called to instantiate multiple decompressors, without the compressor options needing to be read and parsed each time. 2. It moves threading and all sleeping operations out of the decompressors. In doing so, it makes the decompressors non-blocking wrappers which only deal with interfacing with the decompressor implementation. 3. It splits decompressor.[ch] into decompressor generic functions in decompressor.[ch], and moves the single threaded decompressor implementation into decompressor_single.c. The result of this patch is Squashfs should now be able to support multiple decompressors by adding new decompressor_xxx.c files with specialised implementations of the functions in decompressor_single.c Signed-off-by: Phillip Lougher Reviewed-by: Minchan Kim --- fs/squashfs/Makefile | 2 +- fs/squashfs/block.c | 11 +++-- fs/squashfs/decompressor.c | 47 ++++++++++++++------- fs/squashfs/decompressor.h | 21 +++------ fs/squashfs/decompressor_single.c | 86 +++++++++++++++++++++++++++++++++++++ fs/squashfs/lzo_wrapper.c | 24 +++-------- fs/squashfs/squashfs.h | 9 +++- fs/squashfs/squashfs_fs_sb.h | 3 +- fs/squashfs/super.c | 10 ++--- fs/squashfs/xz_wrapper.c | 89 +++++++++++++++++++++------------------ fs/squashfs/zlib_wrapper.c | 50 +++++++--------------- 11 files changed, 216 insertions(+), 136 deletions(-) create mode 100644 fs/squashfs/decompressor_single.c (limited to 'fs') diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 110b0476f3b4..c223c8439c21 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o decompressor_single.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 41d108ecc9be..4dd402597f22 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -93,7 +93,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, struct buffer_head **bh; int offset = index & ((1 << msblk->devblksize_log2) - 1); u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, page = 0, avail; + int bytes, compressed, b = 0, k = 0, page = 0, avail, i; bh = kcalloc(((srclength + msblk->devblksize - 1) >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); @@ -158,6 +158,12 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, ll_rw_block(READ, b - 1, bh + 1); } + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + if (compressed) { length = squashfs_decompress(msblk, buffer, bh, b, offset, length, srclength, pages); @@ -172,9 +178,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, for (bytes = length; k < b; k++) { in = min(bytes, msblk->devblksize - offset); bytes -= in; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto block_release; while (in) { if (pg_offset == PAGE_CACHE_SIZE) { page++; diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 3f6271d86abc..234291f79ba5 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -37,29 +37,29 @@ */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { - NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 + NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { - NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 + NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { - NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 + NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { - NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 + NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { - NULL, NULL, NULL, 0, "unknown", 0 + NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { @@ -83,10 +83,10 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) } -void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) +static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *strm, *buffer = NULL; + void *buffer = NULL, *comp_opts; int length = 0; /* @@ -94,23 +94,40 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (buffer == NULL) - return ERR_PTR(-ENOMEM); + if (buffer == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } length = squashfs_read_data(sb, &buffer, sizeof(struct squashfs_super_block), 0, NULL, - PAGE_CACHE_SIZE, 1); + PAGE_CACHE_SIZE, 1); if (length < 0) { - strm = ERR_PTR(length); - goto finished; + comp_opts = ERR_PTR(length); + goto out; } } - strm = msblk->decompressor->init(msblk, buffer, length); + comp_opts = squashfs_comp_opts(msblk, buffer, length); -finished: +out: kfree(buffer); + return comp_opts; +} + + +void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *stream, *comp_opts = get_comp_opts(sb, flags); + + if (IS_ERR(comp_opts)) + return comp_opts; + + stream = squashfs_decompressor_create(msblk, comp_opts); + if (IS_ERR(stream)) + kfree(comp_opts); - return strm; + return stream; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 330073e29029..6cdb20a3878a 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -24,28 +24,21 @@ */ struct squashfs_decompressor { - void *(*init)(struct squashfs_sb_info *, void *, int); + void *(*init)(struct squashfs_sb_info *, void *); + void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); - int (*decompress)(struct squashfs_sb_info *, void **, + int (*decompress)(struct squashfs_sb_info *, void *, void **, struct buffer_head **, int, int, int, int, int); int id; char *name; int supported; }; -static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, - void *s) +static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int length) { - if (msblk->decompressor) - msblk->decompressor->free(s); -} - -static inline int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) -{ - return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, - length, srclength, pages); + return msblk->decompressor->comp_opts ? + msblk->decompressor->comp_opts(msblk, buff, length) : NULL; } #ifdef CONFIG_SQUASHFS_XZ diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c new file mode 100644 index 000000000000..f857cf6f22d4 --- /dev/null +++ b/fs/squashfs/decompressor_single.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements single-threaded decompression in the + * decompressor framework + */ + +struct squashfs_stream { + void *stream; + struct mutex mutex; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + int err = -ENOMEM; + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto out; + + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + + kfree(comp_opts); + mutex_init(&stream->mutex); + return stream; + +out: + kfree(stream); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + + if (stream) { + msblk->decompressor->free(stream->stream); + kfree(stream); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + + mutex_lock(&stream->mutex); + res = msblk->decompressor->decompress(msblk, stream->stream, buffer, + bh, b, offset, length, srclength, pages); + mutex_unlock(&stream->mutex); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return 1; +} diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 00f4dfc5f088..75c3b5779172 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -37,7 +37,7 @@ struct squashfs_lzo { void *output; }; -static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); @@ -74,22 +74,16 @@ static void lzo_free(void *strm) } -static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) { - struct squashfs_lzo *stream = msblk->stream; + struct squashfs_lzo *stream = strm; void *buff = stream->input; int avail, i, bytes = length, res; size_t out_len = srclength; - mutex_lock(&msblk->read_data_mutex); - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - avail = min(bytes, msblk->devblksize - offset); memcpy(buff, bh[i]->b_data + offset, avail); buff += avail; @@ -111,17 +105,9 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, bytes -= avail; } - mutex_unlock(&msblk->read_data_mutex); return res; -block_release: - for (; i < b; i++) - put_bh(bh[i]); - failed: - mutex_unlock(&msblk->read_data_mutex); - - ERROR("lzo decompression failed, data probably corrupt\n"); return -EIO; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index d1266516ed08..2e2751df8452 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int); /* decompressor.c */ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); -extern void *squashfs_decompressor_init(struct super_block *, unsigned short); +extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); + +/* decompressor_xxx.c */ +extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); +extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); +extern int squashfs_decompress(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); +extern int squashfs_max_decompressors(void); /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 52934a22f296..9cdcf4150d59 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,10 +63,9 @@ struct squashfs_sb_info { __le64 *id_table; __le64 *fragment_index; __le64 *xattr_id_table; - struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; - void *stream; + struct squashfs_stream *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60553a9053ca..202df6312d4e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); - mutex_init(&msblk->read_data_mutex); mutex_init(&msblk->meta_index_mutex); /* @@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; /* Allocate read_page block */ - msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + msblk->read_page = squashfs_cache_init("data", + squashfs_max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { ERROR("Failed to allocate read_page block\n"); goto failed_mount; } - msblk->stream = squashfs_decompressor_init(sb, flags); + msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; @@ -336,7 +336,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_free(msblk, msblk->stream); + squashfs_decompressor_destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -383,7 +383,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_free(sbi, sbi->stream); + squashfs_decompressor_destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 1760b7d108f6..5d1d07cca6b4 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -38,38 +38,63 @@ struct squashfs_xz { struct xz_buf buf; }; -struct comp_opts { +struct disk_comp_opts { __le32 dictionary_size; __le32 flags; }; -static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, - int len) +struct comp_opts { + int dict_size; +}; + +static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) { - struct comp_opts *comp_opts = buff; - struct squashfs_xz *stream; - int dict_size = msblk->block_size; - int err, n; + struct disk_comp_opts *comp_opts = buff; + struct comp_opts *opts; + int err = 0, n; + + opts = kmalloc(sizeof(*opts), GFP_KERNEL); + if (opts == NULL) { + err = -ENOMEM; + goto out2; + } if (comp_opts) { /* check compressor options are the expected length */ if (len < sizeof(*comp_opts)) { err = -EIO; - goto failed; + goto out; } - dict_size = le32_to_cpu(comp_opts->dictionary_size); + opts->dict_size = le32_to_cpu(comp_opts->dictionary_size); /* the dictionary size should be 2^n or 2^n+2^(n+1) */ - n = ffs(dict_size) - 1; - if (dict_size != (1 << n) && dict_size != (1 << n) + + n = ffs(opts->dict_size) - 1; + if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) + (1 << (n + 1))) { err = -EIO; - goto failed; + goto out; } - } + } else + /* use defaults */ + opts->dict_size = max_t(int, msblk->block_size, + SQUASHFS_METADATA_SIZE); - dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); + return opts; + +out: + kfree(opts); +out2: + return ERR_PTR(err); +} + + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int err; stream = kmalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) { @@ -77,7 +102,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, goto failed; } - stream->state = xz_dec_init(XZ_PREALLOC, dict_size); + stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size); if (stream->state == NULL) { kfree(stream); err = -ENOMEM; @@ -103,15 +128,13 @@ static void squashfs_xz_free(void *strm) } -static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) { enum xz_ret xz_err; int avail, total = 0, k = 0, page = 0; - struct squashfs_xz *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); stream->buf.in_pos = 0; @@ -124,10 +147,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, if (stream->buf.in_pos == stream->buf.in_size && k < b) { avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->buf.in = bh[k]->b_data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; @@ -147,23 +166,12 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (xz_err == XZ_OK); - if (xz_err != XZ_STREAM_END) { - ERROR("xz_dec_run error, data probably corrupt\n"); - goto release_mutex; - } - - if (k < b) { - ERROR("xz_uncompress error, input remaining\n"); - goto release_mutex; - } - - total += stream->buf.out_pos; - mutex_unlock(&msblk->read_data_mutex); - return total; + if (xz_err != XZ_STREAM_END || k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return total + stream->buf.out_pos; +out: for (; k < b; k++) put_bh(bh[k]); @@ -172,6 +180,7 @@ release_mutex: const struct squashfs_decompressor squashfs_xz_comp_ops = { .init = squashfs_xz_init, + .comp_opts = squashfs_xz_comp_opts, .free = squashfs_xz_free, .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 55d918fd2d86..bb049027d15c 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -33,7 +33,7 @@ #include "squashfs.h" #include "decompressor.h" -static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); if (stream == NULL) @@ -61,15 +61,13 @@ static void zlib_free(void *strm) } -static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) { int zlib_err, zlib_init = 0; int k = 0, page = 0; - z_stream *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + z_stream *stream = strm; stream->avail_out = 0; stream->avail_in = 0; @@ -78,10 +76,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, if (stream->avail_in == 0 && k < b) { int avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->next_in = bh[k]->b_data + offset; stream->avail_in = avail; offset = 0; @@ -94,12 +88,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned unexpected " - "result 0x%x, srclength %d\n", - zlib_err, srclength); - goto release_mutex; - } + if (zlib_err != Z_OK) + goto out; zlib_init = 1; } @@ -109,29 +99,19 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (zlib_err == Z_OK); - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } + if (zlib_err != Z_STREAM_END) + goto out; zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - - if (k < b) { - ERROR("zlib_uncompress error, data remaining\n"); - goto release_mutex; - } + if (zlib_err != Z_OK) + goto out; - length = stream->total_out; - mutex_unlock(&msblk->read_data_mutex); - return length; + if (k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return stream->total_out; +out: for (; k < b; k++) put_bh(bh[k]); -- cgit v1.2.3-71-gd317 From cd59c2ec5f37a2bc1315c9324aab6c21d43ffa1a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 28 Oct 2013 14:26:30 +0900 Subject: squashfs: Enhance parallel I/O Now squashfs have used for only one stream buffer for decompression so it hurts parallel read performance so this patch supports multiple decompressor to enhance performance parallel I/O. Four 1G file dd read on KVM machine which has 2 CPU and 4G memory. dd if=test/test1.dat of=/dev/null & dd if=test/test2.dat of=/dev/null & dd if=test/test3.dat of=/dev/null & dd if=test/test4.dat of=/dev/null & old : 1m39s -> new : 9s * From v1 * Change comp_strm with decomp_strm - Phillip * Change/add comments - Phillip Signed-off-by: Minchan Kim Signed-off-by: Phillip Lougher --- fs/squashfs/Kconfig | 13 +++ fs/squashfs/Makefile | 9 +- fs/squashfs/decompressor_multi.c | 200 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 fs/squashfs/decompressor_multi.c (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index c70111ebefd4..1c6d340fc61f 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -63,6 +63,19 @@ config SQUASHFS_LZO If unsure, say N. +config SQUASHFS_MULTI_DECOMPRESSOR + bool "Use multiple decompressors for handling parallel I/O" + depends on SQUASHFS + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + If you have a parallel I/O workload and your system has enough memory, + using this option may improve overall I/O performance. + + If unsure, say N. + config SQUASHFS_XZ bool "Include support for XZ compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index c223c8439c21..dfebc3b12d61 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,8 +4,15 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o decompressor_single.o +squashfs-y += namei.o super.o symlink.o decompressor.o + squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o + +ifdef CONFIG_SQUASHFS_MULTI_DECOMPRESSOR + squashfs-y += decompressor_multi.o +else + squashfs-y += decompressor_single.o +endif diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c new file mode 100644 index 000000000000..462731db5130 --- /dev/null +++ b/fs/squashfs/decompressor_multi.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2013 + * Minchan Kim + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression in the + * decompressor framework + */ + + +/* + * The reason that multiply two is that a CPU can request new I/O + * while it is waiting previous request. + */ +#define MAX_DECOMPRESSOR (num_online_cpus() * 2) + + +int squashfs_max_decompressors(void) +{ + return MAX_DECOMPRESSOR; +} + + +struct squashfs_stream { + void *comp_opts; + struct list_head strm_list; + struct mutex mutex; + int avail_decomp; + wait_queue_head_t wait; +}; + + +struct decomp_stream { + void *stream; + struct list_head list; +}; + + +static void put_decomp_stream(struct decomp_stream *decomp_strm, + struct squashfs_stream *stream) +{ + mutex_lock(&stream->mutex); + list_add(&decomp_strm->list, &stream->strm_list); + mutex_unlock(&stream->mutex); + wake_up(&stream->wait); +} + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct decomp_stream *decomp_strm = NULL; + int err = -ENOMEM; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (!stream) + goto out; + + stream->comp_opts = comp_opts; + mutex_init(&stream->mutex); + INIT_LIST_HEAD(&stream->strm_list); + init_waitqueue_head(&stream->wait); + + /* + * We should have a decompressor at least as default + * so if we fail to allocate new decompressor dynamically, + * we could always fall back to default decompressor and + * file system works. + */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto out; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + err = PTR_ERR(decomp_strm->stream); + goto out; + } + + list_add(&decomp_strm->list, &stream->strm_list); + stream->avail_decomp = 1; + return stream; + +out: + kfree(decomp_strm); + kfree(stream); + return ERR_PTR(err); +} + + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + if (stream) { + struct decomp_stream *decomp_strm; + + while (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + msblk->decompressor->free(decomp_strm->stream); + kfree(decomp_strm); + stream->avail_decomp--; + } + } + + WARN_ON(stream->avail_decomp); + kfree(stream->comp_opts); + kfree(stream); +} + + +static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, + struct squashfs_stream *stream) +{ + struct decomp_stream *decomp_strm; + + while (1) { + mutex_lock(&stream->mutex); + + /* There is available decomp_stream */ + if (!list_empty(&stream->strm_list)) { + decomp_strm = list_entry(stream->strm_list.prev, + struct decomp_stream, list); + list_del(&decomp_strm->list); + mutex_unlock(&stream->mutex); + break; + } + + /* + * If there is no available decomp and already full, + * let's wait for releasing decomp from other users. + */ + if (stream->avail_decomp >= MAX_DECOMPRESSOR) + goto wait; + + /* Let's allocate new decomp */ + decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL); + if (!decomp_strm) + goto wait; + + decomp_strm->stream = msblk->decompressor->init(msblk, + stream->comp_opts); + if (IS_ERR(decomp_strm->stream)) { + kfree(decomp_strm); + goto wait; + } + + stream->avail_decomp++; + WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + + mutex_unlock(&stream->mutex); + break; +wait: + /* + * If system memory is tough, let's for other's + * releasing instead of hurting VM because it could + * make page cache thrashing. + */ + mutex_unlock(&stream->mutex); + wait_event(stream->wait, + !list_empty(&stream->strm_list)); + } + + return decomp_strm; +} + + +int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); + res = msblk->decompressor->decompress(msblk, decomp_stream->stream, + buffer, bh, b, offset, length, srclength, pages); + put_decomp_stream(decomp_stream, stream); + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + return res; +} -- cgit v1.2.3-71-gd317 From d208383d640727b70cd6689bc17e67e9b5ebf4ff Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 18 Nov 2013 02:31:36 +0000 Subject: Squashfs: add multi-threaded decompression using percpu variable Add a multi-threaded decompression implementation which uses percpu variables. Using percpu variables has advantages and disadvantages over implementations which do not use percpu variables. Advantages: * the nature of percpu variables ensures decompression is load-balanced across the multiple cores. * simplicity. Disadvantages: it limits decompression to one thread per core. Signed-off-by: Phillip Lougher --- fs/squashfs/Kconfig | 57 ++++++++++++++----- fs/squashfs/Makefile | 10 +--- fs/squashfs/decompressor_multi_percpu.c | 98 +++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 20 deletions(-) create mode 100644 fs/squashfs/decompressor_multi_percpu.c (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 1c6d340fc61f..159bd6676dc2 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,6 +25,50 @@ config SQUASHFS If unsure, say N. +choice + prompt "Decompressor parallelisation options" + depends on SQUASHFS + help + Squashfs now supports three parallelisation options for + decompression. Each one exhibits various trade-offs between + decompression performance and CPU and memory usage. + + If in doubt, select "Single threaded compression" + +config SQUASHFS_DECOMP_SINGLE + bool "Single threaded compression" + help + Traditionally Squashfs has used single-threaded decompression. + Only one block (data or metadata) can be decompressed at any + one time. This limits CPU and memory usage to a minimum. + +config SQUASHFS_DECOMP_MULTI + bool "Use multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + If you have a parallel I/O workload and your system has enough memory, + using this option may improve overall I/O performance. + + This decompressor implementation uses up to two parallel + decompressors per core. It dynamically allocates decompressors + on a demand basis. + +config SQUASHFS_DECOMP_MULTI_PERCPU + bool "Use percpu multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor but it gives + poor performance on parallel I/O workloads when using multiple CPU + machines due to waiting on decompressor availability. + + This decompressor implementation uses a maximum of one + decompressor per core. It uses percpu variables to ensure + decompression is load-balanced across the cores. + +endchoice + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS @@ -63,19 +107,6 @@ config SQUASHFS_LZO If unsure, say N. -config SQUASHFS_MULTI_DECOMPRESSOR - bool "Use multiple decompressors for handling parallel I/O" - depends on SQUASHFS - help - By default Squashfs uses a single decompressor but it gives - poor performance on parallel I/O workloads when using multiple CPU - machines due to waiting on decompressor availability. - - If you have a parallel I/O workload and your system has enough memory, - using this option may improve overall I/O performance. - - If unsure, say N. - config SQUASHFS_XZ bool "Include support for XZ compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index dfebc3b12d61..5833b96ee69c 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,14 +5,10 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o - +squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o - -ifdef CONFIG_SQUASHFS_MULTI_DECOMPRESSOR - squashfs-y += decompressor_multi.o -else - squashfs-y += decompressor_single.o -endif diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c new file mode 100644 index 000000000000..0e7b679bc4ad --- /dev/null +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression using percpu + * variables, one thread per cpu core. + */ + +struct squashfs_stream { + void *stream; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu; + int err, cpu; + + percpu = alloc_percpu(struct squashfs_stream); + if (percpu == NULL) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + } + + kfree(comp_opts); + return (__force void *) percpu; + +out: + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + if (!IS_ERR_OR_NULL(stream->stream)) + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream; + int cpu; + + if (msblk->stream) { + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream = get_cpu_ptr(percpu); + int res = msblk->decompressor->decompress(msblk, stream->stream, buffer, + bh, b, offset, length, srclength, pages); + put_cpu_ptr(stream); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return num_possible_cpus(); +} -- cgit v1.2.3-71-gd317 From 846b730e99518a1c9945afcb2afbe4d08a02ed80 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 18 Nov 2013 02:59:12 +0000 Subject: Squashfs: Generalise paging handling in the decompressors Further generalise the decompressors by adding a page handler abstraction. This adds helpers to allow the decompressors to access and process the output buffers in an implementation independant manner. This allows different types of output buffer to be passed to the decompressors, with the implementation specific aspects handled at decompression time, but without the knowledge being held in the decompressor wrapper code. This will allow the decompressors to handle Squashfs cache buffers, and page cache pages. This patch adds the abstraction and an implementation for the caches. Signed-off-by: Phillip Lougher Reviewed-by: Minchan Kim --- fs/squashfs/block.c | 27 ++++++++++-------- fs/squashfs/cache.c | 28 +++++++++++++++---- fs/squashfs/decompressor.c | 14 ++++++++-- fs/squashfs/decompressor.h | 5 ++-- fs/squashfs/decompressor_multi.c | 7 ++--- fs/squashfs/decompressor_multi_percpu.c | 9 +++--- fs/squashfs/decompressor_single.c | 9 +++--- fs/squashfs/lzo_wrapper.c | 27 ++++++++++++------ fs/squashfs/page_actor.h | 49 +++++++++++++++++++++++++++++++++ fs/squashfs/squashfs.h | 8 +++--- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/xz_wrapper.c | 22 +++++++++------ fs/squashfs/zlib_wrapper.c | 24 ++++++++++------ 13 files changed, 163 insertions(+), 67 deletions(-) create mode 100644 fs/squashfs/page_actor.h (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4dd402597f22..0cea9b9236d0 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -36,6 +36,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" /* * Read the metadata block length, this is stored in the first two @@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength, int pages) +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; int offset = index & ((1 << msblk->devblksize_log2) - 1); u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, page = 0, avail, i; + int bytes, compressed, b = 0, k = 0, avail, i; - bh = kcalloc(((srclength + msblk->devblksize - 1) + bh = kcalloc(((output->length + msblk->devblksize - 1) >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); if (bh == NULL) return -ENOMEM; @@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, *next_index = index + length; TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, srclength); + index, compressed ? "" : "un", length, output->length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto read_failure; @@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, TRACE("Block @ 0x%llx, %scompressed size %d\n", index, compressed ? "" : "un", length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto block_release; @@ -165,8 +166,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (compressed) { - length = squashfs_decompress(msblk, buffer, bh, b, offset, - length, srclength, pages); + length = squashfs_decompress(msblk, bh, b, offset, length, + output); if (length < 0) goto read_failure; } else { @@ -174,19 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, * Block is uncompressed. */ int in, pg_offset = 0; + void *data = squashfs_first_page(output); for (bytes = length; k < b; k++) { in = min(bytes, msblk->devblksize - offset); bytes -= in; while (in) { if (pg_offset == PAGE_CACHE_SIZE) { - page++; + data = squashfs_next_page(output); pg_offset = 0; } avail = min_t(int, in, PAGE_CACHE_SIZE - pg_offset); - memcpy(buffer[page] + pg_offset, - bh[k]->b_data + offset, avail); + memcpy(data + pg_offset, bh[k]->b_data + offset, + avail); in -= avail; pg_offset += avail; offset += avail; @@ -194,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; put_bh(bh[k]); } + squashfs_finish_page(output); } kfree(bh); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index af0b73802592..1cb70a0b2168 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -56,6 +56,7 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" +#include "page_actor.h" /* * Look-up block in cache, and increment usage count. If not in cache, read @@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->error = 0; spin_unlock(&cache->lock); - entry->length = squashfs_read_data(sb, entry->data, - block, length, &entry->next_index, - cache->block_size, cache->pages); + entry->length = squashfs_read_data(sb, block, length, + &entry->next_index, entry->actor); spin_lock(&cache->lock); @@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) kfree(cache->entry[i].data[j]); kfree(cache->entry[i].data); } + kfree(cache->entry[i].actor); } kfree(cache->entry); @@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } } + + entry->actor = squashfs_page_actor_init(entry->data, + cache->pages, 0); + if (entry->actor == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } } return cache; @@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; int i, res; void *table, *buffer, **data; + struct squashfs_page_actor *actor; table = buffer = kmalloc(length, GFP_KERNEL); if (table == NULL) @@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) goto failed; } + actor = squashfs_page_actor_init(data, pages, length); + if (actor == NULL) { + res = -ENOMEM; + goto failed2; + } + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; - res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + res = squashfs_read_data(sb, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); kfree(data); + kfree(actor); if (res < 0) goto failed; return table; +failed2: + kfree(data); failed: kfree(table); return ERR_PTR(res); diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 234291f79ba5..ac22fe73b0ad 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -30,6 +30,7 @@ #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" +#include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for @@ -87,6 +88,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *buffer = NULL, *comp_opts; + struct squashfs_page_actor *actor = NULL; int length = 0; /* @@ -99,9 +101,14 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags) goto out; } - length = squashfs_read_data(sb, &buffer, - sizeof(struct squashfs_super_block), 0, NULL, - PAGE_CACHE_SIZE, 1); + actor = squashfs_page_actor_init(&buffer, 1, 0); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { comp_opts = ERR_PTR(length); @@ -112,6 +119,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags) comp_opts = squashfs_comp_opts(msblk, buffer, length); out: + kfree(actor); kfree(buffer); return comp_opts; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 6cdb20a3878a..af0985321808 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -27,8 +27,9 @@ struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); - int (*decompress)(struct squashfs_sb_info *, void *, void **, - struct buffer_head **, int, int, int, int, int); + int (*decompress)(struct squashfs_sb_info *, void *, + struct buffer_head **, int, int, int, + struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index 462731db5130..ae54675a3526 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -183,15 +183,14 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - buffer, bh, b, offset, length, srclength, pages); + bh, b, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 0e7b679bc4ad..23a9c28ad8ea 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -74,15 +74,14 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; struct squashfs_stream *stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, buffer, - bh, b, offset, length, srclength, pages); + int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); put_cpu_ptr(stream); if (res < 0) diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index f857cf6f22d4..a6c75929a00e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -61,16 +61,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, buffer, - bh, b, offset, length, srclength, pages); + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); mutex_unlock(&stream->mutex); if (res < 0) diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 75c3b5779172..244b9fbfff7b 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -31,6 +31,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_lzo { void *input; @@ -75,13 +76,13 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { struct squashfs_lzo *stream = strm; - void *buff = stream->input; + void *buff = stream->input, *data; int avail, i, bytes = length, res; - size_t out_len = srclength; + size_t out_len = output->length; for (i = 0; i < b; i++) { avail = min(bytes, msblk->devblksize - offset); @@ -98,12 +99,20 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, goto failed; res = bytes = (int)out_len; - for (i = 0, buff = stream->output; bytes && i < pages; i++) { - avail = min_t(int, bytes, PAGE_CACHE_SIZE); - memcpy(buffer[i], buff, avail); - buff += avail; - bytes -= avail; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } else { + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } } + squashfs_finish_page(output); return res; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h new file mode 100644 index 000000000000..5b0ba5a7133a --- /dev/null +++ b/fs/squashfs/page_actor.h @@ -0,0 +1,49 @@ +#ifndef PAGE_ACTOR_H +#define PAGE_ACTOR_H +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +struct squashfs_page_actor { + void **page; + int pages; + int length; + int next_page; +}; + +static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + return actor; +} + +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->page[0]; +} + +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->next_page == actor->pages ? NULL : + actor->page[actor->next_page++]; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} +#endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2e2751df8452..6a97e63ca173 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,8 +28,8 @@ #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) /* block.c */ -extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int, int); +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); @@ -53,8 +53,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, void **, - struct buffer_head **, int, int, int, int, int); +extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, + int, int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 9cdcf4150d59..1da565cb50c3 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -50,6 +50,7 @@ struct squashfs_cache_entry { wait_queue_head_t wait_queue; struct squashfs_cache *cache; void **data; + struct squashfs_page_actor *actor; }; struct squashfs_sb_info { diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 5d1d07cca6b4..c609624e4b8a 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -32,6 +32,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_xz { struct xz_dec *state; @@ -129,11 +130,11 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { enum xz_ret xz_err; - int avail, total = 0, k = 0, page = 0; + int avail, total = 0, k = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -141,7 +142,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.in_size = 0; stream->buf.out_pos = 0; stream->buf.out_size = PAGE_CACHE_SIZE; - stream->buf.out = buffer[page++]; + stream->buf.out = squashfs_first_page(output); do { if (stream->buf.in_pos == stream->buf.in_size && k < b) { @@ -153,11 +154,12 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, offset = 0; } - if (stream->buf.out_pos == stream->buf.out_size - && page < pages) { - stream->buf.out = buffer[page++]; - stream->buf.out_pos = 0; - total += PAGE_CACHE_SIZE; + if (stream->buf.out_pos == stream->buf.out_size) { + stream->buf.out = squashfs_next_page(output); + if (stream->buf.out != NULL) { + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } } xz_err = xz_dec_run(stream->state, &stream->buf); @@ -166,6 +168,8 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, put_bh(bh[k++]); } while (xz_err == XZ_OK); + squashfs_finish_page(output); + if (xz_err != XZ_STREAM_END || k < b) goto out; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bb049027d15c..8727caba6882 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -32,6 +32,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { @@ -62,14 +63,14 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - void **buffer, struct buffer_head **bh, int b, int offset, int length, - int srclength, int pages) + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0; - int k = 0, page = 0; + int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; - stream->avail_out = 0; + stream->avail_out = PAGE_CACHE_SIZE; + stream->next_out = squashfs_first_page(output); stream->avail_in = 0; do { @@ -81,15 +82,18 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, offset = 0; } - if (stream->avail_out == 0 && page < pages) { - stream->next_out = buffer[page++]; - stream->avail_out = PAGE_CACHE_SIZE; + if (stream->avail_out == 0) { + stream->next_out = squashfs_next_page(output); + if (stream->next_out != NULL) + stream->avail_out = PAGE_CACHE_SIZE; } if (!zlib_init) { zlib_err = zlib_inflateInit(stream); - if (zlib_err != Z_OK) + if (zlib_err != Z_OK) { + squashfs_finish_page(output); goto out; + } zlib_init = 1; } @@ -99,6 +103,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, put_bh(bh[k++]); } while (zlib_err == Z_OK); + squashfs_finish_page(output); + if (zlib_err != Z_STREAM_END) goto out; -- cgit v1.2.3-71-gd317 From 5f55dbc0c5c466a9cdfa4da7ac1bfe351c7fc52a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 31 Oct 2013 19:24:27 +0000 Subject: Squashfs: Restructure squashfs_readpage() Restructure squashfs_readpage() splitting it into separate functions for datablocks, fragments and sparse blocks. Move the memcpying (from squashfs cache entry) implementation of squashfs_readpage_block into file_cache.c This allows different implementations to be supported. Signed-off-by: Phillip Lougher Reviewed-by: Minchan Kim --- fs/squashfs/Makefile | 2 +- fs/squashfs/file.c | 142 ++++++++++++++++++++++++----------------------- fs/squashfs/file_cache.c | 38 +++++++++++++ fs/squashfs/squashfs.h | 7 +++ 4 files changed, 118 insertions(+), 71 deletions(-) create mode 100644 fs/squashfs/file_cache.c (limited to 'fs') diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 5833b96ee69c..e01ba1126c89 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o file_cache.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ca62c28fe12..e5c9689062ba 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) return le32_to_cpu(size); } - -static int squashfs_readpage(struct file *file, struct page *page) +/* Copy data into page cache */ +void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, + int bytes, int offset) { struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int bytes, i, offset = 0, sparse = 0; - struct squashfs_cache_entry *buffer = NULL; void *pageaddr; - - int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); - int start_index = page->index & ~mask; - int end_index = start_index | mask; - int file_end = i_size_read(inode) >> msblk->block_log; - - TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); - - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - goto out; - - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - /* - * Reading a datablock from disk. Need to read block list - * to get location and block size. - */ - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) { /* hole */ - bytes = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - sparse = 1; - } else { - /* - * Read and decompress datablock. - */ - buffer = squashfs_get_datablock(inode->i_sb, - block, bsize); - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x" - "\n", block, bsize); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = buffer->length; - } - } else { - /* - * Datablock is stored inside a fragment (tail-end packed - * block). - */ - buffer = squashfs_get_fragment(inode->i_sb, - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x\n", - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = i_size_read(inode) & (msblk->block_size - 1); - offset = squashfs_i(inode)->fragment_offset; - } + int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = page->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers @@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page) for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { struct page *push_page; - int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0; TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); @@ -475,11 +413,75 @@ skip_page: if (i != page->index) page_cache_release(push_page); } +} + +/* Read datablock stored packed inside a fragment (tail-end packed block) */ +static int squashfs_readpage_fragment(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + else + squashfs_copy_cache(page, buffer, i_size_read(inode) & + (msblk->block_size - 1), + squashfs_i(inode)->fragment_offset); + + squashfs_cache_put(buffer); + return res; +} - if (!sparse) - squashfs_cache_put(buffer); +static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + squashfs_copy_cache(page, NULL, bytes, 0); return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int file_end = i_size_read(inode) >> msblk->block_log; + int res; + void *pageaddr; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) + res = squashfs_readpage_sparse(page, index, file_end); + else + res = squashfs_readpage_block(page, block, bsize); + } else + res = squashfs_readpage_fragment(page); + + if (!res) + return 0; error_out: SetPageError(page); diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c new file mode 100644 index 000000000000..f2310d2a2019 --- /dev/null +++ b/fs/squashfs/file_cache.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* Read separately compressed datablock and memcopy into page cache */ +int squashfs_readpage_block(struct page *page, u64 block, int bsize) +{ + struct inode *i = page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + else + squashfs_copy_cache(page, buffer, buffer->length, 0); + + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6a97e63ca173..9e1bb79f7e6f 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -66,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); +/* file.c */ +void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, + int); + +/* file_xxx.c */ +extern int squashfs_readpage_block(struct page *, u64, int); + /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, -- cgit v1.2.3-71-gd317 From 0d455c12c6428647547bacccaaced3cae0f35570 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 13 Nov 2013 02:04:19 +0000 Subject: Squashfs: Directly decompress into the page cache for file data This introduces an implementation of squashfs_readpage_block() that directly decompresses into the page cache. This uses the previously added page handler abstraction to push down the necessary kmap_atomic/kunmap_atomic operations on the page cache buffers into the decompressors. This enables direct copying into the page cache without using the slow kmap/kunmap calls. The code detects when multiple threads are racing in squashfs_readpage() to decompress the same block, and avoids this regression by falling back to using an intermediate buffer. This patch enhances the performance of Squashfs significantly when multiple processes are accessing the filesystem simultaneously because it not only reduces memcopying, but it more importantly eliminates the lock contention on the intermediate buffer. Using single-thread decompression. dd if=file1 of=/dev/null bs=4096 & dd if=file2 of=/dev/null bs=4096 & dd if=file3 of=/dev/null bs=4096 & dd if=file4 of=/dev/null bs=4096 Before: 629145600 bytes (629 MB) copied, 45.8046 s, 13.7 MB/s After: 629145600 bytes (629 MB) copied, 9.29414 s, 67.7 MB/s Signed-off-by: Phillip Lougher Reviewed-by: Minchan Kim --- fs/squashfs/Kconfig | 28 ++++++++ fs/squashfs/Makefile | 4 +- fs/squashfs/file_direct.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++ fs/squashfs/page_actor.c | 100 +++++++++++++++++++++++++++ fs/squashfs/page_actor.h | 32 +++++++++ 5 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 fs/squashfs/file_direct.c create mode 100644 fs/squashfs/page_actor.c (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 159bd6676dc2..b6fa8657dcbc 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,6 +25,34 @@ config SQUASHFS If unsure, say N. +choice + prompt "File decompression options" + depends on SQUASHFS + help + Squashfs now supports two options for decompressing file + data. Traditionally Squashfs has decompressed into an + intermediate buffer and then memcopied it into the page cache. + Squashfs now supports the ability to decompress directly into + the page cache. + + If unsure, select "Decompress file data into an intermediate buffer" + +config SQUASHFS_FILE_CACHE + bool "Decompress file data into an intermediate buffer" + help + Decompress file data into an intermediate buffer and then + memcopy it into the page cache. + +config SQUASHFS_FILE_DIRECT + bool "Decompress files directly into the page cache" + help + Directly decompress file data into the page cache. + Doing so can significantly improve performance because + it eliminates a memcpy and it also removes the lock contention + on the single buffer. + +endchoice + choice prompt "Decompressor parallelisation options" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index e01ba1126c89..4132520b4ff2 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,7 +4,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o file_cache.o +squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c new file mode 100644 index 000000000000..2943b2bfae48 --- /dev/null +++ b/fs/squashfs/file_direct.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "page_actor.h" + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page); + +/* Read separately compressed datablock directly into page cache */ +int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) + +{ + struct inode *inode = target_page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + + int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = target_page->index & ~mask; + int end_index = start_index | mask; + int i, n, pages, missing_pages, bytes, res = -ENOMEM; + struct page **page; + struct squashfs_page_actor *actor; + void *pageaddr; + + if (end_index > file_end) + end_index = file_end; + + pages = end_index - start_index + 1; + + page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + if (page == NULL) + return res; + + /* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ + actor = squashfs_page_actor_init_special(page, pages, 0); + if (actor == NULL) + goto out; + + /* Try to grab all the pages covered by the Squashfs block */ + for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { + page[i] = (n == target_page->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, n); + + if (page[i] == NULL) { + missing_pages++; + continue; + } + + if (PageUptodate(page[i])) { + unlock_page(page[i]); + page_cache_release(page[i]); + page[i] = NULL; + missing_pages++; + } + } + + if (missing_pages) { + /* + * Couldn't get one or more pages, this page has either + * been VM reclaimed, but others are still in the page cache + * and uptodate, or we're racing with another thread in + * squashfs_readpage also trying to grab them. Fall back to + * using an intermediate buffer. + */ + res = squashfs_read_cache(target_page, block, bsize, pages, + page); + goto out; + } + + /* Decompress directly into the page cache buffers */ + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + if (res < 0) + goto mark_errored; + + /* Last page may have trailing bytes not filled */ + bytes = res % PAGE_CACHE_SIZE; + if (bytes) { + pageaddr = kmap_atomic(page[pages - 1]); + memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); + kunmap_atomic(pageaddr); + } + + /* Mark pages as uptodate, unlock and release */ + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); + unlock_page(page[i]); + if (page[i] != target_page) + page_cache_release(page[i]); + } + + kfree(actor); + kfree(page); + + return 0; + +mark_errored: + /* Decompression failed, mark pages as errored. Target_page is + * dealt with by the caller + */ + for (i = 0; i < pages; i++) { + if (page[i] == target_page) + continue; + flush_dcache_page(page[i]); + SetPageError(page[i]); + unlock_page(page[i]); + page_cache_release(page[i]); + } + +out: + kfree(actor); + kfree(page); + return res; +} + + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page) +{ + struct inode *i = target_page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int bytes = buffer->length, res = buffer->error, n, offset = 0; + void *pageaddr; + + if (res) { + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + goto out; + } + + for (n = 0; n < pages && bytes > 0; n++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + + if (page[n] == NULL) + continue; + + pageaddr = kmap_atomic(page[n]); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(page[n]); + SetPageUptodate(page[n]); + unlock_page(page[n]); + if (page[n] != target_page) + page_cache_release(page[n]); + } + +out: + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c new file mode 100644 index 000000000000..5a1c11f56441 --- /dev/null +++ b/fs/squashfs/page_actor.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include "page_actor.h" + +/* + * This file contains implementations of page_actor for decompressing into + * an intermediate buffer, and for decompressing directly into the + * page cache. + * + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ + +/* Implementation of page_actor for decompressing into intermediate buffer */ +static void *cache_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->buffer[0]; +} + +static void *cache_next_page(struct squashfs_page_actor *actor) +{ + if (actor->next_page == actor->pages) + return NULL; + + return actor->buffer[actor->next_page++]; +} + +static void cache_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} + +struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->buffer = buffer; + actor->pages = pages; + actor->next_page = 0; + actor->squashfs_first_page = cache_first_page; + actor->squashfs_next_page = cache_next_page; + actor->squashfs_finish_page = cache_finish_page; + return actor; +} + +/* Implementation of page_actor for decompressing directly into page cache. */ +static void *direct_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->pageaddr = kmap_atomic(actor->page[0]); +} + +static void *direct_next_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); + + return actor->pageaddr = actor->next_page == actor->pages ? NULL : + kmap_atomic(actor->page[actor->next_page++]); +} + +static void direct_finish_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); +} + +struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + actor->pageaddr = NULL; + actor->squashfs_first_page = direct_first_page; + actor->squashfs_next_page = direct_next_page; + actor->squashfs_finish_page = direct_finish_page; + return actor; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 5b0ba5a7133a..26dd82008b82 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -8,6 +8,7 @@ * the COPYING file in the top-level directory. */ +#ifndef CONFIG_SQUASHFS_FILE_DIRECT struct squashfs_page_actor { void **page; int pages; @@ -46,4 +47,35 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { /* empty */ } +#else +struct squashfs_page_actor { + union { + void **buffer; + struct page **page; + }; + void *pageaddr; + void *(*squashfs_first_page)(struct squashfs_page_actor *); + void *(*squashfs_next_page)(struct squashfs_page_actor *); + void (*squashfs_finish_page)(struct squashfs_page_actor *); + int pages; + int length; + int next_page; +}; + +extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page + **, int, int); +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_first_page(actor); +} +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_next_page(actor); +} +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + actor->squashfs_finish_page(actor); +} +#endif #endif -- cgit v1.2.3-71-gd317 From ed4f381ec15e5f11724cdbc68cffd2c22d1eaebd Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 10 Nov 2013 00:02:29 +0000 Subject: Squashfs: Check stream is not NULL in decompressor_multi.c Fix static checker complaint that stream is not checked in squashfs_decompressor_destroy(). Reported-by: Dan Carpenter Signed-off-by: Phillip Lougher Reviewed-by: Minchan Kim --- fs/squashfs/decompressor_multi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index ae54675a3526..d6008a636479 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -119,11 +119,10 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) kfree(decomp_strm); stream->avail_decomp--; } + WARN_ON(stream->avail_decomp); + kfree(stream->comp_opts); + kfree(stream); } - - WARN_ON(stream->avail_decomp); - kfree(stream->comp_opts); - kfree(stream); } -- cgit v1.2.3-71-gd317 From ff1c038addc4f205d5f1ede449426c7d316c0eed Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Nov 2013 23:44:46 -0600 Subject: Check SMB3 dialects against downgrade attacks When we are running SMB3 or SMB3.02 connections which are signed we need to validate the protocol negotiation information, to ensure that the negotiate protocol response was not tampered with. Add the missing FSCTL which is sent at mount time (immediately after the SMB3 Tree Connect) to validate that the capabilities match what we think the server sent. "Secure dialect negotiation is introduced in SMB3 to protect against man-in-the-middle attempt to downgrade dialect negotiation. The idea is to prevent an eavesdropper from downgrading the initially negotiated dialect and capabilities between the client and the server." For more explanation see 2.2.31.4 of MS-SMB2 or http://blogs.msdn.com/b/openspecification/archive/2012/06/28/smb3-secure-dialect-negotiation.aspx Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/smb2ops.c | 1 + fs/cifs/smb2pdu.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.h | 12 ++++++--- fs/cifs/smb2proto.h | 1 + fs/cifs/smbfsctl.h | 2 +- 6 files changed, 90 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d9ea7ada1378..f918a998a087 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -384,6 +384,7 @@ struct smb_version_operations { int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); + int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); }; struct smb_version_values { diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a3968eeb6fac..757da3e54d3d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1319,6 +1319,7 @@ struct smb_version_operations smb30_operations = { .create_lease_buf = smb3_create_lease_buf, .parse_lease_buf = smb3_parse_lease_buf, .clone_range = smb2_clone_range, + .validate_negotiate = smb3_validate_negotiate, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1e136eee3ea6..2013234b73ad 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -454,6 +454,81 @@ neg_exit: return rc; } +int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) +{ + int rc = 0; + struct validate_negotiate_info_req vneg_inbuf; + struct validate_negotiate_info_rsp *pneg_rsp; + u32 rsplen; + + cifs_dbg(FYI, "validate negotiate\n"); + + /* + * validation ioctl must be signed, so no point sending this if we + * can not sign it. We could eventually change this to selectively + * sign just this, the first and only signed request on a connection. + * This is good enough for now since a user who wants better security + * would also enable signing on the mount. Having validation of + * negotiate info for signed connections helps reduce attack vectors + */ + if (tcon->ses->server->sign == false) + return 0; /* validation requires signing */ + + vneg_inbuf.Capabilities = + cpu_to_le32(tcon->ses->server->vals->req_capabilities); + memcpy(vneg_inbuf.Guid, cifs_client_guid, SMB2_CLIENT_GUID_SIZE); + + if (tcon->ses->sign) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + vneg_inbuf.SecurityMode = + cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + vneg_inbuf.SecurityMode = 0; + + vneg_inbuf.DialectCount = cpu_to_le16(1); + vneg_inbuf.Dialects[0] = + cpu_to_le16(tcon->ses->server->vals->protocol_id); + + rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, + FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, + (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), + (char **)&pneg_rsp, &rsplen); + + if (rc != 0) { + cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); + return -EIO; + } + + if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); + return -EIO; + } + + /* check validate negotiate info response matches what we got earlier */ + if (pneg_rsp->Dialect != + cpu_to_le16(tcon->ses->server->vals->protocol_id)) + goto vneg_out; + + if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) + goto vneg_out; + + /* do not validate server guid because not saved at negprot time yet */ + + if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND | + SMB2_LARGE_FILES) != tcon->ses->server->capabilities) + goto vneg_out; + + /* validate negotiate successful */ + cifs_dbg(FYI, "validate negotiate info successful\n"); + return 0; + +vneg_out: + cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); + return -EIO; +} + int SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) @@ -829,6 +904,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); init_copy_chunk_defaults(tcon); + if (tcon->ses->server->ops->validate_negotiate) + rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); tcon_exit: free_rsp_buf(resp_buftype, rsp); kfree(unc_path); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f88320bbb477..2022c542ea3a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -577,13 +577,19 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; -/* Response and Request are the same format */ -struct validate_negotiate_info { +struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; __le16 SecurityMode; __le16 DialectCount; - __le16 Dialect[1]; + __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ } __packed; #define RSS_CAPABLE 0x00000001 diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index b4eea105b08c..93adc64666f3 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -162,5 +162,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_lock_element *buf); extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state); +extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index a4b2391fe66e..0e538b5c9622 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -90,7 +90,7 @@ #define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ -#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* Perform server-side data movement */ #define FSCTL_SRV_COPYCHUNK 0x001440F2 #define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 -- cgit v1.2.3-71-gd317 From 4a82fd7c4e78a1b7a224f9ae8bb7e1fd95f670e0 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 15 Nov 2013 16:36:16 -0500 Subject: NFSv4 wait on recovery for async session errors When the state manager is processing the NFS4CLNT_DELEGRETURN flag, session draining is off, but DELEGRETURN can still get a session error. The async handler calls nfs4_schedule_session_recovery returns -EAGAIN, and the DELEGRETURN done then restarts the RPC task in the prepare state. With the state manager still processing the NFS4CLNT_DELEGRETURN flag with session draining off, these DELEGRETURNs will cycle with errors filling up the session slots. This prevents OPEN reclaims (from nfs_delegation_claim_opens) required by the NFS4CLNT_DELEGRETURN state manager processing from completing, hanging the state manager in the __rpc_wait_for_completion_task in nfs4_run_open_task as seen in this kernel thread dump: kernel: 4.12.32.53-ma D 0000000000000000 0 3393 2 0x00000000 kernel: ffff88013995fb60 0000000000000046 ffff880138cc5400 ffff88013a9df140 kernel: ffff8800000265c0 ffffffff8116eef0 ffff88013fc10080 0000000300000001 kernel: ffff88013a4ad058 ffff88013995ffd8 000000000000fbc8 ffff88013a4ad058 kernel: Call Trace: kernel: [] ? cache_alloc_refill+0x1c0/0x240 kernel: [] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc] kernel: [] rpc_wait_bit_killable+0x42/0xa0 [sunrpc] kernel: [] __wait_on_bit+0x5f/0x90 kernel: [] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc] kernel: [] out_of_line_wait_on_bit+0x78/0x90 kernel: [] ? wake_bit_function+0x0/0x50 kernel: [] __rpc_wait_for_completion_task+0x2d/0x30 [sunrpc] kernel: [] nfs4_run_open_task+0x11c/0x160 [nfs] kernel: [] nfs4_open_recover_helper+0x87/0x120 [nfs] kernel: [] nfs4_open_recover+0xc6/0x150 [nfs] kernel: [] ? nfs4_open_recoverdata_alloc+0x2f/0x60 [nfs] kernel: [] nfs4_open_delegation_recall+0x6a/0xa0 [nfs] kernel: [] nfs_end_delegation_return+0x120/0x2e0 [nfs] kernel: [] ? queue_work+0x1f/0x30 kernel: [] nfs_client_return_marked_delegations+0xd7/0x110 [nfs] kernel: [] nfs4_run_state_manager+0x548/0x620 [nfs] kernel: [] ? nfs4_run_state_manager+0x0/0x620 [nfs] kernel: [] kthread+0x96/0xa0 kernel: [] child_rip+0xa/0x20 kernel: [] ? kthread+0x0/0xa0 kernel: [] ? child_rip+0x0/0x20 The state manager can not therefore process the DELEGRETURN session errors. Change the async handler to wait for recovery on session errors. Signed-off-by: Andy Adamson Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5ab33c0792df..1f4edfbb4a70 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4803,7 +4803,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - goto restart_call; + goto wait_on_recovery; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); -- cgit v1.2.3-71-gd317 From c97cf606e43b85a6cf158b810375dd77312024db Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Nov 2013 16:34:14 -0500 Subject: NFSv4: Update list of irrecoverable errors on DELEGRETURN If the DELEGRETURN errors out with something like NFS4ERR_BAD_STATEID then there is no recovery possible. Just quit without returning an error. Also, note that the client must not assume that the NFSv4 lease has been renewed when it sees an error on DELEGRETURN. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1f4edfbb4a70..ca36d0d50b7d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4988,11 +4988,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); switch (task->tk_status) { - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: case 0: renew_lease(data->res.server, data->timestamp); break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + task->tk_status = 0; + break; default: if (nfs4_async_handle_error(task, data->res.server, NULL) == -EAGAIN) { -- cgit v1.2.3-71-gd317 From 69794ad70cd3b600319f175fc0cbe7abd510791f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Nov 2013 12:57:19 -0500 Subject: NFSv4: close needs to handle NFS4ERR_ADMIN_REVOKED Also ensure that we zero out the stateid mode when exiting Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ca36d0d50b7d..f01e2aa53210 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2518,9 +2518,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) calldata->roc_barrier); nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - nfs4_close_clear_stateid_flags(state, - calldata->arg.fmode); break; + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: @@ -2528,9 +2527,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { rpc_restart_call_prepare(task); + goto out_release; + } } + nfs4_close_clear_stateid_flags(state, calldata->arg.fmode); +out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); -- cgit v1.2.3-71-gd317 From 4724b106b9b8e8b802ca6f6d8a2f74feb8a3c375 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Nov 2013 11:11:40 -0500 Subject: Btrfs: don't BUG_ON() if we get an error walking backrefs We can just return false for this so we stop doing the snapshot aware defrag stuff. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da8d2f696ac5..fd67b34e220d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2129,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, old->extent_offset, fs_info, path, record_one_backref, old); - BUG_ON(ret < 0 && ret != -ENOENT); + if (ret < 0 && ret != -ENOENT) + return false; /* no backref to be processed for this extent */ if (!old->count) { -- cgit v1.2.3-71-gd317 From 6cfab851f42edcfa72763b3cb8dc26b8fd74e203 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Nov 2013 16:25:58 -0500 Subject: Btrfs: make sure to copy everything if we rename If we rename a file that is already in the log and we fsync again we will lose the new name. This is because we just log the inode update and not the new ref. To fix this we just need to check if we are logging the new name of the inode and copy all the metadata instead of just updating the inode itself. With this patch my testcase now passes. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 744553c83fe2..b0c7134162fb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3697,7 +3697,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) { + &BTRFS_I(inode)->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; -- cgit v1.2.3-71-gd317 From d006a04816a306995cd9d394079652a7462f81f8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Nov 2013 20:54:09 -0500 Subject: Btrfs: only drop modified extents if we logged the whole inode If we fsync, seek and write, rename and then fsync again we will lose the modified hole extent because the rename will drop all of the modified extents since we didn't do the fast search. We need to only drop the modified extents if we didn't do the fast search and we were logging the entire inode as we don't need them anymore, otherwise this is being premature. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b0c7134162fb..9f7fc51ca334 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3802,7 +3802,7 @@ log_extents: err = ret; goto out_unlock; } - } else { + } else if (inode_only == LOG_INODE_ALL) { struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; -- cgit v1.2.3-71-gd317 From 33ef30add16905f99bbe799ee5ccea15ba497803 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 3 Nov 2013 19:06:38 +0200 Subject: Btrfs: do not inc uncorrectable_errors counter on ro scrubs Currently if we discover an error when scrubbing in ro mode we a) blindly increment the uncorrectable_errors counter, and b) spam the dmesg with the 'unable to fixup (regular) error at ...' message, even though a) we haven't tried to determine if the error is correctable or not, and b) we haven't tried to fixup anything. Fix this. Cc: Stefan Behrens Signed-off-by: Ilya Dryomov Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2544805544f0..561e2f16ba3e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -938,8 +938,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly && !sctx->is_dev_replace) - goto did_not_correct_error; + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; -- cgit v1.2.3-71-gd317 From 908960c6c0fb3b3ce3971dc0ca47b581d256b968 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 3 Nov 2013 19:06:39 +0200 Subject: Btrfs: disable online raid-repair on ro mounts This disables the "if needed, write the good copy back before the read is completed" part of the read sequence for read-only mounts. Cc: Jan Schmidt Signed-off-by: Ilya Dryomov Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 856bc2b2192c..8e457fca0a0b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1980,6 +1980,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); /* we can't repair anything in raid56 yet */ @@ -2036,6 +2037,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, @@ -2057,12 +2061,12 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; int did_repair = 0; int ret; - struct inode *inode = page->mapping->host; private = 0; ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, @@ -2085,6 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page) did_repair = 1; goto out; } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; spin_lock(&BTRFS_I(inode)->io_tree.lock); state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, @@ -2094,7 +2100,6 @@ static int clean_io_failure(u64 start, struct page *page) if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { - fs_info = BTRFS_I(inode)->root->fs_info; num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { -- cgit v1.2.3-71-gd317 From ba69994a40b242bef26ab2683ea84aa29e2e429f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 3 Nov 2013 19:06:40 +0200 Subject: Btrfs: fix __btrfs_start_workers retval __btrfs_start_workers returns 0 in case it raced with btrfs_stop_workers and lost the race. This is wrong because worker in this case is not allowed to start and is in fact destroyed. Return -EINVAL instead. Signed-off-by: Ilya Dryomov Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8aec751fa464..c1e0b0caf9cc 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -495,6 +495,7 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) spin_lock_irq(&workers->lock); if (workers->stopping) { spin_unlock_irq(&workers->lock); + ret = -EINVAL; goto fail_kthread; } list_add_tail(&worker->worker_list, &workers->idle_list); -- cgit v1.2.3-71-gd317 From d52c1bcc647ca41ee563bc38d8231e5f3816112c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 5 Nov 2013 11:45:53 +0800 Subject: Btrfs: avoid heavy operations in btrfs_commit_super The 'git blame' history shows that, the old transaction commit code has to do twice to ensure roots are updated and we have to flush metadata and super block manually, however, right now all of these can be handled well inside the transaction commit code without extra efforts. And the error handling part remains same with the current code, -- 'return to caller once we get error'. This saves us a transaction commit and a flush of super block, which are both heavy operations according to ftrace output analysis. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c4ed0bb3da1..8072cfa8a3b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3517,7 +3517,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int btrfs_commit_super(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); @@ -3531,25 +3530,7 @@ int btrfs_commit_super(struct btrfs_root *root) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - ret = btrfs_write_and_wait_transaction(NULL, root); - if (ret) { - btrfs_error(root->fs_info, ret, - "Failed to sync btree inode to disk."); - return ret; - } - - ret = write_ctree_super(NULL, root, 0); - return ret; + return btrfs_commit_transaction(trans, root); } int close_ctree(struct btrfs_root *root) -- cgit v1.2.3-71-gd317 From b1a06a4b574996692b72b742bf6e6aa0c711a948 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 6 Nov 2013 16:57:55 +0800 Subject: Btrfs: fix lockdep error in async commit Lockdep complains about btrfs's async commit: [ 2372.462171] [ BUG: bad unlock balance detected! ] [ 2372.462191] 3.12.0+ #32 Tainted: G W [ 2372.462209] ------------------------------------- [ 2372.462228] ceph-osd/14048 is trying to release lock (sb_internal) at: [ 2372.462275] [] btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs] [ 2372.462305] but there are no more locks to release! [ 2372.462324] [ 2372.462324] other info that might help us debug this: [ 2372.462349] no locks held by ceph-osd/14048. [ 2372.462367] [ 2372.462367] stack backtrace: [ 2372.462386] CPU: 2 PID: 14048 Comm: ceph-osd Tainted: G W 3.12.0+ #32 [ 2372.462414] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015 11/09/2011 [ 2372.462455] ffffffffa022cb10 ffff88007490fd28 ffffffff816f094a ffff8800378aa320 [ 2372.462491] ffff88007490fd50 ffffffff810adf4c ffff8800378aa320 ffff88009af97650 [ 2372.462526] ffffffffa022cb10 ffff88007490fd88 ffffffff810b01ee ffff8800898c0000 [ 2372.462562] Call Trace: [ 2372.462584] [] ? btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs] [ 2372.462619] [] dump_stack+0x45/0x56 [ 2372.462642] [] print_unlock_imbalance_bug+0xec/0x100 [ 2372.462677] [] ? btrfs_commit_transaction_async+0x1b0/0x2a0 [btrfs] [ 2372.462710] [] lock_release+0x18e/0x210 [ 2372.462742] [] btrfs_commit_transaction_async+0x1d6/0x2a0 [btrfs] [ 2372.462783] [] btrfs_ioctl_start_sync+0x3e/0xc0 [btrfs] [ 2372.462822] [] btrfs_ioctl+0x4c3/0x1f70 [btrfs] [ 2372.462849] [] ? avc_has_perm+0x121/0x1b0 [ 2372.462873] [] ? avc_has_perm+0x24/0x1b0 [ 2372.462897] [] ? sched_clock_cpu+0xa8/0x100 [ 2372.462922] [] do_vfs_ioctl+0x2e5/0x4e0 [ 2372.462946] [] ? file_has_perm+0x86/0xa0 [ 2372.462969] [] SyS_ioctl+0x81/0xa0 [ 2372.462991] [] tracesys+0xdd/0xe2 ==================================================== It's because that we don't do the right thing when checking if it's ok to tell lockdep that we're trying to release the rwsem. If the trans handle's type is TRANS_ATTACH, we won't acquire the freeze rwsem, but as TRANS_ATTACH fits the check (trans < TRANS_JOIN_NOLOCK), we'll release the freeze rwsem, which makes lockdep complains a lot. Reported-by: Ma Jianpeng Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 57c16b46afbd..c6a872a8a468 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1480,7 +1480,7 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. */ - if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_acquire_read( &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 0, 1, _THIS_IP_); @@ -1521,7 +1521,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ - if (trans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_release( &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); -- cgit v1.2.3-71-gd317 From b52abf1e3b65b4fbd42b39412e040b561faec6a6 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 6 Nov 2013 15:12:40 +0000 Subject: Btrfs: don't wait for ordered data outside desired range In btrfs_wait_ordered_range(), if we found an extent to the left of the start of our desired wait range and the last byte of that extent is 1 less than the desired range's start, we would would wait for the IO completion of that extent unnecessarily. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 25a8f3812f14..c37124bccf17 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -803,7 +803,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len < start) { + if (ordered->file_offset + ordered->len <= start) { btrfs_put_ordered_extent(ordered); break; } -- cgit v1.2.3-71-gd317 From 9650e05c071fc92e704d4359d59e3ac3ecae2875 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 12 Nov 2013 19:32:04 +0800 Subject: Btrfs: remove dead codes from ctree.h These two functions are only stated but undefined. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9aeb2759a64..54ab86127f7a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3613,9 +3613,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -int btrfs_csum_truncate(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u64 isize); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ @@ -3744,9 +3741,6 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); -int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, - u64 start, u64 end, int skip_pinned, - int modified); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, -- cgit v1.2.3-71-gd317 From 56d140f5f61d3547dcc194de9cd5193b957b8016 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 13 Nov 2013 17:19:08 +0100 Subject: Btrfs: print bytenr instead of page pointer in check-int The page pointer information was useless. The bytenr is what you want when you search for submitted write bios. Additionally, a new bit in the print mask is added that allows to selectively enable the check-int submit_bio verbose mode. Before, the global verbose mode had to be enabled leading to many million useless lines in the kernel log. And a comment is added that explains that LOG_BUF_SHIFT needs to be set to a really high value. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e0aab4456974..b50764bef141 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,6 +77,15 @@ * the integrity of (super)-block write requests, do not * enable the config option BTRFS_FS_CHECK_INTEGRITY to * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 */ #include @@ -124,6 +133,7 @@ #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 #define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 struct btrfsic_dev_state; struct btrfsic_state; @@ -3015,6 +3025,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (rw & WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; + u64 cur_bytenr; int bio_is_patched; char **mapped_datav; @@ -3033,6 +3044,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) GFP_NOFS); if (!mapped_datav) goto leave; + cur_bytenr = dev_bytenr; for (i = 0; i < bio->bi_vcnt; i++) { BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); @@ -3044,16 +3056,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio) kfree(mapped_datav); goto leave; } - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) printk(KERN_INFO - "#%u: page=%p, len=%u, offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - bio->bi_io_vec[i].bv_len, + "#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); + cur_bytenr += bio->bi_io_vec[i].bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, -- cgit v1.2.3-71-gd317 From 931aa87791af46640a46b11fa503a119e36943ec Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 14 Nov 2013 17:33:21 +0800 Subject: Btrfs: fix list delete warning when removing ordered root from the list Commit b02441999efcc6152b87cd58e7970bb7843f76cf "Btrfs: don't wait for the completion of all the ordered extents" introduced a bug that broke the ordered root list: WARNING: CPU: 1 PID: 7119 at lib/list_debug.c:59 __list_del_entry+0x5a/0x98() It is because we forgot to return the roots in the splice list to the ordered list of the fs. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ordered-data.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c37124bccf17..69582d5b69d1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -638,6 +638,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) WARN_ON(nr < 0); } } + list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); } -- cgit v1.2.3-71-gd317 From 52a157592140ac76d9f15637543aecc2100f95cb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 14 Nov 2013 19:52:44 +0800 Subject: btrfs: fix typo in the log message Signed-off-by: Anand Jain Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 342f9fd411e3..2cfc3dfff64f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -366,7 +366,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + "btrfs: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "" : rcu_str_deref(src_device->name), src_device->devid, -- cgit v1.2.3-71-gd317 From 4cd8587ce8fb79e49d1d6d1fc065f056188fb86a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Nov 2013 22:57:29 -0500 Subject: btrfs: Use trace condition for get_extent tracepoint Doing an if statement to test some condition to know if we should trigger a tracepoint is pointless when tracing is disabled. This just adds overhead and wastes a branch prediction. This is why the TRACE_EVENT_CONDITION() was created. It places the check inside the jump label so that the branch does not happen unless tracing is enabled. That is, instead of doing: if (em) trace_btrfs_get_extent(root, em); Which is basically this: if (em) if (static_key(trace_btrfs_get_extent)) { Using a TRACE_EVENT_CONDITION() we can just do: trace_btrfs_get_extent(root, em); And the condition trace event will do: if (static_key(trace_btrfs_get_extent)) { if (em) { ... The static key is a non conditional jump (or nop) that is faster than having to check if em is NULL or not. Signed-off-by: Steven Rostedt Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- include/trace/events/btrfs.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fd67b34e220d..f1a77449d032 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6187,8 +6187,7 @@ insert: write_unlock(&em_tree->lock); out: - if (em) - trace_btrfs_get_extent(root, em); + trace_btrfs_get_extent(root, em); if (path) btrfs_free_path(path); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f18b3b76e01e..4832d75dcbae 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -162,12 +162,14 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, { EXTENT_FLAG_LOGGING, "LOGGING" }, \ { EXTENT_FLAG_FILLING, "FILLING" }) -TRACE_EVENT(btrfs_get_extent, +TRACE_EVENT_CONDITION(btrfs_get_extent, TP_PROTO(struct btrfs_root *root, struct extent_map *map), TP_ARGS(root, map), + TP_CONDITION(map), + TP_STRUCT__entry( __field( u64, root_objectid ) __field( u64, start ) -- cgit v1.2.3-71-gd317 From 475bf36ffba2ad18631865a2429468dc6c7ea919 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 18 Nov 2013 22:13:18 +0900 Subject: btrfs: fix bio_size_ok() for max_sectors > 0xffff The data type of max_sectors in queue settings is unsigned int. But this value is stored to the local variable whose type is unsigned short in bio_size_ok(). This can cause unexpected result when max_sectors > 0xffff. Cc: Chris Mason Cc: linux-btrfs@vger.kernel.org Signed-off-by: Akinobu Mita Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0db637097862..92303f42baaa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5394,7 +5394,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, { struct bio_vec *prev; struct request_queue *q = bdev_get_queue(bdev); - unsigned short max_sectors = queue_max_sectors(q); + unsigned int max_sectors = queue_max_sectors(q); struct bvec_merge_data bvm = { .bi_bdev = bdev, .bi_sector = sector, -- cgit v1.2.3-71-gd317 From 4204617d142c0887e45fda2562cb5c58097b918e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 20 Nov 2013 14:32:34 +0100 Subject: btrfs: update kconfig help text Reflect the current status. Portions of the text taken from the wiki pages. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/Kconfig | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 398cbd517be2..f62389348662 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -9,12 +9,17 @@ config BTRFS_FS select XOR_BLOCKS help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with a unchanged format will + continue to be mountable and usable by newer kernels. + + For more information, please see the web pages at + http://btrfs.wiki.kernel.org. To compile this file system support as a module, choose M here. The module will be called btrfs. -- cgit v1.2.3-71-gd317 From e3c4269d139db7471ee560fb30948179e180b3d4 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 12 Nov 2013 13:30:05 +0100 Subject: GFS2: fix potential NULL pointer dereference Commit [e66cf1610: GFS2: Use lockref for glocks] replaced call: atomic_read(&gi->gl->gl_ref) == 0 with: __lockref_is_dead(&gl->gl_lockref) therefore changing how gl is accessed, from gi->gl to plan gl. However, gl can be a NULL pointer, and so gi->gl needs to be used instead (which is guaranteed not to be NULL because fo the while loop checking that condition). Signed-off-by: Michal Nazarewicz Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e66a8009aff1..c8420f7e4db6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1899,7 +1899,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gi->nhash = 0; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref)); + } while (gi->sdp != gi->gl->gl_sbd || + __lockref_is_dead(&gi->gl->gl_lockref)); return 0; } -- cgit v1.2.3-71-gd317 From ea0341e071527d5cec350917b01ab901af09d758 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 Nov 2013 18:47:57 +0000 Subject: GFS2: Fix ref count bug relating to atomic_open In the case that atomic_open calls finish_no_open() with the dentry that was supplied to gfs2_atomic_open() an extra reference count is required. This patch fixes that issue preventing a bug trap triggering at umount time. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1615df16cf4e..7119504159f1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1171,8 +1171,11 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (d != NULL) dentry = d; if (dentry->d_inode) { - if (!(*opened & FILE_OPENED)) + if (!(*opened & FILE_OPENED)) { + if (d == NULL) + dget(dentry); return finish_no_open(file, dentry); + } dput(d); return 0; } -- cgit v1.2.3-71-gd317 From 76ae281f6307331aa063288edb6422ae99f435f0 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 21 Nov 2013 14:31:56 -0800 Subject: configfs: fix race between dentry put and lookup A race window in configfs, it starts from one dentry is UNHASHED and end before configfs_d_iput is called. In this window, if a lookup happen, since the original dentry was UNHASHED, so a new dentry will be allocated, and then in configfs_attach_attr(), sd->s_dentry will be updated to the new dentry. Then in configfs_d_iput(), BUG_ON(sd->s_dentry != dentry) will be triggered and system panic. sys_open: sys_close: ... fput dput dentry_kill __d_drop <--- dentry unhashed here, but sd->dentry still point to this dentry. lookup_real configfs_lookup configfs_attach_attr---> update sd->s_dentry to new allocated dentry here. d_kill configfs_d_iput <--- BUG_ON(sd->s_dentry != dentry) triggered here. To fix it, change configfs_d_iput to not update sd->s_dentry if sd->s_count > 2, that means there are another dentry is using the sd beside the one that is going to be put. Use configfs_dirent_lock in configfs_attach_attr to sync with configfs_d_iput. With the following steps, you can reproduce the bug. 1. enable ocfs2, this will mount configfs at /sys/kernel/config and fill configure in it. 2. run the following script. while [ 1 ]; do cat /sys/kernel/config/cluster/$your_cluster_name/idle_timeout_ms > /dev/null; done & while [ 1 ]; do cat /sys/kernel/config/cluster/$your_cluster_name/idle_timeout_ms > /dev/null; done & Signed-off-by: Junxiao Bi Cc: Joel Becker Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/dir.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 4522e0755773..e081acbac2e7 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -56,10 +56,19 @@ static void configfs_d_iput(struct dentry * dentry, struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { - BUG_ON(sd->s_dentry != dentry); /* Coordinate with configfs_readdir */ spin_lock(&configfs_dirent_lock); - sd->s_dentry = NULL; + /* Coordinate with configfs_attach_attr where will increase + * sd->s_count and update sd->s_dentry to new allocated one. + * Only set sd->dentry to null when this dentry is the only + * sd owner. + * If not do so, configfs_d_iput may run just after + * configfs_attach_attr and set sd->s_dentry to null + * even it's still in use. + */ + if (atomic_read(&sd->s_count) <= 2) + sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } @@ -416,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + spin_lock(&configfs_dirent_lock); dentry->d_fsdata = configfs_get(sd); sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, configfs_init_file); if (error) { -- cgit v1.2.3-71-gd317 From 54d71145a4548330313ca664a4a009772fe8b7dd Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 20 Nov 2013 11:56:35 +0200 Subject: sysfs: handle duplicate removal attempts in sysfs_remove_group() Commit bcdde7e221a8 (sysfs: make __sysfs_remove_dir() recursive) changed the behavior so that directory removals will be done recursively. This means that the sysfs group might already be removed if its parent directory has been removed. The current code outputs warnings similar to following log snippet when it detects that there is no group for the given kobject: WARNING: CPU: 0 PID: 4 at fs/sysfs/group.c:214 sysfs_remove_group+0xc6/0xd0() sysfs group ffffffff81c6f1e0 not found for kobject 'host7' Modules linked in: CPU: 0 PID: 4 Comm: kworker/0:0 Not tainted 3.12.0+ #13 Hardware name: /D33217CK, BIOS GKPPT10H.86A.0042.2013.0422.1439 04/22/2013 Workqueue: kacpi_hotplug acpi_hotplug_work_fn 0000000000000009 ffff8801002459b0 ffffffff817daab1 ffff8801002459f8 ffff8801002459e8 ffffffff810436b8 0000000000000000 ffffffff81c6f1e0 ffff88006d440358 ffff88006d440188 ffff88006e8b4c28 ffff880100245a48 Call Trace: [] dump_stack+0x45/0x56 [] warn_slowpath_common+0x78/0xa0 [] warn_slowpath_fmt+0x47/0x50 [] ? sysfs_get_dirent_ns+0x49/0x70 [] sysfs_remove_group+0xc6/0xd0 [] dpm_sysfs_remove+0x3e/0x50 [] device_del+0x40/0x1b0 [] device_unregister+0xd/0x20 [] scsi_remove_host+0xba/0x110 [] ata_host_detach+0xc6/0x100 [] ata_pci_remove_one+0x18/0x20 [] pci_device_remove+0x28/0x60 [] __device_release_driver+0x64/0xd0 [] device_release_driver+0x1e/0x30 [] bus_remove_device+0xf7/0x140 [] device_del+0x121/0x1b0 [] pci_stop_bus_device+0x94/0xa0 [] pci_stop_bus_device+0x3b/0xa0 [] pci_stop_bus_device+0x3b/0xa0 [] pci_stop_and_remove_bus_device+0xd/0x20 [] trim_stale_devices+0x73/0xe0 [] trim_stale_devices+0xbb/0xe0 [] trim_stale_devices+0xbb/0xe0 [] acpiphp_check_bridge+0x7e/0xd0 [] hotplug_event+0xcd/0x160 [] hotplug_event_work+0x25/0x60 [] acpi_hotplug_work_fn+0x17/0x22 [] process_one_work+0x17a/0x430 [] worker_thread+0x119/0x390 [] ? manage_workers.isra.25+0x2a0/0x2a0 [] kthread+0xcd/0xf0 [] ? kthread_create_on_node+0x180/0x180 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x180/0x180 On this particular machine I see ~16 of these message during Thunderbolt hot-unplug. Fix this in similar way that was done for sysfs_remove_one() by checking if the parent directory has already been removed and bailing out early. Signed-off-by: Mika Westerberg Acked-by: Rafael J. Wysocki Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1898a10e38ce..3796afdff40c 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -206,6 +206,15 @@ void sysfs_remove_group(struct kobject *kobj, struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; + /* + * Sysfs directories are now removed recursively by + * sysfs_remove_dir(). This means that the function can be called + * for a group whose sysfs entry is already removed. In that case + * all its groups are guaranteed to be already removed. + */ + if (dir_sd->s_flags & SYSFS_FLAG_REMOVED) + return; + if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { -- cgit v1.2.3-71-gd317 From 027a485d12e089314360d459b8d847104dd28702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 17 Nov 2013 11:17:36 +0900 Subject: sysfs: use a separate locking class for open files depending on mmap The following two commits implemented mmap support in the regular file path and merged bin file support into the regular path. 73d9714627ad ("sysfs: copy bin mmap support from fs/sysfs/bin.c to fs/sysfs/file.c") 3124eb1679b2 ("sysfs: merge regular and bin file handling") After the merge, the following commands trigger a spurious lockdep warning. "test-mmap-read" simply mmaps the file and dumps the content. $ cat /sys/block/sda/trace/act_mask $ test-mmap-read /sys/devices/pci0000\:00/0000\:00\:03.0/resource0 4096 ====================================================== [ INFO: possible circular locking dependency detected ] 3.12.0-work+ #378 Not tainted ------------------------------------------------------- test-mmap-read/567 is trying to acquire lock: (&of->mutex){+.+.+.}, at: [] sysfs_bin_mmap+0x4f/0x120 but task is already holding lock: (&mm->mmap_sem){++++++}, at: [] vm_mmap_pgoff+0x49/0xa0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++++}: ... -> #2 (sr_mutex){+.+.+.}: ... -> #1 (&bdev->bd_mutex){+.+.+.}: ... -> #0 (&of->mutex){+.+.+.}: ... other info that might help us debug this: Chain exists of: &of->mutex --> sr_mutex --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(sr_mutex); lock(&mm->mmap_sem); lock(&of->mutex); *** DEADLOCK *** 1 lock held by test-mmap-read/567: #0: (&mm->mmap_sem){++++++}, at: [] vm_mmap_pgoff+0x49/0xa0 stack backtrace: CPU: 3 PID: 567 Comm: test-mmap-read Not tainted 3.12.0-work+ #378 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ffffffff81ed41a0 ffff880009441bc8 ffffffff81611ad2 ffffffff81eccb80 ffff880009441c08 ffffffff8160f215 ffff880009441c60 ffff880009c75208 0000000000000000 ffff880009c751e0 ffff880009c75208 ffff880009c74ac0 Call Trace: [] dump_stack+0x4e/0x7a [] print_circular_bug+0x2b0/0x2bf [] __lock_acquire+0x1a3a/0x1e60 [] lock_acquire+0x9a/0x1d0 [] mutex_lock_nested+0x67/0x3f0 [] sysfs_bin_mmap+0x4f/0x120 [] mmap_region+0x3b3/0x5b0 [] do_mmap_pgoff+0x34e/0x3d0 [] vm_mmap_pgoff+0x6a/0xa0 [] SyS_mmap_pgoff+0xbe/0x250 [] SyS_mmap+0x22/0x30 [] system_call_fastpath+0x16/0x1b This happens because one file nests sr_mutex, which nests mm->mmap_sem under it, under of->mutex while mmap implementation naturally nests of->mutex under mm->mmap_sem. The warning is false positive as of->mutex is per open-file and the two paths belong to two different files. This warning didn't trigger before regular and bin file supports were merged because only bin file supported mmap and the other side of locking happened only on regular files which used equivalent but separate locking. It'd be best if we give separate locking classes per file but we can't easily do that. Let's differentiate on ->mmap() for now. Later we'll add explicit file operations struct and can add per-ops lockdep key there. Signed-off-by: Tejun Heo Reported-by: Dave Jones Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 79b5da2acbe1..b94f93685093 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -609,7 +609,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; struct sysfs_open_file *of; - bool has_read, has_write; + bool has_read, has_write, has_mmap; int error = -EACCES; /* need attr_sd for attr and ops, its parent for kobj */ @@ -621,6 +621,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) has_read = battr->read || battr->mmap; has_write = battr->write || battr->mmap; + has_mmap = battr->mmap; } else { const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); @@ -632,6 +633,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) has_read = ops->show; has_write = ops->store; + has_mmap = false; } /* check perms and supported operations */ @@ -649,7 +651,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (!of) goto err_out; - mutex_init(&of->mutex); + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + of->sd = attr_sd; of->file = file; -- cgit v1.2.3-71-gd317 From a096b09aeec6ff99edfdfd8cee24d6f25377d585 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 22 Sep 2013 10:15:58 +0800 Subject: ceph: queue cap release in __ceph_remove_cap() call __queue_cap_release() in __ceph_remove_cap(), this avoids acquiring s_cap_lock twice. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 21 +++++++++++---------- fs/ceph/mds_client.c | 6 ++---- fs/ceph/super.h | 8 +------- 3 files changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 13976c33332e..d2d6e40e7345 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -897,7 +897,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; @@ -909,6 +909,10 @@ void __ceph_remove_cap(struct ceph_cap *cap) /* remove from session list */ spin_lock(&session->s_cap_lock); + if (queue_release) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -1023,7 +1027,6 @@ void __queue_cap_release(struct ceph_mds_session *session, struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); BUG_ON(!session->s_num_cap_releases); msg = list_first_entry(&session->s_cap_releases, struct ceph_msg, list_head); @@ -1052,7 +1055,6 @@ void __queue_cap_release(struct ceph_mds_session *session, (int)CEPH_CAPS_PER_RELEASE, (int)msg->front.iov_len); } - spin_unlock(&session->s_cap_lock); } /* @@ -1067,12 +1069,8 @@ void ceph_queue_caps_release(struct inode *inode) p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } } @@ -2791,7 +2789,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, } spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); } /* else, we already released it */ @@ -2931,9 +2929,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); __queue_cap_release(session, vino.ino, cap_id, mseq, seq); + spin_unlock(&session->s_cap_lock); + } goto flush_cap_releases; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f51ab2627b41..8f8f5c043c37 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -986,7 +986,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1231,9 +1231,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } else { /* try to drop referring dentries */ spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6014b0a3c405..ef4ac38bb614 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -741,13 +741,7 @@ extern int ceph_add_cap(struct inode *inode, int fmode, unsigned issued, unsigned wanted, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap_reservation *caps_reservation); -extern void __ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ - spin_lock(&cap->ci->i_ceph_lock); - __ceph_remove_cap(cap); - spin_unlock(&cap->ci->i_ceph_lock); -} +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); -- cgit v1.2.3-71-gd317 From 44c99757fae80e9db058e1f1d7419cf6472e9af1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 22 Sep 2013 10:28:10 +0800 Subject: ceph: set caps count after composing cap reconnect message It's possible that some caps get released while composing the cap reconnect message. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8f8f5c043c37..4a93d69abc8b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -43,6 +43,7 @@ */ struct ceph_reconnect_state { + int nr_caps; struct ceph_pagelist *pagelist; bool flock; }; @@ -2549,6 +2550,8 @@ encode_again: } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } + + recon_state->nr_caps++; out_free: kfree(path); out_dput: @@ -2576,6 +2579,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; + int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; @@ -2611,10 +2615,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, discard_cap_releases(mdsc, session); /* traverse this session's caps */ - err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + s_nr_caps = session->s_nr_caps; + err = ceph_pagelist_encode_32(pagelist, s_nr_caps); if (err) goto fail; + recon_state.nr_caps = 0; recon_state.pagelist = pagelist; recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; err = iterate_session_caps(session, encode_caps_cb, &recon_state); @@ -2643,11 +2649,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - if (pagelist->length) { - /* set up outbound data if we have any */ - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + + /* raced with cap release? */ + if (s_nr_caps != recon_state.nr_caps) { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + *addr = cpu_to_le32(recon_state.nr_caps); + kunmap_atomic(addr); } + + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); -- cgit v1.2.3-71-gd317 From 99a9c273b94a087f8feaec6c5ffbe3205a2dbe51 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 22 Sep 2013 11:08:14 +0800 Subject: ceph: handle race between cap reconnect and cap release When a cap get released while composing the cap reconnect message. We should skip queuing the release message if the cap hasn't been added to the cap reconnect message. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 8 +++++++- fs/ceph/mds_client.c | 21 ++++++++++++++++++--- fs/ceph/mds_client.h | 1 + 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d2d6e40e7345..3c0a4bd74996 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -909,7 +909,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - if (queue_release) + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, cap->mseq, cap->issue_seq); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4a93d69abc8b..6d953ab0ac06 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -444,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_releases_done); @@ -1415,7 +1416,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, unsigned num; dout("discard_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); /* zero out the in-progress message */ msg = list_first_entry(&session->s_cap_releases, @@ -1442,8 +1442,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, msg->front.iov_len = sizeof(*head); list_add(&msg->list_head, &session->s_cap_releases); } - - spin_unlock(&session->s_cap_lock); } /* @@ -2488,6 +2486,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2611,8 +2610,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, session_state_name(session->s_state)); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ discard_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); /* traverse this session's caps */ s_nr_caps = session->s_nr_caps; @@ -2627,6 +2638,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (err < 0) goto fail; + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c2a19fbbe517..4c053d099ae4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -132,6 +132,7 @@ struct ceph_mds_session { struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; + int s_cap_reconnect; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; -- cgit v1.2.3-71-gd317 From eb1b8af33c2e42a9a57fc0a7588f4a7b255d2e79 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 26 Sep 2013 14:25:36 +0800 Subject: ceph: cleanup aborted requests when re-sending requests. Aborted requests usually get cleared when the reply is received. If MDS crashes, no reply will be received. So we need to cleanup aborted requests when re-sending requests. Signed-off-by: Yan, Zheng Reviewed-by: Greg Farnum Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6d953ab0ac06..8ef79266d064 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1872,8 +1872,11 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_err || req->r_got_result) + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); goto out; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { -- cgit v1.2.3-71-gd317 From fc55d2c9448b34218ca58733a6f51fbede09575b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 31 Oct 2013 09:10:47 +0800 Subject: ceph: wake up 'safe' waiters when unregistering request We also need to wake up 'safe' waiters if error occurs or request aborted. Otherwise sync(2)/fsync(2) may hang forever. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8ef79266d064..d90861f45210 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -644,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } @@ -2186,7 +2188,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* -- cgit v1.2.3-71-gd317 From ff638b7df5a9264024a6448bdfde2b2bf5d1994a Mon Sep 17 00:00:00 2001 From: Li Wang Date: Sat, 9 Nov 2013 10:26:06 +0800 Subject: ceph: allocate non-zero page to fscache in readpage() ceph_osdc_readpages() returns number of bytes read, currently, the code only allocate full-zero page into fscache, this patch fixes this. Signed-off-by: Li Wang Reviewed-by: Milosz Tanski Reviewed-by: Sage Weil --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6df8bd481425..1e561c059539 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -216,7 +216,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) } SetPageUptodate(page); - if (err == 0) + if (err >= 0) ceph_readpage_to_fscache(inode, page); out: -- cgit v1.2.3-71-gd317 From 6d565409503f4e1f74ac30de14e8c91a2b826cd8 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 24 Nov 2013 00:40:49 +0000 Subject: Squashfs: fix failure to unlock pages on decompress error Direct decompression into the page cache. If we fall back to using an intermediate buffer (because we cannot grab all the page cache pages) and we get a decompress fail, we forgot to release the pages. Reported-by: Roman Peniaev Signed-off-by: Phillip Lougher --- fs/squashfs/file_direct.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 2943b2bfae48..62a0de6632e1 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -84,6 +84,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) */ res = squashfs_read_cache(target_page, block, bsize, pages, page); + if (res < 0) + goto mark_errored; + goto out; } @@ -119,7 +122,7 @@ mark_errored: * dealt with by the caller */ for (i = 0; i < pages; i++) { - if (page[i] == target_page) + if (page[i] == NULL || page[i] == target_page) continue; flush_dcache_page(page[i]); SetPageError(page[i]); -- cgit v1.2.3-71-gd317 From c170bbb45febc03ac4d34ba2b8bb55e06104b7e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2013 16:32:22 -0700 Subject: block: submit_bio_wait() conversions It was being open coded in a few places. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Joern Engel Cc: Prasad Joshi Cc: Neil Brown Cc: Chris Mason Acked-by: NeilBrown Signed-off-by: Jens Axboe --- block/blk-flush.c | 19 +------------------ drivers/md/md.c | 12 +----------- fs/btrfs/check-integrity.c | 32 +++++++++++++------------------- fs/btrfs/check-integrity.h | 2 ++ fs/btrfs/extent_io.c | 12 +----------- fs/btrfs/scrub.c | 33 ++++----------------------------- fs/hfsplus/wrapper.c | 17 +---------------- fs/logfs/dev_bdev.c | 13 +------------ 8 files changed, 24 insertions(+), 116 deletions(-) (limited to 'fs') diff --git a/block/blk-flush.c b/block/blk-flush.c index 331e627301ea..fb6f3c0ffa49 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_end_io = bio_end_flush; bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(&wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in ->bi_sector, if @@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio->bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a2866c9e..8700de376876 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -776,16 +776,10 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; rw |= REQ_SYNC; @@ -801,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e0aab4456974..f85b1c409003 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -323,7 +323,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -1677,7 +1676,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - DECLARE_COMPLETION_ONSTACK(complete); bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { @@ -1688,8 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_sector = dev_bytenr >> 9; - bio->bi_end_io = btrfsic_complete_bio_end_io; - bio->bi_private = &complete; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1702,12 +1698,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (submit_bio_wait(READ, bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -1730,11 +1721,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, return block_ctx->len; } -static void btrfsic_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -2998,14 +2984,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) return submit_bh(rw, bh); } -void btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(int rw, struct bio *bio) { struct btrfsic_dev_state *dev_state; - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); + if (!btrfsic_is_initialized) return; - } mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before @@ -3097,10 +3081,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio) } leave: mutex_unlock(&btrfsic_mutex); +} +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); submit_bio(rw, bio); } +int btrfsic_submit_bio_wait(int rw, struct bio *bio) +{ + __btrfsic_submit_bio(rw, bio); + return submit_bio_wait(rw, bio); +} + int btrfsic_mount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices, int including_extent_data, u32 print_mask) diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 8b59175cc502..13b8566c97ab 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -22,9 +22,11 @@ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int rw, struct buffer_head *bh); void btrfsic_submit_bio(int rw, struct bio *bio); +int btrfsic_submit_bio_wait(int rw, struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio +#define btrfsic_submit_bio_wait submit_bio_wait #endif int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 856bc2b2192c..014beaa9458c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1952,11 +1952,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, return err; } -static void repair_io_failure_callback(struct bio *bio, int err) -{ - complete(bio->bi_private); -} - /* * this bypasses the standard btrfs submit functions deliberately, as * the standard behavior is to write all copies in a raid setup. here we only @@ -1973,7 +1968,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, { struct bio *bio; struct btrfs_device *dev; - DECLARE_COMPLETION_ONSTACK(compl); u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; @@ -1989,8 +1983,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_private = &compl; - bio->bi_end_io = repair_io_failure_callback; bio->bi_size = 0; map_length = length; @@ -2011,10 +2003,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start - page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? */ bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2544805544f0..3214ebe593bd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write); @@ -1292,7 +1291,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; - DECLARE_COMPLETION_ONSTACK(complete); if (page->dev->bdev == NULL) { page->io_error = 1; @@ -1309,18 +1307,11 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; bio_add_page(bio, page->page, PAGE_SIZE, 0); - btrfsic_submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(READ, bio)) sblock->no_io_error_seen = 0; + bio_put(bio); } @@ -1389,11 +1380,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, sblock->checksum_error = 1; } -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int force_write) @@ -1428,7 +1414,6 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, sblock_bad->checksum_error || page_bad->io_error) { struct bio *bio; int ret; - DECLARE_COMPLETION_ONSTACK(complete); if (!page_bad->dev->bdev) { printk_ratelimited(KERN_WARNING @@ -1441,19 +1426,14 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } - btrfsic_submit_bio(WRITE, bio); - /* this will also unplug the queue */ - wait_for_completion(&complete); - if (!bio_flagged(bio, BIO_UPTODATE)) { + if (btrfsic_submit_bio_wait(WRITE, bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -3373,7 +3353,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, struct bio *bio; struct btrfs_device *dev; int ret; - DECLARE_COMPLETION_ONSTACK(compl); dev = sctx->wr_ctx.tgtdev; if (!dev) @@ -3390,8 +3369,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, spin_unlock(&sctx->stat_lock); return -ENOMEM; } - bio->bi_private = &compl; - bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_size = 0; bio->bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; @@ -3402,10 +3379,8 @@ leave_with_eio: btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b51a6079108d..e9a97a0d4314 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,13 +24,6 @@ struct hfsplus_wd { u16 embed_count; }; -static void hfsplus_end_io_sync(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - complete(bio->bi_private); -} - /* * hfsplus_submit_bio - Perfrom block I/O * @sb: super block of volume for I/O @@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, int rw) { - DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; u64 io_size; @@ -73,8 +65,6 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; bio->bi_bdev = sb->s_bdev; - bio->bi_end_io = hfsplus_end_io_sync; - bio->bi_private = &wait; if (!(rw & WRITE) && data) *data = (u8 *)buf + offset; @@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - submit_bio(rw, bio); - wait_for_completion(&wait); - - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - + ret = submit_bio_wait(rw, bio); out: bio_put(bio); return ret < 0 ? ret : 0; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 550475ca6a0e..0f95f0d0b313 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,16 +14,10 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static void request_complete(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - static int sync_request(struct page *page, struct block_device *bdev, int rw) { struct bio bio; struct bio_vec bio_vec; - struct completion complete; bio_init(&bio); bio.bi_max_vecs = 1; @@ -35,13 +29,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); - init_completion(&complete); - bio.bi_private = &complete; - bio.bi_end_io = request_complete; - submit_bio(rw, &bio); - wait_for_completion(&complete); - return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; + return submit_bio_wait(rw, &bio); } static int bdev_readpage(void *_sb, struct page *page) -- cgit v1.2.3-71-gd317 From f19e84df37bda502a2248d507a9cf2b9e693279e Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 24 Nov 2013 21:53:17 -0600 Subject: [CIFS] Do not use btrfs refcopy ioctl for SMB2 copy offload Change cifs.ko to using CIFS_IOCTL_COPYCHUNK instead of BTRFS_IOC_CLONE to avoid confusion about whether copy-on-write is required or optional for this operation. SMB2/SMB3 copyoffload had used the BTRFS_IOC_CLONE ioctl since they both speed up copy by offloading the copy rather than passing many read and write requests back and forth and both have identical syntax (passing file handles), but for SMB2/SMB3 CopyChunk the server is not required to use copy-on-write to make a copy of the file (although some do), and Christoph has commented that since CopyChunk does not require copy-on-write we should not reuse BTRFS_IOC_CLONE. This patch renames the ioctl to use a cifs specific IOCTL CIFS_IOCTL_COPYCHUNK. This ioctl is particularly important for SMB2/SMB3 since large file copy over the network otherwise can be very slow, and with this is often more than 100 times faster putting less load on server and client. Note that if a copy syscall is ever introduced, depending on its requirements/format it could end up using one of the other three methods that CIFS/SMB2/SMB3 can do for copy offload, but this method is particularly useful for file copy and broadly supported (not just by Samba server). Signed-off-by: Steve French Reviewed-by: Jeff Layton Reviewed-by: David Disseldorp --- fs/cifs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 409b45eefe70..77492301cc2b 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -26,13 +26,15 @@ #include #include #include -#include #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#define CIFS_IOCTL_MAGIC 0xCF +#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) + static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, unsigned long srcfd, u64 off, u64 len, u64 destoff) { @@ -213,7 +215,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) cifs_dbg(FYI, "set compress flag rc %d\n", rc); } break; - case BTRFS_IOC_CLONE: + case CIFS_IOC_COPYCHUNK_FILE: rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); break; default: -- cgit v1.2.3-71-gd317 From 81440e73744446adbe5b5d1f19460203d275028f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Nov 2013 09:44:55 -0800 Subject: Revert "sysfs: handle duplicate removal attempts in sysfs_remove_group()" This reverts commit 54d71145a4548330313ca664a4a009772fe8b7dd. The root cause of these "inverted" sysfs removals have now been found, so there is no need for this patch. Keep this functionality around so that this type of error doesn't show up in driver code again. Cc: Mika Westerberg Cc: Rafael J. Wysocki Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 3796afdff40c..1898a10e38ce 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -206,15 +206,6 @@ void sysfs_remove_group(struct kobject *kobj, struct sysfs_dirent *dir_sd = kobj->sd; struct sysfs_dirent *sd; - /* - * Sysfs directories are now removed recursively by - * sysfs_remove_dir(). This means that the function can be called - * for a group whose sysfs entry is already removed. In that case - * all its groups are guaranteed to be already removed. - */ - if (dir_sd->s_flags & SYSFS_FLAG_REMOVED) - return; - if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { -- cgit v1.2.3-71-gd317 From dad337501d490b26fbf8d83baee99c788461c61c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 27 Nov 2013 15:15:57 -0500 Subject: remove obsolete references to powertweak This tool hasn't been maintained in over a decade, and is pretty much useless these days. Let's pretend it never happened. Also remove a long-dead email address. Signed-off-by: Dave Jones Signed-off-by: Linus Torvalds --- Documentation/Changes | 11 ----------- drivers/pci/quirks.c | 4 ---- fs/affs/Changes | 2 +- 3 files changed, 1 insertion(+), 16 deletions(-) (limited to 'fs') diff --git a/Documentation/Changes b/Documentation/Changes index b17580885273..07c75d18154e 100644 --- a/Documentation/Changes +++ b/Documentation/Changes @@ -196,13 +196,6 @@ chmod 0644 /dev/cpu/microcode as root before you can use this. You'll probably also want to get the user-space microcode_ctl utility to use with this. -Powertweak ----------- - -If you are running v0.1.17 or earlier, you should upgrade to -version v0.99.0 or higher. Running old versions may cause problems -with programs using shared memory. - udev ---- udev is a userspace application for populating /dev dynamically with @@ -366,10 +359,6 @@ Intel P6 microcode ------------------ o -Powertweak ----------- -o - udev ---- o diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b3b1b9aa8863..3a02717473ad 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -9,10 +9,6 @@ * * Init/reset quirks for USB host controllers should be in the * USB quirks file, where their drivers can access reuse it. - * - * The bridge optimization stuff has been removed. If you really - * have a silly BIOS which is unable to set your host bridge right, - * use the PowerTweak utility (see http://powertweak.sourceforge.net). */ #include diff --git a/fs/affs/Changes b/fs/affs/Changes index a29409c1ffe0..b41c2c9792ff 100644 --- a/fs/affs/Changes +++ b/fs/affs/Changes @@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel] Version 3.11 ------------ -- Converted to use 2.3.x page cache [Dave Jones ] +- Converted to use 2.3.x page cache [Dave Jones] - Corruption in truncate() bugfix [Ken Tyler ] Version 3.10 -- cgit v1.2.3-71-gd317 From d870b4a191a389c661cd40aacb06981c26b5e504 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Nov 2013 01:48:32 -0500 Subject: fix bogus path_put() of nd->root after some unlazy_walk() failures Failure to grab reference to parent dentry should go through the same cleanup as nd->seq mismatch. As it is, we might end up with caller thinking it needs to path_put() nd->root, with obvious nasty results once we'd hit that bug enough times to drive the refcount of root dentry all the way to zero... Signed-off-by: Al Viro --- fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 8f77a8cea289..c53d3a9547f9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -513,8 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) if (!lockref_get_not_dead(&parent->d_lockref)) { nd->path.dentry = NULL; - rcu_read_unlock(); - return -ECHILD; + goto out; } /* -- cgit v1.2.3-71-gd317 From b0d8d2292160bb63de1972361ebed100c64b5b37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 2 Dec 2013 09:44:51 -0800 Subject: vfs: fix subtle use-after-free of pipe_inode_info The pipe code was trying (and failing) to be very careful about freeing the pipe info only after the last access, with a pattern like: spin_lock(&inode->i_lock); if (!--pipe->files) { inode->i_pipe = NULL; kill = 1; } spin_unlock(&inode->i_lock); __pipe_unlock(pipe); if (kill) free_pipe_info(pipe); where the final freeing is done last. HOWEVER. The above is actually broken, because while the freeing is done at the end, if we have two racing processes releasing the pipe inode info, the one that *doesn't* free it will decrement the ->files count, and unlock the inode i_lock, but then still use the "pipe_inode_info" afterwards when it does the "__pipe_unlock(pipe)". This is *very* hard to trigger in practice, since the race window is very small, and adding debug options seems to just hide it by slowing things down. Simon originally reported this way back in July as an Oops in kmem_cache_allocate due to a single bit corruption (due to the final "spin_unlock(pipe->mutex.wait_lock)" incrementing a field in a different allocation that had re-used the free'd pipe-info), it's taken this long to figure out. Since the 'pipe->files' accesses aren't even protected by the pipe lock (we very much use the inode lock for that), the simple solution is to just drop the pipe lock early. And since there were two users of this pattern, create a helper function for it. Introduced commit ba5bb147330a ("pipe: take allocation and freeing of pipe_inode_info out of ->i_mutex"). Reported-by: Simon Kirby Reported-by: Ian Applegate Acked-by: Al Viro Cc: stable@kernel.org # v3.10+ Signed-off-by: Linus Torvalds --- fs/pipe.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index d2c45e14e6d8..0e0752ef2715 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -726,11 +726,25 @@ pipe_poll(struct file *filp, poll_table *wait) return mask; } +static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) +{ + int kill = 0; + + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + + if (kill) + free_pipe_info(pipe); +} + static int pipe_release(struct inode *inode, struct file *file) { - struct pipe_inode_info *pipe = inode->i_pipe; - int kill = 0; + struct pipe_inode_info *pipe = file->private_data; __pipe_lock(pipe); if (file->f_mode & FMODE_READ) @@ -743,17 +757,9 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - spin_lock(&inode->i_lock); - if (!--pipe->files) { - inode->i_pipe = NULL; - kill = 1; - } - spin_unlock(&inode->i_lock); __pipe_unlock(pipe); - if (kill) - free_pipe_info(pipe); - + put_pipe_info(inode, pipe); return 0; } @@ -1014,7 +1020,6 @@ static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; - int kill = 0; int ret; filp->f_version = 0; @@ -1130,15 +1135,9 @@ err_wr: goto err; err: - spin_lock(&inode->i_lock); - if (!--pipe->files) { - inode->i_pipe = NULL; - kill = 1; - } - spin_unlock(&inode->i_lock); __pipe_unlock(pipe); - if (kill) - free_pipe_info(pipe); + + put_pipe_info(inode, pipe); return ret; } -- cgit v1.2.3-71-gd317 From 95f19f658ce177387345b51bd48fed7b9cd7d4e8 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Fri, 15 Nov 2013 12:56:31 +0530 Subject: epoll: drop EPOLLWAKEUP if PM_SLEEP is disabled Drop EPOLLWAKEUP from epoll events mask if CONFIG_PM_SLEEP is disabled. Signed-off-by: Amit Pundir Cc: John Stultz Cc: Alexander Viro Signed-off-by: Rafael J. Wysocki --- fs/eventpoll.c | 3 +-- include/uapi/linux/eventpoll.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 79b65c3b9e87..8b5e2584c840 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1852,8 +1852,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) - epds.events &= ~EPOLLWAKEUP; + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 2c267bcbb85c..bc81fb2e1f0e 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -61,5 +61,16 @@ struct epoll_event { __u64 data; } EPOLL_PACKED; - +#ifdef CONFIG_PM_SLEEP +static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) +{ + if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) + epev->events &= ~EPOLLWAKEUP; +} +#else +static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) +{ + epev->events &= ~EPOLLWAKEUP; +} +#endif #endif /* _UAPI_LINUX_EVENTPOLL_H */ -- cgit v1.2.3-71-gd317 From f22e5edd2244609aed3906207a62223e7707a34d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Dec 2013 12:09:45 -0500 Subject: NFSv4.1: Prevent a 3-way deadlock between layoutreturn, open and state recovery Andy Adamson reports: The state manager is recovering expired state and recovery OPENs are being processed. If kswapd is pruning inodes at the same time, a deadlock can occur when kswapd calls evict_inode on an NFSv4.1 inode with a layout, and the resultant layoutreturn gets an error that the state mangager is to handle, causing the layoutreturn to wait on the (NFS client) cl_rpcwaitq. At the same time an open is waiting for the inode deletion to complete in __wait_on_freeing_inode. If the open is either the open called by the state manager, or an open from the same open owner that is holding the NFSv4 sequence id which causes the OPEN from the state manager to wait for the sequence id on the Seqid_waitqueue, then the state is deadlocked with kswapd. The fix is simply to have layoutreturn ignore all errors except NFS4ERR_DELAY. We already know that layouts are dropped on all server reboots, and that it has to be coded to deal with the "forgetful client model" that doesn't send layoutreturns. Reported-by: Andy Adamson Link: http://lkml.kernel.org/r/1385402270-14284-1-git-send-email-andros@netapp.com Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f01e2aa53210..e040359983ce 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7599,7 +7599,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) return; server = NFS_SERVER(lrp->args.inode); - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + switch (task->tk_status) { + default: + task->tk_status = 0; + case 0: + break; + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + break; rpc_restart_call_prepare(task); return; } -- cgit v1.2.3-71-gd317 From 3873d064b8538686bbbd4b858dc8a07db1f7f43a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 2 Dec 2013 19:59:31 +0100 Subject: nfs: fix do_div() warning by instead using sector_div() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling a 32bit kernel with CONFIG_LBDAF=n the compiler complains like shown below. Fix this warning by instead using sector_div() which is provided by the kernel.h header file. fs/nfs/blocklayout/extents.c: In function ‘normalize’: include/asm-generic/div64.h:43:28: warning: comparison of distinct pointer types lacks a cast [enabled by default] fs/nfs/blocklayout/extents.c:47:13: note: in expansion of macro ‘do_div’ nfs/blocklayout/extents.c:47:2: warning: right shift count >= width of type [enabled by default] fs/nfs/blocklayout/extents.c:47:2: warning: passing argument 1 of ‘__div64_32’ from incompatible pointer type [enabled by default] include/asm-generic/div64.h:35:17: note: expected ‘uint64_t *’ but argument is of type ‘sector_t *’ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); Signed-off-by: Helge Deller Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 9c3e117c3ed1..4d0161442565 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -44,7 +44,7 @@ static inline sector_t normalize(sector_t s, int base) { sector_t tmp = s; /* Since do_div modifies its argument */ - return s - do_div(tmp, base); + return s - sector_div(tmp, base); } static inline sector_t normalize_up(sector_t s, int base) -- cgit v1.2.3-71-gd317 From d1b9432712a25eeb06114fb4b587133525a47de5 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 4 Dec 2013 18:19:06 +0800 Subject: aio: clean up aio ring in the fail path Clean up the aio ring file in the fail path of aio_setup_ring and ioctx_alloc. And maybe it can fix the GPF issue reported by Dave Jones: https://lkml.org/lkml/2013/11/25/898 Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index ad460d78d6c5..a2f92aa23ee3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -316,8 +316,10 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!ctx->ring_pages) + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); return -ENOMEM; + } } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -594,7 +596,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; - goto err; + goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); @@ -611,6 +613,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); +err_ctx: + aio_free_ring(ctx); err: free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); -- cgit v1.2.3-71-gd317 From 31978b5cc66b8ba8a7e8eef60b12395d41b7b890 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 31 Oct 2013 21:00:10 +0300 Subject: xfs: underflow bug in xfs_attrlist_by_handle() If we allocate less than sizeof(struct attrlist) then we end up corrupting memory or doing a ZERO_PTR_SIZE dereference. This can only be triggered with CAP_SYS_ADMIN. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit 071c529eb672648ee8ca3f90944bcbcc730b4c06) --- fs/xfs/xfs_ioctl.c | 3 ++- fs/xfs/xfs_ioctl32.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4d613401a5e0..33ad9a77791f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -442,7 +442,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e8fb1231db81..a7992f8de9d3 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* -- cgit v1.2.3-71-gd317 From 2f42d612e7d4c4fb1819ea7b2b6e18938714ae7a Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 20 Nov 2013 16:08:53 +0800 Subject: xfs: don't perform discard if the given range length is less than block size For discard operation, we should return EINVAL if the given range length is less than a block size, otherwise it will go through the file system to discard data blocks as the end range might be evaluated to -1, e.g, # fstrim -v -o 0 -l 100 /xfs7 /xfs7: 9811378176 bytes were trimmed This issue can be triggered via xfstests/generic/288. Also, it seems to get the request queue pointer via bdev_get_queue() instead of the hard code pointer dereference is not a bad thing. Signed-off-by: Jie Liu Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers (cherry picked from commit f9fd0135610084abef6867d984e9951c3099950d) --- fs/xfs/xfs_discard.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8367d6dc18c9..4f11ef011139 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); -- cgit v1.2.3-71-gd317 From f94c44573e7c22860e2c3dfe349c45f72ba35ad3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Nov 2013 15:41:06 +1100 Subject: xfs: growfs overruns AGFL buffer on V4 filesystems This loop in xfs_growfs_data_private() is incorrect for V4 superblocks filesystems: for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); For V4 filesystems, we don't have a agfl header structure, and so XFS_AGFL_SIZE() returns an entire sector's worth of entries, which we then index from an offset into the sector. Hence: buffer overrun. This problem was introduced in 3.10 by commit 77c95bba ("xfs: add CRC checks to the AGFL") which changed the AGFL structure but failed to update the growfs code to handle the different structures. Fix it by using the correct offset into the buffer for both V4 and V5 filesystems. Cc: Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Ben Myers (cherry picked from commit b7d961b35b3ab69609aeea93f870269cb6e7ba4d) --- fs/xfs/xfs_fsops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a6e54b3319bd..02fb943cbf22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -220,6 +220,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -279,8 +281,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); -- cgit v1.2.3-71-gd317 From 781c2a5a5f75eacc04663aced0f0f1a648d4f308 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 2 Dec 2013 15:26:19 -0500 Subject: nfsd: when reusing an existing repcache entry, unhash it first The DRC code will attempt to reuse an existing, expired cache entry in preference to allocating a new one. It'll then search the cache, and if it gets a hit it'll then free the cache entry that it was going to reuse. The cache code doesn't unhash the entry that it's going to reuse however, so it's possible for it end up designating an entry for reuse and then subsequently freeing the same entry after it finds it. This leads it to a later use-after-free situation and usually some list corruption warnings or an oops. Fix this by simply unhashing the entry that we intend to reuse. That will mean that it's not findable via a search and should prevent this situation from occurring. Cc: stable@vger.kernel.org # v3.10+ Reported-by: Christoph Hellwig Reported-by: g. artim Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9186c7ce0b14..b6af150c96b8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -131,6 +131,13 @@ nfsd_reply_cache_alloc(void) return rp; } +static void +nfsd_reply_cache_unhash(struct svc_cacherep *rp) +{ + hlist_del_init(&rp->c_hash); + list_del_init(&rp->c_lru); +} + static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { @@ -417,7 +424,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); if (nfsd_cache_entry_expired(rp) || num_drc_entries >= max_drc_entries) { - lru_put_end(rp); + nfsd_reply_cache_unhash(rp); prune_cache_entries(); goto search_cache; } -- cgit v1.2.3-71-gd317 From a7e252af5a819eb71c9494e62b2bfca982e92f84 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 22 Nov 2013 18:47:59 +0800 Subject: Btrfs: don't clear the default compression type We met a oops caused by the wrong compression type: [ 556.512356] BUG: unable to handle kernel NULL pointer dereference at (null) [ 556.512370] IP: [] __list_del_entry+0x1/0x98 [SNIP] [ 556.512490] [] ? list_del+0xd/0x2b [ 556.512539] [] find_workspace+0x97/0x175 [btrfs] [ 556.512546] [] ? _raw_spin_lock+0xe/0x10 [ 556.512576] [] btrfs_compress_pages+0x2d/0xa2 [btrfs] [ 556.512601] [] compress_file_range.constprop.54+0x1f2/0x4e8 [btrfs] [ 556.512627] [] async_cow_start+0x32/0x4d [btrfs] [ 556.512655] [] worker_loop+0x144/0x4c3 [btrfs] [ 556.512661] [] ? finish_task_switch+0x80/0xb8 [ 556.512689] [] ? btrfs_queue_worker+0x244/0x244 [btrfs] [ 556.512695] [] kthread+0x8d/0x95 [ 556.512699] [] ? bit_waitqueue+0x34/0x7d [ 556.512704] [] ? __kthread_parkme+0x65/0x65 [ 556.512709] [] ret_from_fork+0x7c/0xb0 [ 556.512713] [] ? __kthread_parkme+0x65/0x65 Steps to reproduce: # mkfs.btrfs -f # mount -o nodatacow # touch / # chattr =c / # dd if=/dev/zero of=/ bs=1M count=10 It is because we cleared the default compression type when setting the nodatacow. In fact, we needn't do it because we have used COMPRESS flag to indicate if we need compressed the file data or not, needn't use the variant -- compress_type -- in btrfs_info to do the same thing, and just use it to hold the default compression type. Or we would get a wrong compress type for a file whose own compress flag is set but the compress flag of its filesystem is not set. Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2d8ac1bf0cf9..d71a11d13dfa 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -432,7 +432,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else { printk(KERN_INFO "btrfs: setting nodatacow\n"); } - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); @@ -461,7 +460,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -474,9 +472,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); pr_info("btrfs: force %s compression\n", compress_type); - } else + } else if (btrfs_test_opt(root, COMPRESS)) { pr_info("btrfs: use %s compression\n", compress_type); + } break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); -- cgit v1.2.3-71-gd317 From e43f998e47bae27e37e159915625e8d4b130153b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 6 Dec 2013 17:51:32 +0100 Subject: btrfs: call mnt_drop_write after interrupted subvol deletion If btrfs_ioctl_snap_destroy blocks on the mutex and the process is killed, mnt_write count is unbalanced and leads to unmountable filesystem. CC: stable@vger.kernel.org Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a111622598b0..21da5762b0b1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2121,7 +2121,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2284,6 +2284,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); -- cgit v1.2.3-71-gd317 From 639eefc8afd1b8b839b1fd5605378d869d3612a1 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 8 Dec 2013 00:26:29 +0000 Subject: Btrfs: don't miss skinny extent items on delayed ref head contention Currently extent-tree.c:btrfs_lookup_extent_info() can miss the lookup of skinny extent items. This can happen when the execution flow is the following: * We do an extent tree lookup and fail to find a skinny extent item; * As a result, we attempt to see if a non-skinny extent item exists, either by looking at previous item in the leaf or by doing another full extent tree search; * We have a transaction and then we check for a matching delayed ref head in the transaction's delayed refs rbtree; * We find such delayed ref head and then we try to lock it with a call to mutex_trylock(); * The lock was contended so we jump to the label "again", which repeats the extent tree search but for a non-skinny extent item, because we set previously metadata variable to 0 and the search key to look for a non-skinny extent-item; * After the jump (and after releasing the transaction's delayed refs lock), a skinny extent item might have been added to the extent tree but we will miss it because metadata is set to 0 and the search key is set for a non-skinny extent-item. The fix here is to not reset metadata to 0 and to jump to the initial search key setup if the delayed ref head is contended, instead of jumping directly to the extent tree search label ("again"). This issue was found while investigating the issue reported at Bugzilla 64961. David Sterba suspected this function was missing extent items, and that this could be caused by the last change to this function, which was made in the following patch: [PATCH] Btrfs: optimize btrfs_lookup_extent_info() (commit 74be9510876a66ad9826613ac8a526d26f9e7f01) But in fact this issue already existed before, because after failing to find a skinny extent item, the code set the search key for a non-skinny extent item, and on contention of a matching delayed ref head it would not search the extent tree for a skinny extent item anymore. Signed-off-by: Filipe David Borba Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45d98d01028f..9c01509dd8ab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -767,20 +767,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); @@ -788,7 +787,6 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - metadata = 0; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -855,7 +853,7 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; -- cgit v1.2.3-71-gd317 From c974c4642fee8c58239c7753d0bf548b23799923 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 11 Dec 2013 19:29:51 +0800 Subject: Btrfs: fix an oops when doing balance relocation I hit an oops when inserting reloc root into @reloc_root_tree(it can be easily triggered when forcing cow for relocation root) [ 866.494539] [] btrfs_init_reloc_root+0x79/0xb0 [btrfs] [ 866.495321] [] record_root_in_trans+0xb0/0x110 [btrfs] [ 866.496109] [] btrfs_record_root_in_trans+0x48/0x80 [btrfs] [ 866.496908] [] select_reloc_root+0xa8/0x210 [btrfs] [ 866.497703] [] do_relocation+0x16a/0x540 [btrfs] This is because reloc root inserted into @reloc_root_tree is not within one transaction,reloc root may be cowed and root block bytenr will be reused then oops happens.We should update reloc root in @reloc_root_tree when cow reloc root node, fix it. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 70 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ce459a7cb16d..7cdc76007c6c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1264,10 +1264,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1275,7 +1275,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1283,23 +1283,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_unlock(&rc->reloc_root_tree.lock); if (!node) - return 0; + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); return 0; } @@ -1420,7 +1442,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) @@ -1432,11 +1453,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (root->fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -2287,7 +2306,7 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -2332,7 +2351,7 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -4522,6 +4541,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) -- cgit v1.2.3-71-gd317 From 6646374863508e24da7c7d21527f8dadc8687ff9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 10 Dec 2013 00:14:34 +0800 Subject: Btrfs: skip building backref tree for uuid and quota tree when doing balance relocation Quota tree and UUID Tree is only cowed, they can not be snapshoted. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7cdc76007c6c..0a3f02562053 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -571,7 +571,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } -- cgit v1.2.3-71-gd317 From 467bb1d27c0b783b73e6349304c0d90b5b4f431b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 11 Dec 2013 19:29:52 +0800 Subject: Btrfs: make sure we cleanup all reloc roots if error happens I hit an oops when merging reloc roots fails, the reason is that new reloc roots may be added and we should make sure we cleanup all reloc roots. Signed-off-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0a3f02562053..429c73c374b8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2409,6 +2409,13 @@ out: btrfs_std_error(root->fs_info, ret); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); -- cgit v1.2.3-71-gd317 From 700ff4f095d78af0998953e922e041d75254518b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 10 Jan 2013 03:57:25 -0500 Subject: Btrfs: fix access_ok() check in btrfs_ioctl_send() The closing parenthesis is in the wrong place. We want to check "sizeof(*arg->clone_sources) * arg->clone_sources_count" instead of "sizeof(*arg->clone_sources * arg->clone_sources_count)". Signed-off-by: Dan Carpenter Reviewed-by: Jie Liu Signed-off-by: Chris Mason cc: stable@vger.kernel.org --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6837fe87f3a6..945d1db98f26 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4723,8 +4723,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } -- cgit v1.2.3-71-gd317 From a5c21dcefa1c3d759457a604b3cfc4af29c8713f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 12 Dec 2013 17:40:21 +0000 Subject: dcache: allow word-at-a-time name hashing with big-endian CPUs When explicitly hashing the end of a string with the word-at-a-time interface, we have to be careful which end of the word we pick up. On big-endian CPUs, the upper-bits will contain the data we're after, so ensure we generate our masks accordingly (and avoid hashing whatever random junk may have been sitting after the string). This patch adds a new dcache helper, bytemask_from_count, which creates a mask appropriate for the CPU endianness. Cc: Al Viro Signed-off-by: Will Deacon Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/namei.c | 7 +------ include/linux/dcache.h | 2 ++ 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 4bdb300b16e2..6055d61811d3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } diff --git a/fs/namei.c b/fs/namei.c index c53d3a9547f9..3531deebad30 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1598,11 +1598,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1646,7 +1641,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 57e87e749a48..bf72e9ac6de0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -29,8 +29,10 @@ struct vfsmount; /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len; + #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash; + #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* -- cgit v1.2.3-71-gd317 From ae5758a1a7b7bdfdcfaf2d78a5a0f8675149dd79 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Dec 2013 17:12:22 -0800 Subject: procfs: also fix proc_reg_get_unmapped_area() for !MMU case Commit fad1a86e25e0 ("procfs: call default get_unmapped_area on MMU-present architectures"), as its title says, took care of only the MMU case, leaving the !MMU side still in the regressed state (returning -EIO in all cases where pde->proc_fops->get_unmapped_area is NULL). From the fad1a86e25e0 changelog: "Commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)") added proc_reg_get_unmapped_area in proc_reg_file_ops and proc_reg_file_ops_no_compat, by which now mmap always returns EIO if get_unmapped_area method is not defined for the target procfs file, which causes regression of mmap on /proc/vmcore. To address this issue, like get_unmapped_area(), call default current->mm->get_unmapped_area on MMU-present architectures if pde->proc_fops->get_unmapped_area, i.e. the one in actual file operation in the procfs file, is not defined" Signed-off-by: Jan Beulich Cc: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Cc: [3.12.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 28955d4b7218..124fc43c7090 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long) = NULL; + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif - if (pde->proc_fops->get_unmapped_area) - get_area = pde->proc_fops->get_unmapped_area; + if (get_area) rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; unuse_pde(pde); } return rv; -- cgit v1.2.3-71-gd317