journal.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
journal.c (91244B)
      1// SPDX-License-Identifier: GPL-2.0+
      2/*
      3 * linux/fs/jbd2/journal.c
      4 *
      5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
      6 *
      7 * Copyright 1998 Red Hat corp --- All Rights Reserved
      8 *
      9 * Generic filesystem journal-writing code; part of the ext2fs
     10 * journaling system.
     11 *
     12 * This file manages journals: areas of disk reserved for logging
     13 * transactional updates.  This includes the kernel journaling thread
     14 * which is responsible for scheduling updates to the log.
     15 *
     16 * We do not actually manage the physical storage of the journal in this
     17 * file: that is left to a per-journal policy function, which allows us
     18 * to store the journal within a filesystem-specified area for ext2
     19 * journaling (ext2 can use a reserved inode for storing the log).
     20 */
     21
     22#include <linux/module.h>
     23#include <linux/time.h>
     24#include <linux/fs.h>
     25#include <linux/jbd2.h>
     26#include <linux/errno.h>
     27#include <linux/slab.h>
     28#include <linux/init.h>
     29#include <linux/mm.h>
     30#include <linux/freezer.h>
     31#include <linux/pagemap.h>
     32#include <linux/kthread.h>
     33#include <linux/poison.h>
     34#include <linux/proc_fs.h>
     35#include <linux/seq_file.h>
     36#include <linux/math64.h>
     37#include <linux/hash.h>
     38#include <linux/log2.h>
     39#include <linux/vmalloc.h>
     40#include <linux/backing-dev.h>
     41#include <linux/bitops.h>
     42#include <linux/ratelimit.h>
     43#include <linux/sched/mm.h>
     44
     45#define CREATE_TRACE_POINTS
     46#include <trace/events/jbd2.h>
     47
     48#include <linux/uaccess.h>
     49#include <asm/page.h>
     50
#ifdef CONFIG_JBD2_DEBUG
/*
 * Debug verbosity threshold.  __jbd2_debug() drops every message whose level
 * is greater than this value.  It is exposed as the "jbd2_debug" module
 * parameter with mode 0644.
 */
ushort jbd2_journal_enable_debug __read_mostly;
EXPORT_SYMBOL(jbd2_journal_enable_debug);

module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
#endif
     58
/*
 * Symbols exported through EXPORT_SYMBOL().  The first group covers handle
 * and buffer-access entry points, the second group covers journal setup,
 * commit and inode-tracking entry points.
 */
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);

EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_submit_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);

/* Forward declaration of a file-local (static) helper. */
static int jbd2_journal_create_slab(size_t slab_size);
    102
    103#ifdef CONFIG_JBD2_DEBUG
    104void __jbd2_debug(int level, const char *file, const char *func,
    105		  unsigned int line, const char *fmt, ...)
    106{
    107	struct va_format vaf;
    108	va_list args;
    109
    110	if (level > jbd2_journal_enable_debug)
    111		return;
    112	va_start(args, fmt);
    113	vaf.fmt = fmt;
    114	vaf.va = &args;
    115	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
    116	va_end(args);
    117}
    118EXPORT_SYMBOL(__jbd2_debug);
    119#endif
    120
    121/* Checksumming functions */
    122static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
    123{
    124	if (!jbd2_journal_has_csum_v2or3_feature(j))
    125		return 1;
    126
    127	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
    128}
    129
    130static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
    131{
    132	__u32 csum;
    133	__be32 old_csum;
    134
    135	old_csum = sb->s_checksum;
    136	sb->s_checksum = 0;
    137	csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
    138	sb->s_checksum = old_csum;
    139
    140	return cpu_to_be32(csum);
    141}
    142
    143/*
    144 * Helper function used to manage commit timeouts
    145 */
    146
    147static void commit_timeout(struct timer_list *t)
    148{
    149	journal_t *journal = from_timer(journal, t, j_commit_timer);
    150
    151	wake_up_process(journal->j_task);
    152}
    153
/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk. If a fast commit is ongoing
 *    journal thread waits until it's done and then continues from
 *    there on.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 *
 * @arg: the journal_t this thread serves.
 *
 * The thread loops until JBD2_UNMOUNT is set in j_flags; it then clears
 * j_task, wakes j_wait_done_commit and returns 0.
 */

static int kjournald2(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	timer_setup(&journal->j_commit_timer, commit_timeout, 0);

	set_freezable();

	/*
	 * Record that the journal thread is running; jbd2_journal_start_thread()
	 * waits on j_wait_done_commit for j_task to become non-NULL.
	 */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Make sure that no allocations from this kernel thread will ever
	 * recurse to the fs layer because we are responsible for the
	 * transaction commit and any fs involvement might get stuck waiting for
	 * the transaction commit.
	 *
	 * The value returned by memalloc_nofs_save() is deliberately not kept:
	 * this function never restores the previous allocation scope.
	 */
	memalloc_nofs_save();

	/*
	 * And now, wait forever for commit wakeup events.
	 */
	write_lock(&journal->j_state_lock);

loop:
	/* j_state_lock is held for writing at every entry to this label. */
	if (journal->j_flags & JBD2_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
		journal->j_commit_sequence, journal->j_commit_request);

	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		/* The commit itself runs without j_state_lock held. */
		write_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		jbd2_journal_commit_transaction(journal);
		write_lock(&journal->j_state_lock);
		goto loop;
	}

	/* Nothing left to commit: let waiters on commit completion re-check. */
	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald2\n");
		write_unlock(&journal->j_state_lock);
		try_to_freeze();
		write_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/*
		 * Queue on j_wait_commit before re-testing the wake conditions
		 * below, and only call schedule() if none of them holds.
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JBD2_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald2 wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?  If the running
	 * transaction has reached t_expires, request its commit so the next
	 * pass through the loop performs it.
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	/*
	 * Unmount path: stop the timer, then clear j_task and wake
	 * j_wait_done_commit, which journal_kill_thread() waits on.
	 */
	del_timer_sync(&journal->j_commit_timer);
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	write_unlock(&journal->j_state_lock);
	return 0;
}
    275
    276static int jbd2_journal_start_thread(journal_t *journal)
    277{
    278	struct task_struct *t;
    279
    280	t = kthread_run(kjournald2, journal, "jbd2/%s",
    281			journal->j_devname);
    282	if (IS_ERR(t))
    283		return PTR_ERR(t);
    284
    285	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
    286	return 0;
    287}
    288
    289static void journal_kill_thread(journal_t *journal)
    290{
    291	write_lock(&journal->j_state_lock);
    292	journal->j_flags |= JBD2_UNMOUNT;
    293
    294	while (journal->j_task) {
    295		write_unlock(&journal->j_state_lock);
    296		wake_up(&journal->j_wait_commit);
    297		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
    298		write_lock(&journal->j_state_lock);
    299	}
    300	write_unlock(&journal->j_state_lock);
    301}
    302
/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * @transaction: transaction the buffer is filed under (on its BJ_Shadow list)
 * @jh_in:       journal head of the metadata buffer to be logged
 * @bh_out:      on success, receives the newly allocated buffer_head to be
 *               used for IO
 * @blocknr:     block number stored in the new buffer_head's b_blocknr
 *
 * Return value:
 *  <0: Error (-ENOMEM if the copy-out buffer cannot be allocated)
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head  *jh_in,
				  struct buffer_head **bh_out,
				  sector_t blocknr)
{
	int need_copy_out = 0;
	int done_copy_out = 0;
	int do_escape = 0;
	char *mapped_data;
	struct buffer_head *new_bh;
	struct page *new_page;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);
	journal_t *journal = transaction->t_journal;

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	/* __GFP_NOFAIL: the result is used below without a NULL check. */
	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/* keep subsequent assertions sane */
	atomic_set(&new_bh->b_count, 1);

	spin_lock(&jh_in->b_state_lock);
repeat:
	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	if (jh_in->b_frozen_data) {
		done_copy_out = 1;
		new_page = virt_to_page(jh_in->b_frozen_data);
		new_offset = offset_in_page(jh_in->b_frozen_data);
	} else {
		new_page = jh2bh(jh_in)->b_page;
		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
	}

	mapped_data = kmap_atomic(new_page);
	/*
	 * Fire data frozen trigger if data already wasn't frozen.  Do this
	 * before checking for escaping, as the trigger may modify the magic
	 * offset.  If a copy-out happens afterwards, it will have the correct
	 * data in the buffer.
	 */
	if (!done_copy_out)
		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
					   jh_in->b_triggers);

	/*
	 * Check for escaping
	 */
	if (*((__be32 *)(mapped_data + new_offset)) ==
				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
		need_copy_out = 1;
		do_escape = 1;
	}
	kunmap_atomic(mapped_data);

	/*
	 * Do we need to do a data copy?
	 */
	if (need_copy_out && !done_copy_out) {
		char *tmp;

		/* b_state_lock is dropped around the GFP_NOFS allocation. */
		spin_unlock(&jh_in->b_state_lock);
		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
		if (!tmp) {
			/*
			 * NOTE(review): new_bh came from alloc_buffer_head()
			 * and is only brelse()d here - confirm that this is
			 * enough to release it.
			 */
			brelse(new_bh);
			return -ENOMEM;
		}
		spin_lock(&jh_in->b_state_lock);
		/*
		 * b_frozen_data may have been set while the lock was
		 * dropped; if so, discard our buffer and start over using
		 * the frozen copy.
		 */
		if (jh_in->b_frozen_data) {
			jbd2_free(tmp, bh_in->b_size);
			goto repeat;
		}

		jh_in->b_frozen_data = tmp;
		mapped_data = kmap_atomic(new_page);
		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
		kunmap_atomic(mapped_data);

		/* From here on the IO source is the private copy. */
		new_page = virt_to_page(tmp);
		new_offset = offset_in_page(tmp);
		done_copy_out = 1;

		/*
		 * This isn't strictly necessary, as we're using frozen
		 * data for the escaping, but it keeps consistency with
		 * b_frozen_data usage.
		 */
		jh_in->b_frozen_triggers = jh_in->b_triggers;
	}

	/*
	 * Did we need to do an escaping?  Now we've done all the
	 * copying, we can finally do so.
	 */
	if (do_escape) {
		mapped_data = kmap_atomic(new_page);
		*((unsigned int *)(mapped_data + new_offset)) = 0;
		kunmap_atomic(mapped_data);
	}

	/* Describe the IO: same size as the source, aimed at the journal device. */
	set_bh_page(new_bh, new_page, new_offset);
	new_bh->b_size = bh_in->b_size;
	new_bh->b_bdev = journal->j_dev;
	new_bh->b_blocknr = blocknr;
	/* b_private links the IO buffer back to the buffer it shadows. */
	new_bh->b_private = bh_in;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*bh_out = new_bh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
	spin_unlock(&journal->j_list_lock);
	set_buffer_shadow(bh_in);
	spin_unlock(&jh_in->b_state_lock);

	return do_escape | (done_copy_out << 1);
}
    474
    475/*
    476 * Allocation code for the journal file.  Manage the space left in the
    477 * journal, so that we can begin checkpointing when appropriate.
    478 */
    479
    480/*
    481 * Called with j_state_lock locked for writing.
    482 * Returns true if a transaction commit was started.
    483 */
    484int __jbd2_log_start_commit(journal_t *journal, tid_t target)
    485{
    486	/* Return if the txn has already requested to be committed */
    487	if (journal->j_commit_request == target)
    488		return 0;
    489
    490	/*
    491	 * The only transaction we can possibly wait upon is the
    492	 * currently running transaction (if it exists).  Otherwise,
    493	 * the target tid must be an old one.
    494	 */
    495	if (journal->j_running_transaction &&
    496	    journal->j_running_transaction->t_tid == target) {
    497		/*
    498		 * We want a new commit: OK, mark the request and wakeup the
    499		 * commit thread.  We do _not_ do the commit ourselves.
    500		 */
    501
    502		journal->j_commit_request = target;
    503		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
    504			  journal->j_commit_request,
    505			  journal->j_commit_sequence);
    506		journal->j_running_transaction->t_requested = jiffies;
    507		wake_up(&journal->j_wait_commit);
    508		return 1;
    509	} else if (!tid_geq(journal->j_commit_request, target))
    510		/* This should never happen, but if it does, preserve
    511		   the evidence before kjournald goes into a loop and
    512		   increments j_commit_sequence beyond all recognition. */
    513		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
    514			  journal->j_commit_request,
    515			  journal->j_commit_sequence,
    516			  target, journal->j_running_transaction ?
    517			  journal->j_running_transaction->t_tid : 0);
    518	return 0;
    519}
    520
    521int jbd2_log_start_commit(journal_t *journal, tid_t tid)
    522{
    523	int ret;
    524
    525	write_lock(&journal->j_state_lock);
    526	ret = __jbd2_log_start_commit(journal, tid);
    527	write_unlock(&journal->j_state_lock);
    528	return ret;
    529}
    530
    531/*
    532 * Force and wait any uncommitted transactions.  We can only force the running
    533 * transaction if we don't have an active handle, otherwise, we will deadlock.
    534 * Returns: <0 in case of error,
    535 *           0 if nothing to commit,
    536 *           1 if transaction was successfully committed.
    537 */
    538static int __jbd2_journal_force_commit(journal_t *journal)
    539{
    540	transaction_t *transaction = NULL;
    541	tid_t tid;
    542	int need_to_start = 0, ret = 0;
    543
    544	read_lock(&journal->j_state_lock);
    545	if (journal->j_running_transaction && !current->journal_info) {
    546		transaction = journal->j_running_transaction;
    547		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
    548			need_to_start = 1;
    549	} else if (journal->j_committing_transaction)
    550		transaction = journal->j_committing_transaction;
    551
    552	if (!transaction) {
    553		/* Nothing to commit */
    554		read_unlock(&journal->j_state_lock);
    555		return 0;
    556	}
    557	tid = transaction->t_tid;
    558	read_unlock(&journal->j_state_lock);
    559	if (need_to_start)
    560		jbd2_log_start_commit(journal, tid);
    561	ret = jbd2_log_wait_commit(journal, tid);
    562	if (!ret)
    563		ret = 1;
    564
    565	return ret;
    566}
    567
    568/**
    569 * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
    570 * calling process is not within transaction.
    571 *
    572 * @journal: journal to force
    573 * Returns true if progress was made.
    574 *
    575 * This is used for forcing out undo-protected data which contains
    576 * bitmaps, when the fs is running out of space.
    577 */
    578int jbd2_journal_force_commit_nested(journal_t *journal)
    579{
    580	int ret;
    581
    582	ret = __jbd2_journal_force_commit(journal);
    583	return ret > 0;
    584}
    585
    586/**
    587 * jbd2_journal_force_commit() - force any uncommitted transactions
    588 * @journal: journal to force
    589 *
    590 * Caller want unconditional commit. We can only force the running transaction
    591 * if we don't have an active handle, otherwise, we will deadlock.
    592 */
    593int jbd2_journal_force_commit(journal_t *journal)
    594{
    595	int ret;
    596
    597	J_ASSERT(!current->journal_info);
    598	ret = __jbd2_journal_force_commit(journal);
    599	if (ret > 0)
    600		ret = 0;
    601	return ret;
    602}
    603
    604/*
    605 * Start a commit of the current running transaction (if any).  Returns true
    606 * if a transaction is going to be committed (or is currently already
    607 * committing), and fills its tid in at *ptid
    608 */
    609int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
    610{
    611	int ret = 0;
    612
    613	write_lock(&journal->j_state_lock);
    614	if (journal->j_running_transaction) {
    615		tid_t tid = journal->j_running_transaction->t_tid;
    616
    617		__jbd2_log_start_commit(journal, tid);
    618		/* There's a running transaction and we've just made sure
    619		 * it's commit has been scheduled. */
    620		if (ptid)
    621			*ptid = tid;
    622		ret = 1;
    623	} else if (journal->j_committing_transaction) {
    624		/*
    625		 * If commit has been started, then we have to wait for
    626		 * completion of that transaction.
    627		 */
    628		if (ptid)
    629			*ptid = journal->j_committing_transaction->t_tid;
    630		ret = 1;
    631	}
    632	write_unlock(&journal->j_state_lock);
    633	return ret;
    634}
    635
    636/*
    637 * Return 1 if a given transaction has not yet sent barrier request
    638 * connected with a transaction commit. If 0 is returned, transaction
    639 * may or may not have sent the barrier. Used to avoid sending barrier
    640 * twice in common cases.
    641 */
    642int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
    643{
    644	int ret = 0;
    645	transaction_t *commit_trans;
    646
    647	if (!(journal->j_flags & JBD2_BARRIER))
    648		return 0;
    649	read_lock(&journal->j_state_lock);
    650	/* Transaction already committed? */
    651	if (tid_geq(journal->j_commit_sequence, tid))
    652		goto out;
    653	commit_trans = journal->j_committing_transaction;
    654	if (!commit_trans || commit_trans->t_tid != tid) {
    655		ret = 1;
    656		goto out;
    657	}
    658	/*
    659	 * Transaction is being committed and we already proceeded to
    660	 * submitting a flush to fs partition?
    661	 */
    662	if (journal->j_fs_dev != journal->j_dev) {
    663		if (!commit_trans->t_need_data_flush ||
    664		    commit_trans->t_state >= T_COMMIT_DFLUSH)
    665			goto out;
    666	} else {
    667		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
    668			goto out;
    669	}
    670	ret = 1;
    671out:
    672	read_unlock(&journal->j_state_lock);
    673	return ret;
    674}
    675EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
    676
    677/*
    678 * Wait for a specified commit to complete.
    679 * The caller may not hold the journal lock.
    680 */
    681int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
    682{
    683	int err = 0;
    684
    685	read_lock(&journal->j_state_lock);
    686#ifdef CONFIG_PROVE_LOCKING
    687	/*
    688	 * Some callers make sure transaction is already committing and in that
    689	 * case we cannot block on open handles anymore. So don't warn in that
    690	 * case.
    691	 */
    692	if (tid_gt(tid, journal->j_commit_sequence) &&
    693	    (!journal->j_committing_transaction ||
    694	     journal->j_committing_transaction->t_tid != tid)) {
    695		read_unlock(&journal->j_state_lock);
    696		jbd2_might_wait_for_commit(journal);
    697		read_lock(&journal->j_state_lock);
    698	}
    699#endif
    700#ifdef CONFIG_JBD2_DEBUG
    701	if (!tid_geq(journal->j_commit_request, tid)) {
    702		printk(KERN_ERR
    703		       "%s: error: j_commit_request=%u, tid=%u\n",
    704		       __func__, journal->j_commit_request, tid);
    705	}
    706#endif
    707	while (tid_gt(tid, journal->j_commit_sequence)) {
    708		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
    709				  tid, journal->j_commit_sequence);
    710		read_unlock(&journal->j_state_lock);
    711		wake_up(&journal->j_wait_commit);
    712		wait_event(journal->j_wait_done_commit,
    713				!tid_gt(tid, journal->j_commit_sequence));
    714		read_lock(&journal->j_state_lock);
    715	}
    716	read_unlock(&journal->j_state_lock);
    717
    718	if (unlikely(is_journal_aborted(journal)))
    719		err = -EIO;
    720	return err;
    721}
    722
    723/*
    724 * Start a fast commit. If there's an ongoing fast or full commit wait for
    725 * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
    726 * if a fast commit is not needed, either because there's an already a commit
    727 * going on or this tid has already been committed. Returns -EINVAL if no jbd2
    728 * commit has yet been performed.
    729 */
    730int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
    731{
    732	if (unlikely(is_journal_aborted(journal)))
    733		return -EIO;
    734	/*
    735	 * Fast commits only allowed if at least one full commit has
    736	 * been processed.
    737	 */
    738	if (!journal->j_stats.ts_tid)
    739		return -EINVAL;
    740
    741	write_lock(&journal->j_state_lock);
    742	if (tid <= journal->j_commit_sequence) {
    743		write_unlock(&journal->j_state_lock);
    744		return -EALREADY;
    745	}
    746
    747	if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
    748	    (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
    749		DEFINE_WAIT(wait);
    750
    751		prepare_to_wait(&journal->j_fc_wait, &wait,
    752				TASK_UNINTERRUPTIBLE);
    753		write_unlock(&journal->j_state_lock);
    754		schedule();
    755		finish_wait(&journal->j_fc_wait, &wait);
    756		return -EALREADY;
    757	}
    758	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
    759	write_unlock(&journal->j_state_lock);
    760	jbd2_journal_lock_updates(journal);
    761
    762	return 0;
    763}
    764EXPORT_SYMBOL(jbd2_fc_begin_commit);
    765
    766/*
    767 * Stop a fast commit. If fallback is set, this function starts commit of
    768 * TID tid before any other fast commit can start.
    769 */
    770static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
    771{
    772	jbd2_journal_unlock_updates(journal);
    773	if (journal->j_fc_cleanup_callback)
    774		journal->j_fc_cleanup_callback(journal, 0, tid);
    775	write_lock(&journal->j_state_lock);
    776	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
    777	if (fallback)
    778		journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
    779	write_unlock(&journal->j_state_lock);
    780	wake_up(&journal->j_fc_wait);
    781	if (fallback)
    782		return jbd2_complete_transaction(journal, tid);
    783	return 0;
    784}
    785
    786int jbd2_fc_end_commit(journal_t *journal)
    787{
    788	return __jbd2_fc_end_commit(journal, 0, false);
    789}
    790EXPORT_SYMBOL(jbd2_fc_end_commit);
    791
    792int jbd2_fc_end_commit_fallback(journal_t *journal)
    793{
    794	tid_t tid;
    795
    796	read_lock(&journal->j_state_lock);
    797	tid = journal->j_running_transaction ?
    798		journal->j_running_transaction->t_tid : 0;
    799	read_unlock(&journal->j_state_lock);
    800	return __jbd2_fc_end_commit(journal, tid, true);
    801}
    802EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
    803
    804/* Return 1 when transaction with given tid has already committed. */
    805int jbd2_transaction_committed(journal_t *journal, tid_t tid)
    806{
    807	int ret = 1;
    808
    809	read_lock(&journal->j_state_lock);
    810	if (journal->j_running_transaction &&
    811	    journal->j_running_transaction->t_tid == tid)
    812		ret = 0;
    813	if (journal->j_committing_transaction &&
    814	    journal->j_committing_transaction->t_tid == tid)
    815		ret = 0;
    816	read_unlock(&journal->j_state_lock);
    817	return ret;
    818}
    819EXPORT_SYMBOL(jbd2_transaction_committed);
    820
    821/*
    822 * When this function returns the transaction corresponding to tid
    823 * will be completed.  If the transaction has currently running, start
    824 * committing that transaction before waiting for it to complete.  If
    825 * the transaction id is stale, it is by definition already completed,
    826 * so just return SUCCESS.
    827 */
    828int jbd2_complete_transaction(journal_t *journal, tid_t tid)
    829{
    830	int	need_to_wait = 1;
    831
    832	read_lock(&journal->j_state_lock);
    833	if (journal->j_running_transaction &&
    834	    journal->j_running_transaction->t_tid == tid) {
    835		if (journal->j_commit_request != tid) {
    836			/* transaction not yet started, so request it */
    837			read_unlock(&journal->j_state_lock);
    838			jbd2_log_start_commit(journal, tid);
    839			goto wait_commit;
    840		}
    841	} else if (!(journal->j_committing_transaction &&
    842		     journal->j_committing_transaction->t_tid == tid))
    843		need_to_wait = 0;
    844	read_unlock(&journal->j_state_lock);
    845	if (!need_to_wait)
    846		return 0;
    847wait_commit:
    848	return jbd2_log_wait_commit(journal, tid);
    849}
    850EXPORT_SYMBOL(jbd2_complete_transaction);
    851
    852/*
    853 * Log buffer allocation routines:
    854 */
    855
    856int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
    857{
    858	unsigned long blocknr;
    859
    860	write_lock(&journal->j_state_lock);
    861	J_ASSERT(journal->j_free > 1);
    862
    863	blocknr = journal->j_head;
    864	journal->j_head++;
    865	journal->j_free--;
    866	if (journal->j_head == journal->j_last)
    867		journal->j_head = journal->j_first;
    868	write_unlock(&journal->j_state_lock);
    869	return jbd2_journal_bmap(journal, blocknr, retp);
    870}
    871
    872/* Map one fast commit buffer for use by the file system */
    873int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
    874{
    875	unsigned long long pblock;
    876	unsigned long blocknr;
    877	int ret = 0;
    878	struct buffer_head *bh;
    879	int fc_off;
    880
    881	*bh_out = NULL;
    882
    883	if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
    884		fc_off = journal->j_fc_off;
    885		blocknr = journal->j_fc_first + fc_off;
    886		journal->j_fc_off++;
    887	} else {
    888		ret = -EINVAL;
    889	}
    890
    891	if (ret)
    892		return ret;
    893
    894	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
    895	if (ret)
    896		return ret;
    897
    898	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
    899	if (!bh)
    900		return -ENOMEM;
    901
    902
    903	journal->j_fc_wbuf[fc_off] = bh;
    904
    905	*bh_out = bh;
    906
    907	return 0;
    908}
    909EXPORT_SYMBOL(jbd2_fc_get_buf);
    910
    911/*
    912 * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
    913 * for completion.
    914 */
    915int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
    916{
    917	struct buffer_head *bh;
    918	int i, j_fc_off;
    919
    920	j_fc_off = journal->j_fc_off;
    921
    922	/*
    923	 * Wait in reverse order to minimize chances of us being woken up before
    924	 * all IOs have completed
    925	 */
    926	for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
    927		bh = journal->j_fc_wbuf[i];
    928		wait_on_buffer(bh);
    929		put_bh(bh);
    930		journal->j_fc_wbuf[i] = NULL;
    931		if (unlikely(!buffer_uptodate(bh)))
    932			return -EIO;
    933	}
    934
    935	return 0;
    936}
    937EXPORT_SYMBOL(jbd2_fc_wait_bufs);
    938
    939int jbd2_fc_release_bufs(journal_t *journal)
    940{
    941	struct buffer_head *bh;
    942	int i, j_fc_off;
    943
    944	j_fc_off = journal->j_fc_off;
    945
    946	for (i = j_fc_off - 1; i >= 0; i--) {
    947		bh = journal->j_fc_wbuf[i];
    948		if (!bh)
    949			break;
    950		put_bh(bh);
    951		journal->j_fc_wbuf[i] = NULL;
    952	}
    953
    954	return 0;
    955}
    956EXPORT_SYMBOL(jbd2_fc_release_bufs);
    957
    958/*
    959 * Conversion of logical to physical block numbers for the journal
    960 *
    961 * On external journals the journal blocks are identity-mapped, so
    962 * this is a no-op.  If needed, we can use j_blk_offset - everything is
    963 * ready.
    964 */
    965int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
    966		 unsigned long long *retp)
    967{
    968	int err = 0;
    969	unsigned long long ret;
    970	sector_t block = 0;
    971
    972	if (journal->j_inode) {
    973		block = blocknr;
    974		ret = bmap(journal->j_inode, &block);
    975
    976		if (ret || !block) {
    977			printk(KERN_ALERT "%s: journal block not found "
    978					"at offset %lu on %s\n",
    979			       __func__, blocknr, journal->j_devname);
    980			err = -EIO;
    981			jbd2_journal_abort(journal, err);
    982		} else {
    983			*retp = block;
    984		}
    985
    986	} else {
    987		*retp = blocknr; /* +journal->j_blk_offset */
    988	}
    989	return err;
    990}
    991
    992/*
    993 * We play buffer_head aliasing tricks to write data/metadata blocks to
    994 * the journal without copying their contents, but for journal
    995 * descriptor blocks we do need to generate bona fide buffers.
    996 *
    997 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
    998 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
    999 * But we don't bother doing that, so there will be coherency problems with
   1000 * mmaps of blockdevs which hold live JBD-controlled filesystems.
   1001 */
   1002struct buffer_head *
   1003jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
   1004{
   1005	journal_t *journal = transaction->t_journal;
   1006	struct buffer_head *bh;
   1007	unsigned long long blocknr;
   1008	journal_header_t *header;
   1009	int err;
   1010
   1011	err = jbd2_journal_next_log_block(journal, &blocknr);
   1012
   1013	if (err)
   1014		return NULL;
   1015
   1016	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
   1017	if (!bh)
   1018		return NULL;
   1019	atomic_dec(&transaction->t_outstanding_credits);
   1020	lock_buffer(bh);
   1021	memset(bh->b_data, 0, journal->j_blocksize);
   1022	header = (journal_header_t *)bh->b_data;
   1023	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
   1024	header->h_blocktype = cpu_to_be32(type);
   1025	header->h_sequence = cpu_to_be32(transaction->t_tid);
   1026	set_buffer_uptodate(bh);
   1027	unlock_buffer(bh);
   1028	BUFFER_TRACE(bh, "return this buffer");
   1029	return bh;
   1030}
   1031
   1032void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
   1033{
   1034	struct jbd2_journal_block_tail *tail;
   1035	__u32 csum;
   1036
   1037	if (!jbd2_journal_has_csum_v2or3(j))
   1038		return;
   1039
   1040	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
   1041			sizeof(struct jbd2_journal_block_tail));
   1042	tail->t_checksum = 0;
   1043	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
   1044	tail->t_checksum = cpu_to_be32(csum);
   1045}
   1046
   1047/*
   1048 * Return tid of the oldest transaction in the journal and block in the journal
   1049 * where the transaction starts.
   1050 *
   1051 * If the journal is now empty, return which will be the next transaction ID
   1052 * we will write and where will that transaction start.
   1053 *
   1054 * The return value is 0 if journal tail cannot be pushed any further, 1 if
   1055 * it can.
   1056 */
   1057int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
   1058			      unsigned long *block)
   1059{
   1060	transaction_t *transaction;
   1061	int ret;
   1062
   1063	read_lock(&journal->j_state_lock);
   1064	spin_lock(&journal->j_list_lock);
   1065	transaction = journal->j_checkpoint_transactions;
   1066	if (transaction) {
   1067		*tid = transaction->t_tid;
   1068		*block = transaction->t_log_start;
   1069	} else if ((transaction = journal->j_committing_transaction) != NULL) {
   1070		*tid = transaction->t_tid;
   1071		*block = transaction->t_log_start;
   1072	} else if ((transaction = journal->j_running_transaction) != NULL) {
   1073		*tid = transaction->t_tid;
   1074		*block = journal->j_head;
   1075	} else {
   1076		*tid = journal->j_transaction_sequence;
   1077		*block = journal->j_head;
   1078	}
   1079	ret = tid_gt(*tid, journal->j_tail_sequence);
   1080	spin_unlock(&journal->j_list_lock);
   1081	read_unlock(&journal->j_state_lock);
   1082
   1083	return ret;
   1084}
   1085
   1086/*
   1087 * Update information in journal structure and in on disk journal superblock
   1088 * about log tail. This function does not check whether information passed in
   1089 * really pushes log tail further. It's responsibility of the caller to make
   1090 * sure provided log tail information is valid (e.g. by holding
   1091 * j_checkpoint_mutex all the time between computing log tail and calling this
   1092 * function as is the case with jbd2_cleanup_journal_tail()).
   1093 *
   1094 * Requires j_checkpoint_mutex
   1095 */
   1096int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
   1097{
   1098	unsigned long freed;
   1099	int ret;
   1100
   1101	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
   1102
   1103	/*
   1104	 * We cannot afford for write to remain in drive's caches since as
   1105	 * soon as we update j_tail, next transaction can start reusing journal
   1106	 * space and if we lose sb update during power failure we'd replay
   1107	 * old transaction with possibly newly overwritten data.
   1108	 */
   1109	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
   1110					      REQ_SYNC | REQ_FUA);
   1111	if (ret)
   1112		goto out;
   1113
   1114	write_lock(&journal->j_state_lock);
   1115	freed = block - journal->j_tail;
   1116	if (block < journal->j_tail)
   1117		freed += journal->j_last - journal->j_first;
   1118
   1119	trace_jbd2_update_log_tail(journal, tid, block, freed);
   1120	jbd_debug(1,
   1121		  "Cleaning journal tail from %u to %u (offset %lu), "
   1122		  "freeing %lu\n",
   1123		  journal->j_tail_sequence, tid, block, freed);
   1124
   1125	journal->j_free += freed;
   1126	journal->j_tail_sequence = tid;
   1127	journal->j_tail = block;
   1128	write_unlock(&journal->j_state_lock);
   1129
   1130out:
   1131	return ret;
   1132}
   1133
   1134/*
   1135 * This is a variation of __jbd2_update_log_tail which checks for validity of
   1136 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
   1137 * with other threads updating log tail.
   1138 */
   1139void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
   1140{
   1141	mutex_lock_io(&journal->j_checkpoint_mutex);
   1142	if (tid_gt(tid, journal->j_tail_sequence))
   1143		__jbd2_update_log_tail(journal, tid, block);
   1144	mutex_unlock(&journal->j_checkpoint_mutex);
   1145}
   1146
   1147struct jbd2_stats_proc_session {
   1148	journal_t *journal;
   1149	struct transaction_stats_s *stats;
   1150	int start;
   1151	int max;
   1152};
   1153
   1154static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
   1155{
   1156	return *pos ? NULL : SEQ_START_TOKEN;
   1157}
   1158
   1159static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
   1160{
   1161	(*pos)++;
   1162	return NULL;
   1163}
   1164
   1165static int jbd2_seq_info_show(struct seq_file *seq, void *v)
   1166{
   1167	struct jbd2_stats_proc_session *s = seq->private;
   1168
   1169	if (v != SEQ_START_TOKEN)
   1170		return 0;
   1171	seq_printf(seq, "%lu transactions (%lu requested), "
   1172		   "each up to %u blocks\n",
   1173		   s->stats->ts_tid, s->stats->ts_requested,
   1174		   s->journal->j_max_transaction_buffers);
   1175	if (s->stats->ts_tid == 0)
   1176		return 0;
   1177	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
   1178	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
   1179	seq_printf(seq, "  %ums request delay\n",
   1180	    (s->stats->ts_requested == 0) ? 0 :
   1181	    jiffies_to_msecs(s->stats->run.rs_request_delay /
   1182			     s->stats->ts_requested));
   1183	seq_printf(seq, "  %ums running transaction\n",
   1184	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
   1185	seq_printf(seq, "  %ums transaction was being locked\n",
   1186	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
   1187	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
   1188	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
   1189	seq_printf(seq, "  %ums logging transaction\n",
   1190	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
   1191	seq_printf(seq, "  %lluus average transaction commit time\n",
   1192		   div_u64(s->journal->j_average_commit_time, 1000));
   1193	seq_printf(seq, "  %lu handles per transaction\n",
   1194	    s->stats->run.rs_handle_count / s->stats->ts_tid);
   1195	seq_printf(seq, "  %lu blocks per transaction\n",
   1196	    s->stats->run.rs_blocks / s->stats->ts_tid);
   1197	seq_printf(seq, "  %lu logged blocks per transaction\n",
   1198	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
   1199	return 0;
   1200}
   1201
   1202static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
   1203{
   1204}
   1205
   1206static const struct seq_operations jbd2_seq_info_ops = {
   1207	.start  = jbd2_seq_info_start,
   1208	.next   = jbd2_seq_info_next,
   1209	.stop   = jbd2_seq_info_stop,
   1210	.show   = jbd2_seq_info_show,
   1211};
   1212
   1213static int jbd2_seq_info_open(struct inode *inode, struct file *file)
   1214{
   1215	journal_t *journal = pde_data(inode);
   1216	struct jbd2_stats_proc_session *s;
   1217	int rc, size;
   1218
   1219	s = kmalloc(sizeof(*s), GFP_KERNEL);
   1220	if (s == NULL)
   1221		return -ENOMEM;
   1222	size = sizeof(struct transaction_stats_s);
   1223	s->stats = kmalloc(size, GFP_KERNEL);
   1224	if (s->stats == NULL) {
   1225		kfree(s);
   1226		return -ENOMEM;
   1227	}
   1228	spin_lock(&journal->j_history_lock);
   1229	memcpy(s->stats, &journal->j_stats, size);
   1230	s->journal = journal;
   1231	spin_unlock(&journal->j_history_lock);
   1232
   1233	rc = seq_open(file, &jbd2_seq_info_ops);
   1234	if (rc == 0) {
   1235		struct seq_file *m = file->private_data;
   1236		m->private = s;
   1237	} else {
   1238		kfree(s->stats);
   1239		kfree(s);
   1240	}
   1241	return rc;
   1242
   1243}
   1244
   1245static int jbd2_seq_info_release(struct inode *inode, struct file *file)
   1246{
   1247	struct seq_file *seq = file->private_data;
   1248	struct jbd2_stats_proc_session *s = seq->private;
   1249	kfree(s->stats);
   1250	kfree(s);
   1251	return seq_release(inode, file);
   1252}
   1253
   1254static const struct proc_ops jbd2_info_proc_ops = {
   1255	.proc_open	= jbd2_seq_info_open,
   1256	.proc_read	= seq_read,
   1257	.proc_lseek	= seq_lseek,
   1258	.proc_release	= jbd2_seq_info_release,
   1259};
   1260
   1261static struct proc_dir_entry *proc_jbd2_stats;
   1262
   1263static void jbd2_stats_proc_init(journal_t *journal)
   1264{
   1265	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
   1266	if (journal->j_proc_entry) {
   1267		proc_create_data("info", S_IRUGO, journal->j_proc_entry,
   1268				 &jbd2_info_proc_ops, journal);
   1269	}
   1270}
   1271
   1272static void jbd2_stats_proc_exit(journal_t *journal)
   1273{
   1274	remove_proc_entry("info", journal->j_proc_entry);
   1275	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
   1276}
   1277
   1278/* Minimum size of descriptor tag */
   1279static int jbd2_min_tag_size(void)
   1280{
   1281	/*
   1282	 * Tag with 32-bit block numbers does not use last four bytes of the
   1283	 * structure
   1284	 */
   1285	return sizeof(journal_block_tag_t) - 4;
   1286}
   1287
   1288/**
   1289 * jbd2_journal_shrink_scan()
   1290 * @shrink: shrinker to work on
   1291 * @sc: reclaim request to process
   1292 *
   1293 * Scan the checkpointed buffer on the checkpoint list and release the
   1294 * journal_head.
   1295 */
   1296static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
   1297					      struct shrink_control *sc)
   1298{
   1299	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
   1300	unsigned long nr_to_scan = sc->nr_to_scan;
   1301	unsigned long nr_shrunk;
   1302	unsigned long count;
   1303
   1304	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
   1305	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
   1306
   1307	nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
   1308
   1309	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
   1310	trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
   1311
   1312	return nr_shrunk;
   1313}
   1314
   1315/**
   1316 * jbd2_journal_shrink_count()
   1317 * @shrink: shrinker to work on
   1318 * @sc: reclaim request to process
   1319 *
   1320 * Count the number of checkpoint buffers on the checkpoint list.
   1321 */
   1322static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
   1323					       struct shrink_control *sc)
   1324{
   1325	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
   1326	unsigned long count;
   1327
   1328	count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
   1329	trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
   1330
   1331	return count;
   1332}
   1333
   1334/*
   1335 * Management for journal control blocks: functions to create and
   1336 * destroy journal_t structures, and to initialise and read existing
   1337 * journal blocks from disk.  */
   1338
   1339/* First: create and setup a journal_t object in memory.  We initialise
   1340 * very few fields yet: that has to wait until we have created the
   1341 * journal structures from from scratch, or loaded them from disk. */
   1342
   1343static journal_t *journal_init_common(struct block_device *bdev,
   1344			struct block_device *fs_dev,
   1345			unsigned long long start, int len, int blocksize)
   1346{
   1347	static struct lock_class_key jbd2_trans_commit_key;
   1348	journal_t *journal;
   1349	int err;
   1350	struct buffer_head *bh;
   1351	int n;
   1352
   1353	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
   1354	if (!journal)
   1355		return NULL;
   1356
   1357	init_waitqueue_head(&journal->j_wait_transaction_locked);
   1358	init_waitqueue_head(&journal->j_wait_done_commit);
   1359	init_waitqueue_head(&journal->j_wait_commit);
   1360	init_waitqueue_head(&journal->j_wait_updates);
   1361	init_waitqueue_head(&journal->j_wait_reserved);
   1362	init_waitqueue_head(&journal->j_fc_wait);
   1363	mutex_init(&journal->j_abort_mutex);
   1364	mutex_init(&journal->j_barrier);
   1365	mutex_init(&journal->j_checkpoint_mutex);
   1366	spin_lock_init(&journal->j_revoke_lock);
   1367	spin_lock_init(&journal->j_list_lock);
   1368	rwlock_init(&journal->j_state_lock);
   1369
   1370	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
   1371	journal->j_min_batch_time = 0;
   1372	journal->j_max_batch_time = 15000; /* 15ms */
   1373	atomic_set(&journal->j_reserved_credits, 0);
   1374
   1375	/* The journal is marked for error until we succeed with recovery! */
   1376	journal->j_flags = JBD2_ABORT;
   1377
   1378	/* Set up a default-sized revoke table for the new mount. */
   1379	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
   1380	if (err)
   1381		goto err_cleanup;
   1382
   1383	spin_lock_init(&journal->j_history_lock);
   1384
   1385	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
   1386			 &jbd2_trans_commit_key, 0);
   1387
   1388	/* journal descriptor can store up to n blocks -bzzz */
   1389	journal->j_blocksize = blocksize;
   1390	journal->j_dev = bdev;
   1391	journal->j_fs_dev = fs_dev;
   1392	journal->j_blk_offset = start;
   1393	journal->j_total_len = len;
   1394	/* We need enough buffers to write out full descriptor block. */
   1395	n = journal->j_blocksize / jbd2_min_tag_size();
   1396	journal->j_wbufsize = n;
   1397	journal->j_fc_wbuf = NULL;
   1398	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
   1399					GFP_KERNEL);
   1400	if (!journal->j_wbuf)
   1401		goto err_cleanup;
   1402
   1403	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
   1404	if (!bh) {
   1405		pr_err("%s: Cannot get buffer for journal superblock\n",
   1406			__func__);
   1407		goto err_cleanup;
   1408	}
   1409	journal->j_sb_buffer = bh;
   1410	journal->j_superblock = (journal_superblock_t *)bh->b_data;
   1411
   1412	journal->j_shrink_transaction = NULL;
   1413	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
   1414	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
   1415	journal->j_shrinker.seeks = DEFAULT_SEEKS;
   1416	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
   1417
   1418	if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
   1419		goto err_cleanup;
   1420
   1421	if (register_shrinker(&journal->j_shrinker)) {
   1422		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
   1423		goto err_cleanup;
   1424	}
   1425	return journal;
   1426
   1427err_cleanup:
   1428	brelse(journal->j_sb_buffer);
   1429	kfree(journal->j_wbuf);
   1430	jbd2_journal_destroy_revoke(journal);
   1431	kfree(journal);
   1432	return NULL;
   1433}
   1434
   1435/* jbd2_journal_init_dev and jbd2_journal_init_inode:
   1436 *
   1437 * Create a journal structure assigned some fixed set of disk blocks to
   1438 * the journal.  We don't actually touch those disk blocks yet, but we
   1439 * need to set up all of the mapping information to tell the journaling
   1440 * system where the journal blocks are.
   1441 *
   1442 */
   1443
   1444/**
   1445 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
   1446 *  @bdev: Block device on which to create the journal
   1447 *  @fs_dev: Device which hold journalled filesystem for this journal.
   1448 *  @start: Block nr Start of journal.
   1449 *  @len:  Length of the journal in blocks.
   1450 *  @blocksize: blocksize of journalling device
   1451 *
   1452 *  Returns: a newly created journal_t *
   1453 *
   1454 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
   1455 *  range of blocks on an arbitrary block device.
   1456 *
   1457 */
   1458journal_t *jbd2_journal_init_dev(struct block_device *bdev,
   1459			struct block_device *fs_dev,
   1460			unsigned long long start, int len, int blocksize)
   1461{
   1462	journal_t *journal;
   1463
   1464	journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
   1465	if (!journal)
   1466		return NULL;
   1467
   1468	bdevname(journal->j_dev, journal->j_devname);
   1469	strreplace(journal->j_devname, '/', '!');
   1470	jbd2_stats_proc_init(journal);
   1471
   1472	return journal;
   1473}
   1474
   1475/**
   1476 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
   1477 *  @inode: An inode to create the journal in
   1478 *
   1479 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
   1480 * the journal.  The inode must exist already, must support bmap() and
   1481 * must have all data blocks preallocated.
   1482 */
   1483journal_t *jbd2_journal_init_inode(struct inode *inode)
   1484{
   1485	journal_t *journal;
   1486	sector_t blocknr;
   1487	char *p;
   1488	int err = 0;
   1489
   1490	blocknr = 0;
   1491	err = bmap(inode, &blocknr);
   1492
   1493	if (err || !blocknr) {
   1494		pr_err("%s: Cannot locate journal superblock\n",
   1495			__func__);
   1496		return NULL;
   1497	}
   1498
   1499	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
   1500		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
   1501		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
   1502
   1503	journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
   1504			blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
   1505			inode->i_sb->s_blocksize);
   1506	if (!journal)
   1507		return NULL;
   1508
   1509	journal->j_inode = inode;
   1510	bdevname(journal->j_dev, journal->j_devname);
   1511	p = strreplace(journal->j_devname, '/', '!');
   1512	sprintf(p, "-%lu", journal->j_inode->i_ino);
   1513	jbd2_stats_proc_init(journal);
   1514
   1515	return journal;
   1516}
   1517
   1518/*
   1519 * If the journal init or create aborts, we need to mark the journal
   1520 * superblock as being NULL to prevent the journal destroy from writing
   1521 * back a bogus superblock.
   1522 */
   1523static void journal_fail_superblock(journal_t *journal)
   1524{
   1525	struct buffer_head *bh = journal->j_sb_buffer;
   1526	brelse(bh);
   1527	journal->j_sb_buffer = NULL;
   1528}
   1529
   1530/*
   1531 * Given a journal_t structure, initialise the various fields for
   1532 * startup of a new journaling session.  We use this both when creating
   1533 * a journal, and after recovering an old journal to reset it for
   1534 * subsequent use.
   1535 */
   1536
   1537static int journal_reset(journal_t *journal)
   1538{
   1539	journal_superblock_t *sb = journal->j_superblock;
   1540	unsigned long long first, last;
   1541
   1542	first = be32_to_cpu(sb->s_first);
   1543	last = be32_to_cpu(sb->s_maxlen);
   1544	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
   1545		printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
   1546		       first, last);
   1547		journal_fail_superblock(journal);
   1548		return -EINVAL;
   1549	}
   1550
   1551	journal->j_first = first;
   1552	journal->j_last = last;
   1553
   1554	journal->j_head = journal->j_first;
   1555	journal->j_tail = journal->j_first;
   1556	journal->j_free = journal->j_last - journal->j_first;
   1557
   1558	journal->j_tail_sequence = journal->j_transaction_sequence;
   1559	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
   1560	journal->j_commit_request = journal->j_commit_sequence;
   1561
   1562	journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
   1563
   1564	/*
   1565	 * Now that journal recovery is done, turn fast commits off here. This
   1566	 * way, if fast commit was enabled before the crash but if now FS has
   1567	 * disabled it, we don't enable fast commits.
   1568	 */
   1569	jbd2_clear_feature_fast_commit(journal);
   1570
   1571	/*
   1572	 * As a special case, if the on-disk copy is already marked as needing
   1573	 * no recovery (s_start == 0), then we can safely defer the superblock
   1574	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
   1575	 * attempting a write to a potential-readonly device.
   1576	 */
   1577	if (sb->s_start == 0) {
   1578		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
   1579			"(start %ld, seq %u, errno %d)\n",
   1580			journal->j_tail, journal->j_tail_sequence,
   1581			journal->j_errno);
   1582		journal->j_flags |= JBD2_FLUSHED;
   1583	} else {
   1584		/* Lock here to make assertions happy... */
   1585		mutex_lock_io(&journal->j_checkpoint_mutex);
   1586		/*
   1587		 * Update log tail information. We use REQ_FUA since new
   1588		 * transaction will start reusing journal space and so we
   1589		 * must make sure information about current log tail is on
   1590		 * disk before that.
   1591		 */
   1592		jbd2_journal_update_sb_log_tail(journal,
   1593						journal->j_tail_sequence,
   1594						journal->j_tail,
   1595						REQ_SYNC | REQ_FUA);
   1596		mutex_unlock(&journal->j_checkpoint_mutex);
   1597	}
   1598	return jbd2_journal_start_thread(journal);
   1599}
   1600
   1601/*
   1602 * This function expects that the caller will have locked the journal
   1603 * buffer head, and will return with it unlocked
   1604 */
   1605static int jbd2_write_superblock(journal_t *journal, int write_flags)
   1606{
   1607	struct buffer_head *bh = journal->j_sb_buffer;
   1608	journal_superblock_t *sb = journal->j_superblock;
   1609	int ret;
   1610
   1611	/* Buffer got discarded which means block device got invalidated */
   1612	if (!buffer_mapped(bh)) {
   1613		unlock_buffer(bh);
   1614		return -EIO;
   1615	}
   1616
   1617	trace_jbd2_write_superblock(journal, write_flags);
   1618	if (!(journal->j_flags & JBD2_BARRIER))
   1619		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
   1620	if (buffer_write_io_error(bh)) {
   1621		/*
   1622		 * Oh, dear.  A previous attempt to write the journal
   1623		 * superblock failed.  This could happen because the
   1624		 * USB device was yanked out.  Or it could happen to
   1625		 * be a transient write error and maybe the block will
   1626		 * be remapped.  Nothing we can do but to retry the
   1627		 * write and hope for the best.
   1628		 */
   1629		printk(KERN_ERR "JBD2: previous I/O error detected "
   1630		       "for journal superblock update for %s.\n",
   1631		       journal->j_devname);
   1632		clear_buffer_write_io_error(bh);
   1633		set_buffer_uptodate(bh);
   1634	}
   1635	if (jbd2_journal_has_csum_v2or3(journal))
   1636		sb->s_checksum = jbd2_superblock_csum(journal, sb);
   1637	get_bh(bh);
   1638	bh->b_end_io = end_buffer_write_sync;
   1639	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
   1640	wait_on_buffer(bh);
   1641	if (buffer_write_io_error(bh)) {
   1642		clear_buffer_write_io_error(bh);
   1643		set_buffer_uptodate(bh);
   1644		ret = -EIO;
   1645	}
   1646	if (ret) {
   1647		printk(KERN_ERR "JBD2: Error %d detected when updating "
   1648		       "journal superblock for %s.\n", ret,
   1649		       journal->j_devname);
   1650		if (!is_journal_aborted(journal))
   1651			jbd2_journal_abort(journal, ret);
   1652	}
   1653
   1654	return ret;
   1655}
   1656
   1657/**
   1658 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
   1659 * @journal: The journal to update.
   1660 * @tail_tid: TID of the new transaction at the tail of the log
   1661 * @tail_block: The first block of the transaction at the tail of the log
   1662 * @write_op: With which operation should we write the journal sb
   1663 *
   1664 * Update a journal's superblock information about log tail and write it to
   1665 * disk, waiting for the IO to complete.
   1666 */
   1667int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
   1668				     unsigned long tail_block, int write_op)
   1669{
   1670	journal_superblock_t *sb = journal->j_superblock;
   1671	int ret;
   1672
   1673	if (is_journal_aborted(journal))
   1674		return -EIO;
   1675	if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) {
   1676		jbd2_journal_abort(journal, -EIO);
   1677		return -EIO;
   1678	}
   1679
   1680	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
   1681	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
   1682		  tail_block, tail_tid);
   1683
   1684	lock_buffer(journal->j_sb_buffer);
   1685	sb->s_sequence = cpu_to_be32(tail_tid);
   1686	sb->s_start    = cpu_to_be32(tail_block);
   1687
   1688	ret = jbd2_write_superblock(journal, write_op);
   1689	if (ret)
   1690		goto out;
   1691
   1692	/* Log is no longer empty */
   1693	write_lock(&journal->j_state_lock);
   1694	WARN_ON(!sb->s_sequence);
   1695	journal->j_flags &= ~JBD2_FLUSHED;
   1696	write_unlock(&journal->j_state_lock);
   1697
   1698out:
   1699	return ret;
   1700}
   1701
   1702/**
   1703 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
   1704 * @journal: The journal to update.
   1705 * @write_op: With which operation should we write the journal sb
   1706 *
   1707 * Update a journal's dynamic superblock fields to show that journal is empty.
   1708 * Write updated superblock to disk waiting for IO to complete.
   1709 */
   1710static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
   1711{
   1712	journal_superblock_t *sb = journal->j_superblock;
   1713	bool had_fast_commit = false;
   1714
   1715	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
   1716	lock_buffer(journal->j_sb_buffer);
   1717	if (sb->s_start == 0) {		/* Is it already empty? */
   1718		unlock_buffer(journal->j_sb_buffer);
   1719		return;
   1720	}
   1721
   1722	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
   1723		  journal->j_tail_sequence);
   1724
   1725	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
   1726	sb->s_start    = cpu_to_be32(0);
   1727	if (jbd2_has_feature_fast_commit(journal)) {
   1728		/*
   1729		 * When journal is clean, no need to commit fast commit flag and
   1730		 * make file system incompatible with older kernels.
   1731		 */
   1732		jbd2_clear_feature_fast_commit(journal);
   1733		had_fast_commit = true;
   1734	}
   1735
   1736	jbd2_write_superblock(journal, write_op);
   1737
   1738	if (had_fast_commit)
   1739		jbd2_set_feature_fast_commit(journal);
   1740
   1741	/* Log is no longer empty */
   1742	write_lock(&journal->j_state_lock);
   1743	journal->j_flags |= JBD2_FLUSHED;
   1744	write_unlock(&journal->j_state_lock);
   1745}
   1746
   1747/**
   1748 * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
   1749 * @journal: The journal to erase.
 * @flags: A discard/zeroout request is sent for each physically contiguous
   1751 *	region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
   1752 *	JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
   1753 *	to perform.
   1754 *
   1755 * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
   1756 * will be explicitly written if no hardware offload is available, see
   1757 * blkdev_issue_zeroout for more details.
   1758 */
   1759static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
   1760{
   1761	int err = 0;
   1762	unsigned long block, log_offset; /* logical */
   1763	unsigned long long phys_block, block_start, block_stop; /* physical */
   1764	loff_t byte_start, byte_stop, byte_count;
   1765
   1766	/* flags must be set to either discard or zeroout */
   1767	if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
   1768			((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
   1769			(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
   1770		return -EINVAL;
   1771
   1772	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
   1773	    !bdev_max_discard_sectors(journal->j_dev))
   1774		return -EOPNOTSUPP;
   1775
   1776	/*
   1777	 * lookup block mapping and issue discard/zeroout for each
   1778	 * contiguous region
   1779	 */
   1780	log_offset = be32_to_cpu(journal->j_superblock->s_first);
   1781	block_start =  ~0ULL;
   1782	for (block = log_offset; block < journal->j_total_len; block++) {
   1783		err = jbd2_journal_bmap(journal, block, &phys_block);
   1784		if (err) {
   1785			pr_err("JBD2: bad block at offset %lu", block);
   1786			return err;
   1787		}
   1788
   1789		if (block_start == ~0ULL) {
   1790			block_start = phys_block;
   1791			block_stop = block_start - 1;
   1792		}
   1793
   1794		/*
   1795		 * last block not contiguous with current block,
   1796		 * process last contiguous region and return to this block on
   1797		 * next loop
   1798		 */
   1799		if (phys_block != block_stop + 1) {
   1800			block--;
   1801		} else {
   1802			block_stop++;
   1803			/*
   1804			 * if this isn't the last block of journal,
   1805			 * no need to process now because next block may also
   1806			 * be part of this contiguous region
   1807			 */
   1808			if (block != journal->j_total_len - 1)
   1809				continue;
   1810		}
   1811
   1812		/*
   1813		 * end of contiguous region or this is last block of journal,
   1814		 * take care of the region
   1815		 */
   1816		byte_start = block_start * journal->j_blocksize;
   1817		byte_stop = block_stop * journal->j_blocksize;
   1818		byte_count = (block_stop - block_start + 1) *
   1819				journal->j_blocksize;
   1820
   1821		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
   1822				byte_start, byte_stop);
   1823
   1824		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
   1825			err = blkdev_issue_discard(journal->j_dev,
   1826					byte_start >> SECTOR_SHIFT,
   1827					byte_count >> SECTOR_SHIFT,
   1828					GFP_NOFS);
   1829		} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
   1830			err = blkdev_issue_zeroout(journal->j_dev,
   1831					byte_start >> SECTOR_SHIFT,
   1832					byte_count >> SECTOR_SHIFT,
   1833					GFP_NOFS, 0);
   1834		}
   1835
   1836		if (unlikely(err != 0)) {
   1837			pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
   1838					err, block_start, block_stop);
   1839			return err;
   1840		}
   1841
   1842		/* reset start and stop after processing a region */
   1843		block_start = ~0ULL;
   1844	}
   1845
   1846	return blkdev_issue_flush(journal->j_dev);
   1847}
   1848
   1849/**
   1850 * jbd2_journal_update_sb_errno() - Update error in the journal.
   1851 * @journal: The journal to update.
   1852 *
   1853 * Update a journal's errno.  Write updated superblock to disk waiting for IO
   1854 * to complete.
   1855 */
   1856void jbd2_journal_update_sb_errno(journal_t *journal)
   1857{
   1858	journal_superblock_t *sb = journal->j_superblock;
   1859	int errcode;
   1860
   1861	lock_buffer(journal->j_sb_buffer);
   1862	errcode = journal->j_errno;
   1863	if (errcode == -ESHUTDOWN)
   1864		errcode = 0;
   1865	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
   1866	sb->s_errno    = cpu_to_be32(errcode);
   1867
   1868	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
   1869}
   1870EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
   1871
   1872static int journal_revoke_records_per_block(journal_t *journal)
   1873{
   1874	int record_size;
   1875	int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
   1876
   1877	if (jbd2_has_feature_64bit(journal))
   1878		record_size = 8;
   1879	else
   1880		record_size = 4;
   1881
   1882	if (jbd2_journal_has_csum_v2or3(journal))
   1883		space -= sizeof(struct jbd2_journal_block_tail);
   1884	return space / record_size;
   1885}
   1886
   1887/*
   1888 * Read the superblock for a given journal, performing initial
   1889 * validation of the format.
   1890 */
   1891static int journal_get_superblock(journal_t *journal)
   1892{
   1893	struct buffer_head *bh;
   1894	journal_superblock_t *sb;
   1895	int err = -EIO;
   1896
   1897	bh = journal->j_sb_buffer;
   1898
   1899	J_ASSERT(bh != NULL);
   1900	if (!buffer_uptodate(bh)) {
   1901		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
   1902		wait_on_buffer(bh);
   1903		if (!buffer_uptodate(bh)) {
   1904			printk(KERN_ERR
   1905				"JBD2: IO error reading journal superblock\n");
   1906			goto out;
   1907		}
   1908	}
   1909
   1910	if (buffer_verified(bh))
   1911		return 0;
   1912
   1913	sb = journal->j_superblock;
   1914
   1915	err = -EINVAL;
   1916
   1917	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
   1918	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
   1919		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
   1920		goto out;
   1921	}
   1922
   1923	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
   1924	case JBD2_SUPERBLOCK_V1:
   1925		journal->j_format_version = 1;
   1926		break;
   1927	case JBD2_SUPERBLOCK_V2:
   1928		journal->j_format_version = 2;
   1929		break;
   1930	default:
   1931		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
   1932		goto out;
   1933	}
   1934
   1935	if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
   1936		journal->j_total_len = be32_to_cpu(sb->s_maxlen);
   1937	else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
   1938		printk(KERN_WARNING "JBD2: journal file too short\n");
   1939		goto out;
   1940	}
   1941
   1942	if (be32_to_cpu(sb->s_first) == 0 ||
   1943	    be32_to_cpu(sb->s_first) >= journal->j_total_len) {
   1944		printk(KERN_WARNING
   1945			"JBD2: Invalid start block of journal: %u\n",
   1946			be32_to_cpu(sb->s_first));
   1947		goto out;
   1948	}
   1949
   1950	if (jbd2_has_feature_csum2(journal) &&
   1951	    jbd2_has_feature_csum3(journal)) {
   1952		/* Can't have checksum v2 and v3 at the same time! */
   1953		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
   1954		       "at the same time!\n");
   1955		goto out;
   1956	}
   1957
   1958	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
   1959	    jbd2_has_feature_checksum(journal)) {
   1960		/* Can't have checksum v1 and v2 on at the same time! */
   1961		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
   1962		       "at the same time!\n");
   1963		goto out;
   1964	}
   1965
   1966	if (!jbd2_verify_csum_type(journal, sb)) {
   1967		printk(KERN_ERR "JBD2: Unknown checksum type\n");
   1968		goto out;
   1969	}
   1970
   1971	/* Load the checksum driver */
   1972	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
   1973		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
   1974		if (IS_ERR(journal->j_chksum_driver)) {
   1975			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
   1976			err = PTR_ERR(journal->j_chksum_driver);
   1977			journal->j_chksum_driver = NULL;
   1978			goto out;
   1979		}
   1980	}
   1981
   1982	if (jbd2_journal_has_csum_v2or3(journal)) {
   1983		/* Check superblock checksum */
   1984		if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
   1985			printk(KERN_ERR "JBD2: journal checksum error\n");
   1986			err = -EFSBADCRC;
   1987			goto out;
   1988		}
   1989
   1990		/* Precompute checksum seed for all metadata */
   1991		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
   1992						   sizeof(sb->s_uuid));
   1993	}
   1994
   1995	journal->j_revoke_records_per_block =
   1996				journal_revoke_records_per_block(journal);
   1997	set_buffer_verified(bh);
   1998
   1999	return 0;
   2000
   2001out:
   2002	journal_fail_superblock(journal);
   2003	return err;
   2004}
   2005
   2006/*
   2007 * Load the on-disk journal superblock and read the key fields into the
   2008 * journal_t.
   2009 */
   2010
   2011static int load_superblock(journal_t *journal)
   2012{
   2013	int err;
   2014	journal_superblock_t *sb;
   2015	int num_fc_blocks;
   2016
   2017	err = journal_get_superblock(journal);
   2018	if (err)
   2019		return err;
   2020
   2021	sb = journal->j_superblock;
   2022
   2023	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
   2024	journal->j_tail = be32_to_cpu(sb->s_start);
   2025	journal->j_first = be32_to_cpu(sb->s_first);
   2026	journal->j_errno = be32_to_cpu(sb->s_errno);
   2027	journal->j_last = be32_to_cpu(sb->s_maxlen);
   2028
   2029	if (jbd2_has_feature_fast_commit(journal)) {
   2030		journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
   2031		num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
   2032		if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
   2033			journal->j_last = journal->j_fc_last - num_fc_blocks;
   2034		journal->j_fc_first = journal->j_last + 1;
   2035		journal->j_fc_off = 0;
   2036	}
   2037
   2038	return 0;
   2039}
   2040
   2041
   2042/**
   2043 * jbd2_journal_load() - Read journal from disk.
   2044 * @journal: Journal to act on.
   2045 *
   2046 * Given a journal_t structure which tells us which disk blocks contain
   2047 * a journal, read the journal from disk to initialise the in-memory
   2048 * structures.
   2049 */
   2050int jbd2_journal_load(journal_t *journal)
   2051{
   2052	int err;
   2053	journal_superblock_t *sb;
   2054
   2055	err = load_superblock(journal);
   2056	if (err)
   2057		return err;
   2058
   2059	sb = journal->j_superblock;
   2060	/* If this is a V2 superblock, then we have to check the
   2061	 * features flags on it. */
   2062
   2063	if (journal->j_format_version >= 2) {
   2064		if ((sb->s_feature_ro_compat &
   2065		     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
   2066		    (sb->s_feature_incompat &
   2067		     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
   2068			printk(KERN_WARNING
   2069				"JBD2: Unrecognised features on journal\n");
   2070			return -EINVAL;
   2071		}
   2072	}
   2073
   2074	/*
   2075	 * Create a slab for this blocksize
   2076	 */
   2077	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
   2078	if (err)
   2079		return err;
   2080
   2081	/* Let the recovery code check whether it needs to recover any
   2082	 * data from the journal. */
   2083	if (jbd2_journal_recover(journal))
   2084		goto recovery_error;
   2085
   2086	if (journal->j_failed_commit) {
   2087		printk(KERN_ERR "JBD2: journal transaction %u on %s "
   2088		       "is corrupt.\n", journal->j_failed_commit,
   2089		       journal->j_devname);
   2090		return -EFSCORRUPTED;
   2091	}
   2092	/*
   2093	 * clear JBD2_ABORT flag initialized in journal_init_common
   2094	 * here to update log tail information with the newest seq.
   2095	 */
   2096	journal->j_flags &= ~JBD2_ABORT;
   2097
   2098	/* OK, we've finished with the dynamic journal bits:
   2099	 * reinitialise the dynamic contents of the superblock in memory
   2100	 * and reset them on disk. */
   2101	if (journal_reset(journal))
   2102		goto recovery_error;
   2103
   2104	journal->j_flags |= JBD2_LOADED;
   2105	return 0;
   2106
   2107recovery_error:
   2108	printk(KERN_WARNING "JBD2: recovery failed\n");
   2109	return -EIO;
   2110}
   2111
   2112/**
   2113 * jbd2_journal_destroy() - Release a journal_t structure.
   2114 * @journal: Journal to act on.
   2115 *
   2116 * Release a journal_t structure once it is no longer in use by the
   2117 * journaled object.
   2118 * Return <0 if we couldn't clean up the journal.
   2119 */
   2120int jbd2_journal_destroy(journal_t *journal)
   2121{
   2122	int err = 0;
   2123
   2124	/* Wait for the commit thread to wake up and die. */
   2125	journal_kill_thread(journal);
   2126
   2127	/* Force a final log commit */
   2128	if (journal->j_running_transaction)
   2129		jbd2_journal_commit_transaction(journal);
   2130
   2131	/* Force any old transactions to disk */
   2132
   2133	/* Totally anal locking here... */
   2134	spin_lock(&journal->j_list_lock);
   2135	while (journal->j_checkpoint_transactions != NULL) {
   2136		spin_unlock(&journal->j_list_lock);
   2137		mutex_lock_io(&journal->j_checkpoint_mutex);
   2138		err = jbd2_log_do_checkpoint(journal);
   2139		mutex_unlock(&journal->j_checkpoint_mutex);
   2140		/*
   2141		 * If checkpointing failed, just free the buffers to avoid
   2142		 * looping forever
   2143		 */
   2144		if (err) {
   2145			jbd2_journal_destroy_checkpoint(journal);
   2146			spin_lock(&journal->j_list_lock);
   2147			break;
   2148		}
   2149		spin_lock(&journal->j_list_lock);
   2150	}
   2151
   2152	J_ASSERT(journal->j_running_transaction == NULL);
   2153	J_ASSERT(journal->j_committing_transaction == NULL);
   2154	J_ASSERT(journal->j_checkpoint_transactions == NULL);
   2155	spin_unlock(&journal->j_list_lock);
   2156
   2157	/*
   2158	 * OK, all checkpoint transactions have been checked, now check the
   2159	 * write out io error flag and abort the journal if some buffer failed
   2160	 * to write back to the original location, otherwise the filesystem
   2161	 * may become inconsistent.
   2162	 */
   2163	if (!is_journal_aborted(journal) &&
   2164	    test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags))
   2165		jbd2_journal_abort(journal, -EIO);
   2166
   2167	if (journal->j_sb_buffer) {
   2168		if (!is_journal_aborted(journal)) {
   2169			mutex_lock_io(&journal->j_checkpoint_mutex);
   2170
   2171			write_lock(&journal->j_state_lock);
   2172			journal->j_tail_sequence =
   2173				++journal->j_transaction_sequence;
   2174			write_unlock(&journal->j_state_lock);
   2175
   2176			jbd2_mark_journal_empty(journal,
   2177					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
   2178			mutex_unlock(&journal->j_checkpoint_mutex);
   2179		} else
   2180			err = -EIO;
   2181		brelse(journal->j_sb_buffer);
   2182	}
   2183
   2184	if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
   2185		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
   2186		unregister_shrinker(&journal->j_shrinker);
   2187	}
   2188	if (journal->j_proc_entry)
   2189		jbd2_stats_proc_exit(journal);
   2190	iput(journal->j_inode);
   2191	if (journal->j_revoke)
   2192		jbd2_journal_destroy_revoke(journal);
   2193	if (journal->j_chksum_driver)
   2194		crypto_free_shash(journal->j_chksum_driver);
   2195	kfree(journal->j_fc_wbuf);
   2196	kfree(journal->j_wbuf);
   2197	kfree(journal);
   2198
   2199	return err;
   2200}
   2201
   2202
   2203/**
   2204 * jbd2_journal_check_used_features() - Check if features specified are used.
   2205 * @journal: Journal to check.
   2206 * @compat: bitmask of compatible features
   2207 * @ro: bitmask of features that force read-only mount
   2208 * @incompat: bitmask of incompatible features
   2209 *
   2210 * Check whether the journal uses all of a given set of
   2211 * features.  Return true (non-zero) if it does.
   2212 **/
   2213
   2214int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
   2215				 unsigned long ro, unsigned long incompat)
   2216{
   2217	journal_superblock_t *sb;
   2218
   2219	if (!compat && !ro && !incompat)
   2220		return 1;
   2221	/* Load journal superblock if it is not loaded yet. */
   2222	if (journal->j_format_version == 0 &&
   2223	    journal_get_superblock(journal) != 0)
   2224		return 0;
   2225	if (journal->j_format_version == 1)
   2226		return 0;
   2227
   2228	sb = journal->j_superblock;
   2229
   2230	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
   2231	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
   2232	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
   2233		return 1;
   2234
   2235	return 0;
   2236}
   2237
   2238/**
   2239 * jbd2_journal_check_available_features() - Check feature set in journalling layer
   2240 * @journal: Journal to check.
   2241 * @compat: bitmask of compatible features
   2242 * @ro: bitmask of features that force read-only mount
   2243 * @incompat: bitmask of incompatible features
   2244 *
   2245 * Check whether the journaling code supports the use of
   2246 * all of a given set of features on this journal.  Return true
   2247 * (non-zero) if it can. */
   2248
   2249int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
   2250				      unsigned long ro, unsigned long incompat)
   2251{
   2252	if (!compat && !ro && !incompat)
   2253		return 1;
   2254
   2255	/* We can support any known requested features iff the
   2256	 * superblock is in version 2.  Otherwise we fail to support any
   2257	 * extended sb features. */
   2258
   2259	if (journal->j_format_version != 2)
   2260		return 0;
   2261
   2262	if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
   2263	    (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
   2264	    (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
   2265		return 1;
   2266
   2267	return 0;
   2268}
   2269
   2270static int
   2271jbd2_journal_initialize_fast_commit(journal_t *journal)
   2272{
   2273	journal_superblock_t *sb = journal->j_superblock;
   2274	unsigned long long num_fc_blks;
   2275
   2276	num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
   2277	if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
   2278		return -ENOSPC;
   2279
   2280	/* Are we called twice? */
   2281	WARN_ON(journal->j_fc_wbuf != NULL);
   2282	journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
   2283				sizeof(struct buffer_head *), GFP_KERNEL);
   2284	if (!journal->j_fc_wbuf)
   2285		return -ENOMEM;
   2286
   2287	journal->j_fc_wbufsize = num_fc_blks;
   2288	journal->j_fc_last = journal->j_last;
   2289	journal->j_last = journal->j_fc_last - num_fc_blks;
   2290	journal->j_fc_first = journal->j_last + 1;
   2291	journal->j_fc_off = 0;
   2292	journal->j_free = journal->j_last - journal->j_first;
   2293	journal->j_max_transaction_buffers =
   2294		jbd2_journal_get_max_txn_bufs(journal);
   2295
   2296	return 0;
   2297}
   2298
   2299/**
   2300 * jbd2_journal_set_features() - Mark a given journal feature in the superblock
   2301 * @journal: Journal to act on.
   2302 * @compat: bitmask of compatible features
   2303 * @ro: bitmask of features that force read-only mount
   2304 * @incompat: bitmask of incompatible features
   2305 *
   2306 * Mark a given journal feature as present on the
   2307 * superblock.  Returns true if the requested features could be set.
   2308 *
   2309 */
   2310
   2311int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
   2312			  unsigned long ro, unsigned long incompat)
   2313{
   2314#define INCOMPAT_FEATURE_ON(f) \
   2315		((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
   2316#define COMPAT_FEATURE_ON(f) \
   2317		((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
   2318	journal_superblock_t *sb;
   2319
   2320	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
   2321		return 1;
   2322
   2323	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
   2324		return 0;
   2325
   2326	/* If enabling v2 checksums, turn on v3 instead */
   2327	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
   2328		incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
   2329		incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
   2330	}
   2331
   2332	/* Asking for checksumming v3 and v1?  Only give them v3. */
   2333	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
   2334	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
   2335		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
   2336
   2337	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
   2338		  compat, ro, incompat);
   2339
   2340	sb = journal->j_superblock;
   2341
   2342	if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
   2343		if (jbd2_journal_initialize_fast_commit(journal)) {
   2344			pr_err("JBD2: Cannot enable fast commits.\n");
   2345			return 0;
   2346		}
   2347	}
   2348
   2349	/* Load the checksum driver if necessary */
   2350	if ((journal->j_chksum_driver == NULL) &&
   2351	    INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
   2352		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
   2353		if (IS_ERR(journal->j_chksum_driver)) {
   2354			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
   2355			journal->j_chksum_driver = NULL;
   2356			return 0;
   2357		}
   2358		/* Precompute checksum seed for all metadata */
   2359		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
   2360						   sizeof(sb->s_uuid));
   2361	}
   2362
   2363	lock_buffer(journal->j_sb_buffer);
   2364
   2365	/* If enabling v3 checksums, update superblock */
   2366	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
   2367		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
   2368		sb->s_feature_compat &=
   2369			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
   2370	}
   2371
   2372	/* If enabling v1 checksums, downgrade superblock */
   2373	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
   2374		sb->s_feature_incompat &=
   2375			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
   2376				     JBD2_FEATURE_INCOMPAT_CSUM_V3);
   2377
   2378	sb->s_feature_compat    |= cpu_to_be32(compat);
   2379	sb->s_feature_ro_compat |= cpu_to_be32(ro);
   2380	sb->s_feature_incompat  |= cpu_to_be32(incompat);
   2381	unlock_buffer(journal->j_sb_buffer);
   2382	journal->j_revoke_records_per_block =
   2383				journal_revoke_records_per_block(journal);
   2384
   2385	return 1;
   2386#undef COMPAT_FEATURE_ON
   2387#undef INCOMPAT_FEATURE_ON
   2388}
   2389
   2390/*
   2391 * jbd2_journal_clear_features() - Clear a given journal feature in the
   2392 * 				    superblock
   2393 * @journal: Journal to act on.
   2394 * @compat: bitmask of compatible features
   2395 * @ro: bitmask of features that force read-only mount
   2396 * @incompat: bitmask of incompatible features
   2397 *
   2398 * Clear a given journal feature as present on the
   2399 * superblock.
   2400 */
   2401void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
   2402				unsigned long ro, unsigned long incompat)
   2403{
   2404	journal_superblock_t *sb;
   2405
   2406	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
   2407		  compat, ro, incompat);
   2408
   2409	sb = journal->j_superblock;
   2410
   2411	sb->s_feature_compat    &= ~cpu_to_be32(compat);
   2412	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
   2413	sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
   2414	journal->j_revoke_records_per_block =
   2415				journal_revoke_records_per_block(journal);
   2416}
   2417EXPORT_SYMBOL(jbd2_journal_clear_features);
   2418
   2419/**
   2420 * jbd2_journal_flush() - Flush journal
   2421 * @journal: Journal to act on.
   2422 * @flags: optional operation on the journal blocks after the flush (see below)
   2423 *
   2424 * Flush all data for a given journal to disk and empty the journal.
   2425 * Filesystems can use this when remounting readonly to ensure that
   2426 * recovery does not need to happen on remount. Optionally, a discard or zeroout
   2427 * can be issued on the journal blocks after flushing.
   2428 *
   2429 * flags:
   2430 *	JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
   2431 *	JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
   2432 */
   2433int jbd2_journal_flush(journal_t *journal, unsigned int flags)
   2434{
   2435	int err = 0;
   2436	transaction_t *transaction = NULL;
   2437
   2438	write_lock(&journal->j_state_lock);
   2439
   2440	/* Force everything buffered to the log... */
   2441	if (journal->j_running_transaction) {
   2442		transaction = journal->j_running_transaction;
   2443		__jbd2_log_start_commit(journal, transaction->t_tid);
   2444	} else if (journal->j_committing_transaction)
   2445		transaction = journal->j_committing_transaction;
   2446
   2447	/* Wait for the log commit to complete... */
   2448	if (transaction) {
   2449		tid_t tid = transaction->t_tid;
   2450
   2451		write_unlock(&journal->j_state_lock);
   2452		jbd2_log_wait_commit(journal, tid);
   2453	} else {
   2454		write_unlock(&journal->j_state_lock);
   2455	}
   2456
   2457	/* ...and flush everything in the log out to disk. */
   2458	spin_lock(&journal->j_list_lock);
   2459	while (!err && journal->j_checkpoint_transactions != NULL) {
   2460		spin_unlock(&journal->j_list_lock);
   2461		mutex_lock_io(&journal->j_checkpoint_mutex);
   2462		err = jbd2_log_do_checkpoint(journal);
   2463		mutex_unlock(&journal->j_checkpoint_mutex);
   2464		spin_lock(&journal->j_list_lock);
   2465	}
   2466	spin_unlock(&journal->j_list_lock);
   2467
   2468	if (is_journal_aborted(journal))
   2469		return -EIO;
   2470
   2471	mutex_lock_io(&journal->j_checkpoint_mutex);
   2472	if (!err) {
   2473		err = jbd2_cleanup_journal_tail(journal);
   2474		if (err < 0) {
   2475			mutex_unlock(&journal->j_checkpoint_mutex);
   2476			goto out;
   2477		}
   2478		err = 0;
   2479	}
   2480
   2481	/* Finally, mark the journal as really needing no recovery.
   2482	 * This sets s_start==0 in the underlying superblock, which is
   2483	 * the magic code for a fully-recovered superblock.  Any future
   2484	 * commits of data to the journal will restore the current
   2485	 * s_start value. */
   2486	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
   2487
   2488	if (flags)
   2489		err = __jbd2_journal_erase(journal, flags);
   2490
   2491	mutex_unlock(&journal->j_checkpoint_mutex);
   2492	write_lock(&journal->j_state_lock);
   2493	J_ASSERT(!journal->j_running_transaction);
   2494	J_ASSERT(!journal->j_committing_transaction);
   2495	J_ASSERT(!journal->j_checkpoint_transactions);
   2496	J_ASSERT(journal->j_head == journal->j_tail);
   2497	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
   2498	write_unlock(&journal->j_state_lock);
   2499out:
   2500	return err;
   2501}
   2502
   2503/**
   2504 * jbd2_journal_wipe() - Wipe journal contents
   2505 * @journal: Journal to act on.
   2506 * @write: flag (see below)
   2507 *
   2508 * Wipe out all of the contents of a journal, safely.  This will produce
   2509 * a warning if the journal contains any valid recovery information.
   2510 * Must be called between journal_init_*() and jbd2_journal_load().
   2511 *
   2512 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
   2513 * we merely suppress recovery.
   2514 */
   2515
   2516int jbd2_journal_wipe(journal_t *journal, int write)
   2517{
   2518	int err = 0;
   2519
   2520	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
   2521
   2522	err = load_superblock(journal);
   2523	if (err)
   2524		return err;
   2525
   2526	if (!journal->j_tail)
   2527		goto no_recovery;
   2528
   2529	printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
   2530		write ? "Clearing" : "Ignoring");
   2531
   2532	err = jbd2_journal_skip_recovery(journal);
   2533	if (write) {
   2534		/* Lock to make assertions happy... */
   2535		mutex_lock_io(&journal->j_checkpoint_mutex);
   2536		jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
   2537		mutex_unlock(&journal->j_checkpoint_mutex);
   2538	}
   2539
   2540 no_recovery:
   2541	return err;
   2542}
   2543
   2544/**
   2545 * jbd2_journal_abort () - Shutdown the journal immediately.
   2546 * @journal: the journal to shutdown.
   2547 * @errno:   an error number to record in the journal indicating
   2548 *           the reason for the shutdown.
   2549 *
   2550 * Perform a complete, immediate shutdown of the ENTIRE
   2551 * journal (not of a single transaction).  This operation cannot be
   2552 * undone without closing and reopening the journal.
   2553 *
   2554 * The jbd2_journal_abort function is intended to support higher level error
   2555 * recovery mechanisms such as the ext2/ext3 remount-readonly error
   2556 * mode.
   2557 *
   2558 * Journal abort has very specific semantics.  Any existing dirty,
   2559 * unjournaled buffers in the main filesystem will still be written to
   2560 * disk by bdflush, but the journaling mechanism will be suspended
   2561 * immediately and no further transaction commits will be honoured.
   2562 *
   2563 * Any dirty, journaled buffers will be written back to disk without
   2564 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
   2565 * filesystem, but we _do_ attempt to leave as much data as possible
   2566 * behind for fsck to use for cleanup.
   2567 *
   2568 * Any attempt to get a new transaction handle on a journal which is in
   2569 * ABORT state will just result in an -EROFS error return.  A
   2570 * jbd2_journal_stop on an existing handle will return -EIO if we have
   2571 * entered abort state during the update.
   2572 *
   2573 * Recursive transactions are not disturbed by journal abort until the
   2574 * final jbd2_journal_stop, which will receive the -EIO error.
   2575 *
   2576 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
   2577 * which will be recorded (if possible) in the journal superblock.  This
   2578 * allows a client to record failure conditions in the middle of a
   2579 * transaction without having to complete the transaction to record the
   2580 * failure to disk.  ext3_error, for example, now uses this
   2581 * functionality.
   2582 *
   2583 */
   2584
   2585void jbd2_journal_abort(journal_t *journal, int errno)
   2586{
   2587	transaction_t *transaction;
   2588
   2589	/*
   2590	 * Lock the aborting procedure until everything is done, this avoid
   2591	 * races between filesystem's error handling flow (e.g. ext4_abort()),
   2592	 * ensure panic after the error info is written into journal's
   2593	 * superblock.
   2594	 */
   2595	mutex_lock(&journal->j_abort_mutex);
   2596	/*
   2597	 * ESHUTDOWN always takes precedence because a file system check
   2598	 * caused by any other journal abort error is not required after
   2599	 * a shutdown triggered.
   2600	 */
   2601	write_lock(&journal->j_state_lock);
   2602	if (journal->j_flags & JBD2_ABORT) {
   2603		int old_errno = journal->j_errno;
   2604
   2605		write_unlock(&journal->j_state_lock);
   2606		if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
   2607			journal->j_errno = errno;
   2608			jbd2_journal_update_sb_errno(journal);
   2609		}
   2610		mutex_unlock(&journal->j_abort_mutex);
   2611		return;
   2612	}
   2613
   2614	/*
   2615	 * Mark the abort as occurred and start current running transaction
   2616	 * to release all journaled buffer.
   2617	 */
   2618	pr_err("Aborting journal on device %s.\n", journal->j_devname);
   2619
   2620	journal->j_flags |= JBD2_ABORT;
   2621	journal->j_errno = errno;
   2622	transaction = journal->j_running_transaction;
   2623	if (transaction)
   2624		__jbd2_log_start_commit(journal, transaction->t_tid);
   2625	write_unlock(&journal->j_state_lock);
   2626
   2627	/*
   2628	 * Record errno to the journal super block, so that fsck and jbd2
   2629	 * layer could realise that a filesystem check is needed.
   2630	 */
   2631	jbd2_journal_update_sb_errno(journal);
   2632	mutex_unlock(&journal->j_abort_mutex);
   2633}
   2634
   2635/**
   2636 * jbd2_journal_errno() - returns the journal's error state.
   2637 * @journal: journal to examine.
   2638 *
   2639 * This is the errno number set with jbd2_journal_abort(), the last
   2640 * time the journal was mounted - if the journal was stopped
   2641 * without calling abort this will be 0.
   2642 *
 * If the journal has been aborted during the current mount, -EROFS will be
 * returned.
   2645 */
   2646int jbd2_journal_errno(journal_t *journal)
   2647{
   2648	int err;
   2649
   2650	read_lock(&journal->j_state_lock);
   2651	if (journal->j_flags & JBD2_ABORT)
   2652		err = -EROFS;
   2653	else
   2654		err = journal->j_errno;
   2655	read_unlock(&journal->j_state_lock);
   2656	return err;
   2657}
   2658
   2659/**
   2660 * jbd2_journal_clear_err() - clears the journal's error state
   2661 * @journal: journal to act on.
   2662 *
   2663 * An error must be cleared or acked to take a FS out of readonly
   2664 * mode.
   2665 */
   2666int jbd2_journal_clear_err(journal_t *journal)
   2667{
   2668	int err = 0;
   2669
   2670	write_lock(&journal->j_state_lock);
   2671	if (journal->j_flags & JBD2_ABORT)
   2672		err = -EROFS;
   2673	else
   2674		journal->j_errno = 0;
   2675	write_unlock(&journal->j_state_lock);
   2676	return err;
   2677}
   2678
   2679/**
   2680 * jbd2_journal_ack_err() - Ack journal err.
   2681 * @journal: journal to act on.
   2682 *
   2683 * An error must be cleared or acked to take a FS out of readonly
   2684 * mode.
   2685 */
   2686void jbd2_journal_ack_err(journal_t *journal)
   2687{
   2688	write_lock(&journal->j_state_lock);
   2689	if (journal->j_errno)
   2690		journal->j_flags |= JBD2_ACK_ERR;
   2691	write_unlock(&journal->j_state_lock);
   2692}
   2693
   2694int jbd2_journal_blocks_per_page(struct inode *inode)
   2695{
   2696	return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
   2697}
   2698
   2699/*
   2700 * helper functions to deal with 32 or 64bit block numbers.
   2701 */
   2702size_t journal_tag_bytes(journal_t *journal)
   2703{
   2704	size_t sz;
   2705
   2706	if (jbd2_has_feature_csum3(journal))
   2707		return sizeof(journal_block_tag3_t);
   2708
   2709	sz = sizeof(journal_block_tag_t);
   2710
   2711	if (jbd2_has_feature_csum2(journal))
   2712		sz += sizeof(__u16);
   2713
   2714	if (jbd2_has_feature_64bit(journal))
   2715		return sz;
   2716	else
   2717		return sz - sizeof(__u32);
   2718}
   2719
   2720/*
   2721 * JBD memory management
   2722 *
   2723 * These functions are used to allocate block-sized chunks of memory
   2724 * used for making copies of buffer_head data.  Very often it will be
   2725 * page-sized chunks of data, but sometimes it will be in
   2726 * sub-page-size chunks.  (For example, 16k pages on Power systems
   2727 * with a 4k block file system.)  For blocks smaller than a page, we
   2728 * use a SLAB allocator.  There are slab caches for each block size,
   2729 * which are allocated at mount time, if necessary, and we only free
   2730 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
 * this reason we don't need a mutex to protect access to jbd2_slab[]
 * when allocating or releasing memory; one is needed only in
 * jbd2_journal_create_slab().
   2734 */
   2735#define JBD2_MAX_SLABS 8
   2736static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
   2737
   2738static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
   2739	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
   2740	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
   2741};
   2742
   2743
   2744static void jbd2_journal_destroy_slabs(void)
   2745{
   2746	int i;
   2747
   2748	for (i = 0; i < JBD2_MAX_SLABS; i++) {
   2749		kmem_cache_destroy(jbd2_slab[i]);
   2750		jbd2_slab[i] = NULL;
   2751	}
   2752}
   2753
   2754static int jbd2_journal_create_slab(size_t size)
   2755{
   2756	static DEFINE_MUTEX(jbd2_slab_create_mutex);
   2757	int i = order_base_2(size) - 10;
   2758	size_t slab_size;
   2759
   2760	if (size == PAGE_SIZE)
   2761		return 0;
   2762
   2763	if (i >= JBD2_MAX_SLABS)
   2764		return -EINVAL;
   2765
   2766	if (unlikely(i < 0))
   2767		i = 0;
   2768	mutex_lock(&jbd2_slab_create_mutex);
   2769	if (jbd2_slab[i]) {
   2770		mutex_unlock(&jbd2_slab_create_mutex);
   2771		return 0;	/* Already created */
   2772	}
   2773
   2774	slab_size = 1 << (i+10);
   2775	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
   2776					 slab_size, 0, NULL);
   2777	mutex_unlock(&jbd2_slab_create_mutex);
   2778	if (!jbd2_slab[i]) {
   2779		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
   2780		return -ENOMEM;
   2781	}
   2782	return 0;
   2783}
   2784
   2785static struct kmem_cache *get_slab(size_t size)
   2786{
   2787	int i = order_base_2(size) - 10;
   2788
   2789	BUG_ON(i >= JBD2_MAX_SLABS);
   2790	if (unlikely(i < 0))
   2791		i = 0;
   2792	BUG_ON(jbd2_slab[i] == NULL);
   2793	return jbd2_slab[i];
   2794}
   2795
   2796void *jbd2_alloc(size_t size, gfp_t flags)
   2797{
   2798	void *ptr;
   2799
   2800	BUG_ON(size & (size-1)); /* Must be a power of 2 */
   2801
   2802	if (size < PAGE_SIZE)
   2803		ptr = kmem_cache_alloc(get_slab(size), flags);
   2804	else
   2805		ptr = (void *)__get_free_pages(flags, get_order(size));
   2806
   2807	/* Check alignment; SLUB has gotten this wrong in the past,
   2808	 * and this can lead to user data corruption! */
   2809	BUG_ON(((unsigned long) ptr) & (size-1));
   2810
   2811	return ptr;
   2812}
   2813
   2814void jbd2_free(void *ptr, size_t size)
   2815{
   2816	if (size < PAGE_SIZE)
   2817		kmem_cache_free(get_slab(size), ptr);
   2818	else
   2819		free_pages((unsigned long)ptr, get_order(size));
   2820};
   2821
   2822/*
   2823 * Journal_head storage management
   2824 */
/* Slab cache that all struct journal_head objects are allocated from. */
static struct kmem_cache *jbd2_journal_head_cache;
#ifdef CONFIG_JBD2_DEBUG
/*
 * Debug-only counter: bumped on every journal_head allocation, dropped on
 * every free, and reported at module exit if it is not back to zero.
 */
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
   2829
   2830static int __init jbd2_journal_init_journal_head_cache(void)
   2831{
   2832	J_ASSERT(!jbd2_journal_head_cache);
   2833	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
   2834				sizeof(struct journal_head),
   2835				0,		/* offset */
   2836				SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
   2837				NULL);		/* ctor */
   2838	if (!jbd2_journal_head_cache) {
   2839		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
   2840		return -ENOMEM;
   2841	}
   2842	return 0;
   2843}
   2844
   2845static void jbd2_journal_destroy_journal_head_cache(void)
   2846{
   2847	kmem_cache_destroy(jbd2_journal_head_cache);
   2848	jbd2_journal_head_cache = NULL;
   2849}
   2850
   2851/*
   2852 * journal_head splicing and dicing
   2853 */
   2854static struct journal_head *journal_alloc_journal_head(void)
   2855{
   2856	struct journal_head *ret;
   2857
   2858#ifdef CONFIG_JBD2_DEBUG
   2859	atomic_inc(&nr_journal_heads);
   2860#endif
   2861	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
   2862	if (!ret) {
   2863		jbd_debug(1, "out of memory for journal_head\n");
   2864		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
   2865		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
   2866				GFP_NOFS | __GFP_NOFAIL);
   2867	}
   2868	if (ret)
   2869		spin_lock_init(&ret->b_state_lock);
   2870	return ret;
   2871}
   2872
   2873static void journal_free_journal_head(struct journal_head *jh)
   2874{
   2875#ifdef CONFIG_JBD2_DEBUG
   2876	atomic_dec(&nr_journal_heads);
   2877	memset(jh, JBD2_POISON_FREE, sizeof(*jh));
   2878#endif
   2879	kmem_cache_free(jbd2_journal_head_cache, jh);
   2880}
   2881
   2882/*
   2883 * A journal_head is attached to a buffer_head whenever JBD has an
   2884 * interest in the buffer.
   2885 *
   2886 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
   2887 * is set.  This bit is tested in core kernel code where we need to take
   2888 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
   2889 * there.
   2890 *
   2891 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
   2892 *
   2893 * When a buffer has its BH_JBD bit set it is immune from being released by
   2894 * core kernel code, mainly via ->b_count.
   2895 *
   2896 * A journal_head is detached from its buffer_head when the journal_head's
   2897 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
   2898 * transaction (b_cp_transaction) hold their references to b_jcount.
   2899 *
   2900 * Various places in the kernel want to attach a journal_head to a buffer_head
   2901 * _before_ attaching the journal_head to a transaction.  To protect the
   2902 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
   2903 * journal_head's b_jcount refcount by one.  The caller must call
   2904 * jbd2_journal_put_journal_head() to undo this.
   2905 *
   2906 * So the typical usage would be:
   2907 *
   2908 *	(Attach a journal_head if needed.  Increments b_jcount)
   2909 *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
   2910 *	...
   2911 *      (Get another reference for transaction)
   2912 *	jbd2_journal_grab_journal_head(bh);
   2913 *	jh->b_transaction = xxx;
   2914 *	(Put original reference)
   2915 *	jbd2_journal_put_journal_head(jh);
   2916 */
   2917
   2918/*
   2919 * Give a buffer_head a journal_head.
   2920 *
   2921 * May sleep.
   2922 */
   2923struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
   2924{
   2925	struct journal_head *jh;
   2926	struct journal_head *new_jh = NULL;
   2927
   2928repeat:
   2929	if (!buffer_jbd(bh))
   2930		new_jh = journal_alloc_journal_head();
   2931
   2932	jbd_lock_bh_journal_head(bh);
   2933	if (buffer_jbd(bh)) {
   2934		jh = bh2jh(bh);
   2935	} else {
   2936		J_ASSERT_BH(bh,
   2937			(atomic_read(&bh->b_count) > 0) ||
   2938			(bh->b_page && bh->b_page->mapping));
   2939
   2940		if (!new_jh) {
   2941			jbd_unlock_bh_journal_head(bh);
   2942			goto repeat;
   2943		}
   2944
   2945		jh = new_jh;
   2946		new_jh = NULL;		/* We consumed it */
   2947		set_buffer_jbd(bh);
   2948		bh->b_private = jh;
   2949		jh->b_bh = bh;
   2950		get_bh(bh);
   2951		BUFFER_TRACE(bh, "added journal_head");
   2952	}
   2953	jh->b_jcount++;
   2954	jbd_unlock_bh_journal_head(bh);
   2955	if (new_jh)
   2956		journal_free_journal_head(new_jh);
   2957	return bh->b_private;
   2958}
   2959
   2960/*
   2961 * Grab a ref against this buffer_head's journal_head.  If it ended up not
   2962 * having a journal_head, return NULL
   2963 */
   2964struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
   2965{
   2966	struct journal_head *jh = NULL;
   2967
   2968	jbd_lock_bh_journal_head(bh);
   2969	if (buffer_jbd(bh)) {
   2970		jh = bh2jh(bh);
   2971		jh->b_jcount++;
   2972	}
   2973	jbd_unlock_bh_journal_head(bh);
   2974	return jh;
   2975}
   2976EXPORT_SYMBOL(jbd2_journal_grab_journal_head);
   2977
   2978static void __journal_remove_journal_head(struct buffer_head *bh)
   2979{
   2980	struct journal_head *jh = bh2jh(bh);
   2981
   2982	J_ASSERT_JH(jh, jh->b_transaction == NULL);
   2983	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
   2984	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
   2985	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
   2986	J_ASSERT_BH(bh, buffer_jbd(bh));
   2987	J_ASSERT_BH(bh, jh2bh(jh) == bh);
   2988	BUFFER_TRACE(bh, "remove journal_head");
   2989
   2990	/* Unlink before dropping the lock */
   2991	bh->b_private = NULL;
   2992	jh->b_bh = NULL;	/* debug, really */
   2993	clear_buffer_jbd(bh);
   2994}
   2995
   2996static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
   2997{
   2998	if (jh->b_frozen_data) {
   2999		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
   3000		jbd2_free(jh->b_frozen_data, b_size);
   3001	}
   3002	if (jh->b_committed_data) {
   3003		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
   3004		jbd2_free(jh->b_committed_data, b_size);
   3005	}
   3006	journal_free_journal_head(jh);
   3007}
   3008
   3009/*
   3010 * Drop a reference on the passed journal_head.  If it fell to zero then
   3011 * release the journal_head from the buffer_head.
   3012 */
   3013void jbd2_journal_put_journal_head(struct journal_head *jh)
   3014{
   3015	struct buffer_head *bh = jh2bh(jh);
   3016
   3017	jbd_lock_bh_journal_head(bh);
   3018	J_ASSERT_JH(jh, jh->b_jcount > 0);
   3019	--jh->b_jcount;
   3020	if (!jh->b_jcount) {
   3021		__journal_remove_journal_head(bh);
   3022		jbd_unlock_bh_journal_head(bh);
   3023		journal_release_journal_head(jh, bh->b_size);
   3024		__brelse(bh);
   3025	} else {
   3026		jbd_unlock_bh_journal_head(bh);
   3027	}
   3028}
   3029EXPORT_SYMBOL(jbd2_journal_put_journal_head);
   3030
   3031/*
   3032 * Initialize jbd inode head
   3033 */
   3034void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
   3035{
   3036	jinode->i_transaction = NULL;
   3037	jinode->i_next_transaction = NULL;
   3038	jinode->i_vfs_inode = inode;
   3039	jinode->i_flags = 0;
   3040	jinode->i_dirty_start = 0;
   3041	jinode->i_dirty_end = 0;
   3042	INIT_LIST_HEAD(&jinode->i_list);
   3043}
   3044
   3045/*
   3046 * Function to be called before we start removing inode from memory (i.e.,
   3047 * clear_inode() is a fine place to be called from). It removes inode from
   3048 * transaction's lists.
   3049 */
   3050void jbd2_journal_release_jbd_inode(journal_t *journal,
   3051				    struct jbd2_inode *jinode)
   3052{
   3053	if (!journal)
   3054		return;
   3055restart:
   3056	spin_lock(&journal->j_list_lock);
   3057	/* Is commit writing out inode - we have to wait */
   3058	if (jinode->i_flags & JI_COMMIT_RUNNING) {
   3059		wait_queue_head_t *wq;
   3060		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
   3061		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
   3062		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
   3063		spin_unlock(&journal->j_list_lock);
   3064		schedule();
   3065		finish_wait(wq, &wait.wq_entry);
   3066		goto restart;
   3067	}
   3068
   3069	if (jinode->i_transaction) {
   3070		list_del(&jinode->i_list);
   3071		jinode->i_transaction = NULL;
   3072	}
   3073	spin_unlock(&journal->j_list_lock);
   3074}
   3075
   3076
   3077#ifdef CONFIG_PROC_FS
   3078
   3079#define JBD2_STATS_PROC_NAME "fs/jbd2"
   3080
   3081static void __init jbd2_create_jbd_stats_proc_entry(void)
   3082{
   3083	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
   3084}
   3085
   3086static void __exit jbd2_remove_jbd_stats_proc_entry(void)
   3087{
   3088	if (proc_jbd2_stats)
   3089		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
   3090}
   3091
   3092#else
   3093
   3094#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
   3095#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
   3096
   3097#endif
   3098
   3099struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
   3100
   3101static int __init jbd2_journal_init_inode_cache(void)
   3102{
   3103	J_ASSERT(!jbd2_inode_cache);
   3104	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
   3105	if (!jbd2_inode_cache) {
   3106		pr_emerg("JBD2: failed to create inode cache\n");
   3107		return -ENOMEM;
   3108	}
   3109	return 0;
   3110}
   3111
   3112static int __init jbd2_journal_init_handle_cache(void)
   3113{
   3114	J_ASSERT(!jbd2_handle_cache);
   3115	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
   3116	if (!jbd2_handle_cache) {
   3117		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
   3118		return -ENOMEM;
   3119	}
   3120	return 0;
   3121}
   3122
   3123static void jbd2_journal_destroy_inode_cache(void)
   3124{
   3125	kmem_cache_destroy(jbd2_inode_cache);
   3126	jbd2_inode_cache = NULL;
   3127}
   3128
   3129static void jbd2_journal_destroy_handle_cache(void)
   3130{
   3131	kmem_cache_destroy(jbd2_handle_cache);
   3132	jbd2_handle_cache = NULL;
   3133}
   3134
   3135/*
   3136 * Module startup and shutdown
   3137 */
   3138
   3139static int __init journal_init_caches(void)
   3140{
   3141	int ret;
   3142
   3143	ret = jbd2_journal_init_revoke_record_cache();
   3144	if (ret == 0)
   3145		ret = jbd2_journal_init_revoke_table_cache();
   3146	if (ret == 0)
   3147		ret = jbd2_journal_init_journal_head_cache();
   3148	if (ret == 0)
   3149		ret = jbd2_journal_init_handle_cache();
   3150	if (ret == 0)
   3151		ret = jbd2_journal_init_inode_cache();
   3152	if (ret == 0)
   3153		ret = jbd2_journal_init_transaction_cache();
   3154	return ret;
   3155}
   3156
   3157static void jbd2_journal_destroy_caches(void)
   3158{
   3159	jbd2_journal_destroy_revoke_record_cache();
   3160	jbd2_journal_destroy_revoke_table_cache();
   3161	jbd2_journal_destroy_journal_head_cache();
   3162	jbd2_journal_destroy_handle_cache();
   3163	jbd2_journal_destroy_inode_cache();
   3164	jbd2_journal_destroy_transaction_cache();
   3165	jbd2_journal_destroy_slabs();
   3166}
   3167
   3168static int __init journal_init(void)
   3169{
   3170	int ret;
   3171
   3172	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
   3173
   3174	ret = journal_init_caches();
   3175	if (ret == 0) {
   3176		jbd2_create_jbd_stats_proc_entry();
   3177	} else {
   3178		jbd2_journal_destroy_caches();
   3179	}
   3180	return ret;
   3181}
   3182
   3183static void __exit journal_exit(void)
   3184{
   3185#ifdef CONFIG_JBD2_DEBUG
   3186	int n = atomic_read(&nr_journal_heads);
   3187	if (n)
   3188		printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
   3189#endif
   3190	jbd2_remove_jbd_stats_proc_entry();
   3191	jbd2_journal_destroy_caches();
   3192}
   3193
/*
 * Module metadata: GPL licence, with journal_init() and journal_exit()
 * registered as the module's init and exit handlers.
 */
MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);
   3197