cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

journal.c (123625B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Write ahead logging implementation copyright Chris Mason 2000
      4 *
      5 * The background commits make this code very interrelated, and
      6 * overly complex.  I need to rethink things a bit.... The major players:
      7 *
      8 * journal_begin -- call with the number of blocks you expect to log.
      9 *                  If the current transaction is too
     10 *		    old, it will block until the current transaction is
     11 *		    finished, and then start a new one.
     12 *		    Usually, your transaction will get joined in with
     13 *                  previous ones for speed.
     14 *
     15 * journal_join  -- same as journal_begin, but won't block on the current
     16 *                  transaction regardless of age.  Don't ever call
     17 *                  this.  Ever.  There are only two places it should be
     18 *                  called from, and they are both inside this file.
     19 *
     20 * journal_mark_dirty -- adds blocks into this transaction.  clears any flags
     21 *                       that might make them get sent to disk
     22 *                       and then marks them BH_JDirty.  Puts the buffer head
     23 *                       into the current transaction hash.
     24 *
     25 * journal_end -- if the current transaction is batchable, it does nothing;
     26 *                   otherwise, it could do an async/synchronous commit, or
     27 *                   a full flush of all log and real blocks in the
     28 *                   transaction.
     29 *
     30 * flush_old_commits -- if the current transaction is too old, it is ended and
     31 *                      commit blocks are sent to disk.  Forces commit blocks
     32 *                      to disk for all backgrounded commits that have been
     33 *                      around too long.
     34 *		     -- Note, if you call this as an immediate flush from
     35 *		        within kupdate, it will ignore the immediate flag
     36 */
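/*
 * Editorial sketch, not part of the original file: a minimal caller of
 * the API described above.  It assumes the journal_begin /
 * journal_mark_dirty / journal_end signatures declared in reiserfs.h
 * and that the caller holds the reiserfs write lock; the function name
 * and the one-block reservation are made up for illustration.
 */
#if 0
static int example_log_one_buffer(struct super_block *sb,
				  struct buffer_head *bh)
{
	struct reiserfs_transaction_handle th;
	int err;

	/* reserve log space; this may join a running transaction */
	err = journal_begin(&th, sb, 1);
	if (err)
		return err;
	/* tell the journal we are about to modify bh */
	reiserfs_prepare_for_journal(sb, bh, 1);
	/* ... modify bh->b_data here ... */
	/* hash bh into the current transaction as BH_JDirty */
	journal_mark_dirty(&th, bh);
	/* batchable end: usually just leaves the transaction running */
	return journal_end(&th);
}
#endif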
     37
     38#include <linux/time.h>
     39#include <linux/semaphore.h>
     40#include <linux/vmalloc.h>
     41#include "reiserfs.h"
     42#include <linux/kernel.h>
     43#include <linux/errno.h>
     44#include <linux/fcntl.h>
     45#include <linux/stat.h>
     46#include <linux/string.h>
     47#include <linux/buffer_head.h>
     48#include <linux/workqueue.h>
     49#include <linux/writeback.h>
     50#include <linux/blkdev.h>
     51#include <linux/backing-dev.h>
     52#include <linux/uaccess.h>
     53#include <linux/slab.h>
     54
     55
     56/* gets a struct reiserfs_journal_list * from a list head */
     57#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
     58                               j_list))
     59
     60/* must be correct to keep the desc and commit structs at 4k */
     61#define JOURNAL_TRANS_HALF 1018
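/*
 * Editorial note: with the default 4k journal block size this is
 * (4096 - 24) / 4 = 1018 __le32 block numbers.  Per the struct layouts
 * in reiserfs.h, the desc block spends its other 24 bytes on three
 * header fields plus a 12-byte magic, and the commit block spends them
 * on two header fields plus a 16-byte digest.
 */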
     62#define BUFNR 64		/*read ahead */
     63
     64/* cnode state bits.  Move these into reiserfs_fs.h */
     65
     66/* this block was freed, and can't be written.  */
     67#define BLOCK_FREED 2
     68/* this block was freed during this transaction, and can't be written */
     69#define BLOCK_FREED_HOLDER 3
     70
     71/* used in flush_journal_list */
     72#define BLOCK_NEEDS_FLUSH 4
     73#define BLOCK_DIRTIED 5
     74
     75/* journal list state bits */
     76#define LIST_TOUCHED 1
     77#define LIST_DIRTY   2
     78#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
     79
     80/* flags for do_journal_end */
     81#define FLUSH_ALL   1		/* flush commit and real blocks */
     82#define COMMIT_NOW  2		/* end and commit this transaction */
     83#define WAIT        4		/* wait for the log blocks to hit the disk */
     84
     85static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
     86static int flush_journal_list(struct super_block *s,
     87			      struct reiserfs_journal_list *jl, int flushall);
     88static int flush_commit_list(struct super_block *s,
     89			     struct reiserfs_journal_list *jl, int flushall);
     90static int can_dirty(struct reiserfs_journal_cnode *cn);
     91static int journal_join(struct reiserfs_transaction_handle *th,
     92			struct super_block *sb);
     93static void release_journal_dev(struct super_block *super,
     94			       struct reiserfs_journal *journal);
     95static void dirty_one_transaction(struct super_block *s,
     96				 struct reiserfs_journal_list *jl);
     97static void flush_async_commits(struct work_struct *work);
     98static void queue_log_writer(struct super_block *s);
     99
    100/* values for join in do_journal_begin_r */
    101enum {
    102	JBEGIN_REG = 0,		/* regular journal begin */
    103	/* join the running transaction if at all possible */
    104	JBEGIN_JOIN = 1,
    105	/* called from cleanup code, ignores aborted flag */
    106	JBEGIN_ABORT = 2,
    107};
    108
    109static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
    110			      struct super_block *sb,
    111			      unsigned long nblocks, int join);
    112
    113static void init_journal_hash(struct super_block *sb)
    114{
    115	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    116	memset(journal->j_hash_table, 0,
    117	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
    118}
    119
    120/*
    121 * clears BH_Dirty and sticks the buffer on the clean list.  Called because
    122 * I can't allow refile_buffer to make schedule happen after I've freed a
    123 * block.  Look at remove_from_transaction and journal_mark_freed for
    124 * more details.
    125 */
    126static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
    127{
    128	if (bh) {
    129		clear_buffer_dirty(bh);
    130		clear_buffer_journal_test(bh);
    131	}
    132	return 0;
    133}
    134
    135static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
    136							 *sb)
    137{
    138	struct reiserfs_bitmap_node *bn;
    139	static int id;
    140
    141	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
    142	if (!bn) {
    143		return NULL;
    144	}
    145	bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
    146	if (!bn->data) {
    147		kfree(bn);
    148		return NULL;
    149	}
    150	bn->id = id++;
    151	INIT_LIST_HEAD(&bn->list);
    152	return bn;
    153}
    154
    155static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
    156{
    157	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    158	struct reiserfs_bitmap_node *bn = NULL;
    159	struct list_head *entry = journal->j_bitmap_nodes.next;
    160
    161	journal->j_used_bitmap_nodes++;
    162repeat:
    163
    164	if (entry != &journal->j_bitmap_nodes) {
    165		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
    166		list_del(entry);
    167		memset(bn->data, 0, sb->s_blocksize);
    168		journal->j_free_bitmap_nodes--;
    169		return bn;
    170	}
    171	bn = allocate_bitmap_node(sb);
    172	if (!bn) {
    173		yield();
    174		goto repeat;
    175	}
    176	return bn;
    177}
    178static inline void free_bitmap_node(struct super_block *sb,
    179				    struct reiserfs_bitmap_node *bn)
    180{
    181	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    182	journal->j_used_bitmap_nodes--;
    183	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
    184		kfree(bn->data);
    185		kfree(bn);
    186	} else {
    187		list_add(&bn->list, &journal->j_bitmap_nodes);
    188		journal->j_free_bitmap_nodes++;
    189	}
    190}
    191
    192static void allocate_bitmap_nodes(struct super_block *sb)
    193{
    194	int i;
    195	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    196	struct reiserfs_bitmap_node *bn = NULL;
    197	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
    198		bn = allocate_bitmap_node(sb);
    199		if (bn) {
    200			list_add(&bn->list, &journal->j_bitmap_nodes);
    201			journal->j_free_bitmap_nodes++;
    202		} else {
    203			/* this is ok, we'll try again when more are needed */
    204			break;
    205		}
    206	}
    207}
    208
    209static int set_bit_in_list_bitmap(struct super_block *sb,
    210				  b_blocknr_t block,
    211				  struct reiserfs_list_bitmap *jb)
    212{
    213	unsigned int bmap_nr = block / (sb->s_blocksize << 3);
    214	unsigned int bit_nr = block % (sb->s_blocksize << 3);
    215
    216	if (!jb->bitmaps[bmap_nr]) {
    217		jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
    218	}
    219	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
    220	return 0;
    221}
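/*
 * Editorial example: with 4k blocks each bitmap node covers
 * 4096 << 3 = 32768 blocks, so block 100000 lands in node
 * 100000 / 32768 = 3 at bit 100000 % 32768 = 1696.
 */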
    222
    223static void cleanup_bitmap_list(struct super_block *sb,
    224				struct reiserfs_list_bitmap *jb)
    225{
    226	int i;
    227	if (jb->bitmaps == NULL)
    228		return;
    229
    230	for (i = 0; i < reiserfs_bmap_count(sb); i++) {
    231		if (jb->bitmaps[i]) {
    232			free_bitmap_node(sb, jb->bitmaps[i]);
    233			jb->bitmaps[i] = NULL;
    234		}
    235	}
    236}
    237
    238/*
    239 * only call this on FS unmount.
    240 */
    241static int free_list_bitmaps(struct super_block *sb,
    242			     struct reiserfs_list_bitmap *jb_array)
    243{
    244	int i;
    245	struct reiserfs_list_bitmap *jb;
    246	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
    247		jb = jb_array + i;
    248		jb->journal_list = NULL;
    249		cleanup_bitmap_list(sb, jb);
    250		vfree(jb->bitmaps);
    251		jb->bitmaps = NULL;
    252	}
    253	return 0;
    254}
    255
    256static int free_bitmap_nodes(struct super_block *sb)
    257{
    258	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    259	struct list_head *next = journal->j_bitmap_nodes.next;
    260	struct reiserfs_bitmap_node *bn;
    261
    262	while (next != &journal->j_bitmap_nodes) {
    263		bn = list_entry(next, struct reiserfs_bitmap_node, list);
    264		list_del(next);
    265		kfree(bn->data);
    266		kfree(bn);
    267		next = journal->j_bitmap_nodes.next;
    268		journal->j_free_bitmap_nodes--;
    269	}
    270
    271	return 0;
    272}
    273
    274/*
    275 * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
    276 * jb_array is the array to be filled in.
    277 */
    278int reiserfs_allocate_list_bitmaps(struct super_block *sb,
    279				   struct reiserfs_list_bitmap *jb_array,
    280				   unsigned int bmap_nr)
    281{
    282	int i;
    283	int failed = 0;
    284	struct reiserfs_list_bitmap *jb;
    285	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
    286
    287	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
    288		jb = jb_array + i;
    289		jb->journal_list = NULL;
    290		jb->bitmaps = vzalloc(mem);
    291		if (!jb->bitmaps) {
    292			reiserfs_warning(sb, "clm-2000", "unable to "
    293					 "allocate bitmaps for journal lists");
    294			failed = 1;
    295			break;
    296		}
    297	}
    298	if (failed) {
    299		free_list_bitmaps(sb, jb_array);
    300		return -1;
    301	}
    302	return 0;
    303}
    304
    305/*
    306 * find an available list bitmap.  If you can't find one, flush a commit list
    307 * and try again
    308 */
    309static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
    310						    struct reiserfs_journal_list
    311						    *jl)
    312{
    313	int i, j;
    314	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    315	struct reiserfs_list_bitmap *jb = NULL;
    316
    317	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
    318		i = journal->j_list_bitmap_index;
    319		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
    320		jb = journal->j_list_bitmap + i;
    321		if (journal->j_list_bitmap[i].journal_list) {
    322			flush_commit_list(sb,
    323					  journal->j_list_bitmap[i].
    324					  journal_list, 1);
    325			if (!journal->j_list_bitmap[i].journal_list) {
    326				break;
    327			}
    328		} else {
    329			break;
    330		}
    331	}
    332	/* double check to make sure it flushed correctly */
    333	if (jb->journal_list)
    334		return NULL;
    335	jb->journal_list = jl;
    336	return jb;
    337}
    338
    339/*
    340 * allocates a new chunk of X nodes, and links them all together as a list.
    341 * Uses the cnode->next and cnode->prev pointers
    342 * returns NULL on failure
    343 */
    344static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
    345{
    346	struct reiserfs_journal_cnode *head;
    347	int i;
    348	if (num_cnodes <= 0) {
    349		return NULL;
    350	}
    351	head = vzalloc(array_size(num_cnodes,
    352				  sizeof(struct reiserfs_journal_cnode)));
    353	if (!head) {
    354		return NULL;
    355	}
    356	head[0].prev = NULL;
    357	head[0].next = head + 1;
    358	for (i = 1; i < num_cnodes; i++) {
    359		head[i].prev = head + (i - 1);
    360		head[i].next = head + (i + 1);	/* last one's next is fixed up after the loop */
    361	}
    362	head[num_cnodes - 1].next = NULL;
    363	return head;
    364}
    365
    366/* pulls a cnode off the free list, or returns NULL on failure */
    367static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
    368{
    369	struct reiserfs_journal_cnode *cn;
    370	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    371
    372	reiserfs_check_lock_depth(sb, "get_cnode");
    373
    374	if (journal->j_cnode_free <= 0) {
    375		return NULL;
    376	}
    377	journal->j_cnode_used++;
    378	journal->j_cnode_free--;
    379	cn = journal->j_cnode_free_list;
    380	if (!cn) {
    381		return cn;
    382	}
    383	if (cn->next) {
    384		cn->next->prev = NULL;
    385	}
    386	journal->j_cnode_free_list = cn->next;
    387	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
    388	return cn;
    389}
    390
    391/*
    392 * returns a cnode to the free list
    393 */
    394static void free_cnode(struct super_block *sb,
    395		       struct reiserfs_journal_cnode *cn)
    396{
    397	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    398
    399	reiserfs_check_lock_depth(sb, "free_cnode");
    400
    401	journal->j_cnode_used--;
    402	journal->j_cnode_free++;
    403	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
    404	cn->next = journal->j_cnode_free_list;
    405	if (journal->j_cnode_free_list) {
    406		journal->j_cnode_free_list->prev = cn;
    407	}
    408	cn->prev = NULL;	/* not needed with the memset, but I might kill the memset, and forget to do this */
    409	journal->j_cnode_free_list = cn;
    410}
    411
    412static void clear_prepared_bits(struct buffer_head *bh)
    413{
    414	clear_buffer_journal_prepared(bh);
    415	clear_buffer_journal_restore_dirty(bh);
    416}
    417
    418/*
    419 * return a cnode with same dev, block number and size in table,
    420 * or null if not found
    421 */
    422static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
    423								  super_block
    424								  *sb,
    425								  struct
    426								  reiserfs_journal_cnode
    427								  **table,
    428								  long bl)
    429{
    430	struct reiserfs_journal_cnode *cn;
    431	cn = journal_hash(table, sb, bl);
    432	while (cn) {
    433		if (cn->blocknr == bl && cn->sb == sb)
    434			return cn;
    435		cn = cn->hnext;
    436	}
    437	return NULL;
    438}
    439
    440/*
    441 * this actually means 'can this block be reallocated yet?'.  If you set
    442 * search_all, a block can only be allocated if it is not in the current
    443 * transaction, was not freed by the current transaction, and has no chance
    444 * of ever being overwritten by a replay after crashing.
    445 *
    446 * If you don't set search_all, a block can only be allocated if it is not
    447 * in the current transaction.  Since deleting a block removes it from the
    448 * current transaction, this case should never happen.  If you don't set
    449 * search_all, make sure you never write the block without logging it.
    450 *
    451 * next_zero_bit is a suggestion about the next block to try for find_forward.
    452 * when bl is rejected because it is set in a journal list bitmap, we search
    453 * for the next zero bit in the bitmap that rejected bl.  Then, we return
    454 * that through next_zero_bit for find_forward to try.
    455 *
    456 * Just because we return something in next_zero_bit does not mean we won't
    457 * reject it on the next call to reiserfs_in_journal
    458 */
    459int reiserfs_in_journal(struct super_block *sb,
    460			unsigned int bmap_nr, int bit_nr, int search_all,
    461			b_blocknr_t * next_zero_bit)
    462{
    463	struct reiserfs_journal *journal = SB_JOURNAL(sb);
    464	struct reiserfs_list_bitmap *jb;
    465	int i;
    466	unsigned long bl;
    467
    468	*next_zero_bit = 0;	/* always start this at zero. */
    469
    470	PROC_INFO_INC(sb, journal.in_journal);
    471	/*
    472	 * If we aren't doing a search_all, this is a metablock, and it
    473	 * will be logged before use.  If we crash before the transaction
    474	 * that freed it commits, this transaction won't have committed
    475	 * either, and the block will never be written
    476	 */
    477	if (search_all) {
    478		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
    479			PROC_INFO_INC(sb, journal.in_journal_bitmap);
    480			jb = journal->j_list_bitmap + i;
    481			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
    482			    test_bit(bit_nr,
    483				     (unsigned long *)jb->bitmaps[bmap_nr]->
    484				     data)) {
    485				*next_zero_bit =
    486				    find_next_zero_bit((unsigned long *)
    487						       (jb->bitmaps[bmap_nr]->
    488							data),
    489						       sb->s_blocksize << 3,
    490						       bit_nr + 1);
    491				return 1;
    492			}
    493		}
    494	}
    495
    496	bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
    497	/* is it in any old transactions? */
    498	if (search_all
    499	    && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
    500		return 1;
    501	}
    502
    503	/* is it in the current transaction?  This should never happen */
    504	if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
    505		BUG();
    506		return 1;
    507	}
    508
    509	PROC_INFO_INC(sb, journal.in_journal_reusable);
    510	/* safe for reuse */
    511	return 0;
    512}
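/*
 * Editorial sketch of a hypothetical caller (the real consumer is the
 * bitmap scanner in bitmap.c): next_zero_bit lets the allocator jump
 * past a run of journaled bits instead of re-testing each one, and a
 * return of 0 from reiserfs_in_journal means the block is reusable:
 *
 *	b_blocknr_t next;
 *
 *	while (bit_nr < (sb->s_blocksize << 3)) {
 *		if (!reiserfs_in_journal(sb, bmap_nr, bit_nr, 1, &next))
 *			return bit_nr;
 *		bit_nr = next > bit_nr ? next : bit_nr + 1;
 *	}
 */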
    513
    514/* insert cn into table */
    515static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
    516				       struct reiserfs_journal_cnode *cn)
    517{
    518	struct reiserfs_journal_cnode *cn_orig;
    519
    520	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
    521	cn->hnext = cn_orig;
    522	cn->hprev = NULL;
    523	if (cn_orig) {
    524		cn_orig->hprev = cn;
    525	}
    526	journal_hash(table, cn->sb, cn->blocknr) = cn;
    527}
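/*
 * Editorial note: journal_hash() is an lvalue macro in reiserfs.h,
 * roughly table[hash(sb, block)], so the final assignment above writes
 * the new cnode back as the head of its hash chain.
 */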
    528
    529/* lock the current transaction */
    530static inline void lock_journal(struct super_block *sb)
    531{
    532	PROC_INFO_INC(sb, journal.lock_journal);
    533
    534	reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
    535}
    536
    537/* unlock the current transaction */
    538static inline void unlock_journal(struct super_block *sb)
    539{
    540	mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
    541}
    542
    543static inline void get_journal_list(struct reiserfs_journal_list *jl)
    544{
    545	jl->j_refcount++;
    546}
    547
    548static inline void put_journal_list(struct super_block *s,
    549				    struct reiserfs_journal_list *jl)
    550{
    551	if (jl->j_refcount < 1) {
    552		reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
    553			       jl->j_trans_id, jl->j_refcount);
    554	}
    555	if (--jl->j_refcount == 0)
    556		kfree(jl);
    557}
    558
    559/*
    560 * this used to be much more involved, and I'm keeping it just in case
    561 * things get ugly again.  it gets called by flush_commit_list, and
    562 * cleans up any data stored about blocks freed during a transaction.
    563 */
    564static void cleanup_freed_for_journal_list(struct super_block *sb,
    565					   struct reiserfs_journal_list *jl)
    566{
    567
    568	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
    569	if (jb) {
    570		cleanup_bitmap_list(sb, jb);
    571	}
    572	jl->j_list_bitmap->journal_list = NULL;
    573	jl->j_list_bitmap = NULL;
    574}
    575
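/*
 * Editorial note: journal lists hang off j_journal_list ordered oldest
 * to newest, so if the oldest surviving list already has a trans_id
 * newer than the one we were handed, that transaction has been flushed
 * and freed.
 */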
    576static int journal_list_still_alive(struct super_block *s,
    577				    unsigned int trans_id)
    578{
    579	struct reiserfs_journal *journal = SB_JOURNAL(s);
    580	struct list_head *entry = &journal->j_journal_list;
    581	struct reiserfs_journal_list *jl;
    582
    583	if (!list_empty(entry)) {
    584		jl = JOURNAL_LIST_ENTRY(entry->next);
    585		if (jl->j_trans_id <= trans_id) {
    586			return 1;
    587		}
    588	}
    589	return 0;
    590}
    591
    592/*
    593 * If page->mapping was null, we failed to truncate this page for
    594 * some reason.  Most likely because it was truncated after being
    595 * logged via data=journal.
    596 *
    597 * This does a check to see if the buffer belongs to one of these
    598 * lost pages before doing the final put_bh.  If page->mapping was
    599 * null, it tries to free buffers on the page, which should make the
    600 * final put_page drop the page from the lru.
    601 */
    602static void release_buffer_page(struct buffer_head *bh)
    603{
    604	struct folio *folio = page_folio(bh->b_page);
    605	if (!folio->mapping && folio_trylock(folio)) {
    606		folio_get(folio);
    607		put_bh(bh);
    608		if (!folio->mapping)
    609			try_to_free_buffers(folio);
    610		folio_unlock(folio);
    611		folio_put(folio);
    612	} else {
    613		put_bh(bh);
    614	}
    615}
    616
    617static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
    618{
    619	if (buffer_journaled(bh)) {
    620		reiserfs_warning(NULL, "clm-2084",
    621				 "pinned buffer %lu:%pg sent to disk",
    622				 bh->b_blocknr, bh->b_bdev);
    623	}
    624	if (uptodate)
    625		set_buffer_uptodate(bh);
    626	else
    627		clear_buffer_uptodate(bh);
    628
    629	unlock_buffer(bh);
    630	release_buffer_page(bh);
    631}
    632
    633static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
    634{
    635	if (uptodate)
    636		set_buffer_uptodate(bh);
    637	else
    638		clear_buffer_uptodate(bh);
    639	unlock_buffer(bh);
    640	put_bh(bh);
    641}
    642
    643static void submit_logged_buffer(struct buffer_head *bh)
    644{
    645	get_bh(bh);
    646	bh->b_end_io = reiserfs_end_buffer_io_sync;
    647	clear_buffer_journal_new(bh);
    648	clear_buffer_dirty(bh);
    649	if (!test_clear_buffer_journal_test(bh))
    650		BUG();
    651	if (!buffer_uptodate(bh))
    652		BUG();
    653	submit_bh(REQ_OP_WRITE, 0, bh);
    654}
    655
    656static void submit_ordered_buffer(struct buffer_head *bh)
    657{
    658	get_bh(bh);
    659	bh->b_end_io = reiserfs_end_ordered_io;
    660	clear_buffer_dirty(bh);
    661	if (!buffer_uptodate(bh))
    662		BUG();
    663	submit_bh(REQ_OP_WRITE, 0, bh);
    664}
    665
    666#define CHUNK_SIZE 32
    667struct buffer_chunk {
    668	struct buffer_head *bh[CHUNK_SIZE];
    669	int nr;
    670};
    671
    672static void write_chunk(struct buffer_chunk *chunk)
    673{
    674	int i;
    675	for (i = 0; i < chunk->nr; i++) {
    676		submit_logged_buffer(chunk->bh[i]);
    677	}
    678	chunk->nr = 0;
    679}
    680
    681static void write_ordered_chunk(struct buffer_chunk *chunk)
    682{
    683	int i;
    684	for (i = 0; i < chunk->nr; i++) {
    685		submit_ordered_buffer(chunk->bh[i]);
    686	}
    687	chunk->nr = 0;
    688}
    689
    690static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
    691			spinlock_t * lock, void (fn) (struct buffer_chunk *))
    692{
    693	int ret = 0;
    694	BUG_ON(chunk->nr >= CHUNK_SIZE);
    695	chunk->bh[chunk->nr++] = bh;
    696	if (chunk->nr >= CHUNK_SIZE) {
    697		ret = 1;
    698		if (lock) {
    699			spin_unlock(lock);
    700			fn(chunk);
    701			spin_lock(lock);
    702		} else {
    703			fn(chunk);
    704		}
    705	}
    706	return ret;
    707}
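/*
 * Editorial note: typical use of the chunk helpers, as in
 * write_ordered_buffers() below -- add buffers one at a time, let
 * add_to_chunk() submit each full batch (dropping the given lock
 * around the I/O), then flush whatever is left at the end:
 *
 *	struct buffer_chunk chunk;
 *
 *	chunk.nr = 0;
 *	...
 *	add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
 *	...
 *	if (chunk.nr)
 *		write_ordered_chunk(&chunk);
 */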
    708
    709static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
    710static struct reiserfs_jh *alloc_jh(void)
    711{
    712	struct reiserfs_jh *jh;
    713	while (1) {
    714		jh = kmalloc(sizeof(*jh), GFP_NOFS);
    715		if (jh) {
    716			atomic_inc(&nr_reiserfs_jh);
    717			return jh;
    718		}
    719		yield();
    720	}
    721}
    722
    723/*
    724 * we want to free the jh when the buffer has been written
    725 * and waited on
    726 */
    727void reiserfs_free_jh(struct buffer_head *bh)
    728{
    729	struct reiserfs_jh *jh;
    730
    731	jh = bh->b_private;
    732	if (jh) {
    733		bh->b_private = NULL;
    734		jh->bh = NULL;
    735		list_del_init(&jh->list);
    736		kfree(jh);
    737		if (atomic_read(&nr_reiserfs_jh) <= 0)
    738			BUG();
    739		atomic_dec(&nr_reiserfs_jh);
    740		put_bh(bh);
    741	}
    742}
    743
    744static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
    745			   int tail)
    746{
    747	struct reiserfs_jh *jh;
    748
    749	if (bh->b_private) {
    750		spin_lock(&j->j_dirty_buffers_lock);
    751		if (!bh->b_private) {
    752			spin_unlock(&j->j_dirty_buffers_lock);
    753			goto no_jh;
    754		}
    755		jh = bh->b_private;
    756		list_del_init(&jh->list);
    757	} else {
    758no_jh:
    759		get_bh(bh);
    760		jh = alloc_jh();
    761		spin_lock(&j->j_dirty_buffers_lock);
    762		/*
    763		 * buffer must be locked for __add_jh, so we can't have
    764		 * two adds racing at the same time
    765		 */
    766		BUG_ON(bh->b_private);
    767		jh->bh = bh;
    768		bh->b_private = jh;
    769	}
    770	jh->jl = j->j_current_jl;
    771	if (tail)
    772		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
    773	else {
    774		list_add_tail(&jh->list, &jh->jl->j_bh_list);
    775	}
    776	spin_unlock(&j->j_dirty_buffers_lock);
    777	return 0;
    778}
    779
    780int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
    781{
    782	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
    783}
    784int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
    785{
    786	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
    787}
    788
    789#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
    790static int write_ordered_buffers(spinlock_t * lock,
    791				 struct reiserfs_journal *j,
    792				 struct reiserfs_journal_list *jl,
    793				 struct list_head *list)
    794{
    795	struct buffer_head *bh;
    796	struct reiserfs_jh *jh;
    797	int ret = j->j_errno;
    798	struct buffer_chunk chunk;
    799	struct list_head tmp;
    800	INIT_LIST_HEAD(&tmp);
    801
    802	chunk.nr = 0;
    803	spin_lock(lock);
    804	while (!list_empty(list)) {
    805		jh = JH_ENTRY(list->next);
    806		bh = jh->bh;
    807		get_bh(bh);
    808		if (!trylock_buffer(bh)) {
    809			if (!buffer_dirty(bh)) {
    810				list_move(&jh->list, &tmp);
    811				goto loop_next;
    812			}
    813			spin_unlock(lock);
    814			if (chunk.nr)
    815				write_ordered_chunk(&chunk);
    816			wait_on_buffer(bh);
    817			cond_resched();
    818			spin_lock(lock);
    819			goto loop_next;
    820		}
    821		/*
    822		 * in theory, dirty non-uptodate buffers should never get here,
    823		 * but the upper layer io error paths still have a few quirks.
    824		 * Handle them here as gracefully as we can
    825		 */
    826		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
    827			clear_buffer_dirty(bh);
    828			ret = -EIO;
    829		}
    830		if (buffer_dirty(bh)) {
    831			list_move(&jh->list, &tmp);
    832			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
    833		} else {
    834			reiserfs_free_jh(bh);
    835			unlock_buffer(bh);
    836		}
    837loop_next:
    838		put_bh(bh);
    839		cond_resched_lock(lock);
    840	}
    841	if (chunk.nr) {
    842		spin_unlock(lock);
    843		write_ordered_chunk(&chunk);
    844		spin_lock(lock);
    845	}
    846	while (!list_empty(&tmp)) {
    847		jh = JH_ENTRY(tmp.prev);
    848		bh = jh->bh;
    849		get_bh(bh);
    850		reiserfs_free_jh(bh);
    851
    852		if (buffer_locked(bh)) {
    853			spin_unlock(lock);
    854			wait_on_buffer(bh);
    855			spin_lock(lock);
    856		}
    857		if (!buffer_uptodate(bh)) {
    858			ret = -EIO;
    859		}
    860		/*
    861		 * ugly interaction with invalidate_folio here.
    862		 * reiserfs_invalidate_folio will pin any buffer that has a
    863		 * valid journal head from an older transaction.  If someone
    864		 * else sets our buffer dirty after we write it in the first
    865		 * loop, and then someone truncates the page away, nobody
    866		 * will ever write the buffer. We're safe if we write the
    867		 * page one last time after freeing the journal header.
    868		 */
    869		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
    870			spin_unlock(lock);
    871			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
    872			spin_lock(lock);
    873		}
    874		put_bh(bh);
    875		cond_resched_lock(lock);
    876	}
    877	spin_unlock(lock);
    878	return ret;
    879}
    880
    881static int flush_older_commits(struct super_block *s,
    882			       struct reiserfs_journal_list *jl)
    883{
    884	struct reiserfs_journal *journal = SB_JOURNAL(s);
    885	struct reiserfs_journal_list *other_jl;
    886	struct reiserfs_journal_list *first_jl;
    887	struct list_head *entry;
    888	unsigned int trans_id = jl->j_trans_id;
    889	unsigned int other_trans_id;
    890
    891find_first:
    892	/*
    893	 * first we walk backwards to find the oldest uncommitted transaction
    894	 */
    895	first_jl = jl;
    896	entry = jl->j_list.prev;
    897	while (1) {
    898		other_jl = JOURNAL_LIST_ENTRY(entry);
    899		if (entry == &journal->j_journal_list ||
    900		    atomic_read(&other_jl->j_older_commits_done))
    901			break;
    902
    903		first_jl = other_jl;
    904		entry = other_jl->j_list.prev;
    905	}
    906
    907	/* if we didn't find any older uncommitted transactions, return now */
    908	if (first_jl == jl) {
    909		return 0;
    910	}
    911
    912	entry = &first_jl->j_list;
    913	while (1) {
    914		other_jl = JOURNAL_LIST_ENTRY(entry);
    915		other_trans_id = other_jl->j_trans_id;
    916
    917		if (other_trans_id < trans_id) {
    918			if (atomic_read(&other_jl->j_commit_left) != 0) {
    919				flush_commit_list(s, other_jl, 0);
    920
    921				/* list we were called with is gone, return */
    922				if (!journal_list_still_alive(s, trans_id))
    923					return 1;
    924
    925				/*
    926				 * the one we just flushed is gone, this means
    927				 * all older lists are also gone, so first_jl
    928				 * is no longer valid either.  Go back to the
    929				 * beginning.
    930				 */
    931				if (!journal_list_still_alive
    932				    (s, other_trans_id)) {
    933					goto find_first;
    934				}
    935			}
    936			entry = entry->next;
    937			if (entry == &journal->j_journal_list)
    938				return 0;
    939		} else {
    940			return 0;
    941		}
    942	}
    943	return 0;
    944}
    945
    946static int reiserfs_async_progress_wait(struct super_block *s)
    947{
    948	struct reiserfs_journal *j = SB_JOURNAL(s);
    949
    950	if (atomic_read(&j->j_async_throttle)) {
    951		int depth;
    952
    953		depth = reiserfs_write_unlock_nested(s);
    954		wait_var_event_timeout(&j->j_async_throttle,
    955				       atomic_read(&j->j_async_throttle) == 0,
    956				       HZ / 10);
    957		reiserfs_write_lock_nested(s, depth);
    958	}
    959
    960	return 0;
    961}
    962
    963/*
    964 * if this journal list still has commit blocks unflushed, send them to disk.
    965 *
    966 * log areas must be flushed in order (transaction 2 can't commit before
    967 * transaction 1) Before the commit block can by written, every other log
    968 * block must be safely on disk
    969 */
    970static int flush_commit_list(struct super_block *s,
    971			     struct reiserfs_journal_list *jl, int flushall)
    972{
    973	int i;
    974	b_blocknr_t bn;
    975	struct buffer_head *tbh = NULL;
    976	unsigned int trans_id = jl->j_trans_id;
    977	struct reiserfs_journal *journal = SB_JOURNAL(s);
    978	int retval = 0;
    979	int write_len;
    980	int depth;
    981
    982	reiserfs_check_lock_depth(s, "flush_commit_list");
    983
    984	if (atomic_read(&jl->j_older_commits_done)) {
    985		return 0;
    986	}
    987
    988	/*
    989	 * before we can put our commit blocks on disk, we have to make
    990	 * sure everyone older than us is on disk too
    991	 */
    992	BUG_ON(jl->j_len <= 0);
    993	BUG_ON(trans_id == journal->j_trans_id);
    994
    995	get_journal_list(jl);
    996	if (flushall) {
    997		if (flush_older_commits(s, jl) == 1) {
    998			/*
    999			 * list disappeared during flush_older_commits.
   1000			 * return
   1001			 */
   1002			goto put_jl;
   1003		}
   1004	}
   1005
   1006	/* make sure nobody is trying to flush this one at the same time */
   1007	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
   1008
   1009	if (!journal_list_still_alive(s, trans_id)) {
   1010		mutex_unlock(&jl->j_commit_mutex);
   1011		goto put_jl;
   1012	}
   1013	BUG_ON(jl->j_trans_id == 0);
   1014
   1015	/* this commit is done, exit */
   1016	if (atomic_read(&jl->j_commit_left) <= 0) {
   1017		if (flushall) {
   1018			atomic_set(&jl->j_older_commits_done, 1);
   1019		}
   1020		mutex_unlock(&jl->j_commit_mutex);
   1021		goto put_jl;
   1022	}
   1023
   1024	if (!list_empty(&jl->j_bh_list)) {
   1025		int ret;
   1026
   1027		/*
   1028		 * We might sleep in numerous places inside
   1029		 * write_ordered_buffers. Relax the write lock.
   1030		 */
   1031		depth = reiserfs_write_unlock_nested(s);
   1032		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
   1033					    journal, jl, &jl->j_bh_list);
   1034		if (ret < 0 && retval == 0)
   1035			retval = ret;
   1036		reiserfs_write_lock_nested(s, depth);
   1037	}
   1038	BUG_ON(!list_empty(&jl->j_bh_list));
   1039	/*
   1040	 * for the description block and all the log blocks, submit any buffers
   1041	 * that haven't already reached the disk.  Try to write at least 256
   1042	 * log blocks. later on, we will only wait on blocks that correspond
   1043	 * to this transaction, but while we're unplugging we might as well
   1044	 * get a chunk of data on there.
   1045	 */
   1046	atomic_inc(&journal->j_async_throttle);
   1047	write_len = jl->j_len + 1;
   1048	if (write_len < 256)
   1049		write_len = 256;
   1050	for (i = 0; i < write_len; i++) {
   1051		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
   1052		    SB_ONDISK_JOURNAL_SIZE(s);
   1053		tbh = journal_find_get_block(s, bn);
   1054		if (tbh) {
   1055			if (buffer_dirty(tbh)) {
   1056				depth = reiserfs_write_unlock_nested(s);
   1057				ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh);
   1058				reiserfs_write_lock_nested(s, depth);
   1059			}
   1060			put_bh(tbh);
   1061		}
   1062	}
   1063	if (atomic_dec_and_test(&journal->j_async_throttle))
   1064		wake_up_var(&journal->j_async_throttle);
   1065
   1066	for (i = 0; i < (jl->j_len + 1); i++) {
   1067		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
   1068		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
   1069		tbh = journal_find_get_block(s, bn);
   1070
   1071		depth = reiserfs_write_unlock_nested(s);
   1072		__wait_on_buffer(tbh);
   1073		reiserfs_write_lock_nested(s, depth);
   1074		/*
   1075		 * since we're using ll_rw_block above, it might have skipped
   1076		 * over a locked buffer.  Double check here
   1077		 */
   1078		/* redundant, sync_dirty_buffer() checks */
   1079		if (buffer_dirty(tbh)) {
   1080			depth = reiserfs_write_unlock_nested(s);
   1081			sync_dirty_buffer(tbh);
   1082			reiserfs_write_lock_nested(s, depth);
   1083		}
   1084		if (unlikely(!buffer_uptodate(tbh))) {
   1085#ifdef CONFIG_REISERFS_CHECK
   1086			reiserfs_warning(s, "journal-601",
   1087					 "buffer write failed");
   1088#endif
   1089			retval = -EIO;
   1090		}
   1091		/* once for journal_find_get_block */
   1092		put_bh(tbh);
   1093		/* once due to original getblk in do_journal_end */
   1094		put_bh(tbh);
   1095		atomic_dec(&jl->j_commit_left);
   1096	}
   1097
   1098	BUG_ON(atomic_read(&jl->j_commit_left) != 1);
   1099
   1100	/*
   1101	 * If there was a write error in the journal - we can't commit
   1102	 * this transaction - it will be invalid and, if successful,
   1103	 * will just end up propagating the write error out to
   1104	 * the file system.
   1105	 */
   1106	if (likely(!retval && !reiserfs_is_journal_aborted(journal))) {
   1107		if (buffer_dirty(jl->j_commit_bh))
   1108			BUG();
   1109		mark_buffer_dirty(jl->j_commit_bh);
   1110		depth = reiserfs_write_unlock_nested(s);
   1111		if (reiserfs_barrier_flush(s))
   1112			__sync_dirty_buffer(jl->j_commit_bh,
   1113					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
   1114		else
   1115			sync_dirty_buffer(jl->j_commit_bh);
   1116		reiserfs_write_lock_nested(s, depth);
   1117	}
   1118
   1119	/*
   1120	 * If there was a write error in the journal - we can't commit this
   1121	 * transaction - it will be invalid and, if successful, will just end
   1122	 * up propagating the write error out to the filesystem.
   1123	 */
   1124	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
   1125#ifdef CONFIG_REISERFS_CHECK
   1126		reiserfs_warning(s, "journal-615", "buffer write failed");
   1127#endif
   1128		retval = -EIO;
   1129	}
   1130	bforget(jl->j_commit_bh);
   1131	if (journal->j_last_commit_id != 0 &&
   1132	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
   1133		reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu",
   1134				 journal->j_last_commit_id, jl->j_trans_id);
   1135	}
   1136	journal->j_last_commit_id = jl->j_trans_id;
   1137
   1138	/*
   1139	 * now, every commit block is on the disk.  It is safe to allow
   1140	 * blocks freed during this transaction to be reallocated
   1141	 */
   1142	cleanup_freed_for_journal_list(s, jl);
   1143
   1144	retval = retval ? retval : journal->j_errno;
   1145
   1146	/* mark the metadata dirty */
   1147	if (!retval)
   1148		dirty_one_transaction(s, jl);
   1149	atomic_dec(&jl->j_commit_left);
   1150
   1151	if (flushall) {
   1152		atomic_set(&jl->j_older_commits_done, 1);
   1153	}
   1154	mutex_unlock(&jl->j_commit_mutex);
   1155put_jl:
   1156	put_journal_list(s, jl);
   1157
   1158	if (retval)
   1159		reiserfs_abort(s, retval, "Journal write error in %s",
   1160			       __func__);
   1161	return retval;
   1162}
   1163
   1164/*
   1165 * flush_journal_list frequently needs to find a newer transaction for a
   1166 * given block.  This does that, or returns NULL if it can't find anything
   1167 */
   1168static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
   1169							  reiserfs_journal_cnode
   1170							  *cn)
   1171{
   1172	struct super_block *sb = cn->sb;
   1173	b_blocknr_t blocknr = cn->blocknr;
   1174
   1175	cn = cn->hprev;
   1176	while (cn) {
   1177		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
   1178			return cn->jlist;
   1179		}
   1180		cn = cn->hprev;
   1181	}
   1182	return NULL;
   1183}
   1184
   1185static void remove_journal_hash(struct super_block *,
   1186				struct reiserfs_journal_cnode **,
   1187				struct reiserfs_journal_list *, unsigned long,
   1188				int);
   1189
   1190/*
   1191 * once all the real blocks have been flushed, it is safe to remove them
   1192 * from the journal list for this transaction.  Aside from freeing the
   1193 * cnode, this also allows the block to be reallocated for data blocks
   1194 * if it had been deleted.
   1195 */
   1196static void remove_all_from_journal_list(struct super_block *sb,
   1197					 struct reiserfs_journal_list *jl,
   1198					 int debug)
   1199{
   1200	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   1201	struct reiserfs_journal_cnode *cn, *last;
   1202	cn = jl->j_realblock;
   1203
   1204	/*
   1205	 * which is better, to lock once around the whole loop, or
   1206	 * to lock for each call to remove_journal_hash?
   1207	 */
   1208	while (cn) {
   1209		if (cn->blocknr != 0) {
   1210			if (debug) {
   1211				reiserfs_warning(sb, "reiserfs-2201",
   1212						 "block %u, bh is %d, state %ld",
   1213						 cn->blocknr, cn->bh ? 1 : 0,
   1214						 cn->state);
   1215			}
   1216			cn->state = 0;
   1217			remove_journal_hash(sb, journal->j_list_hash_table,
   1218					    jl, cn->blocknr, 1);
   1219		}
   1220		last = cn;
   1221		cn = cn->next;
   1222		free_cnode(sb, last);
   1223	}
   1224	jl->j_realblock = NULL;
   1225}
   1226
   1227/*
   1228 * if this transaction id is newer than the one we last wrote to the
   1229 * header block, write it to the header block.  once this is done, I can
   1230 * safely say the log area for this transaction won't ever be replayed,
   1231 * and I can start releasing blocks in this transaction for reuse as data
   1232 * blocks.  called by flush_journal_list, before it calls
   1233 * remove_all_from_journal_list
   1234 */
   1235static int _update_journal_header_block(struct super_block *sb,
   1236					unsigned long offset,
   1237					unsigned int trans_id)
   1238{
   1239	struct reiserfs_journal_header *jh;
   1240	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   1241	int depth;
   1242
   1243	if (reiserfs_is_journal_aborted(journal))
   1244		return -EIO;
   1245
   1246	if (trans_id >= journal->j_last_flush_trans_id) {
   1247		if (buffer_locked((journal->j_header_bh))) {
   1248			depth = reiserfs_write_unlock_nested(sb);
   1249			__wait_on_buffer(journal->j_header_bh);
   1250			reiserfs_write_lock_nested(sb, depth);
   1251			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
   1252#ifdef CONFIG_REISERFS_CHECK
   1253				reiserfs_warning(sb, "journal-699",
   1254						 "buffer write failed");
   1255#endif
   1256				return -EIO;
   1257			}
   1258		}
   1259		journal->j_last_flush_trans_id = trans_id;
   1260		journal->j_first_unflushed_offset = offset;
   1261		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
   1262							b_data);
   1263		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
   1264		jh->j_first_unflushed_offset = cpu_to_le32(offset);
   1265		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
   1266
   1267		set_buffer_dirty(journal->j_header_bh);
   1268		depth = reiserfs_write_unlock_nested(sb);
   1269
   1270		if (reiserfs_barrier_flush(sb))
   1271			__sync_dirty_buffer(journal->j_header_bh,
   1272					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
   1273		else
   1274			sync_dirty_buffer(journal->j_header_bh);
   1275
   1276		reiserfs_write_lock_nested(sb, depth);
   1277		if (!buffer_uptodate(journal->j_header_bh)) {
   1278			reiserfs_warning(sb, "journal-837",
   1279					 "IO error during journal replay");
   1280			return -EIO;
   1281		}
   1282	}
   1283	return 0;
   1284}
   1285
   1286static int update_journal_header_block(struct super_block *sb,
   1287				       unsigned long offset,
   1288				       unsigned int trans_id)
   1289{
   1290	return _update_journal_header_block(sb, offset, trans_id);
   1291}
   1292
   1293/*
   1294 * flush any and all journal lists older than you are
   1295 * can only be called from flush_journal_list
   1296 */
   1297static int flush_older_journal_lists(struct super_block *sb,
   1298				     struct reiserfs_journal_list *jl)
   1299{
   1300	struct list_head *entry;
   1301	struct reiserfs_journal_list *other_jl;
   1302	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   1303	unsigned int trans_id = jl->j_trans_id;
   1304
   1305	/*
   1306	 * we know we are the only ones flushing things, no extra race
   1307	 * protection is required.
   1308	 */
   1309restart:
   1310	entry = journal->j_journal_list.next;
   1311	/* Did we wrap? */
   1312	if (entry == &journal->j_journal_list)
   1313		return 0;
   1314	other_jl = JOURNAL_LIST_ENTRY(entry);
   1315	if (other_jl->j_trans_id < trans_id) {
   1316		BUG_ON(other_jl->j_refcount <= 0);
   1317		/* do not flush all */
   1318		flush_journal_list(sb, other_jl, 0);
   1319
   1320		/* other_jl is now deleted from the list */
   1321		goto restart;
   1322	}
   1323	return 0;
   1324}
   1325
   1326static void del_from_work_list(struct super_block *s,
   1327			       struct reiserfs_journal_list *jl)
   1328{
   1329	struct reiserfs_journal *journal = SB_JOURNAL(s);
   1330	if (!list_empty(&jl->j_working_list)) {
   1331		list_del_init(&jl->j_working_list);
   1332		journal->j_num_work_lists--;
   1333	}
   1334}
   1335
   1336/*
   1337 * flush a journal list, both commit and real blocks
   1338 *
   1339 * always set flushall to 1, unless you are calling from inside
   1340 * flush_journal_list
   1341 *
   1342 * IMPORTANT.  This can only be called while there are no journal writers,
   1343 * and the journal is locked.  That means it can only be called from
   1344 * do_journal_end, or by journal_release
   1345 */
   1346static int flush_journal_list(struct super_block *s,
   1347			      struct reiserfs_journal_list *jl, int flushall)
   1348{
   1349	struct reiserfs_journal_list *pjl;
   1350	struct reiserfs_journal_cnode *cn;
   1351	int count;
   1352	int was_jwait = 0;
   1353	int was_dirty = 0;
   1354	struct buffer_head *saved_bh;
   1355	unsigned long j_len_saved = jl->j_len;
   1356	struct reiserfs_journal *journal = SB_JOURNAL(s);
   1357	int err = 0;
   1358	int depth;
   1359
   1360	BUG_ON(j_len_saved <= 0);
   1361
   1362	if (atomic_read(&journal->j_wcount) != 0) {
   1363		reiserfs_warning(s, "clm-2048", "called with wcount %d",
   1364				 atomic_read(&journal->j_wcount));
   1365	}
   1366
   1367	/* if flushall == 0, the lock is already held */
   1368	if (flushall) {
   1369		reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
   1370	} else if (mutex_trylock(&journal->j_flush_mutex)) {
   1371		BUG();
   1372	}
   1373
   1374	count = 0;
   1375	if (j_len_saved > journal->j_trans_max) {
   1376		reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu",
   1377			       j_len_saved, jl->j_trans_id);
   1378		return 0;
   1379	}
   1380
   1381	/* if all the work is already done, get out of here */
   1382	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
   1383	    atomic_read(&jl->j_commit_left) <= 0) {
   1384		goto flush_older_and_return;
   1385	}
   1386
   1387	/*
   1388	 * start by putting the commit list on disk.  This will also flush
   1389	 * the commit lists of any older transactions
   1390	 */
   1391	flush_commit_list(s, jl, 1);
   1392
   1393	if (!(jl->j_state & LIST_DIRTY)
   1394	    && !reiserfs_is_journal_aborted(journal))
   1395		BUG();
   1396
   1397	/* are we done now? */
   1398	if (atomic_read(&jl->j_nonzerolen) <= 0 &&
   1399	    atomic_read(&jl->j_commit_left) <= 0) {
   1400		goto flush_older_and_return;
   1401	}
   1402
   1403	/*
   1404	 * loop through each cnode, see if we need to write it,
   1405	 * or wait on a more recent transaction, or just ignore it
   1406	 */
   1407	if (atomic_read(&journal->j_wcount) != 0) {
   1408		reiserfs_panic(s, "journal-844", "journal list is flushing, "
   1409			       "wcount is not 0");
   1410	}
   1411	cn = jl->j_realblock;
   1412	while (cn) {
   1413		was_jwait = 0;
   1414		was_dirty = 0;
   1415		saved_bh = NULL;
   1416		/* blocknr of 0 is no longer in the hash, ignore it */
   1417		if (cn->blocknr == 0) {
   1418			goto free_cnode;
   1419		}
   1420
   1421		/*
   1422		 * This transaction failed commit.
   1423		 * Don't write out to the disk
   1424		 */
   1425		if (!(jl->j_state & LIST_DIRTY))
   1426			goto free_cnode;
   1427
   1428		pjl = find_newer_jl_for_cn(cn);
   1429		/*
   1430		 * the order is important here.  We check pjl to make sure we
   1431		 * don't clear BH_JDirty_wait if we aren't the one writing this
   1432		 * block to disk
   1433		 */
   1434		if (!pjl && cn->bh) {
   1435			saved_bh = cn->bh;
   1436
   1437			/*
   1438			 * we do this to make sure nobody releases the
   1439			 * buffer while we are working with it
   1440			 */
   1441			get_bh(saved_bh);
   1442
   1443			if (buffer_journal_dirty(saved_bh)) {
   1444				BUG_ON(!can_dirty(cn));
   1445				was_jwait = 1;
   1446				was_dirty = 1;
   1447			} else if (can_dirty(cn)) {
   1448				/*
   1449				 * everything with !pjl && jwait
   1450				 * should be writable
   1451				 */
   1452				BUG();
   1453			}
   1454		}
   1455
   1456		/*
   1457		 * if someone has this block in a newer transaction, just make
   1458		 * sure they are committed, and don't try writing it to disk
   1459		 */
   1460		if (pjl) {
   1461			if (atomic_read(&pjl->j_commit_left))
   1462				flush_commit_list(s, pjl, 1);
   1463			goto free_cnode;
   1464		}
   1465
   1466		/*
   1467		 * bh == NULL when the block got to disk on its own, OR,
   1468		 * the block got freed in a future transaction
   1469		 */
   1470		if (saved_bh == NULL) {
   1471			goto free_cnode;
   1472		}
   1473
   1474		/*
   1475		 * this should never happen.  kupdate_one_transaction has
   1476		 * this list locked while it works, so we should never see a
   1477		 * buffer here that is not marked JDirty_wait
   1478		 */
   1479		if ((!was_jwait) && !buffer_locked(saved_bh)) {
   1480			reiserfs_warning(s, "journal-813",
   1481					 "BAD! buffer %llu %cdirty %cjwait, "
   1482					 "not in a newer transaction",
   1483					 (unsigned long long)saved_bh->
   1484					 b_blocknr, was_dirty ? ' ' : '!',
   1485					 was_jwait ? ' ' : '!');
   1486		}
   1487		if (was_dirty) {
   1488			/*
   1489			 * we inc again because saved_bh gets decremented
   1490			 * at free_cnode
   1491			 */
   1492			get_bh(saved_bh);
   1493			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
   1494			lock_buffer(saved_bh);
   1495			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
   1496			if (buffer_dirty(saved_bh))
   1497				submit_logged_buffer(saved_bh);
   1498			else
   1499				unlock_buffer(saved_bh);
   1500			count++;
   1501		} else {
   1502			reiserfs_warning(s, "clm-2082",
   1503					 "Unable to flush buffer %llu in %s",
   1504					 (unsigned long long)saved_bh->
   1505					 b_blocknr, __func__);
   1506		}
   1507free_cnode:
   1508		cn = cn->next;
   1509		if (saved_bh) {
   1510			/*
   1511			 * we incremented this to keep others from
   1512			 * taking the buffer head away
   1513			 */
   1514			put_bh(saved_bh);
   1515			if (atomic_read(&saved_bh->b_count) < 0) {
   1516				reiserfs_warning(s, "journal-945",
   1517						 "saved_bh->b_count < 0");
   1518			}
   1519		}
   1520	}
   1521	if (count > 0) {
   1522		cn = jl->j_realblock;
   1523		while (cn) {
   1524			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
   1525				if (!cn->bh) {
   1526					reiserfs_panic(s, "journal-1011",
   1527						       "cn->bh is NULL");
   1528				}
   1529
   1530				depth = reiserfs_write_unlock_nested(s);
   1531				__wait_on_buffer(cn->bh);
   1532				reiserfs_write_lock_nested(s, depth);
   1533
   1534				if (!cn->bh) {
   1535					reiserfs_panic(s, "journal-1012",
   1536						       "cn->bh is NULL");
   1537				}
   1538				if (unlikely(!buffer_uptodate(cn->bh))) {
   1539#ifdef CONFIG_REISERFS_CHECK
   1540					reiserfs_warning(s, "journal-949",
   1541							 "buffer write failed");
   1542#endif
   1543					err = -EIO;
   1544				}
   1545				/*
   1546				 * note, we must clear the JDirty_wait bit
   1547				 * after the up to date check, otherwise we
   1548				 * race against our flushpage routine
   1549				 */
   1550				BUG_ON(!test_clear_buffer_journal_dirty
   1551				       (cn->bh));
   1552
   1553				/* drop one ref for us */
   1554				put_bh(cn->bh);
   1555				/* drop one ref for journal_mark_dirty */
   1556				release_buffer_page(cn->bh);
   1557			}
   1558			cn = cn->next;
   1559		}
   1560	}
   1561
   1562	if (err)
   1563		reiserfs_abort(s, -EIO,
   1564			       "Write error while pushing transaction to disk in %s",
   1565			       __func__);
   1566flush_older_and_return:
   1567
   1568	/*
   1569	 * before we can update the journal header block, we _must_ flush all
   1570	 * real blocks from all older transactions to disk.  This is because
   1571	 * once the header block is updated, this transaction will not be
   1572	 * replayed after a crash
   1573	 */
   1574	if (flushall) {
   1575		flush_older_journal_lists(s, jl);
   1576	}
   1577
   1578	err = journal->j_errno;
   1579	/*
   1580	 * before we can remove everything from the hash tables for this
   1581	 * transaction, we must make sure it can never be replayed
   1582	 *
   1583	 * since we are only called from do_journal_end, we know for sure there
   1584	 * are no allocations going on while we are flushing journal lists.  So,
   1585	 * we only need to update the journal header block for the last list
   1586	 * being flushed
   1587	 */
   1588	if (!err && flushall) {
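		/*
		 * Editorial note: j_start + j_len + 2 is the first log
		 * offset past this transaction -- the +2 covers the
		 * description and commit blocks that bracket the j_len
		 * logged blocks -- taken mod the on-disk journal size
		 * because the log is circular.
		 */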
   1589		err =
   1590		    update_journal_header_block(s,
   1591						(jl->j_start + jl->j_len +
   1592						 2) % SB_ONDISK_JOURNAL_SIZE(s),
   1593						jl->j_trans_id);
   1594		if (err)
   1595			reiserfs_abort(s, -EIO,
   1596				       "Write error while updating journal header in %s",
   1597				       __func__);
   1598	}
   1599	remove_all_from_journal_list(s, jl, 0);
   1600	list_del_init(&jl->j_list);
   1601	journal->j_num_lists--;
   1602	del_from_work_list(s, jl);
   1603
   1604	if (journal->j_last_flush_id != 0 &&
   1605	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
   1606		reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu",
   1607				 journal->j_last_flush_id, jl->j_trans_id);
   1608	}
   1609	journal->j_last_flush_id = jl->j_trans_id;
   1610
   1611	/*
   1612	 * not strictly required since we are freeing the list, but it should
   1613	 * help find code using dead lists later on
   1614	 */
   1615	jl->j_len = 0;
   1616	atomic_set(&jl->j_nonzerolen, 0);
   1617	jl->j_start = 0;
   1618	jl->j_realblock = NULL;
   1619	jl->j_commit_bh = NULL;
   1620	jl->j_trans_id = 0;
   1621	jl->j_state = 0;
   1622	put_journal_list(s, jl);
   1623	if (flushall)
   1624		mutex_unlock(&journal->j_flush_mutex);
   1625	return err;
   1626}
   1627
   1628static int write_one_transaction(struct super_block *s,
   1629				 struct reiserfs_journal_list *jl,
   1630				 struct buffer_chunk *chunk)
   1631{
   1632	struct reiserfs_journal_cnode *cn;
   1633	int ret = 0;
   1634
   1635	jl->j_state |= LIST_TOUCHED;
   1636	del_from_work_list(s, jl);
   1637	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
   1638		return 0;
   1639	}
   1640
   1641	cn = jl->j_realblock;
   1642	while (cn) {
   1643		/*
   1644		 * if the blocknr == 0, this has been cleared from the hash,
   1645		 * skip it
   1646		 */
   1647		if (cn->blocknr == 0) {
   1648			goto next;
   1649		}
   1650		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
   1651			struct buffer_head *tmp_bh;
   1652			/*
   1653			 * we can race against journal_mark_freed when we try
   1654			 * to lock_buffer(cn->bh), so we have to inc the buffer
   1655			 * count, and recheck things after locking
   1656			 */
   1657			tmp_bh = cn->bh;
   1658			get_bh(tmp_bh);
   1659			lock_buffer(tmp_bh);
   1660			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
   1661				if (!buffer_journal_dirty(tmp_bh) ||
   1662				    buffer_journal_prepared(tmp_bh))
   1663					BUG();
   1664				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
   1665				ret++;
   1666			} else {
   1667				/* note, cn->bh might be null now */
   1668				unlock_buffer(tmp_bh);
   1669			}
   1670			put_bh(tmp_bh);
   1671		}
   1672next:
   1673		cn = cn->next;
   1674		cond_resched();
   1675	}
   1676	return ret;
   1677}
   1678
   1679/* used by flush_commit_list */
   1680static void dirty_one_transaction(struct super_block *s,
   1681				 struct reiserfs_journal_list *jl)
   1682{
   1683	struct reiserfs_journal_cnode *cn;
   1684	struct reiserfs_journal_list *pjl;
   1685
   1686	jl->j_state |= LIST_DIRTY;
   1687	cn = jl->j_realblock;
   1688	while (cn) {
   1689		/*
   1690		 * look for a more recent transaction that logged this
   1691		 * buffer.  Only the most recent transaction with a buffer in
   1692		 * it is allowed to send that buffer to disk
   1693		 */
   1694		pjl = find_newer_jl_for_cn(cn);
   1695		if (!pjl && cn->blocknr && cn->bh
   1696		    && buffer_journal_dirty(cn->bh)) {
   1697			BUG_ON(!can_dirty(cn));
   1698			/*
   1699			 * if the buffer is prepared, it will either be logged
   1700			 * or restored.  If restored, we need to make sure
   1701			 * it actually gets marked dirty
   1702			 */
   1703			clear_buffer_journal_new(cn->bh);
   1704			if (buffer_journal_prepared(cn->bh)) {
   1705				set_buffer_journal_restore_dirty(cn->bh);
   1706			} else {
   1707				set_buffer_journal_test(cn->bh);
   1708				mark_buffer_dirty(cn->bh);
   1709			}
   1710		}
   1711		cn = cn->next;
   1712	}
   1713}
   1714
   1715static int kupdate_transactions(struct super_block *s,
   1716				struct reiserfs_journal_list *jl,
   1717				struct reiserfs_journal_list **next_jl,
   1718				unsigned int *next_trans_id,
   1719				int num_blocks, int num_trans)
   1720{
   1721	int ret = 0;
   1722	int written = 0;
   1723	int transactions_flushed = 0;
   1724	unsigned int orig_trans_id = jl->j_trans_id;
   1725	struct buffer_chunk chunk;
   1726	struct list_head *entry;
   1727	struct reiserfs_journal *journal = SB_JOURNAL(s);
   1728	chunk.nr = 0;
   1729
   1730	reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
   1731	if (!journal_list_still_alive(s, orig_trans_id)) {
   1732		goto done;
   1733	}
   1734
   1735	/*
   1736	 * we've got j_flush_mutex held, nobody is going to delete any
   1737	 * of these lists out from underneath us
   1738	 */
   1739	while ((num_trans && transactions_flushed < num_trans) ||
   1740	       (!num_trans && written < num_blocks)) {
   1741
   1742		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
   1743		    atomic_read(&jl->j_commit_left)
   1744		    || !(jl->j_state & LIST_DIRTY)) {
   1745			del_from_work_list(s, jl);
   1746			break;
   1747		}
   1748		ret = write_one_transaction(s, jl, &chunk);
   1749
   1750		if (ret < 0)
   1751			goto done;
   1752		transactions_flushed++;
   1753		written += ret;
   1754		entry = jl->j_list.next;
   1755
   1756		/* did we wrap? */
   1757		if (entry == &journal->j_journal_list) {
   1758			break;
   1759		}
   1760		jl = JOURNAL_LIST_ENTRY(entry);
   1761
   1762		/* don't bother with older transactions */
   1763		if (jl->j_trans_id <= orig_trans_id)
   1764			break;
   1765	}
   1766	if (chunk.nr) {
   1767		write_chunk(&chunk);
   1768	}
   1769
   1770done:
   1771	mutex_unlock(&journal->j_flush_mutex);
   1772	return ret;
   1773}
   1774
   1775/*
    1776 * O_SYNC and fsync heavy applications tend to use up
    1777 * all the journal list slots with tiny transactions.  These
   1778 * trigger lots and lots of calls to update the header block, which
   1779 * adds seeks and slows things down.
   1780 *
   1781 * This function tries to clear out a large chunk of the journal lists
   1782 * at once, which makes everything faster since only the newest journal
   1783 * list updates the header block
   1784 */
   1785static int flush_used_journal_lists(struct super_block *s,
   1786				    struct reiserfs_journal_list *jl)
   1787{
   1788	unsigned long len = 0;
   1789	unsigned long cur_len;
   1790	int i;
   1791	int limit = 256;
   1792	struct reiserfs_journal_list *tjl;
   1793	struct reiserfs_journal_list *flush_jl;
   1794	unsigned int trans_id;
   1795	struct reiserfs_journal *journal = SB_JOURNAL(s);
   1796
   1797	flush_jl = tjl = jl;
   1798
   1799	/* in data logging mode, try harder to flush a lot of blocks */
   1800	if (reiserfs_data_log(s))
   1801		limit = 1024;
   1802	/* flush for 256 transactions or limit blocks, whichever comes first */
   1803	for (i = 0; i < 256 && len < limit; i++) {
   1804		if (atomic_read(&tjl->j_commit_left) ||
   1805		    tjl->j_trans_id < jl->j_trans_id) {
   1806			break;
   1807		}
   1808		cur_len = atomic_read(&tjl->j_nonzerolen);
   1809		if (cur_len > 0) {
   1810			tjl->j_state &= ~LIST_TOUCHED;
   1811		}
   1812		len += cur_len;
   1813		flush_jl = tjl;
   1814		if (tjl->j_list.next == &journal->j_journal_list)
   1815			break;
   1816		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
   1817	}
   1818	get_journal_list(jl);
   1819	get_journal_list(flush_jl);
   1820
   1821	/*
   1822	 * try to find a group of blocks we can flush across all the
   1823	 * transactions, but only bother if we've actually spanned
   1824	 * across multiple lists
   1825	 */
   1826	if (flush_jl != jl)
   1827		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
   1828
   1829	flush_journal_list(s, flush_jl, 1);
   1830	put_journal_list(s, flush_jl);
   1831	put_journal_list(s, jl);
   1832	return 0;
   1833}
   1834
   1835/*
    1836 * removes any nodes in table that match the given block number and
    1837 * super block.  Only touches the hnext and hprev pointers.
   1838 */
   1839static void remove_journal_hash(struct super_block *sb,
   1840			 struct reiserfs_journal_cnode **table,
   1841			 struct reiserfs_journal_list *jl,
   1842			 unsigned long block, int remove_freed)
   1843{
   1844	struct reiserfs_journal_cnode *cur;
   1845	struct reiserfs_journal_cnode **head;
   1846
   1847	head = &(journal_hash(table, sb, block));
   1848	if (!head) {
   1849		return;
   1850	}
   1851	cur = *head;
   1852	while (cur) {
   1853		if (cur->blocknr == block && cur->sb == sb
   1854		    && (jl == NULL || jl == cur->jlist)
   1855		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
   1856			if (cur->hnext) {
   1857				cur->hnext->hprev = cur->hprev;
   1858			}
   1859			if (cur->hprev) {
   1860				cur->hprev->hnext = cur->hnext;
   1861			} else {
   1862				*head = cur->hnext;
   1863			}
   1864			cur->blocknr = 0;
   1865			cur->sb = NULL;
   1866			cur->state = 0;
   1867			/*
   1868			 * anybody who clears the cur->bh will also
   1869			 * dec the nonzerolen
   1870			 */
   1871			if (cur->bh && cur->jlist)
   1872				atomic_dec(&cur->jlist->j_nonzerolen);
   1873			cur->bh = NULL;
   1874			cur->jlist = NULL;
   1875		}
   1876		cur = cur->hnext;
   1877	}
   1878}
   1879
   1880static void free_journal_ram(struct super_block *sb)
   1881{
   1882	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   1883	kfree(journal->j_current_jl);
   1884	journal->j_num_lists--;
   1885
   1886	vfree(journal->j_cnode_free_orig);
   1887	free_list_bitmaps(sb, journal->j_list_bitmap);
   1888	free_bitmap_nodes(sb);	/* must be after free_list_bitmaps */
   1889	if (journal->j_header_bh) {
   1890		brelse(journal->j_header_bh);
   1891	}
   1892	/*
   1893	 * j_header_bh is on the journal dev, make sure
   1894	 * not to release the journal dev until we brelse j_header_bh
   1895	 */
   1896	release_journal_dev(sb, journal);
   1897	vfree(journal);
   1898}
   1899
   1900/*
   1901 * call on unmount.  Only set error to 1 if you haven't made your way out
   1902 * of read_super() yet.  Any other caller must keep error at 0.
   1903 */
   1904static int do_journal_release(struct reiserfs_transaction_handle *th,
   1905			      struct super_block *sb, int error)
   1906{
   1907	struct reiserfs_transaction_handle myth;
   1908	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   1909
   1910	/*
   1911	 * we only want to flush out transactions if we were
   1912	 * called with error == 0
   1913	 */
   1914	if (!error && !sb_rdonly(sb)) {
   1915		/* end the current trans */
   1916		BUG_ON(!th->t_trans_id);
   1917		do_journal_end(th, FLUSH_ALL);
   1918
   1919		/*
   1920		 * make sure something gets logged to force
   1921		 * our way into the flush code
   1922		 */
   1923		if (!journal_join(&myth, sb)) {
   1924			reiserfs_prepare_for_journal(sb,
   1925						     SB_BUFFER_WITH_SB(sb),
   1926						     1);
   1927			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
   1928			do_journal_end(&myth, FLUSH_ALL);
   1929		}
   1930	}
   1931
   1932	/* this also catches errors during the do_journal_end above */
   1933	if (!error && reiserfs_is_journal_aborted(journal)) {
   1934		memset(&myth, 0, sizeof(myth));
   1935		if (!journal_join_abort(&myth, sb)) {
   1936			reiserfs_prepare_for_journal(sb,
   1937						     SB_BUFFER_WITH_SB(sb),
   1938						     1);
   1939			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
   1940			do_journal_end(&myth, FLUSH_ALL);
   1941		}
   1942	}
   1943
   1944
   1945	/*
   1946	 * We must release the write lock here because
    1947	 * the workqueue job (flush_async_commits) needs this lock
   1948	 */
   1949	reiserfs_write_unlock(sb);
   1950
   1951	/*
    1952	 * Cancel flushing of old commits. Note that neither of these work
    1953	 * items will be requeued because the superblock is being shut down
    1954	 * and doesn't have SB_ACTIVE set.
   1955	 */
   1956	reiserfs_cancel_old_flush(sb);
   1957	/* wait for all commits to finish */
   1958	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
   1959
   1960	free_journal_ram(sb);
   1961
   1962	reiserfs_write_lock(sb);
   1963
   1964	return 0;
   1965}
   1966
    1967/* call on unmount.  flush all journal trans, release all alloc'd ram */
   1968int journal_release(struct reiserfs_transaction_handle *th,
   1969		    struct super_block *sb)
   1970{
   1971	return do_journal_release(th, sb, 0);
   1972}
   1973
   1974/* only call from an error condition inside reiserfs_read_super!  */
   1975int journal_release_error(struct reiserfs_transaction_handle *th,
   1976			  struct super_block *sb)
   1977{
   1978	return do_journal_release(th, sb, 1);
   1979}
   1980
   1981/*
   1982 * compares description block with commit block.
   1983 * returns 1 if they differ, 0 if they are the same
   1984 */
   1985static int journal_compare_desc_commit(struct super_block *sb,
   1986				       struct reiserfs_journal_desc *desc,
   1987				       struct reiserfs_journal_commit *commit)
   1988{
   1989	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
   1990	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
   1991	    get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
   1992	    get_commit_trans_len(commit) <= 0) {
   1993		return 1;
   1994	}
   1995	return 0;
   1996}
   1997
   1998/*
   1999 * returns 0 if it did not find a description block
   2000 * returns -1 if it found a corrupt commit block
   2001 * returns 1 if both desc and commit were valid
   2002 * NOTE: only called during fs mount
   2003 */
   2004static int journal_transaction_is_valid(struct super_block *sb,
   2005					struct buffer_head *d_bh,
   2006					unsigned int *oldest_invalid_trans_id,
   2007					unsigned long *newest_mount_id)
   2008{
   2009	struct reiserfs_journal_desc *desc;
   2010	struct reiserfs_journal_commit *commit;
   2011	struct buffer_head *c_bh;
   2012	unsigned long offset;
   2013
   2014	if (!d_bh)
   2015		return 0;
   2016
   2017	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
   2018	if (get_desc_trans_len(desc) > 0
   2019	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
   2020		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
   2021		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
   2022			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2023				       "journal-986: transaction "
   2024				       "is valid returning because trans_id %d is greater than "
   2025				       "oldest_invalid %lu",
   2026				       get_desc_trans_id(desc),
   2027				       *oldest_invalid_trans_id);
   2028			return 0;
   2029		}
   2030		if (newest_mount_id
   2031		    && *newest_mount_id > get_desc_mount_id(desc)) {
   2032			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2033				       "journal-1087: transaction "
   2034				       "is valid returning because mount_id %d is less than "
   2035				       "newest_mount_id %lu",
   2036				       get_desc_mount_id(desc),
   2037				       *newest_mount_id);
   2038			return -1;
   2039		}
   2040		if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
   2041			reiserfs_warning(sb, "journal-2018",
   2042					 "Bad transaction length %d "
   2043					 "encountered, ignoring transaction",
   2044					 get_desc_trans_len(desc));
   2045			return -1;
   2046		}
   2047		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
   2048
   2049		/*
   2050		 * ok, we have a journal description block,
   2051		 * let's see if the transaction was valid
   2052		 */
   2053		c_bh =
   2054		    journal_bread(sb,
   2055				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2056				  ((offset + get_desc_trans_len(desc) +
   2057				    1) % SB_ONDISK_JOURNAL_SIZE(sb)));
   2058		if (!c_bh)
   2059			return 0;
   2060		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
   2061		if (journal_compare_desc_commit(sb, desc, commit)) {
   2062			reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2063				       "journal_transaction_is_valid, commit offset %ld had bad "
   2064				       "time %d or length %d",
   2065				       c_bh->b_blocknr -
   2066				       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2067				       get_commit_trans_id(commit),
   2068				       get_commit_trans_len(commit));
   2069			brelse(c_bh);
   2070			if (oldest_invalid_trans_id) {
   2071				*oldest_invalid_trans_id =
   2072				    get_desc_trans_id(desc);
   2073				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2074					       "journal-1004: "
   2075					       "transaction_is_valid setting oldest invalid trans_id "
   2076					       "to %d",
   2077					       get_desc_trans_id(desc));
   2078			}
   2079			return -1;
   2080		}
   2081		brelse(c_bh);
   2082		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2083			       "journal-1006: found valid "
   2084			       "transaction start offset %llu, len %d id %d",
   2085			       d_bh->b_blocknr -
   2086			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2087			       get_desc_trans_len(desc),
   2088			       get_desc_trans_id(desc));
   2089		return 1;
   2090	} else {
   2091		return 0;
   2092	}
   2093}
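
/*
 * Illustrative sketch, not part of the original code: the journal is a
 * circular log, so every offset wraps modulo SB_ONDISK_JOURNAL_SIZE().
 * For a transaction whose description block sits at `offset` and logs
 * `len` blocks, the commit block lives at (offset + len + 1) and the next
 * transaction starts at (offset + len + 2), both taken mod the journal
 * size -- the same arithmetic used above and during replay below.  The
 * helper name is hypothetical.
 */
static inline unsigned long demo_commit_block_offset(struct super_block *sb,
						     unsigned long offset,
						     unsigned long len)
{
	return (offset + len + 1) % SB_ONDISK_JOURNAL_SIZE(sb);
}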
   2094
   2095static void brelse_array(struct buffer_head **heads, int num)
   2096{
   2097	int i;
   2098	for (i = 0; i < num; i++) {
   2099		brelse(heads[i]);
   2100	}
   2101}
   2102
   2103/*
   2104 * given the start, and values for the oldest acceptable transactions,
    2105 * this either reads in and replays a transaction, or returns because the
    2106 * transaction is invalid or too old.
   2107 * NOTE: only called during fs mount
   2108 */
   2109static int journal_read_transaction(struct super_block *sb,
   2110				    unsigned long cur_dblock,
   2111				    unsigned long oldest_start,
   2112				    unsigned int oldest_trans_id,
   2113				    unsigned long newest_mount_id)
   2114{
   2115	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   2116	struct reiserfs_journal_desc *desc;
   2117	struct reiserfs_journal_commit *commit;
   2118	unsigned int trans_id = 0;
   2119	struct buffer_head *c_bh;
   2120	struct buffer_head *d_bh;
   2121	struct buffer_head **log_blocks = NULL;
   2122	struct buffer_head **real_blocks = NULL;
   2123	unsigned int trans_offset;
   2124	int i;
   2125	int trans_half;
   2126
   2127	d_bh = journal_bread(sb, cur_dblock);
   2128	if (!d_bh)
   2129		return 1;
   2130	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
   2131	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
   2132	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
   2133		       "journal_read_transaction, offset %llu, len %d mount_id %d",
   2134		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2135		       get_desc_trans_len(desc), get_desc_mount_id(desc));
   2136	if (get_desc_trans_id(desc) < oldest_trans_id) {
   2137		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
   2138			       "journal_read_trans skipping because %lu is too old",
   2139			       cur_dblock -
   2140			       SB_ONDISK_JOURNAL_1st_BLOCK(sb));
   2141		brelse(d_bh);
   2142		return 1;
   2143	}
   2144	if (get_desc_mount_id(desc) != newest_mount_id) {
   2145		reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
   2146			       "journal_read_trans skipping because %d is != "
   2147			       "newest_mount_id %lu", get_desc_mount_id(desc),
   2148			       newest_mount_id);
   2149		brelse(d_bh);
   2150		return 1;
   2151	}
   2152	c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2153			     ((trans_offset + get_desc_trans_len(desc) + 1) %
   2154			      SB_ONDISK_JOURNAL_SIZE(sb)));
   2155	if (!c_bh) {
   2156		brelse(d_bh);
   2157		return 1;
   2158	}
   2159	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
   2160	if (journal_compare_desc_commit(sb, desc, commit)) {
   2161		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2162			       "journal_read_transaction, "
   2163			       "commit offset %llu had bad time %d or length %d",
   2164			       c_bh->b_blocknr -
   2165			       SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2166			       get_commit_trans_id(commit),
   2167			       get_commit_trans_len(commit));
   2168		brelse(c_bh);
   2169		brelse(d_bh);
   2170		return 1;
   2171	}
   2172
   2173	if (bdev_read_only(sb->s_bdev)) {
   2174		reiserfs_warning(sb, "clm-2076",
   2175				 "device is readonly, unable to replay log");
   2176		brelse(c_bh);
   2177		brelse(d_bh);
   2178		return -EROFS;
   2179	}
   2180
   2181	trans_id = get_desc_trans_id(desc);
   2182	/*
   2183	 * now we know we've got a good transaction, and it was
   2184	 * inside the valid time ranges
   2185	 */
   2186	log_blocks = kmalloc_array(get_desc_trans_len(desc),
   2187				   sizeof(struct buffer_head *),
   2188				   GFP_NOFS);
   2189	real_blocks = kmalloc_array(get_desc_trans_len(desc),
   2190				    sizeof(struct buffer_head *),
   2191				    GFP_NOFS);
   2192	if (!log_blocks || !real_blocks) {
   2193		brelse(c_bh);
   2194		brelse(d_bh);
   2195		kfree(log_blocks);
   2196		kfree(real_blocks);
   2197		reiserfs_warning(sb, "journal-1169",
   2198				 "kmalloc failed, unable to mount FS");
   2199		return -1;
   2200	}
   2201	/* get all the buffer heads */
   2202	trans_half = journal_trans_half(sb->s_blocksize);
   2203	for (i = 0; i < get_desc_trans_len(desc); i++) {
   2204		log_blocks[i] =
   2205		    journal_getblk(sb,
   2206				   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2207				   (trans_offset + 1 +
   2208				    i) % SB_ONDISK_JOURNAL_SIZE(sb));
   2209		if (i < trans_half) {
   2210			real_blocks[i] =
   2211			    sb_getblk(sb,
   2212				      le32_to_cpu(desc->j_realblock[i]));
   2213		} else {
   2214			real_blocks[i] =
   2215			    sb_getblk(sb,
   2216				      le32_to_cpu(commit->
   2217						  j_realblock[i - trans_half]));
   2218		}
   2219		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
   2220			reiserfs_warning(sb, "journal-1207",
   2221					 "REPLAY FAILURE fsck required! "
   2222					 "Block to replay is outside of "
   2223					 "filesystem");
   2224			goto abort_replay;
   2225		}
   2226		/* make sure we don't try to replay onto log or reserved area */
   2227		if (is_block_in_log_or_reserved_area
   2228		    (sb, real_blocks[i]->b_blocknr)) {
   2229			reiserfs_warning(sb, "journal-1204",
   2230					 "REPLAY FAILURE fsck required! "
   2231					 "Trying to replay onto a log block");
   2232abort_replay:
   2233			brelse_array(log_blocks, i);
   2234			brelse_array(real_blocks, i);
   2235			brelse(c_bh);
   2236			brelse(d_bh);
   2237			kfree(log_blocks);
   2238			kfree(real_blocks);
   2239			return -1;
   2240		}
   2241	}
   2242	/* read in the log blocks, memcpy to the corresponding real block */
   2243	ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks);
   2244	for (i = 0; i < get_desc_trans_len(desc); i++) {
   2245
   2246		wait_on_buffer(log_blocks[i]);
   2247		if (!buffer_uptodate(log_blocks[i])) {
   2248			reiserfs_warning(sb, "journal-1212",
   2249					 "REPLAY FAILURE fsck required! "
    2250					 "buffer read failed");
   2251			brelse_array(log_blocks + i,
   2252				     get_desc_trans_len(desc) - i);
   2253			brelse_array(real_blocks, get_desc_trans_len(desc));
   2254			brelse(c_bh);
   2255			brelse(d_bh);
   2256			kfree(log_blocks);
   2257			kfree(real_blocks);
   2258			return -1;
   2259		}
   2260		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
   2261		       real_blocks[i]->b_size);
   2262		set_buffer_uptodate(real_blocks[i]);
   2263		brelse(log_blocks[i]);
   2264	}
   2265	/* flush out the real blocks */
   2266	for (i = 0; i < get_desc_trans_len(desc); i++) {
   2267		set_buffer_dirty(real_blocks[i]);
   2268		write_dirty_buffer(real_blocks[i], 0);
   2269	}
   2270	for (i = 0; i < get_desc_trans_len(desc); i++) {
   2271		wait_on_buffer(real_blocks[i]);
   2272		if (!buffer_uptodate(real_blocks[i])) {
   2273			reiserfs_warning(sb, "journal-1226",
   2274					 "REPLAY FAILURE, fsck required! "
   2275					 "buffer write failed");
   2276			brelse_array(real_blocks + i,
   2277				     get_desc_trans_len(desc) - i);
   2278			brelse(c_bh);
   2279			brelse(d_bh);
   2280			kfree(log_blocks);
   2281			kfree(real_blocks);
   2282			return -1;
   2283		}
   2284		brelse(real_blocks[i]);
   2285	}
   2286	cur_dblock =
   2287	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2288	    ((trans_offset + get_desc_trans_len(desc) +
   2289	      2) % SB_ONDISK_JOURNAL_SIZE(sb));
   2290	reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2291		       "journal-1095: setting journal " "start to offset %ld",
   2292		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
   2293
   2294	/*
   2295	 * init starting values for the first transaction, in case
   2296	 * this is the last transaction to be replayed.
   2297	 */
   2298	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
   2299	journal->j_last_flush_trans_id = trans_id;
   2300	journal->j_trans_id = trans_id + 1;
   2301	/* check for trans_id overflow */
   2302	if (journal->j_trans_id == 0)
   2303		journal->j_trans_id = 10;
   2304	brelse(c_bh);
   2305	brelse(d_bh);
   2306	kfree(log_blocks);
   2307	kfree(real_blocks);
   2308	return 0;
   2309}
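
/*
 * Illustrative sketch, hypothetical helper (not part of the original code):
 * during the replay loop above, the "real" (home-location) block numbers of
 * a transaction are split between the description and commit blocks.  The
 * first trans_half entries, with trans_half taken from
 * journal_trans_half(blocksize), live in desc->j_realblock[]; the remainder
 * live in commit->j_realblock[].
 */
static inline __le32 demo_real_blocknr(struct reiserfs_journal_desc *desc,
				       struct reiserfs_journal_commit *commit,
				       int i, int trans_half)
{
	/* logged block i maps to the desc array first, then the commit array */
	return i < trans_half ? desc->j_realblock[i]
			      : commit->j_realblock[i - trans_half];
}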
   2310
   2311/*
    2312 * This function reads blocks of bufsize size, from block up to max_block
    2313 * (but no more than BUFNR blocks at a time). This proved to improve
   2314 * mounting speed on self-rebuilding raid5 arrays at least.
   2315 * Right now it is only used from journal code. But later we might use it
   2316 * from other places.
   2317 * Note: Do not use journal_getblk/sb_getblk functions here!
   2318 */
   2319static struct buffer_head *reiserfs_breada(struct block_device *dev,
   2320					   b_blocknr_t block, int bufsize,
   2321					   b_blocknr_t max_block)
   2322{
   2323	struct buffer_head *bhlist[BUFNR];
   2324	unsigned int blocks = BUFNR;
   2325	struct buffer_head *bh;
   2326	int i, j;
   2327
   2328	bh = __getblk(dev, block, bufsize);
   2329	if (buffer_uptodate(bh))
    2330		return bh;
   2331
   2332	if (block + BUFNR > max_block) {
   2333		blocks = max_block - block;
   2334	}
   2335	bhlist[0] = bh;
   2336	j = 1;
   2337	for (i = 1; i < blocks; i++) {
   2338		bh = __getblk(dev, block + i, bufsize);
   2339		if (buffer_uptodate(bh)) {
   2340			brelse(bh);
   2341			break;
   2342		} else
   2343			bhlist[j++] = bh;
   2344	}
   2345	ll_rw_block(REQ_OP_READ, 0, j, bhlist);
   2346	for (i = 1; i < j; i++)
   2347		brelse(bhlist[i]);
   2348	bh = bhlist[0];
   2349	wait_on_buffer(bh);
   2350	if (buffer_uptodate(bh))
   2351		return bh;
   2352	brelse(bh);
   2353	return NULL;
   2354}
   2355
   2356/*
   2357 * read and replay the log
    2358 * on a clean unmount, the journal header's next unflushed pointer points
   2359 * to an invalid transaction.  This tests that before finding all the
   2360 * transactions in the log, which makes normal mount times fast.
   2361 *
   2362 * After a crash, this starts with the next unflushed transaction, and
   2363 * replays until it finds one too old, or invalid.
   2364 *
   2365 * On exit, it sets things up so the first transaction will work correctly.
   2366 * NOTE: only called during fs mount
   2367 */
   2368static int journal_read(struct super_block *sb)
   2369{
   2370	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   2371	struct reiserfs_journal_desc *desc;
   2372	unsigned int oldest_trans_id = 0;
   2373	unsigned int oldest_invalid_trans_id = 0;
   2374	time64_t start;
   2375	unsigned long oldest_start = 0;
   2376	unsigned long cur_dblock = 0;
   2377	unsigned long newest_mount_id = 9;
   2378	struct buffer_head *d_bh;
   2379	struct reiserfs_journal_header *jh;
   2380	int valid_journal_header = 0;
   2381	int replay_count = 0;
   2382	int continue_replay = 1;
   2383	int ret;
   2384
   2385	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
   2386	reiserfs_info(sb, "checking transaction log (%pg)\n",
   2387		      journal->j_dev_bd);
   2388	start = ktime_get_seconds();
   2389
   2390	/*
   2391	 * step 1, read in the journal header block.  Check the transaction
   2392	 * it says is the first unflushed, and if that transaction is not
   2393	 * valid, replay is done
   2394	 */
   2395	journal->j_header_bh = journal_bread(sb,
   2396					     SB_ONDISK_JOURNAL_1st_BLOCK(sb)
   2397					     + SB_ONDISK_JOURNAL_SIZE(sb));
   2398	if (!journal->j_header_bh) {
   2399		return 1;
   2400	}
   2401	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
   2402	if (le32_to_cpu(jh->j_first_unflushed_offset) <
   2403	    SB_ONDISK_JOURNAL_SIZE(sb)
   2404	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
   2405		oldest_start =
   2406		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2407		    le32_to_cpu(jh->j_first_unflushed_offset);
   2408		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
   2409		newest_mount_id = le32_to_cpu(jh->j_mount_id);
   2410		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2411			       "journal-1153: found in "
   2412			       "header: first_unflushed_offset %d, last_flushed_trans_id "
   2413			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
   2414			       le32_to_cpu(jh->j_last_flush_trans_id));
   2415		valid_journal_header = 1;
   2416
   2417		/*
   2418		 * now, we try to read the first unflushed offset.  If it
   2419		 * is not valid, there is nothing more we can do, and it
   2420		 * makes no sense to read through the whole log.
   2421		 */
   2422		d_bh =
   2423		    journal_bread(sb,
   2424				  SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2425				  le32_to_cpu(jh->j_first_unflushed_offset));
   2426		ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
   2427		if (!ret) {
   2428			continue_replay = 0;
   2429		}
   2430		brelse(d_bh);
   2431		goto start_log_replay;
   2432	}
   2433
   2434	/*
   2435	 * ok, there are transactions that need to be replayed.  start
   2436	 * with the first log block, find all the valid transactions, and
   2437	 * pick out the oldest.
   2438	 */
   2439	while (continue_replay
   2440	       && cur_dblock <
   2441	       (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2442		SB_ONDISK_JOURNAL_SIZE(sb))) {
   2443		/*
    2444		 * Note that the blocksize of the primary fs device and the
    2445		 * journal device must be the same
   2446		 */
   2447		d_bh =
   2448		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
   2449				    sb->s_blocksize,
   2450				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2451				    SB_ONDISK_JOURNAL_SIZE(sb));
   2452		ret =
   2453		    journal_transaction_is_valid(sb, d_bh,
   2454						 &oldest_invalid_trans_id,
   2455						 &newest_mount_id);
   2456		if (ret == 1) {
   2457			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
   2458			if (oldest_start == 0) {	/* init all oldest_ values */
   2459				oldest_trans_id = get_desc_trans_id(desc);
   2460				oldest_start = d_bh->b_blocknr;
   2461				newest_mount_id = get_desc_mount_id(desc);
   2462				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2463					       "journal-1179: Setting "
   2464					       "oldest_start to offset %llu, trans_id %lu",
   2465					       oldest_start -
   2466					       SB_ONDISK_JOURNAL_1st_BLOCK
   2467					       (sb), oldest_trans_id);
   2468			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
   2469				/* one we just read was older */
   2470				oldest_trans_id = get_desc_trans_id(desc);
   2471				oldest_start = d_bh->b_blocknr;
   2472				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2473					       "journal-1180: Resetting "
   2474					       "oldest_start to offset %lu, trans_id %lu",
   2475					       oldest_start -
   2476					       SB_ONDISK_JOURNAL_1st_BLOCK
   2477					       (sb), oldest_trans_id);
   2478			}
   2479			if (newest_mount_id < get_desc_mount_id(desc)) {
   2480				newest_mount_id = get_desc_mount_id(desc);
   2481				reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2482					       "journal-1299: Setting "
   2483					       "newest_mount_id to %d",
   2484					       get_desc_mount_id(desc));
   2485			}
   2486			cur_dblock += get_desc_trans_len(desc) + 2;
   2487		} else {
   2488			cur_dblock++;
   2489		}
   2490		brelse(d_bh);
   2491	}
   2492
   2493start_log_replay:
   2494	cur_dblock = oldest_start;
   2495	if (oldest_trans_id) {
   2496		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2497			       "journal-1206: Starting replay "
   2498			       "from offset %llu, trans_id %lu",
   2499			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2500			       oldest_trans_id);
   2501
   2502	}
   2503	replay_count = 0;
   2504	while (continue_replay && oldest_trans_id > 0) {
   2505		ret =
   2506		    journal_read_transaction(sb, cur_dblock, oldest_start,
   2507					     oldest_trans_id, newest_mount_id);
   2508		if (ret < 0) {
   2509			return ret;
   2510		} else if (ret != 0) {
   2511			break;
   2512		}
   2513		cur_dblock =
   2514		    SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
   2515		replay_count++;
   2516		if (cur_dblock == oldest_start)
   2517			break;
   2518	}
   2519
   2520	if (oldest_trans_id == 0) {
   2521		reiserfs_debug(sb, REISERFS_DEBUG_CODE,
   2522			       "journal-1225: No valid " "transactions found");
   2523	}
   2524	/*
   2525	 * j_start does not get set correctly if we don't replay any
   2526	 * transactions.  if we had a valid journal_header, set j_start
   2527	 * to the first unflushed transaction value, copy the trans_id
   2528	 * from the header
   2529	 */
   2530	if (valid_journal_header && replay_count == 0) {
   2531		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
   2532		journal->j_trans_id =
   2533		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
   2534		/* check for trans_id overflow */
   2535		if (journal->j_trans_id == 0)
   2536			journal->j_trans_id = 10;
   2537		journal->j_last_flush_trans_id =
   2538		    le32_to_cpu(jh->j_last_flush_trans_id);
   2539		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
   2540	} else {
   2541		journal->j_mount_id = newest_mount_id + 1;
   2542	}
   2543	reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
   2544		       "newest_mount_id to %lu", journal->j_mount_id);
   2545	journal->j_first_unflushed_offset = journal->j_start;
   2546	if (replay_count > 0) {
   2547		reiserfs_info(sb,
   2548			      "replayed %d transactions in %lu seconds\n",
   2549			      replay_count, ktime_get_seconds() - start);
   2550	}
   2551	/* needed to satisfy the locking in _update_journal_header_block */
   2552	reiserfs_write_lock(sb);
   2553	if (!bdev_read_only(sb->s_bdev) &&
   2554	    _update_journal_header_block(sb, journal->j_start,
   2555					 journal->j_last_flush_trans_id)) {
   2556		reiserfs_write_unlock(sb);
   2557		/*
   2558		 * replay failed, caller must call free_journal_ram and abort
   2559		 * the mount
   2560		 */
   2561		return -1;
   2562	}
   2563	reiserfs_write_unlock(sb);
   2564	return 0;
   2565}
   2566
   2567static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
   2568{
   2569	struct reiserfs_journal_list *jl;
   2570	jl = kzalloc(sizeof(struct reiserfs_journal_list),
   2571		     GFP_NOFS | __GFP_NOFAIL);
   2572	INIT_LIST_HEAD(&jl->j_list);
   2573	INIT_LIST_HEAD(&jl->j_working_list);
   2574	INIT_LIST_HEAD(&jl->j_tail_bh_list);
   2575	INIT_LIST_HEAD(&jl->j_bh_list);
   2576	mutex_init(&jl->j_commit_mutex);
   2577	SB_JOURNAL(s)->j_num_lists++;
   2578	get_journal_list(jl);
   2579	return jl;
   2580}
   2581
   2582static void journal_list_init(struct super_block *sb)
   2583{
   2584	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
   2585}
   2586
   2587static void release_journal_dev(struct super_block *super,
   2588			       struct reiserfs_journal *journal)
   2589{
   2590	if (journal->j_dev_bd != NULL) {
   2591		blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
   2592		journal->j_dev_bd = NULL;
   2593	}
   2594}
   2595
   2596static int journal_init_dev(struct super_block *super,
   2597			    struct reiserfs_journal *journal,
   2598			    const char *jdev_name)
   2599{
   2600	int result;
   2601	dev_t jdev;
   2602	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
   2603
   2604	result = 0;
   2605
   2606	journal->j_dev_bd = NULL;
   2607	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
   2608	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
   2609
   2610	if (bdev_read_only(super->s_bdev))
   2611		blkdev_mode = FMODE_READ;
   2612
    2613	/* no "jdev" mount option: take the journal device from the super block */
   2614	if ((!jdev_name || !jdev_name[0])) {
   2615		if (jdev == super->s_dev)
   2616			blkdev_mode &= ~FMODE_EXCL;
   2617		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
   2618						      journal);
   2619		journal->j_dev_mode = blkdev_mode;
   2620		if (IS_ERR(journal->j_dev_bd)) {
   2621			result = PTR_ERR(journal->j_dev_bd);
   2622			journal->j_dev_bd = NULL;
   2623			reiserfs_warning(super, "sh-458",
   2624					 "cannot init journal device unknown-block(%u,%u): %i",
   2625					 MAJOR(jdev), MINOR(jdev), result);
   2626			return result;
   2627		} else if (jdev != super->s_dev)
   2628			set_blocksize(journal->j_dev_bd, super->s_blocksize);
   2629
   2630		return 0;
   2631	}
   2632
   2633	journal->j_dev_mode = blkdev_mode;
   2634	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
   2635	if (IS_ERR(journal->j_dev_bd)) {
   2636		result = PTR_ERR(journal->j_dev_bd);
   2637		journal->j_dev_bd = NULL;
   2638		reiserfs_warning(super, "sh-457",
   2639				 "journal_init_dev: Cannot open '%s': %i",
   2640				 jdev_name, result);
   2641		return result;
   2642	}
   2643
   2644	set_blocksize(journal->j_dev_bd, super->s_blocksize);
   2645	reiserfs_info(super,
   2646		      "journal_init_dev: journal device: %pg\n",
   2647		      journal->j_dev_bd);
   2648	return 0;
   2649}
   2650
   2651/*
   2652 * When creating/tuning a file system user can assign some
   2653 * journal params within boundaries which depend on the ratio
   2654 * blocksize/standard_blocksize.
   2655 *
    2656 * For blocks >= standard_blocksize the transaction size should
    2657 * be not less than JOURNAL_TRANS_MIN_DEFAULT, and not more
    2658 * than JOURNAL_TRANS_MAX_DEFAULT.
   2659 *
   2660 * For blocks < standard_blocksize these boundaries should be
   2661 * decreased proportionally.
   2662 */
   2663#define REISERFS_STANDARD_BLKSIZE (4096)
   2664
   2665static int check_advise_trans_params(struct super_block *sb,
   2666				     struct reiserfs_journal *journal)
   2667{
    2668	if (journal->j_trans_max) {
    2669		/* Non-default journal params.  Do sanity check for them. */
    2670		int ratio = 1;
    2671		if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
    2672			ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
   2673
   2674		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
   2675		    journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
   2676		    SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
   2677		    JOURNAL_MIN_RATIO) {
   2678			reiserfs_warning(sb, "sh-462",
   2679					 "bad transaction max size (%u). "
   2680					 "FSCK?", journal->j_trans_max);
   2681			return 1;
   2682		}
    2683		if (journal->j_max_batch != (journal->j_trans_max) *
    2684		    JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT) {
   2685			reiserfs_warning(sb, "sh-463",
   2686					 "bad transaction max batch (%u). "
   2687					 "FSCK?", journal->j_max_batch);
   2688			return 1;
   2689		}
   2690	} else {
   2691		/*
   2692		 * Default journal params.
   2693		 * The file system was created by old version
   2694		 * of mkreiserfs, so some fields contain zeros,
   2695		 * and we need to advise proper values for them
   2696		 */
   2697		if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
   2698			reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
   2699					 sb->s_blocksize);
   2700			return 1;
   2701		}
   2702		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
   2703		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
   2704		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
   2705	}
   2706	return 0;
   2707}
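
/*
 * Worked example, hypothetical helper (not in the original code): the same
 * bounds check as above, written out for a single value.  With
 * sb->s_blocksize == 1024 the ratio is REISERFS_STANDARD_BLKSIZE / 1024 == 4,
 * so j_trans_max must fall within
 * [JOURNAL_TRANS_MIN_DEFAULT / 4, JOURNAL_TRANS_MAX_DEFAULT / 4] while
 * keeping SB_ONDISK_JOURNAL_SIZE(sb) / j_trans_max >= JOURNAL_MIN_RATIO.
 */
static inline int demo_trans_max_in_bounds(struct super_block *sb,
					   unsigned int trans_max)
{
	int ratio = 1;

	if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
		ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;

	/* the three conditions check_advise_trans_params() rejects, inverted */
	return trans_max <= JOURNAL_TRANS_MAX_DEFAULT / ratio &&
	       trans_max >= JOURNAL_TRANS_MIN_DEFAULT / ratio &&
	       SB_ONDISK_JOURNAL_SIZE(sb) / trans_max >= JOURNAL_MIN_RATIO;
}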
   2708
   2709/* must be called once on fs mount.  calls journal_read for you */
   2710int journal_init(struct super_block *sb, const char *j_dev_name,
   2711		 int old_format, unsigned int commit_max_age)
   2712{
   2713	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
   2714	struct buffer_head *bhjh;
   2715	struct reiserfs_super_block *rs;
   2716	struct reiserfs_journal_header *jh;
   2717	struct reiserfs_journal *journal;
   2718	struct reiserfs_journal_list *jl;
   2719	int ret;
   2720
   2721	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
   2722	if (!journal) {
   2723		reiserfs_warning(sb, "journal-1256",
   2724				 "unable to get memory for journal structure");
   2725		return 1;
   2726	}
   2727	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
   2728	INIT_LIST_HEAD(&journal->j_prealloc_list);
   2729	INIT_LIST_HEAD(&journal->j_working_list);
   2730	INIT_LIST_HEAD(&journal->j_journal_list);
   2731	journal->j_persistent_trans = 0;
   2732	if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
   2733					   reiserfs_bmap_count(sb)))
   2734		goto free_and_return;
   2735
   2736	allocate_bitmap_nodes(sb);
   2737
   2738	/* reserved for journal area support */
   2739	SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
   2740						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
   2741						 / sb->s_blocksize +
   2742						 reiserfs_bmap_count(sb) +
   2743						 1 :
   2744						 REISERFS_DISK_OFFSET_IN_BYTES /
   2745						 sb->s_blocksize + 2);
   2746
   2747	/*
    2748	 * Sanity check to see if the standard journal fits
    2749	 * within the first bitmap block (relevant for small blocksizes)
   2750	 */
   2751	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
   2752	    (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
   2753	     SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
   2754		reiserfs_warning(sb, "journal-1393",
    2755				 "journal does not fit into the area addressed "
    2756				 "by the first bitmap block. It starts at "
    2757				 "%u and its size is %u. Block size %ld",
   2758				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
   2759				 SB_ONDISK_JOURNAL_SIZE(sb),
   2760				 sb->s_blocksize);
   2761		goto free_and_return;
   2762	}
   2763
   2764	/*
   2765	 * Sanity check to see if journal first block is correct.
    2766	 * If the journal's first block is invalid it can cause
    2767	 * zeroing of important superblock members.
   2768	 */
   2769	if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
   2770	    SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
   2771		reiserfs_warning(sb, "journal-1393",
    2772				 "journal 1st block is invalid: 1st reserved block %d, but actual 1st block is %d",
   2773				 SB_JOURNAL_1st_RESERVED_BLOCK(sb),
   2774				 SB_ONDISK_JOURNAL_1st_BLOCK(sb));
   2775		goto free_and_return;
   2776	}
   2777
   2778	if (journal_init_dev(sb, journal, j_dev_name) != 0) {
   2779		reiserfs_warning(sb, "sh-462",
   2780				 "unable to initialize journal device");
   2781		goto free_and_return;
   2782	}
   2783
   2784	rs = SB_DISK_SUPER_BLOCK(sb);
   2785
   2786	/* read journal header */
   2787	bhjh = journal_bread(sb,
   2788			     SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   2789			     SB_ONDISK_JOURNAL_SIZE(sb));
   2790	if (!bhjh) {
   2791		reiserfs_warning(sb, "sh-459",
   2792				 "unable to read journal header");
   2793		goto free_and_return;
   2794	}
   2795	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
   2796
    2797	/* make sure that the journal matches the super block */
   2798	if (is_reiserfs_jr(rs)
   2799	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
   2800		sb_jp_journal_magic(rs))) {
   2801		reiserfs_warning(sb, "sh-460",
   2802				 "journal header magic %x (device %pg) does "
    2803				 "not match the magic found in super block %x",
   2804				 jh->jh_journal.jp_journal_magic,
   2805				 journal->j_dev_bd,
   2806				 sb_jp_journal_magic(rs));
   2807		brelse(bhjh);
   2808		goto free_and_return;
   2809	}
   2810
   2811	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
   2812	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
   2813	journal->j_max_commit_age =
   2814	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
   2815	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
   2816
   2817	if (check_advise_trans_params(sb, journal) != 0)
    2818		goto free_and_return;
   2819	journal->j_default_max_commit_age = journal->j_max_commit_age;
   2820
   2821	if (commit_max_age != 0) {
   2822		journal->j_max_commit_age = commit_max_age;
   2823		journal->j_max_trans_age = commit_max_age;
   2824	}
   2825
   2826	reiserfs_info(sb, "journal params: device %pg, size %u, "
   2827		      "journal first block %u, max trans len %u, max batch %u, "
   2828		      "max commit age %u, max trans age %u\n",
   2829		      journal->j_dev_bd,
   2830		      SB_ONDISK_JOURNAL_SIZE(sb),
   2831		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
   2832		      journal->j_trans_max,
   2833		      journal->j_max_batch,
   2834		      journal->j_max_commit_age, journal->j_max_trans_age);
   2835
   2836	brelse(bhjh);
   2837
   2838	journal->j_list_bitmap_index = 0;
   2839	journal_list_init(sb);
   2840
   2841	memset(journal->j_list_hash_table, 0,
   2842	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
   2843
   2844	INIT_LIST_HEAD(&journal->j_dirty_buffers);
   2845	spin_lock_init(&journal->j_dirty_buffers_lock);
   2846
   2847	journal->j_start = 0;
   2848	journal->j_len = 0;
   2849	journal->j_len_alloc = 0;
   2850	atomic_set(&journal->j_wcount, 0);
   2851	atomic_set(&journal->j_async_throttle, 0);
   2852	journal->j_bcount = 0;
   2853	journal->j_trans_start_time = 0;
   2854	journal->j_last = NULL;
   2855	journal->j_first = NULL;
   2856	init_waitqueue_head(&journal->j_join_wait);
   2857	mutex_init(&journal->j_mutex);
   2858	mutex_init(&journal->j_flush_mutex);
   2859
   2860	journal->j_trans_id = 10;
   2861	journal->j_mount_id = 10;
   2862	journal->j_state = 0;
   2863	atomic_set(&journal->j_jlock, 0);
   2864	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
   2865	journal->j_cnode_free_orig = journal->j_cnode_free_list;
   2866	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
   2867	journal->j_cnode_used = 0;
   2868	journal->j_must_wait = 0;
   2869
   2870	if (journal->j_cnode_free == 0) {
   2871		reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
   2872		                 "allocation failed (%ld bytes). Journal is "
   2873		                 "too large for available memory. Usually "
   2874		                 "this is due to a journal that is too large.",
   2875		                 sizeof (struct reiserfs_journal_cnode) * num_cnodes);
   2876        	goto free_and_return;
   2877	}
   2878
   2879	init_journal_hash(sb);
   2880	jl = journal->j_current_jl;
   2881
   2882	/*
   2883	 * get_list_bitmap() may call flush_commit_list() which
   2884	 * requires the lock. Calling flush_commit_list() shouldn't happen
   2885	 * this early but I like to be paranoid.
   2886	 */
   2887	reiserfs_write_lock(sb);
   2888	jl->j_list_bitmap = get_list_bitmap(sb, jl);
   2889	reiserfs_write_unlock(sb);
   2890	if (!jl->j_list_bitmap) {
   2891		reiserfs_warning(sb, "journal-2005",
   2892				 "get_list_bitmap failed for journal list 0");
   2893		goto free_and_return;
   2894	}
   2895
   2896	ret = journal_read(sb);
   2897	if (ret < 0) {
   2898		reiserfs_warning(sb, "reiserfs-2006",
   2899				 "Replay Failure, unable to mount");
   2900		goto free_and_return;
   2901	}
   2902
   2903	INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
   2904	journal->j_work_sb = sb;
   2905	return 0;
   2906free_and_return:
   2907	free_journal_ram(sb);
   2908	return 1;
   2909}
   2910
   2911/*
   2912 * test for a polite end of the current transaction.  Used by file_write,
   2913 * and should be used by delete to make sure they don't write more than
   2914 * can fit inside a single transaction
   2915 */
   2916int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
   2917				   int new_alloc)
   2918{
   2919	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
   2920	time64_t now = ktime_get_seconds();
   2921	/* cannot restart while nested */
   2922	BUG_ON(!th->t_trans_id);
   2923	if (th->t_refcount > 1)
   2924		return 0;
   2925	if (journal->j_must_wait > 0 ||
   2926	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
   2927	    atomic_read(&journal->j_jlock) ||
   2928	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
   2929	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
   2930		return 1;
   2931	}
   2932
   2933	journal->j_len_alloc += new_alloc;
    2934	th->t_blocks_allocated += new_alloc;
   2935	return 0;
   2936}
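
/*
 * Usage sketch, hypothetical caller (not part of the original code): a
 * writer that logs many blocks in steps polls
 * journal_transaction_should_end() and politely restarts its transaction
 * when asked, instead of overflowing j_trans_max.  Error handling is
 * abbreviated.
 */
static inline int demo_polite_restart(struct reiserfs_transaction_handle *th,
				      struct super_block *sb, int new_alloc)
{
	int err = 0;

	if (journal_transaction_should_end(th, new_alloc)) {
		/* end the batched transaction and begin a fresh one */
		err = journal_end(th);
		if (!err)
			err = journal_begin(th, sb, new_alloc);
	}
	return err;
}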
   2937
   2938/* this must be called inside a transaction */
   2939void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
   2940{
   2941	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
   2942	BUG_ON(!th->t_trans_id);
   2943	journal->j_must_wait = 1;
   2944	set_bit(J_WRITERS_BLOCKED, &journal->j_state);
   2945	return;
   2946}
   2947
   2948/* this must be called without a transaction started */
   2949void reiserfs_allow_writes(struct super_block *s)
   2950{
   2951	struct reiserfs_journal *journal = SB_JOURNAL(s);
   2952	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
   2953	wake_up(&journal->j_join_wait);
   2954}
   2955
   2956/* this must be called without a transaction started */
   2957void reiserfs_wait_on_write_block(struct super_block *s)
   2958{
   2959	struct reiserfs_journal *journal = SB_JOURNAL(s);
   2960	wait_event(journal->j_join_wait,
   2961		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
   2962}
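
/*
 * Usage sketch, not from the original code: the three helpers above form a
 * simple quiesce protocol.  reiserfs_block_writes() is called from inside a
 * transaction to set J_WRITERS_BLOCKED; would-be writers call
 * reiserfs_wait_on_write_block() with no transaction open and sleep on
 * j_join_wait until reiserfs_allow_writes() clears the bit and wakes them.
 * The helper name is hypothetical.
 */
static inline void demo_begin_when_allowed(struct super_block *s)
{
	/* must not hold a transaction here, or the blocker could deadlock */
	reiserfs_wait_on_write_block(s);

	/* writes are allowed again; a transaction may be started now */
}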
   2963
   2964static void queue_log_writer(struct super_block *s)
   2965{
   2966	wait_queue_entry_t wait;
   2967	struct reiserfs_journal *journal = SB_JOURNAL(s);
   2968	set_bit(J_WRITERS_QUEUED, &journal->j_state);
   2969
   2970	/*
   2971	 * we don't want to use wait_event here because
   2972	 * we only want to wait once.
   2973	 */
   2974	init_waitqueue_entry(&wait, current);
   2975	add_wait_queue(&journal->j_join_wait, &wait);
   2976	set_current_state(TASK_UNINTERRUPTIBLE);
   2977	if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
   2978		int depth = reiserfs_write_unlock_nested(s);
   2979		schedule();
   2980		reiserfs_write_lock_nested(s, depth);
   2981	}
   2982	__set_current_state(TASK_RUNNING);
   2983	remove_wait_queue(&journal->j_join_wait, &wait);
   2984}
   2985
   2986static void wake_queued_writers(struct super_block *s)
   2987{
   2988	struct reiserfs_journal *journal = SB_JOURNAL(s);
   2989	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
   2990		wake_up(&journal->j_join_wait);
   2991}
   2992
   2993static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
   2994{
   2995	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   2996	unsigned long bcount = journal->j_bcount;
   2997	while (1) {
   2998		int depth;
   2999
   3000		depth = reiserfs_write_unlock_nested(sb);
   3001		schedule_timeout_uninterruptible(1);
   3002		reiserfs_write_lock_nested(sb, depth);
   3003
   3004		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
   3005		while ((atomic_read(&journal->j_wcount) > 0 ||
   3006			atomic_read(&journal->j_jlock)) &&
   3007		       journal->j_trans_id == trans_id) {
   3008			queue_log_writer(sb);
   3009		}
   3010		if (journal->j_trans_id != trans_id)
   3011			break;
   3012		if (bcount == journal->j_bcount)
   3013			break;
   3014		bcount = journal->j_bcount;
   3015	}
   3016}
   3017
   3018/*
   3019 * join == true if you must join an existing transaction.
   3020 * join == false if you can deal with waiting for others to finish
   3021 *
   3022 * this will block until the transaction is joinable.  send the number of
   3023 * blocks you expect to use in nblocks.
    3024 */
   3025static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
   3026			      struct super_block *sb, unsigned long nblocks,
   3027			      int join)
   3028{
   3029	time64_t now = ktime_get_seconds();
   3030	unsigned int old_trans_id;
   3031	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3032	struct reiserfs_transaction_handle myth;
   3033	int sched_count = 0;
   3034	int retval;
   3035	int depth;
   3036
   3037	reiserfs_check_lock_depth(sb, "journal_begin");
   3038	BUG_ON(nblocks > journal->j_trans_max);
   3039
   3040	PROC_INFO_INC(sb, journal.journal_being);
   3041	/* set here for journal_join */
   3042	th->t_refcount = 1;
   3043	th->t_super = sb;
   3044
   3045relock:
   3046	lock_journal(sb);
   3047	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
   3048		unlock_journal(sb);
   3049		retval = journal->j_errno;
   3050		goto out_fail;
   3051	}
   3052	journal->j_bcount++;
   3053
   3054	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
   3055		unlock_journal(sb);
   3056		depth = reiserfs_write_unlock_nested(sb);
   3057		reiserfs_wait_on_write_block(sb);
   3058		reiserfs_write_lock_nested(sb, depth);
   3059		PROC_INFO_INC(sb, journal.journal_relock_writers);
   3060		goto relock;
   3061	}
   3062	now = ktime_get_seconds();
   3063
   3064	/*
   3065	 * if there is no room in the journal OR
   3066	 * if this transaction is too old, and we weren't called joinable,
    3067	 * wait for it to finish before beginning.  We don't sleep if there
    3068	 * aren't other writers.
   3069	 */
   3070
    3071	if ((!join && journal->j_must_wait > 0) ||
    3072	    (!join &&
    3073	     (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) ||
    3074	    (!join && atomic_read(&journal->j_wcount) > 0 &&
    3075	     journal->j_trans_start_time > 0 &&
    3076	     (now - journal->j_trans_start_time) >
    3077	     journal->j_max_trans_age) ||
    3078	    (!join && atomic_read(&journal->j_jlock)) ||
    3079	    (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
   3080
   3081		old_trans_id = journal->j_trans_id;
   3082		/* allow others to finish this transaction */
   3083		unlock_journal(sb);
   3084
   3085		if (!join && (journal->j_len_alloc + nblocks + 2) >=
   3086		    journal->j_max_batch &&
   3087		    ((journal->j_len + nblocks + 2) * 100) <
   3088		    (journal->j_len_alloc * 75)) {
   3089			if (atomic_read(&journal->j_wcount) > 10) {
   3090				sched_count++;
   3091				queue_log_writer(sb);
   3092				goto relock;
   3093			}
   3094		}
   3095		/*
   3096		 * don't mess with joining the transaction if all we
   3097		 * have to do is wait for someone else to do a commit
   3098		 */
   3099		if (atomic_read(&journal->j_jlock)) {
   3100			while (journal->j_trans_id == old_trans_id &&
   3101			       atomic_read(&journal->j_jlock)) {
   3102				queue_log_writer(sb);
   3103			}
   3104			goto relock;
   3105		}
   3106		retval = journal_join(&myth, sb);
   3107		if (retval)
   3108			goto out_fail;
   3109
   3110		/* someone might have ended the transaction while we joined */
   3111		if (old_trans_id != journal->j_trans_id) {
   3112			retval = do_journal_end(&myth, 0);
   3113		} else {
   3114			retval = do_journal_end(&myth, COMMIT_NOW);
   3115		}
   3116
   3117		if (retval)
   3118			goto out_fail;
   3119
   3120		PROC_INFO_INC(sb, journal.journal_relock_wcount);
   3121		goto relock;
   3122	}
   3123	/* we are the first writer, set trans_id */
   3124	if (journal->j_trans_start_time == 0) {
   3125		journal->j_trans_start_time = ktime_get_seconds();
   3126	}
   3127	atomic_inc(&journal->j_wcount);
   3128	journal->j_len_alloc += nblocks;
   3129	th->t_blocks_logged = 0;
   3130	th->t_blocks_allocated = nblocks;
   3131	th->t_trans_id = journal->j_trans_id;
   3132	unlock_journal(sb);
   3133	INIT_LIST_HEAD(&th->t_list);
   3134	return 0;
   3135
   3136out_fail:
   3137	memset(th, 0, sizeof(*th));
   3138	/*
   3139	 * Re-set th->t_super, so we can properly keep track of how many
   3140	 * persistent transactions there are. We need to do this so if this
   3141	 * call is part of a failed restart_transaction, we can free it later
   3142	 */
   3143	th->t_super = sb;
   3144	return retval;
   3145}
   3146
   3147struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
   3148								    super_block
   3149								    *s,
   3150								    int nblocks)
   3151{
   3152	int ret;
   3153	struct reiserfs_transaction_handle *th;
   3154
   3155	/*
    3156	 * if we're nesting into an existing transaction, it will be
   3157	 * persistent on its own
   3158	 */
   3159	if (reiserfs_transaction_running(s)) {
   3160		th = current->journal_info;
   3161		th->t_refcount++;
   3162		BUG_ON(th->t_refcount < 2);
   3163
   3164		return th;
   3165	}
   3166	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
   3167	if (!th)
   3168		return NULL;
   3169	ret = journal_begin(th, s, nblocks);
   3170	if (ret) {
   3171		kfree(th);
   3172		return NULL;
   3173	}
   3174
   3175	SB_JOURNAL(s)->j_persistent_trans++;
   3176	return th;
   3177}
   3178
   3179int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
   3180{
   3181	struct super_block *s = th->t_super;
   3182	int ret = 0;
   3183	if (th->t_trans_id)
   3184		ret = journal_end(th);
   3185	else
   3186		ret = -EIO;
   3187	if (th->t_refcount == 0) {
   3188		SB_JOURNAL(s)->j_persistent_trans--;
   3189		kfree(th);
   3190	}
   3191	return ret;
   3192}
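
/*
 * Usage sketch, hypothetical caller: the persistent-transaction pair above
 * hands out a handle that can cross function boundaries, and nests
 * transparently -- if a transaction is already running, the existing handle
 * is refcounted and returned instead of starting a new one.  The block
 * count of 10 is an arbitrary example.
 */
static inline int demo_persistent_update(struct super_block *s)
{
	struct reiserfs_transaction_handle *th;

	th = reiserfs_persistent_transaction(s, 10);
	if (!th)
		return -ENOMEM;

	/* ... prepare and journal_mark_dirty() the buffers being changed ... */

	return reiserfs_end_persistent_transaction(th);
}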
   3193
   3194static int journal_join(struct reiserfs_transaction_handle *th,
   3195			struct super_block *sb)
   3196{
   3197	struct reiserfs_transaction_handle *cur_th = current->journal_info;
   3198
   3199	/*
   3200	 * this keeps do_journal_end from NULLing out the
   3201	 * current->journal_info pointer
   3202	 */
   3203	th->t_handle_save = cur_th;
   3204	BUG_ON(cur_th && cur_th->t_refcount > 1);
   3205	return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
   3206}
   3207
   3208int journal_join_abort(struct reiserfs_transaction_handle *th,
   3209		       struct super_block *sb)
   3210{
   3211	struct reiserfs_transaction_handle *cur_th = current->journal_info;
   3212
   3213	/*
   3214	 * this keeps do_journal_end from NULLing out the
   3215	 * current->journal_info pointer
   3216	 */
   3217	th->t_handle_save = cur_th;
   3218	BUG_ON(cur_th && cur_th->t_refcount > 1);
   3219	return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
   3220}
   3221
   3222int journal_begin(struct reiserfs_transaction_handle *th,
   3223		  struct super_block *sb, unsigned long nblocks)
   3224{
   3225	struct reiserfs_transaction_handle *cur_th = current->journal_info;
   3226	int ret;
   3227
   3228	th->t_handle_save = NULL;
   3229	if (cur_th) {
   3230		/* we are nesting into the current transaction */
   3231		if (cur_th->t_super == sb) {
   3232			BUG_ON(!cur_th->t_refcount);
   3233			cur_th->t_refcount++;
   3234			memcpy(th, cur_th, sizeof(*th));
   3235			if (th->t_refcount <= 1)
   3236				reiserfs_warning(sb, "reiserfs-2005",
   3237						 "BAD: refcount <= 1, but "
   3238						 "journal_info != 0");
   3239			return 0;
   3240		} else {
   3241			/*
   3242			 * we've ended up with a handle from a different
   3243			 * filesystem.  save it and restore on journal_end.
   3244			 * This should never really happen...
   3245			 */
   3246			reiserfs_warning(sb, "clm-2100",
    3247					 "nesting into a different FS");
   3248			th->t_handle_save = current->journal_info;
   3249			current->journal_info = th;
   3250		}
   3251	} else {
   3252		current->journal_info = th;
   3253	}
   3254	ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
   3255	BUG_ON(current->journal_info != th);
   3256
    3257	/*
    3258	 * The reciprocal of clm-2100 above: if do_journal_begin_r fails, we
    3259	 * need to put it back, since journal_end won't be called to do it.
    3260	 */
   3261	if (ret)
   3262		current->journal_info = th->t_handle_save;
   3263	else
   3264		BUG_ON(!th->t_refcount);
   3265
   3266	return ret;
   3267}
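
/*
 * Illustrative sketch (not kernel code) of the nesting contract above:
 * journal_begin on a super block with a running transaction bumps
 * t_refcount and copies the handle, so begin/end calls must pair up:
 *
 *	journal_begin(&th, sb, 10);	(starts a new transaction)
 *	journal_begin(&th2, sb, 5);	(joins it, t_refcount == 2)
 *	journal_end(&th2);		(drops the refcount, copies the
 *					 handle back to journal_info)
 *	journal_end(&th);		(refcount hits 0, real commit)
 */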
   3268
    3269/*
    3270 * puts bh into the current transaction.  If it was already there, it
    3271 * removes the old pointers from the hash and puts new ones in (to make
    3272 * sure replay happens in the right order).
    3273 *
    3274 * if it was dirty, cleans and files onto the clean list.  I can't let it
    3275 * be dirty again until the transaction is committed.
    3276 *
    3277 * if j_len is bigger than j_len_alloc, pushes j_len_alloc to j_len + JOURNAL_PER_BALANCE_CNT.
    3278 */
   3279int journal_mark_dirty(struct reiserfs_transaction_handle *th,
   3280		       struct buffer_head *bh)
   3281{
   3282	struct super_block *sb = th->t_super;
   3283	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3284	struct reiserfs_journal_cnode *cn = NULL;
   3285	int count_already_incd = 0;
   3286	int prepared = 0;
   3287	BUG_ON(!th->t_trans_id);
   3288
   3289	PROC_INFO_INC(sb, journal.mark_dirty);
   3290	if (th->t_trans_id != journal->j_trans_id) {
   3291		reiserfs_panic(th->t_super, "journal-1577",
   3292			       "handle trans id %ld != current trans id %ld",
   3293			       th->t_trans_id, journal->j_trans_id);
   3294	}
   3295
   3296	prepared = test_clear_buffer_journal_prepared(bh);
   3297	clear_buffer_journal_restore_dirty(bh);
   3298	/* already in this transaction, we are done */
   3299	if (buffer_journaled(bh)) {
   3300		PROC_INFO_INC(sb, journal.mark_dirty_already);
   3301		return 0;
   3302	}
   3303
   3304	/*
   3305	 * this must be turned into a panic instead of a warning.  We can't
   3306	 * allow a dirty or journal_dirty or locked buffer to be logged, as
   3307	 * some changes could get to disk too early.  NOT GOOD.
   3308	 */
   3309	if (!prepared || buffer_dirty(bh)) {
   3310		reiserfs_warning(sb, "journal-1777",
   3311				 "buffer %llu bad state "
   3312				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
   3313				 (unsigned long long)bh->b_blocknr,
   3314				 prepared ? ' ' : '!',
   3315				 buffer_locked(bh) ? ' ' : '!',
   3316				 buffer_dirty(bh) ? ' ' : '!',
   3317				 buffer_journal_dirty(bh) ? ' ' : '!');
   3318	}
   3319
   3320	if (atomic_read(&journal->j_wcount) <= 0) {
   3321		reiserfs_warning(sb, "journal-1409",
   3322				 "returning because j_wcount was %d",
   3323				 atomic_read(&journal->j_wcount));
   3324		return 1;
   3325	}
   3326	/*
   3327	 * this error means I've screwed up, and we've overflowed
   3328	 * the transaction.  Nothing can be done here, except make the
   3329	 * FS readonly or panic.
   3330	 */
   3331	if (journal->j_len >= journal->j_trans_max) {
   3332		reiserfs_panic(th->t_super, "journal-1413",
   3333			       "j_len (%lu) is too big",
   3334			       journal->j_len);
   3335	}
   3336
   3337	if (buffer_journal_dirty(bh)) {
   3338		count_already_incd = 1;
   3339		PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
   3340		clear_buffer_journal_dirty(bh);
   3341	}
   3342
   3343	if (journal->j_len > journal->j_len_alloc) {
   3344		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
   3345	}
   3346
   3347	set_buffer_journaled(bh);
   3348
   3349	/* now put this guy on the end */
   3350	if (!cn) {
   3351		cn = get_cnode(sb);
   3352		if (!cn) {
   3353			reiserfs_panic(sb, "journal-4", "get_cnode failed!");
   3354		}
   3355
   3356		if (th->t_blocks_logged == th->t_blocks_allocated) {
   3357			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
   3358			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
   3359		}
   3360		th->t_blocks_logged++;
   3361		journal->j_len++;
   3362
   3363		cn->bh = bh;
   3364		cn->blocknr = bh->b_blocknr;
   3365		cn->sb = sb;
   3366		cn->jlist = NULL;
   3367		insert_journal_hash(journal->j_hash_table, cn);
   3368		if (!count_already_incd) {
   3369			get_bh(bh);
   3370		}
   3371	}
   3372	cn->next = NULL;
   3373	cn->prev = journal->j_last;
   3374	cn->bh = bh;
   3375	if (journal->j_last) {
   3376		journal->j_last->next = cn;
   3377		journal->j_last = cn;
   3378	} else {
   3379		journal->j_first = cn;
   3380		journal->j_last = cn;
   3381	}
   3382	reiserfs_schedule_old_flush(sb);
   3383	return 0;
   3384}
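
/*
 * Illustrative sketch (not kernel code): journal_mark_dirty expects the
 * buffer to have gone through reiserfs_prepare_for_journal first, which
 * is why it warns above when the buffer is unprepared or still dirty:
 *
 *	reiserfs_prepare_for_journal(sb, bh, 1);
 *	(... modify bh->b_data ...)
 *	journal_mark_dirty(th, bh);
 *
 * The SB_BUFFER_WITH_SB() calls elsewhere in this file follow exactly
 * this pattern.
 */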
   3385
   3386int journal_end(struct reiserfs_transaction_handle *th)
   3387{
   3388	struct super_block *sb = th->t_super;
   3389	if (!current->journal_info && th->t_refcount > 1)
   3390		reiserfs_warning(sb, "REISER-NESTING",
    3391				 "journal_info NULL, refcount %d", th->t_refcount);
   3392
   3393	if (!th->t_trans_id) {
   3394		WARN_ON(1);
   3395		return -EIO;
   3396	}
   3397
   3398	th->t_refcount--;
   3399	if (th->t_refcount > 0) {
   3400		struct reiserfs_transaction_handle *cur_th =
   3401		    current->journal_info;
   3402
   3403		/*
   3404		 * we aren't allowed to close a nested transaction on a
   3405		 * different filesystem from the one in the task struct
   3406		 */
   3407		BUG_ON(cur_th->t_super != th->t_super);
   3408
   3409		if (th != cur_th) {
   3410			memcpy(current->journal_info, th, sizeof(*th));
   3411			th->t_trans_id = 0;
   3412		}
   3413		return 0;
   3414	} else {
   3415		return do_journal_end(th, 0);
   3416	}
   3417}
   3418
   3419/*
    3420 * removes from the current transaction, relsing and decrementing any counters.
   3421 * also files the removed buffer directly onto the clean list
   3422 *
   3423 * called by journal_mark_freed when a block has been deleted
   3424 *
   3425 * returns 1 if it cleaned and relsed the buffer. 0 otherwise
   3426 */
   3427static int remove_from_transaction(struct super_block *sb,
   3428				   b_blocknr_t blocknr, int already_cleaned)
   3429{
   3430	struct buffer_head *bh;
   3431	struct reiserfs_journal_cnode *cn;
   3432	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3433	int ret = 0;
   3434
   3435	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
   3436	if (!cn || !cn->bh) {
   3437		return ret;
   3438	}
   3439	bh = cn->bh;
   3440	if (cn->prev) {
   3441		cn->prev->next = cn->next;
   3442	}
   3443	if (cn->next) {
   3444		cn->next->prev = cn->prev;
   3445	}
   3446	if (cn == journal->j_first) {
   3447		journal->j_first = cn->next;
   3448	}
   3449	if (cn == journal->j_last) {
   3450		journal->j_last = cn->prev;
   3451	}
   3452	remove_journal_hash(sb, journal->j_hash_table, NULL,
   3453			    bh->b_blocknr, 0);
   3454	clear_buffer_journaled(bh);	/* don't log this one */
   3455
   3456	if (!already_cleaned) {
   3457		clear_buffer_journal_dirty(bh);
   3458		clear_buffer_dirty(bh);
   3459		clear_buffer_journal_test(bh);
   3460		put_bh(bh);
   3461		if (atomic_read(&bh->b_count) < 0) {
   3462			reiserfs_warning(sb, "journal-1752",
   3463					 "b_count < 0");
   3464		}
   3465		ret = 1;
   3466	}
   3467	journal->j_len--;
   3468	journal->j_len_alloc--;
   3469	free_cnode(sb, cn);
   3470	return ret;
   3471}
   3472
   3473/*
    3474 * for any cnode in a journal list, it can only be dirtied if all the
   3475 * transactions that include it are committed to disk.
   3476 * this checks through each transaction, and returns 1 if you are allowed
   3477 * to dirty, and 0 if you aren't
   3478 *
   3479 * it is called by dirty_journal_list, which is called after
   3480 * flush_commit_list has gotten all the log blocks for a given
   3481 * transaction on disk
    3483 */
   3484static int can_dirty(struct reiserfs_journal_cnode *cn)
   3485{
   3486	struct super_block *sb = cn->sb;
   3487	b_blocknr_t blocknr = cn->blocknr;
   3488	struct reiserfs_journal_cnode *cur = cn->hprev;
   3489	int can_dirty = 1;
   3490
   3491	/*
   3492	 * first test hprev.  These are all newer than cn, so any node here
   3493	 * with the same block number and dev means this node can't be sent
   3494	 * to disk right now.
   3495	 */
   3496	while (cur && can_dirty) {
   3497		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
   3498		    cur->blocknr == blocknr) {
   3499			can_dirty = 0;
   3500		}
   3501		cur = cur->hprev;
   3502	}
   3503	/*
   3504	 * then test hnext.  These are all older than cn.  As long as they
   3505	 * are committed to the log, it is safe to write cn to disk
   3506	 */
   3507	cur = cn->hnext;
   3508	while (cur && can_dirty) {
   3509		if (cur->jlist && cur->jlist->j_len > 0 &&
   3510		    atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
   3511		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
   3512			can_dirty = 0;
   3513		}
   3514		cur = cur->hnext;
   3515	}
   3516	return can_dirty;
   3517}
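
/*
 * Illustrative sketch of the ordering assumption in can_dirty: cnodes
 * are inserted at the head of a hash chain, so for a block logged by
 * transactions T1 (oldest) through T3 (newest) the chain looks like
 *
 *	hash head -> T3 -> T2 -> T1
 *
 * hprev from a cnode walks toward newer entries, hnext toward older
 * ones.  A newer entry always blocks dirtying; an older entry blocks it
 * only while its commit is still in flight.
 */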
   3518
   3519/*
   3520 * syncs the commit blocks, but does not force the real buffers to disk
   3521 * will wait until the current transaction is done/committed before returning
   3522 */
   3523int journal_end_sync(struct reiserfs_transaction_handle *th)
   3524{
   3525	struct super_block *sb = th->t_super;
   3526	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3527
   3528	BUG_ON(!th->t_trans_id);
    3529	/* syncing while nested is very, very bad */
   3530	BUG_ON(th->t_refcount > 1);
   3531	if (journal->j_len == 0) {
   3532		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
   3533					     1);
   3534		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
   3535	}
   3536	return do_journal_end(th, COMMIT_NOW | WAIT);
   3537}
   3538
   3539/* writeback the pending async commits to disk */
   3540static void flush_async_commits(struct work_struct *work)
   3541{
   3542	struct reiserfs_journal *journal =
   3543		container_of(work, struct reiserfs_journal, j_work.work);
   3544	struct super_block *sb = journal->j_work_sb;
   3545	struct reiserfs_journal_list *jl;
   3546	struct list_head *entry;
   3547
   3548	reiserfs_write_lock(sb);
   3549	if (!list_empty(&journal->j_journal_list)) {
   3550		/* last entry is the youngest, commit it and you get everything */
   3551		entry = journal->j_journal_list.prev;
   3552		jl = JOURNAL_LIST_ENTRY(entry);
   3553		flush_commit_list(sb, jl, 1);
   3554	}
   3555	reiserfs_write_unlock(sb);
   3556}
   3557
   3558/*
   3559 * flushes any old transactions to disk
   3560 * ends the current transaction if it is too old
   3561 */
   3562void reiserfs_flush_old_commits(struct super_block *sb)
   3563{
   3564	time64_t now;
   3565	struct reiserfs_transaction_handle th;
   3566	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3567
   3568	now = ktime_get_seconds();
   3569	/*
   3570	 * safety check so we don't flush while we are replaying the log during
   3571	 * mount
   3572	 */
   3573	if (list_empty(&journal->j_journal_list))
   3574		return;
   3575
   3576	/*
   3577	 * check the current transaction.  If there are no writers, and it is
   3578	 * too old, finish it, and force the commit blocks to disk
   3579	 */
   3580	if (atomic_read(&journal->j_wcount) <= 0 &&
   3581	    journal->j_trans_start_time > 0 &&
   3582	    journal->j_len > 0 &&
   3583	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
   3584		if (!journal_join(&th, sb)) {
   3585			reiserfs_prepare_for_journal(sb,
   3586						     SB_BUFFER_WITH_SB(sb),
   3587						     1);
   3588			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
   3589
    3590			/*
    3591			 * we're only being called from kreiserfsd, so it makes
    3592			 * no sense to queue an async commit for kreiserfsd to
    3593			 * do later
    3594			 */
   3595			do_journal_end(&th, COMMIT_NOW | WAIT);
   3596		}
   3597	}
   3598}
   3599
   3600/*
   3601 * returns 0 if do_journal_end should return right away, returns 1 if
   3602 * do_journal_end should finish the commit
   3603 *
   3604 * if the current transaction is too old, but still has writers, this will
   3605 * wait on j_join_wait until all the writers are done.  By the time it
    3606 * wakes up, the transaction it was called on has already ended, so it just
   3607 * flushes the commit list and returns 0.
   3608 *
   3609 * Won't batch when flush or commit_now is set.  Also won't batch when
   3610 * others are waiting on j_join_wait.
   3611 *
   3612 * Note, we can't allow the journal_end to proceed while there are still
   3613 * writers in the log.
   3614 */
   3615static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
   3616{
   3618	time64_t now;
   3619	int flush = flags & FLUSH_ALL;
   3620	int commit_now = flags & COMMIT_NOW;
   3621	int wait_on_commit = flags & WAIT;
   3622	struct reiserfs_journal_list *jl;
   3623	struct super_block *sb = th->t_super;
   3624	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3625
   3626	BUG_ON(!th->t_trans_id);
   3627
   3628	if (th->t_trans_id != journal->j_trans_id) {
   3629		reiserfs_panic(th->t_super, "journal-1577",
   3630			       "handle trans id %ld != current trans id %ld",
   3631			       th->t_trans_id, journal->j_trans_id);
   3632	}
   3633
   3634	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
   3635	/* <= 0 is allowed.  unmounting might not call begin */
   3636	if (atomic_read(&journal->j_wcount) > 0)
   3637		atomic_dec(&journal->j_wcount);
   3638
    3639	/*
    3640	 * BUG: deal with the case where j_len is 0, but blocks freed by
    3641	 * earlier transactions still need to be released.  Right now that
    3642	 * is handled by the next transaction that actually writes something,
    3643	 * but it should be taken care of in this trans
    3644	 */
   3645	BUG_ON(journal->j_len == 0);
   3646
   3647	/*
    3648	 * if wcount > 0, and we are called with flush or commit_now,
   3649	 * we wait on j_join_wait.  We will wake up when the last writer has
   3650	 * finished the transaction, and started it on its way to the disk.
   3651	 * Then, we flush the commit or journal list, and just return 0
   3652	 * because the rest of journal end was already done for this
   3653	 * transaction.
   3654	 */
   3655	if (atomic_read(&journal->j_wcount) > 0) {
   3656		if (flush || commit_now) {
   3657			unsigned trans_id;
   3658
   3659			jl = journal->j_current_jl;
   3660			trans_id = jl->j_trans_id;
   3661			if (wait_on_commit)
   3662				jl->j_state |= LIST_COMMIT_PENDING;
   3663			atomic_set(&journal->j_jlock, 1);
   3664			if (flush) {
   3665				journal->j_next_full_flush = 1;
   3666			}
   3667			unlock_journal(sb);
   3668
   3669			/*
   3670			 * sleep while the current transaction is
   3671			 * still j_jlocked
   3672			 */
   3673			while (journal->j_trans_id == trans_id) {
   3674				if (atomic_read(&journal->j_jlock)) {
   3675					queue_log_writer(sb);
   3676				} else {
   3677					lock_journal(sb);
   3678					if (journal->j_trans_id == trans_id) {
   3679						atomic_set(&journal->j_jlock,
   3680							   1);
   3681					}
   3682					unlock_journal(sb);
   3683				}
   3684			}
   3685			BUG_ON(journal->j_trans_id == trans_id);
   3686
   3687			if (commit_now
   3688			    && journal_list_still_alive(sb, trans_id)
   3689			    && wait_on_commit) {
   3690				flush_commit_list(sb, jl, 1);
   3691			}
   3692			return 0;
   3693		}
   3694		unlock_journal(sb);
   3695		return 0;
   3696	}
   3697
   3698	/* deal with old transactions where we are the last writers */
   3699	now = ktime_get_seconds();
   3700	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
   3701		commit_now = 1;
   3702		journal->j_next_async_flush = 1;
   3703	}
   3704	/* don't batch when someone is waiting on j_join_wait */
   3705	/* don't batch when syncing the commit or flushing the whole trans */
   3706	if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
   3707	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
   3708	    && journal->j_len_alloc < journal->j_max_batch
   3709	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
   3710		journal->j_bcount++;
   3711		unlock_journal(sb);
   3712		return 0;
   3713	}
   3714
   3715	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
   3716		reiserfs_panic(sb, "journal-003",
   3717			       "j_start (%ld) is too high",
   3718			       journal->j_start);
   3719	}
   3720	return 1;
   3721}
   3722
   3723/*
   3724 * Does all the work that makes deleting blocks safe.
    3725 * when deleting a block marked BH_JNew, just remove it from the current
    3726 * transaction, clean its buffer_head and move on.
   3727 *
   3728 * otherwise:
   3729 * set a bit for the block in the journal bitmap.  That will prevent it from
   3730 * being allocated for unformatted nodes before this transaction has finished.
   3731 *
   3732 * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
   3733 * That will prevent any old transactions with this block from trying to flush
   3734 * to the real location.  Since we aren't removing the cnode from the
    3735 * journal_list_hash, the block can't be reallocated yet.
   3736 *
   3737 * Then remove it from the current transaction, decrementing any counters and
   3738 * filing it on the clean list.
   3739 */
   3740int journal_mark_freed(struct reiserfs_transaction_handle *th,
   3741		       struct super_block *sb, b_blocknr_t blocknr)
   3742{
   3743	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3744	struct reiserfs_journal_cnode *cn = NULL;
   3745	struct buffer_head *bh = NULL;
   3746	struct reiserfs_list_bitmap *jb = NULL;
   3747	int cleaned = 0;
   3748	BUG_ON(!th->t_trans_id);
   3749
   3750	cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
   3751	if (cn && cn->bh) {
   3752		bh = cn->bh;
   3753		get_bh(bh);
   3754	}
   3755	/* if it is journal new, we just remove it from this transaction */
   3756	if (bh && buffer_journal_new(bh)) {
   3757		clear_buffer_journal_new(bh);
   3758		clear_prepared_bits(bh);
   3759		reiserfs_clean_and_file_buffer(bh);
   3760		cleaned = remove_from_transaction(sb, blocknr, cleaned);
   3761	} else {
   3762		/*
   3763		 * set the bit for this block in the journal bitmap
   3764		 * for this transaction
   3765		 */
   3766		jb = journal->j_current_jl->j_list_bitmap;
   3767		if (!jb) {
   3768			reiserfs_panic(sb, "journal-1702",
   3769				       "journal_list_bitmap is NULL");
   3770		}
   3771		set_bit_in_list_bitmap(sb, blocknr, jb);
   3772
   3773		/* Note, the entire while loop is not allowed to schedule.  */
   3774
   3775		if (bh) {
   3776			clear_prepared_bits(bh);
   3777			reiserfs_clean_and_file_buffer(bh);
   3778		}
   3779		cleaned = remove_from_transaction(sb, blocknr, cleaned);
   3780
   3781		/*
   3782		 * find all older transactions with this block,
   3783		 * make sure they don't try to write it out
   3784		 */
   3785		cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
   3786					  blocknr);
   3787		while (cn) {
   3788			if (sb == cn->sb && blocknr == cn->blocknr) {
   3789				set_bit(BLOCK_FREED, &cn->state);
   3790				if (cn->bh) {
   3791					/*
   3792					 * remove_from_transaction will brelse
   3793					 * the buffer if it was in the current
   3794					 * trans
   3795					 */
    3796					if (!cleaned) {
    3797						clear_buffer_journal_dirty(cn->bh);
    3799						clear_buffer_dirty(cn->bh);
    3800						clear_buffer_journal_test(cn->bh);
    3802						cleaned = 1;
    3803						put_bh(cn->bh);
    3804						if (atomic_read(&cn->bh->b_count) < 0) {
    3806							reiserfs_warning(sb,
    3807								 "journal-2138",
    3808								 "cn->bh->b_count < 0");
    3809						}
   3810					}
   3811					/*
   3812					 * since we are clearing the bh,
   3813					 * we MUST dec nonzerolen
   3814					 */
    3815					if (cn->jlist) {
    3816						atomic_dec(&cn->jlist->j_nonzerolen);
    3818					}
   3819					cn->bh = NULL;
   3820				}
   3821			}
   3822			cn = cn->hnext;
   3823		}
   3824	}
   3825
   3826	if (bh)
   3827		release_buffer_page(bh); /* get_hash grabs the buffer */
   3828	return 0;
   3829}
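
/*
 * Illustrative sketch (not kernel code, names hypothetical): a delete
 * path is expected to log its bitmap change and then tell the journal
 * the block is gone, roughly:
 *
 *	journal_mark_dirty(th, bitmap_bh);
 *	journal_mark_freed(th, sb, blocknr);
 *
 * after which the block cannot be reallocated until this transaction
 * and any older transactions that logged it are on disk.
 */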
   3830
   3831void reiserfs_update_inode_transaction(struct inode *inode)
   3832{
   3833	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
   3834	REISERFS_I(inode)->i_jl = journal->j_current_jl;
   3835	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
   3836}
   3837
   3838/*
   3839 * returns -1 on error, 0 if no commits/barriers were done and 1
   3840 * if a transaction was actually committed and the barrier was done
   3841 */
   3842static int __commit_trans_jl(struct inode *inode, unsigned long id,
   3843			     struct reiserfs_journal_list *jl)
   3844{
   3845	struct reiserfs_transaction_handle th;
   3846	struct super_block *sb = inode->i_sb;
   3847	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3848	int ret = 0;
   3849
   3850	/*
   3851	 * is it from the current transaction,
   3852	 * or from an unknown transaction?
   3853	 */
   3854	if (id == journal->j_trans_id) {
   3855		jl = journal->j_current_jl;
   3856		/*
   3857		 * try to let other writers come in and
   3858		 * grow this transaction
   3859		 */
   3860		let_transaction_grow(sb, id);
   3861		if (journal->j_trans_id != id) {
   3862			goto flush_commit_only;
   3863		}
   3864
   3865		ret = journal_begin(&th, sb, 1);
   3866		if (ret)
   3867			return ret;
   3868
   3869		/* someone might have ended this transaction while we joined */
   3870		if (journal->j_trans_id != id) {
   3871			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
   3872						     1);
   3873			journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
   3874			ret = journal_end(&th);
   3875			goto flush_commit_only;
   3876		}
   3877
   3878		ret = journal_end_sync(&th);
   3879		if (!ret)
   3880			ret = 1;
   3881
   3882	} else {
   3883		/*
   3884		 * this gets tricky, we have to make sure the journal list in
   3885		 * the inode still exists.  We know the list is still around
   3886		 * if we've got a larger transaction id than the oldest list
   3887		 */
   3888flush_commit_only:
   3889		if (journal_list_still_alive(inode->i_sb, id)) {
   3890			/*
   3891			 * we only set ret to 1 when we know for sure
   3892			 * the barrier hasn't been started yet on the commit
   3893			 * block.
   3894			 */
   3895			if (atomic_read(&jl->j_commit_left) > 1)
   3896				ret = 1;
   3897			flush_commit_list(sb, jl, 1);
   3898			if (journal->j_errno)
   3899				ret = journal->j_errno;
   3900		}
   3901	}
   3902	/* otherwise the list is gone, and long since committed */
   3903	return ret;
   3904}
   3905
   3906int reiserfs_commit_for_inode(struct inode *inode)
   3907{
   3908	unsigned int id = REISERFS_I(inode)->i_trans_id;
   3909	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
   3910
    3911	/*
    3912	 * to be conservative, assume an unset id or jl means the inode was
    3913	 * changed in the current transaction
    3914	 */
   3915	if (!id || !jl) {
   3916		reiserfs_update_inode_transaction(inode);
   3917		id = REISERFS_I(inode)->i_trans_id;
   3918		/* jl will be updated in __commit_trans_jl */
   3919	}
   3920
   3921	return __commit_trans_jl(inode, id, jl);
   3922}
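
/*
 * Illustrative sketch (not kernel code): an fsync-style caller pairs
 * reiserfs_update_inode_transaction() when it dirties an inode with
 * reiserfs_commit_for_inode() when the change must reach disk:
 *
 *	reiserfs_update_inode_transaction(inode);
 *	(... later ...)
 *	err = reiserfs_commit_for_inode(inode);
 *
 * err follows __commit_trans_jl: negative on error, 0 if nothing needed
 * committing, 1 if a transaction was committed and the barrier done.
 */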
   3923
   3924void reiserfs_restore_prepared_buffer(struct super_block *sb,
   3925				      struct buffer_head *bh)
   3926{
   3927	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3928	PROC_INFO_INC(sb, journal.restore_prepared);
   3929	if (!bh) {
   3930		return;
   3931	}
   3932	if (test_clear_buffer_journal_restore_dirty(bh) &&
   3933	    buffer_journal_dirty(bh)) {
   3934		struct reiserfs_journal_cnode *cn;
   3935		reiserfs_write_lock(sb);
   3936		cn = get_journal_hash_dev(sb,
   3937					  journal->j_list_hash_table,
   3938					  bh->b_blocknr);
   3939		if (cn && can_dirty(cn)) {
   3940			set_buffer_journal_test(bh);
   3941			mark_buffer_dirty(bh);
   3942		}
   3943		reiserfs_write_unlock(sb);
   3944	}
   3945	clear_buffer_journal_prepared(bh);
   3946}
   3947
   3948extern struct tree_balance *cur_tb;
   3949/*
   3950 * before we can change a metadata block, we have to make sure it won't
   3951 * be written to disk while we are altering it.  So, we must:
   3952 * clean it
   3953 * wait on it.
   3954 */
   3955int reiserfs_prepare_for_journal(struct super_block *sb,
   3956				 struct buffer_head *bh, int wait)
   3957{
   3958	PROC_INFO_INC(sb, journal.prepare);
   3959
   3960	if (!trylock_buffer(bh)) {
   3961		if (!wait)
   3962			return 0;
   3963		lock_buffer(bh);
   3964	}
   3965	set_buffer_journal_prepared(bh);
   3966	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
   3967		clear_buffer_journal_test(bh);
   3968		set_buffer_journal_restore_dirty(bh);
   3969	}
   3970	unlock_buffer(bh);
   3971	return 1;
   3972}
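
/*
 * Illustrative sketch (not kernel code, "should_log" is a stand-in for
 * the caller's own condition): a prepared buffer that ends up not being
 * logged must have its prepared state undone:
 *
 *	if (reiserfs_prepare_for_journal(sb, bh, 0)) {
 *		if (should_log)
 *			journal_mark_dirty(th, bh);
 *		else
 *			reiserfs_restore_prepared_buffer(sb, bh);
 *	}
 *
 * With wait == 0 a failed trylock returns 0 and nothing was prepared.
 */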
   3973
   3974/*
   3975 * long and ugly.  If flush, will not return until all commit
   3976 * blocks and all real buffers in the trans are on disk.
   3977 * If no_async, won't return until all commit blocks are on disk.
   3978 *
   3979 * keep reading, there are comments as you go along
   3980 *
   3981 * If the journal is aborted, we just clean up. Things like flushing
   3982 * journal lists, etc just won't happen.
   3983 */
   3984static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
   3985{
   3986	struct super_block *sb = th->t_super;
   3987	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   3988	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
   3989	struct reiserfs_journal_cnode *last_cn = NULL;
   3990	struct reiserfs_journal_desc *desc;
   3991	struct reiserfs_journal_commit *commit;
   3992	struct buffer_head *c_bh;	/* commit bh */
   3993	struct buffer_head *d_bh;	/* desc bh */
   3994	int cur_write_start = 0;	/* start index of current log write */
   3995	int i;
   3996	int flush;
   3997	int wait_on_commit;
   3998	struct reiserfs_journal_list *jl, *temp_jl;
   3999	struct list_head *entry, *safe;
   4000	unsigned long jindex;
   4001	unsigned int commit_trans_id;
   4002	int trans_half;
   4003	int depth;
   4004
   4005	BUG_ON(th->t_refcount > 1);
   4006	BUG_ON(!th->t_trans_id);
   4007	BUG_ON(!th->t_super);
   4008
    4009	/*
    4010	 * protect flush_older_commits from making mistakes if the
    4011	 * transaction ID counter overflows.
    4012	 */
   4013	if (th->t_trans_id == ~0U)
   4014		flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
   4015	flush = flags & FLUSH_ALL;
   4016	wait_on_commit = flags & WAIT;
   4017
   4018	current->journal_info = th->t_handle_save;
   4019	reiserfs_check_lock_depth(sb, "journal end");
   4020	if (journal->j_len == 0) {
   4021		reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
   4022					     1);
   4023		journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
   4024	}
   4025
   4026	lock_journal(sb);
   4027	if (journal->j_next_full_flush) {
   4028		flags |= FLUSH_ALL;
   4029		flush = 1;
   4030	}
   4031	if (journal->j_next_async_flush) {
   4032		flags |= COMMIT_NOW | WAIT;
   4033		wait_on_commit = 1;
   4034	}
   4035
    4036	/*
    4037	 * check_journal_end locks the journal and unlocks it if it does
    4038	 * not return 1.  It tells us whether we should continue with the
    4039	 * journal_end or just return
    4040	 */
   4041	if (!check_journal_end(th, flags)) {
   4042		reiserfs_schedule_old_flush(sb);
   4043		wake_queued_writers(sb);
   4044		reiserfs_async_progress_wait(sb);
   4045		goto out;
   4046	}
   4047
   4048	/* check_journal_end might set these, check again */
   4049	if (journal->j_next_full_flush) {
   4050		flush = 1;
   4051	}
   4052
    4053	/*
    4054	 * j_must_wait means we have to flush the log blocks, and the
    4055	 * real blocks for this transaction
    4056	 */
   4057	if (journal->j_must_wait > 0) {
   4058		flush = 1;
   4059	}
   4060#ifdef REISERFS_PREALLOCATE
   4061	/*
   4062	 * quota ops might need to nest, setup the journal_info pointer
   4063	 * for them and raise the refcount so that it is > 0.
   4064	 */
   4065	current->journal_info = th;
   4066	th->t_refcount++;
   4067
   4068	/* it should not involve new blocks into the transaction */
   4069	reiserfs_discard_all_prealloc(th);
   4070
   4071	th->t_refcount--;
   4072	current->journal_info = th->t_handle_save;
   4073#endif
   4074
   4075	/* setup description block */
   4076	d_bh =
   4077	    journal_getblk(sb,
   4078			   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   4079			   journal->j_start);
   4080	set_buffer_uptodate(d_bh);
   4081	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
   4082	memset(d_bh->b_data, 0, d_bh->b_size);
   4083	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
   4084	set_desc_trans_id(desc, journal->j_trans_id);
   4085
   4086	/*
   4087	 * setup commit block.  Don't write (keep it clean too) this one
   4088	 * until after everyone else is written
   4089	 */
   4090	c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   4091			      ((journal->j_start + journal->j_len +
   4092				1) % SB_ONDISK_JOURNAL_SIZE(sb)));
   4093	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
   4094	memset(c_bh->b_data, 0, c_bh->b_size);
   4095	set_commit_trans_id(commit, journal->j_trans_id);
   4096	set_buffer_uptodate(c_bh);
   4097
   4098	/* init this journal list */
   4099	jl = journal->j_current_jl;
   4100
   4101	/*
   4102	 * we lock the commit before doing anything because
   4103	 * we want to make sure nobody tries to run flush_commit_list until
   4104	 * the new transaction is fully setup, and we've already flushed the
   4105	 * ordered bh list
   4106	 */
   4107	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
   4108
   4109	/* save the transaction id in case we need to commit it later */
   4110	commit_trans_id = jl->j_trans_id;
   4111
   4112	atomic_set(&jl->j_older_commits_done, 0);
   4113	jl->j_trans_id = journal->j_trans_id;
   4114	jl->j_timestamp = journal->j_trans_start_time;
   4115	jl->j_commit_bh = c_bh;
   4116	jl->j_start = journal->j_start;
   4117	jl->j_len = journal->j_len;
   4118	atomic_set(&jl->j_nonzerolen, journal->j_len);
   4119	atomic_set(&jl->j_commit_left, journal->j_len + 2);
   4120	jl->j_realblock = NULL;
   4121
   4122	/*
   4123	 * The ENTIRE FOR LOOP MUST not cause schedule to occur.
   4124	 * for each real block, add it to the journal list hash,
   4125	 * copy into real block index array in the commit or desc block
   4126	 */
   4127	trans_half = journal_trans_half(sb->s_blocksize);
   4128	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
   4129		if (buffer_journaled(cn->bh)) {
   4130			jl_cn = get_cnode(sb);
   4131			if (!jl_cn) {
   4132				reiserfs_panic(sb, "journal-1676",
   4133					       "get_cnode returned NULL");
   4134			}
   4135			if (i == 0) {
   4136				jl->j_realblock = jl_cn;
   4137			}
   4138			jl_cn->prev = last_cn;
   4139			jl_cn->next = NULL;
   4140			if (last_cn) {
   4141				last_cn->next = jl_cn;
   4142			}
   4143			last_cn = jl_cn;
   4144			/*
   4145			 * make sure the block we are trying to log
   4146			 * is not a block of journal or reserved area
   4147			 */
   4148			if (is_block_in_log_or_reserved_area
   4149			    (sb, cn->bh->b_blocknr)) {
   4150				reiserfs_panic(sb, "journal-2332",
   4151					       "Trying to log block %lu, "
   4152					       "which is a log block",
   4153					       cn->bh->b_blocknr);
   4154			}
   4155			jl_cn->blocknr = cn->bh->b_blocknr;
   4156			jl_cn->state = 0;
   4157			jl_cn->sb = sb;
   4158			jl_cn->bh = cn->bh;
   4159			jl_cn->jlist = jl;
   4160			insert_journal_hash(journal->j_list_hash_table, jl_cn);
   4161			if (i < trans_half) {
   4162				desc->j_realblock[i] =
   4163				    cpu_to_le32(cn->bh->b_blocknr);
   4164			} else {
   4165				commit->j_realblock[i - trans_half] =
   4166				    cpu_to_le32(cn->bh->b_blocknr);
   4167			}
   4168		} else {
   4169			i--;
   4170		}
   4171	}
   4172	set_desc_trans_len(desc, journal->j_len);
   4173	set_desc_mount_id(desc, journal->j_mount_id);
   4174	set_desc_trans_id(desc, journal->j_trans_id);
   4175	set_commit_trans_len(commit, journal->j_len);
   4176
   4177	/*
   4178	 * special check in case all buffers in the journal
   4179	 * were marked for not logging
   4180	 */
   4181	BUG_ON(journal->j_len == 0);
   4182
   4183	/*
   4184	 * we're about to dirty all the log blocks, mark the description block
   4185	 * dirty now too.  Don't mark the commit block dirty until all the
   4186	 * others are on disk
   4187	 */
   4188	mark_buffer_dirty(d_bh);
   4189
   4190	/*
   4191	 * first data block is j_start + 1, so add one to
   4192	 * cur_write_start wherever you use it
   4193	 */
   4194	cur_write_start = journal->j_start;
   4195	cn = journal->j_first;
   4196	jindex = 1;	/* start at one so we don't get the desc again */
   4197	while (cn) {
   4198		clear_buffer_journal_new(cn->bh);
   4199		/* copy all the real blocks into log area.  dirty log blocks */
   4200		if (buffer_journaled(cn->bh)) {
   4201			struct buffer_head *tmp_bh;
   4202			char *addr;
   4203			struct page *page;
   4204			tmp_bh =
   4205			    journal_getblk(sb,
   4206					   SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
   4207					   ((cur_write_start +
   4208					     jindex) %
   4209					    SB_ONDISK_JOURNAL_SIZE(sb)));
   4210			set_buffer_uptodate(tmp_bh);
   4211			page = cn->bh->b_page;
   4212			addr = kmap(page);
   4213			memcpy(tmp_bh->b_data,
   4214			       addr + offset_in_page(cn->bh->b_data),
   4215			       cn->bh->b_size);
   4216			kunmap(page);
   4217			mark_buffer_dirty(tmp_bh);
   4218			jindex++;
   4219			set_buffer_journal_dirty(cn->bh);
   4220			clear_buffer_journaled(cn->bh);
   4221		} else {
   4222			/*
   4223			 * JDirty cleared sometime during transaction.
   4224			 * don't log this one
   4225			 */
   4226			reiserfs_warning(sb, "journal-2048",
   4227					 "BAD, buffer in journal hash, "
   4228					 "but not JDirty!");
   4229			brelse(cn->bh);
   4230		}
   4231		next = cn->next;
   4232		free_cnode(sb, cn);
   4233		cn = next;
   4234		reiserfs_cond_resched(sb);
   4235	}
   4236
   4237	/*
   4238	 * we are done with both the c_bh and d_bh, but
   4239	 * c_bh must be written after all other commit blocks,
   4240	 * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
   4241	 */
   4242
   4243	journal->j_current_jl = alloc_journal_list(sb);
   4244
   4245	/* now it is safe to insert this transaction on the main list */
   4246	list_add_tail(&jl->j_list, &journal->j_journal_list);
   4247	list_add_tail(&jl->j_working_list, &journal->j_working_list);
   4248	journal->j_num_work_lists++;
   4249
   4250	/* reset journal values for the next transaction */
   4251	journal->j_start =
   4252	    (journal->j_start + journal->j_len +
   4253	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
   4254	atomic_set(&journal->j_wcount, 0);
   4255	journal->j_bcount = 0;
   4256	journal->j_last = NULL;
   4257	journal->j_first = NULL;
   4258	journal->j_len = 0;
   4259	journal->j_trans_start_time = 0;
   4260	/* check for trans_id overflow */
   4261	if (++journal->j_trans_id == 0)
   4262		journal->j_trans_id = 10;
   4263	journal->j_current_jl->j_trans_id = journal->j_trans_id;
   4264	journal->j_must_wait = 0;
   4265	journal->j_len_alloc = 0;
   4266	journal->j_next_full_flush = 0;
   4267	journal->j_next_async_flush = 0;
   4268	init_journal_hash(sb);
   4269
   4270	/*
   4271	 * make sure reiserfs_add_jh sees the new current_jl before we
   4272	 * write out the tails
   4273	 */
   4274	smp_mb();
   4275
   4276	/*
   4277	 * tail conversion targets have to hit the disk before we end the
   4278	 * transaction.  Otherwise a later transaction might repack the tail
   4279	 * before this transaction commits, leaving the data block unflushed
   4280	 * and clean, if we crash before the later transaction commits, the
   4281	 * data block is lost.
   4282	 */
   4283	if (!list_empty(&jl->j_tail_bh_list)) {
   4284		depth = reiserfs_write_unlock_nested(sb);
   4285		write_ordered_buffers(&journal->j_dirty_buffers_lock,
   4286				      journal, jl, &jl->j_tail_bh_list);
   4287		reiserfs_write_lock_nested(sb, depth);
   4288	}
   4289	BUG_ON(!list_empty(&jl->j_tail_bh_list));
   4290	mutex_unlock(&jl->j_commit_mutex);
   4291
   4292	/*
   4293	 * honor the flush wishes from the caller, simple commits can
   4294	 * be done outside the journal lock, they are done below
   4295	 *
   4296	 * if we don't flush the commit list right now, we put it into
   4297	 * the work queue so the people waiting on the async progress work
   4298	 * queue don't wait for this proc to flush journal lists and such.
   4299	 */
   4300	if (flush) {
   4301		flush_commit_list(sb, jl, 1);
   4302		flush_journal_list(sb, jl, 1);
   4303	} else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
   4304		/*
   4305		 * Avoid queueing work when sb is being shut down. Transaction
   4306		 * will be flushed on journal shutdown.
   4307		 */
   4308		if (sb->s_flags & SB_ACTIVE)
   4309			queue_delayed_work(REISERFS_SB(sb)->commit_wq,
   4310					   &journal->j_work, HZ / 10);
   4311	}
   4312
   4313	/*
   4314	 * if the next transaction has any chance of wrapping, flush
   4315	 * transactions that might get overwritten.  If any journal lists
   4316	 * are very old flush them as well.
   4317	 */
   4318first_jl:
   4319	list_for_each_safe(entry, safe, &journal->j_journal_list) {
   4320		temp_jl = JOURNAL_LIST_ENTRY(entry);
   4321		if (journal->j_start <= temp_jl->j_start) {
   4322			if ((journal->j_start + journal->j_trans_max + 1) >=
   4323			    temp_jl->j_start) {
   4324				flush_used_journal_lists(sb, temp_jl);
   4325				goto first_jl;
   4326			} else if ((journal->j_start +
   4327				    journal->j_trans_max + 1) <
   4328				   SB_ONDISK_JOURNAL_SIZE(sb)) {
   4329				/*
   4330				 * if we don't cross into the next
   4331				 * transaction and we don't wrap, there is
    4332				 * no way we can overlap any later transactions.
    4333				 * Break now.
   4334				 */
   4335				break;
   4336			}
   4337		} else if ((journal->j_start +
   4338			    journal->j_trans_max + 1) >
   4339			   SB_ONDISK_JOURNAL_SIZE(sb)) {
   4340			if (((journal->j_start + journal->j_trans_max + 1) %
   4341			     SB_ONDISK_JOURNAL_SIZE(sb)) >=
   4342			    temp_jl->j_start) {
   4343				flush_used_journal_lists(sb, temp_jl);
   4344				goto first_jl;
   4345			} else {
    4346				/*
    4347				 * we don't overlap anything from our start
    4348				 * to the end of the log, and our wrapped
    4349				 * portion doesn't overlap anything at
    4350				 * the start of the log.  We can break
    4351				 */
   4352				break;
   4353			}
   4354		}
   4355	}
   4356
   4357	journal->j_current_jl->j_list_bitmap =
   4358	    get_list_bitmap(sb, journal->j_current_jl);
   4359
   4360	if (!(journal->j_current_jl->j_list_bitmap)) {
   4361		reiserfs_panic(sb, "journal-1996",
   4362			       "could not get a list bitmap");
   4363	}
   4364
   4365	atomic_set(&journal->j_jlock, 0);
   4366	unlock_journal(sb);
    4367	/* wake up anybody waiting to join. */
   4368	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
   4369	wake_up(&journal->j_join_wait);
   4370
   4371	if (!flush && wait_on_commit &&
   4372	    journal_list_still_alive(sb, commit_trans_id)) {
   4373		flush_commit_list(sb, jl, 1);
   4374	}
   4375out:
   4376	reiserfs_check_lock_depth(sb, "journal end2");
   4377
   4378	memset(th, 0, sizeof(*th));
   4379	/*
   4380	 * Re-set th->t_super, so we can properly keep track of how many
   4381	 * persistent transactions there are. We need to do this so if this
   4382	 * call is part of a failed restart_transaction, we can free it later
   4383	 */
   4384	th->t_super = sb;
   4385
   4386	return journal->j_errno;
   4387}
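
/*
 * Illustrative sketch of the on-disk transaction layout do_journal_end
 * builds inside the circular log of SB_ONDISK_JOURNAL_SIZE(sb) blocks:
 *
 *	j_start                     desc block: magic, trans id, first
 *	                            half of the real block number array
 *	j_start+1 .. j_start+j_len  copies of the logged real blocks
 *	j_start+j_len+1             commit block: trans id, rest of the
 *	                            block array; written last by
 *	                            flush_commit_list
 *
 * All offsets wrap modulo SB_ONDISK_JOURNAL_SIZE(sb), and the next
 * transaction begins at j_start + j_len + 2.
 */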
   4388
    4389/* Set the file system read only and refuse new transactions */
   4390void reiserfs_abort_journal(struct super_block *sb, int errno)
   4391{
   4392	struct reiserfs_journal *journal = SB_JOURNAL(sb);
   4393	if (test_bit(J_ABORTED, &journal->j_state))
   4394		return;
   4395
   4396	if (!journal->j_errno)
   4397		journal->j_errno = errno;
   4398
   4399	sb->s_flags |= SB_RDONLY;
   4400	set_bit(J_ABORTED, &journal->j_state);
   4401
   4402#ifdef CONFIG_REISERFS_CHECK
   4403	dump_stack();
   4404#endif
   4405}