cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

lops.c


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
#include <linux/blkdev.h>

#include "bmap.h"
#include "dir.h"
#include "gfs2.h"
#include "incore.h"
#include "inode.h"
#include "glock.h"
#include "glops.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
#include "recovery.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
#include "trace_gfs2.h"

/**
 * gfs2_pin - Pin a buffer in memory
 * @sdp: The superblock
 * @bh: The buffer to be pinned
 *
 * The log lock must be held when calling this function
 */
void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
	struct gfs2_bufdata *bd;

	BUG_ON(!current->journal_info);

	clear_buffer_dirty(bh);
	if (test_set_buffer_pinned(bh))
		gfs2_assert_withdraw(sdp, 0);
	if (!buffer_uptodate(bh))
		gfs2_io_error_bh_wd(sdp, bh);
	bd = bh->b_private;
	/* If this buffer is in the AIL and it has already been written
	 * to its in-place disk block, remove it from the AIL.
	 */
	spin_lock(&sdp->sd_ail_lock);
	if (bd->bd_tr)
		list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
	spin_unlock(&sdp->sd_ail_lock);
	get_bh(bh);
	atomic_inc(&sdp->sd_log_pinned);
	trace_gfs2_pin(bd, 1);
}
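
/*
 * Illustrative sketch (not part of the original file): the pin/unpin
 * pair as used around a log flush. A buffer is pinned while its
 * transaction is built, then unpinned onto the AIL once its log write
 * has completed; "bd" and "tr" here stand for a hypothetical in-flight
 * bufdata and transaction.
 *
 *	gfs2_log_lock(sdp);
 *	gfs2_pin(sdp, bd->bd_bh);	// ref held, sd_log_pinned++
 *	gfs2_log_unlock(sdp);
 *	...				// write the buffer to the journal
 *	gfs2_unpin(sdp, bd->bd_bh, tr);	// moves bd onto tr->tr_ail1_list
 */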

static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
{
	return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
}

static void maybe_release_space(struct gfs2_bufdata *bd)
{
	struct gfs2_glock *gl = bd->bd_gl;
	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
	unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
	struct gfs2_bitmap *bi = rgd->rd_bits + index;

	rgrp_lock_local(rgd);
	if (bi->bi_clone == NULL)
		goto out;
	if (sdp->sd_args.ar_discard)
		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
	memcpy(bi->bi_clone + bi->bi_offset,
	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes);
	clear_bit(GBF_FULL, &bi->bi_flags);
	rgd->rd_free_clone = rgd->rd_free;
	BUG_ON(rgd->rd_free_clone < rgd->rd_reserved);
	rgd->rd_extfail_pt = rgd->rd_free;

out:
	rgrp_unlock_local(rgd);
}

/**
 * gfs2_unpin - Unpin a buffer
 * @sdp: the filesystem the buffer belongs to
 * @bh: The buffer to unpin
 * @tr: The system transaction being flushed
 */

static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
		       struct gfs2_trans *tr)
{
	struct gfs2_bufdata *bd = bh->b_private;

	BUG_ON(!buffer_uptodate(bh));
	BUG_ON(!buffer_pinned(bh));

	lock_buffer(bh);
	mark_buffer_dirty(bh);
	clear_buffer_pinned(bh);

	if (buffer_is_rgrp(bd))
		maybe_release_space(bd);

	spin_lock(&sdp->sd_ail_lock);
	if (bd->bd_tr) {
		list_del(&bd->bd_ail_st_list);
		brelse(bh);
	} else {
		struct gfs2_glock *gl = bd->bd_gl;
		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
		atomic_inc(&gl->gl_ail_count);
	}
	bd->bd_tr = tr;
	list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
	spin_unlock(&sdp->sd_ail_lock);

	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
	trace_gfs2_pin(bd, 0);
	unlock_buffer(bh);
	atomic_dec(&sdp->sd_log_pinned);
}

void gfs2_log_incr_head(struct gfs2_sbd *sdp)
{
	BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
	       (sdp->sd_log_flush_head != sdp->sd_log_head));

	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks)
		sdp->sd_log_flush_head = 0;
}

u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock)
{
	struct gfs2_journal_extent *je;

	list_for_each_entry(je, &jd->extent_list, list) {
		if (lblock >= je->lblock && lblock < je->lblock + je->blocks)
			return je->dblock + lblock - je->lblock;
	}

	return -1;
}
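
/*
 * Worked example (illustrative, not part of the original file): with an
 * extent list of { lblock 0, dblock 5000, blocks 100 } followed by
 * { lblock 100, dblock 9000, blocks 50 }, looking up lblock 120 falls
 * in the second extent and maps to 9000 + 120 - 100 = 9020. A lblock
 * beyond every extent returns (u64)-1.
 */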

/**
 * gfs2_end_log_write_bh - end log write of pagecache data with buffers
 * @sdp: The superblock
 * @bvec: The bio_vec
 * @error: The i/o status
 *
 * This finds the relevant buffers and unlocks them and sets the
 * error flag according to the status of the i/o request. This is
 * used when the log is writing data which has an in-place version
 * that is pinned in the pagecache.
 */

static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
				  struct bio_vec *bvec,
				  blk_status_t error)
{
	struct buffer_head *bh, *next;
	struct page *page = bvec->bv_page;
	unsigned size;

	bh = page_buffers(page);
	size = bvec->bv_len;
	while (bh_offset(bh) < bvec->bv_offset)
		bh = bh->b_this_page;
	do {
		if (error)
			mark_buffer_write_io_error(bh);
		unlock_buffer(bh);
		next = bh->b_this_page;
		size -= bh->b_size;
		brelse(bh);
		bh = next;
	} while(bh && size);
}

/**
 * gfs2_end_log_write - end of i/o to the log
 * @bio: The bio
 *
 * Each bio_vec contains either data from the pagecache or data
 * relating to the log itself. Here we iterate over the bio_vec
 * array, processing both kinds of data.
 *
 */

static void gfs2_end_log_write(struct bio *bio)
{
	struct gfs2_sbd *sdp = bio->bi_private;
	struct bio_vec *bvec;
	struct page *page;
	struct bvec_iter_all iter_all;

	if (bio->bi_status) {
		if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status))
			fs_err(sdp, "Error %d writing to journal, jid=%u\n",
			       bio->bi_status, sdp->sd_jdesc->jd_jid);
		gfs2_withdraw_delayed(sdp);
		/* prevent more writes to the journal */
		clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
		wake_up(&sdp->sd_logd_waitq);
	}

	bio_for_each_segment_all(bvec, bio, iter_all) {
		page = bvec->bv_page;
		if (page_has_buffers(page))
			gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
		else
			mempool_free(page, gfs2_page_pool);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
		wake_up(&sdp->sd_log_flush_wait);
}

/**
 * gfs2_log_submit_bio - Submit any pending log bio
 * @biop: Address of the bio pointer
 * @opf: REQ_OP | op_flags
 *
 * Submit any pending part-built or full bio to the block device. If
 * there is no pending bio, then this is a no-op.
 */

void gfs2_log_submit_bio(struct bio **biop, int opf)
{
	struct bio *bio = *biop;
	if (bio) {
		struct gfs2_sbd *sdp = bio->bi_private;
		atomic_inc(&sdp->sd_log_in_flight);
		bio->bi_opf = opf;
		submit_bio(bio);
		*biop = NULL;
	}
}

/**
 * gfs2_log_alloc_bio - Allocate a bio
 * @sdp: The super block
 * @blkno: The device block number we want to write to
 * @end_io: The bi_end_io callback
 *
 * Allocate a new bio, initialize it with the given parameters and return it.
 *
 * Returns: The newly allocated bio
 */

static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
				      bio_end_io_t *end_io)
{
	struct super_block *sb = sdp->sd_vfs;
	struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO);

	bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
	bio->bi_end_io = end_io;
	bio->bi_private = sdp;

	return bio;
}

/**
 * gfs2_log_get_bio - Get cached log bio, or allocate a new one
 * @sdp: The super block
 * @blkno: The device block number we want to write to
 * @biop: The bio to get or allocate
 * @op: REQ_OP
 * @end_io: The bi_end_io callback
 * @flush: Always flush the current bio and allocate a new one?
 *
 * If there is a cached bio, then if the next block number is sequential
 * with the previous one, return it, otherwise flush the bio to the
 * device. If there is no cached bio, or we just flushed it, then
 * allocate a new one.
 *
 * Returns: The bio to use for log writes
 */

static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
				    struct bio **biop, int op,
				    bio_end_io_t *end_io, bool flush)
{
	struct bio *bio = *biop;

	if (bio) {
		u64 nblk;

		nblk = bio_end_sector(bio);
		nblk >>= sdp->sd_fsb2bb_shift;
		if (blkno == nblk && !flush)
			return bio;
		gfs2_log_submit_bio(biop, op);
	}

	*biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
	return *biop;
}
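
/*
 * Worked example (illustrative, not part of the original file): with
 * 4 KiB filesystem blocks and 512-byte device sectors, sd_fsb2bb_shift
 * is 3. A cached bio ending at sector 8200 corresponds to filesystem
 * block 8200 >> 3 = 1025, so a write for blkno 1025 is sequential and
 * reuses the bio; any other blkno (or flush == true) submits it and
 * starts a fresh one via gfs2_log_alloc_bio().
 */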

/**
 * gfs2_log_write - write to log
 * @sdp: the filesystem
 * @jd: The journal descriptor
 * @page: the page to write
 * @size: the size of the data to write
 * @offset: the offset within the page
 * @blkno: block number of the log entry
 *
 * Try and add the page segment to the current bio. If that fails,
 * submit the current bio to the device and create a new one, and
 * then add the page segment to that.
 */

void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
		    struct page *page, unsigned size, unsigned offset,
		    u64 blkno)
{
	struct bio *bio;
	int ret;

	bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE,
			       gfs2_end_log_write, false);
	ret = bio_add_page(bio, page, size, offset);
	if (ret == 0) {
		bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio,
				       REQ_OP_WRITE, gfs2_end_log_write, true);
		ret = bio_add_page(bio, page, size, offset);
		WARN_ON(ret == 0);
	}
}
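
/*
 * Note on the retry above (illustrative): bio_add_page() returns the
 * number of bytes added, or 0 when the bio is full. The second
 * gfs2_log_get_bio() call passes flush == true, which submits the full
 * bio and allocates an empty one, so the retry can only fail (and
 * trigger the WARN_ON) if a single segment cannot fit even in a fresh
 * bio.
 */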

/**
 * gfs2_log_write_bh - write a buffer's content to the log
 * @sdp: The super block
 * @bh: The buffer pointing to the in-place location
 *
 * This writes the content of the buffer to the next available location
 * in the log. The buffer will be unlocked once the i/o to the log has
 * completed.
 */

static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
	u64 dblock;

	dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head);
	gfs2_log_incr_head(sdp);
	gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size,
		       bh_offset(bh), dblock);
}

/**
 * gfs2_log_write_page - write one block stored in a page, into the log
 * @sdp: The superblock
 * @page: The struct page
 *
 * This writes the first block-sized part of the page into the log. Note
 * that the page must have been allocated from the gfs2_page_pool mempool
 * and that after this has been called, ownership has been transferred and
 * the page may be freed at any time.
 */

static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
{
	struct super_block *sb = sdp->sd_vfs;
	u64 dblock;

	dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head);
	gfs2_log_incr_head(sdp);
	gfs2_log_write(sdp, sdp->sd_jdesc, page, sb->s_blocksize, 0, dblock);
}

/**
 * gfs2_end_log_read - end I/O callback for reads from the log
 * @bio: The bio
 *
 * Simply unlock the pages in the bio. The main thread will wait on them and
 * process them in order as necessary.
 */

static void gfs2_end_log_read(struct bio *bio)
{
	struct page *page;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		page = bvec->bv_page;
		if (bio->bi_status) {
			int err = blk_status_to_errno(bio->bi_status);

			SetPageError(page);
			mapping_set_error(page->mapping, err);
		}
		unlock_page(page);
	}

	bio_put(bio);
}

/**
 * gfs2_jhead_pg_srch - Look for the journal head in a given page.
 * @jd: The journal descriptor
 * @head: The journal head to start from
 * @page: The page to look in
 *
 * Returns: true if found, false otherwise.
 */

static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
			       struct gfs2_log_header_host *head,
			       struct page *page)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	struct gfs2_log_header_host lh;
	void *kaddr = kmap_atomic(page);
	unsigned int offset;
	bool ret = false;

	for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
		if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
			if (lh.lh_sequence >= head->lh_sequence)
				*head = lh;
			else {
				ret = true;
				break;
			}
		}
	}
	kunmap_atomic(kaddr);
	return ret;
}
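
/*
 * Worked example (illustrative, not part of the original file): log
 * header sequence numbers increase monotonically from the tail until
 * they wrap at the head. Scanning headers with sequences 17, 18, 19, 7
 * updates *head through 19, then sees 7 < 19 and returns true: the
 * header with sequence 19 is the journal head.
 */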

/**
 * gfs2_jhead_process_page - Search/cleanup a page
 * @jd: The journal descriptor
 * @index: Index of the page to look into
 * @head: The journal head to start from
 * @done: If set, perform only cleanup, else search and set if found.
 *
 * Find the page with 'index' in the journal's mapping. Search the page for
 * the journal head if requested (cleanup == false). Release refs on the
 * page so the page cache can reclaim it (put_page() twice). We grabbed a
 * reference on this page two times, first when we did a find_or_create_page()
 * to obtain the page to add it to the bio and second when we do a
 * find_get_page() here to get the page to wait on while I/O on it is being
 * completed.
 * This function is also used to free up a page we might've grabbed but not
 * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
 * submitted the I/O, but we already found the jhead so we only need to drop
 * our references to the page.
 */

static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
				    struct gfs2_log_header_host *head,
				    bool *done)
{
	struct page *page;

	page = find_get_page(jd->jd_inode->i_mapping, index);
	wait_on_page_locked(page);

	if (PageError(page))
		*done = true;

	if (!*done)
		*done = gfs2_jhead_pg_srch(jd, head, page);

	put_page(page); /* Once for find_get_page */
	put_page(page); /* Once more for find_or_create_page */
}

static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
{
	struct bio *new;

	new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
	bio_clone_blkg_association(new, prev);
	new->bi_iter.bi_sector = bio_end_sector(prev);
	bio_chain(new, prev);
	submit_bio(prev);
	return new;
}
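
/*
 * Note on gfs2_chain_bio() (illustrative): bio_chain(new, prev) defers
 * prev's bi_end_io until new also completes, so the chain completes as
 * one unit. The new bio starts at bio_end_sector(prev), continuing the
 * same contiguous on-disk run; prev is submitted immediately and the
 * caller keeps filling the returned bio.
 */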

/**
 * gfs2_find_jhead - find the head of a log
 * @jd: The journal descriptor
 * @head: The log descriptor for the head of the log is returned here
 * @keep_cache: If set inode pages will not be truncated
 *
 * Do a search of a journal by reading it in large chunks using bios and find
 * the valid log entry with the highest sequence number.  (i.e. the log head)
 *
 * Returns: 0 on success, errno otherwise
 */
int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
		    bool keep_cache)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	struct address_space *mapping = jd->jd_inode->i_mapping;
	unsigned int block = 0, blocks_submitted = 0, blocks_read = 0;
	unsigned int bsize = sdp->sd_sb.sb_bsize, off;
	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
	unsigned int shift = PAGE_SHIFT - bsize_shift;
	unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
	struct gfs2_journal_extent *je;
	int sz, ret = 0;
	struct bio *bio = NULL;
	struct page *page = NULL;
	bool done = false;
	errseq_t since;

	memset(head, 0, sizeof(*head));
	if (list_empty(&jd->extent_list))
		gfs2_map_journal_extents(sdp, jd);

	since = filemap_sample_wb_err(mapping);
	list_for_each_entry(je, &jd->extent_list, list) {
		u64 dblock = je->dblock;

		for (; block < je->lblock + je->blocks; block++, dblock++) {
			if (!page) {
				page = find_or_create_page(mapping,
						block >> shift, GFP_NOFS);
				if (!page) {
					ret = -ENOMEM;
					done = true;
					goto out;
				}
				off = 0;
			}

			if (bio && (off || block < blocks_submitted + max_blocks)) {
				sector_t sector = dblock << sdp->sd_fsb2bb_shift;

				if (bio_end_sector(bio) == sector) {
					sz = bio_add_page(bio, page, bsize, off);
					if (sz == bsize)
						goto block_added;
				}
				if (off) {
					unsigned int blocks =
						(PAGE_SIZE - off) >> bsize_shift;

					bio = gfs2_chain_bio(bio, blocks);
					goto add_block_to_new_bio;
				}
			}

			if (bio) {
				blocks_submitted = block;
				submit_bio(bio);
			}

			bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
			bio->bi_opf = REQ_OP_READ;
add_block_to_new_bio:
			sz = bio_add_page(bio, page, bsize, off);
			BUG_ON(sz != bsize);
block_added:
			off += bsize;
			if (off == PAGE_SIZE)
				page = NULL;
			if (blocks_submitted <= blocks_read + max_blocks) {
				/* Keep at least one bio in flight */
				continue;
			}

			gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
			blocks_read += PAGE_SIZE >> bsize_shift;
			if (done)
				goto out;  /* found */
		}
	}

out:
	if (bio)
		submit_bio(bio);
	while (blocks_read < block) {
		gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
		blocks_read += PAGE_SIZE >> bsize_shift;
	}

	if (!ret)
		ret = filemap_check_wb_err(mapping, since);

	if (!keep_cache)
		truncate_inode_pages(mapping, 0);

	return ret;
}
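
/*
 * Usage sketch (illustrative, not part of the original file): journal
 * recovery locates the head before replaying, roughly:
 *
 *	struct gfs2_log_header_host head;
 *	int error = gfs2_find_jhead(jd, &head, false);
 *	if (error)
 *		return error;
 *	// head.lh_sequence / head.lh_blkno now identify the log head
 *
 * max_blocks above caps read-ahead at 2 MiB of journal per submitted
 * batch, so at most that much I/O is in flight beyond what has been
 * processed.
 */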

static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
				      u32 ld_length, u32 ld_data1)
{
	struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
	struct gfs2_log_descriptor *ld = page_address(page);
	clear_page(ld);
	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
	ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
	ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
	ld->ld_type = cpu_to_be32(ld_type);
	ld->ld_length = cpu_to_be32(ld_length);
	ld->ld_data1 = cpu_to_be32(ld_data1);
	ld->ld_data2 = 0;
	return page;
}
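
/*
 * On-disk layout sketch (illustrative): a log descriptor block starts
 * with the gfs2_log_descriptor header built above, followed by payload
 * that depends on ld_type; for GFS2_LOG_DESC_METADATA the remainder of
 * the block is an array of __be64 block numbers (see gfs2_before_commit
 * below), with ld_length counting the descriptor plus its data blocks
 * and ld_data1 counting the entries.
 */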

static void gfs2_check_magic(struct buffer_head *bh)
{
	void *kaddr;
	__be32 *ptr;

	clear_buffer_escaped(bh);
	kaddr = kmap_atomic(bh->b_page);
	ptr = kaddr + bh_offset(bh);
	if (*ptr == cpu_to_be32(GFS2_MAGIC))
		set_buffer_escaped(bh);
	kunmap_atomic(kaddr);
}
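
/*
 * Escaping, by example (illustrative): journal replay recognizes
 * metadata by a leading GFS2_MAGIC. A journaled *data* block that
 * happens to start with GFS2_MAGIC would be misread, so it is marked
 * "escaped" here, written to the log with its first four bytes zeroed
 * (see gfs2_before_commit), and the magic is restored, guided by the
 * escape flag stored in the descriptor, during replay (see
 * databuf_lo_scan_elements).
 */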

static int blocknr_cmp(void *priv, const struct list_head *a,
		       const struct list_head *b)
{
	struct gfs2_bufdata *bda, *bdb;

	bda = list_entry(a, struct gfs2_bufdata, bd_list);
	bdb = list_entry(b, struct gfs2_bufdata, bd_list);

	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
		return -1;
	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
		return 1;
	return 0;
}

static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
			       unsigned int total, struct list_head *blist,
			       bool is_databuf)
{
	struct gfs2_log_descriptor *ld;
	struct gfs2_bufdata *bd1 = NULL, *bd2;
	struct page *page;
	unsigned int num;
	unsigned n;
	__be64 *ptr;

	gfs2_log_lock(sdp);
	list_sort(NULL, blist, blocknr_cmp);
	bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
	while(total) {
		num = total;
		if (total > limit)
			num = limit;
		gfs2_log_unlock(sdp);
		page = gfs2_get_log_desc(sdp,
					 is_databuf ? GFS2_LOG_DESC_JDATA :
					 GFS2_LOG_DESC_METADATA, num + 1, num);
		ld = page_address(page);
		gfs2_log_lock(sdp);
		ptr = (__be64 *)(ld + 1);

		n = 0;
		list_for_each_entry_continue(bd1, blist, bd_list) {
			*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
			if (is_databuf) {
				gfs2_check_magic(bd1->bd_bh);
				*ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0);
			}
			if (++n >= num)
				break;
		}

		gfs2_log_unlock(sdp);
		gfs2_log_write_page(sdp, page);
		gfs2_log_lock(sdp);

		n = 0;
		list_for_each_entry_continue(bd2, blist, bd_list) {
			get_bh(bd2->bd_bh);
			gfs2_log_unlock(sdp);
			lock_buffer(bd2->bd_bh);

			if (buffer_escaped(bd2->bd_bh)) {
				void *kaddr;
				page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
				ptr = page_address(page);
				kaddr = kmap_atomic(bd2->bd_bh->b_page);
				memcpy(ptr, kaddr + bh_offset(bd2->bd_bh),
				       bd2->bd_bh->b_size);
				kunmap_atomic(kaddr);
				*(__be32 *)ptr = 0;
				clear_buffer_escaped(bd2->bd_bh);
				unlock_buffer(bd2->bd_bh);
				brelse(bd2->bd_bh);
				gfs2_log_write_page(sdp, page);
			} else {
				gfs2_log_write_bh(sdp, bd2->bd_bh);
			}
			gfs2_log_lock(sdp);
			if (++n >= num)
				break;
		}

		BUG_ON(total < num);
		total -= num;
	}
	gfs2_log_unlock(sdp);
}
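
/*
 * Worked example (illustrative, not part of the original file): with
 * 4 KiB blocks, buf_limit() allows 503 metadata block numbers per
 * descriptor. Committing 1200 buffers therefore emits three chunks of
 * 503, 503 and 194 entries, each chunk being one descriptor block
 * followed by its data blocks; the per-chunk loops above first fill the
 * descriptor, then write the buffers themselves.
 */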

static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
	unsigned int nbuf;
	if (tr == NULL)
		return;
	nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
	gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
}

static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	struct list_head *head;
	struct gfs2_bufdata *bd;

	if (tr == NULL)
		return;

	head = &tr->tr_buf;
	while (!list_empty(head)) {
		bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
		list_del_init(&bd->bd_list);
		gfs2_unpin(sdp, bd->bd_bh, tr);
	}
}

static void buf_lo_before_scan(struct gfs2_jdesc *jd,
			       struct gfs2_log_header_host *head, int pass)
{
	if (pass != 0)
		return;

	jd->jd_found_blocks = 0;
	jd->jd_replayed_blocks = 0;
}

#define obsolete_rgrp_replay \
"Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n"
#define obsolete_rgrp_replay2 \
"busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n"

static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log,
			  u64 blkno)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data;

	rgd = gfs2_blk2rgrpd(sdp, blkno, false);
	if (rgd && rgd->rd_addr == blkno &&
	    rgd->rd_bits && rgd->rd_bits->bi_bh) {
		fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno,
			jd->jd_jid, bh_log->b_blocknr);
		fs_info(sdp, obsolete_rgrp_replay2,
			buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
			buffer_pinned(rgd->rd_bits->bi_bh),
			rgd->rd_igeneration,
			be64_to_cpu(jrgd->rg_igeneration));
		gfs2_dump_glock(NULL, rgd->rd_gl, true);
	}
}

static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
				struct gfs2_log_descriptor *ld, __be64 *ptr,
				int pass)
{
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	struct gfs2_glock *gl = ip->i_gl;
	unsigned int blks = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh_log, *bh_ip;
	u64 blkno;
	int error = 0;

	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
		return 0;

	gfs2_replay_incr_blk(jd, &start);

	for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
		blkno = be64_to_cpu(*ptr++);

		jd->jd_found_blocks++;

		if (gfs2_revoke_check(jd, blkno, start))
			continue;

		error = gfs2_replay_read_block(jd, start, &bh_log);
		if (error)
			return error;

		bh_ip = gfs2_meta_new(gl, blkno);
		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);

		if (gfs2_meta_check(sdp, bh_ip))
			error = -EIO;
		else {
			struct gfs2_meta_header *mh =
				(struct gfs2_meta_header *)bh_ip->b_data;

			if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG))
				obsolete_rgrp(jd, bh_log, blkno);

			mark_buffer_dirty(bh_ip);
		}
		brelse(bh_log);
		brelse(bh_ip);

		if (error)
			break;

		jd->jd_replayed_blocks++;
	}

	return error;
}

static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);

	if (error) {
		gfs2_inode_metasync(ip->i_gl);
		return;
	}
	if (pass != 1)
		return;

	gfs2_inode_metasync(ip->i_gl);

	fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
	        jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
}

static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	struct gfs2_meta_header *mh;
	unsigned int offset;
	struct list_head *head = &sdp->sd_log_revokes;
	struct gfs2_bufdata *bd;
	struct page *page;
	unsigned int length;

	gfs2_flush_revokes(sdp);
	if (!sdp->sd_log_num_revoke)
		return;

	length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke);
	page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
	offset = sizeof(struct gfs2_log_descriptor);

	list_for_each_entry(bd, head, bd_list) {
		sdp->sd_log_num_revoke--;

		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
			gfs2_log_write_page(sdp, page);
			page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
			mh = page_address(page);
			clear_page(mh);
			mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
			mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
			mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
			offset = sizeof(struct gfs2_meta_header);
		}

		*(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno);
		offset += sizeof(u64);
	}
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);

	gfs2_log_write_page(sdp, page);
}
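
/*
 * Capacity sketch (illustrative, not part of the original file): the
 * first revoke block holds (sb_bsize - sizeof(struct
 * gfs2_log_descriptor)) / sizeof(u64) entries, and each continuation
 * block, which carries only a gfs2_meta_header, holds (sb_bsize -
 * sizeof(struct gfs2_meta_header)) / sizeof(u64); gfs2_struct2blk()
 * derives the block count from the same arithmetic.
 */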

void gfs2_drain_revokes(struct gfs2_sbd *sdp)
{
	struct list_head *head = &sdp->sd_log_revokes;
	struct gfs2_bufdata *bd;
	struct gfs2_glock *gl;

	while (!list_empty(head)) {
		bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
		list_del_init(&bd->bd_list);
		gl = bd->bd_gl;
		gfs2_glock_remove_revoke(gl);
		kmem_cache_free(gfs2_bufdata_cachep, bd);
	}
}

static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	gfs2_drain_revokes(sdp);
}

static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
				  struct gfs2_log_header_host *head, int pass)
{
	if (pass != 0)
		return;

	jd->jd_found_revokes = 0;
	jd->jd_replay_tail = head->lh_tail;
}

static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
				   struct gfs2_log_descriptor *ld, __be64 *ptr,
				   int pass)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	unsigned int blks = be32_to_cpu(ld->ld_length);
	unsigned int revokes = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh;
	unsigned int offset;
	u64 blkno;
	int first = 1;
	int error;

	if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
		return 0;

	offset = sizeof(struct gfs2_log_descriptor);

	for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
		error = gfs2_replay_read_block(jd, start, &bh);
		if (error)
			return error;

		if (!first)
			gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);

		while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
			blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));

			error = gfs2_revoke_add(jd, blkno, start);
			if (error < 0) {
				brelse(bh);
				return error;
			}
			else if (error)
				jd->jd_found_revokes++;

			if (!--revokes)
				break;
			offset += sizeof(u64);
		}

		brelse(bh);
		offset = sizeof(struct gfs2_meta_header);
		first = 0;
	}

	return 0;
}

static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);

	if (error) {
		gfs2_revoke_clean(jd);
		return;
	}
	if (pass != 1)
		return;

	fs_info(sdp, "jid=%u: Found %u revoke tags\n",
	        jd->jd_jid, jd->jd_found_revokes);

	gfs2_revoke_clean(jd);
}

/**
 * databuf_lo_before_commit - Scan the data buffers, writing as we go
 * @sdp: The filesystem
 * @tr: The system transaction being flushed
 */

static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	unsigned int limit = databuf_limit(sdp);
	unsigned int nbuf;
	if (tr == NULL)
		return;
	nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
	gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
}

static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
				    struct gfs2_log_descriptor *ld,
				    __be64 *ptr, int pass)
{
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct gfs2_glock *gl = ip->i_gl;
	unsigned int blks = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh_log, *bh_ip;
	u64 blkno;
	u64 esc;
	int error = 0;

	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
		return 0;

	gfs2_replay_incr_blk(jd, &start);
	for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
		blkno = be64_to_cpu(*ptr++);
		esc = be64_to_cpu(*ptr++);

		jd->jd_found_blocks++;

		if (gfs2_revoke_check(jd, blkno, start))
			continue;

		error = gfs2_replay_read_block(jd, start, &bh_log);
		if (error)
			return error;

		bh_ip = gfs2_meta_new(gl, blkno);
		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);

		/* Unescape */
		if (esc) {
			__be32 *eptr = (__be32 *)bh_ip->b_data;
			*eptr = cpu_to_be32(GFS2_MAGIC);
		}
		mark_buffer_dirty(bh_ip);

		brelse(bh_log);
		brelse(bh_ip);

		jd->jd_replayed_blocks++;
	}

	return error;
}

/* FIXME: sort out accounting for log blocks etc. */

static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
{
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);

	if (error) {
		gfs2_inode_metasync(ip->i_gl);
		return;
	}
	if (pass != 1)
		return;

	/* data sync? */
	gfs2_inode_metasync(ip->i_gl);

	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
		jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
}

static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	struct list_head *head;
	struct gfs2_bufdata *bd;

	if (tr == NULL)
		return;

	head = &tr->tr_databuf;
	while (!list_empty(head)) {
		bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
		list_del_init(&bd->bd_list);
		gfs2_unpin(sdp, bd->bd_bh, tr);
	}
}


static const struct gfs2_log_operations gfs2_buf_lops = {
	.lo_before_commit = buf_lo_before_commit,
	.lo_after_commit = buf_lo_after_commit,
	.lo_before_scan = buf_lo_before_scan,
	.lo_scan_elements = buf_lo_scan_elements,
	.lo_after_scan = buf_lo_after_scan,
	.lo_name = "buf",
};

static const struct gfs2_log_operations gfs2_revoke_lops = {
	.lo_before_commit = revoke_lo_before_commit,
	.lo_after_commit = revoke_lo_after_commit,
	.lo_before_scan = revoke_lo_before_scan,
	.lo_scan_elements = revoke_lo_scan_elements,
	.lo_after_scan = revoke_lo_after_scan,
	.lo_name = "revoke",
};

static const struct gfs2_log_operations gfs2_databuf_lops = {
	.lo_before_commit = databuf_lo_before_commit,
	.lo_after_commit = databuf_lo_after_commit,
	.lo_scan_elements = databuf_lo_scan_elements,
	.lo_after_scan = databuf_lo_after_scan,
	.lo_name = "databuf",
};

const struct gfs2_log_operations *gfs2_log_ops[] = {
	&gfs2_databuf_lops,
	&gfs2_buf_lops,
	&gfs2_revoke_lops,
	NULL,
};