// SPDX-License-Identifier: GPL-2.0-only
/*
 * move_extents.c
 *
 * Copyright (C) 2011 Oracle.  All rights reserved.
 */
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/swap.h>

#include <cluster/masklog.h>

#include "ocfs2.h"
#include "ocfs2_ioctl.h"

#include "alloc.h"
#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "uptodate.h"
#include "super.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
#include "refcounttree.h"
#include "move_extents.h"

struct ocfs2_move_extents_context {
        struct inode *inode;
        struct file *file;
        int auto_defrag;
        int partial;
        int credits;
        u32 new_phys_cpos;
        u32 clusters_moved;
        u64 refcount_loc;
        struct ocfs2_move_extents *range;
        struct ocfs2_extent_tree et;
        struct ocfs2_alloc_context *meta_ac;
        struct ocfs2_alloc_context *data_ac;
        struct ocfs2_cached_dealloc_ctxt dealloc;
};

static int __ocfs2_move_extent(handle_t *handle,
                               struct ocfs2_move_extents_context *context,
                               u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
                               int ext_flags)
{
        int ret = 0, index;
        struct inode *inode = context->inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_rec *rec, replace_rec;
        struct ocfs2_path *path = NULL;
        struct ocfs2_extent_list *el;
        u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
        u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);

        ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
                                               p_cpos, new_p_cpos, len);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        memset(&replace_rec, 0, sizeof(replace_rec));
        replace_rec.e_cpos = cpu_to_le32(cpos);
        replace_rec.e_leaf_clusters = cpu_to_le16(len);
        replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
                                                                   new_p_cpos));

        path = ocfs2_new_path_from_et(&context->et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        el = path_leaf_el(path);

        index = ocfs2_search_extent_list(el, cpos);
        if (index == -1) {
                ret = ocfs2_error(inode->i_sb,
                                  "Inode %llu has an extent at cpos %u which can no longer be found\n",
                                  (unsigned long long)ino, cpos);
                goto out;
        }

        rec = &el->l_recs[index];

        BUG_ON(ext_flags != rec->e_flags);
        /*
         * After moving/defragging to the new location, the extent is no
         * longer going to be refcounted.
         */
        replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;

        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
                                      context->et.et_root_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_split_extent(handle, &context->et, path, index,
                                 &replace_rec, context->meta_ac,
                                 &context->dealloc);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ocfs2_journal_dirty(handle, context->et.et_root_bh);

        context->new_phys_cpos = new_p_cpos;
        /*
         * Do we need to append the old clusters to the truncate log?
         */
        if (old_blkno) {
                if (ext_flags & OCFS2_EXT_REFCOUNTED)
                        ret = ocfs2_decrease_refcount(inode, handle,
                                        ocfs2_blocks_to_clusters(osb->sb,
                                                                 old_blkno),
                                        len, context->meta_ac,
                                        &context->dealloc, 1);
                else
                        ret = ocfs2_truncate_log_append(osb, handle,
                                                        old_blkno, len);
        }

        ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
        ocfs2_free_path(path);
        return ret;
}

/*
 * Lock the allocator and reserve an appropriate number of bits for
 * metadata blocks.
 */
static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
                                        struct ocfs2_extent_tree *et,
                                        u32 clusters_to_move,
                                        u32 extents_to_split,
                                        struct ocfs2_alloc_context **meta_ac,
                                        int extra_blocks,
                                        int *credits)
{
        int ret, num_free_extents;
        unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        num_free_extents = ocfs2_num_free_extents(et);
        if (num_free_extents < 0) {
                ret = num_free_extents;
                mlog_errno(ret);
                goto out;
        }

        if (!num_free_extents ||
            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
                extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

        ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);

        mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
             extra_blocks, clusters_to_move, *credits);
out:
        if (ret) {
                if (*meta_ac) {
                        ocfs2_free_alloc_context(*meta_ac);
                        *meta_ac = NULL;
                }
        }

        return ret;
}

/*
 * Use a single journal handle to guarantee data consistency in case a
 * crash happens anywhere.
 *
 * XXX: defrag may finish only part of an extent as requested, when not
 * enough contiguous clusters can be found in the allocator.
 */
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
                               u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
        int ret, credits = 0, extra_blocks = 0, partial = context->partial;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct inode *tl_inode = osb->osb_tl_inode;
        struct ocfs2_refcount_tree *ref_tree = NULL;
        u32 new_phys_cpos, new_len;
        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
        int need_free = 0;

        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
                BUG_ON(!ocfs2_is_refcount_inode(inode));
                BUG_ON(!context->refcount_loc);

                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
                                               &ref_tree, NULL);
                if (ret) {
                        mlog_errno(ret);
                        return ret;
                }

                ret = ocfs2_prepare_refcount_change_for_del(inode,
                                                context->refcount_loc,
                                                phys_blkno,
                                                *len,
                                                &credits,
                                                &extra_blocks);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }

        ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
                                                     *len, 1,
                                                     &context->meta_ac,
                                                     extra_blocks, &credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
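        /*
         * From here on the ordering matters: flush the truncate log if
         * needed and reserve the data clusters before starting the
         * transaction; the claim of the new clusters and the extent
         * relocation itself then share one journal handle, so a crash
         * cannot leave the extent half-moved.
         */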
        /*
         * Should we be using the allocation reservation strategy here?
         *
         * if (context->data_ac)
         *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
         */

        inode_lock(tl_inode);

        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock_mutex;
                }
        }

        /*
         * Make sure ocfs2_reserve_clusters is called after
         * __ocfs2_flush_truncate_log, otherwise a deadlock on the global
         * bitmap may happen.
         */
        ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
        if (ret) {
                mlog_errno(ret);
                goto out_unlock_mutex;
        }

        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock_mutex;
        }

        ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
                                     &new_phys_cpos, &new_len);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        /*
         * Allowing a partial extent move has pros and cons: it makes the
         * whole defragmentation less likely to fail, but it may leave the
         * fs even more fragmented after moving. Let userspace make the
         * call here.
         */
        if (new_len != *len) {
                mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
                if (!partial) {
                        context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
                        ret = -ENOSPC;
                        need_free = 1;
                        goto out_commit;
                }
        }

        mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
             phys_cpos, new_phys_cpos);

        ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
                                  new_phys_cpos, ext_flags);
        if (ret)
                mlog_errno(ret);

        if (partial && (new_len != *len))
                *len = new_len;

        /*
         * Here we should write the new page out first if we are
         * in write-back mode.
         */
        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
        if (ret)
                mlog_errno(ret);

out_commit:
        if (need_free && context->data_ac) {
                struct ocfs2_alloc_context *data_ac = context->data_ac;

                if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
                        ocfs2_free_local_alloc_bits(osb, handle, data_ac,
                                        new_phys_cpos, new_len);
                else
                        ocfs2_free_clusters(handle,
                                        data_ac->ac_inode,
                                        data_ac->ac_bh,
                                        ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
                                        new_len);
        }

        ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
        inode_unlock(tl_inode);

        if (context->data_ac) {
                ocfs2_free_alloc_context(context->data_ac);
                context->data_ac = NULL;
        }

        if (context->meta_ac) {
                ocfs2_free_alloc_context(context->meta_ac);
                context->meta_ac = NULL;
        }

out:
        if (ref_tree)
                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

        return ret;
}

/*
 * Find the victim alloc group where 'vict_blkno' fits.
 */
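/*
 * A worked example of the bit arithmetic used below (illustrative numbers
 * only): with 4KB blocks and 32KB clusters, bits_per_unit is
 * s_clustersize_bits - s_blocksize_bits = 15 - 12 = 3, i.e. one global
 * bitmap bit covers 2^3 = 8 blocks. A victim block located 24 blocks past
 * the start of its group then maps to bit 24 >> 3 = 3 of that group.
 */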
static int ocfs2_find_victim_alloc_group(struct inode *inode,
                                         u64 vict_blkno,
                                         int type, int slot,
                                         int *vict_bit,
                                         struct buffer_head **ret_bh)
{
        int ret, i, bits_per_unit = 0;
        u64 blkno;
        char namebuf[40];

        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
        struct ocfs2_chain_list *cl;
        struct ocfs2_chain_rec *rec;
        struct ocfs2_dinode *ac_dinode;
        struct ocfs2_group_desc *bg;

        ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
        ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
                                         strlen(namebuf), &blkno);
        if (ret) {
                ret = -ENOENT;
                goto out;
        }

        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
        cl = &(ac_dinode->id2.i_chain);
        rec = &(cl->cl_recs[0]);

        if (type == GLOBAL_BITMAP_SYSTEM_INODE)
                bits_per_unit = osb->s_clustersize_bits -
                                        inode->i_sb->s_blocksize_bits;
        /*
         * 'vict_blkno' is out of the valid range.
         */
        if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
            (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
                                bits_per_unit))) {
                ret = -EINVAL;
                goto out;
        }

        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {

                rec = &(cl->cl_recs[i]);
                if (!rec)
                        continue;

                bg = NULL;

                do {
                        if (!bg)
                                blkno = le64_to_cpu(rec->c_blkno);
                        else
                                blkno = le64_to_cpu(bg->bg_next_group);

                        if (gd_bh) {
                                brelse(gd_bh);
                                gd_bh = NULL;
                        }

                        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }

                        bg = (struct ocfs2_group_desc *)gd_bh->b_data;

                        if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
                                                le16_to_cpu(bg->bg_bits))) {

                                *ret_bh = gd_bh;
                                *vict_bit = (vict_blkno - blkno) >>
                                                        bits_per_unit;
                                mlog(0, "find the victim group: #%llu, "
                                     "total_bits: %u, vict_bit: %u\n",
                                     blkno, le16_to_cpu(bg->bg_bits),
                                     *vict_bit);
                                goto out;
                        }

                } while (le64_to_cpu(bg->bg_next_group));
        }

        ret = -EINVAL;
out:
        brelse(ac_bh);

        /*
         * The caller has to release gd_bh properly.
         */
        return ret;
}

/*
 * XXX: helper to validate and adjust the moving goal.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
                                               struct ocfs2_move_extents *range)
{
        int ret, goal_bit = 0;

        struct buffer_head *gd_bh = NULL;
        struct ocfs2_group_desc *bg;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int c_to_b = 1 << (osb->s_clustersize_bits -
                                        inode->i_sb->s_blocksize_bits);

        /*
         * Make the goal cluster-aligned.
         */
        range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
                                                      range->me_goal);
        /*
         * Validate that the goal sits within the global_bitmap, and return
         * the victim group descriptor.
         */
        ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
                                            GLOBAL_BITMAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT,
                                            &goal_bit, &gd_bh);
        if (ret)
                goto out;

        bg = (struct ocfs2_group_desc *)gd_bh->b_data;

        /*
         * The moving goal is not allowed to start at a group descriptor
         * block (block #0 of the group), so compromise to the next cluster.
         */
        if (range->me_goal == le64_to_cpu(bg->bg_blkno))
                range->me_goal += c_to_b;
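        /*
         * Illustration of the capacity check below (numbers are examples
         * only): with a 4KB cluster size, a group of 32256 bits and a
         * goal_bit of 32000, only (32256 - 32000) * 4096 bytes, i.e. 256
         * clusters' worth, remain in the group, so any me_len beyond that
         * is rejected with -EINVAL.
         */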
        /*
         * The move must not cross group boundaries.
         */
        if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
                                                                range->me_len) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * More exact validations/adjustments will be performed later,
         * during the moving operation for each extent range.
         */
        mlog(0, "extents get ready to be moved to #%llu block\n",
             range->me_goal);

out:
        brelse(gd_bh);

        return ret;
}

static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
                                    int *goal_bit, u32 move_len, u32 max_hop,
                                    u32 *phys_cpos)
{
        int i, used, last_free_bits = 0, base_bit = *goal_bit;
        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                 le64_to_cpu(gd->bg_blkno));

        for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

                used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
                if (used) {
                        /*
                         * We have even tried searching for the free chunk
                         * by jumping a 'max_hop' distance, but still failed.
                         */
                        if ((i - base_bit) > max_hop) {
                                *phys_cpos = 0;
                                break;
                        }

                        if (last_free_bits)
                                last_free_bits = 0;

                        continue;
                } else
                        last_free_bits++;

                if (last_free_bits == move_len) {
                        *goal_bit = i;
                        *phys_cpos = base_cpos + i;
                        break;
                }
        }

        mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}

static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
                             u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
                             u32 len, int ext_flags)
{
        int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct inode *tl_inode = osb->osb_tl_inode;
        struct inode *gb_inode = NULL;
        struct buffer_head *gb_bh = NULL;
        struct buffer_head *gd_bh = NULL;
        struct ocfs2_group_desc *gd;
        struct ocfs2_refcount_tree *ref_tree = NULL;
        u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
                                                    context->range->me_threshold);
        u64 phys_blkno, new_phys_blkno;

        phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
                BUG_ON(!ocfs2_is_refcount_inode(inode));
                BUG_ON(!context->refcount_loc);

                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
                                               &ref_tree, NULL);
                if (ret) {
                        mlog_errno(ret);
                        return ret;
                }

                ret = ocfs2_prepare_refcount_change_for_del(inode,
                                                context->refcount_loc,
                                                phys_blkno,
                                                len,
                                                &credits,
                                                &extra_blocks);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }

        ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
                                                     len, 1,
                                                     &context->meta_ac,
                                                     extra_blocks, &credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * Need to count two extra credits for the global_bitmap inode and
         * the group descriptor.
         */
        credits += OCFS2_INODE_UPDATE_CREDITS + 1;
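        /*
         * Lock ordering from here on: the global_bitmap inode mutex first,
         * then its cluster lock, then the truncate log inode mutex, and
         * only then the journal handle; the out_* labels below unwind in
         * reverse order.
         */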
        /*
         * The allocator-locking step above did not reserve any data
         * clusters for ocfs2_move_extent(), so we still need to lock the
         * global_bitmap ourselves.
         */
        gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
                                               OCFS2_INVALID_SLOT);
        if (!gb_inode) {
                mlog(ML_ERROR, "unable to get global_bitmap inode\n");
                ret = -EIO;
                goto out;
        }

        inode_lock(gb_inode);

        ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
        if (ret) {
                mlog_errno(ret);
                goto out_unlock_gb_mutex;
        }

        inode_lock(tl_inode);

        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock_tl_inode;
        }

        new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
        ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
                                            GLOBAL_BITMAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT,
                                            &goal_bit, &gd_bh);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        /*
         * Probe the victim cluster group to find a proper region to fit
         * the wanted movement; it will even perform a best-effort attempt
         * by compromising to a threshold around the goal.
         */
        ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
                                new_phys_cpos);
        if (!*new_phys_cpos) {
                ret = -ENOSPC;
                goto out_commit;
        }

        ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
                                  *new_phys_cpos, ext_flags);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        gd = (struct ocfs2_group_desc *)gd_bh->b_data;
        ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
                                               le16_to_cpu(gd->bg_chain));
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
        }

        ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
                                         goal_bit, len);
        if (ret) {
                ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
                                                   le16_to_cpu(gd->bg_chain));
                mlog_errno(ret);
        }

        /*
         * Here we should write the new page out first if we are
         * in write-back mode.
         */
        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
        if (ret)
                mlog_errno(ret);

out_commit:
        ocfs2_commit_trans(osb, handle);
        brelse(gd_bh);

out_unlock_tl_inode:
        inode_unlock(tl_inode);

        ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
        inode_unlock(gb_inode);
        brelse(gb_bh);
        iput(gb_inode);

out:
        if (context->meta_ac) {
                ocfs2_free_alloc_context(context->meta_ac);
                context->meta_ac = NULL;
        }

        if (ref_tree)
                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

        return ret;
}
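/*
 * For illustration of the helper below (numbers are examples only): with
 * a threshold of 8 clusters, extents of 3, 4 and 5 clusters are handled
 * as follows. The first two are accumulated (len_defraged becomes 3, then
 * 7); the third is trimmed to 8 - 7 = 1 cluster so the cycle exactly
 * reaches the threshold, and len_defraged is reset to start a new cycle.
 * A lone 10-cluster extent (len_defraged == 0) is skipped outright, being
 * at least as large as the threshold already.
 */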
/*
 * Helper to calculate the defragging length in one run according to the
 * threshold.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
                                         u32 threshold, int *skip)
{
        if ((*alloc_size + *len_defraged) < threshold) {
                /*
                 * Proceed with defragmentation until we meet the threshold.
                 */
                *len_defraged += *alloc_size;
        } else if (*len_defraged == 0) {
                /*
                 * XXX: skip a large extent.
                 */
                *skip = 1;
        } else {
                /*
                 * Split this extent so that it coalesces with the former
                 * pieces to exactly reach the threshold.
                 *
                 * We are done with one cycle of defragmentation of size
                 * 'threshold'; resetting 'len_defraged' forces a new cycle.
                 */
                *alloc_size = threshold - *len_defraged;
                *len_defraged = 0;
        }
}

static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
                                      struct ocfs2_move_extents_context *context)
{
        int ret = 0, flags, do_defrag, skip = 0;
        u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
        u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

        struct inode *inode = context->inode;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_move_extents *range = context->range;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        if ((i_size_read(inode) == 0) || (range->me_len == 0))
                return 0;

        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;

        context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

        ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
        ocfs2_init_dealloc_ctxt(&context->dealloc);

        /*
         * TO-DO XXX:
         *
         * - xattr extents.
         */

        do_defrag = context->auto_defrag;

        /*
         * Extent moving happens in units of clusters; for the sake of
         * simplicity, we may ignore the two partial clusters that
         * 'me_start' and 'me_start + me_len' fall within.
         */
        move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
        len_to_move = (range->me_start + range->me_len) >>
                                                osb->s_clustersize_bits;
        if (len_to_move >= move_start)
                len_to_move -= move_start;
        else
                len_to_move = 0;

        if (do_defrag) {
                defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
                if (defrag_thresh <= 1)
                        goto done;
        } else
                new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                         range->me_goal);

        mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
             "thresh: %u\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
             (unsigned long long)range->me_start,
             (unsigned long long)range->me_len,
             move_start, len_to_move, defrag_thresh);

        cpos = move_start;
        while (len_to_move) {
                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
                                         &flags);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                if (alloc_size > len_to_move)
                        alloc_size = len_to_move;

                /*
                 * XXX: how to deal with a hole:
                 *
                 * - skip the hole of course
                 * - force a new defragmentation
                 */
                if (!phys_cpos) {
                        if (do_defrag)
                                len_defraged = 0;

                        goto next;
                }

                if (do_defrag) {
                        ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
                                                     defrag_thresh, &skip);
                        /*
                         * Skip large extents.
                         */
                        if (skip) {
                                skip = 0;
                                goto next;
                        }

                        mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
                             "alloc_size: %u, len_defraged: %u\n",
                             cpos, phys_cpos, alloc_size, len_defraged);

                        ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
                                                  &alloc_size, flags);
                } else {
                        ret = ocfs2_move_extent(context, cpos, phys_cpos,
                                                &new_phys_cpos, alloc_size,
                                                flags);

                        new_phys_cpos += alloc_size;
                }

                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }

                context->clusters_moved += alloc_size;
next:
                cpos += alloc_size;
                len_to_move -= alloc_size;
        }

done:
        range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
        range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
                                                      context->clusters_moved);
        range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
                                                       context->new_phys_cpos);
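        /*
         * Tear-down: kick off the truncate log flush asynchronously, then
         * run the cached dealloc context so that clusters queued for
         * release during the moves are actually freed.
         */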
        ocfs2_schedule_truncate_log_flush(osb, 1);
        ocfs2_run_deallocs(osb, &context->dealloc);

        return ret;
}

static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
        int status;
        handle_t *handle;
        struct inode *inode = context->inode;
        struct ocfs2_dinode *di;
        struct buffer_head *di_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
                return -EROFS;

        inode_lock(inode);

        /*
         * This prevents concurrent writes from other nodes.
         */
        status = ocfs2_rw_lock(inode, 1);
        if (status) {
                mlog_errno(status);
                goto out;
        }

        status = ocfs2_inode_lock(inode, &di_bh, 1);
        if (status) {
                mlog_errno(status);
                goto out_rw_unlock;
        }

        /*
         * Remember that ip_xattr_sem also needs to be held if necessary.
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);

        status = __ocfs2_move_extents_range(di_bh, context);

        up_write(&OCFS2_I(inode)->ip_alloc_sem);
        if (status) {
                mlog_errno(status);
                goto out_inode_unlock;
        }

        /*
         * We update ctime for these changes.
         */
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
                goto out_inode_unlock;
        }

        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status) {
                mlog_errno(status);
                goto out_commit;
        }

        di = (struct ocfs2_dinode *)di_bh->b_data;
        inode->i_ctime = current_time(inode);
        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        ocfs2_update_inode_fsync_trans(handle, inode, 0);

        ocfs2_journal_dirty(handle, di_bh);

out_commit:
        ocfs2_commit_trans(osb, handle);

out_inode_unlock:
        brelse(di_bh);
        ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
out:
        inode_unlock(inode);

        return status;
}

int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
        int status;

        struct inode *inode = file_inode(filp);
        struct ocfs2_move_extents range;
        struct ocfs2_move_extents_context *context;

        if (!argp)
                return -EINVAL;

        status = mnt_want_write_file(filp);
        if (status)
                return status;

        if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
                status = -EPERM;
                goto out_drop;
        }

        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
                status = -EPERM;
                goto out_drop;
        }

        context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
        if (!context) {
                status = -ENOMEM;
                mlog_errno(status);
                goto out_drop;
        }

        context->inode = inode;
        context->file = filp;

        if (copy_from_user(&range, argp, sizeof(range))) {
                status = -EFAULT;
                goto out_free;
        }

        if (range.me_start > i_size_read(inode)) {
                status = -EINVAL;
                goto out_free;
        }

        if (range.me_start + range.me_len > i_size_read(inode))
                range.me_len = i_size_read(inode) - range.me_start;

        context->range = &range;

        if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
                context->auto_defrag = 1;
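                /*
                 * Note that me_threshold is given in bytes and is later
                 * converted to clusters in __ocfs2_move_extents_range()
                 * (defrag_thresh = me_threshold >> s_clustersize_bits).
                 * For example, the default 1MB threshold with a 4KB
                 * cluster size yields 256-cluster defrag cycles.
                 */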
                /*
                 * OK, the default threshold for defragmentation is 1M,
                 * since our maximum cluster size is also 1M. Any thoughts?
                 */
                if (!range.me_threshold)
                        range.me_threshold = 1024 * 1024;

                if (range.me_threshold > i_size_read(inode))
                        range.me_threshold = i_size_read(inode);

                if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
                        context->partial = 1;
        } else {
                /*
                 * First, a best-effort attempt to validate and adjust the
                 * goal (a physical address in blocks). It can't guarantee
                 * that the later operation will always succeed, since the
                 * global_bitmap may change a bit over time.
                 */
                status = ocfs2_validate_and_adjust_move_goal(inode, &range);
                if (status)
                        goto out_copy;
        }

        status = ocfs2_move_extents(context);
        if (status)
                mlog_errno(status);
out_copy:
        /*
         * The movement/defragmentation may end up being only partially
         * completed; that is why we need to return the finished length and
         * new_offset to userspace even if a failure happens somewhere.
         */
        if (copy_to_user(argp, &range, sizeof(range)))
                status = -EFAULT;

out_free:
        kfree(context);
out_drop:
        mnt_drop_write_file(filp);

        return status;
}
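/*
 * A minimal userspace sketch of driving this ioctl, for illustration only
 * (not part of the kernel build). It assumes OCFS2_IOC_MOVE_EXT and
 * struct ocfs2_move_extents as exposed by ocfs2_ioctl.h (the include path
 * below may differ per distribution), and defrags the first 16MB of a
 * file in 1MB cycles, accepting partial results:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <ocfs2/ocfs2_ioctl.h>	// assumed header location
 *
 *	static int defrag_head(const char *path)
 *	{
 *		struct ocfs2_move_extents range;
 *		int fd = open(path, O_RDWR);
 *
 *		if (fd < 0)
 *			return -1;
 *
 *		memset(&range, 0, sizeof(range));
 *		range.me_start = 0;			// bytes
 *		range.me_len = 16 * 1024 * 1024;	// bytes
 *		range.me_threshold = 1024 * 1024;	// defrag cycle size
 *		range.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
 *				 OCFS2_MOVE_EXT_FL_PART_DEFRAG;
 *
 *		if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &range))
 *			perror("OCFS2_IOC_MOVE_EXT");
 *
 *		printf("moved %llu bytes\n",
 *		       (unsigned long long)range.me_moved_len);
 *		close(fd);
 *		return 0;
 *	}
 */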