userdlm.c (17627B)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * userdlm.c
 *
 * Code which implements the kernel side of a minimal userspace
 * interface to our DLM.
 *
 * Many of the functions here are pared down versions of dlmglue.c
 * functions.
 *
 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
 */

#include <linux/signal.h>
#include <linux/sched/signal.h>

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/crc32.h>

#include "../ocfs2_lockingver.h"
#include "../stackglue.h"
#include "userdlm.h"

#define MLOG_MASK_PREFIX ML_DLMFS
#include "../cluster/masklog.h"

static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
{
        return container_of(lksb, struct user_lock_res, l_lksb);
}

static inline int user_check_wait_flag(struct user_lock_res *lockres,
                                       int flag)
{
        int ret;

        spin_lock(&lockres->l_lock);
        ret = lockres->l_flags & flag;
        spin_unlock(&lockres->l_lock);

        return ret;
}

static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
{
        wait_event(lockres->l_event,
                   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
}

static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
{
        wait_event(lockres->l_event,
                   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
}

/* I heart container_of... */
static inline struct ocfs2_cluster_connection *
cluster_connection_from_user_lockres(struct user_lock_res *lockres)
{
        struct dlmfs_inode_private *ip;

        ip = container_of(lockres,
                          struct dlmfs_inode_private,
                          ip_lockres);
        return ip->ip_conn;
}

static struct inode *
user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
{
        struct dlmfs_inode_private *ip;

        ip = container_of(lockres,
                          struct dlmfs_inode_private,
                          ip_lockres);
        return &ip->ip_vfs_inode;
}

static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
{
        spin_lock(&lockres->l_lock);
        lockres->l_flags &= ~USER_LOCK_BUSY;
        spin_unlock(&lockres->l_lock);
}

#define user_log_dlm_error(_func, _stat, _lockres) do {                \
        mlog(ML_ERROR, "Dlm error %d while calling %s on "             \
                "resource %.*s\n", _stat, _func,                       \
                _lockres->l_namelen, _lockres->l_name);                \
} while (0)

/* WARNING: This function lives in a world where the only three lock
 * levels are EX, PR, and NL. It *will* have to be adjusted when more
 * lock types are added. */
static inline int user_highest_compat_lock_level(int level)
{
        int new_level = DLM_LOCK_EX;

        if (level == DLM_LOCK_EX)
                new_level = DLM_LOCK_NL;
        else if (level == DLM_LOCK_PR)
                new_level = DLM_LOCK_PR;
        return new_level;
}

/* Lock AST: fired when our lock or convert request completes. */
static void user_ast(struct ocfs2_dlm_lksb *lksb)
{
        struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
        int status;

        mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
             lockres->l_namelen, lockres->l_name, lockres->l_level,
             lockres->l_requested);

        spin_lock(&lockres->l_lock);

        status = ocfs2_dlm_lock_status(&lockres->l_lksb);
        if (status) {
                mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
                     status, lockres->l_namelen, lockres->l_name);
                spin_unlock(&lockres->l_lock);
                return;
        }

        mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
                        "Lockres %.*s, requested ivmode. flags 0x%x\n",
                        lockres->l_namelen, lockres->l_name, lockres->l_flags);

        /* we're downconverting. */
        if (lockres->l_requested < lockres->l_level) {
                if (lockres->l_requested <=
                    user_highest_compat_lock_level(lockres->l_blocking)) {
                        lockres->l_blocking = DLM_LOCK_NL;
                        lockres->l_flags &= ~USER_LOCK_BLOCKED;
                }
        }

        lockres->l_level = lockres->l_requested;
        lockres->l_requested = DLM_LOCK_IV;
        lockres->l_flags |= USER_LOCK_ATTACHED;
        lockres->l_flags &= ~USER_LOCK_BUSY;

        spin_unlock(&lockres->l_lock);

        wake_up(&lockres->l_event);
}

static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
{
        struct inode *inode;
        inode = user_dlm_inode_from_user_lockres(lockres);
        if (!igrab(inode))
                BUG();
}

static void user_dlm_unblock_lock(struct work_struct *work);

static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
{
        if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
                user_dlm_grab_inode_ref(lockres);

                INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);

                queue_work(user_dlm_worker, &lockres->l_work);
                lockres->l_flags |= USER_LOCK_QUEUED;
        }
}

static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
{
        int queue = 0;

        if (!(lockres->l_flags & USER_LOCK_BLOCKED))
                return;

        switch (lockres->l_blocking) {
        case DLM_LOCK_EX:
                if (!lockres->l_ex_holders && !lockres->l_ro_holders)
                        queue = 1;
                break;
        case DLM_LOCK_PR:
                if (!lockres->l_ex_holders)
                        queue = 1;
                break;
        default:
                BUG();
        }

        if (queue)
                __user_dlm_queue_lockres(lockres);
}

/* Blocking AST: another node wants a level incompatible with ours. */
static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
{
        struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);

        mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
             lockres->l_namelen, lockres->l_name, level, lockres->l_level);

        spin_lock(&lockres->l_lock);
        lockres->l_flags |= USER_LOCK_BLOCKED;
        if (level > lockres->l_blocking)
                lockres->l_blocking = level;

        __user_dlm_queue_lockres(lockres);
        spin_unlock(&lockres->l_lock);

        wake_up(&lockres->l_event);
}

/* Unlock AST: fired when an unlock or cancel request completes. */
static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
{
        struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);

        mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
             lockres->l_namelen, lockres->l_name, lockres->l_flags);

        if (status)
                mlog(ML_ERROR, "dlm returns status %d\n", status);

        spin_lock(&lockres->l_lock);
        /* The teardown flag gets set early during the unlock process,
         * so test the cancel flag to make sure that this ast isn't
         * for a concurrent cancel. */
        if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
            && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
                lockres->l_level = DLM_LOCK_IV;
        } else if (status == DLM_CANCELGRANT) {
                /* We tried to cancel a convert request, but it was
                 * already granted. Don't clear the busy flag - the
                 * ast should've done this already. */
                BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
                lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
                goto out_noclear;
        } else {
                BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
                /* Cancel succeeded, we want to re-queue */
                lockres->l_requested = DLM_LOCK_IV; /* cancel an
                                                     * upconvert
                                                     * request. */
                lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
                /* we want the unblock thread to look at it again
                 * now. */
                if (lockres->l_flags & USER_LOCK_BLOCKED)
                        __user_dlm_queue_lockres(lockres);
        }

        lockres->l_flags &= ~USER_LOCK_BUSY;
out_noclear:
        spin_unlock(&lockres->l_lock);

        wake_up(&lockres->l_event);
}

/*
 * This is the userdlmfs locking protocol version.
 *
 * See fs/ocfs2/dlmglue.c for more details on locking versions.
 */
static struct ocfs2_locking_protocol user_dlm_lproto = {
        .lp_max_version = {
                .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
                .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
        },
        .lp_lock_ast            = user_ast,
        .lp_blocking_ast        = user_bast,
        .lp_unlock_ast          = user_unlock_ast,
};

static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
        struct inode *inode;
        inode = user_dlm_inode_from_user_lockres(lockres);
        iput(inode);
}

static void user_dlm_unblock_lock(struct work_struct *work)
{
        int new_level, status;
        struct user_lock_res *lockres =
                container_of(work, struct user_lock_res, l_work);
        struct ocfs2_cluster_connection *conn =
                cluster_connection_from_user_lockres(lockres);

        mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);

        spin_lock(&lockres->l_lock);

        mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
                        "Lockres %.*s, flags 0x%x\n",
                        lockres->l_namelen, lockres->l_name, lockres->l_flags);

        /* notice that we don't clear USER_LOCK_BLOCKED here. If it's
         * set, we want user_ast to clear it. */
        lockres->l_flags &= ~USER_LOCK_QUEUED;

        /* It's valid to get here and no longer be blocked - if we get
         * several basts in a row, we might be queued by the first
         * one, the unblock thread might run and clear the queued
         * flag, and finally we might get another bast which re-queues
         * us before our ast for the downconvert is called. */
        if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
                mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
                     lockres->l_namelen, lockres->l_name);
                spin_unlock(&lockres->l_lock);
                goto drop_ref;
        }

        if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
                mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
                     lockres->l_namelen, lockres->l_name);
                spin_unlock(&lockres->l_lock);
                goto drop_ref;
        }

        if (lockres->l_flags & USER_LOCK_BUSY) {
                if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
                        mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
                             lockres->l_namelen, lockres->l_name);
                        spin_unlock(&lockres->l_lock);
                        goto drop_ref;
                }

                lockres->l_flags |= USER_LOCK_IN_CANCEL;
                spin_unlock(&lockres->l_lock);

                status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
                                          DLM_LKF_CANCEL);
                if (status)
                        user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
                goto drop_ref;
        }

        /* If there are still incompat holders, we can exit safely
         * without worrying about re-queueing this lock as that will
         * happen on the last call to user_cluster_unlock. */
        if ((lockres->l_blocking == DLM_LOCK_EX)
            && (lockres->l_ex_holders || lockres->l_ro_holders)) {
                spin_unlock(&lockres->l_lock);
                mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
                     lockres->l_namelen, lockres->l_name,
                     lockres->l_ex_holders, lockres->l_ro_holders);
                goto drop_ref;
        }

        if ((lockres->l_blocking == DLM_LOCK_PR)
            && lockres->l_ex_holders) {
                spin_unlock(&lockres->l_lock);
                mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
                     lockres->l_namelen, lockres->l_name,
                     lockres->l_ex_holders);
                goto drop_ref;
        }

        /* yay, we can downconvert now. */
        new_level = user_highest_compat_lock_level(lockres->l_blocking);
        lockres->l_requested = new_level;
        lockres->l_flags |= USER_LOCK_BUSY;
        mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
             lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
        spin_unlock(&lockres->l_lock);

        /* need lock downconvert request now... */
        status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
                                DLM_LKF_CONVERT|DLM_LKF_VALBLK,
                                lockres->l_name,
                                lockres->l_namelen);
        if (status) {
                user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
                user_recover_from_dlm_error(lockres);
        }

drop_ref:
        user_dlm_drop_inode_ref(lockres);
}

static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
                                        int level)
{
        switch(level) {
        case DLM_LOCK_EX:
                lockres->l_ex_holders++;
                break;
        case DLM_LOCK_PR:
                lockres->l_ro_holders++;
                break;
        default:
                BUG();
        }
}

/* predict what lock level we'll be dropping down to on behalf
 * of another node, and return true if the currently wanted
 * level will be compatible with it. */
static inline int
user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
                                  int wanted)
{
        BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));

        return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
}

int user_dlm_cluster_lock(struct user_lock_res *lockres,
                          int level,
                          int lkm_flags)
{
        int status, local_flags;
        struct ocfs2_cluster_connection *conn =
                cluster_connection_from_user_lockres(lockres);

        if (level != DLM_LOCK_EX &&
            level != DLM_LOCK_PR) {
                mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
                     lockres->l_namelen, lockres->l_name);
                status = -EINVAL;
                goto bail;
        }

        mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
             lockres->l_namelen, lockres->l_name, level, lkm_flags);

again:
        if (signal_pending(current)) {
                status = -ERESTARTSYS;
                goto bail;
        }

        spin_lock(&lockres->l_lock);
        if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
                spin_unlock(&lockres->l_lock);
                status = -EAGAIN;
                goto bail;
        }

        /* We only compare against the currently granted level
         * here. If the lock is blocked waiting on a downconvert,
         * we'll get caught below. */
        if ((lockres->l_flags & USER_LOCK_BUSY) &&
            (level > lockres->l_level)) {
                /* is someone sitting in dlm_lock? If so, wait on
                 * them. */
                spin_unlock(&lockres->l_lock);

                user_wait_on_busy_lock(lockres);
                goto again;
        }

        if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
            (!user_may_continue_on_blocked_lock(lockres, level))) {
                /* the lock is currently blocked on behalf of
                 * another node */
                spin_unlock(&lockres->l_lock);

                user_wait_on_blocked_lock(lockres);
                goto again;
        }

        if (level > lockres->l_level) {
                local_flags = lkm_flags | DLM_LKF_VALBLK;
                if (lockres->l_level != DLM_LOCK_IV)
                        local_flags |= DLM_LKF_CONVERT;

                lockres->l_requested = level;
                lockres->l_flags |= USER_LOCK_BUSY;
                spin_unlock(&lockres->l_lock);

                BUG_ON(level == DLM_LOCK_IV);
                BUG_ON(level == DLM_LOCK_NL);

                /* call dlm_lock to upgrade lock now */
                status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
                                        local_flags, lockres->l_name,
                                        lockres->l_namelen);
                if (status) {
                        if ((lkm_flags & DLM_LKF_NOQUEUE) &&
                            (status != -EAGAIN))
                                user_log_dlm_error("ocfs2_dlm_lock",
                                                   status, lockres);
                        user_recover_from_dlm_error(lockres);
                        goto bail;
                }

                user_wait_on_busy_lock(lockres);
                goto again;
        }

        user_dlm_inc_holders(lockres, level);
        spin_unlock(&lockres->l_lock);

        status = 0;
bail:
        return status;
}

static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
                                        int level)
{
        switch(level) {
        case DLM_LOCK_EX:
                BUG_ON(!lockres->l_ex_holders);
                lockres->l_ex_holders--;
                break;
        case DLM_LOCK_PR:
                BUG_ON(!lockres->l_ro_holders);
                lockres->l_ro_holders--;
                break;
        default:
                BUG();
        }
}

void user_dlm_cluster_unlock(struct user_lock_res *lockres,
                             int level)
{
        if (level != DLM_LOCK_EX &&
            level != DLM_LOCK_PR) {
                mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
                     lockres->l_namelen, lockres->l_name);
                return;
        }

        spin_lock(&lockres->l_lock);
        user_dlm_dec_holders(lockres, level);
        __user_dlm_cond_queue_lockres(lockres);
        spin_unlock(&lockres->l_lock);
}

void user_dlm_write_lvb(struct inode *inode,
                        const char *val,
                        unsigned int len)
{
        struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
        char *lvb;

        BUG_ON(len > DLM_LVB_LEN);

        spin_lock(&lockres->l_lock);

        BUG_ON(lockres->l_level < DLM_LOCK_EX);
        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        memcpy(lvb, val, len);

        spin_unlock(&lockres->l_lock);
}

bool user_dlm_read_lvb(struct inode *inode, char *val)
{
        struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
        char *lvb;
        bool ret = true;

        spin_lock(&lockres->l_lock);

        BUG_ON(lockres->l_level < DLM_LOCK_PR);
        if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
                lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
                memcpy(val, lvb, DLM_LVB_LEN);
        } else
                ret = false;

        spin_unlock(&lockres->l_lock);
        return ret;
}

void user_dlm_lock_res_init(struct user_lock_res *lockres,
                            struct dentry *dentry)
{
        memset(lockres, 0, sizeof(*lockres));

        spin_lock_init(&lockres->l_lock);
        init_waitqueue_head(&lockres->l_event);
        lockres->l_level = DLM_LOCK_IV;
        lockres->l_requested = DLM_LOCK_IV;
        lockres->l_blocking = DLM_LOCK_IV;

        /* should have been checked before getting here. */
        BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);

        memcpy(lockres->l_name,
               dentry->d_name.name,
               dentry->d_name.len);
        lockres->l_namelen = dentry->d_name.len;
}

int user_dlm_destroy_lock(struct user_lock_res *lockres)
{
        int status = -EBUSY;
        struct ocfs2_cluster_connection *conn =
                cluster_connection_from_user_lockres(lockres);

        mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);

        spin_lock(&lockres->l_lock);
        if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
                spin_unlock(&lockres->l_lock);
                goto bail;
        }

        lockres->l_flags |= USER_LOCK_IN_TEARDOWN;

        while (lockres->l_flags & USER_LOCK_BUSY) {
                spin_unlock(&lockres->l_lock);

                user_wait_on_busy_lock(lockres);

                spin_lock(&lockres->l_lock);
        }

        if (lockres->l_ro_holders || lockres->l_ex_holders) {
                lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
                spin_unlock(&lockres->l_lock);
                goto bail;
        }

        status = 0;
        if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
                /*
                 * The lock was never requested, so leave USER_LOCK_IN_TEARDOWN
                 * set to prevent new lock requests from coming in.
                 */
                spin_unlock(&lockres->l_lock);
                goto bail;
        }

        lockres->l_flags |= USER_LOCK_BUSY;
        spin_unlock(&lockres->l_lock);

        status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
        if (status) {
                spin_lock(&lockres->l_lock);
                lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
                lockres->l_flags &= ~USER_LOCK_BUSY;
                spin_unlock(&lockres->l_lock);
                user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
                goto bail;
        }

        user_wait_on_busy_lock(lockres);

        status = 0;
bail:
        return status;
}

static void user_dlm_recovery_handler_noop(int node_num,
                                           void *recovery_data)
{
        /* We ignore recovery events */
        return;
}

void user_dlm_set_locking_protocol(void)
{
        ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
}

struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name)
{
        int rc;
        struct ocfs2_cluster_connection *conn;

        rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
                                            &user_dlm_lproto,
                                            user_dlm_recovery_handler_noop,
                                            NULL, &conn);
        if (rc)
                mlog_errno(rc);

        return rc ? ERR_PTR(rc) : conn;
}

void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
{
        ocfs2_cluster_disconnect(conn, 0);
}