user_namespace.c (36425B)
1// SPDX-License-Identifier: GPL-2.0-only 2 3#include <linux/export.h> 4#include <linux/nsproxy.h> 5#include <linux/slab.h> 6#include <linux/sched/signal.h> 7#include <linux/user_namespace.h> 8#include <linux/proc_ns.h> 9#include <linux/highuid.h> 10#include <linux/cred.h> 11#include <linux/securebits.h> 12#include <linux/keyctl.h> 13#include <linux/key-type.h> 14#include <keys/user-type.h> 15#include <linux/seq_file.h> 16#include <linux/fs.h> 17#include <linux/uaccess.h> 18#include <linux/ctype.h> 19#include <linux/projid.h> 20#include <linux/fs_struct.h> 21#include <linux/bsearch.h> 22#include <linux/sort.h> 23 24static struct kmem_cache *user_ns_cachep __read_mostly; 25static DEFINE_MUTEX(userns_state_mutex); 26 27static bool new_idmap_permitted(const struct file *file, 28 struct user_namespace *ns, int cap_setid, 29 struct uid_gid_map *map); 30static void free_user_ns(struct work_struct *work); 31 32static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) 33{ 34 return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); 35} 36 37static void dec_user_namespaces(struct ucounts *ucounts) 38{ 39 return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); 40} 41 42static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 43{ 44 /* Start with the same capabilities as init but useless for doing 45 * anything as the capabilities are bound to the new user namespace. 46 */ 47 cred->securebits = SECUREBITS_DEFAULT; 48 cred->cap_inheritable = CAP_EMPTY_SET; 49 cred->cap_permitted = CAP_FULL_SET; 50 cred->cap_effective = CAP_FULL_SET; 51 cred->cap_ambient = CAP_EMPTY_SET; 52 cred->cap_bset = CAP_FULL_SET; 53#ifdef CONFIG_KEYS 54 key_put(cred->request_key_auth); 55 cred->request_key_auth = NULL; 56#endif 57 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 58 cred->user_ns = user_ns; 59} 60 61static unsigned long enforced_nproc_rlimit(void) 62{ 63 unsigned long limit = RLIM_INFINITY; 64 65 /* Is RLIMIT_NPROC currently enforced? */ 66 if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || 67 (current_user_ns() != &init_user_ns)) 68 limit = rlimit(RLIMIT_NPROC); 69 70 return limit; 71} 72 73/* 74 * Create a new user namespace, deriving the creator from the user in the 75 * passed credentials, and replacing that user with the new root user for the 76 * new namespace. 77 * 78 * This is called by copy_creds(), which will finish setting the target task's 79 * credentials. 80 */ 81int create_user_ns(struct cred *new) 82{ 83 struct user_namespace *ns, *parent_ns = new->user_ns; 84 kuid_t owner = new->euid; 85 kgid_t group = new->egid; 86 struct ucounts *ucounts; 87 int ret, i; 88 89 ret = -ENOSPC; 90 if (parent_ns->level > 32) 91 goto fail; 92 93 ucounts = inc_user_namespaces(parent_ns, owner); 94 if (!ucounts) 95 goto fail; 96 97 /* 98 * Verify that we can not violate the policy of which files 99 * may be accessed that is specified by the root directory, 100 * by verifying that the root directory is at the root of the 101 * mount namespace which allows all files to be accessed. 102 */ 103 ret = -EPERM; 104 if (current_chrooted()) 105 goto fail_dec; 106 107 /* The creator needs a mapping in the parent user namespace 108 * or else we won't be able to reasonably tell userspace who 109 * created a user_namespace. 110 */ 111 ret = -EPERM; 112 if (!kuid_has_mapping(parent_ns, owner) || 113 !kgid_has_mapping(parent_ns, group)) 114 goto fail_dec; 115 116 ret = -ENOMEM; 117 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); 118 if (!ns) 119 goto fail_dec; 120 121 ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); 122 ret = ns_alloc_inum(&ns->ns); 123 if (ret) 124 goto fail_free; 125 ns->ns.ops = &userns_operations; 126 127 refcount_set(&ns->ns.count, 1); 128 /* Leave the new->user_ns reference with the new user namespace. */ 129 ns->parent = parent_ns; 130 ns->level = parent_ns->level + 1; 131 ns->owner = owner; 132 ns->group = group; 133 INIT_WORK(&ns->work, free_user_ns); 134 for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { 135 ns->ucount_max[i] = INT_MAX; 136 } 137 set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); 138 set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); 139 set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); 140 set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); 141 ns->ucounts = ucounts; 142 143 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ 144 mutex_lock(&userns_state_mutex); 145 ns->flags = parent_ns->flags; 146 mutex_unlock(&userns_state_mutex); 147 148#ifdef CONFIG_KEYS 149 INIT_LIST_HEAD(&ns->keyring_name_list); 150 init_rwsem(&ns->keyring_sem); 151#endif 152 ret = -ENOMEM; 153 if (!setup_userns_sysctls(ns)) 154 goto fail_keyring; 155 156 set_cred_user_ns(new, ns); 157 return 0; 158fail_keyring: 159#ifdef CONFIG_PERSISTENT_KEYRINGS 160 key_put(ns->persistent_keyring_register); 161#endif 162 ns_free_inum(&ns->ns); 163fail_free: 164 kmem_cache_free(user_ns_cachep, ns); 165fail_dec: 166 dec_user_namespaces(ucounts); 167fail: 168 return ret; 169} 170 171int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 172{ 173 struct cred *cred; 174 int err = -ENOMEM; 175 176 if (!(unshare_flags & CLONE_NEWUSER)) 177 return 0; 178 179 cred = prepare_creds(); 180 if (cred) { 181 err = create_user_ns(cred); 182 if (err) 183 put_cred(cred); 184 else 185 *new_cred = cred; 186 } 187 188 return err; 189} 190 191static void free_user_ns(struct work_struct *work) 192{ 193 struct user_namespace *parent, *ns = 194 container_of(work, struct user_namespace, work); 195 196 do { 197 struct ucounts *ucounts = ns->ucounts; 198 parent = ns->parent; 199 if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { 200 kfree(ns->gid_map.forward); 201 kfree(ns->gid_map.reverse); 202 } 203 if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { 204 kfree(ns->uid_map.forward); 205 kfree(ns->uid_map.reverse); 206 } 207 if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { 208 kfree(ns->projid_map.forward); 209 kfree(ns->projid_map.reverse); 210 } 211 retire_userns_sysctls(ns); 212 key_free_user_ns(ns); 213 ns_free_inum(&ns->ns); 214 kmem_cache_free(user_ns_cachep, ns); 215 dec_user_namespaces(ucounts); 216 ns = parent; 217 } while (refcount_dec_and_test(&parent->ns.count)); 218} 219 220void __put_user_ns(struct user_namespace *ns) 221{ 222 schedule_work(&ns->work); 223} 224EXPORT_SYMBOL(__put_user_ns); 225 226/** 227 * idmap_key struct holds the information necessary to find an idmapping in a 228 * sorted idmap array. It is passed to cmp_map_id() as first argument. 229 */ 230struct idmap_key { 231 bool map_up; /* true -> id from kid; false -> kid from id */ 232 u32 id; /* id to find */ 233 u32 count; /* == 0 unless used with map_id_range_down() */ 234}; 235 236/** 237 * cmp_map_id - Function to be passed to bsearch() to find the requested 238 * idmapping. Expects struct idmap_key to be passed via @k. 239 */ 240static int cmp_map_id(const void *k, const void *e) 241{ 242 u32 first, last, id2; 243 const struct idmap_key *key = k; 244 const struct uid_gid_extent *el = e; 245 246 id2 = key->id + key->count - 1; 247 248 /* handle map_id_{down,up}() */ 249 if (key->map_up) 250 first = el->lower_first; 251 else 252 first = el->first; 253 254 last = first + el->count - 1; 255 256 if (key->id >= first && key->id <= last && 257 (id2 >= first && id2 <= last)) 258 return 0; 259 260 if (key->id < first || id2 < first) 261 return -1; 262 263 return 1; 264} 265 266/** 267 * map_id_range_down_max - Find idmap via binary search in ordered idmap array. 268 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 269 */ 270static struct uid_gid_extent * 271map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) 272{ 273 struct idmap_key key; 274 275 key.map_up = false; 276 key.count = count; 277 key.id = id; 278 279 return bsearch(&key, map->forward, extents, 280 sizeof(struct uid_gid_extent), cmp_map_id); 281} 282 283/** 284 * map_id_range_down_base - Find idmap via binary search in static extent array. 285 * Can only be called if number of mappings is equal or less than 286 * UID_GID_MAP_MAX_BASE_EXTENTS. 287 */ 288static struct uid_gid_extent * 289map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) 290{ 291 unsigned idx; 292 u32 first, last, id2; 293 294 id2 = id + count - 1; 295 296 /* Find the matching extent */ 297 for (idx = 0; idx < extents; idx++) { 298 first = map->extent[idx].first; 299 last = first + map->extent[idx].count - 1; 300 if (id >= first && id <= last && 301 (id2 >= first && id2 <= last)) 302 return &map->extent[idx]; 303 } 304 return NULL; 305} 306 307static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) 308{ 309 struct uid_gid_extent *extent; 310 unsigned extents = map->nr_extents; 311 smp_rmb(); 312 313 if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 314 extent = map_id_range_down_base(extents, map, id, count); 315 else 316 extent = map_id_range_down_max(extents, map, id, count); 317 318 /* Map the id or note failure */ 319 if (extent) 320 id = (id - extent->first) + extent->lower_first; 321 else 322 id = (u32) -1; 323 324 return id; 325} 326 327static u32 map_id_down(struct uid_gid_map *map, u32 id) 328{ 329 return map_id_range_down(map, id, 1); 330} 331 332/** 333 * map_id_up_base - Find idmap via binary search in static extent array. 334 * Can only be called if number of mappings is equal or less than 335 * UID_GID_MAP_MAX_BASE_EXTENTS. 336 */ 337static struct uid_gid_extent * 338map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) 339{ 340 unsigned idx; 341 u32 first, last; 342 343 /* Find the matching extent */ 344 for (idx = 0; idx < extents; idx++) { 345 first = map->extent[idx].lower_first; 346 last = first + map->extent[idx].count - 1; 347 if (id >= first && id <= last) 348 return &map->extent[idx]; 349 } 350 return NULL; 351} 352 353/** 354 * map_id_up_max - Find idmap via binary search in ordered idmap array. 355 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 356 */ 357static struct uid_gid_extent * 358map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) 359{ 360 struct idmap_key key; 361 362 key.map_up = true; 363 key.count = 1; 364 key.id = id; 365 366 return bsearch(&key, map->reverse, extents, 367 sizeof(struct uid_gid_extent), cmp_map_id); 368} 369 370static u32 map_id_up(struct uid_gid_map *map, u32 id) 371{ 372 struct uid_gid_extent *extent; 373 unsigned extents = map->nr_extents; 374 smp_rmb(); 375 376 if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 377 extent = map_id_up_base(extents, map, id); 378 else 379 extent = map_id_up_max(extents, map, id); 380 381 /* Map the id or note failure */ 382 if (extent) 383 id = (id - extent->lower_first) + extent->first; 384 else 385 id = (u32) -1; 386 387 return id; 388} 389 390/** 391 * make_kuid - Map a user-namespace uid pair into a kuid. 392 * @ns: User namespace that the uid is in 393 * @uid: User identifier 394 * 395 * Maps a user-namespace uid pair into a kernel internal kuid, 396 * and returns that kuid. 397 * 398 * When there is no mapping defined for the user-namespace uid 399 * pair INVALID_UID is returned. Callers are expected to test 400 * for and handle INVALID_UID being returned. INVALID_UID 401 * may be tested for using uid_valid(). 402 */ 403kuid_t make_kuid(struct user_namespace *ns, uid_t uid) 404{ 405 /* Map the uid to a global kernel uid */ 406 return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); 407} 408EXPORT_SYMBOL(make_kuid); 409 410/** 411 * from_kuid - Create a uid from a kuid user-namespace pair. 412 * @targ: The user namespace we want a uid in. 413 * @kuid: The kernel internal uid to start with. 414 * 415 * Map @kuid into the user-namespace specified by @targ and 416 * return the resulting uid. 417 * 418 * There is always a mapping into the initial user_namespace. 419 * 420 * If @kuid has no mapping in @targ (uid_t)-1 is returned. 421 */ 422uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) 423{ 424 /* Map the uid from a global kernel uid */ 425 return map_id_up(&targ->uid_map, __kuid_val(kuid)); 426} 427EXPORT_SYMBOL(from_kuid); 428 429/** 430 * from_kuid_munged - Create a uid from a kuid user-namespace pair. 431 * @targ: The user namespace we want a uid in. 432 * @kuid: The kernel internal uid to start with. 433 * 434 * Map @kuid into the user-namespace specified by @targ and 435 * return the resulting uid. 436 * 437 * There is always a mapping into the initial user_namespace. 438 * 439 * Unlike from_kuid from_kuid_munged never fails and always 440 * returns a valid uid. This makes from_kuid_munged appropriate 441 * for use in syscalls like stat and getuid where failing the 442 * system call and failing to provide a valid uid are not an 443 * options. 444 * 445 * If @kuid has no mapping in @targ overflowuid is returned. 446 */ 447uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) 448{ 449 uid_t uid; 450 uid = from_kuid(targ, kuid); 451 452 if (uid == (uid_t) -1) 453 uid = overflowuid; 454 return uid; 455} 456EXPORT_SYMBOL(from_kuid_munged); 457 458/** 459 * make_kgid - Map a user-namespace gid pair into a kgid. 460 * @ns: User namespace that the gid is in 461 * @gid: group identifier 462 * 463 * Maps a user-namespace gid pair into a kernel internal kgid, 464 * and returns that kgid. 465 * 466 * When there is no mapping defined for the user-namespace gid 467 * pair INVALID_GID is returned. Callers are expected to test 468 * for and handle INVALID_GID being returned. INVALID_GID may be 469 * tested for using gid_valid(). 470 */ 471kgid_t make_kgid(struct user_namespace *ns, gid_t gid) 472{ 473 /* Map the gid to a global kernel gid */ 474 return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); 475} 476EXPORT_SYMBOL(make_kgid); 477 478/** 479 * from_kgid - Create a gid from a kgid user-namespace pair. 480 * @targ: The user namespace we want a gid in. 481 * @kgid: The kernel internal gid to start with. 482 * 483 * Map @kgid into the user-namespace specified by @targ and 484 * return the resulting gid. 485 * 486 * There is always a mapping into the initial user_namespace. 487 * 488 * If @kgid has no mapping in @targ (gid_t)-1 is returned. 489 */ 490gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) 491{ 492 /* Map the gid from a global kernel gid */ 493 return map_id_up(&targ->gid_map, __kgid_val(kgid)); 494} 495EXPORT_SYMBOL(from_kgid); 496 497/** 498 * from_kgid_munged - Create a gid from a kgid user-namespace pair. 499 * @targ: The user namespace we want a gid in. 500 * @kgid: The kernel internal gid to start with. 501 * 502 * Map @kgid into the user-namespace specified by @targ and 503 * return the resulting gid. 504 * 505 * There is always a mapping into the initial user_namespace. 506 * 507 * Unlike from_kgid from_kgid_munged never fails and always 508 * returns a valid gid. This makes from_kgid_munged appropriate 509 * for use in syscalls like stat and getgid where failing the 510 * system call and failing to provide a valid gid are not options. 511 * 512 * If @kgid has no mapping in @targ overflowgid is returned. 513 */ 514gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) 515{ 516 gid_t gid; 517 gid = from_kgid(targ, kgid); 518 519 if (gid == (gid_t) -1) 520 gid = overflowgid; 521 return gid; 522} 523EXPORT_SYMBOL(from_kgid_munged); 524 525/** 526 * make_kprojid - Map a user-namespace projid pair into a kprojid. 527 * @ns: User namespace that the projid is in 528 * @projid: Project identifier 529 * 530 * Maps a user-namespace uid pair into a kernel internal kuid, 531 * and returns that kuid. 532 * 533 * When there is no mapping defined for the user-namespace projid 534 * pair INVALID_PROJID is returned. Callers are expected to test 535 * for and handle INVALID_PROJID being returned. INVALID_PROJID 536 * may be tested for using projid_valid(). 537 */ 538kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) 539{ 540 /* Map the uid to a global kernel uid */ 541 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); 542} 543EXPORT_SYMBOL(make_kprojid); 544 545/** 546 * from_kprojid - Create a projid from a kprojid user-namespace pair. 547 * @targ: The user namespace we want a projid in. 548 * @kprojid: The kernel internal project identifier to start with. 549 * 550 * Map @kprojid into the user-namespace specified by @targ and 551 * return the resulting projid. 552 * 553 * There is always a mapping into the initial user_namespace. 554 * 555 * If @kprojid has no mapping in @targ (projid_t)-1 is returned. 556 */ 557projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) 558{ 559 /* Map the uid from a global kernel uid */ 560 return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); 561} 562EXPORT_SYMBOL(from_kprojid); 563 564/** 565 * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. 566 * @targ: The user namespace we want a projid in. 567 * @kprojid: The kernel internal projid to start with. 568 * 569 * Map @kprojid into the user-namespace specified by @targ and 570 * return the resulting projid. 571 * 572 * There is always a mapping into the initial user_namespace. 573 * 574 * Unlike from_kprojid from_kprojid_munged never fails and always 575 * returns a valid projid. This makes from_kprojid_munged 576 * appropriate for use in syscalls like stat and where 577 * failing the system call and failing to provide a valid projid are 578 * not an options. 579 * 580 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. 581 */ 582projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) 583{ 584 projid_t projid; 585 projid = from_kprojid(targ, kprojid); 586 587 if (projid == (projid_t) -1) 588 projid = OVERFLOW_PROJID; 589 return projid; 590} 591EXPORT_SYMBOL(from_kprojid_munged); 592 593 594static int uid_m_show(struct seq_file *seq, void *v) 595{ 596 struct user_namespace *ns = seq->private; 597 struct uid_gid_extent *extent = v; 598 struct user_namespace *lower_ns; 599 uid_t lower; 600 601 lower_ns = seq_user_ns(seq); 602 if ((lower_ns == ns) && lower_ns->parent) 603 lower_ns = lower_ns->parent; 604 605 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); 606 607 seq_printf(seq, "%10u %10u %10u\n", 608 extent->first, 609 lower, 610 extent->count); 611 612 return 0; 613} 614 615static int gid_m_show(struct seq_file *seq, void *v) 616{ 617 struct user_namespace *ns = seq->private; 618 struct uid_gid_extent *extent = v; 619 struct user_namespace *lower_ns; 620 gid_t lower; 621 622 lower_ns = seq_user_ns(seq); 623 if ((lower_ns == ns) && lower_ns->parent) 624 lower_ns = lower_ns->parent; 625 626 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); 627 628 seq_printf(seq, "%10u %10u %10u\n", 629 extent->first, 630 lower, 631 extent->count); 632 633 return 0; 634} 635 636static int projid_m_show(struct seq_file *seq, void *v) 637{ 638 struct user_namespace *ns = seq->private; 639 struct uid_gid_extent *extent = v; 640 struct user_namespace *lower_ns; 641 projid_t lower; 642 643 lower_ns = seq_user_ns(seq); 644 if ((lower_ns == ns) && lower_ns->parent) 645 lower_ns = lower_ns->parent; 646 647 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); 648 649 seq_printf(seq, "%10u %10u %10u\n", 650 extent->first, 651 lower, 652 extent->count); 653 654 return 0; 655} 656 657static void *m_start(struct seq_file *seq, loff_t *ppos, 658 struct uid_gid_map *map) 659{ 660 loff_t pos = *ppos; 661 unsigned extents = map->nr_extents; 662 smp_rmb(); 663 664 if (pos >= extents) 665 return NULL; 666 667 if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 668 return &map->extent[pos]; 669 670 return &map->forward[pos]; 671} 672 673static void *uid_m_start(struct seq_file *seq, loff_t *ppos) 674{ 675 struct user_namespace *ns = seq->private; 676 677 return m_start(seq, ppos, &ns->uid_map); 678} 679 680static void *gid_m_start(struct seq_file *seq, loff_t *ppos) 681{ 682 struct user_namespace *ns = seq->private; 683 684 return m_start(seq, ppos, &ns->gid_map); 685} 686 687static void *projid_m_start(struct seq_file *seq, loff_t *ppos) 688{ 689 struct user_namespace *ns = seq->private; 690 691 return m_start(seq, ppos, &ns->projid_map); 692} 693 694static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 695{ 696 (*pos)++; 697 return seq->op->start(seq, pos); 698} 699 700static void m_stop(struct seq_file *seq, void *v) 701{ 702 return; 703} 704 705const struct seq_operations proc_uid_seq_operations = { 706 .start = uid_m_start, 707 .stop = m_stop, 708 .next = m_next, 709 .show = uid_m_show, 710}; 711 712const struct seq_operations proc_gid_seq_operations = { 713 .start = gid_m_start, 714 .stop = m_stop, 715 .next = m_next, 716 .show = gid_m_show, 717}; 718 719const struct seq_operations proc_projid_seq_operations = { 720 .start = projid_m_start, 721 .stop = m_stop, 722 .next = m_next, 723 .show = projid_m_show, 724}; 725 726static bool mappings_overlap(struct uid_gid_map *new_map, 727 struct uid_gid_extent *extent) 728{ 729 u32 upper_first, lower_first, upper_last, lower_last; 730 unsigned idx; 731 732 upper_first = extent->first; 733 lower_first = extent->lower_first; 734 upper_last = upper_first + extent->count - 1; 735 lower_last = lower_first + extent->count - 1; 736 737 for (idx = 0; idx < new_map->nr_extents; idx++) { 738 u32 prev_upper_first, prev_lower_first; 739 u32 prev_upper_last, prev_lower_last; 740 struct uid_gid_extent *prev; 741 742 if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 743 prev = &new_map->extent[idx]; 744 else 745 prev = &new_map->forward[idx]; 746 747 prev_upper_first = prev->first; 748 prev_lower_first = prev->lower_first; 749 prev_upper_last = prev_upper_first + prev->count - 1; 750 prev_lower_last = prev_lower_first + prev->count - 1; 751 752 /* Does the upper range intersect a previous extent? */ 753 if ((prev_upper_first <= upper_last) && 754 (prev_upper_last >= upper_first)) 755 return true; 756 757 /* Does the lower range intersect a previous extent? */ 758 if ((prev_lower_first <= lower_last) && 759 (prev_lower_last >= lower_first)) 760 return true; 761 } 762 return false; 763} 764 765/** 766 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. 767 * Takes care to allocate a 4K block of memory if the number of mappings exceeds 768 * UID_GID_MAP_MAX_BASE_EXTENTS. 769 */ 770static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) 771{ 772 struct uid_gid_extent *dest; 773 774 if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { 775 struct uid_gid_extent *forward; 776 777 /* Allocate memory for 340 mappings. */ 778 forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, 779 sizeof(struct uid_gid_extent), 780 GFP_KERNEL); 781 if (!forward) 782 return -ENOMEM; 783 784 /* Copy over memory. Only set up memory for the forward pointer. 785 * Defer the memory setup for the reverse pointer. 786 */ 787 memcpy(forward, map->extent, 788 map->nr_extents * sizeof(map->extent[0])); 789 790 map->forward = forward; 791 map->reverse = NULL; 792 } 793 794 if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) 795 dest = &map->extent[map->nr_extents]; 796 else 797 dest = &map->forward[map->nr_extents]; 798 799 *dest = *extent; 800 map->nr_extents++; 801 return 0; 802} 803 804/* cmp function to sort() forward mappings */ 805static int cmp_extents_forward(const void *a, const void *b) 806{ 807 const struct uid_gid_extent *e1 = a; 808 const struct uid_gid_extent *e2 = b; 809 810 if (e1->first < e2->first) 811 return -1; 812 813 if (e1->first > e2->first) 814 return 1; 815 816 return 0; 817} 818 819/* cmp function to sort() reverse mappings */ 820static int cmp_extents_reverse(const void *a, const void *b) 821{ 822 const struct uid_gid_extent *e1 = a; 823 const struct uid_gid_extent *e2 = b; 824 825 if (e1->lower_first < e2->lower_first) 826 return -1; 827 828 if (e1->lower_first > e2->lower_first) 829 return 1; 830 831 return 0; 832} 833 834/** 835 * sort_idmaps - Sorts an array of idmap entries. 836 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 837 */ 838static int sort_idmaps(struct uid_gid_map *map) 839{ 840 if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 841 return 0; 842 843 /* Sort forward array. */ 844 sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), 845 cmp_extents_forward, NULL); 846 847 /* Only copy the memory from forward we actually need. */ 848 map->reverse = kmemdup(map->forward, 849 map->nr_extents * sizeof(struct uid_gid_extent), 850 GFP_KERNEL); 851 if (!map->reverse) 852 return -ENOMEM; 853 854 /* Sort reverse array. */ 855 sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), 856 cmp_extents_reverse, NULL); 857 858 return 0; 859} 860 861/** 862 * verify_root_map() - check the uid 0 mapping 863 * @file: idmapping file 864 * @map_ns: user namespace of the target process 865 * @new_map: requested idmap 866 * 867 * If a process requests mapping parent uid 0 into the new ns, verify that the 868 * process writing the map had the CAP_SETFCAP capability as the target process 869 * will be able to write fscaps that are valid in ancestor user namespaces. 870 * 871 * Return: true if the mapping is allowed, false if not. 872 */ 873static bool verify_root_map(const struct file *file, 874 struct user_namespace *map_ns, 875 struct uid_gid_map *new_map) 876{ 877 int idx; 878 const struct user_namespace *file_ns = file->f_cred->user_ns; 879 struct uid_gid_extent *extent0 = NULL; 880 881 for (idx = 0; idx < new_map->nr_extents; idx++) { 882 if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 883 extent0 = &new_map->extent[idx]; 884 else 885 extent0 = &new_map->forward[idx]; 886 if (extent0->lower_first == 0) 887 break; 888 889 extent0 = NULL; 890 } 891 892 if (!extent0) 893 return true; 894 895 if (map_ns == file_ns) { 896 /* The process unshared its ns and is writing to its own 897 * /proc/self/uid_map. User already has full capabilites in 898 * the new namespace. Verify that the parent had CAP_SETFCAP 899 * when it unshared. 900 * */ 901 if (!file_ns->parent_could_setfcap) 902 return false; 903 } else { 904 /* Process p1 is writing to uid_map of p2, who is in a child 905 * user namespace to p1's. Verify that the opener of the map 906 * file has CAP_SETFCAP against the parent of the new map 907 * namespace */ 908 if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) 909 return false; 910 } 911 912 return true; 913} 914 915static ssize_t map_write(struct file *file, const char __user *buf, 916 size_t count, loff_t *ppos, 917 int cap_setid, 918 struct uid_gid_map *map, 919 struct uid_gid_map *parent_map) 920{ 921 struct seq_file *seq = file->private_data; 922 struct user_namespace *map_ns = seq->private; 923 struct uid_gid_map new_map; 924 unsigned idx; 925 struct uid_gid_extent extent; 926 char *kbuf = NULL, *pos, *next_line; 927 ssize_t ret; 928 929 /* Only allow < page size writes at the beginning of the file */ 930 if ((*ppos != 0) || (count >= PAGE_SIZE)) 931 return -EINVAL; 932 933 /* Slurp in the user data */ 934 kbuf = memdup_user_nul(buf, count); 935 if (IS_ERR(kbuf)) 936 return PTR_ERR(kbuf); 937 938 /* 939 * The userns_state_mutex serializes all writes to any given map. 940 * 941 * Any map is only ever written once. 942 * 943 * An id map fits within 1 cache line on most architectures. 944 * 945 * On read nothing needs to be done unless you are on an 946 * architecture with a crazy cache coherency model like alpha. 947 * 948 * There is a one time data dependency between reading the 949 * count of the extents and the values of the extents. The 950 * desired behavior is to see the values of the extents that 951 * were written before the count of the extents. 952 * 953 * To achieve this smp_wmb() is used on guarantee the write 954 * order and smp_rmb() is guaranteed that we don't have crazy 955 * architectures returning stale data. 956 */ 957 mutex_lock(&userns_state_mutex); 958 959 memset(&new_map, 0, sizeof(struct uid_gid_map)); 960 961 ret = -EPERM; 962 /* Only allow one successful write to the map */ 963 if (map->nr_extents != 0) 964 goto out; 965 966 /* 967 * Adjusting namespace settings requires capabilities on the target. 968 */ 969 if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) 970 goto out; 971 972 /* Parse the user data */ 973 ret = -EINVAL; 974 pos = kbuf; 975 for (; pos; pos = next_line) { 976 977 /* Find the end of line and ensure I don't look past it */ 978 next_line = strchr(pos, '\n'); 979 if (next_line) { 980 *next_line = '\0'; 981 next_line++; 982 if (*next_line == '\0') 983 next_line = NULL; 984 } 985 986 pos = skip_spaces(pos); 987 extent.first = simple_strtoul(pos, &pos, 10); 988 if (!isspace(*pos)) 989 goto out; 990 991 pos = skip_spaces(pos); 992 extent.lower_first = simple_strtoul(pos, &pos, 10); 993 if (!isspace(*pos)) 994 goto out; 995 996 pos = skip_spaces(pos); 997 extent.count = simple_strtoul(pos, &pos, 10); 998 if (*pos && !isspace(*pos)) 999 goto out; 1000 1001 /* Verify there is not trailing junk on the line */ 1002 pos = skip_spaces(pos); 1003 if (*pos != '\0') 1004 goto out; 1005 1006 /* Verify we have been given valid starting values */ 1007 if ((extent.first == (u32) -1) || 1008 (extent.lower_first == (u32) -1)) 1009 goto out; 1010 1011 /* Verify count is not zero and does not cause the 1012 * extent to wrap 1013 */ 1014 if ((extent.first + extent.count) <= extent.first) 1015 goto out; 1016 if ((extent.lower_first + extent.count) <= 1017 extent.lower_first) 1018 goto out; 1019 1020 /* Do the ranges in extent overlap any previous extents? */ 1021 if (mappings_overlap(&new_map, &extent)) 1022 goto out; 1023 1024 if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && 1025 (next_line != NULL)) 1026 goto out; 1027 1028 ret = insert_extent(&new_map, &extent); 1029 if (ret < 0) 1030 goto out; 1031 ret = -EINVAL; 1032 } 1033 /* Be very certain the new map actually exists */ 1034 if (new_map.nr_extents == 0) 1035 goto out; 1036 1037 ret = -EPERM; 1038 /* Validate the user is allowed to use user id's mapped to. */ 1039 if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) 1040 goto out; 1041 1042 ret = -EPERM; 1043 /* Map the lower ids from the parent user namespace to the 1044 * kernel global id space. 1045 */ 1046 for (idx = 0; idx < new_map.nr_extents; idx++) { 1047 struct uid_gid_extent *e; 1048 u32 lower_first; 1049 1050 if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) 1051 e = &new_map.extent[idx]; 1052 else 1053 e = &new_map.forward[idx]; 1054 1055 lower_first = map_id_range_down(parent_map, 1056 e->lower_first, 1057 e->count); 1058 1059 /* Fail if we can not map the specified extent to 1060 * the kernel global id space. 1061 */ 1062 if (lower_first == (u32) -1) 1063 goto out; 1064 1065 e->lower_first = lower_first; 1066 } 1067 1068 /* 1069 * If we want to use binary search for lookup, this clones the extent 1070 * array and sorts both copies. 1071 */ 1072 ret = sort_idmaps(&new_map); 1073 if (ret < 0) 1074 goto out; 1075 1076 /* Install the map */ 1077 if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { 1078 memcpy(map->extent, new_map.extent, 1079 new_map.nr_extents * sizeof(new_map.extent[0])); 1080 } else { 1081 map->forward = new_map.forward; 1082 map->reverse = new_map.reverse; 1083 } 1084 smp_wmb(); 1085 map->nr_extents = new_map.nr_extents; 1086 1087 *ppos = count; 1088 ret = count; 1089out: 1090 if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { 1091 kfree(new_map.forward); 1092 kfree(new_map.reverse); 1093 map->forward = NULL; 1094 map->reverse = NULL; 1095 map->nr_extents = 0; 1096 } 1097 1098 mutex_unlock(&userns_state_mutex); 1099 kfree(kbuf); 1100 return ret; 1101} 1102 1103ssize_t proc_uid_map_write(struct file *file, const char __user *buf, 1104 size_t size, loff_t *ppos) 1105{ 1106 struct seq_file *seq = file->private_data; 1107 struct user_namespace *ns = seq->private; 1108 struct user_namespace *seq_ns = seq_user_ns(seq); 1109 1110 if (!ns->parent) 1111 return -EPERM; 1112 1113 if ((seq_ns != ns) && (seq_ns != ns->parent)) 1114 return -EPERM; 1115 1116 return map_write(file, buf, size, ppos, CAP_SETUID, 1117 &ns->uid_map, &ns->parent->uid_map); 1118} 1119 1120ssize_t proc_gid_map_write(struct file *file, const char __user *buf, 1121 size_t size, loff_t *ppos) 1122{ 1123 struct seq_file *seq = file->private_data; 1124 struct user_namespace *ns = seq->private; 1125 struct user_namespace *seq_ns = seq_user_ns(seq); 1126 1127 if (!ns->parent) 1128 return -EPERM; 1129 1130 if ((seq_ns != ns) && (seq_ns != ns->parent)) 1131 return -EPERM; 1132 1133 return map_write(file, buf, size, ppos, CAP_SETGID, 1134 &ns->gid_map, &ns->parent->gid_map); 1135} 1136 1137ssize_t proc_projid_map_write(struct file *file, const char __user *buf, 1138 size_t size, loff_t *ppos) 1139{ 1140 struct seq_file *seq = file->private_data; 1141 struct user_namespace *ns = seq->private; 1142 struct user_namespace *seq_ns = seq_user_ns(seq); 1143 1144 if (!ns->parent) 1145 return -EPERM; 1146 1147 if ((seq_ns != ns) && (seq_ns != ns->parent)) 1148 return -EPERM; 1149 1150 /* Anyone can set any valid project id no capability needed */ 1151 return map_write(file, buf, size, ppos, -1, 1152 &ns->projid_map, &ns->parent->projid_map); 1153} 1154 1155static bool new_idmap_permitted(const struct file *file, 1156 struct user_namespace *ns, int cap_setid, 1157 struct uid_gid_map *new_map) 1158{ 1159 const struct cred *cred = file->f_cred; 1160 1161 if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) 1162 return false; 1163 1164 /* Don't allow mappings that would allow anything that wouldn't 1165 * be allowed without the establishment of unprivileged mappings. 1166 */ 1167 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && 1168 uid_eq(ns->owner, cred->euid)) { 1169 u32 id = new_map->extent[0].lower_first; 1170 if (cap_setid == CAP_SETUID) { 1171 kuid_t uid = make_kuid(ns->parent, id); 1172 if (uid_eq(uid, cred->euid)) 1173 return true; 1174 } else if (cap_setid == CAP_SETGID) { 1175 kgid_t gid = make_kgid(ns->parent, id); 1176 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && 1177 gid_eq(gid, cred->egid)) 1178 return true; 1179 } 1180 } 1181 1182 /* Allow anyone to set a mapping that doesn't require privilege */ 1183 if (!cap_valid(cap_setid)) 1184 return true; 1185 1186 /* Allow the specified ids if we have the appropriate capability 1187 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 1188 * And the opener of the id file also has the appropriate capability. 1189 */ 1190 if (ns_capable(ns->parent, cap_setid) && 1191 file_ns_capable(file, ns->parent, cap_setid)) 1192 return true; 1193 1194 return false; 1195} 1196 1197int proc_setgroups_show(struct seq_file *seq, void *v) 1198{ 1199 struct user_namespace *ns = seq->private; 1200 unsigned long userns_flags = READ_ONCE(ns->flags); 1201 1202 seq_printf(seq, "%s\n", 1203 (userns_flags & USERNS_SETGROUPS_ALLOWED) ? 1204 "allow" : "deny"); 1205 return 0; 1206} 1207 1208ssize_t proc_setgroups_write(struct file *file, const char __user *buf, 1209 size_t count, loff_t *ppos) 1210{ 1211 struct seq_file *seq = file->private_data; 1212 struct user_namespace *ns = seq->private; 1213 char kbuf[8], *pos; 1214 bool setgroups_allowed; 1215 ssize_t ret; 1216 1217 /* Only allow a very narrow range of strings to be written */ 1218 ret = -EINVAL; 1219 if ((*ppos != 0) || (count >= sizeof(kbuf))) 1220 goto out; 1221 1222 /* What was written? */ 1223 ret = -EFAULT; 1224 if (copy_from_user(kbuf, buf, count)) 1225 goto out; 1226 kbuf[count] = '\0'; 1227 pos = kbuf; 1228 1229 /* What is being requested? */ 1230 ret = -EINVAL; 1231 if (strncmp(pos, "allow", 5) == 0) { 1232 pos += 5; 1233 setgroups_allowed = true; 1234 } 1235 else if (strncmp(pos, "deny", 4) == 0) { 1236 pos += 4; 1237 setgroups_allowed = false; 1238 } 1239 else 1240 goto out; 1241 1242 /* Verify there is not trailing junk on the line */ 1243 pos = skip_spaces(pos); 1244 if (*pos != '\0') 1245 goto out; 1246 1247 ret = -EPERM; 1248 mutex_lock(&userns_state_mutex); 1249 if (setgroups_allowed) { 1250 /* Enabling setgroups after setgroups has been disabled 1251 * is not allowed. 1252 */ 1253 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) 1254 goto out_unlock; 1255 } else { 1256 /* Permanently disabling setgroups after setgroups has 1257 * been enabled by writing the gid_map is not allowed. 1258 */ 1259 if (ns->gid_map.nr_extents != 0) 1260 goto out_unlock; 1261 ns->flags &= ~USERNS_SETGROUPS_ALLOWED; 1262 } 1263 mutex_unlock(&userns_state_mutex); 1264 1265 /* Report a successful write */ 1266 *ppos = count; 1267 ret = count; 1268out: 1269 return ret; 1270out_unlock: 1271 mutex_unlock(&userns_state_mutex); 1272 goto out; 1273} 1274 1275bool userns_may_setgroups(const struct user_namespace *ns) 1276{ 1277 bool allowed; 1278 1279 mutex_lock(&userns_state_mutex); 1280 /* It is not safe to use setgroups until a gid mapping in 1281 * the user namespace has been established. 1282 */ 1283 allowed = ns->gid_map.nr_extents != 0; 1284 /* Is setgroups allowed? */ 1285 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); 1286 mutex_unlock(&userns_state_mutex); 1287 1288 return allowed; 1289} 1290 1291/* 1292 * Returns true if @child is the same namespace or a descendant of 1293 * @ancestor. 1294 */ 1295bool in_userns(const struct user_namespace *ancestor, 1296 const struct user_namespace *child) 1297{ 1298 const struct user_namespace *ns; 1299 for (ns = child; ns->level > ancestor->level; ns = ns->parent) 1300 ; 1301 return (ns == ancestor); 1302} 1303 1304bool current_in_userns(const struct user_namespace *target_ns) 1305{ 1306 return in_userns(target_ns, current_user_ns()); 1307} 1308EXPORT_SYMBOL(current_in_userns); 1309 1310static inline struct user_namespace *to_user_ns(struct ns_common *ns) 1311{ 1312 return container_of(ns, struct user_namespace, ns); 1313} 1314 1315static struct ns_common *userns_get(struct task_struct *task) 1316{ 1317 struct user_namespace *user_ns; 1318 1319 rcu_read_lock(); 1320 user_ns = get_user_ns(__task_cred(task)->user_ns); 1321 rcu_read_unlock(); 1322 1323 return user_ns ? &user_ns->ns : NULL; 1324} 1325 1326static void userns_put(struct ns_common *ns) 1327{ 1328 put_user_ns(to_user_ns(ns)); 1329} 1330 1331static int userns_install(struct nsset *nsset, struct ns_common *ns) 1332{ 1333 struct user_namespace *user_ns = to_user_ns(ns); 1334 struct cred *cred; 1335 1336 /* Don't allow gaining capabilities by reentering 1337 * the same user namespace. 1338 */ 1339 if (user_ns == current_user_ns()) 1340 return -EINVAL; 1341 1342 /* Tasks that share a thread group must share a user namespace */ 1343 if (!thread_group_empty(current)) 1344 return -EINVAL; 1345 1346 if (current->fs->users != 1) 1347 return -EINVAL; 1348 1349 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 1350 return -EPERM; 1351 1352 cred = nsset_cred(nsset); 1353 if (!cred) 1354 return -EINVAL; 1355 1356 put_user_ns(cred->user_ns); 1357 set_cred_user_ns(cred, get_user_ns(user_ns)); 1358 1359 if (set_cred_ucounts(cred) < 0) 1360 return -EINVAL; 1361 1362 return 0; 1363} 1364 1365struct ns_common *ns_get_owner(struct ns_common *ns) 1366{ 1367 struct user_namespace *my_user_ns = current_user_ns(); 1368 struct user_namespace *owner, *p; 1369 1370 /* See if the owner is in the current user namespace */ 1371 owner = p = ns->ops->owner(ns); 1372 for (;;) { 1373 if (!p) 1374 return ERR_PTR(-EPERM); 1375 if (p == my_user_ns) 1376 break; 1377 p = p->parent; 1378 } 1379 1380 return &get_user_ns(owner)->ns; 1381} 1382 1383static struct user_namespace *userns_owner(struct ns_common *ns) 1384{ 1385 return to_user_ns(ns)->parent; 1386} 1387 1388const struct proc_ns_operations userns_operations = { 1389 .name = "user", 1390 .type = CLONE_NEWUSER, 1391 .get = userns_get, 1392 .put = userns_put, 1393 .install = userns_install, 1394 .owner = userns_owner, 1395 .get_parent = ns_get_owner, 1396}; 1397 1398static __init int user_namespaces_init(void) 1399{ 1400 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); 1401 return 0; 1402} 1403subsys_initcall(user_namespaces_init);