slob.c (19087B)
// SPDX-License-Identifier: GPL-2.0
/*
 * SLOB Allocator: Simple List Of Blocks
 *
 * Matt Mackall <mpm@selenic.com> 12/30/03
 *
 * NUMA support by Paul Mundt, 2007.
 *
 * How SLOB works:
 *
 * The core of SLOB is a traditional K&R style heap allocator, with
 * support for returning aligned objects. The granularity of this
 * allocator is as little as 2 bytes, however typically most architectures
 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
 *
 * The slob heap is a set of linked lists of pages from alloc_pages(),
 * and within each page, there is a singly-linked list of free blocks
 * (slob_t). The heap is grown on demand. To reduce fragmentation,
 * heap pages are segregated into three lists, with objects less than
 * 256 bytes, objects less than 1024 bytes, and all other objects.
 *
 * Allocation from heap involves first searching for a page with
 * sufficient free blocks (using a next-fit-like approach) followed by
 * a first-fit scan of the page. Deallocation inserts objects back
 * into the free list in address order, so this is effectively an
 * address-ordered first fit.
 *
 * Above this is an implementation of kmalloc/kfree. Blocks returned
 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
 * alloc_pages() directly, allocating compound pages so the page order
 * does not have to be separately tracked.
 * These objects are detected in kfree() because folio_test_slab()
 * is false for them.
 *
 * SLAB is emulated on top of SLOB by simply calling constructors and
 * destructors for every SLAB allocation. Objects are returned with the
 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
 * case the low-level allocator will fragment blocks to create the proper
 * alignment. Again, objects of page-size or greater are allocated by
 * calling alloc_pages(). As SLAB objects know their size, no separate
 * size bookkeeping is necessary and there is essentially no allocation
 * space overhead, and compound pages aren't needed for multi-page
 * allocations.
 *
 * NUMA support in SLOB is fairly simplistic, pushing most of the real
 * logic down to the page allocator, and simply doing the node accounting
 * on the upper levels. In the event that a node id is explicitly
 * provided, __alloc_pages_node() with the specified node id is used
 * instead. The common case (or when the node id isn't explicitly provided)
 * will default to the current node, as per numa_node_id().
 *
 * Node-aware pages are still inserted into the global freelist, and
 * these are scanned for by matching against the node id encoded in the
 * page flags. As a result, block allocations that can be satisfied from
 * the freelist will only be done so on pages residing on the same node,
 * in order to prevent random node placement.
 */

#include <linux/kernel.h>
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/kmemleak.h>

#include <trace/events/kmem.h>

#include <linux/atomic.h>

#include "slab.h"
/*
 * slob_block has a field 'units', which indicates size of block if +ve,
 * or offset of next block if -ve (in SLOB_UNITs).
 *
 * Free blocks of size 1 unit simply contain the offset of the next block.
 * Those with larger size contain their size in the first SLOB_UNIT of
 * memory, and the offset of the next free block in the second SLOB_UNIT.
 */
#if PAGE_SIZE <= (32767 * 2)
typedef s16 slobidx_t;
#else
typedef s32 slobidx_t;
#endif

struct slob_block {
	slobidx_t units;
};
typedef struct slob_block slob_t;
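
/*
 * Illustrative example (not from the original source), assuming a 4K page:
 * slobidx_t is then s16, so a slob_t (and hence one SLOB_UNIT) is 2 bytes.
 * A free block of 3 units whose next free block lives 200 units into the
 * page stores 3 in its first slob_t and 200 in its second; a 1-unit free
 * block in the same position would instead store -200 in its single slob_t.
 */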

/*
 * All partially free slob pages go on these lists.
 */
#define SLOB_BREAK1 256
#define SLOB_BREAK2 1024
static LIST_HEAD(free_slob_small);
static LIST_HEAD(free_slob_medium);
static LIST_HEAD(free_slob_large);

/*
 * slob_page_free: true for pages on free_slob_pages list.
 */
static inline int slob_page_free(struct slab *slab)
{
	return PageSlobFree(slab_page(slab));
}

static void set_slob_page_free(struct slab *slab, struct list_head *list)
{
	list_add(&slab->slab_list, list);
	__SetPageSlobFree(slab_page(slab));
}

static inline void clear_slob_page_free(struct slab *slab)
{
	list_del(&slab->slab_list);
	__ClearPageSlobFree(slab_page(slab));
}

#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
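
/*
 * For example (illustrative, assuming a 2-byte SLOB_UNIT): SLOB_UNITS(100)
 * is 50 and SLOB_UNITS(5) is 3; sizes are always rounded up to whole units.
 */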

/*
 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
 * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
 * the block using call_rcu.
 */
struct slob_rcu {
	struct rcu_head head;
	int size;
};

/*
 * slob_lock protects all slob allocator structures.
 */
static DEFINE_SPINLOCK(slob_lock);

/*
 * Encode the given size and next info into a free slob block s.
 */
static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t offset = next - base;

	if (size > 1) {
		s[0].units = size;
		s[1].units = offset;
	} else
		s[0].units = -offset;
}

/*
 * Return the size of a slob block.
 */
static slobidx_t slob_units(slob_t *s)
{
	if (s->units > 0)
		return s->units;
	return 1;
}

/*
 * Return the next free slob block pointer after this one.
 */
static slob_t *slob_next(slob_t *s)
{
	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
	slobidx_t next;

	if (s[0].units < 0)
		next = -s[0].units;
	else
		next = s[1].units;
	return base+next;
}

/*
 * Returns true if s is the last free block in its page.
 */
static int slob_last(slob_t *s)
{
	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
}

static void *slob_new_pages(gfp_t gfp, int order, int node)
{
	struct page *page;

#ifdef CONFIG_NUMA
	if (node != NUMA_NO_NODE)
		page = __alloc_pages_node(node, gfp, order);
	else
#endif
		page = alloc_pages(gfp, order);

	if (!page)
		return NULL;

	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
			    PAGE_SIZE << order);
	return page_address(page);
}

static void slob_free_pages(void *b, int order)
{
	struct page *sp = virt_to_page(b);

	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += 1 << order;

	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
			    -(PAGE_SIZE << order));
	__free_pages(sp, order);
}

/*
 * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
 * @sp: Page to look in.
 * @size: Size of the allocation.
 * @align: Allocation alignment.
 * @align_offset: Offset in the allocated block that will be aligned.
 * @page_removed_from_list: Return parameter.
 *
 * Tries to find a chunk of memory at least @size bytes big within @page.
 *
 * Return: Pointer to memory if allocated, %NULL otherwise. If the
 *         allocation fills up @page then the page is removed from the
 *         freelist, in this case @page_removed_from_list will be set to
 *         true (set to false otherwise).
 */
static void *slob_page_alloc(struct slab *sp, size_t size, int align,
			     int align_offset, bool *page_removed_from_list)
{
	slob_t *prev, *cur, *aligned = NULL;
	int delta = 0, units = SLOB_UNITS(size);

	*page_removed_from_list = false;
	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
		slobidx_t avail = slob_units(cur);

		/*
		 * 'aligned' will hold the address of the slob block so that the
		 * address 'aligned'+'align_offset' is aligned according to the
		 * 'align' parameter. This is for kmalloc() which prepends the
		 * allocated block with its size, so that the block itself is
		 * aligned when needed.
		 */
		if (align) {
			aligned = (slob_t *)
				(ALIGN((unsigned long)cur + align_offset, align)
				 - align_offset);
			delta = aligned - cur;
		}
		if (avail >= units + delta) { /* room enough? */
			slob_t *next;

			if (delta) { /* need to fragment head to align? */
				next = slob_next(cur);
				set_slob(aligned, avail - delta, next);
				set_slob(cur, delta, aligned);
				prev = cur;
				cur = aligned;
				avail = slob_units(cur);
			}

			next = slob_next(cur);
			if (avail == units) { /* exact fit? unlink. */
				if (prev)
					set_slob(prev, slob_units(prev), next);
				else
					sp->freelist = next;
			} else { /* fragment */
				if (prev)
					set_slob(prev, slob_units(prev), cur + units);
				else
					sp->freelist = cur + units;
				set_slob(cur + units, avail - units, next);
			}

			sp->units -= units;
			if (!sp->units) {
				clear_slob_page_free(sp);
				*page_removed_from_list = true;
			}
			return cur;
		}
		if (slob_last(cur))
			return NULL;
	}
}
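
/*
 * Illustrative walk of the loop above (numbers chosen for the example):
 * a request of 5 units against a 12-unit free block with no alignment
 * delta takes the first 5 units, rewrites the remaining 7 units as a new
 * free block at cur + 5, and links the previous free block (or the page
 * freelist head) to it.  With an exact 12-unit request the block is
 * simply unlinked instead.
 */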

/*
 * slob_alloc: entry point into the slob allocator.
 */
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
			int align_offset)
{
	struct folio *folio;
	struct slab *sp;
	struct list_head *slob_list;
	slob_t *b = NULL;
	unsigned long flags;
	bool _unused;

	if (size < SLOB_BREAK1)
		slob_list = &free_slob_small;
	else if (size < SLOB_BREAK2)
		slob_list = &free_slob_medium;
	else
		slob_list = &free_slob_large;

	spin_lock_irqsave(&slob_lock, flags);
	/* Iterate through each partially free page, try to find room */
	list_for_each_entry(sp, slob_list, slab_list) {
		bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
		/*
		 * If there's a node specification, search for a partial
		 * page with a matching node id in the freelist.
		 */
		if (node != NUMA_NO_NODE && slab_nid(sp) != node)
			continue;
#endif
		/* Enough room on this page? */
		if (sp->units < SLOB_UNITS(size))
			continue;

		b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
		if (!b)
			continue;

		/*
		 * If slob_page_alloc() removed sp from the list then we
		 * cannot call list functions on sp.  If so, the allocation
		 * did not fragment the page anyway, so the optimisation is
		 * unnecessary.
		 */
		if (!page_removed_from_list) {
			/*
			 * Improve fragment distribution and reduce our average
			 * search time by starting our next search here. (see
			 * Knuth vol 1, sec 2.5, pg 449)
			 */
			if (!list_is_first(&sp->slab_list, slob_list))
				list_rotate_to_front(&sp->slab_list, slob_list);
		}
		break;
	}
	spin_unlock_irqrestore(&slob_lock, flags);

	/* Not enough space: must allocate a new page */
	if (!b) {
		b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
		if (!b)
			return NULL;
		folio = virt_to_folio(b);
		__folio_set_slab(folio);
		sp = folio_slab(folio);

		spin_lock_irqsave(&slob_lock, flags);
		sp->units = SLOB_UNITS(PAGE_SIZE);
		sp->freelist = b;
		INIT_LIST_HEAD(&sp->slab_list);
		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
		set_slob_page_free(sp, slob_list);
		b = slob_page_alloc(sp, size, align, align_offset, &_unused);
		BUG_ON(!b);
		spin_unlock_irqrestore(&slob_lock, flags);
	}
	if (unlikely(gfp & __GFP_ZERO))
		memset(b, 0, size);
	return b;
}

/*
 * slob_free: entry point into the slob allocator.
 */
static void slob_free(void *block, int size)
{
	struct slab *sp;
	slob_t *prev, *next, *b = (slob_t *)block;
	slobidx_t units;
	unsigned long flags;
	struct list_head *slob_list;

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	BUG_ON(!size);

	sp = virt_to_slab(block);
	units = SLOB_UNITS(size);

	spin_lock_irqsave(&slob_lock, flags);

	if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
		/* Go directly to page allocator. Do not pass slob allocator */
		if (slob_page_free(sp))
			clear_slob_page_free(sp);
		spin_unlock_irqrestore(&slob_lock, flags);
		__folio_clear_slab(slab_folio(sp));
		slob_free_pages(b, 0);
		return;
	}

	if (!slob_page_free(sp)) {
		/* This slob page is about to become partially free. Easy! */
		sp->units = units;
		sp->freelist = b;
		set_slob(b, units,
			 (void *)((unsigned long)(b +
					SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
		if (size < SLOB_BREAK1)
			slob_list = &free_slob_small;
		else if (size < SLOB_BREAK2)
			slob_list = &free_slob_medium;
		else
			slob_list = &free_slob_large;
		set_slob_page_free(sp, slob_list);
		goto out;
	}

	/*
	 * Otherwise the page is already partially free, so find reinsertion
	 * point.
	 */
	sp->units += units;

	if (b < (slob_t *)sp->freelist) {
		if (b + units == sp->freelist) {
			units += slob_units(sp->freelist);
			sp->freelist = slob_next(sp->freelist);
		}
		set_slob(b, units, sp->freelist);
		sp->freelist = b;
	} else {
		prev = sp->freelist;
		next = slob_next(prev);
		while (b > next) {
			prev = next;
			next = slob_next(prev);
		}

		if (!slob_last(prev) && b + units == next) {
			units += slob_units(next);
			set_slob(b, units, slob_next(next));
		} else
			set_slob(b, units, next);

		if (prev + slob_units(prev) == b) {
			units = slob_units(b) + slob_units(prev);
			set_slob(prev, units, slob_next(b));
		} else
			set_slob(prev, slob_units(prev), b);
	}
out:
	spin_unlock_irqrestore(&slob_lock, flags);
}
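
/*
 * Coalescing sketch for slob_free() above (illustrative): freeing a block
 * that ends exactly where the next free block begins merges the two into
 * one larger free block; if the previous free block also ends exactly at
 * the freed address, all three become a single block.  Otherwise the freed
 * block is simply linked between prev and next in address order.
 */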

#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	kpp->kp_ptr = object;
	kpp->kp_slab = slab;
}
#endif

/*
 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
 */

static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
	unsigned int *m;
	unsigned int minalign;
	void *ret;

	minalign = max_t(unsigned int, ARCH_KMALLOC_MINALIGN,
			 arch_slab_minalign());
	gfp &= gfp_allowed_mask;

	might_alloc(gfp);

	if (size < PAGE_SIZE - minalign) {
		int align = minalign;

		/*
		 * For power of two sizes, guarantee natural alignment for
		 * kmalloc()'d objects.
		 */
		if (is_power_of_2(size))
			align = max_t(unsigned int, minalign, size);

		if (!size)
			return ZERO_SIZE_PTR;

		m = slob_alloc(size + minalign, gfp, align, node, minalign);

		if (!m)
			return NULL;
		*m = size;
		ret = (void *)m + minalign;

		trace_kmalloc_node(caller, ret,
				   size, size + minalign, gfp, node);
	} else {
		unsigned int order = get_order(size);

		if (likely(order))
			gfp |= __GFP_COMP;
		ret = slob_new_pages(gfp, order, node);

		trace_kmalloc_node(caller, ret,
				   size, PAGE_SIZE << order, gfp, node);
	}

	kmemleak_alloc(ret, size, 1, gfp);
	return ret;
}
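
/*
 * Worked example for the small-size path above, assuming a minalign of 8:
 * kmalloc(100) requests 108 bytes from slob_alloc() so that the returned
 * object (not the header) ends up 8-byte aligned, stores 100 in the leading
 * unsigned int, and hands back the address 8 bytes in.  kfree() and
 * __ksize() later step back by the same minalign to read that size.
 */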

void *__kmalloc(size_t size, gfp_t gfp)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);

void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);

#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
				  int node, unsigned long caller)
{
	return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif

void kfree(const void *block)
{
	struct folio *sp;

	trace_kfree(_RET_IP_, block);

	if (unlikely(ZERO_OR_NULL_PTR(block)))
		return;
	kmemleak_free(block);

	sp = virt_to_folio(block);
	if (folio_test_slab(sp)) {
		unsigned int align = max_t(unsigned int,
					   ARCH_KMALLOC_MINALIGN,
					   arch_slab_minalign());
		unsigned int *m = (unsigned int *)(block - align);

		slob_free(m, *m + align);
	} else {
		unsigned int order = folio_order(sp);

		mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
				    -(PAGE_SIZE << order));
		__free_pages(folio_page(sp, 0), order);

	}
}
EXPORT_SYMBOL(kfree);

/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
	struct folio *folio;
	unsigned int align;
	unsigned int *m;

	BUG_ON(!block);
	if (unlikely(block == ZERO_SIZE_PTR))
		return 0;

	folio = virt_to_folio(block);
	if (unlikely(!folio_test_slab(folio)))
		return folio_size(folio);

	align = max_t(unsigned int, ARCH_KMALLOC_MINALIGN,
		      arch_slab_minalign());
	m = (unsigned int *)(block - align);
	return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(__ksize);

int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
	if (flags & SLAB_TYPESAFE_BY_RCU) {
		/* leave room for rcu footer at the end of object */
		c->size += sizeof(struct slob_rcu);
	}
	c->flags = flags;
	return 0;
}

static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
	void *b;

	flags &= gfp_allowed_mask;

	might_alloc(flags);

	if (c->size < PAGE_SIZE) {
		b = slob_alloc(c->size, flags, c->align, node, 0);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    SLOB_UNITS(c->size) * SLOB_UNIT,
					    flags, node);
	} else {
		b = slob_new_pages(flags, get_order(c->size), node);
		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
					    PAGE_SIZE << get_order(c->size),
					    flags, node);
	}

	if (b && c->ctor) {
		WARN_ON_ONCE(flags & __GFP_ZERO);
		c->ctor(b);
	}

	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
	return b;
}

void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc);


void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags)
{
	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);

void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
{
	return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif

static void __kmem_cache_free(void *b, int size)
{
	if (size < PAGE_SIZE)
		slob_free(b, size);
	else
		slob_free_pages(b, get_order(size));
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
	void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));

	__kmem_cache_free(b, slob_rcu->size);
}
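
/*
 * Layout sketch for the SLAB_TYPESAFE_BY_RCU path (illustrative sizes):
 * for a cache whose objects were declared 56 bytes, __kmem_cache_create()
 * grows c->size to 56 + sizeof(struct slob_rcu).  kmem_cache_free() below
 * then writes the slob_rcu footer at offset 56 inside the object, and
 * kmem_rcu_free() above recovers the object base by stepping back
 * size - sizeof(struct slob_rcu) bytes before freeing.
 */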

void kmem_cache_free(struct kmem_cache *c, void *b)
{
	kmemleak_free_recursive(b, c->flags);
	trace_kmem_cache_free(_RET_IP_, b, c->name);
	if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
		struct slob_rcu *slob_rcu;
		slob_rcu = b + (c->size - sizeof(struct slob_rcu));
		slob_rcu->size = c->size;
		call_rcu(&slob_rcu->head, kmem_rcu_free);
	} else {
		__kmem_cache_free(b, c->size);
	}
}
EXPORT_SYMBOL(kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
	__kmem_cache_free_bulk(s, size, p);
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
			  void **p)
{
	return __kmem_cache_alloc_bulk(s, flags, size, p);
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);

int __kmem_cache_shutdown(struct kmem_cache *c)
{
	/* No way to check for remaining objects */
	return 0;
}

void __kmem_cache_release(struct kmem_cache *c)
{
}

int __kmem_cache_shrink(struct kmem_cache *d)
{
	return 0;
}

static struct kmem_cache kmem_cache_boot = {
	.name = "kmem_cache",
	.size = sizeof(struct kmem_cache),
	.flags = SLAB_PANIC,
	.align = ARCH_KMALLOC_MINALIGN,
};

void __init kmem_cache_init(void)
{
	kmem_cache = &kmem_cache_boot;
	slab_state = UP;
}

void __init kmem_cache_init_late(void)
{
	slab_state = FULL;
}