之前多多少少接触过cache之类的东西,总觉得很神秘,当然cache就是为了读写内存更高效。比如查看meminfo或者slabinfo的时候,你是否真的对内存机制理解得很清晰?
参考内核linux 3.8.13
我们看看调用它的函数接口:
- /*
- * Set up kernel memory allocators
- */
- static void __init mm_init(void)
- {
- /*
- * page_cgroup requires contiguous pages,
- * bigger than MAX_ORDER unless SPARSEMEM.
- */
- page_cgroup_init_flatmem();
- mem_init();
- kmem_cache_init();
- percpu_init_late();
- pgtable_cache_init();
- vmalloc_init();
- }
这个函数在start_kernel里调用. 下面我们就看看
kmem_cache_init ( ) ; //默认slab分配器
- /*
- * Initialisation. Called after the page allocator have been initialised and
- * before smp_init().
- */
- void __init kmem_cache_init(void)
- {
- struct cache_sizes *sizes;
- struct cache_names *names;
- int i;
-
- kmem_cache = &kmem_cache_boot;
- setup_nodelists_pointer(kmem_cache); // 关于为什么要设置这个玩意,我找到一个patch说明
-
- From 3c58346525d82625e68e24f071804c2dc057b6f4 Mon Sep 17 00:00:00 2001
- From: Christoph Lameter <cl@linux.com>
- Date: Wed, 28 Nov 2012 16:23:01 +0000
- Subject: [PATCH] slab: Simplify bootstrap
-
- The nodelists field in kmem_cache is pointing to the first unused
- object in the array field when bootstrap is complete.
-
- A problem with the current approach is that the statically sized
- kmem_cache structure use on boot can only contain NR_CPUS entries.
- If the number of nodes plus the number of cpus is greater then we
- would overwrite memory following the kmem_cache_boot definition.
-
- Increase the size of the array field to ensure that also the node
- pointers fit into the array field.
-
- Once we do that we no longer need the kmem_cache_nodelists
- array and we can then also use that structure elsewhere.
-
- Acked-by: Glauber Costa <glommer@parallels.com>
- Signed-off-by: Christoph Lameter <cl@linux.com>
- Signed-off-by: Pekka Enberg <penberg@kernel.org>
-
- if (num_possible_nodes() == 1)
- use_alien_caches = 0;
-
- for (i = 0; i < NUM_INIT_LISTS; i++)
- kmem_list3_init(&initkmem_list3[i]);
-
- set_up_list3s(kmem_cache, CACHE_CACHE);
-
- /*
- * Fragmentation resistance on low memory - only use bigger
- * page orders on machines with more than 32MB of memory if
- * not overridden on the command line.
- */
- if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
- slab_max_order = SLAB_MAX_ORDER_HI;
-
- /* Bootstrap is tricky, because several objects are allocated
- * from caches that do not exist yet:
- * 1) initialize the kmem_cache cache: it contains the struct
- * kmem_cache structures of all caches, except kmem_cache itself:
- * kmem_cache is statically allocated.
- * Initially an __init data area is used for the head array and the
- * kmem_list3 structures, it's replaced with a kmalloc allocated
- * array at the end of the bootstrap.
- * 2) Create the first kmalloc cache.
- * The struct kmem_cache for the new cache is allocated normally.
- * An __init data area is used for the head array.
- * 3) Create the remaining kmalloc caches, with minimally sized
- * head arrays.
- * 4) Replace the __init data head arrays for kmem_cache and the first
- * kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_list3 for kmem_cache and
- * the other cache's with kmalloc allocated memory.
- * 6) Resize the head arrays of the kmalloc caches to their final sizes.
- */
-
- /* 1) create the kmem_cache */
-
- /*
- * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
- */
- create_boot_cache(kmem_cache, "kmem_cache",
- offsetof(struct kmem_cache, array[nr_cpu_ids]) +
- nr_node_ids * sizeof(struct kmem_list3 *),
- SLAB_HWCACHE_ALIGN);
- list_add(&kmem_cache->list, &slab_caches); // create kmem_cache后把它添加到slab_caches全局链表.
-
- /* 2+3) create the kmalloc caches */
- sizes = malloc_sizes;
- names = cache_names;
-
- /*
- * Initialize the caches that provide memory for the array cache and the
- * kmem_list3 structures first. Without this, further allocations will
- * bug.
- */
-
- sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
-
- if (INDEX_AC != INDEX_L3)
- sizes[INDEX_L3].cs_cachep =
- create_kmalloc_cache(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
-
- slab_early_init = 0;
-
- while (sizes->cs_size != ULONG_MAX) {
- /*
- * For performance, all the general caches are L1 aligned.
- * This should be particularly beneficial on SMP boxes, as it
- * eliminates "false sharing".
- * Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches.
- */
- if (!sizes->cs_cachep)
- sizes->cs_cachep = create_kmalloc_cache(names->name,
- sizes->cs_size, ARCH_KMALLOC_FLAGS);
-
- #ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = create_kmalloc_cache(
- names->name_dma, sizes->cs_size,
- SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
- #endif
- sizes++;
- names++;
- }
- /* 4) Replace the bootstrap head arrays */
- {
- struct array_cache *ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- memcpy(ptr, cpu_cache_get(kmem_cache),
- sizeof(struct arraycache_init));
- /*
- * Do not assume that spinlocks can be initialized via memcpy:
- */
- spin_lock_init(&ptr->lock);
-
- kmem_cache->array[smp_processor_id()] = ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
- != &initarray_generic.cache);
- memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
- sizeof(struct arraycache_init));
- /*
- * Do not assume that spinlocks can be initialized via memcpy:
- */
- spin_lock_init(&ptr->lock);
-
- malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
- ptr;
- }
- /* 5) Replace the bootstrap kmem_list3's */
- {
- int nid;
-
- for_each_online_node(nid) {
- init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
-
- init_list(malloc_sizes[INDEX_AC].cs_cachep,
- &initkmem_list3[SIZE_AC + nid], nid);
-
- if (INDEX_AC != INDEX_L3) {
- init_list(malloc_sizes[INDEX_L3].cs_cachep,
- &initkmem_list3[SIZE_L3 + nid], nid);
- }
- }
- }
-
- slab_state = UP;
- }
第一行来自一个全局的指针变量,即为创建第一个cache( kmem_cache)
在mm/slab_common.c中
struct kmem_cache *kmem_cache;
创建的所有cache都会挂在LIST_HEAD(slab_caches); 这个全局链表上.可以通过cat /proc/slabinfo查看。
这里可以看看 struct kmem_cache:在slab_def.h中
- struct kmem_cache {
- /* 1) Cache tunables. Protected by cache_chain_mutex */
- unsigned int batchcount;
- unsigned int limit;
- unsigned int shared;
-
- unsigned int size;
- u32 reciprocal_buffer_size;
- /* 2) touched by every alloc & free from the backend */
-
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
-
- /* 3) cache_grow/shrink */
- /* order of pgs per slab (2^n) */
- unsigned int gfporder;
-
- /* force GFP flags, e.g. GFP_DMA */
- gfp_t allocflags;
-
- size_t colour; /* cache colouring range */
- unsigned int colour_off; /* colour offset */
- struct kmem_cache *slabp_cache;
- unsigned int slab_size;
-
- /* constructor func */
- void (*ctor)(void *obj);
-
- /* 4) cache creation/removal */
- const char *name;
- struct list_head list;
- int refcount;
- int object_size;
- int align;
-
- /* 5) statistics */
- #ifdef CONFIG_DEBUG_SLAB
- unsigned long num_active;
- unsigned long num_allocations;
- unsigned long high_mark;
- unsigned long grown;
- unsigned long reaped;
- unsigned long errors;
- unsigned long max_freeable;
- unsigned long node_allocs;
- unsigned long node_frees;
- unsigned long node_overflow;
- atomic_t allochit;
- atomic_t allocmiss;
- atomic_t freehit;
- atomic_t freemiss;
-
- /*
- * If debugging is enabled, then the allocator can add additional
- * fields and/or padding to every object. size contains the total
- * object size including these internal fields, the following two
- * variables contain the offset to the user object and its size.
- */
- int obj_offset;
- #endif /* CONFIG_DEBUG_SLAB */
- #ifdef CONFIG_MEMCG_KMEM
- struct memcg_cache_params *memcg_params;
- #endif
-
- /* 6) per-cpu/per-node data, touched during every alloc/free */
- /*
- * We put array[] at the end of kmem_cache, because we want to size
- * this array to nr_cpu_ids slots instead of NR_CPUS
- * (see kmem_cache_init())
- * We still use [NR_CPUS] and not [1] or [0] because cache_cache
- * is statically defined, so we reserve the max number of cpus.
- *
- * We also need to guarantee that the list is able to accomodate a
- * pointer for each node since "nodelists" uses the remainder of
- * available pointers.
- */
- struct kmem_list3 **nodelists;
- struct array_cache *array[NR_CPUS + MAX_NUMNODES];
- /*
- * Do not add fields after array[]
- */
- }
这个结构体里面几个关键的元素之前在kmalloc里已经说到了。
而
kmem_cache_boot则是: - /* internal cache of cache description objs */
- static struct kmem_cache kmem_cache_boot = {
- .batchcount = 1,
- .limit = BOOT_CPUCACHE_ENTRIES, // 默认为 1
- .shared = 1,
- .size = sizeof(struct kmem_cache),
- .name = "kmem_cache",
- };
注释解释得已经很清晰了. 而setup_nodelists_pointer的作用就是让nodelists指向struct kmem_cache里array数组中per-cpu项之后第一个未使用的槽位(即&cachep->array[nr_cpu_ids]),这样node指针就可以复用array数组尾部的空间,也便于操作指针. 对于一致性内存访问(UMA),node只有一个. - static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
它是slab.c中静态全局变量 - /*
- * Need this for bootstrapping a per node allocator.
- */
kmem_list3_init初始化slab的三个链表slabs_full、slabs_partial、slabs_free.为什么初始化这个和cache组成结构有关系,可以看个图: 这里CACHE_CACHE在文件的开头部分被定义为0. - /*
- * For setting up all the kmem_list3s for cache whose buffer_size is same as
- * size of kmem_list3.
- */
- static void __init set_up_list3s(struct kmem_cache *cachep, int index)
- {
- int node;
-
- for_each_online_node(node) {
- cachep->nodelists[node] = &initkmem_list3[index + node];
- cachep->nodelists[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
- }
- }
接着就要开始了真正的创建cache的工作,并且给出了初始化步骤和说明: - /* Bootstrap is tricky, because several objects are allocated
- * from caches that do not exist yet:
- * 1) initialize the kmem_cache cache: it contains the struct
- * kmem_cache structures of all caches, except kmem_cache itself:
- * kmem_cache is statically allocated.
- * Initially an __init data area is used for the head array and the
- * kmem_list3 structures, it's replaced with a kmalloc allocated
- * array at the end of the bootstrap.
- * 2) Create the first kmalloc cache.
- * The struct kmem_cache for the new cache is allocated normally.
- * An __init data area is used for the head array.
- * 3) Create the remaining kmalloc caches, with minimally sized
- * head arrays.
- * 4) Replace the __init data head arrays for kmem_cache and the first
- * kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_list3 for kmem_cache and
- * the other cache's with kmalloc allocated memory.
- * 6) Resize the head arrays of the kmalloc caches to their final sizes.
- */
-
- /* 1) create the kmem_cache */
-
- /*
- * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
- */
- create_boot_cache(kmem_cache, "kmem_cache",
- offsetof(struct kmem_cache, array[nr_cpu_ids]) +
- nr_node_ids * sizeof(struct kmem_list3 *),
- SLAB_HWCACHE_ALIGN);
- list_add(&kmem_cache->list, &slab_caches);
首先创建第一个cache它名为kmem_cache,并且kmem_cache指针变量指向了kmem_cache_boot. 下面我们看看 create_boot_cache函数 - #ifndef CONFIG_SLOB
- /* Create a cache during boot when no slab services are available yet */
- void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
- unsigned long flags)
- {
- int err;
-
- s->name = name;
- s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
- err = __kmem_cache_create(s, flags);
-
- if (err)
- panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
- name, size, err);
-
- s->refcount = -1; /* Exempt from merging for now */
- }
-
- struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
- unsigned long flags)
- {
- struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
-
- if (!s)
- panic("Out of memory when creating slab %s\n", name);
-
- create_boot_cache(s, name, size, flags);
- list_add(&s->list, &slab_caches);
- s->refcount = 1;
- return s;
- }
-
- #endif /* !CONFIG_SLOB */
而它接着调用了__kmem_cache_create:这是最关键的函数 - /**
- * __kmem_cache_create - Create a cache.
- * @cachep: cache management descriptor
- * @flags: SLAB flags
- *
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within a int, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
- *
- * The flags are
- *
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline. This can be beneficial if you're counting cycles as closely
- * as davem.
- */
- int
- __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
- {
- size_t left_over, slab_size, ralign;
- gfp_t gfp;
- int err;
- size_t size = cachep->size;
-
- #if DEBUG
- #if FORCED_DEBUG
- /*
- * Enable redzoning and last user accounting, except for caches with
- * large objects, if the increased size would increase the object size
- * above the next power of two: caches with object sizes just above a
- * power of two have a significant amount of internal fragmentation.
- */
- if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
- 2 * sizeof(unsigned long long)))
- flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
- if (!(flags & SLAB_DESTROY_BY_RCU))
- flags |= SLAB_POISON;
- #endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
- #endif
-
- /*
- * Check that size is in terms of words. This is needed to avoid
- * unaligned accesses for some archs when redzoning is used, and makes
- * sure any on-slab bufctl's are also correctly aligned.
- */
- if (size & (BYTES_PER_WORD - 1)) {
- size += (BYTES_PER_WORD - 1);
- size &= ~(BYTES_PER_WORD - 1);
- } // 按BYTES_PER_WORD(机器字长)向上对齐,32位系统上即4字节对齐
-
- /*
- * Redzoning and user store require word alignment or possibly larger.
- * Note this will be overridden by architecture or caller mandated
- * alignment if either is greater than BYTES_PER_WORD.
- */
- if (flags & SLAB_STORE_USER)
- ralign = BYTES_PER_WORD;
-
- if (flags & SLAB_RED_ZONE) {
- ralign = REDZONE_ALIGN;
- /* If redzoning, ensure that the second redzone is suitably
- * aligned, by adjusting the object size accordingly. */
- size += REDZONE_ALIGN - 1;
- size &= ~(REDZONE_ALIGN - 1);
- }
-
- /* 3) caller mandated alignment */
- if (ralign < cachep->align) {
- ralign = cachep->align;
- }
- /* disable debug if necessary */
- if (ralign > __alignof__(unsigned long long))
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
- /*
- * 4) Store it.
- */
- cachep->align = ralign;
-
- if (slab_is_available()) // 为什么要插入这一段注释,因为它就是判断slab_state的值,默认它的值没人初始化即为DOWN.
- /*
- * State of the slab allocator.
- *
- * This is used to describe the states of the allocator during bootup.
- * Allocators use this to gradually bootstrap themselves. Most allocators
- * have the problem that the structures used for managing slab caches are
- * allocated from slab caches themselves.
- */
- enum slab_state {
- DOWN, /* No slab functionality yet */
- PARTIAL, /* SLUB: kmem_cache_node available */
- PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
- PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
- UP, /* Slab caches usable but not all extras yet */
- FULL /* Everything is working */
- };
- gfp = GFP_KERNEL;
- else
- gfp = GFP_NOWAIT;
- //点击(此处)折叠或打开
- #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
-
- setup_nodelists_pointer(cachep);
- #if DEBUG
-
- /*
- * Both debugging options require word-alignment which is calculated
- * into align above.
- */
- if (flags & SLAB_RED_ZONE) {
- /* add space for red zone words */
- cachep->obj_offset += sizeof(unsigned long long);
- size += 2 * sizeof(unsigned long long);
- }
- if (flags & SLAB_STORE_USER) {
- /* user store requires one word storage behind the end of
- * the real object. But if the second red zone needs to be
- * aligned to 64 bits, we must allow that much space.
- */
- if (flags & SLAB_RED_ZONE)
- size += REDZONE_ALIGN;
- else
- size += BYTES_PER_WORD;
- }
- #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->object_size > cache_line_size()
- && ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
- #endif
- #endif
-
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do // 判断slab管理信息是否在slab分配的内存页上,判断条件见下面:
- * it too early on. Always use on-slab management when // size >= (默认page =4k/8k) 512/1024 ; slab_early_init在创建kmem_cache的时候为1;当创建通用cache
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) //的时才会把它初始化为0 . 而第一传递的flags为 SLAB_HWCACHE_ALIGN
- */
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
-
- size = ALIGN(size, cachep->align);
-
- left_over = calculate_slab_order(cachep, size, cachep->align, flags); // 根据obj size 计算申请page的个数即一个slab包含多少个pages,
- if (!cachep->num) // 也包含了多少个obj,除去管理信息等 剩余的空间。很简单易懂.
- return -E2BIG;
-
- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), cachep->align);
-
- /*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
- */
- if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= slab_size;
- }
-
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- slab_size =
- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
-
- #ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
- #endif
- }
-
- cachep->colour_off = cache_line_size(); //32B
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off; // slab 着色的初始化工作.
- cachep->slab_size = slab_size;
- cachep->flags = flags;
- cachep->allocflags = 0;
- if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
- cachep->allocflags |= GFP_DMA;
- cachep->size = size;
- cachep->reciprocal_buffer_size = reciprocal_value(size);
-
- if (flags & CFLGS_OFF_SLAB) {
- cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
- /*
- * This is a possibility for one of the malloc_sizes caches.
- * But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
- * this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
- }
-
- err = setup_cpu_cache(cachep, gfp);
- if (err) {
- __kmem_cache_shutdown(cachep);
- return err;
- }
-
- if (flags & SLAB_DEBUG_OBJECTS) {
- /*
- * Would deadlock through slab_destroy()->call_rcu()->
- * debug_object_activate()->kmem_cache_alloc().
- */
- WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
-
- slab_set_debugobj_lock_classes(cachep);
- } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
- on_slab_lock_classes(cachep);
-
- return 0;
- }
它里面有一个很有趣也很关键的函数:它揭示了slab具体管理obj的布局和方法.
- /**
- * calculate_slab_order - calculate size (page order) of slabs
- * @cachep: pointer to the cache that is being created
- * @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
- * @flags: slab allocation flags
- *
- * Also calculates the number of objects per slab.
- *
- * This could be made much more intelligent. For now, try to avoid using
- * high order pages for slabs. When the gfp() functions are more friendly
- * towards high-order requests, this should be changed.
- */
- static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
- {
- unsigned long offslab_limit;
- size_t left_over = 0;
- int gfporder;
-
- for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
- unsigned int num;
- size_t remainder;
-
- cache_estimate(gfporder, size, align, flags, &remainder, &num); // 根据是off-slab 还是on-slab除去管理信息后多少个页面才能存下一个obj.以及其他信息,值得仔细看看.
- if (!num) // 必须保证slab至少能装下一个obj
- continue;
-
- if (flags & CFLGS_OFF_SLAB) {
- /*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
- */
- offslab_limit = size - sizeof(struct slab);
- offslab_limit /= sizeof(kmem_bufctl_t);
-
- if (num > offslab_limit)
- break;
- }
-
- /* Found something acceptable - save it away */
- cachep->num = num;
- cachep->gfporder = gfporder;
- left_over = remainder;
-
- /*
- * A VFS-reclaimable slab tends to have most allocations
- * as GFP_NOFS and we really don't want to have to be allocating
- * higher-order pages when we are unable to shrink dcache.
- */
- if (flags & SLAB_RECLAIM_ACCOUNT)
- break;
-
- /*
- * Large number of objects is good, but very large slabs are
- * currently bad for the gfp()s.
- */
- if (gfporder >= slab_max_order)
- break;
-
- /*
- * Acceptable internal fragmentation?
- */
- if (left_over * 8 <= (PAGE_SIZE << gfporder))
- break;
- }
- return left_over;
- }
经过上面的初始化和设置,最后调用setup_cpu_cache就完成了一个创建cache的工作.接着进行第2、3步的工作:
- /* 2+3) create the kmalloc caches */
- sizes = malloc_sizes;
- names = cache_names;
-
- /*
- * Initialize the caches that provide memory for the array cache and the
- * kmem_list3 structures first. Without this, further allocations will
- * bug.
- */
-
- sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, // create obj size 为sizeof(struct arraycache_init) 的cache
- sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
-
- if (INDEX_AC != INDEX_L3)
- sizes[INDEX_L3].cs_cachep =
- create_kmalloc_cache(names[INDEX_L3].name, //// create obj size 为sizeof(struct kmem_list3) 的cache
- sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
-
- slab_early_init = 0;
-
- while (sizes->cs_size != ULONG_MAX) { //创建通用cache 根据 malloc_sizes ,cache_names
- /*
- * For performance, all the general caches are L1 aligned.
- * This should be particularly beneficial on SMP boxes, as it
- * eliminates "false sharing".
- * Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches.
- */
- if (!sizes->cs_cachep)
- sizes->cs_cachep = create_kmalloc_cache(names->name,
- sizes->cs_size, ARCH_KMALLOC_FLAGS);
-
- #ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = create_kmalloc_cache(
- names->name_dma, sizes->cs_size,
- SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
- #endif
- sizes++;
- names++;
- }
这里再说一下
cache_names和 malloc_sizes: - /*
- * These are the default caches for kmalloc. Custom caches can have other sizes.
- */
- struct cache_sizes malloc_sizes[] = {
- #define CACHE(x) { .cs_size = (x) },
- #include <linux/kmalloc_sizes.h>
- CACHE(ULONG_MAX)
- #undef CACHE
- };
这里就不扩展开了.
- /* Must match cache_sizes above. Out of line to keep cache footprint low. */
- struct cache_names {
- char *name;
- char *name_dma;
- };
-
- static struct cache_names __initdata cache_names[] = {
- #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
- #include <linux/kmalloc_sizes.h>
- { NULL,}
- #undef CACHE
- };
create_kmalloc_cache实际上是调用create_boot_cache. 把kernel预定义的通用cache创建一遍.之后我们进入第四步、第5步: - /* 4) Replace the bootstrap head arrays */
- {
- struct array_cache *ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- memcpy(ptr, cpu_cache_get(kmem_cache),
- sizeof(struct arraycache_init));
- /*
- * Do not assume that spinlocks can be initialized via memcpy:
- */
- spin_lock_init(&ptr->lock);
-
- kmem_cache->array[smp_processor_id()] = ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
- != &initarray_generic.cache);
- memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
- sizeof(struct arraycache_init));
- /*
- * Do not assume that spinlocks can be initialized via memcpy:
- */
- spin_lock_init(&ptr->lock);
-
- malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
- ptr;
- }
- /* 5) Replace the bootstrap kmem_list3's */
- {
- int nid;
-
- for_each_online_node(nid) {
- init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
-
- init_list(malloc_sizes[INDEX_AC].cs_cachep,
- &initkmem_list3[SIZE_AC + nid], nid);
-
- if (INDEX_AC != INDEX_L3) {
- init_list(malloc_sizes[INDEX_L3].cs_cachep,
- &initkmem_list3[SIZE_L3 + nid], nid);
- }
- }
- }
-
- slab_state = UP;
最后把slab_state状态设置为up 即已经可以正常使用了。虽然上面大部分是代码,具体申请内存的流程前面kmalloc已经讲过了。仅仅是为了弄明白cache到底是个什么玩意,以及如何初始化的。
在kmem_cache_init后,还有一个kmem_cache_init_late函数.
它主要是调用了enable_cpucache和注册一个cpu通知链
- /*
- * Register a cpu startup notifier callback that initializes
- * cpu_cache_get for all new cpus
- */
- register_cpu_notifier(&cpucache_notifier);
还记不记得之前我们分析batchcount的时候的矛盾点?
- /* Called with slab_mutex held always */
- static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
- {
- int err;
- int limit = 0;
- int shared = 0;
- int batchcount = 0;
-
- if (!is_root_cache(cachep)) {
- struct kmem_cache *root = memcg_root_cache(cachep);
- limit = root->limit;
- shared = root->shared;
- batchcount = root->batchcount;
- }
-
- if (limit && shared && batchcount)
- goto skip_setup;
- /*
- * The head array serves three purposes:
- * - create a LIFO ordering, i.e. return objects that are cache-warm
- * - reduce the number of spinlock operations.
- * - reduce the number of linked list operations on the slab and
- * bufctl chains: array operations are cheaper.
- * The numbers are guessed, we should auto-tune as described by
- * Bonwick.
- */
- if (cachep->size > 131072)
- limit = 1;
- else if (cachep->size > PAGE_SIZE)
- limit = 8;
- else if (cachep->size > 1024)
- limit = 24;
- else if (cachep->size > 256)
- limit = 54;
- else
- limit = 120;
-
- /*
- * CPU bound tasks (e.g. network routing) can exhibit cpu bound
- * allocation behaviour: Most allocs on one cpu, most free operations
- * on another cpu. For these cases, an efficient object passing between
- * cpus is necessary. This is provided by a shared array. The array
- * replaces Bonwick's magazine layer.
- * On uniprocessor, it's functionally equivalent (but less efficient)
- * to a larger limit. Thus disabled by default.
- */
- shared = 0;
- if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
- shared = 8;
-
- #if DEBUG
- /*
- * With debugging enabled, large batchcount lead to excessively long
- * periods with disabled local interrupts. Limit the batchcount
- */
- if (limit > 32)
- limit = 32;
- #endif
- batchcount = (limit + 1) / 2;
- skip_setup:
- err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
- if (err)
- printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
- cachep->name, -err);
- return err;
- }
它会根据obj size 计算limit值 ,再去计算batchcount的值.
这个只是一个小小的开始吧,内存管理本来就博大精深,只有遇到具体问题具体分析,来加深理解了.