/* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo * * This file is released under the GPLv2. * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks. Each chunk is * consisted of boot-time determined number of units and the first * chunk is used for static percpu variables in the kernel image * (special boot time alloc/init handling necessary as these areas * need to be brought up before allocation services are running). * Unit grows as necessary and all units grow or shrink in unison. * When a chunk is filled up, another chunk is allocated. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to * cpus. On NUMA, the mapping can be non-linear and even sparse. * Percpu access can be done by configuring percpu base registers * according to cpu to unit mapping and pcpu_unit_size. * * There are usually many small percpu allocations many of them being * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be equal to or larger than the maximum contiguous * area in the chunk. This helps the allocator not to iterate the * chunk maps unnecessarily. * * Allocation state in each chunk is kept using an array of integers * on chunk->map. A positive value in the map represents a free * region and negative allocated. Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. * Chunks can be determined from the address using the index field * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PCPU_SLOT_BASE_SHIFT 5 #define PCPU_DFL_MAP_ALLOC 16 #ifdef CONFIG_SMP #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif struct pcpu_chunk { struct list_head list; int free_size; int contig_hint; void *base_addr; int map_used; int map_alloc; int *map; void *data; bool immutable; unsigned long populated[]; }; static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; static int pcpu_nr_units __read_mostly; static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; static unsigned int pcpu_low_unit_cpu __read_mostly; static unsigned int pcpu_high_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); static const int *pcpu_unit_map __read_mostly; const unsigned long *pcpu_unit_offsets __read_mostly; static int pcpu_nr_groups __read_mostly; static const unsigned long *pcpu_group_offsets __read_mostly; static const size_t *pcpu_group_sizes __read_mostly; static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_MUTEX(pcpu_alloc_mutex); static DEFINE_SPINLOCK(pcpu_lock); static struct list_head *pcpu_slot __read_mostly; static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); static bool pcpu_addr_in_first_chunk(void *addr) { void *first_start = pcpu_first_chunk->base_addr; return addr >= first_start && addr < first_start + pcpu_unit_size; } static bool pcpu_addr_in_reserved_chunk(void *addr) { void *first_start = pcpu_first_chunk->base_addr; return addr >= first_start && addr < first_start + pcpu_reserved_chunk_limit; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_nr_slots - 1; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) return 0; return pcpu_size_to_slot(chunk->free_size); } static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) { *rs = find_next_zero_bit(chunk->populated, end, *rs); *re = find_next_bit(chunk->populated, end, *rs + 1); } static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) { *rs = find_next_bit(chunk->populated, end, *rs); *re = find_next_zero_bit(chunk->populated, end, *rs + 1); } #define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ (rs) < (re); \ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) #define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ (rs) < (re); \ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) static void *pcpu_mem_zalloc(size_t size) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static void pcpu_mem_free(void *ptr, size_t size) { if (size <= PAGE_SIZE) kfree(ptr); else vfree(ptr); } static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else list_move_tail(&chunk->list, &pcpu_slot[nslot]); } } static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; if (chunk->map_alloc >= chunk->map_used + 2) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) new_alloc *= 2; return new_alloc; } static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) { int *old = NULL, *new = NULL; size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; spin_lock_irqsave(&pcpu_lock, flags); if (new_alloc <= chunk->map_alloc) goto out_unlock; old_size = chunk->map_alloc * sizeof(chunk->map[0]); old = chunk->map; memcpy(new, old, old_size); chunk->map_alloc = new_alloc; chunk->map = new; new = NULL; out_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); pcpu_mem_free(old, old_size); pcpu_mem_free(new, new_size); return 0; } static void pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) { int nr_extra = !!head + !!tail; BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); memmove(&chunk->map[i + nr_extra], &chunk->map[i], sizeof(chunk->map[0]) * (chunk->map_used - i)); chunk->map_used += nr_extra; if (head) { chunk->map[i + 1] = chunk->map[i] - head; chunk->map[i++] = head; } if (tail) { chunk->map[i++] -= tail; chunk->map[i] = tail; } } static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { bool is_last = i + 1 == chunk->map_used; int head, tail; head = ALIGN(off, align) - off; BUG_ON(i == 0 && head != 0); if (chunk->map[i] < 0) continue; if (chunk->map[i] < head + size) { max_contig = max(chunk->map[i], max_contig); continue; } if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { if (chunk->map[i - 1] > 0) chunk->map[i - 1] += head; else { chunk->map[i - 1] -= head; chunk->free_size -= head; } chunk->map[i] -= head; off += head; head = 0; } tail = chunk->map[i] - head - size; if (tail < sizeof(int)) tail = 0; if (head || tail) { pcpu_split_block(chunk, i, head, tail); if (head) { i++; off += head; max_contig = max(chunk->map[i - 1], max_contig); } if (tail) max_contig = max(chunk->map[i + 1], max_contig); } if (is_last) chunk->contig_hint = max_contig; else chunk->contig_hint = max(chunk->contig_hint, max_contig); chunk->free_size -= chunk->map[i]; chunk->map[i] = -chunk->map[i]; pcpu_chunk_relocate(chunk, oslot); return off; } chunk->contig_hint = max_contig; pcpu_chunk_relocate(chunk, oslot); return -1; } static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); int i, off; for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) if (off == freeme) break; BUG_ON(off != freeme); BUG_ON(chunk->map[i] > 0); chunk->map[i] = -chunk->map[i]; chunk->free_size += chunk->map[i]; if (i > 0 && chunk->map[i - 1] >= 0) { chunk->map[i - 1] += chunk->map[i]; chunk->map_used--; memmove(&chunk->map[i], &chunk->map[i + 1], (chunk->map_used - i) * sizeof(chunk->map[0])); i--; } if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { chunk->map[i] += chunk->map[i + 1]; chunk->map_used--; memmove(&chunk->map[i + 1], &chunk->map[i + 2], (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); } chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { pcpu_mem_free(chunk, pcpu_chunk_struct_size); return NULL; } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; return chunk; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); static struct pcpu_chunk *pcpu_create_chunk(void); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { if (pcpu_addr_in_first_chunk(addr)) { if (pcpu_addr_in_reserved_chunk(addr)) return pcpu_reserved_chunk; return pcpu_first_chunk; } addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; int slot, off, new_alloc; unsigned long flags; void __percpu *ptr; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); return NULL; } mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint) { err = "alloc from reserved chunk failed"; goto fail_unlock; } while ((new_alloc = pcpu_need_to_extend(chunk))) { spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; goto fail_unlock_mutex; } spin_lock_irqsave(&pcpu_lock, flags); } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; new_alloc = pcpu_need_to_extend(chunk); if (new_alloc) { spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; goto fail_unlock_mutex; } spin_lock_irqsave(&pcpu_lock, flags); goto restart; } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; } } spin_unlock_irqrestore(&pcpu_lock, flags); chunk = pcpu_create_chunk(); if (!chunk) { err = "failed to allocate new chunk"; goto fail_unlock_mutex; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } mutex_unlock(&pcpu_alloc_mutex); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); if (warn_limit) { pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " "%s\n", size, align, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } return NULL; } void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false); } EXPORT_SYMBOL_GPL(__alloc_percpu); void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true); } static void pcpu_reclaim(struct work_struct *work) { LIST_HEAD(todo); struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, head, list) { WARN_ON(chunk->immutable); if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; list_move(&chunk->list, &todo); } spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); pcpu_destroy_chunk(chunk); } mutex_unlock(&pcpu_alloc_mutex); } void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; if (!ptr) return; kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; pcpu_free_area(chunk, off); if (chunk->free_size == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { schedule_work(&pcpu_reclaim_work); break; } } spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); bool is_kernel_percpu_address(unsigned long addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if ((void *)addr >= start && (void *)addr < start + static_size) return true; } #endif return false; } phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsigned long first_low, first_high; unsigned int cpu; first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if (addr >= start && addr < start + pcpu_unit_size) { in_first_chunk = true; break; } } } if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr); } else return page_to_phys(pcpu_addr_to_page(addr)) + offset_in_page(addr); } struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units) { struct pcpu_alloc_info *ai; size_t base_size, ai_size; void *ptr; int unit; base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); if (!ptr) return NULL; ai = ptr; ptr += base_size; ai->groups[0].cpu_map = ptr; for (unit = 0; unit < nr_units; unit++) ai->groups[0].cpu_map[unit] = NR_CPUS; ai->nr_groups = nr_groups; ai->__ai_size = PFN_ALIGN(ai_size); return ai; } void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { free_bootmem(__pa(ai), ai->__ai_size); } static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { printk(KERN_CONT "\n"); printk("%spcpu-alloc: ", lvl); } printk(KERN_CONT "[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) printk(KERN_CONT "%0*d ", cpu_width, gi->cpu_map[unit]); else printk(KERN_CONT "%s ", empty_str); } } printk(KERN_CONT "\n"); } int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { static char cpus_buf[4096] __initdata; static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("PERCPU: failed to initialize, %s", #cond); \ pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ } while (0) PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); #endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; pcpu_low_unit_cpu = NR_CPUS; pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; group_offsets[group] = gi->base_offset; group_sizes[group] = gi->nr_units * ai->unit_size; for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) continue; PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; if (pcpu_low_unit_cpu == NR_CPUS || unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) pcpu_low_unit_cpu = cpu; if (pcpu_high_unit_cpu == NR_CPUS || unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; for_each_possible_cpu(cpu) PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); #undef PCPU_SETUP_BUG_ON pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); if (ai->reserved_size) { schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; if (dyn_size) { dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_base_addr = base_addr; return 0; } #ifdef CONFIG_SMP const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { if (0) ; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK else if (!strcmp(str, "embed")) pcpu_chosen_fc = PCPU_FC_EMBED; #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); return 0; } early_param("percpu_alloc", percpu_alloc_setup); #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) static struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; unsigned int *cpu_map; memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); dyn_size = size_sum - static_size - reserved_size; min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; max_upa = upa; for_each_possible_cpu(cpu) { group = 0; next_group: for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; nr_groups = max(nr_groups, group + 1); goto next_group; } } group_map[cpu] = group; group_cnt[group]++; } last_allocs = INT_MAX; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; } if (wasted > num_possible_cpus() / 3) continue; if (allocs > last_allocs) break; last_allocs = allocs; best_upa = upa; } upa = best_upa; for (group = 0; group < nr_groups; group++) nr_units += roundup(group_cnt[group], upa); ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM); cpu_map = ai->groups[0].cpu_map; for (group = 0; group < nr_groups; group++) { ai->groups[group].cpu_map = cpu_map; cpu_map += roundup(group_cnt[group], upa); } ai->static_size = static_size; ai->reserved_size = reserved_size; ai->dyn_size = dyn_size; ai->unit_size = alloc_size / upa; ai->atom_size = atom_size; ai->alloc_size = alloc_size; for (group = 0, unit = 0; group_cnt[group]; group++) { struct pcpu_group_info *gi = &ai->groups[group]; gi->base_offset = unit * ai->unit_size; for_each_possible_cpu(cpu) if (group_map[cpu] == group) gi->cpu_map[gi->nr_units++] = cpu; gi->nr_units = roundup(gi->nr_units, upa); unit += gi->nr_units; } BUG_ON(unit != nr_units); return ai; } #endif #if defined(BUILD_EMBED_FIRST_CHUNK) int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size, max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = alloc_bootmem_nopanic(areas_size); if (!areas) { rc = -ENOMEM; goto out_free; } for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } kmemleak_free(ptr); areas[group] = ptr; base = min(ptr, base); } for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { free_fn(ptr, ai->unit_size); continue; } memcpy(ptr, __per_cpu_load, ai->static_size); free_fn(ptr + size_sum, ai->unit_size - size_sum); } } max_distance = 0; for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; max_distance = max_t(size_t, max_distance, ai->groups[group].base_offset); } max_distance += ai->unit_size; if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " "space 0x%lx\n", max_distance, (unsigned long)(VMALLOC_END - VMALLOC_START)); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK rc = -EINVAL; goto out_free; #endif } pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: for (group = 0; group < ai->nr_groups; group++) free_fn(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) free_bootmem(__pa(areas), areas_size); return rc; } #endif #ifdef BUILD_PAGE_FIRST_CHUNK int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); unit_pages = ai->unit_size >> PAGE_SHIFT; pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = alloc_bootmem(pages_size); j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); goto enomem; } kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } #endif #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { free_bootmem(__pa(ptr), size); } void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif #else void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); kmemleak_free(fc); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); } #endif void __init percpu_init_late(void) { struct pcpu_chunk *target_chunks[] = { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; struct pcpu_chunk *chunk; unsigned long flags; int i; for (i = 0; (chunk = target_chunks[i]); i++) { int *map; const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); BUILD_BUG_ON(size > PAGE_SIZE); map = pcpu_mem_zalloc(size); BUG_ON(!map); spin_lock_irqsave(&pcpu_lock, flags); memcpy(map, chunk->map, size); chunk->map = map; spin_unlock_irqrestore(&pcpu_lock, flags); } }