/* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES unsigned long max_mapnr; struct page *mem_map; EXPORT_SYMBOL(max_mapnr); EXPORT_SYMBOL(mem_map); #endif unsigned long num_physpages; void * high_memory; EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; unsigned long highest_memmap_pfn __read_mostly; static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { if (current->rss_stat.count[i]) { add_mm_counter(mm, i, current->rss_stat.count[i]); current->rss_stat.count[i] = 0; } } current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) { struct task_struct *task = current; if (likely(task->mm == mm)) task->rss_stat.count[member] += val; else add_mm_counter(mm, member, val); } #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) #define TASK_RSS_EVENTS_THRESH (64) static void check_sync_rss_stat(struct task_struct *task) { if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) sync_mm_rss(task->mm); } #else #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) static void check_sync_rss_stat(struct task_struct *task) { } #endif #ifdef HAVE_GENERIC_MMU_GATHER static int tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return 1; } batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return 1; } void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; tlb->need_flush = 0; tlb->fast_mode = (num_possible_cpus() == 1); tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; if (!tlb->need_flush) return; tlb->need_flush = 0; tlb_flush(tlb); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif if (tlb_fast_mode(tlb)) return; for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; } tlb->active = &tlb->local; } void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { struct mmu_gather_batch *batch, *next; tlb_flush_mmu(tlb); check_pgt_cache(); for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->need_flush); if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); return 1; } batch = tlb->active; batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return 0; batch = tlb->active; } VM_BUG_ON(batch->nr > batch->max); return batch->max - batch->nr; } #endif #ifdef CONFIG_HAVE_RCU_TABLE_FREE static void tlb_remove_table_smp_sync(void *arg) { } static void tlb_remove_table_one(void *table) { smp_call_function(tlb_remove_table_smp_sync, NULL, 1); __tlb_remove_table(table); } static void tlb_remove_table_rcu(struct rcu_head *head) { struct mmu_table_batch *batch; int i; batch = container_of(head, struct mmu_table_batch, rcu); for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; tlb->need_flush = 1; if (atomic_read(&tlb->mm->mm_users) < 2) { __tlb_remove_table(table); return; } if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } #endif void pgd_clear_bad(pgd_t *pgd) { pgd_ERROR(*pgd); pgd_clear(pgd); } void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); } void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } vma = next; } } int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); int wait_split_huge_page; if (!new) return -ENOMEM; smp_wmb(); spin_lock(&mm->page_table_lock); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); if (wait_split_huge_page) wait_split_huge_page(vma->anon_vma, pmd); return 0; } int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { pte_t *new = pte_alloc_one_kernel(&init_mm, address); if (!new) return -ENOMEM; smp_wmb(); spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } else VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; if (current->mm == mm) sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); pud_t *pud = pud_offset(pgd, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { printk(KERN_ALERT "BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); printk(KERN_ALERT "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); if (vma->vm_ops) print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", (unsigned long)vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); add_taint(TAINT_BAD_PAGE); } static inline int is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef is_zero_pfn static inline int is_zero_pfn(unsigned long pfn) { return pfn == zero_pfn; } #endif #ifndef my_zero_pfn static inline unsigned long my_zero_pfn(unsigned long addr) { return zero_pfn; } #endif #ifdef __HAVE_ARCH_PTE_SPECIAL # define HAVE_PTE_SPECIAL 1 #else # define HAVE_PTE_SPECIAL 0 #endif struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } out: return pfn_to_page(pfn); } static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); if (swap_duplicate(entry) < 0) return entry.val; if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } if (likely(!non_swap_entry(entry))) rss[MM_SWAPENTS]++; else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); if (PageAnon(page)) rss[MM_ANONPAGES]++; else rss[MM_FILEPAGES]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); set_pte_at(src_mm, addr, src_pte, pte); } } } goto out_set_pte; } if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); page_dup_rmap(page); if (PageAnon(page)) rss[MM_ANONPAGES]++; else rss[MM_FILEPAGES]++; } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { progress++; continue; } entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); if (entry.val) break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (entry.val) { if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) return -ENOMEM; progress = 0; } if (addr != end) goto again; return 0; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd)) { int err; VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_pgd, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; int ret; if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { if (!vma->anon_vma) return 0; } if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); if (unlikely(is_pfn_mapping(vma))) { ret = track_pfn_vma_copy(vma); if (ret) return ret; } if (is_cow_mapping(vma->vm_flags)) mmu_notifier_invalidate_range_start(src_mm, addr, end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, vma, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow_mapping(vma->vm_flags)) mmu_notifier_invalidate_range_end(src_mm, vma->vm_start, end); return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { struct mm_struct *mm = tlb->mm; int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { continue; } if (pte_present(ptent)) { struct page *page; page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { if (details->check_mapping && details->check_mapping != page->mapping) continue; if (details->nonlinear_vma && (page->index < details->first_index || page->index > details->last_index)) continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) break; continue; } if (unlikely(details)) continue; if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); } else { swp_entry_t entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; else if (is_migration_entry(entry)) { struct page *page; page = migration_entry_to_page(entry); if (PageAnon(page)) rss[MM_ANONPAGES]--; else rss[MM_FILEPAGES]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); if (force_flush) { force_flush = 0; tlb_flush_mmu(tlb); if (addr != end) goto again; } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; } if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); } while (pmd++, addr = next, addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); } while (pud++, addr = next, addr != end); return addr; } static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; if (details && !details->check_mapping && !details->nonlinear_vma) details = NULL; BUG_ON(addr >= end); mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); mem_cgroup_uncharge_end(); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { if (vma->vm_file) unmap_hugepage_range(vma, start, end, NULL); } else unmap_page_range(tlb, vma, start, end, details); } } void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, details); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); } void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); tlb_finish_mmu(&tlb, address, end); } static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); mmu_notifier_invalidate_range_end(mm, address, end); tlb_finish_mmu(&tlb, address, end); } int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (address < vma->vm_start || address + size > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) return -1; zap_page_range_single(vma, address, size, NULL); return 0; } EXPORT_SYMBOL_GPL(zap_vma_ptes); struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); goto out; } page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } if (unlikely(pud_bad(*pud))) goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(mm, pmd); goto split_fallthrough; } spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(mm, address, pmd, flags); spin_unlock(&mm->page_table_lock); goto out; } } else spin_unlock(&mm->page_table_lock); } split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { if ((flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(pte))) goto bad_page; page = pte_page(pte); } if (flags & FOLL_GET) get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) mlock_vma_page(page); unlock_page(page); } } unlock: pte_unmap_unlock(ptep, ptl); out: return page; bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT); no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; no_page_table: if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return page; } static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) { return stack_guard_page_start(vma, addr) || stack_guard_page_end(vma, addr+PAGE_SIZE); } /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * * Must be called with mmap_sem held for read or write. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, * *@nonblocking will be set to 0. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { int i; unsigned long vm_flags; if (nr_pages <= 0) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); vm_flags = (gup_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; if (gup_flags & FOLL_WRITE) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); else pgd = pgd_offset_gate(mm, pg); BUG_ON(pgd_none(*pgd)); pud = pud_offset(pgd, pg); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); return i ? : -EFAULT; } vma = get_gate_vma(mm); if (pages) { struct page *page; page = vm_normal_page(vma, start, *pte); if (!page) { if (!(gup_flags & FOLL_DUMP) && is_zero_pfn(pte_pfn(*pte))) page = pte_page(*pte); else { pte_unmap(pte); return i ? : -EFAULT; } } pages[i] = page; get_page(page); } pte_unmap(pte); goto next_page; } if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags); continue; } do { struct page *page; unsigned int foll_flags = gup_flags; if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; unsigned int fault_flags = 0; if (foll_flags & FOLL_MLOCK) { if (stack_guard_page(vma, start)) goto next_page; } if (foll_flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (foll_flags & FOLL_NOWAIT) fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); ret = handle_mm_fault(mm, vma, start, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { if (i) return i; else if (gup_flags & FOLL_HWPOISON) return -EHWPOISON; else return -EFAULT; } if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } if (tsk) { if (ret & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; } if (ret & VM_FAULT_RETRY) { if (nonblocking) *nonblocking = 0; return i; } if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; cond_resched(); } if (IS_ERR(page)) return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); } next_page: if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; nr_pages--; } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; } EXPORT_SYMBOL(__get_user_pages); int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) { struct vm_area_struct *vma; vm_flags_t vm_flags; int ret; vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return -EHWPOISON; if (ret & VM_FAULT_SIGBUS) return -EFAULT; BUG(); } if (tsk) { if (ret & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; } return 0; } /* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. This will result in the page being COWed even * in MAP_SHARED mappings. You do not want this. * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * * Must be called with mmap_sem held for read or write. * * get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If write=0, the page must not be written to. If the page is written to, * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called * after the page is finished with, and before put_page is called. * * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual * addresses. The pages may be submitted for DMA to devices or accessed via * their kernel linear mapping (via the kmap APIs). Care should be taken to * use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = FOLL_TOUCH; if (pages) flags |= FOLL_GET; if (write) flags |= FOLL_WRITE; if (force) flags |= FOLL_FORCE; return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, NULL); } EXPORT_SYMBOL(get_user_pages); #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct vm_area_struct *vma; struct page *page; if (__get_user_pages(current, current->mm, addr, 1, FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; } #endif pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); if (pmd) { VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); } } return NULL; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; retval = -EINVAL; if (PageAnon(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; get_page(page); inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; pte_unmap_unlock(pte, ptl); return retval; out_unlock: pte_unmap_unlock(pte, ptl); out: return retval; } int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; vma->vm_flags |= VM_INSERTPAGE; return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte, entry; spinlock_t *ptl; retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); out: return retval; } int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { int ret; pgprot_t pgprot = vma->vm_page_prot; BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); if (ret) untrack_pfn_vma(vma, pfn, PAGE_SIZE); return ret; } EXPORT_SYMBOL(vm_insert_pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { struct page *page; page = pfn_to_page(pfn); return insert_page(vma, addr, page, vma->vm_page_prot); } return insert_pfn(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_mixed); static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte; spinlock_t *ptl; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, pgd, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; if (addr == vma->vm_start && end == vma->vm_end) { vma->vm_pgoff = pfn; vma->vm_flags |= VM_PFN_AT_MMAP; } else if (is_cow_mapping(vma->vm_flags)) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); vma->vm_flags &= ~VM_PFN_AT_MMAP; return -EINVAL; } BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) break; } while (pgd++, addr = next, addr != end); if (err) untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); pte = (mm == &init_mm) ? pte_alloc_kernel(pmd, addr) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; BUG_ON(pmd_huge(*pmd)); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); do { err = fn(pte++, token, addr, data); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pmd_t *pmd; unsigned long next; int err; BUG_ON(pud_huge(*pud)); pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); err = apply_to_pte_range(mm, pmd, addr, next, fn, data); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pud_t *pud; unsigned long next; int err; pud = pud_alloc(mm, pgd, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = apply_to_pmd_range(mm, pud, addr, next, fn, data); if (err) break; } while (pud++, addr = next, addr != end); return err; } int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { pgd_t *pgd; unsigned long next; unsigned long end = addr + size; int err; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); err = apply_to_pud_range(mm, pgd, addr, next, fn, data); if (err) break; } while (pgd++, addr = next, addr != end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { spinlock_t *ptl = pte_lockptr(mm, pmd); spin_lock(ptl); same = pte_same(*page_table, orig_pte); spin_unlock(ptl); } #endif pte_unmap(page_table); return same; } static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { if (unlikely(!src)) { void *kaddr = kmap_atomic(dst); void __user *uaddr = (void __user *)(va & PAGE_MASK); if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) clear_page(kaddr); kunmap_atomic(kaddr); flush_dcache_page(dst); } else copy_user_highpage(dst, src, va, vma); } static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { struct page *old_page, *new_page; pte_t entry; int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) goto reuse; goto gotten; } if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); lock_page(old_page); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); goto unlock; } page_cache_release(old_page); } if (reuse_swap_page(old_page)) { page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); goto reuse; } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { if (vma->vm_ops && vma->vm_ops->page_mkwrite) { struct vm_fault vmf; int tmp; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = old_page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = old_page; page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; goto unwritable_page; } if (unlikely(!(tmp & VM_FAULT_LOCKED))) { lock_page(old_page); if (!old_page->mapping) { ret = 0; unlock_page(old_page); goto unwritable_page; } } else VM_BUG_ON(!PageLocked(old_page)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); goto unlock; } page_mkwrite = 1; } dirty_page = old_page; get_page(dirty_page); reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; if (!dirty_page) return ret; if (!page_mkwrite) { wait_on_page_locked(dirty_page); set_page_dirty_balance(dirty_page, page_mkwrite); } put_page(dirty_page); if (page_mkwrite) { struct address_space *mapping = dirty_page->mapping; set_page_dirty(dirty_page); unlock_page(dirty_page); page_cache_release(dirty_page); if (mapping) { balance_dirty_pages_ratelimited(mapping); } } if (vma->vm_file) file_update_time(vma->vm_file); return ret; } page_cache_get(old_page); gotten: pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma); } __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); set_pte_at_notify(mm, address, page_table, entry); update_mmu_cache(vma, address, page_table); if (old_page) { page_remove_rmap(old_page); } new_page = old_page; ret |= VM_FAULT_WRITE; } else mem_cgroup_uncharge_page(new_page); if (new_page) page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); if (old_page) { if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); } return ret; oom_free_new: page_cache_release(new_page); oom: if (old_page) { if (page_mkwrite) { unlock_page(old_page); page_cache_release(old_page); } page_cache_release(old_page); } return VM_FAULT_OOM; unwritable_page: page_cache_release(old_page); return ret; } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct prio_tree_root *root, struct zap_details *details) { struct vm_area_struct *vma; struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; vma_prio_tree_foreach(vma, &iter, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; zba = details->first_index; if (zba < vba) zba = vba; zea = details->last_index; if (zea > vea) zea = vea; unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } static inline void unmap_mapping_range_list(struct list_head *head, struct zap_details *details) { struct vm_area_struct *vma; list_for_each_entry(vma, head, shared.vm_set.list) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } } void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { struct zap_details details; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } details.check_mapping = even_cows? NULL: mapping; details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; mutex_lock(&mapping->i_mmap_mutex); if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); mutex_unlock(&mapping->i_mmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; int locked; struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(mm, pmd, address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, address, orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(mm, PGMAJFAULT); } else if (PageHWPoison(page)) { ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; } if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; if (ksm_might_need_to_copy(page, vma, address)) { swapcache = page; page = ksm_does_need_to_copy(page, vma, address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; swapcache = NULL; goto out_page; } } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; goto out_page; } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*page_table, orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); do_page_add_anon_rmap(page, vma, address, exclusive); mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (swapcache) { unlock_page(swapcache); page_cache_release(swapcache); } if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); out_release: page_cache_release(page); if (swapcache) { unlock_page(swapcache); page_cache_release(swapcache); } return ret; } static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) { address &= PAGE_MASK; if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { struct vm_area_struct *prev = vma->vm_prev; if (prev && prev->vm_end == address) return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; expand_downwards(vma, address - PAGE_SIZE); } if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { struct vm_area_struct *next = vma->vm_next; if (next && next->vm_start == address + PAGE_SIZE) return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; expand_upwards(vma, address + PAGE_SIZE); } return 0; } static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) { struct page *page; spinlock_t *ptl; pte_t entry; pte_unmap(page_table); if (check_stack_guard_page(vma, address) < 0) return VM_FAULT_SIGBUS; if (!(flags & FAULT_FLAG_WRITE)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; goto setpte; } if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; __SetPageUptodate(page); if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); return 0; release: mem_cgroup_uncharge_page(page); page_cache_release(page); goto unlock; oom_free_page: page_cache_release(page); oom: return VM_FAULT_OOM; } static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { pte_t *page_table; spinlock_t *ptl; struct page *page; struct page *cow_page; pte_t entry; int anon = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!cow_page) return VM_FAULT_OOM; if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { page_cache_release(cow_page); return VM_FAULT_OOM; } } else cow_page = NULL; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); ret = VM_FAULT_HWPOISON; goto uncharge_out; } if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else VM_BUG_ON(!PageLocked(vmf.page)); page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { page = cow_page; anon = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { if (vma->vm_ops->page_mkwrite) { int tmp; unlock_page(page); vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; goto unwritable_page; } if (unlikely(!(tmp & VM_FAULT_LOCKED))) { lock_page(page); if (!page->mapping) { ret = 0; unlock_page(page); goto unwritable_page; } } else VM_BUG_ON(!PageLocked(page)); page_mkwrite = 1; } } } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; get_page(dirty_page); } } set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, page_table); } else { if (cow_page) mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else anon = 1; } pte_unmap_unlock(page_table, ptl); if (dirty_page) { struct address_space *mapping = page->mapping; if (set_page_dirty(dirty_page)) page_mkwrite = 1; unlock_page(dirty_page); put_page(dirty_page); if (page_mkwrite && mapping) { balance_dirty_pages_ratelimited(mapping); } if (vma->vm_file) file_update_time(vma->vm_file); } else { unlock_page(vmf.page); if (anon) page_cache_release(vmf.page); } return ret; unwritable_page: page_cache_release(page); return ret; uncharge_out: if (cow_page) { mem_cgroup_uncharge_page(cow_page); page_cache_release(cow_page); } return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { pgoff_t pgoff; flags |= FAULT_FLAG_NONLINEAR; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { print_bad_pte(vma, address, orig_pte, NULL); return VM_FAULT_SIGBUS; } pgoff = pte_to_pgoff(orig_pte); return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } if (pte_file(entry)) return do_nonlinear_fault(mm, vma, address, pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, pte); } else { if (flags & FAULT_FLAG_WRITE) flush_tlb_fix_spurious_fault(vma, address); } unlock: pte_unmap_unlock(pte, ptl); return 0; } int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); mem_cgroup_count_vm_event(mm, PGFAULT); check_sync_rss_stat(current); if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { if (!vma->vm_ops) return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); } else { pmd_t orig_pmd = *pmd; barrier(); if (pmd_trans_huge(orig_pmd)) { if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) && !pmd_trans_splitting(orig_pmd)) return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); return 0; } } if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; if (unlikely(pmd_trans_huge(*pmd))) return 0; pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } #ifndef __PAGETABLE_PUD_FOLDED int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) pud_free(mm, new); else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); return 0; } #endif #ifndef __PAGETABLE_PMD_FOLDED int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) pmd_free(mm, new); else pud_populate(mm, pud, new); #else if (pgd_present(*pud)) pmd_free(mm, new); else pgd_populate(mm, pud, new); #endif spin_unlock(&mm->page_table_lock); return 0; } #endif int make_pages_present(unsigned long addr, unsigned long end) { int ret, len, write; struct vm_area_struct * vma; vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); if (ret < 0) return ret; return ret == len ? 0 : -EFAULT; } #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { gate_vma.vm_mm = NULL; gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; return 0; } __initcall(gate_vma_init); #endif struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; #else return NULL; #endif } int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) return 1; #endif return 0; } #endif static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) goto out; pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; if (pmd_huge(*pmd)) goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; return 0; unlock: pte_unmap_unlock(ptep, *ptlp); out: return -EINVAL; } static inline int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { int res; (void) __cond_lock(*ptlp, !(res = __follow_pte(mm, address, ptepp, ptlp))); return res; } int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) { int ret = -EINVAL; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); pte_unmap_unlock(ptep, ptl); return 0; } EXPORT_SYMBOL(follow_pfn); #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys) { int ret = -EINVAL; pte_t *ptep, pte; spinlock_t *ptl; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; pte = *ptep; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: return ret; } int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; int offset = addr & (PAGE_SIZE-1); if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); iounmap(maddr); return len; } #endif static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { struct vm_area_struct *vma; void *old_buf = buf; down_read(&mm->mmap_sem); while (len) { int bytes, ret, offset; void *maddr; struct page *page = NULL; ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) #endif break; bytes = ret; } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } kunmap(page); page_cache_release(page); } len -= bytes; buf += bytes; addr += bytes; } up_read(&mm->mmap_sem); return buf - old_buf; } int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { return __access_remote_vm(NULL, mm, addr, buf, len, write); } int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(tsk, mm, addr, buf, len, write); mmput(mm); return ret; } void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; if (preempt_count()) return; down_read(&mm->mmap_sem); vma = find_vma(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_KERNEL); if (buf) { char *p, *s; p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; s = strrchr(p, '/'); if (s) p = s+1; printk("%s%s[%lx+%lx]", prefix, p, vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); } } up_read(¤t->mm->mmap_sem); } #ifdef CONFIG_PROVE_LOCKING void might_fault(void) { if (segment_eq(get_fs(), KERNEL_DS)) return; might_sleep(); if (!in_atomic() && current->mm) might_lock_read(¤t->mm->mmap_sem); } EXPORT_SYMBOL(might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) static void clear_gigantic_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; struct page *p = page; might_sleep(); for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } } void clear_huge_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } might_sleep(); for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } } static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { int i; struct page *dst_base = dst; struct page *src_base = src; for (i = 0; i < pages_per_huge_page; ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); i++; dst = mem_map_next(dst, dst_base, i); src = mem_map_next(src, src_base, i); } } void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { int i; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, pages_per_huge_page); return; } might_sleep(); for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } #endif