/* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "internal.h" #include #include void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; if (PageUptodate(page) && PageMappedToDisk(page)) cleancache_put_page(page); else cleancache_invalidate_page(mapping, page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); } } void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); if (freepage) freepage(page); page_cache_release(page); } EXPORT_SYMBOL(delete_from_page_cache); static int sleep_on_page(void *word) { io_schedule(); return 0; } static int sleep_on_page_killable(void *word) { sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; if (!mapping_cap_writeback_dirty(mapping)) return 0; ret = do_writepages(mapping, &wbc); return ret; } static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_flush(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } EXPORT_SYMBOL(filemap_flush); int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; if (end_byte < start_byte) return 0; pagevec_init(&pvec, 0); while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { unsigned i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; if (page->index > end) continue; wait_on_page_writeback(page); if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); cond_resched(); } if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_fdatawait_range); int filemap_fdatawait(struct address_space *mapping) { loff_t i_size = i_size_read(mapping->host); if (i_size == 0) return 0; return filemap_fdatawait_range(mapping, 0, i_size - 1); } EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { int err = 0; if (mapping->nrpages) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) { int err2 = filemap_fdatawait(mapping); if (!err) err = err2; } } return err; } EXPORT_SYMBOL(filemap_write_and_wait); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that `lend' is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0; if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); if (err != -EIO) { int err2 = filemap_fdatawait_range(mapping, lstart, lend); if (!err) err = err2; } } return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; page_cache_get(new); new->mapping = mapping; new->index = offset; spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(old); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); } return error; } EXPORT_SYMBOL_GPL(replace_page_cache_page); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapBacked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; page->index = offset; spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); out: return error; } EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int ret; ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) lru_cache_add_file(page); return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { int n; struct page *page; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = get_mems_allowed(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); } while (!put_mems_allowed(cpuset_mems_cookie) && !page); return page; } return alloc_pages(gfp, 0); } EXPORT_SYMBOL(__page_cache_alloc); #endif static wait_queue_head_t *page_waitqueue(struct page *page) { const struct zone *zone = page_zone(page); return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } static inline void wake_up_page(struct page *page, int bit) { __wake_up_bit(page_waitqueue(page), &page->flags, bit); } void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); int wait_on_page_bit_killable(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (!test_bit(bit_nr, &page->flags)) return 0; return __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page_killable, TASK_KILLABLE); } void add_page_wait_queue(struct page *page, wait_queue_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, waiter); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(add_page_wait_queue); void unlock_page(struct page *page) { VM_BUG_ON(!PageLocked(page)); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { if (TestClearPageReclaim(page)) rotate_reclaimable_page(page); if (!test_clear_page_writeback(page)) BUG(); smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page_killable, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { if (flags & FAULT_FLAG_ALLOW_RETRY) { if (flags & FAULT_FLAG_RETRY_NOWAIT) return 0; up_read(&mm->mmap_sem); if (flags & FAULT_FLAG_KILLABLE) wait_on_page_locked_killable(page); else wait_on_page_locked(page); return 0; } else { if (flags & FAULT_FLAG_KILLABLE) { int ret; ret = __lock_page_killable(page); if (ret) { up_read(&mm->mmap_sem); return 0; } } else __lock_page(page); return 1; } } struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; rcu_read_lock(); repeat: page = NULL; pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) goto repeat; goto out; } if (!page_cache_get_speculative(page)) goto repeat; if (unlikely(page != *pagep)) { page_cache_release(page); goto repeat; } } out: rcu_read_unlock(); return page; } EXPORT_SYMBOL(find_get_page); struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: page = find_get_page(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); if (unlikely(page->mapping != mapping)) { unlock_page(page); page_cache_release(page); goto repeat; } VM_BUG_ON(page->index != offset); } return page; } EXPORT_SYMBOL(find_lock_page); struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { struct page *page; int err; repeat: page = find_lock_page(mapping, index); if (!page) { page = __page_cache_alloc(gfp_mask); if (!page) return NULL; err = add_to_page_cache_lru(page, mapping, index, (gfp_mask & GFP_RECLAIM_MASK)); if (unlikely(err)) { page_cache_release(page); page = NULL; if (err == -EEXIST) goto repeat; } } return page; } EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { struct radix_tree_iter iter; void **slot; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { WARN_ON(iter.index); goto restart; } continue; } if (!page_cache_get_speculative(page)) goto repeat; if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { struct radix_tree_iter iter; void **slot; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { goto restart; } break; } if (!page_cache_get_speculative(page)) goto repeat; if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { struct radix_tree_iter iter; void **slot; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { goto restart; } BUG(); } if (!page_cache_get_speculative(page)) goto repeat; if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; if (++ret == nr_pages) break; } rcu_read_unlock(); if (ret) *index = pages[ret - 1]->index + 1; return ret; } EXPORT_SYMBOL(find_get_pages_tag); struct page * grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { struct page *page = find_get_page(mapping, index); if (page) { if (trylock_page(page)) return page; page_cache_release(page); return NULL; } page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; } return page; } EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { ra->ra_pages /= 4; } static void do_generic_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct file_ra_state *ra = &filp->f_ra; pgoff_t index; pgoff_t last_index; pgoff_t prev_index; unsigned long offset; unsigned int prev_offset; int error; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { struct page *page; pgoff_t end_index; loff_t isize; unsigned long nr, ret; cond_resched(); find_page: page = find_get_page(mapping, index); if (!page) { page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); page = find_get_page(mapping, index); if (unlikely(page == NULL)) goto no_cached_page; } if (PageReadahead(page)) { page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) { if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) goto page_not_up_to_date_locked; unlock_page(page); } page_ok: isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) { page_cache_release(page); goto out; } nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { page_cache_release(page); goto out; } } nr = nr - offset; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); if (prev_index != index || offset != prev_offset) mark_page_accessed(page); prev_index = index; ret = actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); if (ret == nr && desc->count) continue; goto out; page_not_up_to_date: error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; page_not_up_to_date_locked: if (!page->mapping) { unlock_page(page); page_cache_release(page); continue; } if (PageUptodate(page)) { unlock_page(page); goto page_ok; } readpage: ClearPageError(page); error = mapping->a_ops->readpage(filp, page); if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; } goto readpage_error; } if (!PageUptodate(page)) { error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { unlock_page(page); page_cache_release(page); goto find_page; } unlock_page(page); shrink_readahead_size_eio(filp, ra); error = -EIO; goto readpage_error; } unlock_page(page); } goto page_ok; readpage_error: desc->error = error; page_cache_release(page); goto out; no_cached_page: page = page_cache_alloc_cold(mapping); if (!page) { desc->error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); if (error == -EEXIST) goto find_page; desc->error = error; goto out; } goto readpage; } out: ra->prev_pos = prev_index; ra->prev_pos <<= PAGE_CACHE_SHIFT; ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); } int file_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size) { char *kaddr; unsigned long left, count = desc->count; if (size > count) size = count; if (!fault_in_pages_writeable(desc->arg.buf, size)) { kaddr = kmap_atomic(page); left = __copy_to_user_inatomic(desc->arg.buf, kaddr + offset, size); kunmap_atomic(kaddr); if (left == 0) goto success; } kaddr = kmap(page); left = __copy_to_user(desc->arg.buf, kaddr + offset, size); kunmap(page); if (left) { size -= left; desc->error = -EFAULT; } success: desc->count = count - size; desc->written += size; desc->arg.buf += size; return size; } int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags) { unsigned long seg; size_t cnt = 0; for (seg = 0; seg < *nr_segs; seg++) { const struct iovec *iv = &iov[seg]; cnt += iv->iov_len; if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) return -EINVAL; if (access_ok(access_flags, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; *nr_segs = seg; cnt -= iv->iov_len; break; } *count = cnt; return 0; } EXPORT_SYMBOL(generic_segment_checks); ssize_t generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; if (filp->f_flags & O_DIRECT) { loff_t size; struct address_space *mapping; struct inode *inode; mapping = filp->f_mapping; inode = mapping->host; if (!count) goto out; size = i_size_read(inode); if (pos < size) { retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { struct blk_plug plug; blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; count -= retval; } if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; loff_t offset = 0; if (count) { if (count > iov[seg].iov_len) { count -= iov[seg].iov_len; continue; } offset = count; count = 0; } desc.written = 0; desc.arg.buf = iov[seg].iov_base + offset; desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; do_generic_file_read(filp, ppos, &desc, file_read_actor); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; break; } if (desc.count > 0) break; } out: return retval; } EXPORT_SYMBOL(generic_file_aio_read); static ssize_t do_readahead(struct address_space *mapping, struct file *filp, pgoff_t index, unsigned long nr) { if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; force_page_cache_readahead(mapping, filp, index, nr); return 0; } SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; ret = -EBADF; file = fget(fd); if (file) { if (file->f_mode & FMODE_READ) { struct address_space *mapping = file->f_mapping; pgoff_t start = offset >> PAGE_CACHE_SHIFT; pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, file, start, len); } fput(file); } return ret; } #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS asmlinkage long SyS_readahead(long fd, loff_t offset, long count) { return SYSC_readahead((int) fd, offset, (size_t) count); } SYSCALL_ALIAS(sys_readahead, SyS_readahead); #endif #ifdef CONFIG_MMU static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; int ret; do { page = page_cache_alloc_cold(mapping); if (!page) return -ENOMEM; ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) ret = 0; page_cache_release(page); } while (ret == AOP_TRUNCATED_PAGE); return ret; } #define MMAP_LOTSAMISS (100) static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct file_ra_state *ra, struct file *file, pgoff_t offset) { unsigned long ra_pages; struct address_space *mapping = file->f_mapping; if (VM_RandomReadHint(vma)) return; if (!ra->ra_pages) return; if (VM_SequentialReadHint(vma)) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; } if (ra->mmap_miss < MMAP_LOTSAMISS * 10) ra->mmap_miss++; if (ra->mmap_miss > MMAP_LOTSAMISS) return; ra_pages = max_sane_readahead(ra->ra_pages); ra->start = max_t(long, 0, offset - ra_pages / 2); ra->size = ra_pages; ra->async_size = ra_pages / 4; ra_submit(ra, mapping, file); } static void do_async_mmap_readahead(struct vm_area_struct *vma, struct file_ra_state *ra, struct file *file, struct page *page, pgoff_t offset) { struct address_space *mapping = file->f_mapping; if (VM_RandomReadHint(vma)) return; if (ra->mmap_miss > 0) ra->mmap_miss--; if (PageReadahead(page)) page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); } int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int error; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (offset >= size) return VM_FAULT_SIGBUS; page = find_get_page(mapping, offset); if (likely(page)) { do_async_mmap_readahead(vma, ra, file, page, offset); } else { do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { page_cache_release(page); return ret | VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto retry_find; } VM_BUG_ON(page->index != offset); if (unlikely(!PageUptodate(page))) goto page_not_uptodate; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } vmf->page = page; return ret | VM_FAULT_LOCKED; no_cached_page: error = page_cache_read(file, offset); if (error >= 0) goto retry_find; if (error == -ENOMEM) return VM_FAULT_OOM; return VM_FAULT_SIGBUS; page_not_uptodate: ClearPageError(page); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } page_cache_release(page); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; } EXPORT_SYMBOL(filemap_fault); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->readpage) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) return -EINVAL; return generic_file_mmap(file, vma); } #else int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { return -ENOSYS; } int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) { return -ENOSYS; } #endif EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) goto repeat; return ERR_PTR(err); } err = filler(data, page); if (err < 0) { page_cache_release(page); page = ERR_PTR(err); } } return page; } static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, gfp_t gfp) { struct page *page; int err; retry: page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) goto out; lock_page(page); if (!page->mapping) { unlock_page(page); page_cache_release(page); goto retry; } if (PageUptodate(page)) { unlock_page(page); goto out; } err = filler(data, page); if (err < 0) { page_cache_release(page); return ERR_PTR(err); } out: mark_page_accessed(page); return page; } struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page_async); static struct page *wait_on_page_read(struct page *page) { if (!IS_ERR(page)) { wait_on_page_locked(page); if (!PageUptodate(page)) { page_cache_release(page); page = ERR_PTR(-EIO); } } return page; } struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { filler_t *filler = (filler_t *)mapping->a_ops->readpage; return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); } EXPORT_SYMBOL(read_cache_page_gfp); struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); int should_remove_suid(struct dentry *dentry) { umode_t mode = dentry->d_inode->i_mode; int kill = 0; if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(should_remove_suid); static int __remove_suid(struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; return notify_change(dentry, &newattrs); } int file_remove_suid(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; int killsuid; int killpriv; int error = 0; if (IS_NOSEC(inode)) return 0; killsuid = should_remove_suid(dentry); killpriv = security_inode_need_killpriv(dentry); if (killpriv < 0) return killpriv; if (killpriv) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); if (!error && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; return error; } EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; iov++; if (unlikely(left)) break; } return copied - left; } size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr; size_t copied; BUG_ON(!in_atomic()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, i->iov, i->iov_offset, bytes); } kunmap_atomic(kaddr); return copied; } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); size_t iov_iter_copy_from_user(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr; size_t copied; kaddr = kmap(page); if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, i->iov, i->iov_offset, bytes); } kunmap(page); return copied; } EXPORT_SYMBOL(iov_iter_copy_from_user); void iov_iter_advance(struct iov_iter *i, size_t bytes) { BUG_ON(i->count < bytes); if (likely(i->nr_segs == 1)) { i->iov_offset += bytes; i->count -= bytes; } else { const struct iovec *iov = i->iov; size_t base = i->iov_offset; unsigned long nr_segs = i->nr_segs; while (bytes || unlikely(i->count && !iov->iov_len)) { int copy; copy = min(bytes, iov->iov_len - base); BUG_ON(!i->count || i->count < copy); i->count -= copy; bytes -= copy; base += copy; if (iov->iov_len == base) { iov++; nr_segs--; base = 0; } } i->iov = iov; i->iov_offset = base; i->nr_segs = nr_segs; } } EXPORT_SYMBOL(iov_iter_advance); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { char __user *buf = i->iov->iov_base + i->iov_offset; bytes = min(bytes, i->iov->iov_len - i->iov_offset); return fault_in_pages_readable(buf, bytes); } EXPORT_SYMBOL(iov_iter_fault_in_readable); size_t iov_iter_single_seg_count(struct iov_iter *i) { const struct iovec *iov = i->iov; if (i->nr_segs == 1) return i->count; else return min(i->count, iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; unsigned long limit = rlimit(RLIMIT_FSIZE); if (unlikely(*pos < 0)) return -EINVAL; if (!isblk) { if (file->f_flags & O_APPEND) *pos = i_size_read(inode); if (limit != RLIM_INFINITY) { if (*pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (*count > limit - (typeof(limit))*pos) { *count = limit - (typeof(limit))*pos; } } } if (unlikely(*pos + *count > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { if (*pos >= MAX_NON_LFS) { return -EFBIG; } if (*count > MAX_NON_LFS - (unsigned long)*pos) { *count = MAX_NON_LFS - (unsigned long)*pos; } } /* * Are we about to exceed the fs block limit ? * * If we have written data it becomes a short write. If we have * exceeded without writing data we send a signal and return EFBIG. * Linus frestrict idea will clean these up nicely.. */ if (likely(!isblk)) { if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { if (*count || *pos > inode->i_sb->s_maxbytes) { return -EFBIG; } } if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) *count = inode->i_sb->s_maxbytes - *pos; } else { #ifdef CONFIG_BLOCK loff_t isize; if (bdev_read_only(I_BDEV(inode))) return -EPERM; isize = i_size_read(inode); if (*pos >= isize) { if (*count || *pos > isize) return -ENOSPC; } if (*pos + *count > isize) *count = isize - *pos; #else return -EPERM; #endif } return 0; } EXPORT_SYMBOL(generic_write_checks); int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { const struct address_space_operations *aops = mapping->a_ops; return aops->write_begin(file, mapping, pos, len, flags, pagep, fsdata); } EXPORT_SYMBOL(pagecache_write_begin); int pagecache_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { const struct address_space_operations *aops = mapping->a_ops; mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; size_t write_len; pgoff_t end; if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); write_len = iov_length(iov, *nr_segs); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); if (written) goto out; if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); if (written) { if (written == -EBUSY) return 0; goto out; } } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); } if (written > 0) { pos += written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } *ppos = pos; } out: return written; } EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { int status; gfp_t gfp_mask; struct page *page; gfp_t gfp_notmask = 0; gfp_mask = mapping_gfp_mask(mapping); if (mapping_cap_account_dirty(mapping)) gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); if (page) goto found; page = __page_cache_alloc(gfp_mask & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL & ~gfp_notmask); if (unlikely(status)) { page_cache_release(page); if (status == -EEXIST) goto repeat; return NULL; } found: wait_on_page_writeback(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; ssize_t written = 0; unsigned int flags = 0; if (segment_eq(get_fs(), KERNEL_DS)) flags |= AOP_FLAG_UNINTERRUPTIBLE; do { struct page *page; unsigned long offset; unsigned long bytes; size_t copied; void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); again: if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) break; copied = status; cond_resched(); iov_iter_advance(i, copied); if (unlikely(copied == 0)) { bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_single_seg_count(i)); goto again; } pos += copied; written += copied; balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } } while (iov_iter_count(i)); return written ? written : status; } ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, loff_t *ppos, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; ssize_t status; struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written); status = generic_perform_write(file, &i, pos); if (likely(status >= 0)) { written += status; *ppos = pos + status; } return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; size_t ocount; size_t count; struct inode *inode = mapping->host; loff_t pos; ssize_t written; ssize_t err; ocount = 0; err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); if (err) return err; count = ocount; pos = *ppos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); current->backing_dev_info = mapping->backing_dev_info; written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; if (count == 0) goto out; err = file_remove_suid(file); if (err) goto out; file_update_time(file); if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; ssize_t written_buffered; written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, count, ocount); if (written < 0 || written == count) goto out; pos += written; count -= written; written_buffered = generic_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count, written); /* * If generic_file_buffered_write() retuned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ if (written_buffered < 0) { err = written_buffered; goto out; } /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ endbyte = pos + written_buffered - written - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { written = written_buffered; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); } else { /* * We don't know how much we wrote, so just return * the number of bytes which were direct-written */ } } else { written = generic_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count, written); } out: current->backing_dev_info = NULL; return written ? written : err; } EXPORT_SYMBOL(__generic_file_aio_write); ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; err = generic_write_sync(file, pos, ret); if (err < 0 && ret > 0) ret = err; } blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); int try_to_release_page(struct page *page, gfp_t gfp_mask) { struct address_space * const mapping = page->mapping; BUG_ON(!PageLocked(page)); if (PageWriteback(page)) return 0; if (mapping && mapping->a_ops->releasepage) return mapping->a_ops->releasepage(page, gfp_mask); return try_to_free_buffers(page); } EXPORT_SYMBOL(try_to_release_page);