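/*
 * linux/mm/memory.c - core virtual memory management: page table
 * construction and teardown, copy_page_range() for fork(), the zap/unmap
 * paths, get_user_pages(), remap_pfn_range() and the page fault handlers
 * (do_wp_page, do_swap_page, do_anonymous_page, __do_fault).  Blocks guarded
 * by #ifndef DDE_LINUX are compiled out when building for the DDE (Device
 * Driver Environment) port.
 */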
00041 #include <linux/kernel_stat.h>
00042 #include <linux/mm.h>
00043 #include <linux/hugetlb.h>
00044 #include <linux/mman.h>
00045 #include <linux/swap.h>
00046 #include <linux/highmem.h>
00047 #include <linux/pagemap.h>
00048 #ifndef DDE_LINUX
00049 #include <linux/rmap.h>
00050 #endif
00051 #include <linux/module.h>
00052 #include <linux/delayacct.h>
00053 #include <linux/init.h>
00054 #include <linux/writeback.h>
00055 #include <linux/memcontrol.h>
00056 #include <linux/mmu_notifier.h>
00057 #include <linux/kallsyms.h>
00058 #include <linux/swapops.h>
00059 #include <linux/elf.h>
00060
00061 #include <asm/pgalloc.h>
00062 #include <asm/uaccess.h>
00063 #include <asm/tlb.h>
00064 #include <asm/tlbflush.h>
00065 #include <asm/pgtable.h>
00066
00067 #include "internal.h"
00068
00069 #ifndef CONFIG_NEED_MULTIPLE_NODES
00070
00071 unsigned long max_mapnr;
00072 #ifndef DDE_LINUX
00073 struct page *mem_map;
00074 #endif
00075
00076 EXPORT_SYMBOL(max_mapnr);
00077 #ifndef DDE_LINUX
00078 EXPORT_SYMBOL(mem_map);
00079 #endif
00080 #endif
00081
00082 unsigned long num_physpages;
00083
00084
00085
00086
00087
00088
00089
00090 void * high_memory;
00091
00092 EXPORT_SYMBOL(num_physpages);
00093 EXPORT_SYMBOL(high_memory);
00094
00095
00096
00097
00098
00099
00100
00101 int randomize_va_space __read_mostly =
00102 #ifdef CONFIG_COMPAT_BRK
00103 1;
00104 #else
00105 2;
00106 #endif
00107
00108 #ifndef DDE_LINUX
00109 static int __init disable_randmaps(char *s)
00110 {
00111 randomize_va_space = 0;
00112 return 1;
00113 }
00114 __setup("norandmaps", disable_randmaps);
00115
00116
00117
00118
00119
00120
00121
00122
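/*
 * If a p?d entry turns out to be bad while walking the page tables, report
 * it and clear the entry so the walk can continue.
 */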
00123 void pgd_clear_bad(pgd_t *pgd)
00124 {
00125 pgd_ERROR(*pgd);
00126 pgd_clear(pgd);
00127 }
00128
00129 void pud_clear_bad(pud_t *pud)
00130 {
00131 pud_ERROR(*pud);
00132 pud_clear(pud);
00133 }
00134
00135 void pmd_clear_bad(pmd_t *pmd)
00136 {
00137 pmd_ERROR(*pmd);
00138 pmd_clear(pmd);
00139 }
00140
00141
00142
00143
00144
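/*
 * Free a pte page and clear the pmd entry that pointed to it.  This only
 * releases page-table pages; the data pages themselves must already have
 * been unmapped.
 */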
00145 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
00146 {
00147 pgtable_t token = pmd_pgtable(*pmd);
00148 pmd_clear(pmd);
00149 pte_free_tlb(tlb, token);
00150 tlb->mm->nr_ptes--;
00151 }
00152
00153 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
00154 unsigned long addr, unsigned long end,
00155 unsigned long floor, unsigned long ceiling)
00156 {
00157 pmd_t *pmd;
00158 unsigned long next;
00159 unsigned long start;
00160
00161 start = addr;
00162 pmd = pmd_offset(pud, addr);
00163 do {
00164 next = pmd_addr_end(addr, end);
00165 if (pmd_none_or_clear_bad(pmd))
00166 continue;
00167 free_pte_range(tlb, pmd);
00168 } while (pmd++, addr = next, addr != end);
00169
00170 start &= PUD_MASK;
00171 if (start < floor)
00172 return;
00173 if (ceiling) {
00174 ceiling &= PUD_MASK;
00175 if (!ceiling)
00176 return;
00177 }
00178 if (end - 1 > ceiling - 1)
00179 return;
00180
00181 pmd = pmd_offset(pud, start);
00182 pud_clear(pud);
00183 pmd_free_tlb(tlb, pmd);
00184 }
00185
00186 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
00187 unsigned long addr, unsigned long end,
00188 unsigned long floor, unsigned long ceiling)
00189 {
00190 pud_t *pud;
00191 unsigned long next;
00192 unsigned long start;
00193
00194 start = addr;
00195 pud = pud_offset(pgd, addr);
00196 do {
00197 next = pud_addr_end(addr, end);
00198 if (pud_none_or_clear_bad(pud))
00199 continue;
00200 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
00201 } while (pud++, addr = next, addr != end);
00202
00203 start &= PGDIR_MASK;
00204 if (start < floor)
00205 return;
00206 if (ceiling) {
00207 ceiling &= PGDIR_MASK;
00208 if (!ceiling)
00209 return;
00210 }
00211 if (end - 1 > ceiling - 1)
00212 return;
00213
00214 pud = pud_offset(pgd, start);
00215 pgd_clear(pgd);
00216 pud_free_tlb(tlb, pud);
00217 }
00218
00219
00220
00221
00222
00223
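/*
 * free_pgd_range() frees a range of user page tables, but preserves any
 * partial pmd/pud/pgd pages that also cover addresses below `floor' or above
 * `ceiling', since those may still be in use by neighbouring vmas.
 */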
00224 void free_pgd_range(struct mmu_gather *tlb,
00225 unsigned long addr, unsigned long end,
00226 unsigned long floor, unsigned long ceiling)
00227 {
00228 pgd_t *pgd;
00229 unsigned long next;
00230 unsigned long start;
00231
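	/*
	 * Trim the range so that the first and last partially covered page
	 * tables (those shared with addresses outside [floor, ceiling)) are
	 * kept rather than freed.
	 */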
00258 addr &= PMD_MASK;
00259 if (addr < floor) {
00260 addr += PMD_SIZE;
00261 if (!addr)
00262 return;
00263 }
00264 if (ceiling) {
00265 ceiling &= PMD_MASK;
00266 if (!ceiling)
00267 return;
00268 }
00269 if (end - 1 > ceiling - 1)
00270 end -= PMD_SIZE;
00271 if (addr > end - 1)
00272 return;
00273
00274 start = addr;
00275 pgd = pgd_offset(tlb->mm, addr);
00276 do {
00277 next = pgd_addr_end(addr, end);
00278 if (pgd_none_or_clear_bad(pgd))
00279 continue;
00280 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
00281 } while (pgd++, addr = next, addr != end);
00282 }
00283
00284 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
00285 unsigned long floor, unsigned long ceiling)
00286 {
00287 while (vma) {
00288 struct vm_area_struct *next = vma->vm_next;
00289 unsigned long addr = vma->vm_start;
00290
00291
00292
00293
00294 anon_vma_unlink(vma);
00295 unlink_file_vma(vma);
00296
00297 if (is_vm_hugetlb_page(vma)) {
00298 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
00299 floor, next? next->vm_start: ceiling);
00300 } else {
00301
00302
00303
00304 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
00305 && !is_vm_hugetlb_page(next)) {
00306 vma = next;
00307 next = vma->vm_next;
00308 anon_vma_unlink(vma);
00309 unlink_file_vma(vma);
00310 }
00311 free_pgd_range(tlb, addr, vma->vm_end,
00312 floor, next? next->vm_start: ceiling);
00313 }
00314 vma = next;
00315 }
00316 }
00317
00318 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
00319 {
00320 pgtable_t new = pte_alloc_one(mm, address);
00321 if (!new)
00322 return -ENOMEM;
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
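	/*
	 * Ensure all pte setup is visible before the pte is made visible to
	 * other CPUs by being put into page tables.
	 */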
00337 smp_wmb();
00338
00339 spin_lock(&mm->page_table_lock);
00340 if (!pmd_present(*pmd)) {
00341 mm->nr_ptes++;
00342 pmd_populate(mm, pmd, new);
00343 new = NULL;
00344 }
00345 spin_unlock(&mm->page_table_lock);
00346 if (new)
00347 pte_free(mm, new);
00348 return 0;
00349 }
00350
00351 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
00352 {
00353 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
00354 if (!new)
00355 return -ENOMEM;
00356
00357 smp_wmb();
00358
00359 spin_lock(&init_mm.page_table_lock);
00360 if (!pmd_present(*pmd)) {
00361 pmd_populate_kernel(&init_mm, pmd, new);
00362 new = NULL;
00363 }
00364 spin_unlock(&init_mm.page_table_lock);
00365 if (new)
00366 pte_free_kernel(&init_mm, new);
00367 return 0;
00368 }
00369
00370 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
00371 {
00372 if (file_rss)
00373 add_mm_counter(mm, file_rss, file_rss);
00374 if (anon_rss)
00375 add_mm_counter(mm, anon_rss, anon_rss);
00376 }
00377
00378
00379
00380
00381
00382
00383
00384
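/*
 * Report a page-table entry that should not be there, e.g. one found while
 * zapping a range.  Output is rate limited below.
 */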
00385 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
00386 pte_t pte, struct page *page)
00387 {
00388 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
00389 pud_t *pud = pud_offset(pgd, addr);
00390 pmd_t *pmd = pmd_offset(pud, addr);
00391 struct address_space *mapping;
00392 pgoff_t index;
00393 static unsigned long resume;
00394 static unsigned long nr_shown;
00395 static unsigned long nr_unshown;
00396
00397
00398
00399
00400
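	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */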
00401 if (nr_shown == 60) {
00402 if (time_before(jiffies, resume)) {
00403 nr_unshown++;
00404 return;
00405 }
00406 if (nr_unshown) {
00407 printk(KERN_ALERT
00408 "BUG: Bad page map: %lu messages suppressed\n",
00409 nr_unshown);
00410 nr_unshown = 0;
00411 }
00412 nr_shown = 0;
00413 }
00414 if (nr_shown++ == 0)
00415 resume = jiffies + 60 * HZ;
00416
00417 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
00418 index = linear_page_index(vma, addr);
00419
00420 printk(KERN_ALERT
00421 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
00422 current->comm,
00423 (long long)pte_val(pte), (long long)pmd_val(*pmd));
00424 if (page) {
00425 printk(KERN_ALERT
00426 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
00427 page, (void *)page->flags, page_count(page),
00428 page_mapcount(page), page->mapping, page->index);
00429 }
00430 printk(KERN_ALERT
00431 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
00432 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
00433
00434
00435
00436 if (vma->vm_ops)
00437 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
00438 (unsigned long)vma->vm_ops->fault);
00439 if (vma->vm_file && vma->vm_file->f_op)
00440 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
00441 (unsigned long)vma->vm_file->f_op->mmap);
00442 dump_stack();
00443 add_taint(TAINT_BAD_PAGE);
00444 }
00445
00446 static inline int is_cow_mapping(unsigned int flags)
00447 {
00448 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
00449 }
00450
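/*
 * vm_normal_page() returns the struct page behind a pte, or NULL for
 * "special" mappings that have no struct page the VM should touch (e.g. raw
 * pfn mappings set up by remap_pfn_range(), or ptes marked pte_special()).
 * With __HAVE_ARCH_PTE_SPECIAL the decision is encoded in the pte itself;
 * otherwise it is inferred from VM_PFNMAP/VM_MIXEDMAP and, for COW mappings,
 * from whether the pfn still matches the linear vm_pgoff layout.
 */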
00493 #ifdef __HAVE_ARCH_PTE_SPECIAL
00494 # define HAVE_PTE_SPECIAL 1
00495 #else
00496 # define HAVE_PTE_SPECIAL 0
00497 #endif
00498 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
00499 pte_t pte)
00500 {
00501 unsigned long pfn = pte_pfn(pte);
00502
00503 if (HAVE_PTE_SPECIAL) {
00504 if (likely(!pte_special(pte)))
00505 goto check_pfn;
00506 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
00507 print_bad_pte(vma, addr, pte, NULL);
00508 return NULL;
00509 }
00510
00511
00512
00513 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
00514 if (vma->vm_flags & VM_MIXEDMAP) {
00515 if (!pfn_valid(pfn))
00516 return NULL;
00517 goto out;
00518 } else {
00519 unsigned long off;
00520 off = (addr - vma->vm_start) >> PAGE_SHIFT;
00521 if (pfn == vma->vm_pgoff + off)
00522 return NULL;
00523 if (!is_cow_mapping(vma->vm_flags))
00524 return NULL;
00525 }
00526 }
00527
00528 check_pfn:
00529 if (unlikely(pfn > highest_memmap_pfn)) {
00530 print_bad_pte(vma, addr, pte, NULL);
00531 return NULL;
00532 }
00533 #endif
00534
00535
00536
00537
00538
00539 out:
00540 return pfn_to_page(pfn);
00541 }
00542
00543
00544
00545
00546
00547
00548
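/*
 * Copy one pte (present, swap or file entry) from the parent to the child mm
 * at `addr', applying COW write protection where needed.  Called with both
 * the source and destination pte locks held.
 */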
00549 static inline void
00550 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
00551 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
00552 unsigned long addr, int *rss)
00553 {
00554 unsigned long vm_flags = vma->vm_flags;
00555 pte_t pte = *src_pte;
00556 struct page *page;
00557
00558
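	/* pte contains position in swap or file, so copy. */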
00559 if (unlikely(!pte_present(pte))) {
00560 if (!pte_file(pte)) {
00561 swp_entry_t entry = pte_to_swp_entry(pte);
00562
00563 swap_duplicate(entry);
00564
00565 if (unlikely(list_empty(&dst_mm->mmlist))) {
00566 spin_lock(&mmlist_lock);
00567 if (list_empty(&dst_mm->mmlist))
00568 list_add(&dst_mm->mmlist,
00569 &src_mm->mmlist);
00570 spin_unlock(&mmlist_lock);
00571 }
00572 if (is_write_migration_entry(entry) &&
00573 is_cow_mapping(vm_flags)) {
00574
00575
00576
00577
00578 make_migration_entry_read(&entry);
00579 pte = swp_entry_to_pte(entry);
00580 set_pte_at(src_mm, addr, src_pte, pte);
00581 }
00582 }
00583 goto out_set_pte;
00584 }
00585
00586
00587
00588
00589
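	/*
	 * If it's a COW mapping, write protect it both in the parent and the
	 * child, so the first write faults and performs the actual copy.
	 */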
00590 if (is_cow_mapping(vm_flags)) {
00591 ptep_set_wrprotect(src_mm, addr, src_pte);
00592 pte = pte_wrprotect(pte);
00593 }
00594
00595
00596
00597
00598
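	/*
	 * If it's a shared mapping, mark it clean in the child; the child's
	 * copy also starts out "old".
	 */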
00599 if (vm_flags & VM_SHARED)
00600 pte = pte_mkclean(pte);
00601 pte = pte_mkold(pte);
00602
00603 page = vm_normal_page(vma, addr, pte);
00604 if (page) {
00605 get_page(page);
00606 page_dup_rmap(page, vma, addr);
00607 rss[!!PageAnon(page)]++;
00608 }
00609
00610 out_set_pte:
00611 set_pte_at(dst_mm, addr, dst_pte, pte);
00612 }
00613
00614 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
00615 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
00616 unsigned long addr, unsigned long end)
00617 {
00618 pte_t *src_pte, *dst_pte;
00619 spinlock_t *src_ptl, *dst_ptl;
00620 int progress = 0;
00621 int rss[2];
00622
00623 again:
00624 rss[1] = rss[0] = 0;
00625 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
00626 if (!dst_pte)
00627 return -ENOMEM;
00628 src_pte = pte_offset_map_nested(src_pmd, addr);
00629 src_ptl = pte_lockptr(src_mm, src_pmd);
00630 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
00631 arch_enter_lazy_mmu_mode();
00632
00633 do {
00634
00635
00636
00637
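		/*
		 * Two pte locks are held here, so back off periodically to
		 * keep lock hold times and scheduling latency down.
		 */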
00638 if (progress >= 32) {
00639 progress = 0;
00640 if (need_resched() ||
00641 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
00642 break;
00643 }
00644 if (pte_none(*src_pte)) {
00645 progress++;
00646 continue;
00647 }
00648 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
00649 progress += 8;
00650 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
00651
00652 arch_leave_lazy_mmu_mode();
00653 spin_unlock(src_ptl);
00654 pte_unmap_nested(src_pte - 1);
00655 add_mm_rss(dst_mm, rss[0], rss[1]);
00656 pte_unmap_unlock(dst_pte - 1, dst_ptl);
00657 cond_resched();
00658 if (addr != end)
00659 goto again;
00660 return 0;
00661 }
00662
00663 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
00664 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
00665 unsigned long addr, unsigned long end)
00666 {
00667 pmd_t *src_pmd, *dst_pmd;
00668 unsigned long next;
00669
00670 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
00671 if (!dst_pmd)
00672 return -ENOMEM;
00673 src_pmd = pmd_offset(src_pud, addr);
00674 do {
00675 next = pmd_addr_end(addr, end);
00676 if (pmd_none_or_clear_bad(src_pmd))
00677 continue;
00678 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
00679 vma, addr, next))
00680 return -ENOMEM;
00681 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
00682 return 0;
00683 }
00684
00685 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
00686 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
00687 unsigned long addr, unsigned long end)
00688 {
00689 pud_t *src_pud, *dst_pud;
00690 unsigned long next;
00691
00692 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
00693 if (!dst_pud)
00694 return -ENOMEM;
00695 src_pud = pud_offset(src_pgd, addr);
00696 do {
00697 next = pud_addr_end(addr, end);
00698 if (pud_none_or_clear_bad(src_pud))
00699 continue;
00700 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
00701 vma, addr, next))
00702 return -ENOMEM;
00703 } while (dst_pud++, src_pud++, addr = next, addr != end);
00704 return 0;
00705 }
00706
00707 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
00708 struct vm_area_struct *vma)
00709 {
00710 pgd_t *src_pgd, *dst_pgd;
00711 unsigned long next;
00712 unsigned long addr = vma->vm_start;
00713 unsigned long end = vma->vm_end;
00714 int ret;
00715
00716
00717
00718
00719
00720
00721
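	/*
	 * Don't copy ptes where a page fault will fill them in correctly
	 * later; fork becomes much lighter for big shared or read-only file
	 * mappings.  If the vma has none of the special flags below and no
	 * anonymous pages, skip the copy entirely.
	 */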
00722 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
00723 if (!vma->anon_vma)
00724 return 0;
00725 }
00726
00727 if (is_vm_hugetlb_page(vma))
00728 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
00729
00730 if (unlikely(is_pfn_mapping(vma))) {
00731
00732
00733
00734
00735 ret = track_pfn_vma_copy(vma);
00736 if (ret)
00737 return ret;
00738 }
00739
00740
00741
00742
00743
00744
00745
00746 if (is_cow_mapping(vma->vm_flags))
00747 mmu_notifier_invalidate_range_start(src_mm, addr, end);
00748
00749 ret = 0;
00750 dst_pgd = pgd_offset(dst_mm, addr);
00751 src_pgd = pgd_offset(src_mm, addr);
00752 do {
00753 next = pgd_addr_end(addr, end);
00754 if (pgd_none_or_clear_bad(src_pgd))
00755 continue;
00756 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
00757 vma, addr, next))) {
00758 ret = -ENOMEM;
00759 break;
00760 }
00761 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
00762
00763 if (is_cow_mapping(vma->vm_flags))
00764 mmu_notifier_invalidate_range_end(src_mm,
00765 vma->vm_start, end);
00766 return ret;
00767 }
00768
00769 static unsigned long zap_pte_range(struct mmu_gather *tlb,
00770 struct vm_area_struct *vma, pmd_t *pmd,
00771 unsigned long addr, unsigned long end,
00772 long *zap_work, struct zap_details *details)
00773 {
00774 struct mm_struct *mm = tlb->mm;
00775 pte_t *pte;
00776 spinlock_t *ptl;
00777 int file_rss = 0;
00778 int anon_rss = 0;
00779
00780 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
00781 arch_enter_lazy_mmu_mode();
00782 do {
00783 pte_t ptent = *pte;
00784 if (pte_none(ptent)) {
00785 (*zap_work)--;
00786 continue;
00787 }
00788
00789 (*zap_work) -= PAGE_SIZE;
00790
00791 if (pte_present(ptent)) {
00792 struct page *page;
00793
00794 page = vm_normal_page(vma, addr, ptent);
00795 if (unlikely(details) && page) {
00796
00797
00798
00799
00800
00801 if (details->check_mapping &&
00802 details->check_mapping != page->mapping)
00803 continue;
00804
00805
00806
00807
00808 if (details->nonlinear_vma &&
00809 (page->index < details->first_index ||
00810 page->index > details->last_index))
00811 continue;
00812 }
00813 ptent = ptep_get_and_clear_full(mm, addr, pte,
00814 tlb->fullmm);
00815 tlb_remove_tlb_entry(tlb, pte, addr);
00816 if (unlikely(!page))
00817 continue;
00818 if (unlikely(details) && details->nonlinear_vma
00819 && linear_page_index(details->nonlinear_vma,
00820 addr) != page->index)
00821 set_pte_at(mm, addr, pte,
00822 pgoff_to_pte(page->index));
00823 if (PageAnon(page))
00824 anon_rss--;
00825 else {
00826 if (pte_dirty(ptent))
00827 set_page_dirty(page);
00828 if (pte_young(ptent) &&
00829 likely(!VM_SequentialReadHint(vma)))
00830 mark_page_accessed(page);
00831 file_rss--;
00832 }
00833 page_remove_rmap(page);
00834 if (unlikely(page_mapcount(page) < 0))
00835 print_bad_pte(vma, addr, ptent, page);
00836 tlb_remove_page(tlb, page);
00837 continue;
00838 }
00839
00840
00841
00842
00843 if (unlikely(details))
00844 continue;
00845 if (pte_file(ptent)) {
00846 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
00847 print_bad_pte(vma, addr, ptent, NULL);
00848 } else if
00849 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
00850 print_bad_pte(vma, addr, ptent, NULL);
00851 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
00852 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
00853
00854 add_mm_rss(mm, file_rss, anon_rss);
00855 arch_leave_lazy_mmu_mode();
00856 pte_unmap_unlock(pte - 1, ptl);
00857
00858 return addr;
00859 }
00860
00861 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
00862 struct vm_area_struct *vma, pud_t *pud,
00863 unsigned long addr, unsigned long end,
00864 long *zap_work, struct zap_details *details)
00865 {
00866 pmd_t *pmd;
00867 unsigned long next;
00868
00869 pmd = pmd_offset(pud, addr);
00870 do {
00871 next = pmd_addr_end(addr, end);
00872 if (pmd_none_or_clear_bad(pmd)) {
00873 (*zap_work)--;
00874 continue;
00875 }
00876 next = zap_pte_range(tlb, vma, pmd, addr, next,
00877 zap_work, details);
00878 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
00879
00880 return addr;
00881 }
00882
00883 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
00884 struct vm_area_struct *vma, pgd_t *pgd,
00885 unsigned long addr, unsigned long end,
00886 long *zap_work, struct zap_details *details)
00887 {
00888 pud_t *pud;
00889 unsigned long next;
00890
00891 pud = pud_offset(pgd, addr);
00892 do {
00893 next = pud_addr_end(addr, end);
00894 if (pud_none_or_clear_bad(pud)) {
00895 (*zap_work)--;
00896 continue;
00897 }
00898 next = zap_pmd_range(tlb, vma, pud, addr, next,
00899 zap_work, details);
00900 } while (pud++, addr = next, (addr != end && *zap_work > 0));
00901
00902 return addr;
00903 }
00904
00905 static unsigned long unmap_page_range(struct mmu_gather *tlb,
00906 struct vm_area_struct *vma,
00907 unsigned long addr, unsigned long end,
00908 long *zap_work, struct zap_details *details)
00909 {
00910 pgd_t *pgd;
00911 unsigned long next;
00912
00913 if (details && !details->check_mapping && !details->nonlinear_vma)
00914 details = NULL;
00915
00916 BUG_ON(addr >= end);
00917 tlb_start_vma(tlb, vma);
00918 pgd = pgd_offset(vma->vm_mm, addr);
00919 do {
00920 next = pgd_addr_end(addr, end);
00921 if (pgd_none_or_clear_bad(pgd)) {
00922 (*zap_work)--;
00923 continue;
00924 }
00925 next = zap_pud_range(tlb, vma, pgd, addr, next,
00926 zap_work, details);
00927 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
00928 tlb_end_vma(tlb, vma);
00929
00930 return addr;
00931 }
00932
00933 #ifdef CONFIG_PREEMPT
00934 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
00935 #else
00936
00937 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
00938 #endif
00939
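/*
 * unmap_vmas - unmap a range of memory covered by a list of vmas, returning
 * the end address reached (a restart point if it had to stop early).  To
 * bound scheduling latency, the mmu_gather is flushed and restarted roughly
 * every ZAP_BLOCK_SIZE bytes, so *tlbp may change across the call; it is set
 * to NULL if the caller's i_mmap_lock needed breaking.
 */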
00966 unsigned long unmap_vmas(struct mmu_gather **tlbp,
00967 struct vm_area_struct *vma, unsigned long start_addr,
00968 unsigned long end_addr, unsigned long *nr_accounted,
00969 struct zap_details *details)
00970 {
00971 long zap_work = ZAP_BLOCK_SIZE;
00972 unsigned long tlb_start = 0;
00973 int tlb_start_valid = 0;
00974 unsigned long start = start_addr;
00975 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
00976 int fullmm = (*tlbp)->fullmm;
00977 struct mm_struct *mm = vma->vm_mm;
00978
00979 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
00980 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
00981 unsigned long end;
00982
00983 start = max(vma->vm_start, start_addr);
00984 if (start >= vma->vm_end)
00985 continue;
00986 end = min(vma->vm_end, end_addr);
00987 if (end <= vma->vm_start)
00988 continue;
00989
00990 if (vma->vm_flags & VM_ACCOUNT)
00991 *nr_accounted += (end - start) >> PAGE_SHIFT;
00992
00993 if (unlikely(is_pfn_mapping(vma)))
00994 untrack_pfn_vma(vma, 0, 0);
00995
00996 while (start != end) {
00997 if (!tlb_start_valid) {
00998 tlb_start = start;
00999 tlb_start_valid = 1;
01000 }
01001
01002 if (unlikely(is_vm_hugetlb_page(vma))) {
01003
01004
01005
01006
01007
01008
01009
01010
01011
01012
01013
01014 if (vma->vm_file) {
01015 unmap_hugepage_range(vma, start, end, NULL);
01016 zap_work -= (end - start) /
01017 pages_per_huge_page(hstate_vma(vma));
01018 }
01019
01020 start = end;
01021 } else
01022 start = unmap_page_range(*tlbp, vma,
01023 start, end, &zap_work, details);
01024
01025 if (zap_work > 0) {
01026 BUG_ON(start != end);
01027 break;
01028 }
01029
01030 tlb_finish_mmu(*tlbp, tlb_start, start);
01031
01032 if (need_resched() ||
01033 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
01034 if (i_mmap_lock) {
01035 *tlbp = NULL;
01036 goto out;
01037 }
01038 cond_resched();
01039 }
01040
01041 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
01042 tlb_start_valid = 0;
01043 zap_work = ZAP_BLOCK_SIZE;
01044 }
01045 }
01046 out:
01047 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
01048 return start;
01049 }
01050
01051
01052
01053
01054
01055
01056
01057
01058 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
01059 unsigned long size, struct zap_details *details)
01060 {
01061 struct mm_struct *mm = vma->vm_mm;
01062 struct mmu_gather *tlb;
01063 unsigned long end = address + size;
01064 unsigned long nr_accounted = 0;
01065
01066 lru_add_drain();
01067 tlb = tlb_gather_mmu(mm, 0);
01068 update_hiwater_rss(mm);
01069 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
01070 if (tlb)
01071 tlb_finish_mmu(tlb, address, end);
01072 return end;
01073 }
01074
01075
01076
01077
01078
01079
01080
01081
01082
01083
01084
01085
01086
01087 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
01088 unsigned long size)
01089 {
01090 if (address < vma->vm_start || address + size > vma->vm_end ||
01091 !(vma->vm_flags & VM_PFNMAP))
01092 return -1;
01093 zap_page_range(vma, address, size, NULL);
01094 return 0;
01095 }
01096 EXPORT_SYMBOL_GPL(zap_vma_ptes);
01097
01098
01099
01100
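/*
 * follow_page - look up the page mapped at a user virtual address.  Returns
 * the struct page, NULL if nothing usable is mapped, or an ERR_PTR for a pte
 * with no valid struct page.  FOLL_GET takes a reference on the returned
 * page, FOLL_TOUCH marks it accessed (and dirty for writes), FOLL_WRITE
 * requires a writable pte.
 */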
01101 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
01102 unsigned int flags)
01103 {
01104 pgd_t *pgd;
01105 pud_t *pud;
01106 pmd_t *pmd;
01107 pte_t *ptep, pte;
01108 spinlock_t *ptl;
01109 struct page *page;
01110 struct mm_struct *mm = vma->vm_mm;
01111
01112 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
01113 if (!IS_ERR(page)) {
01114 BUG_ON(flags & FOLL_GET);
01115 goto out;
01116 }
01117
01118 page = NULL;
01119 pgd = pgd_offset(mm, address);
01120 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
01121 goto no_page_table;
01122
01123 pud = pud_offset(pgd, address);
01124 if (pud_none(*pud))
01125 goto no_page_table;
01126 if (pud_huge(*pud)) {
01127 BUG_ON(flags & FOLL_GET);
01128 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
01129 goto out;
01130 }
01131 if (unlikely(pud_bad(*pud)))
01132 goto no_page_table;
01133
01134 pmd = pmd_offset(pud, address);
01135 if (pmd_none(*pmd))
01136 goto no_page_table;
01137 if (pmd_huge(*pmd)) {
01138 BUG_ON(flags & FOLL_GET);
01139 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
01140 goto out;
01141 }
01142 if (unlikely(pmd_bad(*pmd)))
01143 goto no_page_table;
01144
01145 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
01146
01147 pte = *ptep;
01148 if (!pte_present(pte))
01149 goto no_page;
01150 if ((flags & FOLL_WRITE) && !pte_write(pte))
01151 goto unlock;
01152 page = vm_normal_page(vma, address, pte);
01153 if (unlikely(!page))
01154 goto bad_page;
01155
01156 if (flags & FOLL_GET)
01157 get_page(page);
01158 if (flags & FOLL_TOUCH) {
01159 if ((flags & FOLL_WRITE) &&
01160 !pte_dirty(pte) && !PageDirty(page))
01161 set_page_dirty(page);
01162 mark_page_accessed(page);
01163 }
01164 unlock:
01165 pte_unmap_unlock(ptep, ptl);
01166 out:
01167 return page;
01168
01169 bad_page:
01170 pte_unmap_unlock(ptep, ptl);
01171 return ERR_PTR(-EFAULT);
01172
01173 no_page:
01174 pte_unmap_unlock(ptep, ptl);
01175 if (!pte_none(pte))
01176 return page;
01177
01178 no_page_table:
01179
01180
01181
01182
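	/*
	 * Callers that pass FOLL_ANON (e.g. core dumps of untouched anonymous
	 * areas) get the shared zero page back here instead of faulting real
	 * pages in.
	 */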
01183 if (flags & FOLL_ANON) {
01184 page = ZERO_PAGE(0);
01185 if (flags & FOLL_GET)
01186 get_page(page);
01187 BUG_ON(flags & FOLL_WRITE);
01188 }
01189 return page;
01190 }
01191
01192
01193 static inline int use_zero_page(struct vm_area_struct *vma)
01194 {
01195
01196
01197
01198
01199
01200
01201
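	/*
	 * Don't hand out the zero page for mlocked or shared mappings: mlock
	 * needs real pages made present, and a shared mapping must reflect
	 * the backing object rather than a private zero page.
	 */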
01202 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
01203 return 0;
01204
01205
01206
01207 return !vma->vm_ops || !vma->vm_ops->fault;
01208 }
01209
01210
01211
01212 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
01213 unsigned long start, int len, int flags,
01214 struct page **pages, struct vm_area_struct **vmas)
01215 {
01216 int i;
01217 unsigned int vm_flags = 0;
01218 int write = !!(flags & GUP_FLAGS_WRITE);
01219 int force = !!(flags & GUP_FLAGS_FORCE);
01220 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
01221 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
01222
01223 if (len <= 0)
01224 return 0;
01225
01226
01227
01228
01229 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
01230 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
01231 i = 0;
01232
01233 do {
01234 struct vm_area_struct *vma;
01235 unsigned int foll_flags;
01236
01237 vma = find_extend_vma(mm, start);
01238 if (!vma && in_gate_area(tsk, start)) {
01239 unsigned long pg = start & PAGE_MASK;
01240 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
01241 pgd_t *pgd;
01242 pud_t *pud;
01243 pmd_t *pmd;
01244 pte_t *pte;
01245
01246
01247 if (!ignore && write)
01248 return i ? : -EFAULT;
01249 if (pg > TASK_SIZE)
01250 pgd = pgd_offset_k(pg);
01251 else
01252 pgd = pgd_offset_gate(mm, pg);
01253 BUG_ON(pgd_none(*pgd));
01254 pud = pud_offset(pgd, pg);
01255 BUG_ON(pud_none(*pud));
01256 pmd = pmd_offset(pud, pg);
01257 if (pmd_none(*pmd))
01258 return i ? : -EFAULT;
01259 pte = pte_offset_map(pmd, pg);
01260 if (pte_none(*pte)) {
01261 pte_unmap(pte);
01262 return i ? : -EFAULT;
01263 }
01264 if (pages) {
01265 struct page *page = vm_normal_page(gate_vma, start, *pte);
01266 pages[i] = page;
01267 if (page)
01268 get_page(page);
01269 }
01270 pte_unmap(pte);
01271 if (vmas)
01272 vmas[i] = gate_vma;
01273 i++;
01274 start += PAGE_SIZE;
01275 len--;
01276 continue;
01277 }
01278
01279 if (!vma ||
01280 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
01281 (!ignore && !(vm_flags & vma->vm_flags)))
01282 return i ? : -EFAULT;
01283
01284 if (is_vm_hugetlb_page(vma)) {
01285 i = follow_hugetlb_page(mm, vma, pages, vmas,
01286 &start, &len, i, write);
01287 continue;
01288 }
01289
01290 foll_flags = FOLL_TOUCH;
01291 if (pages)
01292 foll_flags |= FOLL_GET;
01293 if (!write && use_zero_page(vma))
01294 foll_flags |= FOLL_ANON;
01295
01296 do {
01297 struct page *page;
01298
01299
01300
01301
01302
01303
01304
01305
01306 if (unlikely(!ignore_sigkill &&
01307 fatal_signal_pending(current)))
01308 return i ? i : -ERESTARTSYS;
01309
01310 if (write)
01311 foll_flags |= FOLL_WRITE;
01312
01313 cond_resched();
01314 while (!(page = follow_page(vma, start, foll_flags))) {
01315 int ret;
01316 ret = handle_mm_fault(mm, vma, start,
01317 foll_flags & FOLL_WRITE);
01318 if (ret & VM_FAULT_ERROR) {
01319 if (ret & VM_FAULT_OOM)
01320 return i ? i : -ENOMEM;
01321 else if (ret & VM_FAULT_SIGBUS)
01322 return i ? i : -EFAULT;
01323 BUG();
01324 }
01325 if (ret & VM_FAULT_MAJOR)
01326 tsk->maj_flt++;
01327 else
01328 tsk->min_flt++;
01329
01330
01331
01332
01333
01334
01335
01336
01337
01338
01339
01340
01341
01342 if ((ret & VM_FAULT_WRITE) &&
01343 !(vma->vm_flags & VM_WRITE))
01344 foll_flags &= ~FOLL_WRITE;
01345
01346 cond_resched();
01347 }
01348 if (IS_ERR(page))
01349 return i ? i : PTR_ERR(page);
01350 if (pages) {
01351 pages[i] = page;
01352
01353 flush_anon_page(vma, page, start);
01354 flush_dcache_page(page);
01355 }
01356 if (vmas)
01357 vmas[i] = vma;
01358 i++;
01359 start += PAGE_SIZE;
01360 len--;
01361 } while (len && start < vma->vm_end);
01362 } while (len);
01363 return i;
01364 }
01365
01366 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
01367 unsigned long start, int len, int write, int force,
01368 struct page **pages, struct vm_area_struct **vmas)
01369 {
01370 int flags = 0;
01371
01372 if (write)
01373 flags |= GUP_FLAGS_WRITE;
01374 if (force)
01375 flags |= GUP_FLAGS_FORCE;
01376
01377 return __get_user_pages(tsk, mm,
01378 start, len, flags,
01379 pages, vmas);
01380 }
01381
01382 EXPORT_SYMBOL(get_user_pages);
01383
01384 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
01385 spinlock_t **ptl)
01386 {
01387 pgd_t * pgd = pgd_offset(mm, addr);
01388 pud_t * pud = pud_alloc(mm, pgd, addr);
01389 if (pud) {
01390 pmd_t * pmd = pmd_alloc(mm, pud, addr);
01391 if (pmd)
01392 return pte_alloc_map_lock(mm, pmd, addr, ptl);
01393 }
01394 return NULL;
01395 }
01396
01397
01398
01399
01400
01401
01402
01403
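/*
 * insert_page() is the old fallback for remapping pages into user space: it
 * installs a single caller-owned page (never an anonymous page) at `addr',
 * taking a reference and accounting it as file_rss.
 */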
01404 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
01405 struct page *page, pgprot_t prot)
01406 {
01407 struct mm_struct *mm = vma->vm_mm;
01408 int retval;
01409 pte_t *pte;
01410 spinlock_t *ptl;
01411
01412 retval = -EINVAL;
01413 if (PageAnon(page))
01414 goto out;
01415 retval = -ENOMEM;
01416 flush_dcache_page(page);
01417 pte = get_locked_pte(mm, addr, &ptl);
01418 if (!pte)
01419 goto out;
01420 retval = -EBUSY;
01421 if (!pte_none(*pte))
01422 goto out_unlock;
01423
01424
01425 get_page(page);
01426 inc_mm_counter(mm, file_rss);
01427 page_add_file_rmap(page);
01428 set_pte_at(mm, addr, pte, mk_pte(page, prot));
01429
01430 retval = 0;
01431 pte_unmap_unlock(pte, ptl);
01432 return retval;
01433 out_unlock:
01434 pte_unmap_unlock(pte, ptl);
01435 out:
01436 return retval;
01437 }
01438
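/*
 * vm_insert_page - insert an individual kernel-allocated page into a user
 * vma at the given address.  The caller keeps ownership of the page; the vma
 * is marked VM_INSERTPAGE so that fork and vm_normal_page() handle the
 * mapping correctly.
 */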
01461 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
01462 struct page *page)
01463 {
01464 if (addr < vma->vm_start || addr >= vma->vm_end)
01465 return -EFAULT;
01466 if (!page_count(page))
01467 return -EINVAL;
01468 vma->vm_flags |= VM_INSERTPAGE;
01469 return insert_page(vma, addr, page, vma->vm_page_prot);
01470 }
01471 EXPORT_SYMBOL(vm_insert_page);
01472
01473 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
01474 unsigned long pfn, pgprot_t prot)
01475 {
01476 struct mm_struct *mm = vma->vm_mm;
01477 int retval;
01478 pte_t *pte, entry;
01479 spinlock_t *ptl;
01480
01481 retval = -ENOMEM;
01482 pte = get_locked_pte(mm, addr, &ptl);
01483 if (!pte)
01484 goto out;
01485 retval = -EBUSY;
01486 if (!pte_none(*pte))
01487 goto out_unlock;
01488
01489
01490 entry = pte_mkspecial(pfn_pte(pfn, prot));
01491 set_pte_at(mm, addr, pte, entry);
01492 update_mmu_cache(vma, addr, entry);
01493
01494 retval = 0;
01495 out_unlock:
01496 pte_unmap_unlock(pte, ptl);
01497 out:
01498 return retval;
01499 }
01500
01501
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513
01514
01515
01516
01517
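/*
 * vm_insert_pfn - install a single raw pfn into a VM_PFNMAP (or VM_MIXEDMAP)
 * vma as a "special" pte with no struct page semantics; the one-page
 * counterpart of remap_pfn_range(), typically used from a ->fault handler.
 */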
01518 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
01519 unsigned long pfn)
01520 {
01521 int ret;
01522 pgprot_t pgprot = vma->vm_page_prot;
01523
01524
01525
01526
01527
01528
01529 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
01530 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
01531 (VM_PFNMAP|VM_MIXEDMAP));
01532 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
01533 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
01534
01535 if (addr < vma->vm_start || addr >= vma->vm_end)
01536 return -EFAULT;
01537 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
01538 return -EINVAL;
01539
01540 ret = insert_pfn(vma, addr, pfn, pgprot);
01541
01542 if (ret)
01543 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
01544
01545 return ret;
01546 }
01547 EXPORT_SYMBOL(vm_insert_pfn);
01548
01549 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
01550 unsigned long pfn)
01551 {
01552 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
01553
01554 if (addr < vma->vm_start || addr >= vma->vm_end)
01555 return -EFAULT;
01556
01557
01558
01559
01560
01561
01562
01563 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
01564 struct page *page;
01565
01566 page = pfn_to_page(pfn);
01567 return insert_page(vma, addr, page, vma->vm_page_prot);
01568 }
01569 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
01570 }
01571 EXPORT_SYMBOL(vm_insert_mixed);
01572
01573
01574
01575
01576
01577
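/*
 * remap_pte_range/remap_pmd_range/remap_pud_range each walk one page-table
 * level, installing special pfn ptes for a physically contiguous range.
 * Every target pte must be none on entry.
 */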
01578 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
01579 unsigned long addr, unsigned long end,
01580 unsigned long pfn, pgprot_t prot)
01581 {
01582 pte_t *pte;
01583 spinlock_t *ptl;
01584
01585 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
01586 if (!pte)
01587 return -ENOMEM;
01588 arch_enter_lazy_mmu_mode();
01589 do {
01590 BUG_ON(!pte_none(*pte));
01591 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
01592 pfn++;
01593 } while (pte++, addr += PAGE_SIZE, addr != end);
01594 arch_leave_lazy_mmu_mode();
01595 pte_unmap_unlock(pte - 1, ptl);
01596 return 0;
01597 }
01598
01599 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
01600 unsigned long addr, unsigned long end,
01601 unsigned long pfn, pgprot_t prot)
01602 {
01603 pmd_t *pmd;
01604 unsigned long next;
01605
01606 pfn -= addr >> PAGE_SHIFT;
01607 pmd = pmd_alloc(mm, pud, addr);
01608 if (!pmd)
01609 return -ENOMEM;
01610 do {
01611 next = pmd_addr_end(addr, end);
01612 if (remap_pte_range(mm, pmd, addr, next,
01613 pfn + (addr >> PAGE_SHIFT), prot))
01614 return -ENOMEM;
01615 } while (pmd++, addr = next, addr != end);
01616 return 0;
01617 }
01618
01619 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
01620 unsigned long addr, unsigned long end,
01621 unsigned long pfn, pgprot_t prot)
01622 {
01623 pud_t *pud;
01624 unsigned long next;
01625
01626 pfn -= addr >> PAGE_SHIFT;
01627 pud = pud_alloc(mm, pgd, addr);
01628 if (!pud)
01629 return -ENOMEM;
01630 do {
01631 next = pud_addr_end(addr, end);
01632 if (remap_pmd_range(mm, pud, addr, next,
01633 pfn + (addr >> PAGE_SHIFT), prot))
01634 return -ENOMEM;
01635 } while (pud++, addr = next, addr != end);
01636 return 0;
01637 }
01638
01639
01640
01641
01642
01643
01644
01645
01646
01647
01648
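/*
 * remap_pfn_range - map a contiguous range of physical page frames into a
 * user vma.  The vma becomes VM_IO | VM_RESERVED | VM_PFNMAP; a COW-able
 * private mapping is only accepted when it covers the whole vma, in which
 * case vm_pgoff records the first pfn so vm_normal_page() can tell remapped
 * pfns from genuinely COWed pages.
 */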
01649 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
01650 unsigned long pfn, unsigned long size, pgprot_t prot)
01651 {
01652 pgd_t *pgd;
01653 unsigned long next;
01654 unsigned long end = addr + PAGE_ALIGN(size);
01655 struct mm_struct *mm = vma->vm_mm;
01656 int err;
01657
01658
01659
01660
01661
01662
01663
01664
01665
01666
01667
01668
01669
01670
01671
01672
01673
01674
01675
01676 if (addr == vma->vm_start && end == vma->vm_end)
01677 vma->vm_pgoff = pfn;
01678 else if (is_cow_mapping(vma->vm_flags))
01679 return -EINVAL;
01680
01681 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
01682
01683 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
01684 if (err) {
01685
01686
01687
01688
01689 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
01690 return -EINVAL;
01691 }
01692
01693 BUG_ON(addr >= end);
01694 pfn -= addr >> PAGE_SHIFT;
01695 pgd = pgd_offset(mm, addr);
01696 flush_cache_range(vma, addr, end);
01697 do {
01698 next = pgd_addr_end(addr, end);
01699 err = remap_pud_range(mm, pgd, addr, next,
01700 pfn + (addr >> PAGE_SHIFT), prot);
01701 if (err)
01702 break;
01703 } while (pgd++, addr = next, addr != end);
01704
01705 if (err)
01706 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
01707
01708 return err;
01709 }
01710 EXPORT_SYMBOL(remap_pfn_range);
01711
01712 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
01713 unsigned long addr, unsigned long end,
01714 pte_fn_t fn, void *data)
01715 {
01716 pte_t *pte;
01717 int err;
01718 pgtable_t token;
01719 spinlock_t *uninitialized_var(ptl);
01720
01721 pte = (mm == &init_mm) ?
01722 pte_alloc_kernel(pmd, addr) :
01723 pte_alloc_map_lock(mm, pmd, addr, &ptl);
01724 if (!pte)
01725 return -ENOMEM;
01726
01727 BUG_ON(pmd_huge(*pmd));
01728
01729 arch_enter_lazy_mmu_mode();
01730
01731 token = pmd_pgtable(*pmd);
01732
01733 do {
01734 err = fn(pte, token, addr, data);
01735 if (err)
01736 break;
01737 } while (pte++, addr += PAGE_SIZE, addr != end);
01738
01739 arch_leave_lazy_mmu_mode();
01740
01741 if (mm != &init_mm)
01742 pte_unmap_unlock(pte-1, ptl);
01743 return err;
01744 }
01745
01746 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
01747 unsigned long addr, unsigned long end,
01748 pte_fn_t fn, void *data)
01749 {
01750 pmd_t *pmd;
01751 unsigned long next;
01752 int err;
01753
01754 BUG_ON(pud_huge(*pud));
01755
01756 pmd = pmd_alloc(mm, pud, addr);
01757 if (!pmd)
01758 return -ENOMEM;
01759 do {
01760 next = pmd_addr_end(addr, end);
01761 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
01762 if (err)
01763 break;
01764 } while (pmd++, addr = next, addr != end);
01765 return err;
01766 }
01767
01768 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
01769 unsigned long addr, unsigned long end,
01770 pte_fn_t fn, void *data)
01771 {
01772 pud_t *pud;
01773 unsigned long next;
01774 int err;
01775
01776 pud = pud_alloc(mm, pgd, addr);
01777 if (!pud)
01778 return -ENOMEM;
01779 do {
01780 next = pud_addr_end(addr, end);
01781 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
01782 if (err)
01783 break;
01784 } while (pud++, addr = next, addr != end);
01785 return err;
01786 }
01787
01788
01789
01790
01791
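/*
 * apply_to_page_range - scan a region of virtual memory, filling in page
 * tables as necessary, and call fn(pte, token, addr, data) for each pte in
 * the range.
 */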
01792 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
01793 unsigned long size, pte_fn_t fn, void *data)
01794 {
01795 pgd_t *pgd;
01796 unsigned long next;
01797 unsigned long start = addr, end = addr + size;
01798 int err;
01799
01800 BUG_ON(addr >= end);
01801 mmu_notifier_invalidate_range_start(mm, start, end);
01802 pgd = pgd_offset(mm, addr);
01803 do {
01804 next = pgd_addr_end(addr, end);
01805 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
01806 if (err)
01807 break;
01808 } while (pgd++, addr = next, addr != end);
01809 mmu_notifier_invalidate_range_end(mm, start, end);
01810 return err;
01811 }
01812 EXPORT_SYMBOL_GPL(apply_to_page_range);
01813
01814
01815
01816
01817
01818
01819
01820
01821
01822
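/*
 * handle_pte_fault reads the pte without holding its lock.  Where a pte is
 * wider than a machine word and so cannot be read atomically, the handlers
 * use pte_unmap_same() to re-check, under the lock, that the entry has not
 * changed underneath them before committing to anything.
 */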
01823 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
01824 pte_t *page_table, pte_t orig_pte)
01825 {
01826 int same = 1;
01827 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
01828 if (sizeof(pte_t) > sizeof(unsigned long)) {
01829 spinlock_t *ptl = pte_lockptr(mm, pmd);
01830 spin_lock(ptl);
01831 same = pte_same(*page_table, orig_pte);
01832 spin_unlock(ptl);
01833 }
01834 #endif
01835 pte_unmap(page_table);
01836 return same;
01837 }
01838
01839
01840
01841
01842
01843
01844
01845 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
01846 {
01847 if (likely(vma->vm_flags & VM_WRITE))
01848 pte = pte_mkwrite(pte);
01849 return pte;
01850 }
01851
01852 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
01853 {
01854
01855
01856
01857
01858
01859
01860 if (unlikely(!src)) {
01861 void *kaddr = kmap_atomic(dst, KM_USER0);
01862 void __user *uaddr = (void __user *)(va & PAGE_MASK);
01863
01864
01865
01866
01867
01868
01869
01870 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
01871 memset(kaddr, 0, PAGE_SIZE);
01872 kunmap_atomic(kaddr, KM_USER0);
01873 flush_dcache_page(dst);
01874 } else
01875 copy_user_highpage(dst, src, va, vma);
01876 }
01877
01878
01879
01880
01881
01882
01883
01884
01885
01886
01887
01888
01889
01890
01891
01892
01893
01894
01895
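/*
 * do_wp_page() handles a write fault on a present but write-protected pte:
 * the copy-on-write case.  If the old page can be reused (an anonymous page
 * with no other users, or a shared writable mapping, possibly after asking
 * the filesystem via ->page_mkwrite) the pte is simply made writable;
 * otherwise a new page is allocated, the contents copied, and the pte
 * switched over.  Called with the pte lock held; returns with it released.
 */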
01896 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
01897 unsigned long address, pte_t *page_table, pmd_t *pmd,
01898 spinlock_t *ptl, pte_t orig_pte)
01899 {
01900 struct page *old_page, *new_page;
01901 pte_t entry;
01902 int reuse = 0, ret = 0;
01903 int page_mkwrite = 0;
01904 struct page *dirty_page = NULL;
01905
01906 old_page = vm_normal_page(vma, address, orig_pte);
01907 if (!old_page) {
01908
01909
01910
01911
01912
01913
01914
01915 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
01916 (VM_WRITE|VM_SHARED))
01917 goto reuse;
01918 goto gotten;
01919 }
01920
01921
01922
01923
01924
01925 if (PageAnon(old_page)) {
01926 if (!trylock_page(old_page)) {
01927 page_cache_get(old_page);
01928 pte_unmap_unlock(page_table, ptl);
01929 lock_page(old_page);
01930 page_table = pte_offset_map_lock(mm, pmd, address,
01931 &ptl);
01932 if (!pte_same(*page_table, orig_pte)) {
01933 unlock_page(old_page);
01934 page_cache_release(old_page);
01935 goto unlock;
01936 }
01937 page_cache_release(old_page);
01938 }
01939 reuse = reuse_swap_page(old_page);
01940 unlock_page(old_page);
01941 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
01942 (VM_WRITE|VM_SHARED))) {
01943
01944
01945
01946
01947
01948 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
01949
01950
01951
01952
01953
01954
01955
01956
01957 page_cache_get(old_page);
01958 pte_unmap_unlock(page_table, ptl);
01959
01960 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
01961 goto unwritable_page;
01962
01963
01964
01965
01966
01967
01968
01969 page_table = pte_offset_map_lock(mm, pmd, address,
01970 &ptl);
01971 page_cache_release(old_page);
01972 if (!pte_same(*page_table, orig_pte))
01973 goto unlock;
01974
01975 page_mkwrite = 1;
01976 }
01977 dirty_page = old_page;
01978 get_page(dirty_page);
01979 reuse = 1;
01980 }
01981
01982 if (reuse) {
01983 reuse:
01984 flush_cache_page(vma, address, pte_pfn(orig_pte));
01985 entry = pte_mkyoung(orig_pte);
01986 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
01987 if (ptep_set_access_flags(vma, address, page_table, entry,1))
01988 update_mmu_cache(vma, address, entry);
01989 ret |= VM_FAULT_WRITE;
01990 goto unlock;
01991 }
01992
01993
01994
01995
01996 page_cache_get(old_page);
01997 gotten:
01998 pte_unmap_unlock(page_table, ptl);
01999
02000 if (unlikely(anon_vma_prepare(vma)))
02001 goto oom;
02002 VM_BUG_ON(old_page == ZERO_PAGE(0));
02003 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
02004 if (!new_page)
02005 goto oom;
02006
02007
02008
02009
02010 if ((vma->vm_flags & VM_LOCKED) && old_page) {
02011 lock_page(old_page);
02012 clear_page_mlock(old_page);
02013 unlock_page(old_page);
02014 }
02015 cow_user_page(new_page, old_page, address, vma);
02016 __SetPageUptodate(new_page);
02017
02018 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
02019 goto oom_free_new;
02020
02021
02022
02023
02024 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
02025 if (likely(pte_same(*page_table, orig_pte))) {
02026 if (old_page) {
02027 if (!PageAnon(old_page)) {
02028 dec_mm_counter(mm, file_rss);
02029 inc_mm_counter(mm, anon_rss);
02030 }
02031 } else
02032 inc_mm_counter(mm, anon_rss);
02033 flush_cache_page(vma, address, pte_pfn(orig_pte));
02034 entry = mk_pte(new_page, vma->vm_page_prot);
02035 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
02036
02037
02038
02039
02040
02041
02042 ptep_clear_flush_notify(vma, address, page_table);
02043 page_add_new_anon_rmap(new_page, vma, address);
02044 set_pte_at(mm, address, page_table, entry);
02045 update_mmu_cache(vma, address, entry);
02046 if (old_page) {
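			/*
			 * Only after the pte has been switched to the new
			 * page may the old page's mapcount be dropped.
			 * Otherwise another task could see the count reach
			 * zero, decide the old page is free for reuse, and
			 * write into it while this (not yet replaced) pte
			 * still pointed at it.
			 */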
02069 page_remove_rmap(old_page);
02070 }
02071
02072
02073 new_page = old_page;
02074 ret |= VM_FAULT_WRITE;
02075 } else
02076 mem_cgroup_uncharge_page(new_page);
02077
02078 if (new_page)
02079 page_cache_release(new_page);
02080 if (old_page)
02081 page_cache_release(old_page);
02082 unlock:
02083 pte_unmap_unlock(page_table, ptl);
02084 if (dirty_page) {
02085 if (vma->vm_file)
02086 file_update_time(vma->vm_file);
02087
02088
02089
02090
02091
02092
02093
02094
02095
02096 wait_on_page_locked(dirty_page);
02097 set_page_dirty_balance(dirty_page, page_mkwrite);
02098 put_page(dirty_page);
02099 }
02100 return ret;
02101 oom_free_new:
02102 page_cache_release(new_page);
02103 oom:
02104 if (old_page)
02105 page_cache_release(old_page);
02106 return VM_FAULT_OOM;
02107
02108 unwritable_page:
02109 page_cache_release(old_page);
02110 return VM_FAULT_SIGBUS;
02111 }
02112
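/*
 * Helpers for unmap_mapping_range(): vma->vm_truncate_count holds either the
 * mapping's truncate_count (meaning the vma is fully done for the current
 * pass) or a restart address within the vma if the zap had to drop
 * i_mmap_lock part-way.  is_restart_addr() distinguishes the two: restart
 * addresses are page aligned, while truncate_count is kept non-aligned.
 */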
02145 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
02146
02147 static void reset_vma_truncate_counts(struct address_space *mapping)
02148 {
02149 struct vm_area_struct *vma;
02150 struct prio_tree_iter iter;
02151
02152 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
02153 vma->vm_truncate_count = 0;
02154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
02155 vma->vm_truncate_count = 0;
02156 }
02157
02158 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
02159 unsigned long start_addr, unsigned long end_addr,
02160 struct zap_details *details)
02161 {
02162 unsigned long restart_addr;
02163 int need_break;
02164
02165
02166
02167
02168
02169
02170
02171
02172 again:
02173 restart_addr = vma->vm_truncate_count;
02174 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
02175 start_addr = restart_addr;
02176 if (start_addr >= end_addr) {
02177
02178 vma->vm_truncate_count = details->truncate_count;
02179 return 0;
02180 }
02181 }
02182
02183 restart_addr = zap_page_range(vma, start_addr,
02184 end_addr - start_addr, details);
02185 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
02186
02187 if (restart_addr >= end_addr) {
02188
02189 vma->vm_truncate_count = details->truncate_count;
02190 if (!need_break)
02191 return 0;
02192 } else {
02193
02194 vma->vm_truncate_count = restart_addr;
02195 if (!need_break)
02196 goto again;
02197 }
02198
02199 spin_unlock(details->i_mmap_lock);
02200 cond_resched();
02201 spin_lock(details->i_mmap_lock);
02202 return -EINTR;
02203 }
02204
02205 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
02206 struct zap_details *details)
02207 {
02208 struct vm_area_struct *vma;
02209 struct prio_tree_iter iter;
02210 pgoff_t vba, vea, zba, zea;
02211
02212 restart:
02213 vma_prio_tree_foreach(vma, &iter, root,
02214 details->first_index, details->last_index) {
02215
02216 if (vma->vm_truncate_count == details->truncate_count)
02217 continue;
02218
02219 vba = vma->vm_pgoff;
02220 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
02221
02222 zba = details->first_index;
02223 if (zba < vba)
02224 zba = vba;
02225 zea = details->last_index;
02226 if (zea > vea)
02227 zea = vea;
02228
02229 if (unmap_mapping_range_vma(vma,
02230 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
02231 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
02232 details) < 0)
02233 goto restart;
02234 }
02235 }
02236
02237 static inline void unmap_mapping_range_list(struct list_head *head,
02238 struct zap_details *details)
02239 {
02240 struct vm_area_struct *vma;
02241
02242
02243
02244
02245
02246
02247
02248 restart:
02249 list_for_each_entry(vma, head, shared.vm_set.list) {
02250
02251 if (vma->vm_truncate_count == details->truncate_count)
02252 continue;
02253 details->nonlinear_vma = vma;
02254 if (unmap_mapping_range_vma(vma, vma->vm_start,
02255 vma->vm_end, details) < 0)
02256 goto restart;
02257 }
02258 }
02259
02260
02261
02262
02263
02264
02265
02266
02267
02268
02269
02270
02271
02272
02273
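/*
 * unmap_mapping_range - unmap the portion of every mapping of this
 * address_space that intersects the hole [holebegin, holebegin + holelen).
 * even_cows selects whether private COW copies of the affected pages are
 * zapped as well (required for truncation) or left alone.
 */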
02274 void unmap_mapping_range(struct address_space *mapping,
02275 loff_t const holebegin, loff_t const holelen, int even_cows)
02276 {
02277 struct zap_details details;
02278 pgoff_t hba = holebegin >> PAGE_SHIFT;
02279 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
02280
02281
02282 if (sizeof(holelen) > sizeof(hlen)) {
02283 long long holeend =
02284 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
02285 if (holeend & ~(long long)ULONG_MAX)
02286 hlen = ULONG_MAX - hba + 1;
02287 }
02288
02289 details.check_mapping = even_cows? NULL: mapping;
02290 details.nonlinear_vma = NULL;
02291 details.first_index = hba;
02292 details.last_index = hba + hlen - 1;
02293 if (details.last_index < details.first_index)
02294 details.last_index = ULONG_MAX;
02295 details.i_mmap_lock = &mapping->i_mmap_lock;
02296
02297 spin_lock(&mapping->i_mmap_lock);
02298
02299
02300 mapping->truncate_count++;
02301 if (unlikely(is_restart_addr(mapping->truncate_count))) {
02302 if (mapping->truncate_count == 0)
02303 reset_vma_truncate_counts(mapping);
02304 mapping->truncate_count++;
02305 }
02306 details.truncate_count = mapping->truncate_count;
02307
02308 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
02309 unmap_mapping_range_tree(&mapping->i_mmap, &details);
02310 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
02311 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
02312 spin_unlock(&mapping->i_mmap_lock);
02313 }
02314 EXPORT_SYMBOL(unmap_mapping_range);
02315
02316
02317
02318
02319
02320
02321
02322
02323
02324
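/*
 * vmtruncate - set i_size and remove any pagecache and user mappings beyond
 * the new size.  Extending just updates i_size (subject to RLIMIT_FSIZE and
 * s_maxbytes); shrinking unmaps, truncates the page cache, then unmaps again
 * to catch pages faulted back in while the truncate was in progress.
 */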
02325 int vmtruncate(struct inode * inode, loff_t offset)
02326 {
02327 if (inode->i_size < offset) {
02328 unsigned long limit;
02329
02330 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
02331 if (limit != RLIM_INFINITY && offset > limit)
02332 goto out_sig;
02333 if (offset > inode->i_sb->s_maxbytes)
02334 goto out_big;
02335 i_size_write(inode, offset);
02336 } else {
02337 struct address_space *mapping = inode->i_mapping;
02338
02339
02340
02341
02342
02343
02344 if (IS_SWAPFILE(inode))
02345 return -ETXTBSY;
02346 i_size_write(inode, offset);
02347
02348
02349
02350
02351
02352
02353
02354
02355
02356
02357 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
02358 truncate_inode_pages(mapping, offset);
02359 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
02360 }
02361
02362 if (inode->i_op->truncate)
02363 inode->i_op->truncate(inode);
02364 return 0;
02365
02366 out_sig:
02367 send_sig(SIGXFSZ, current, 0);
02368 out_big:
02369 return -EFBIG;
02370 }
02371 EXPORT_SYMBOL(vmtruncate);
02372
02373 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
02374 {
02375 struct address_space *mapping = inode->i_mapping;
02376
02377
02378
02379
02380
02381
02382 if (!inode->i_op->truncate_range)
02383 return -ENOSYS;
02384
02385 mutex_lock(&inode->i_mutex);
02386 down_write(&inode->i_alloc_sem);
02387 unmap_mapping_range(mapping, offset, (end - offset), 1);
02388 truncate_inode_pages_range(mapping, offset, end);
02389 unmap_mapping_range(mapping, offset, (end - offset), 1);
02390 inode->i_op->truncate_range(inode, offset, end);
02391 up_write(&inode->i_alloc_sem);
02392 mutex_unlock(&inode->i_mutex);
02393
02394 return 0;
02395 }
02396
02397
02398
02399
02400
02401
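/*
 * do_swap_page() is entered with mmap_sem held for read and the pte mapped
 * but not locked.  It brings the page back from swap (or the swap cache),
 * re-takes the pte lock, revalidates the pte and installs the page.
 */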
02402 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
02403 unsigned long address, pte_t *page_table, pmd_t *pmd,
02404 int write_access, pte_t orig_pte)
02405 {
02406 spinlock_t *ptl;
02407 struct page *page;
02408 swp_entry_t entry;
02409 pte_t pte;
02410 struct mem_cgroup *ptr = NULL;
02411 int ret = 0;
02412
02413 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
02414 goto out;
02415
02416 entry = pte_to_swp_entry(orig_pte);
02417 if (is_migration_entry(entry)) {
02418 migration_entry_wait(mm, pmd, address);
02419 goto out;
02420 }
02421 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
02422 page = lookup_swap_cache(entry);
02423 if (!page) {
02424 grab_swap_token();
02425 page = swapin_readahead(entry,
02426 GFP_HIGHUSER_MOVABLE, vma, address);
02427 if (!page) {
02428
02429
02430
02431
02432 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
02433 if (likely(pte_same(*page_table, orig_pte)))
02434 ret = VM_FAULT_OOM;
02435 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
02436 goto unlock;
02437 }
02438
02439
02440 ret = VM_FAULT_MAJOR;
02441 count_vm_event(PGMAJFAULT);
02442 }
02443
02444 mark_page_accessed(page);
02445
02446 lock_page(page);
02447 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
02448
02449 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
02450 ret = VM_FAULT_OOM;
02451 unlock_page(page);
02452 goto out;
02453 }
02454
02455
02456
02457
02458 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
02459 if (unlikely(!pte_same(*page_table, orig_pte)))
02460 goto out_nomap;
02461
02462 if (unlikely(!PageUptodate(page))) {
02463 ret = VM_FAULT_SIGBUS;
02464 goto out_nomap;
02465 }
02466
02467
02468
02469
02470
02471
02472
02473
02474
02475
02476
02477
02478
02479
02480
02481 inc_mm_counter(mm, anon_rss);
02482 pte = mk_pte(page, vma->vm_page_prot);
02483 if (write_access && reuse_swap_page(page)) {
02484 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
02485 write_access = 0;
02486 }
02487 flush_icache_page(vma, page);
02488 set_pte_at(mm, address, page_table, pte);
02489 page_add_anon_rmap(page, vma, address);
02490
02491 mem_cgroup_commit_charge_swapin(page, ptr);
02492
02493 swap_free(entry);
02494 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
02495 try_to_free_swap(page);
02496 unlock_page(page);
02497
02498 if (write_access) {
02499 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
02500 if (ret & VM_FAULT_ERROR)
02501 ret &= VM_FAULT_ERROR;
02502 goto out;
02503 }
02504
02505
02506 update_mmu_cache(vma, address, pte);
02507 unlock:
02508 pte_unmap_unlock(page_table, ptl);
02509 out:
02510 return ret;
02511 out_nomap:
02512 mem_cgroup_cancel_charge_swapin(ptr);
02513 pte_unmap_unlock(page_table, ptl);
02514 unlock_page(page);
02515 page_cache_release(page);
02516 return ret;
02517 }
02518
02519
02520
02521
02522
02523
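/*
 * do_anonymous_page() handles the first fault on an anonymous pte: allocate
 * a zeroed page, charge it, and map it.  Entered with mmap_sem held for read
 * and the pte mapped but not locked.
 */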
02524 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
02525 unsigned long address, pte_t *page_table, pmd_t *pmd,
02526 int write_access)
02527 {
02528 struct page *page;
02529 spinlock_t *ptl;
02530 pte_t entry;
02531
02532
02533 pte_unmap(page_table);
02534
02535 if (unlikely(anon_vma_prepare(vma)))
02536 goto oom;
02537 page = alloc_zeroed_user_highpage_movable(vma, address);
02538 if (!page)
02539 goto oom;
02540 __SetPageUptodate(page);
02541
02542 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
02543 goto oom_free_page;
02544
02545 entry = mk_pte(page, vma->vm_page_prot);
02546 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
02547
02548 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
02549 if (!pte_none(*page_table))
02550 goto release;
02551 inc_mm_counter(mm, anon_rss);
02552 page_add_new_anon_rmap(page, vma, address);
02553 set_pte_at(mm, address, page_table, entry);
02554
02555
02556 update_mmu_cache(vma, address, entry);
02557 unlock:
02558 pte_unmap_unlock(page_table, ptl);
02559 return 0;
02560 release:
02561 mem_cgroup_uncharge_page(page);
02562 page_cache_release(page);
02563 goto unlock;
02564 oom_free_page:
02565 page_cache_release(page);
02566 oom:
02567 return VM_FAULT_OOM;
02568 }
02569
02582
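/*
 * __do_fault() handles a fault on a vm_ops-driven (typically
 * file-backed) mapping: ask ->fault for the page, make a private
 * copy or call ->page_mkwrite as needed for write faults, then
 * install the pte if it has not changed in the meantime.  Called
 * with the mmap_sem held for read and the pte unmapped.
 */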
02583 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
02584 unsigned long address, pmd_t *pmd,
02585 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
02586 {
02587 pte_t *page_table;
02588 spinlock_t *ptl;
02589 struct page *page;
02590 pte_t entry;
02591 int anon = 0;
02592 int charged = 0;
02593 struct page *dirty_page = NULL;
02594 struct vm_fault vmf;
02595 int ret;
02596 int page_mkwrite = 0;
02597
02598 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
02599 vmf.pgoff = pgoff;
02600 vmf.flags = flags;
02601 vmf.page = NULL;
02602
02603 ret = vma->vm_ops->fault(vma, &vmf);
02604 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
02605 return ret;
02606
02607
02608
02609
02610
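/*
 * For consistency in the code below, make sure the faulted page
 * is locked: lock it here unless ->fault already returned it
 * locked (VM_FAULT_LOCKED).
 */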
02611 if (unlikely(!(ret & VM_FAULT_LOCKED)))
02612 lock_page(vmf.page);
02613 else
02614 VM_BUG_ON(!PageLocked(vmf.page));
02615
02616
02617
02618
02619 page = vmf.page;
02620 if (flags & FAULT_FLAG_WRITE) {
02621 if (!(vma->vm_flags & VM_SHARED)) {
02622 anon = 1;
02623 if (unlikely(anon_vma_prepare(vma))) {
02624 ret = VM_FAULT_OOM;
02625 goto out;
02626 }
02627 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
02628 vma, address);
02629 if (!page) {
02630 ret = VM_FAULT_OOM;
02631 goto out;
02632 }
02633 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
02634 ret = VM_FAULT_OOM;
02635 page_cache_release(page);
02636 goto out;
02637 }
02638 charged = 1;
02639
02640
02641
02642
02643 if (vma->vm_flags & VM_LOCKED)
02644 clear_page_mlock(vmf.page);
02645 copy_user_highpage(page, vmf.page, address, vma);
02646 __SetPageUptodate(page);
02647 } else {
02648
02649
02650
02651
02652
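/*
 * The page is to be mapped shared and written: give the
 * filesystem a chance to prepare it, or refuse the write,
 * via ->page_mkwrite.
 */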
02653 if (vma->vm_ops->page_mkwrite) {
02654 unlock_page(page);
02655 if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
02656 ret = VM_FAULT_SIGBUS;
02657 anon = 1;
02658 goto out_unlocked;
02659 }
02660 lock_page(page);
02661
02662
02663
02664
02665
02666
02667
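/*
 * ->page_mkwrite ran with the page unlocked, so the page may
 * have been truncated or invalidated meanwhile.  If its mapping
 * is gone, return 0 so the fault is simply retried.
 */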
02668 if (!page->mapping) {
02669 ret = 0;
02670 anon = 1;
02671 goto out;
02672 }
02673 page_mkwrite = 1;
02674 }
02675 }
02676
02677 }
02678
02679 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
02691
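/*
 * Only install the new pte if it is still the one we faulted on:
 * somebody else may have raced with us.  For write faults we
 * either hold an exclusive copy of the page or the mapping is
 * shared, so the pte can be made writable and dirty here to
 * avoid a second fault.
 */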
02692 if (likely(pte_same(*page_table, orig_pte))) {
02693 flush_icache_page(vma, page);
02694 entry = mk_pte(page, vma->vm_page_prot);
02695 if (flags & FAULT_FLAG_WRITE)
02696 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
02697 if (anon) {
02698 inc_mm_counter(mm, anon_rss);
02699 page_add_new_anon_rmap(page, vma, address);
02700 } else {
02701 inc_mm_counter(mm, file_rss);
02702 page_add_file_rmap(page);
02703 if (flags & FAULT_FLAG_WRITE) {
02704 dirty_page = page;
02705 get_page(dirty_page);
02706 }
02707 }
02708 set_pte_at(mm, address, page_table, entry);
02709
02710
02711 update_mmu_cache(vma, address, entry);
02712 } else {
02713 if (charged)
02714 mem_cgroup_uncharge_page(page);
02715 if (anon)
02716 page_cache_release(page);
02717 else
02718 anon = 1;
02719 }
02720
02721 pte_unmap_unlock(page_table, ptl);
02722
02723 out:
02724 unlock_page(vmf.page);
02725 out_unlocked:
02726 if (anon)
02727 page_cache_release(vmf.page);
02728 else if (dirty_page) {
02729 if (vma->vm_file)
02730 file_update_time(vma->vm_file);
02731
02732 set_page_dirty_balance(dirty_page, page_mkwrite);
02733 put_page(dirty_page);
02734 }
02735
02736 return ret;
02737 }
02738
02739 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
02740 unsigned long address, pte_t *page_table, pmd_t *pmd,
02741 int write_access, pte_t orig_pte)
02742 {
02743 pgoff_t pgoff = (((address & PAGE_MASK)
02744 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
02745 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
02746
02747 pte_unmap(page_table);
02748 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
02749 }
02758
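/*
 * Fault of a previously established nonlinear mapping: the file
 * offset is encoded in the pte itself (pte_file), so recover the
 * page offset from it and hand off to __do_fault().
 */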
02759
02760 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
02761 unsigned long address, pte_t *page_table, pmd_t *pmd,
02762 int write_access, pte_t orig_pte)
02763 {
02764 unsigned int flags = FAULT_FLAG_NONLINEAR |
02765 (write_access ? FAULT_FLAG_WRITE : 0);
02766 pgoff_t pgoff;
02767
02768 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
02769 return 0;
02770
02771 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
02772
02773
02774
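/*
 * A file pte in a VMA without VM_NONLINEAR means the page table
 * is corrupted: report the bad pte and fail the fault.
 */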
02775 print_bad_pte(vma, address, orig_pte, NULL);
02776 return VM_FAULT_OOM;
02777 }
02778
02779 pgoff = pte_to_pgoff(orig_pte);
02780 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
02781 }
02782
02795
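/*
 * handle_pte_fault() dispatches the fault: not-present ptes go to
 * the anonymous, linear file, nonlinear file or swap handlers;
 * present ptes are handled here for write protection and for
 * marking the pte dirty/young on architectures that do not do it
 * in hardware.
 */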
02796 static inline int handle_pte_fault(struct mm_struct *mm,
02797 struct vm_area_struct *vma, unsigned long address,
02798 pte_t *pte, pmd_t *pmd, int write_access)
02799 {
02800 pte_t entry;
02801 spinlock_t *ptl;
02802
02803 entry = *pte;
02804 if (!pte_present(entry)) {
02805 if (pte_none(entry)) {
02806 if (vma->vm_ops) {
02807 if (likely(vma->vm_ops->fault))
02808 return do_linear_fault(mm, vma, address,
02809 pte, pmd, write_access, entry);
02810 }
02811 return do_anonymous_page(mm, vma, address,
02812 pte, pmd, write_access);
02813 }
02814 if (pte_file(entry))
02815 return do_nonlinear_fault(mm, vma, address,
02816 pte, pmd, write_access, entry);
02817 return do_swap_page(mm, vma, address,
02818 pte, pmd, write_access, entry);
02819 }
02820
02821 ptl = pte_lockptr(mm, pmd);
02822 spin_lock(ptl);
02823 if (unlikely(!pte_same(*pte, entry)))
02824 goto unlock;
02825 if (write_access) {
02826 if (!pte_write(entry))
02827 return do_wp_page(mm, vma, address,
02828 pte, pmd, ptl, entry);
02829 entry = pte_mkdirty(entry);
02830 }
02831 entry = pte_mkyoung(entry);
02832 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
02833 update_mmu_cache(vma, address, entry);
02834 } else {
02835
02836
02837
02838
02839
02840
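/*
 * The pte did not change.  A flush is really only needed for
 * protection faults, but the arch code does not tell us whether
 * this was one; flushing only on write faults still avoids
 * useless TLB flushes for ordinary read faults.
 */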
02841 if (write_access)
02842 flush_tlb_page(vma, address);
02843 }
02844 unlock:
02845 pte_unmap_unlock(pte, ptl);
02846 return 0;
02847 }
02848
02849
02850
02851
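/*
 * Main entry point for page faults.  By the time we get here the
 * caller already holds mm->mmap_sem for read.
 */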
02852 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
02853 unsigned long address, int write_access)
02854 {
02855 pgd_t *pgd;
02856 pud_t *pud;
02857 pmd_t *pmd;
02858 pte_t *pte;
02859
02860 __set_current_state(TASK_RUNNING);
02861
02862 count_vm_event(PGFAULT);
02863
02864 if (unlikely(is_vm_hugetlb_page(vma)))
02865 return hugetlb_fault(mm, vma, address, write_access);
02866
02867 pgd = pgd_offset(mm, address);
02868 pud = pud_alloc(mm, pgd, address);
02869 if (!pud)
02870 return VM_FAULT_OOM;
02871 pmd = pmd_alloc(mm, pud, address);
02872 if (!pmd)
02873 return VM_FAULT_OOM;
02874 pte = pte_alloc_map(mm, pmd, address);
02875 if (!pte)
02876 return VM_FAULT_OOM;
02877
02878 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
02879 }
02880
02881 #ifndef __PAGETABLE_PUD_FOLDED
02882
02883
02884
02885
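/*
 * Allocate a page upper directory.  pud_alloc() handles the
 * common already-present case inline; here we recheck under
 * page_table_lock and free the new pud if another thread
 * populated the pgd first.
 */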
02886 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
02887 {
02888 pud_t *new = pud_alloc_one(mm, address);
02889 if (!new)
02890 return -ENOMEM;
02891
02892 smp_wmb();
02893
02894 spin_lock(&mm->page_table_lock);
02895 if (pgd_present(*pgd))
02896 pud_free(mm, new);
02897 else
02898 pgd_populate(mm, pgd, new);
02899 spin_unlock(&mm->page_table_lock);
02900 return 0;
02901 }
02902 #endif
02903
02904 #ifndef __PAGETABLE_PMD_FOLDED
02905
02906
02907
02908
02909 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
02910 {
02911 pmd_t *new = pmd_alloc_one(mm, address);
02912 if (!new)
02913 return -ENOMEM;
02914
02915 smp_wmb();
02916
02917 spin_lock(&mm->page_table_lock);
02918 #ifndef __ARCH_HAS_4LEVEL_HACK
02919 if (pud_present(*pud))
02920 pmd_free(mm, new);
02921 else
02922 pud_populate(mm, pud, new);
02923 #else
02924 if (pgd_present(*pud))
02925 pmd_free(mm, new);
02926 else
02927 pgd_populate(mm, pud, new);
02928 #endif
02929 spin_unlock(&mm->page_table_lock);
02930 return 0;
02931 }
02932 #endif
02933
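/*
 * make_pages_present() faults in and populates every page in
 * [addr, end) of the current process via get_user_pages(),
 * requesting write access when the VMA allows it.
 */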
02934 int make_pages_present(unsigned long addr, unsigned long end)
02935 {
02936 int ret, len, write;
02937 struct vm_area_struct * vma;
02938
02939 vma = find_vma(current->mm, addr);
02940 if (!vma)
02941 return -ENOMEM;
02942 write = (vma->vm_flags & VM_WRITE) != 0;
02943 BUG_ON(addr >= end);
02944 BUG_ON(end > vma->vm_end);
02945 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
02946 ret = get_user_pages(current, current->mm, addr,
02947 len, write, 0, NULL, NULL);
02948 if (ret < 0)
02949 return ret;
02950 return ret == len ? 0 : -EFAULT;
02951 }
02952
02953 #if !defined(__HAVE_ARCH_GATE_AREA)
02954
02955 #if defined(AT_SYSINFO_EHDR)
02956 static struct vm_area_struct gate_vma;
02957
02958 static int __init gate_vma_init(void)
02959 {
02960 gate_vma.vm_mm = NULL;
02961 gate_vma.vm_start = FIXADDR_USER_START;
02962 gate_vma.vm_end = FIXADDR_USER_END;
02963 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
02964 gate_vma.vm_page_prot = __P101;
02965
02966
02967
02968
02969
02970
02971 gate_vma.vm_flags |= VM_ALWAYSDUMP;
02972 return 0;
02973 }
02974 __initcall(gate_vma_init);
02975 #endif
02976
02977 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
02978 {
02979 #ifdef AT_SYSINFO_EHDR
02980 return &gate_vma;
02981 #else
02982 return NULL;
02983 #endif
02984 }
02985
02986 int in_gate_area_no_task(unsigned long addr)
02987 {
02988 #ifdef AT_SYSINFO_EHDR
02989 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
02990 return 1;
02991 #endif
02992 return 0;
02993 }
02994
02995 #endif
02996
02997 #ifdef CONFIG_HAVE_IOREMAP_PROT
02998 int follow_phys(struct vm_area_struct *vma,
02999 unsigned long address, unsigned int flags,
03000 unsigned long *prot, resource_size_t *phys)
03001 {
03002 pgd_t *pgd;
03003 pud_t *pud;
03004 pmd_t *pmd;
03005 pte_t *ptep, pte;
03006 spinlock_t *ptl;
03007 resource_size_t phys_addr = 0;
03008 struct mm_struct *mm = vma->vm_mm;
03009 int ret = -EINVAL;
03010
03011 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
03012 goto out;
03013
03014 pgd = pgd_offset(mm, address);
03015 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
03016 goto out;
03017
03018 pud = pud_offset(pgd, address);
03019 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
03020 goto out;
03021
03022 pmd = pmd_offset(pud, address);
03023 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
03024 goto out;
03025
03026
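/* We cannot handle huge page PFN maps here. */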
03027 if (pmd_huge(*pmd))
03028 goto out;
03029
03030 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
03031 if (!ptep)
03032 goto out;
03033
03034 pte = *ptep;
03035 if (!pte_present(pte))
03036 goto unlock;
03037 if ((flags & FOLL_WRITE) && !pte_write(pte))
03038 goto unlock;
03039 phys_addr = pte_pfn(pte);
03040 phys_addr <<= PAGE_SHIFT;
03041
03042 *prot = pgprot_val(pte_pgprot(pte));
03043 *phys = phys_addr;
03044 ret = 0;
03045
03046 unlock:
03047 pte_unmap_unlock(ptep, ptl);
03048 out:
03049 return ret;
03050 }
03051
03052 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
03053 void *buf, int len, int write)
03054 {
03055 resource_size_t phys_addr;
03056 unsigned long prot = 0;
03057 void __iomem *maddr;
03058 int offset = addr & (PAGE_SIZE-1);
03059
03060 if (follow_phys(vma, addr, write, &prot, &phys_addr))
03061 return -EINVAL;
03062
03063 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
03064 if (write)
03065 memcpy_toio(maddr + offset, buf, len);
03066 else
03067 memcpy_fromio(buf, maddr + offset, len);
03068 iounmap(maddr);
03069
03070 return len;
03071 }
03072 #endif
03073
03074
03075
03076
03077
03078
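/*
 * Access another process' address space: copy up to len bytes to
 * or from the kernel buffer buf at address addr, faulting pages
 * in via get_user_pages() as needed.  Returns the number of bytes
 * actually transferred.
 */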
03079 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
03080 {
03081 struct mm_struct *mm;
03082 struct vm_area_struct *vma;
03083 void *old_buf = buf;
03084
03085 mm = get_task_mm(tsk);
03086 if (!mm)
03087 return 0;
03088
03089 down_read(&mm->mmap_sem);
03090
03091 while (len) {
03092 int bytes, ret, offset;
03093 void *maddr;
03094 struct page *page = NULL;
03095
03096 ret = get_user_pages(tsk, mm, addr, 1,
03097 write, 1, &page, &vma);
03098 if (ret <= 0) {
03099
03100
03101
03102
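/*
 * get_user_pages() failed: the address may lie in an I/O mapping
 * (VM_IO/VM_PFNMAP).  If the VMA provides an ->access method,
 * use it to do the transfer instead.
 */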
03103 #ifdef CONFIG_HAVE_IOREMAP_PROT
03104 vma = find_vma(mm, addr);
03105 if (!vma)
03106 break;
03107 if (vma->vm_ops && vma->vm_ops->access)
03108 ret = vma->vm_ops->access(vma, addr, buf,
03109 len, write);
03110 if (ret <= 0)
03111 #endif
03112 break;
03113 bytes = ret;
03114 } else {
03115 bytes = len;
03116 offset = addr & (PAGE_SIZE-1);
03117 if (bytes > PAGE_SIZE-offset)
03118 bytes = PAGE_SIZE-offset;
03119
03120 maddr = kmap(page);
03121 if (write) {
03122 copy_to_user_page(vma, page, addr,
03123 maddr + offset, buf, bytes);
03124 set_page_dirty_lock(page);
03125 } else {
03126 copy_from_user_page(vma, page, addr,
03127 buf, maddr + offset, bytes);
03128 }
03129 kunmap(page);
03130 page_cache_release(page);
03131 }
03132 len -= bytes;
03133 buf += bytes;
03134 addr += bytes;
03135 }
03136 up_read(&mm->mmap_sem);
03137 mmput(mm);
03138
03139 return buf - old_buf;
03140 }
03141
03142
03143
03144
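/*
 * Print the given prefix followed by the backing file name, start
 * address and size of the VMA containing ip, for use in fault
 * messages.
 */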
03145 void print_vma_addr(char *prefix, unsigned long ip)
03146 {
03147 struct mm_struct *mm = current->mm;
03148 struct vm_area_struct *vma;
03149
03150
03151
03152
03153
03154 if (preempt_count())
03155 return;
03156
03157 down_read(&mm->mmap_sem);
03158 vma = find_vma(mm, ip);
03159 if (vma && vma->vm_file) {
03160 struct file *f = vma->vm_file;
03161 char *buf = (char *)__get_free_page(GFP_KERNEL);
03162 if (buf) {
03163 char *p, *s;
03164
03165 p = d_path(&f->f_path, buf, PAGE_SIZE);
03166 if (IS_ERR(p))
03167 p = "?";
03168 s = strrchr(p, '/');
03169 if (s)
03170 p = s+1;
03171 printk("%s%s[%lx+%lx]", prefix, p,
03172 vma->vm_start,
03173 vma->vm_end - vma->vm_start);
03174 free_page((unsigned long)buf);
03175 }
03176 }
03177 up_read(&current->mm->mmap_sem);
03178 }
03179
03180 #ifdef CONFIG_PROVE_LOCKING
03181 void might_fault(void)
03182 {
03183
03184
03185
03186
03187
03188
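/*
 * Code running with get_fs() == KERNEL_DS (e.g. socket ops on
 * kernel memory) accesses memory that is never paged out and so
 * never actually faults; skip the annotations to avoid false
 * positives.
 */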
03189 if (segment_eq(get_fs(), KERNEL_DS))
03190 return;
03191
03192 might_sleep();
03193
03194
03195
03196
03197
03198 if (!in_atomic() && current->mm)
03199 might_lock_read(&current->mm->mmap_sem);
03200 }
03201 EXPORT_SYMBOL(might_fault);
03202 #endif
03203 #endif