mm/munlock: rmap call mlock_vma_page() munlock_vma_page()
Add vma argument to mlock_vma_page() and munlock_vma_page(), make them
inline functions which check (vma->vm_flags & VM_LOCKED) before calling
mlock_page() and munlock_page() in mm/mlock.c.

Add bool compound to mlock_vma_page() and munlock_vma_page(): this is
because we have understandable difficulty in accounting pte maps of THPs,
and if passed a PageHead page, mlock_page() and munlock_page() cannot
tell whether it's a pmd map to be counted or a pte map to be ignored.
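For reference, this is the shape of the new helper (abridged from the mm/internal.h
hunk below): the VM_LOCKED test and the THP filter both live in the inline wrapper,
so mlock_page() and munlock_page() only ever see pages that should be counted.

	static inline void mlock_vma_page(struct page *page,
			struct vm_area_struct *vma, bool compound)
	{
		/* Count a pmd mapping of a THP; ignore pte mappings of it. */
		if (unlikely(vma->vm_flags & VM_LOCKED) &&
		    (compound || !PageTransCompound(page)))
			mlock_page(page);
	}

munlock_vma_page() follows the same pattern, calling munlock_page().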

Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the
others, and use that to call mlock_vma_page() at the end of the page
adds, and munlock_vma_page() at the end of page_remove_rmap() (end or
beginning? unimportant, but end was easier for assertions in testing).
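The mm/rmap.c side of this is not expanded in the listing below; roughly, the new
calls sit at the tail of the add and remove paths, along these lines (a sketch of
the intent, not the exact hunk):

	void page_remove_rmap(struct page *page,
			struct vm_area_struct *vma, bool compound)
	{
		/* ... existing mapcount and statistics updates ... */

		/*
		 * Placed at the end: if VM_LOCKED (and this is a pmd mapping,
		 * or the page is not a THP), undo the mlock for this page.
		 */
		munlock_vma_page(page, vma, compound);
	}

The page_add_*_rmap() paths end with the corresponding mlock_vma_page(page, vma,
compound) call.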

No page lock is required (although almost all adds happen to hold it):
delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s.
Certainly page lock did serialize with page migration, but I'm having
difficulty explaining why that was ever important.

Mlock accounting on THPs has been hard to define, differed between anon
and file, involved PageDoubleMap in some places and not others, required
clear_page_mlock() at some points.  Keep it simple now: just count the
pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks.

page_add_new_anon_rmap() callers unchanged: they have long been calling
lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED
handling (it also checks for not VM_SPECIAL: I think that's overcautious,
and inconsistent with other checks, given that mmap_region() already prevents
VM_LOCKED on VM_SPECIAL; but I haven't quite convinced myself to change it).
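For context, the VM_LOCKED handling referred to here is the existing check in
lru_cache_add_inactive_or_unevictable() (mm/swap.c), roughly of this shape; the
VM_SPECIAL test is the over-caution mentioned above (a sketch from memory, not
part of this diff):

	void lru_cache_add_inactive_or_unevictable(struct page *page,
					struct vm_area_struct *vma)
	{
		bool unevictable;

		/* Sketch: the actual mm/swap.c code may differ in details. */
		unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
		if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
			int nr_pages = thp_nr_pages(page);

			/* Account the new anon page as mlocked straight away. */
			__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
			count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		}
		lru_cache_add(page);
	}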

Signed-off-by: Hugh Dickins <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
Hugh Dickins authored and Matthew Wilcox (Oracle) committed Feb 17, 2022
1 parent a213e5c commit cea86fe
Showing 12 changed files with 113 additions and 129 deletions.
17 changes: 9 additions & 8 deletions include/linux/rmap.h
@@ -167,18 +167,19 @@ struct anon_vma *page_get_anon_vma(struct page *page);
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
unsigned long address, bool compound);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
unsigned long address, int flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);

unsigned long address, bool compound);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
bool compound);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
unsigned long address);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
unsigned long address);

static inline void page_dup_rmap(struct page *page, bool compound)
{
7 changes: 2 additions & 5 deletions kernel/events/uprobes.c
@@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
return err;
}

/* For try_to_free_swap() and munlock_vma_page() below */
/* For try_to_free_swap() below */
lock_page(old_page);

mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));

page_remove_rmap(old_page, false);
page_remove_rmap(old_page, vma, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);

if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
munlock_vma_page(old_page);
put_page(old_page);

err = 0;
17 changes: 8 additions & 9 deletions mm/huge_memory.c
@@ -1577,7 +1577,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,

if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
page_remove_rmap(page, vma, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1962,7 +1962,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(old_pmd))
SetPageReferenced(page);
page_remove_rmap(page, true);
page_remove_rmap(page, vma, true);
put_page(page);
}
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
@@ -2096,14 +2096,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
}
unlock_page_memcg(page);

/* Above is effectively page_remove_rmap(page, vma, true) */
munlock_vma_page(page, vma, true);
}

smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);

if (freeze) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
page_remove_rmap(page + i, false);
page_remove_rmap(page + i, vma, false);
put_page(page + i);
}
}
@@ -2163,8 +2166,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
do_unlock_page = true;
}
}
if (PageMlocked(page))
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
@@ -3138,7 +3139,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, true);
page_remove_rmap(page, vma, true);
put_page(page);
}

@@ -3168,10 +3169,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (PageAnon(new))
page_add_anon_rmap(new, vma, mmun_start, true);
else
page_add_file_rmap(new, true);
page_add_file_rmap(new, vma, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
mlock_vma_page(new);
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
#endif
4 changes: 2 additions & 2 deletions mm/hugetlb.c
@@ -5014,7 +5014,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_page_dirty(page);

hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, true);
page_remove_rmap(page, vma, true);

spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
@@ -5259,7 +5259,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, true);
page_remove_rmap(old_page, vma, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
36 changes: 31 additions & 5 deletions mm/internal.h
@@ -395,12 +395,35 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
bool write, int *locked);
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len);

/*
* must be called with vma's mmap_lock held for read or write, and page locked.
* mlock_vma_page() and munlock_vma_page():
* should be called with vma's mmap_lock held for read or write,
* under page table lock for the pte/pmd being added or removed.
*
* mlock is usually called at the end of page_add_*_rmap(),
* munlock at the end of page_remove_rmap(); but new anon
* pages are managed in lru_cache_add_inactive_or_unevictable().
*
* @compound is used to include pmd mappings of THPs, but filter out
* pte mappings of THPs, which cannot be consistently counted: a pte
* mapping of the THP head cannot be distinguished by the page alone.
*/
extern void mlock_vma_page(struct page *page);
extern void munlock_vma_page(struct page *page);
void mlock_page(struct page *page);
static inline void mlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound)
{
if (unlikely(vma->vm_flags & VM_LOCKED) &&
(compound || !PageTransCompound(page)))
mlock_page(page);
}
void munlock_page(struct page *page);
static inline void munlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound)
{
if (unlikely(vma->vm_flags & VM_LOCKED) &&
(compound || !PageTransCompound(page)))
munlock_page(page);
}

/*
* Clear the page's PageMlocked(). This can be useful in a situation where
@@ -487,7 +510,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound) { }
static inline void munlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
4 changes: 2 additions & 2 deletions mm/khugepaged.c
@@ -774,7 +774,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, false);
page_remove_rmap(src_page, vma, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -1513,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
page_remove_rmap(page, false);
page_remove_rmap(page, vma, false);
}

pte_unmap_unlock(start_pte, ptl);
12 changes: 1 addition & 11 deletions mm/ksm.c
@@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);

page_remove_rmap(page, false);
page_remove_rmap(page, vma, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
err = replace_page(vma, page, kpage, orig_pte);
}

if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
}
}

out_unlock:
unlock_page(page);
out:
45 changes: 15 additions & 30 deletions mm/memory.c
@@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,

set_pte_at(vma->vm_mm, address, ptep, pte);

if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(page);

/*
* No need to invalidate - it was non-present before. However
* secondary CPUs may have mappings that need invalidating.
@@ -1377,7 +1374,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
page_remove_rmap(page, false);
page_remove_rmap(page, vma, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
@@ -1397,10 +1394,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;

if (is_device_private_entry(entry))
page_remove_rmap(page, false);

page_remove_rmap(page, vma, false);
put_page(page);
continue;
}
@@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page)
return 0;
}

static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page, false);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
}

@@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
if (retval)
goto out;
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}

#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
return insert_page_into_pte_locked(mm, pte, addr, page, prot);
return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}

/* insert_pages() amortizes the cost of spinlock operations
Expand Down Expand Up @@ -1842,7 +1836,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,

start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(mm, pte,
int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
@@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
page_remove_rmap(old_page, false);
page_remove_rmap(old_page, vma, false);
}

/* Free the old page.. */
@@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
unlock_page(old_page);
}
if (page_copied)
free_swap_cache(old_page);
put_page(old_page);
@@ -3947,7 +3931,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
page_add_file_rmap(page, vma, true);

/*
* deposit and withdraw with pmd lock held
*/
@@ -3996,7 +3981,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
(remaining file diffs not shown in this view)
