patch-2.3.10 linux/mm/memory.c
- Lines: 432
- Date: Tue Jul 6 23:06:05 1999
- Orig file: v2.3.9/linux/mm/memory.c
- Orig date: Wed Jun 30 13:38:20 1999
diff -u --recursive --new-file v2.3.9/linux/mm/memory.c linux/mm/memory.c
@@ -36,7 +36,9 @@
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
+#include <linux/pagemap.h>
#include <linux/smp_lock.h>
+#include <linux/swapctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -320,7 +322,7 @@
}
}
-static inline int zap_pte_range(pmd_t * pmd, unsigned long address, unsigned long size)
+static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size)
{
pte_t * pte;
int freed;
@@ -345,15 +347,15 @@
page = *pte;
pte++;
size--;
+ pte_clear(pte-1);
if (pte_none(page))
continue;
- pte_clear(pte-1);
freed += free_pte(page);
}
return freed;
}
-static inline int zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
+static inline int zap_pmd_range(struct mm_struct *mm, pgd_t * dir, unsigned long address, unsigned long size)
{
pmd_t * pmd;
unsigned long end;
@@ -373,7 +375,7 @@
end = PGDIR_SIZE;
freed = 0;
do {
- freed += zap_pte_range(pmd, address, end - address);
+ freed += zap_pte_range(mm, pmd, address, end - address);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
@@ -390,11 +392,21 @@
int freed = 0;
dir = pgd_offset(mm, address);
+
+ /*
+ * This is a long-lived spinlock. That's fine.
+ * There's no contention, because the page table
+ * lock only protects against kswapd anyway, and
+ * even if kswapd happened to be looking at this
+ * process we _want_ it to get stuck.
+ */
+ spin_lock(&mm->page_table_lock);
while (address < end) {
- freed += zap_pmd_range(dir, address, end - address);
+ freed += zap_pmd_range(mm, dir, address, end - address);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
}
+ spin_unlock(&mm->page_table_lock);
/*
* Update rss for the mm_struct (not necessarily current->mm)
*/
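The hunk above threads the mm_struct pointer down through zap_pmd_range() and zap_pte_range() so that zap_page_range() can take mm->page_table_lock once around the whole teardown. The loop's stepping expression, (address + PGDIR_SIZE) & PGDIR_MASK, simply advances to the next PGDIR_SIZE-aligned boundary. A minimal user-space sketch of that step, with made-up constants standing in for the kernel's:

#include <stdio.h>

/* Illustrative constants standing in for PGDIR_SIZE/PGDIR_MASK: a 4 MB
 * page-directory span, as on 2.3-era i386 without PAE (an assumption). */
#define DEMO_PGDIR_SIZE (4UL << 20)
#define DEMO_PGDIR_MASK (~(DEMO_PGDIR_SIZE - 1))

int main(void)
{
	unsigned long address = 0x00500123UL;	/* somewhere inside a span */
	unsigned long end     = 0x01000000UL;

	while (address < end) {
		printf("zap from %#010lx up to the next boundary\n", address);
		/* same step as the loop above: advance to the next
		 * PGDIR_SIZE-aligned address */
		address = (address + DEMO_PGDIR_SIZE) & DEMO_PGDIR_MASK;
	}
	return 0;
}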
@@ -599,17 +611,16 @@
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
+ *
+ * We enter with the page table read-lock held, and need to exit without
+ * it.
*/
static int do_wp_page(struct task_struct * tsk, struct vm_area_struct * vma,
unsigned long address, pte_t *page_table, pte_t pte)
{
unsigned long old_page, new_page;
struct page * page;
-
- new_page = __get_free_page(GFP_USER);
- /* Did swap_out() unmap the protected page while we slept? */
- if (pte_val(*page_table) != pte_val(pte))
- goto end_wp_page;
+
old_page = pte_page(pte);
if (MAP_NR(old_page) >= max_mapnr)
goto bad_wp_page;
@@ -634,44 +645,44 @@
/* FallThrough */
case 1:
flush_cache_page(vma, address);
- set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
+ set_pte(page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
flush_tlb_page(vma, address);
-end_wp_page:
- /*
- * We can release the kernel lock now.. Now swap_out will see
- * a dirty page and so won't get confused and flush_tlb_page
- * won't SMP race. -Andrea
- */
- unlock_kernel();
-
- if (new_page)
- free_page(new_page);
+ spin_unlock(&tsk->mm->page_table_lock);
return 1;
}
-
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ spin_unlock(&tsk->mm->page_table_lock);
+ new_page = __get_free_page(GFP_USER);
if (!new_page)
- goto no_new_page;
+ return -1;
+ spin_lock(&tsk->mm->page_table_lock);
- if (PageReserved(page))
- ++vma->vm_mm->rss;
- copy_cow_page(old_page,new_page);
- flush_page_to_ram(old_page);
- flush_page_to_ram(new_page);
- flush_cache_page(vma, address);
- set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
- flush_tlb_page(vma, address);
- unlock_kernel();
- __free_page(page);
+ /*
+ * Re-check the pte - we dropped the lock
+ */
+ if (pte_val(*page_table) == pte_val(pte)) {
+ if (PageReserved(page))
+ ++vma->vm_mm->rss;
+ copy_cow_page(old_page,new_page);
+ flush_page_to_ram(old_page);
+ flush_page_to_ram(new_page);
+ flush_cache_page(vma, address);
+ set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+ flush_tlb_page(vma, address);
+
+ /* Free the old page.. */
+ new_page = old_page;
+ }
+ spin_unlock(&tsk->mm->page_table_lock);
+ free_page(new_page);
return 1;
bad_wp_page:
printk("do_wp_page: bogus page at address %08lx (%08lx)\n",address,old_page);
- send_sig(SIGKILL, tsk, 1);
-no_new_page:
- unlock_kernel();
- if (new_page)
- free_page(new_page);
- return 0;
+ return -1;
}
/*
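The rewritten do_wp_page() swaps the big kernel lock for mm->page_table_lock and uses a drop-allocate-retake-recheck pattern: the spinlock cannot be held across __get_free_page(), so it is released for the allocation, re-acquired, and the pte is compared against the saved value before the copied page is installed; if the pte changed in the meantime, the fresh page is simply freed. A minimal user-space analogue of that pattern, using a pthread mutex and invented names in place of the kernel primitives:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

/* A pointer slot guarded by a lock, standing in for a pte protected by
 * mm->page_table_lock.  All names here are invented for the illustration. */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static char *slot;

/* Entered with table_lock held, returns with it released, like the patched
 * do_wp_page(): 1 on success, -1 if the allocation fails. */
static int cow_replace(char *old, size_t size)
{
	char *new_page;

	/* Cannot allocate while holding the lock, so drop it first. */
	pthread_mutex_unlock(&table_lock);
	new_page = malloc(size);
	if (!new_page)
		return -1;
	pthread_mutex_lock(&table_lock);

	/* Re-check: did anyone change the slot while the lock was dropped? */
	if (slot == old) {
		memcpy(new_page, old, size);
		slot = new_page;
		new_page = old;		/* free the old copy instead */
	}
	pthread_mutex_unlock(&table_lock);
	free(new_page);
	return 1;
}

int main(void)
{
	slot = strdup("original data");
	pthread_mutex_lock(&table_lock);	/* enter with the lock held */
	cow_replace(slot, strlen(slot) + 1);
	free(slot);				/* frees the new copy */
	return 0;
}

The same convention shows up in handle_pte_fault() further down: whoever re-acquires the lock is responsible for re-validating the pte before acting on it.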
@@ -725,8 +736,9 @@
struct vm_area_struct * mpnt;
truncate_inode_pages(inode, offset);
+ spin_lock(&inode->i_shared_lock);
if (!inode->i_mmap)
- return;
+ goto out_unlock;
mpnt = inode->i_mmap;
do {
struct mm_struct *mm = mpnt->vm_mm;
@@ -757,35 +769,81 @@
zap_page_range(mm, start, len);
flush_tlb_range(mm, start, end);
} while ((mpnt = mpnt->vm_next_share) != NULL);
+out_unlock:
+ spin_unlock(&inode->i_shared_lock);
}
-/*
- * This is called with the kernel lock held, we need
- * to return without it.
+
+/*
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
*/
-static int do_swap_page(struct task_struct * tsk,
+static void swapin_readahead(unsigned long entry)
+{
+ int i;
+ struct page *new_page;
+ unsigned long offset = SWP_OFFSET(entry);
+ struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+
+ offset = (offset >> page_cluster) << page_cluster;
+
+ i = 1 << page_cluster;
+ do {
+ /* Don't read-ahead past the end of the swap area */
+ if (offset >= swapdev->max)
+ break;
+ /* Don't block on I/O for read-ahead */
+ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
+ break;
+ /* Don't read in bad or busy pages */
+ if (!swapdev->swap_map[offset])
+ break;
+ if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
+ break;
+
+ /* Ok, do the async read-ahead now */
+ new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+ if (new_page != NULL)
+ __free_page(new_page);
+ offset++;
+ } while (--i);
+ return;
+}
+
+static int do_swap_page(struct task_struct * tsk,
struct vm_area_struct * vma, unsigned long address,
- pte_t * page_table, pte_t entry, int write_access)
+ pte_t * page_table, unsigned long entry, int write_access)
{
- if (!vma->vm_ops || !vma->vm_ops->swapin) {
- swap_in(tsk, vma, page_table, pte_val(entry), write_access);
- flush_page_to_ram(pte_page(*page_table));
- } else {
- pte_t page = vma->vm_ops->swapin(vma, address - vma->vm_start + vma->vm_offset, pte_val(entry));
- if (pte_val(*page_table) != pte_val(entry)) {
- free_page(pte_page(page));
- } else {
- if (page_count(mem_map + MAP_NR(pte_page(page))) > 1 &&
- !(vma->vm_flags & VM_SHARED))
- page = pte_wrprotect(page);
- ++vma->vm_mm->rss;
- ++tsk->maj_flt;
- flush_page_to_ram(pte_page(page));
- set_pte(page_table, page);
- }
+ struct page *page = lookup_swap_cache(entry);
+ pte_t pte;
+
+ if (!page) {
+ lock_kernel();
+ swapin_readahead(entry);
+ page = read_swap_cache(entry);
+ unlock_kernel();
+ if (!page)
+ return -1;
+
+ flush_page_to_ram(page_address(page));
+ }
+
+ vma->vm_mm->rss++;
+ tsk->min_flt++;
+ swap_free(entry);
+
+ pte = mk_pte(page_address(page), vma->vm_page_prot);
+
+ if (write_access && !is_page_shared(page)) {
+ delete_from_swap_cache(page);
+ pte = pte_mkwrite(pte_mkdirty(pte));
}
- unlock_kernel();
+ set_pte(page_table, pte);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
return 1;
}
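In swapin_readahead(), the expression offset = (offset >> page_cluster) << page_cluster rounds the faulting swap offset down to the start of an aligned block of 1 << page_cluster entries; the loop then queues asynchronous reads for the whole block, bailing out early at the end of the swap area, on free or bad map entries, or when enough async pages are already in flight. A small worked example of the alignment arithmetic (the page_cluster value is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 4;	/* illustrative: 16-entry clusters */
	unsigned long offset = 37;	/* the faulting swap offset */

	unsigned long start = (offset >> page_cluster) << page_cluster;
	unsigned long count = 1UL << page_cluster;

	/* offset 37 with 16-entry clusters covers entries 32..47 */
	printf("read-ahead covers swap entries %lu..%lu (fault at %lu)\n",
	       start, start + count - 1, offset);
	return 0;
}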
@@ -798,7 +856,7 @@
if (write_access) {
unsigned long page = __get_free_page(GFP_USER);
if (!page)
- return 0;
+ return -1;
clear_page(page);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
@@ -806,6 +864,8 @@
flush_page_to_ram(page);
}
set_pte(page_table, entry);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, addr, entry);
return 1;
}
@@ -827,23 +887,17 @@
unsigned long page;
pte_t entry;
- if (!vma->vm_ops || !vma->vm_ops->nopage) {
- unlock_kernel();
- return do_anonymous_page(tsk, vma, page_table, write_access,
- address);
- }
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ return do_anonymous_page(tsk, vma, page_table, write_access, address);
/*
* The third argument is "no_share", which tells the low-level code
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection.
*/
- page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
- (vma->vm_flags & VM_SHARED)?0:write_access);
-
- unlock_kernel();
+ page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
if (!page)
- return 0;
+ return 0; /* SIGBUS - but we _really_ should know whether it is OOM or SIGBUS */
++tsk->maj_flt;
++vma->vm_mm->rss;
@@ -866,6 +920,7 @@
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
+ update_mmu_cache(vma, address, entry);
return 1;
}
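The "no_share" argument passed to vma->vm_ops->nopage() above is the early copy-on-write decision the comment describes: copy rather than share only for a write fault into a non-shared mapping. A tiny sketch of that decision in isolation (the VM_SHARED value is illustrative, not taken from the headers):

#include <stdio.h>

#define VM_SHARED 0x08UL	/* illustrative flag value */

/* The "no_share" argument handed to vm_ops->nopage(): copy instead of
 * share when this is a write fault into a private mapping. */
static int no_share(unsigned long vm_flags, int write_access)
{
	return (vm_flags & VM_SHARED) ? 0 : write_access;
}

int main(void)
{
	printf("private write fault -> no_share=%d\n", no_share(0, 1));
	printf("shared  write fault -> no_share=%d\n", no_share(VM_SHARED, 1));
	printf("private read  fault -> no_share=%d\n", no_share(0, 0));
	return 0;
}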
@@ -877,6 +932,15 @@
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
+ *
+ * Note the "page_table_lock". It is to protect against kswapd removing
+ * pages from under us. Note that kswapd only ever _removes_ pages, never
+ * adds them. As such, once we have noticed that the page is not present,
+ * we can drop the lock early.
+ *
+ * The adding of pages is protected by the MM semaphore (which we hold),
+ * so we don't need to worry about a page being suddenly been added into
+ * our VM.
*/
static inline int handle_pte_fault(struct task_struct *tsk,
struct vm_area_struct * vma, unsigned long address,
@@ -884,27 +948,32 @@
{
pte_t entry;
- lock_kernel();
entry = *pte;
-
if (!pte_present(entry)) {
if (pte_none(entry))
return do_no_page(tsk, vma, address, write_access, pte);
- return do_swap_page(tsk, vma, address, pte, entry, write_access);
+ return do_swap_page(tsk, vma, address, pte, pte_val(entry), write_access);
}
- entry = pte_mkyoung(entry);
- set_pte(pte, entry);
- flush_tlb_page(vma, address);
- if (write_access) {
- if (!pte_write(entry))
- return do_wp_page(tsk, vma, address, pte, entry);
+ /*
+ * Ok, the entry was present, we need to get the page table
+ * lock to synchronize with kswapd, and verify that the entry
+ * didn't change from under us..
+ */
+ spin_lock(&tsk->mm->page_table_lock);
+ if (pte_val(entry) == pte_val(*pte)) {
+ if (write_access) {
+ if (!pte_write(entry))
+ return do_wp_page(tsk, vma, address, pte, entry);
- entry = pte_mkdirty(entry);
+ entry = pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
set_pte(pte, entry);
flush_tlb_page(vma, address);
+ update_mmu_cache(vma, address, entry);
}
- unlock_kernel();
+ spin_unlock(&tsk->mm->page_table_lock);
return 1;
}
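handle_pte_fault() now reads the pte with no lock held; the not-present paths stay unlocked (kswapd only ever removes ptes, and the mm semaphore serializes additions), while the present case takes mm->page_table_lock and re-checks that the entry has not changed before updating it in place. A minimal user-space analogue of this optimistic read followed by a locked re-check, with invented names:

#include <pthread.h>

/* Invented names: a word-sized "pte" and a lock standing in for
 * mm->page_table_lock. */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pte;

#define PTE_PRESENT 0x1UL
#define PTE_YOUNG   0x2UL

/* Mark the entry accessed, in the style of the patched handle_pte_fault():
 * read unlocked, then lock and re-check before touching it. */
static int touch_entry(void)
{
	unsigned long entry = pte;		/* optimistic, unlocked read */

	if (!(entry & PTE_PRESENT))
		return 0;			/* would take the fault paths */

	pthread_mutex_lock(&table_lock);
	if (entry == pte)			/* unchanged under us? */
		pte = entry | PTE_YOUNG;	/* safe to update in place */
	pthread_mutex_unlock(&table_lock);
	return 1;
}

int main(void)
{
	pte = PTE_PRESENT;
	return touch_entry() ? 0 : 1;
}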
@@ -921,28 +990,27 @@
pmd = pmd_alloc(pgd, address);
if (pmd) {
pte_t * pte = pte_alloc(pmd, address);
- if (pte) {
- if (handle_pte_fault(tsk, vma, address, write_access, pte)) {
- update_mmu_cache(vma, address, *pte);
- return 1;
- }
- }
+ if (pte)
+ return handle_pte_fault(tsk, vma, address, write_access, pte);
}
- return 0;
+ return -1;
}
/*
* Simplistic page force-in..
*/
-void make_pages_present(unsigned long addr, unsigned long end)
+int make_pages_present(unsigned long addr, unsigned long end)
{
int write;
+ struct task_struct *tsk = current;
struct vm_area_struct * vma;
- vma = find_vma(current->mm, addr);
+ vma = find_vma(tsk->mm, addr);
write = (vma->vm_flags & VM_WRITE) != 0;
while (addr < end) {
- handle_mm_fault(current, vma, addr, write);
+ if (handle_mm_fault(tsk, vma, addr, write) < 0)
+ return -1;
addr += PAGE_SIZE;
}
+ return 0;
}