Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F144666552
D24652.1776085949.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
58 KB
Referenced Files
None
Subscribers
None
D24652.1776085949.diff
View Options
Index: TODO
===================================================================
--- /dev/null
+++ TODO
@@ -0,0 +1,9 @@
+- per-user limit on the total superpages allocations
+- make pmap_superpagesizes[] per-pmap ?
+- more test programs
+
+<kib> 1. I either add a new pager type or allow to specify populate method, and make sure that vm_fault_populate() can cope
+<kib> 2. I will not expose pmap_enter_largepage() but I still want to keep it
+<kib> 3. there will be a new SHM flag to shm_open2() that opens special large-page shm, with phys pager backing, and some defaults
+<kib> 4. I do not need even a new mmap flag, if vm_mmap detects this object it should do the right thing, I just need to stash psind somewhere
+<kib> 5. shm pages will be instantiated with ftruncate() which must be done before mmap
\ No newline at end of file
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -1316,6 +1316,8 @@
{
pd_entry_t *pde;
+ KASSERT((*pdpe & PG_PS) == 0,
+ ("pmap_pdpe_to_pde for 1G page, va %#lx", va));
pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
return (&pde[pmap_pde_index(va)]);
}
@@ -1331,6 +1333,8 @@
pdpe = pmap_pdpe(pmap, va);
if (pdpe == NULL || (*pdpe & PG_V) == 0)
return (NULL);
+ KASSERT((*pdpe & PG_PS) == 0,
+ ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va));
return (pmap_pdpe_to_pde(pdpe, va));
}
@@ -2136,6 +2140,11 @@
KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
("pmap_init: can't assign to pagesizes[1]"));
pagesizes[1] = NBPDR;
+ if ((amd_feature & AMDID_PAGE1GB) != 0) {
+ KASSERT(MAXPAGESIZES > 1 && pagesizes[2] == 0,
+ ("pmap_init: can't assign to pagesizes[2]"));
+ pagesizes[2] = NBPDP;
+ }
}
/*
@@ -3780,6 +3789,19 @@
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
+ *
+ * The page index (ptepindex) of the paging structure page used to
+ * translate address va is defined as follows:
+ * - for a page table page (last level), ptepindex = pmap_pde_pindex(va) =
+ * va >> PDRSHIFT; in other words, it is just the index of the PDE,
+ * - for a page directory page, ptepindex = NUPDE (number of userland PD
+ * entries) + (pmap_pde_index(va) >> NPDEPGSHIFT),
+ * i.e. the index of the PDPE is put after the last index of the PDEs,
+ * - for a page directory pointer page, ptepindex = NUPDE + NUPDPE +
+ * (pmap_pde_index(va) >> (NPDEPGSHIFT + NPML4EPGSHIFT)),
+ * i.e. the index of the PML4E is put after the last index of the PDPEs.
+ * In other words, it is the sequential number of the corresponding paging
+ * entry in the order where all entries of the same rank are put together,
+ * then ranks are put from deepest to root.
*/
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
@@ -5395,6 +5417,7 @@
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
struct rwlock *lock;
+ vm_page_t mt;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
@@ -5447,13 +5470,28 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ anyvalid = 1;
+ *pdpe = 0;
+ pmap_resident_count_dec(pmap, NBPDP / PAGE_SIZE);
+ mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME);
+ pmap_unwire_ptp(pmap, sva, mt, &free);
+ continue;
+ }
+
/*
* Calculate index for next page table.
*/
@@ -5669,11 +5707,13 @@
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
+ vm_page_t m;
vm_offset_t va_next;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
+ pt_entry_t obits, pbits;
boolean_t anychanged;
KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
@@ -5724,13 +5764,36 @@
}
pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+ va_next = (sva + NBPDP) & ~PDPMASK;
if ((*pdpe & PG_V) == 0) {
- va_next = (sva + NBPDP) & ~PDPMASK;
if (va_next < sva)
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_remove of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+retry_pdpe:
+ obits = pbits = *pdpe;
+ MPASS((pbits & (PG_MANAGED | PG_G)) == 0);
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ if ((prot & VM_PROT_WRITE) == 0)
+ pbits &= ~(PG_RW | PG_M);
+ if ((prot & VM_PROT_EXECUTE) == 0)
+ pbits |= pg_nx;
+
+ if (pbits != obits) {
+ if (!atomic_cmpset_long(pdpe, obits, pbits))
+ /* PG_PS cannot be cleared under us, */
+ goto retry_pdpe;
+ anychanged = TRUE;
+ }
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -5773,9 +5836,6 @@
for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
sva += PAGE_SIZE) {
- pt_entry_t obits, pbits;
- vm_page_t m;
-
retry:
obits = pbits = *pte;
if ((pbits & PG_V) == 0)
@@ -5950,6 +6010,115 @@
}
#endif /* VM_NRESERVLEVEL > 0 */
+/*
+ * Enter a non-transient superpage mapping of size pagesizes[psind] at va.
+ *
+ * newpte supplies the physical frame and the attribute bits of the mapping;
+ * for user addresses of PT_X86 pmaps the protection key returned by
+ * pmap_pkru_get() is merged in before the entry is stored.  psind == 2
+ * installs the mapping in the PDP entry (1G), any other accepted value
+ * (i.e. 1) installs it in the PD entry (2M).
+ *
+ * The pmap lock must be held by the caller.  Unless PMAP_ENTER_NOSLEEP is
+ * set in flags, the lock is dropped around vm_wait() when a page table page
+ * cannot be allocated and the operation is then restarted.
+ *
+ * Returns KERN_SUCCESS, KERN_PROTECTION_FAILURE when pmap_pkru_same()
+ * rejects the range, or KERN_RESOURCE_SHORTAGE when the allocation fails
+ * and PMAP_ENTER_NOSLEEP is set.
+ */
+static int
+pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
+    int psind)
+{
+	vm_page_t mp;
+	pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V;
+	vm_pindex_t ptepindex;
+	bool newptp;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(psind > 0 && psind < MAXPAGESIZES,
+	    ("psind %d unexpected", psind));
+	KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0,
+	    ("unaligned phys address %#lx newpte %#lx psind %d",
+	    newpte & PG_FRAME, newpte, psind));
+	KASSERT((va & (pagesizes[psind] - 1)) == 0,
+	    ("unaligned va %#lx psind %d", va, psind));
+	KASSERT(va < VM_MAXUSER_ADDRESS,
+	    ("kernel mode non-transparent superpage")); /* XXXKIB */
+	KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS,
+	    ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */
+
+	PG_V = pmap_valid_bit(pmap);
+
+restart:
+	pten = newpte;
+	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+		pten |= pmap_pkru_get(pmap, va);
+
+	ptepindex = pmap_pde_pindex(va);
+	newptp = false;
+
+	if (psind == 2) {	/* 1G */
+		if (!pmap_pkru_same(pmap, va, va + NBPDP))
+			return (KERN_PROTECTION_FAILURE);
+		pml4e = pmap_pml4e(pmap, va);
+		if ((*pml4e & PG_V) == 0) {
+			/*
+			 * ptepindex is the PDE index (va >> PDRSHIFT);
+			 * shifting out the PDE and PDPE index bits leaves
+			 * the PML4 slot, which selects the PDP page.
+			 */
+			mp = _pmap_allocpte(pmap, NUPDE + NUPDPE +
+			    (ptepindex >> (NPDPEPGSHIFT + NPML4EPGSHIFT)),
+			    NULL);
+			if (mp == NULL) {
+				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+					return (KERN_RESOURCE_SHORTAGE);
+				PMAP_UNLOCK(pmap);
+				vm_wait(NULL);
+				PMAP_LOCK(pmap);
+
+				/*
+				 * Restart at least to recalculate the pkru
+				 * key.  Our caller must keep the map locked
+				 * so no paging structure can be validated
+				 * under us.
+				 */
+				goto restart;
+			}
+			newptp = true;
+		} else {
+			mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME);
+		}
+		pdpe = pmap_pdpe(pmap, va);
+		KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va));
+		origpte = *pdpe;
+		KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
+		    (origpte & PG_FRAME) == (newpte & PG_FRAME)),
+		    ("va %#lx changing 1G phys page pdpe %#lx newpte %#lx",
+		    va, origpte, newpte));
+
+		/*
+		 * Only a newly validated entry takes a reference on an
+		 * existing PDP page; re-entering the same mapping must not,
+		 * since removal drops a single reference per mapping.
+		 */
+		if (!newptp && (origpte & PG_V) == 0)
+			mp->ref_count++;
+		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
+			pmap->pm_stats.wired_count += NBPDP / PAGE_SIZE;
+		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
+			pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
+		/* Store pten, not newpte, so the protection key is kept. */
+		*pdpe = pten;
+	} else /* (psind == 1) */ {	/* 2M */
+		if (!pmap_pkru_same(pmap, va, va + NBPDR))
+			return (KERN_PROTECTION_FAILURE);
+		pde = pmap_pde(pmap, va);
+		if (pde == NULL) {
+			mp = _pmap_allocpte(pmap, NUPDE +
+			    (ptepindex >> NPDPEPGSHIFT), NULL);
+			if (mp == NULL) {
+				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+					return (KERN_RESOURCE_SHORTAGE);
+				PMAP_UNLOCK(pmap);
+				vm_wait(NULL);
+				PMAP_LOCK(pmap);
+				goto restart;
+			}
+			newptp = true;
+			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
+			pde = &pde[pmap_pde_index(va)];
+		} else {
+			pdpe = pmap_pdpe(pmap, va);
+			MPASS(pdpe != NULL && (*pdpe & PG_V) != 0);
+			mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
+		}
+		KASSERT(pde != NULL, ("va %#lx lost pde", va));
+		origpte = *pde;
+		KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 &&
+		    (origpte & PG_FRAME) == (newpte & PG_FRAME)),
+		    ("va %#lx changing 2M phys page pde %#lx newpte %#lx",
+		    va, origpte, newpte));
+
+		/* See the 1G case: reference the PD page once per mapping. */
+		if (!newptp && (origpte & PG_V) == 0)
+			mp->ref_count++;
+		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
+			pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
+		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
+			pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
+		*pde = pten;
+	}
+
+	/* The resident count grows only when a new mapping was created. */
+	if ((origpte & PG_V) == 0)
+		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
+
+	return (KERN_SUCCESS);
+}
+
/*
* Insert the given physical page (p) at
* the specified virtual address (v) in the
@@ -6029,6 +6198,13 @@
lock = NULL;
PMAP_LOCK(pmap);
+ if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
+ KASSERT((m->oflags & VPO_UNMANAGED) != 0,
+ ("managed largepage va %#lx flags %#x", va, flags));
+ rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags,
+ psind);
+ goto out;
+ }
if (psind == 1) {
/* Assert the required virtual and physical alignment. */
KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
@@ -6714,9 +6890,10 @@
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t *pde;
- pt_entry_t *pte, PG_V;
+ pt_entry_t *pte, PG_V, PG_G;
PG_V = pmap_valid_bit(pmap);
+ PG_G = pmap_global_bit(pmap);
PMAP_LOCK(pmap);
for (; sva < eva; sva = va_next) {
pml4e = pmap_pml4e(pmap, sva);
@@ -6733,6 +6910,18 @@
va_next = eva;
continue;
}
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_unwire of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0) {
+ MPASS(pmap != kernel_pmap); /* XXXKIB */
+ MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0);
+ atomic_clear_long(pdpe, PG_W);
+ pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE;
+ continue;
+ }
+
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
@@ -6849,6 +7038,12 @@
}
va_next = (addr + NBPDR) & ~PDRMASK;
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= end_addr,
+ ("pmap_copy of partial non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, addr, end_addr, va_next));
+ if ((*pdpe & PG_PS) != 0)
+ continue;
if (va_next < addr)
va_next = end_addr;
@@ -7905,6 +8100,12 @@
va_next = (sva + NBPDR) & ~PDRMASK;
if (va_next < sva)
va_next = eva;
+ KASSERT((*pdpe & PG_PS) == 0 || va_next <= eva,
+ ("pmap_advise of non-transient 1G page "
+ "pdpe %#lx sva %#lx eva %#lx va_next %#lx",
+ *pdpe, sva, eva, va_next));
+ if ((*pdpe & PG_PS) != 0)
+ continue;
pde = pmap_pdpe_to_pde(pdpe, sva);
oldpde = *pde;
if ((oldpde & PG_V) == 0)
@@ -8664,6 +8865,7 @@
int
pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
{
+ pdp_entry_t *pdpe;
pd_entry_t *pdep;
pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
vm_paddr_t pa;
@@ -8675,23 +8877,32 @@
PG_RW = pmap_rw_bit(pmap);
PMAP_LOCK(pmap);
- pdep = pmap_pde(pmap, addr);
- if (pdep != NULL && (*pdep & PG_V)) {
- if (*pdep & PG_PS) {
- pte = *pdep;
- /* Compute the physical address of the 4KB page. */
- pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
+ pte = 0;
+ pa = 0;
+ val = 0;
+ pdpe = pmap_pdpe(pmap, addr);
+ if ((*pdpe & PG_V) != 0) {
+ if ((*pdpe & PG_PS) != 0) {
+ pte = *pdpe;
+ pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) &
PG_FRAME;
val = MINCORE_SUPER;
} else {
- pte = *pmap_pde_to_pte(pdep, addr);
- pa = pte & PG_FRAME;
- val = 0;
+ pdep = pmap_pde(pmap, addr);
+ if (pdep != NULL && (*pdep & PG_V) != 0) {
+ if ((*pdep & PG_PS) != 0) {
+ pte = *pdep;
+ /* Compute the physical address of the 4KB page. */
+ pa = ((pte & PG_PS_FRAME) | (addr &
+ PDRMASK)) & PG_FRAME;
+ val = MINCORE_SUPER;
+ } else {
+ pte = *pmap_pde_to_pte(pdep, addr);
+ pa = pte & PG_FRAME;
+ val = 0;
+ }
+ }
}
- } else {
- pte = 0;
- pa = 0;
- val = 0;
}
if ((pte & PG_V) != 0) {
val |= MINCORE_INCORE;
Index: sys/dev/ksyms/ksyms.c
===================================================================
--- sys/dev/ksyms/ksyms.c
+++ sys/dev/ksyms/ksyms.c
@@ -41,6 +41,7 @@
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/uio.h>
@@ -51,6 +52,8 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include "linker_if.h"
@@ -442,8 +445,8 @@
ksyms_size_calc(&ts);
elfsz = sizeof(struct ksyms_hdr) + ts.ts_symsz + ts.ts_strsz;
- object = vm_object_allocate(OBJT_PHYS,
- OFF_TO_IDX(round_page(elfsz)));
+ object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(elfsz),
+ VM_PROT_ALL, 0, td->td_ucred);
sc->sc_obj = object;
sc->sc_objsz = elfsz;
Index: sys/dev/xen/gntdev/gntdev.c
===================================================================
--- sys/dev/xen/gntdev/gntdev.c
+++ sys/dev/xen/gntdev/gntdev.c
@@ -1068,7 +1068,8 @@
vm_object_t mem_obj;
struct gntdev_gref *gref;
- mem_obj = vm_object_allocate(OBJT_PHYS, size);
+ mem_obj = vm_pager_allocate(OBJT_PHYS, NULL, size, VM_PROT_ALL, 0,
+ curthread->td_ucred);
if (mem_obj == NULL)
return (ENOMEM);
Index: sys/kern/kern_umtx.c
===================================================================
--- sys/kern/kern_umtx.c
+++ sys/kern/kern_umtx.c
@@ -3933,7 +3933,7 @@
reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
reg->ushm_refcnt = 1;
bcopy(key, &reg->ushm_key, sizeof(*key));
- reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
+ reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false);
reg->ushm_cred = crhold(cred);
error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
if (error != 0) {
Index: sys/kern/link_elf.c
===================================================================
--- sys/kern/link_elf.c
+++ sys/kern/link_elf.c
@@ -1089,7 +1089,8 @@
ef = (elf_file_t) lf;
#ifdef SPARSE_MAPPING
- ef->object = vm_object_allocate(OBJT_PHYS, atop(mapsize));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, mapsize, VM_PROT_ALL,
+ 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/link_elf_obj.c
===================================================================
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -34,16 +34,17 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
+#include <sys/linker.h>
#include <sys/mutex.h>
#include <sys/mount.h>
-#include <sys/proc.h>
#include <sys/namei.h>
-#include <sys/fcntl.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/vnode.h>
-#include <sys/linker.h>
#include <machine/elf.h>
@@ -53,11 +54,13 @@
#include <vm/vm.h>
#include <vm/vm_param.h>
-#include <vm/vm_object.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_extern.h>
#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
#include <sys/link_elf.h>
@@ -905,7 +908,8 @@
* This stuff needs to be in a single chunk so that profiling etc
* can get the bounds and gdb can associate offsets with modules
*/
- ef->object = vm_object_allocate(OBJT_PHYS, atop(round_page(mapsize)));
+ ef->object = vm_pager_allocate(OBJT_PHYS, NULL, round_page(mapsize),
+ VM_PROT_ALL, 0, thread0.td_ucred);
if (ef->object == NULL) {
error = ENOMEM;
goto out;
Index: sys/kern/uipc_shm.c
===================================================================
--- sys/kern/uipc_shm.c
+++ sys/kern/uipc_shm.c
@@ -159,7 +159,7 @@
.fo_get_seals = shm_get_seals,
.fo_add_seals = shm_add_seals,
.fo_fallocate = shm_fallocate,
- .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
};
FEATURE(posix_shm, "POSIX shared memory");
@@ -242,6 +242,84 @@
return (error);
}
+/*
+ * Number of non-transient largepages currently allocated, indexed by psind.
+ * Incremented by shm_dotruncate_largepage(), decremented by
+ * shm_largepage_phys_dtor(), and exported read-only by shm_init() under
+ * the vm.largepages sysctl node.
+ */
+static u_long count_largepages[MAXPAGESIZES];
+
+/*
+ * Populate method of the largepage phys pager.
+ *
+ * Reports in *first and *last the inclusive range of page indices of the
+ * superpage run that contains pidx, where the run length is taken from the
+ * psind stored in the object's phys pager data.  Returns VM_PAGER_FAIL if
+ * the object has no page size configured yet (psind == 0) or pidx lies
+ * beyond the object size, and VM_PAGER_OK otherwise.
+ */
+static int
+shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
+    int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+	vm_pindex_t npages;
+	int psind;
+
+	psind = object->un_pager.phys.data_val;
+	if (psind == 0 || pidx >= object->size)
+		return (VM_PAGER_FAIL);
+	npages = pagesizes[psind] / PAGE_SIZE;
+	*first = rounddown(pidx, npages);
+
+	/*
+	 * We only busy the first page in the superpage run.  It is
+	 * useless to busy the whole run since we only remove full
+	 * superpages, and it takes too long to busy e.g. 512 * 512 ==
+	 * 262144 pages constituting a 1G amd64 superpage.
+	 */
+	(void)vm_page_grab(object, *first, VM_ALLOC_NORMAL);
+
+	/*
+	 * The range is inclusive: the last index of the run, not the
+	 * rounded-up pidx (which equals *first for an aligned pidx and is
+	 * one past the run otherwise).
+	 */
+	*last = *first + npages - 1;
+	return (VM_PAGER_OK);
+}
+
+/*
+ * Haspage method of the largepage phys pager.
+ *
+ * Returns FALSE when no page size has been configured for the object
+ * (psind == 0) or pindex is outside the object, TRUE otherwise.  When
+ * the optional out-pointers are supplied, *before receives the distance
+ * from pindex down to the superpage-aligned index, and *after receives
+ * the distance from pindex up to the rounded-up index.
+ */
+static boolean_t
+shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
+    int *before, int *after)
+{
+	vm_pindex_t npages;
+	int psind;
+
+	psind = object->un_pager.phys.data_val;
+	if (psind == 0)
+		return (FALSE);
+	if (pindex >= object->size)
+		return (FALSE);
+
+	/* Number of base pages in one superpage of the configured size. */
+	npages = pagesizes[psind] / PAGE_SIZE;
+	if (before != NULL)
+		*before = pindex - rounddown(pindex, npages);
+	if (after != NULL)
+		*after = roundup(pindex, npages) - pindex;
+	return (TRUE);
+}
+
+/*
+ * Constructor method of the largepage phys pager.  Intentionally empty:
+ * it performs no work and ignores all of its arguments.
+ */
+static void
+shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred)
+{
+}
+
+/*
+ * Destructor method of the largepage phys pager.
+ *
+ * If a page size was configured for the object, subtract the number of
+ * superpages covered by the object size from the matching
+ * count_largepages[] counter.
+ */
+static void
+shm_largepage_phys_dtor(vm_object_t object)
+{
+	int psind;
+
+	psind = object->un_pager.phys.data_val;
+	if (psind == 0)
+		return;
+	atomic_subtract_long(&count_largepages[psind],
+	    object->size / (pagesizes[psind] / PAGE_SIZE));
+}
+
+/*
+ * Phys pager method table for largepage shm objects.  Its address also
+ * serves as the tag by which shm_largepage() recognizes such objects.
+ */
+static struct phys_pager_ops shm_largepage_phys_ops = {
+ .phys_pg_populate = shm_largepage_phys_populate,
+ .phys_pg_haspage = shm_largepage_phys_haspage,
+ .phys_pg_ctor = shm_largepage_phys_ctor,
+ .phys_pg_dtor = shm_largepage_phys_dtor,
+};
+
+/*
+ * Report whether the shmfd is backed by a largepage object, i.e. an
+ * OBJT_PHYS object whose pager methods are shm_largepage_phys_ops.
+ */
+static inline bool
+shm_largepage(struct shmfd *shmfd)
+{
+	vm_object_t obj;
+
+	obj = shmfd->shm_object;
+	if (obj->type != OBJT_PHYS)
+		return (false);
+	return (obj->un_pager.phys.ops == &shm_largepage_phys_ops);
+}
+
static int
shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
@@ -320,6 +398,8 @@
if (error)
return (error);
#endif
+ if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
+ return (EINVAL);
foffset_lock_uio(fp, uio, flags);
if ((flags & FOF_OFFSET) == 0) {
rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
@@ -359,7 +439,11 @@
shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
struct thread *td)
{
+ struct shmfd *shmfd;
+ struct shm_largepage_conf *conf;
+ void *rl_cookie;
+ shmfd = fp->f_data;
switch (com) {
case FIONBIO:
case FIOASYNC:
@@ -368,6 +452,28 @@
* just like it would on an unlinked regular file
*/
return (0);
+ case FIOSHMLPGCNF:
+ if (!shm_largepage(shmfd))
+ return (ENOTTY);
+ conf = data;
+ if (shmfd->shm_lp_psind != 0 &&
+ conf->psind != shmfd->shm_lp_psind)
+ return (EINVAL);
+ if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
+ pagesizes[conf->psind] == 0)
+ return (EINVAL);
+ if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
+ conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
+ return (EINVAL);
+
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+ &shmfd->shm_mtx);
+ shmfd->shm_lp_psind = conf->psind;
+ shmfd->shm_lp_alloc_policy = conf->alloc_policy;
+ shmfd->shm_object->un_pager.phys.data_val = conf->psind;
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ return (0);
default:
return (ENOTTY);
}
@@ -410,6 +516,8 @@
sb->st_dev = shm_dev_ino;
sb->st_ino = shmfd->shm_ino;
sb->st_nlink = shmfd->shm_object->ref_count;
+ sb->st_blocks = shmfd->shm_object->size /
+ (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
return (0);
}
@@ -571,6 +679,100 @@
return (0);
}
+/*
+ * Resize a largepage shm object to length bytes.
+ *
+ * Only growth is implemented: each new superpage is allocated as a
+ * physically contiguous, suitably aligned run, zeroed, marked valid and
+ * unbusied.  The shm_lp_alloc_policy of the shmfd selects what happens
+ * when a run cannot be allocated: fail with ENOMEM (NOWAIT), try
+ * vm_page_reclaim_contig() and then wait (HARD), or just wait (DEFAULT).
+ *
+ * Must be called with the object write-locked and the range lock
+ * write-held; the object lock is dropped while waiting for memory but is
+ * always held again on return, for both success and failure.
+ *
+ * Returns 0 on success, EINVAL for an unconfigured page size or an
+ * unaligned length, EPERM/EBUSY/ENOTSUP for shrink attempts, ENOMEM for
+ * a failed NOWAIT allocation, or the error from thread_check_susp().
+ */
+static int
+shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
+{
+	vm_object_t object;
+	vm_page_t m;
+	vm_pindex_t a, a1, newobjsz, npages, oldobjsz;
+	int aflags, error, i, psind;
+
+	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
+	object = shmfd->shm_object;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
+
+	oldobjsz = object->size;
+	newobjsz = OFF_TO_IDX(length);
+	psind = shmfd->shm_lp_psind;
+	if (length == shmfd->shm_size)
+		return (0);
+	if (psind == 0 && length != 0)
+		return (EINVAL);
+	if ((length & (pagesizes[psind] - 1)) != 0)
+		return (EINVAL);
+
+	if (length < shmfd->shm_size) {
+		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
+			return (EPERM);
+		if (shmfd->shm_kmappings > 0)
+			return (EBUSY);
+		return (ENOTSUP);	/* Pages are unmanaged. */
+#if 0
+		vm_object_page_remove(object, newobjsz, oldobjsz, 0);
+		object->size = newobjsz;
+		shmfd->shm_size = length;
+		return (0);
+#endif
+	}
+
+	aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
+	if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
+		aflags |= VM_ALLOC_WAITFAIL;
+
+	/* Number of base pages in one superpage of the configured size. */
+	npages = pagesizes[psind] / PAGE_SIZE;
+
+	/*
+	 * The index is advanced only after a successful allocation, so
+	 * that a run which had to wait for memory is retried rather than
+	 * skipped.
+	 */
+	a = oldobjsz;
+	while (a < newobjsz) {
+		m = vm_page_alloc_contig(object, a, aflags, npages, 0, ~0,
+		    pagesizes[psind], 0, VM_MEMATTR_DEFAULT);
+		if (m == NULL) {
+			VM_OBJECT_WUNLOCK(object);
+			if (shmfd->shm_lp_alloc_policy ==
+			    SHM_LARGEPAGE_ALLOC_NOWAIT) {
+				error = ENOMEM;
+				goto fail;
+			}
+			if (shmfd->shm_lp_alloc_policy ==
+			    SHM_LARGEPAGE_ALLOC_HARD) {
+				if (!vm_page_reclaim_contig(aflags, npages,
+				    0, ~0, pagesizes[psind], 0))
+					vm_wait(object);
+			} else {
+				vm_wait(object);
+			}
+			error = thread_check_susp(curthread, false);
+			if (error != 0)
+				goto fail;
+			VM_OBJECT_WLOCK(object);
+			continue;
+		}
+		for (i = 0; i < npages; i++) {
+			if ((m[i].flags & PG_ZERO) == 0)
+				pmap_zero_page(&m[i]);
+			vm_page_valid(&m[i]);
+			vm_page_xunbusy(&m[i]);
+		}
+		a += npages;
+	}
+	object->size = newobjsz;
+	shmfd->shm_size = length;
+	atomic_add_long(&count_largepages[psind], (newobjsz - oldobjsz) /
+	    npages);
+	return (0);
+
+fail:
+	/*
+	 * Reached with the object unlocked.  Release the pages allocated
+	 * so far, i.e. the indices [oldobjsz, a), and return with the
+	 * object locked, since the caller unlocks it itself.
+	 */
+	VM_OBJECT_WLOCK(object);
+	for (a1 = oldobjsz; a1 < a; a1++) {
+		m = vm_page_lookup(object, a1);
+		vm_page_free(m);
+	}
+	return (error);
+}
+
int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
@@ -580,7 +782,10 @@
rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
&shmfd->shm_mtx);
VM_OBJECT_WLOCK(shmfd->shm_object);
- error = shm_dotruncate_locked(shmfd, length, rl_cookie);
+ if (shm_largepage(shmfd))
+ error = shm_dotruncate_largepage(shmfd, length, rl_cookie);
+ else
+ error = shm_dotruncate_locked(shmfd, length, rl_cookie);
VM_OBJECT_WUNLOCK(shmfd->shm_object);
rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
return (error);
@@ -591,7 +796,7 @@
* routines.
*/
struct shmfd *
-shm_alloc(struct ucred *ucred, mode_t mode)
+shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
{
struct shmfd *shmfd;
@@ -600,8 +805,15 @@
shmfd->shm_uid = ucred->cr_uid;
shmfd->shm_gid = ucred->cr_gid;
shmfd->shm_mode = mode;
- shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
- shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ if (largepage) {
+ shmfd->shm_object = phys_pager_allocate(NULL,
+ &shm_largepage_phys_ops, NULL, shmfd->shm_size,
+ VM_PROT_DEFAULT, 0, ucred);
+ shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
+ } else {
+ shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
+ shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ }
KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
vfs_timestamp(&shmfd->shm_birthtime);
shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
@@ -663,14 +875,14 @@
return (error);
}
-/*
- * Dictionary management. We maintain an in-kernel dictionary to map
- * paths to shmfd objects. We use the FNV hash on the path to store
- * the mappings in a hash table.
- */
+static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "");
+
static void
shm_init(void *arg)
{
+ char name[32];
+ int i;
mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
sx_init(&shm_dict_lock, "shm dictionary");
@@ -678,9 +890,32 @@
new_unrhdr64(&shm_ino_unr, 1);
shm_dev_ino = devfs_alloc_cdp_inode();
KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
+
+ for (i = 1; i < MAXPAGESIZES; i++) {
+ if (pagesizes[i] == 0)
+ break;
+#define M (1024 * 1024)
+#define G (1024 * M)
+ if (pagesizes[i] >= G)
+ snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
+ else if (pagesizes[i] >= M)
+ snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
+ else
+ snprintf(name, sizeof(name), "%lu", pagesizes[i]);
+#undef G
+#undef M
+ SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
+ OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
+ "number of non-transient largepages allocated");
+ }
}
SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to shmfd objects. We use the FNV hash on the path to store
+ * the mappings in a hash table.
+ */
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
@@ -752,8 +987,9 @@
Fnv32_t fnv;
mode_t cmode;
int error, fd, initial_seals;
+ bool largepage;
- if ((shmflags & ~SHM_ALLOW_SEALING) != 0)
+ if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_LARGEPAGE)) != 0)
return (EINVAL);
initial_seals = F_SEAL_SEAL;
@@ -777,6 +1013,8 @@
if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
return (EINVAL);
+ largepage = (shmflags & SHM_LARGEPAGE) != 0;
+
/*
* Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
* If the decision is made later to allow additional seals, care must be
@@ -810,7 +1048,7 @@
fdrop(fp, td);
return (EINVAL);
}
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode, largepage);
shmfd->shm_seals = initial_seals;
} else {
error = shm_copyin_path(td, userpath, &path);
@@ -832,7 +1070,8 @@
path);
if (error == 0) {
#endif
- shmfd = shm_alloc(td->td_ucred, cmode);
+ shmfd = shm_alloc(td->td_ucred, cmode,
+ largepage);
shmfd->shm_seals = initial_seals;
shm_insert(path, fnv, shmfd);
#ifdef MAC
@@ -1114,7 +1353,95 @@
return (error);
}
-int
+/*
+ * Map a largepage shm object into map.
+ *
+ * The file offset, the size and, if supplied, the address must all be
+ * aligned to the configured superpage size.  Without MAP_FIXED a suitably
+ * aligned free range is searched for, first starting from the hint (moved
+ * past the data segment limit for the current process' map when the hint
+ * is zero or falls within the text/data area), then from the start of the
+ * map.  With MAP_FIXED the target range is either cleared first or, when
+ * MAP_EXCL is set, required to be free.
+ *
+ * Returns 0 on success, EINVAL for an unconfigured page size or bad
+ * alignment/range, ENOSPC when MAP_EXCL finds the range occupied, the
+ * error from kern_mmap_racct_check(), or the translation of the vm_map
+ * status through vm_mmap_to_errno().
+ */
+static int
+shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
+    vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags,
+    vm_ooffset_t foff, bool writecounted, struct thread *td)
+{
+	struct vmspace *vms;
+	vm_map_entry_t next_entry, prev_entry;
+	vm_offset_t mask, maxaddr;
+	int docow, error, rv, try;
+	bool curmap;
+
+	if (shmfd->shm_lp_psind == 0)
+		return (EINVAL);
+
+	vms = td->td_proc->p_vmspace;
+	curmap = map == &vms->vm_map;
+	if (curmap) {
+		error = kern_mmap_racct_check(td, map, size);
+		if (error != 0)
+			return (error);
+	}
+
+	docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT;
+	if ((flags & MAP_NOCORE) != 0)
+		docow |= MAP_DISABLE_COREDUMP;
+	if ((flags & MAP_SHARED) != 0)
+		docow |= MAP_INHERIT_SHARE;
+	if (writecounted)
+		docow |= MAP_WRITECOUNT;
+
+	mask = pagesizes[shmfd->shm_lp_psind] - 1;
+	if ((foff & mask) != 0)
+		return (EINVAL);
+	maxaddr = vm_map_max(map);
+#ifdef MAP_32BIT
+	if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR)
+		maxaddr = MAP_32BIT_MAX_ADDR;
+#endif
+	if (size == 0 || (size & mask) != 0 ||
+	    (*addr != 0 && ((*addr & mask) != 0 ||
+	    *addr + size < *addr || *addr + size > maxaddr)))
+		return (EINVAL);
+
+	vm_map_lock(map);
+	if ((flags & MAP_FIXED) == 0) {
+		try = 1;
+		if (curmap && (*addr == 0 ||
+		    (*addr >= round_page((vm_offset_t)vms->vm_taddr) &&
+		    *addr < round_page((vm_offset_t)vms->vm_daddr +
+		    lim_max(td, RLIMIT_DATA))))) {
+			*addr = roundup2((vm_offset_t)vms->vm_daddr +
+			    lim_max(td, RLIMIT_DATA),
+			    pagesizes[shmfd->shm_lp_psind]);
+		}
+again:
+		rv = vm_map_find_aligned(map, addr, size, maxaddr,
+		    pagesizes[shmfd->shm_lp_psind]);
+		if (rv != KERN_SUCCESS) {
+			if (try == 1) {
+				/*
+				 * Second attempt: search from the start of
+				 * the map, rounded up to the superpage
+				 * boundary.
+				 */
+				try = 2;
+				*addr = vm_map_min(map);
+				if ((*addr & mask) != 0)
+					*addr = (*addr + mask) & ~mask;
+				goto again;
+			}
+			error = vm_mmap_to_errno(rv);
+			goto fail;
+		}
+	} else if ((flags & MAP_EXCL) == 0) {
+		vm_map_delete(map, *addr, *addr + size);
+	} else {
+		error = ENOSPC;
+		if (vm_map_lookup_entry(map, *addr, &prev_entry))
+			goto fail;
+		next_entry = vm_map_entry_succ(prev_entry);
+		if (next_entry->start < *addr + size)
+			goto fail;
+	}
+
+	rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size,
+	    prot, max_prot, docow);
+
+	/*
+	 * Always derive the result from rv: error may still hold the
+	 * provisional ENOSPC of the MAP_EXCL check, or be unset when map
+	 * is not the current process' map.
+	 */
+	error = vm_mmap_to_errno(rv);
+fail:
+	vm_map_unlock(map);
+	return (error);
+}
+
+static int
shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
vm_ooffset_t foff, struct thread *td)
@@ -1186,8 +1513,15 @@
if (writecnt)
vm_pager_update_writecount(shmfd->shm_object, 0, objsize);
- error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
- shmfd->shm_object, foff, writecnt, td);
+ if (shm_largepage(shmfd)) {
+ error = shm_mmap_large(shmfd, map, addr, objsize, prot,
+ maxprot, flags, foff, writecnt, td);
+ } else if ((flags & MAP_LARGEPAGE) != 0) {
+ error = EINVAL;
+ } else {
+ error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
+ shmfd->shm_object, foff, writecnt, td);
+ }
if (error != 0) {
if (writecnt)
vm_pager_release_writecount(shmfd->shm_object, 0,
Index: sys/sys/filio.h
===================================================================
--- sys/sys/filio.h
+++ sys/sys/filio.h
@@ -70,6 +70,7 @@
};
/* Get the file's bmap info for the logical block bn. */
#define FIOBMAP2 _IOWR('f', 99, struct fiobmap2_arg)
+#define FIOSHMLPGCNF _IOW('f', 100, struct shm_largepage_conf)
#ifdef _KERNEL
#ifdef COMPAT_FREEBSD32
Index: sys/sys/mman.h
===================================================================
--- sys/sys/mman.h
+++ sys/sys/mman.h
@@ -107,6 +107,8 @@
#ifdef __LP64__
#define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */
#endif
+#define MAP_LARGEPAGE 0x00100000 /* ensure that mapping uses large TLB
+ entries */
/*
* Request specific alignment (n == log2 of the desired alignment).
@@ -190,6 +192,17 @@
* shmflags for shm_open2()
*/
#define SHM_ALLOW_SEALING 0x00000001
+#define SHM_LARGEPAGE 0x00000002
+
+#define SHM_LARGEPAGE_ALLOC_DEFAULT 0
+#define SHM_LARGEPAGE_ALLOC_NOWAIT 1
+#define SHM_LARGEPAGE_ALLOC_HARD 2
+
+/*
+ * Argument of the FIOSHMLPGCNF ioctl, which configures a largepage shm
+ * object.
+ */
+struct shm_largepage_conf {
+ int psind; /* index into pagesizes[] of the page size to use */
+ int alloc_policy; /* one of SHM_LARGEPAGE_ALLOC_* */
+ int pad[10]; /* presumably reserved for future use - confirm */
+};
/*
* Flags for memfd_create().
@@ -279,6 +292,10 @@
struct mtx shm_mtx;
int shm_seals;
+
+ /* largepage config */
+ int shm_lp_psind;
+ int shm_lp_alloc_policy;
};
#endif
@@ -287,12 +304,15 @@
int shm_unmap(struct file *fp, void *mem, size_t size);
int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
-struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
+struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage);
struct shmfd *shm_hold(struct shmfd *shmfd);
void shm_drop(struct shmfd *shmfd);
int shm_dotruncate(struct shmfd *shmfd, off_t length);
extern struct fileops shm_ops;
+
+#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
+
#else /* !_KERNEL */
__BEGIN_DECLS
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -62,6 +62,7 @@
struct stat;
struct thr_param;
struct uio;
+struct vm_map;
typedef int (*mmap_check_fp_fn)(struct file *, int, int, int);
@@ -195,8 +196,10 @@
size_t len);
int kern_mmap(struct thread *td, uintptr_t addr, size_t len, int prot,
int flags, int fd, off_t pos);
-int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
+int kern_mmap_racct_check(struct thread *td, struct vm_map *map,
+ vm_size_t size);
int kern_mmap_maxprot(struct proc *p, int prot);
+int kern_mmap_req(struct thread *td, const struct mmap_req *mrp);
int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
int kern_msgctl(struct thread *, int, int, struct msqid_ds *);
int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
Index: sys/vm/phys_pager.c
===================================================================
--- sys/vm/phys_pager.c
+++ sys/vm/phys_pager.c
@@ -51,6 +51,20 @@
/* protect access to phys_pager_object_list */
static struct mtx phys_pager_mtx;
+static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m,
+ int count, int *rbehind, int *rahead);
+static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last);
+static boolean_t default_phys_pager_haspage(vm_object_t object,
+ vm_pindex_t pindex, int *before, int *after);
+struct phys_pager_ops default_phys_pg_ops = { /* ops passed by phys_pager_alloc() */
+ .phys_pg_getpages = default_phys_pager_getpages,
+ .phys_pg_populate = default_phys_pager_populate,
+ .phys_pg_haspage = default_phys_pager_haspage,
+ .phys_pg_ctor = NULL, /* NULL: phys_pager_allocate() skips the ctor call */
+ .phys_pg_dtor = NULL, /* NULL: phys_pager_dealloc() skips the dtor call */
+};
+
static void
phys_pager_init(void)
{
@@ -59,12 +73,13 @@
mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF);
}
-static vm_object_t
-phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
- vm_ooffset_t foff, struct ucred *cred)
+vm_object_t
+phys_pager_allocate(void *handle, struct phys_pager_ops *ops, void *data,
+ vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
vm_object_t object, object1;
vm_pindex_t pindex;
+ bool init;
/*
* Offset should be page aligned.
@@ -73,6 +88,7 @@
return (NULL);
pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
+ init = true;
if (handle != NULL) {
mtx_lock(&phys_pager_mtx);
@@ -97,11 +113,15 @@
*/
if (pindex > object->size)
object->size = pindex;
+ init = false;
} else {
object = object1;
object1 = NULL;
object->handle = handle;
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data_ptr = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
TAILQ_INSERT_TAIL(&phys_pager_object_list,
object, pager_object_list);
}
@@ -113,12 +133,25 @@
vm_object_deallocate(object1);
} else {
object = vm_object_allocate(OBJT_PHYS, pindex);
- vm_object_set_flag(object, OBJ_POPULATE);
+ object->un_pager.phys.ops = ops;
+ object->un_pager.phys.data_ptr = data;
+ if (ops->phys_pg_populate != NULL)
+ vm_object_set_flag(object, OBJ_POPULATE);
}
+ if (init && ops->phys_pg_ctor != NULL)
+ ops->phys_pg_ctor(object, prot, foff, cred);
return (object);
}
+/*
+ * pgo_alloc method of physpagerops: allocate a phys object that uses the
+ * default pager ops and carries no private data pointer.
+ */
+static vm_object_t
+phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
+    vm_ooffset_t foff, struct ucred *ucred)
+{
+	vm_object_t object;
+
+	object = phys_pager_allocate(handle, &default_phys_pg_ops, NULL, size,
+	    prot, foff, ucred);
+	return (object);
+}
+
static void
phys_pager_dealloc(vm_object_t object)
{
@@ -130,16 +163,18 @@
mtx_unlock(&phys_pager_mtx);
VM_OBJECT_WLOCK(object);
}
- object->handle = NULL;
object->type = OBJT_DEAD;
+ if (object->un_pager.phys.ops->phys_pg_dtor != NULL)
+ object->un_pager.phys.ops->phys_pg_dtor(object);
+ object->handle = NULL;
}
/*
* Fill as many pages as vm_fault has allocated for us.
*/
static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
- int *rahead)
+default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+ int *rbehind, int *rahead)
{
int i;
@@ -161,6 +196,14 @@
return (VM_PAGER_OK);
}
+/*
+ * Forward the request to the getpages method of the object's phys pager ops.
+ */
+static int
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
+{
+	struct phys_pager_ops *ops;
+
+	ops = object->un_pager.phys.ops;
+	return (ops->phys_pg_getpages(object, m, count, rbehind, rahead));
+}
+
/*
* Implement a pretty aggressive clustered getpages strategy. Hint that
* everything in an entire 4MB window should be prefaulted at once.
@@ -185,7 +228,7 @@
#define PHYSALLOC 16
static int
-phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
vm_pindex_t *last)
{
@@ -216,6 +259,14 @@
return (VM_PAGER_OK);
}
+/*
+ * Forward the request to the populate method of the object's phys pager ops.
+ */
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+	struct phys_pager_ops *ops;
+
+	ops = object->un_pager.phys.ops;
+	return (ops->phys_pg_populate(object, pidx, fault_type, max_prot,
+	    first, last));
+}
+
static void
phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
int *rtvals)
@@ -225,7 +276,7 @@
}
static boolean_t
-phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
int *after)
{
vm_pindex_t base, end;
@@ -239,6 +290,14 @@
return (TRUE);
}
+/*
+ * Forward the query to the haspage method of the object's phys pager ops.
+ */
+static boolean_t
+phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
+{
+	struct phys_pager_ops *ops;
+
+	ops = object->un_pager.phys.ops;
+	return (ops->phys_pg_haspage(object, pindex, before, after));
+}
+
struct pagerops physpagerops = {
.pgo_init = phys_pager_init,
.pgo_alloc = phys_pager_alloc,
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -106,6 +106,7 @@
*/
#define PMAP_ENTER_NOSLEEP 0x00000100
#define PMAP_ENTER_WIRED 0x00000200
+#define PMAP_ENTER_LARGEPAGE 0x00000400
#define PMAP_ENTER_RESERVED 0xFF000000
/*
@@ -171,5 +172,8 @@
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
+extern u_long pmap_superpagesize[];
+extern u_int pmap_superpagesize_nitems;
+
#endif /* _KERNEL */
#endif /* _PMAP_VM_ */
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -424,7 +424,7 @@
vm_offset_t vaddr;
vm_page_t m;
vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
- int i, npages, psind, rv;
+ int bdry_idx, i, npages, psind, rv;
MPASS(fs->object == fs->first_object);
VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
@@ -469,15 +469,47 @@
MPASS(pager_last < fs->first_object->size);
vm_fault_restore_map_lock(fs);
+ bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+ MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
if (fs->map->timestamp != fs->map_generation) {
- vm_fault_populate_cleanup(fs->first_object, pager_first,
- pager_last);
+ if (bdry_idx == 0) {
+ vm_fault_populate_cleanup(fs->first_object, pager_first,
+ pager_last);
+ } else {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ vm_page_xunbusy(m);
+ }
return (KERN_RESTART);
}
/*
* The map is unchanged after our last unlock. Process the fault.
*
+ * First, the special case of largepage mappings, where
+ * populate only busies the first page in superpage run.
+ */
+ if (bdry_idx != 0) {
+ m = vm_page_lookup(fs->first_object, pager_first);
+ vm_fault_populate_check_page(m);
+ VM_OBJECT_WUNLOCK(fs->first_object);
+ vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
+ fs->entry->offset;
+ /* assert alignment for entry */
+ KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
+ ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
+ (uintmax_t)fs->entry->start, (uintmax_t)pager_first,
+ (uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
+ KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
+ ("unaligned superpage m %p %#jx", m,
+ (uintmax_t)VM_PAGE_TO_PHYS(m)));
+ rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
+ fs->fault_type | PMAP_ENTER_LARGEPAGE, bdry_idx);
+ VM_OBJECT_WLOCK(fs->first_object);
+ vm_page_xunbusy(m);
+ /*
+ * Do not report the fault as handled when the superpage
+ * mapping was not installed. The object is write-locked
+ * again here, as on the KERN_RESTART return above.
+ */
+ if (rv != KERN_SUCCESS)
+ return (KERN_FAILURE);
+ goto out;
+ }
+
+ /*
* The range [pager_first, pager_last] that is given to the
* pager is only a hint. The pager may populate any range
* within the object that includes the requested page index.
@@ -543,6 +575,7 @@
vm_page_xunbusy(&m[i]);
}
}
+out:
curthread->td_ru.ru_majflt++;
return (KERN_SUCCESS);
}
Index: sys/vm/vm_map.h
===================================================================
--- sys/vm/vm_map.h
+++ sys/vm/vm_map.h
@@ -149,6 +149,10 @@
#define MAP_ENTRY_STACK_GAP_UP 0x00040000
#define MAP_ENTRY_HEADER 0x00080000
+#define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000
+
+#define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20
+
#ifdef _KERNEL
static __inline u_char
vm_map_entry_behavior(vm_map_entry_t entry)
@@ -363,6 +367,9 @@
#define MAP_CREATE_STACK_GAP_UP 0x00010000
#define MAP_CREATE_STACK_GAP_DN 0x00020000
#define MAP_VN_EXEC 0x00040000
+#define MAP_SPLIT_BOUNDARY_MASK 0x00180000
+
+#define MAP_SPLIT_BOUNDARY_SHIFT 19
/*
* vm_fault option flags
@@ -451,6 +458,8 @@
vm_offset_t, int, vm_prot_t, vm_prot_t, int);
int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+ vm_offset_t max_addr, vm_offset_t alignment);
int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
vm_prot_t, vm_prot_t, int);
vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -1603,13 +1603,17 @@
struct ucred *cred;
vm_eflags_t protoeflags;
vm_inherit_t inheritance;
+ u_long bdry;
+ u_int bidx;
VM_MAP_ASSERT_LOCKED(map);
KASSERT(object != kernel_object ||
(cow & MAP_COPY_ON_WRITE) == 0,
("vm_map_insert: kernel object and COW"));
- KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
- ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+ KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
+ (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
+ ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
+ object, cow));
KASSERT((prot & ~max) == 0,
("prot %#x is not subset of max_prot %#x", prot, max));
@@ -1665,6 +1669,17 @@
inheritance = VM_INHERIT_SHARE;
else
inheritance = VM_INHERIT_DEFAULT;
+ if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
+ /* This magically ignores index 0, for usual page size. */
+ bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
+ MAP_SPLIT_BOUNDARY_SHIFT;
+ if (bidx >= MAXPAGESIZES)
+ return (KERN_INVALID_ARGUMENT);
+ bdry = pagesizes[bidx] - 1;
+ if ((start & bdry) != 0 || (end & bdry) != 0)
+ return (KERN_INVALID_ARGUMENT);
+ protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+ }
cred = NULL;
if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
@@ -1959,8 +1974,6 @@
&aslr_restarts, 0,
"Number of aslr failures");
-#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
-
/*
* Searches for the specified amount of free space in the given map with the
* specified alignment. Performs an address-ordered, first-fit search from
@@ -2028,6 +2041,19 @@
}
}
+/*
+ * Ask vm_map_findspace() for a candidate range of "length" bytes starting
+ * the search at *addr, then pass the candidate to vm_map_alignspace() with
+ * the requested alignment.  Returns KERN_NO_SPACE when the candidate would
+ * end past the map's maximum address or past a non-zero "max_addr";
+ * otherwise returns whatever vm_map_alignspace() returns.
+ */
+int
+vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
+    vm_offset_t max_addr, vm_offset_t alignment)
+{
+	vm_offset_t end;
+
+	/* XXXKIB ASLR eh ? */
+	*addr = vm_map_findspace(map, *addr, length);
+	end = *addr + length;
+	if (end > vm_map_max(map))
+		return (KERN_NO_SPACE);
+	if (max_addr != 0 && end > max_addr)
+		return (KERN_NO_SPACE);
+	return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
+	    alignment));
+}
+
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
@@ -2370,19 +2396,6 @@
return (new_entry);
}
-/*
- * vm_map_clip_start: [ internal use only ]
- *
- * Asserts that the given entry begins at or after
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_start(map, entry, startaddr) \
-{ \
- if (startaddr > entry->start) \
- _vm_map_clip_start(map, entry, startaddr); \
-}
-
/*
* This routine is called only when it is known that
* the entry must be split.
@@ -2406,6 +2419,30 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ *	vm_map_clip_start:	[ internal use only ]
+ *
+ *	Make sure the given entry does not begin before "startaddr",
+ *	splitting it in two when it does.  An entry that carries a
+ *	non-zero split-boundary index may only be split at an address
+ *	aligned to pagesizes[index]; otherwise KERN_INVALID_ARGUMENT is
+ *	returned and the entry is left as it was.  Returns KERN_SUCCESS
+ *	in every other case.
+ */
+static inline int
+vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
+{
+	int idx;
+
+	if (entry->start >= startaddr)
+		return (KERN_SUCCESS);
+
+	/* Index 0 means the entry has no split restriction. */
+	idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+	if (idx != 0 && (startaddr & (pagesizes[idx] - 1)) != 0)
+		return (KERN_INVALID_ARGUMENT);
+
+	_vm_map_clip_start(map, entry, startaddr);
+	return (KERN_SUCCESS);
+}
+
/*
* vm_map_lookup_clip_start:
*
@@ -2413,32 +2450,23 @@
* the interior of the entry. Return entry after 'start', and in
* prev_entry set the entry before 'start'.
*/
-static inline vm_map_entry_t
+static inline int
vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
- vm_map_entry_t *prev_entry)
+ vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
{
vm_map_entry_t entry;
+ int rv;
if (vm_map_lookup_entry(map, start, prev_entry)) {
entry = *prev_entry;
- vm_map_clip_start(map, entry, start);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ return (rv);
*prev_entry = vm_map_entry_pred(entry);
} else
entry = vm_map_entry_succ(*prev_entry);
- return (entry);
-}
-
-/*
- * vm_map_clip_end: [ internal use only ]
- *
- * Asserts that the given entry ends at or before
- * the specified address; if necessary,
- * it splits the entry into two.
- */
-#define vm_map_clip_end(map, entry, endaddr) \
-{ \
- if ((endaddr) < (entry->end)) \
- _vm_map_clip_end((map), (entry), (endaddr)); \
+ *res_entry = entry;
+ return (KERN_SUCCESS);
}
/*
@@ -2464,6 +2492,30 @@
vm_map_entry_link(map, new_entry);
}
+/*
+ *	vm_map_clip_end:	[ internal use only ]
+ *
+ *	Make sure the given entry does not extend past "endaddr",
+ *	splitting it in two when it does.  An entry that carries a
+ *	non-zero split-boundary index may only be split at an address
+ *	aligned to pagesizes[index]; otherwise KERN_INVALID_ARGUMENT is
+ *	returned and the entry is left as it was.  Returns KERN_SUCCESS
+ *	in every other case.
+ */
+static inline int
+vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
+{
+	int idx;
+
+	if (entry->end <= endaddr)
+		return (KERN_SUCCESS);
+
+	/* Index 0 means the entry has no split restriction. */
+	idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
+	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
+	if (idx != 0 && (endaddr & (pagesizes[idx] - 1)) != 0)
+		return (KERN_INVALID_ARGUMENT);
+
+	_vm_map_clip_end(map, entry, endaddr);
+	return (KERN_SUCCESS);
+}
+
/*
* vm_map_submap: [ kernel use only ]
*
@@ -2503,12 +2555,17 @@
if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
(entry->eflags & MAP_ENTRY_COW) == 0 &&
entry->object.vm_object == NULL) {
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ result = vm_map_clip_start(map, entry, start);
+ if (result != KERN_SUCCESS)
+ goto unlock;
+ result = vm_map_clip_end(map, entry, end);
+ if (result != KERN_SUCCESS)
+ goto unlock;
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
+unlock:
vm_map_unlock(map);
if (result != KERN_SUCCESS) {
@@ -2695,11 +2752,18 @@
* of this loop early and let the next loop simplify the entries, since
* some may now be mergeable.
*/
- rv = KERN_SUCCESS;
- vm_map_clip_start(map, first_entry, start);
+ rv = vm_map_clip_start(map, first_entry, start);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
for (entry = first_entry; entry->start < end;
entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (rv);
+ }
if (set_max ||
((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
@@ -2819,6 +2883,7 @@
int behav)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
bool modify_map;
/*
@@ -2864,13 +2929,22 @@
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
+
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
continue;
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS) {
+ vm_map_unlock(map);
+ return (vm_mmap_to_errno(rv));
+ }
switch (behav) {
case MADV_NORMAL:
@@ -3005,6 +3079,7 @@
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry, prev_entry;
+ int rv;
switch (new_inheritance) {
case VM_INHERIT_NONE:
@@ -3015,14 +3090,19 @@
default:
return (KERN_INVALID_ARGUMENT);
}
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
- for (entry = vm_map_lookup_clip_start(map, start, &prev_entry);
- entry->start < end;
- prev_entry = entry, entry = vm_map_entry_succ(entry)) {
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
+ for (; entry->start < end; prev_entry = entry,
+ entry = vm_map_entry_succ(entry)) {
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ goto unlock;
if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
new_inheritance != VM_INHERIT_ZERO)
entry->inheritance = new_inheritance;
@@ -3030,7 +3110,8 @@
}
vm_map_try_merge_entries(map, prev_entry, entry);
+ /* Clip failures jump here with the map lock still held. */
+unlock:
vm_map_unlock(map);
- return (KERN_SUCCESS);
+ return (rv);
}
/*
@@ -3129,8 +3210,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3337,8 +3423,13 @@
next_entry : NULL;
continue;
}
- vm_map_clip_start(map, entry, start);
- vm_map_clip_end(map, entry, end);
+ rv = vm_map_clip_start(map, entry, start);
+ if (rv != KERN_SUCCESS)
+ break;
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
+
/*
* Mark the entry in case the map lock is released. (See
* above.)
@@ -3743,18 +3834,22 @@
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
- vm_map_entry_t entry, next_entry;
+ vm_map_entry_t entry, next_entry, scratch_entry;
+ int rv;
VM_MAP_ASSERT_LOCKED(map);
+ rv = KERN_SUCCESS;
if (start == end)
- return (KERN_SUCCESS);
+ return (rv);
/*
* Find the start of the region, and clip it.
* Step through all entries in this region.
*/
- for (entry = vm_map_lookup_clip_start(map, start, &entry);
- entry->start < end; entry = next_entry) {
+ rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ return (rv);
+ for (; entry->start < end; entry = next_entry) {
/*
* Wait for wiring or unwiring of an entry to complete.
* Also wait for any system wirings to disappear on
@@ -3778,13 +3873,19 @@
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
- next_entry = vm_map_lookup_clip_start(map,
- saved_start, &next_entry);
+ rv = vm_map_lookup_clip_start(map, saved_start,
+ &next_entry, &scratch_entry);
+ if (rv != KERN_SUCCESS)
+ break;
} else
next_entry = entry;
continue;
}
- vm_map_clip_end(map, entry, end);
+
+ /* XXXKIB or delete to the upper superpage boundary ? */
+ rv = vm_map_clip_end(map, entry, end);
+ if (rv != KERN_SUCCESS)
+ break;
next_entry = vm_map_entry_succ(entry);
/*
@@ -3814,7 +3915,7 @@
*/
vm_map_entry_delete(map, entry);
}
- return (KERN_SUCCESS);
+ return (rv);
}
/*
Index: sys/vm/vm_mmap.c
===================================================================
--- sys/vm/vm_mmap.c
+++ sys/vm/vm_mmap.c
@@ -285,7 +285,7 @@
}
if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
- MAP_PREFAULT_READ | MAP_GUARD |
+ MAP_PREFAULT_READ | MAP_GUARD | MAP_LARGEPAGE |
#ifdef MAP_32BIT
MAP_32BIT |
#endif
@@ -305,6 +305,10 @@
#endif
MAP_ALIGNMENT_MASK)) != 0))
return (EINVAL);
+ /*
+ * MAP_LARGEPAGE may only be combined with the flags listed here.
+ * MAP_32BIT is not defined on every platform, so it is guarded
+ * the same way as in the general flag check above.
+ */
+ if ((flags & MAP_LARGEPAGE) != 0 && (flags & ~(MAP_LARGEPAGE |
+ MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_EXCL |
+#ifdef MAP_32BIT
+ MAP_32BIT |
+#endif
+ MAP_NOCORE)) != 0)
+ return (EINVAL);
/*
* Align the file position to a page boundary,
@@ -368,10 +372,10 @@
* There should really be a pmap call to determine a reasonable
* location.
*/
- if (addr == 0 ||
+ if ((flags & MAP_LARGEPAGE) == 0 && (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr +
- lim_max(td, RLIMIT_DATA))))
+ lim_max(td, RLIMIT_DATA)))))
addr = round_page((vm_offset_t)vms->vm_daddr +
lim_max(td, RLIMIT_DATA));
}
@@ -418,6 +422,10 @@
error = EINVAL;
goto done;
}
+ if ((flags & MAP_LARGEPAGE) != 0 && fp->f_ops != &shm_ops) {
+ error = EINVAL;
+ goto done;
+ }
if (check_fp_fn != NULL) {
error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
flags);
@@ -1511,6 +1519,39 @@
return (error);
}
+/*
+ * Check that growing "map" by "size" bytes stays within RLIMIT_VMEM and
+ * the RACCT_VMEM resource and, when MAP_WIREFUTURE is set on the map and
+ * old_mlock is clear, within RLIMIT_MEMLOCK and the RACCT_MEMLOCK
+ * resource as well.  Returns 0 on success or an errno value; on a
+ * MEMLOCK failure RACCT_VMEM is forced back to the current map size.
+ */
+int
+kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
+{
+	struct proc *p;
+	int error;
+
+	p = td->td_proc;
+	error = 0;
+
+	RACCT_PROC_LOCK(p);
+	if (map->size + size > lim_cur(td, RLIMIT_VMEM) ||
+	    racct_set(p, RACCT_VMEM, map->size + size) != 0) {
+		error = ENOMEM;
+		goto out;
+	}
+	if (old_mlock || (map->flags & MAP_WIREFUTURE) == 0)
+		goto out;
+
+	if (ptoa(pmap_wired_count(map->pmap)) + size >
+	    lim_cur(td, RLIMIT_MEMLOCK))
+		error = ENOMEM;
+	else
+		error = racct_set(p, RACCT_MEMLOCK,
+		    ptoa(pmap_wired_count(map->pmap)) + size);
+	/* Reset RACCT_VMEM to the current map size on failure. */
+	if (error != 0)
+		racct_set_force(p, RACCT_VMEM, map->size);
+out:
+	RACCT_PROC_UNLOCK(p);
+	return (error);
+}
+
/*
* Internal version of mmap that maps a specific VM object into an
* map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
@@ -1520,39 +1561,15 @@
vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
boolean_t writecounted, struct thread *td)
{
- boolean_t curmap, fitit;
vm_offset_t max_addr;
int docow, error, findspace, rv;
+ bool curmap, fitit;
curmap = map == &td->td_proc->p_vmspace->vm_map;
if (curmap) {
- RACCT_PROC_LOCK(td->td_proc);
- if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- if (!old_mlock && map->flags & MAP_WIREFUTURE) {
- if (ptoa(pmap_wired_count(map->pmap)) + size >
- lim_cur(td, RLIMIT_MEMLOCK)) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (ENOMEM);
- }
- error = racct_set(td->td_proc, RACCT_MEMLOCK,
- ptoa(pmap_wired_count(map->pmap)) + size);
- if (error != 0) {
- racct_set_force(td->td_proc, RACCT_VMEM,
- map->size);
- RACCT_PROC_UNLOCK(td->td_proc);
- return (error);
- }
- }
- RACCT_PROC_UNLOCK(td->td_proc);
+ error = kern_mmap_racct_check(td, map, size);
+ if (error != 0)
+ return (error);
}
/*
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -173,6 +173,17 @@
struct pctrie swp_blks;
vm_ooffset_t writemappings;
} swp;
+
+ /*
+ * Phys pager
+ */
+ struct {
+ struct phys_pager_ops *ops;
+ union {
+ void *data_ptr;
+ uintptr_t data_val;
+ };
+ } phys;
} un_pager;
struct ucred *cred;
vm_ooffset_t charge;
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -289,6 +289,7 @@
kernel_object->flags |= OBJ_COLORED;
kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
+ kernel_object->un_pager.phys.ops = &default_phys_pg_ops;
/*
* The lock portion of struct vm_object must be type stable due
Index: sys/vm/vm_pager.h
===================================================================
--- sys/vm/vm_pager.h
+++ sys/vm/vm_pager.h
@@ -229,5 +229,22 @@
vm_object_t cdev_pager_lookup(void *handle);
void cdev_pager_free_page(vm_object_t object, vm_page_t m);
+struct phys_pager_ops {
+ int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count,
+ int *rbehind, int *rahead); /* called by phys_pager_getpages() without a NULL check */
+ int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+ int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+ vm_pindex_t *last); /* if NULL, phys_pager_allocate() does not set OBJ_POPULATE */
+ boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex,
+ int *before, int *after); /* called by phys_pager_haspage() without a NULL check */
+ void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot,
+ vm_ooffset_t foff, struct ucred *cred); /* optional; not run when an existing object is reused */
+ void (*phys_pg_dtor)(vm_object_t vm_obj); /* optional; run from phys_pager_dealloc() */
+};
+extern struct phys_pager_ops default_phys_pg_ops;
+vm_object_t phys_pager_allocate(void *handle, struct phys_pager_ops *ops,
+ void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff,
+ struct ucred *cred);
+
#endif /* _KERNEL */
#endif /* _VM_PAGER_ */
Index: usr.bin/posixshmcontrol/posixshmcontrol.c
===================================================================
--- usr.bin/posixshmcontrol/posixshmcontrol.c
+++ usr.bin/posixshmcontrol/posixshmcontrol.c
@@ -349,6 +349,8 @@
(long)st.st_ctim.tv_nsec);
printf("birth\t%ld.%09ld\n", (long)st.st_birthtim.tv_sec,
(long)st.st_birthtim.tv_nsec);
+ /* st_blocks may be 0 (e.g. a zero-length object); do not divide by it. */
+ if (st.st_blocks != 0)
+ printf("pagesz\t%jd\n", roundup((uintmax_t)st.st_size,
+ PAGE_SIZE) / st.st_blocks);
}
close(fd);
return (ret);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 13, 1:12 PM (9 h, 56 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28337941
Default Alt Text
D24652.1776085949.diff (58 KB)
Attached To
Mode
D24652: Non-transparent superpages support.
Attached
Detach File
Event Timeline
Log In to Comment