Page MenuHomeFreeBSD

No OneTemporary

Size
147 KB
Referenced Files
None
Subscribers
None
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index c629db566528..6eda6c9c8352 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1,1892 +1,1902 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <net/netisr.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif
#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>
/*
* Sanity check for __curthread(): pc_curthread must be the very first member
* of struct pcpu (offset 0).
*/
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
/*
* The PTI trampoline stack needs enough space for a hardware trapframe and a
* couple of scratch registers, as well as the trapframe left behind after an
* iret fault.
*/
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
offsetof(struct pti_frame, pti_rip));
/* Early machine-dependent entry point; it is defined later in this file. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
/* cpu_startup() is registered to run first within the SI_SUB_CPU subsystem. */
static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
+/* Probe 8254 PIT and TSC. */
+static void native_clock_source_init(void);
+
/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);
/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
/* Default init_ops implementation. */
struct init_ops init_ops = {
- .parse_preload_data = native_parse_preload_data,
- .early_clock_source_init = i8254_init,
+ .parse_preload_data = native_parse_preload_data,
+ .early_clock_source_init = native_clock_source_init,
.early_delay = i8254_delay,
.parse_memmap = native_parse_memmap,
};
/*
* Physical address of the EFI System Table. Stashed from the metadata hints
* passed into the kernel and used by the EFI code to call runtime services.
*/
vm_paddr_t efi_systbl_phys;
/*
 * Intel ICH registers, accessed with inl()/outl() by cpu_startup().
 *
 * ICH_SMI_EN is parenthesized so that it expands to a single value no
 * matter which operators surround it at the point of use.
 */
#define	ICH_PMBASE	0x400
#define	ICH_SMI_EN	(ICH_PMBASE + 0x30)
/*
* Segment selector values.
* NOTE(review): they are assigned outside the part of the file shown here;
* presumably user data, code, 32-bit code, %fs and %gs selectors -- confirm.
*/
int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
/* NOTE(review): starts out as 1; presumably cleared once boot is done. */
int cold = 1;
/*
* One larger than the highest page of the physical address space; computed
* and later trimmed by getmemsize().
*/
long Maxmem = 0;
/* Number of pages in the "real memory" size printed by cpu_startup(). */
long realmem = 0;
/* Passed to vm_ksubmap_init() from cpu_startup(). */
struct kva_md_info kmi;
struct region_descriptor r_idt;
struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
struct mtx dt_lock; /* lock for GDT and LDT */
void (*vmm_resume_p)(void);
/* True when the loader handed over an EFI memory map; set by hammer_time(). */
bool efi_boot;
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
char *sysenv;
/*
* On MacBooks, we need to disallow the legacy USB circuit to
* generate an SMI# because this can cause several problems,
* namely: incorrect CPU frequency detection and failure to
* start the APs.
* We do this by disabling a bit in the SMI_EN (SMI Control and
* Enable register) of the Intel ICH LPC Interface Bridge.
*/
sysenv = kern_getenv("smbios.system.product");
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
strncmp(sysenv, "MacBook4,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
"Intel ICH.\n");
outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
}
freeenv(sysenv);
}
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
/*
* Display physical memory if SMBIOS reports reasonable amount.
*/
memsize = 0;
sysenv = kern_getenv("smbios.memory.enabled");
if (sysenv != NULL) {
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
if (memsize < ptoa((uintmax_t)vm_free_count()))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
realmem = atop(memsize);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)vm_free_count()),
ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
if (bootverbose && intel_graphics_stolen_base != 0)
printf("intel stolen mem: base %#jx size %ju MB\n",
(uintmax_t)intel_graphics_stolen_base,
(uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
cpu_setregs();
}
/*
* SYSINIT hook that calls link_elf_late_ireloc().  It is registered in the
* SI_SUB_CPU subsystem with SI_ORDER_ANY; the argument is unused.
*/
static void
late_ifunc_resolve(void *dummy __unused)
{
link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
/*
 * Turn on the CR0 bits the kernel runs with (MP, NE, TS, WP and AM) while
 * leaving every other bit of the current CR0 value untouched.
 */
void
cpu_setregs(void)
{
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	register_t updated = rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP |
	    CR0_AM;

	load_cr0(updated);
}
/*
* Initialize amd64 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
/* Backing storage for the interrupt descriptor table (NIDT gates). */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
/*
* Dedicated, 16-byte aligned stacks installed into the TSS IST slots by
* amd64_bsp_ist_init(): double fault (ist1), NMI (ist2), machine check (ist3)
* and debug (ist4).
*/
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
/* A struct nmi_pcpu is carved from the top of each of the stacks above. */
CTASSERT(sizeof(struct nmi_pcpu) == 16);
/*
* Software prototypes -- in more palatable form.
*
* Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
* slots as corresponding segments for i386 kernel.
*
* The number of entries must equal NGDT; see the assertion after the table.
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GNULL2_SEL 1 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUFS32_SEL 2 32 bit %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS32_SEL 3 32 bit %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 8 64 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
.ssd_type = SDT_SYSTSS,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/*
* Slot 10: the TSS is a system descriptor, which is double size, so this
* slot is the second half of GPROC0_SEL.
*/
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 LDT Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 12 LDT Descriptor, double size (second half) */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
/* Catch a table that has drifted out of sync with NGDT at compile time. */
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
/*
 * Fill in IDT slot 'idx' so that it dispatches to 'func'.
 *
 * The gate uses the kernel code selector and is marked present; 'typ',
 * 'dpl' and 'ist' are stored in the gate's type, privilege-level and IST
 * fields.  The handler address is split between the gd_looffset field and
 * gd_hioffset, which receives the address shifted right by 16.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *gate = &idt[idx];
	const uintptr_t handler = (uintptr_t)func;

	gate->gd_looffset = handler;
	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	gate->gd_ist = ist;
	gate->gd_xx = 0;
	gate->gd_type = typ;
	gate->gd_dpl = dpl;
	gate->gd_p = 1;
	gate->gd_hioffset = handler >> 16;
}
/*
* Interrupt/exception/syscall entry points defined outside this file.  Most
* handlers have a "_pti" twin; amd64_conf_fast_syscall() picks
* fast_syscall_pti over fast_syscall when 'pti' is set.
*/
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
IDTVEC(div_pti), IDTVEC(bpt_pti),
IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
IDTVEC(fast_syscall_pti);
#ifdef DDB
/*
 * DDB "show idt": walk all NIDT gates and, for every gate whose handler is
 * not the default 'rsvd' entry point, print the vector number followed by
 * the handler's symbol.  The walk stops early when the pager is quit.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *gate;
	uintptr_t handler;
	int vec;

	for (vec = 0, gate = idt; vec < NIDT && !db_pager_quit;
	    vec++, gate++) {
		/* Reassemble the handler address from its two halves. */
		handler = ((long)gate->gd_hioffset << 16 | gate->gd_looffset);
		if (handler == (uintptr_t)&IDTVEC(rsvd))
			continue;
		db_printf("%3d\t", vec);
		db_printsym(handler, DB_STGY_PROC);
		db_printf("\n");
	}
}
/*
* Show privileged registers.
*
* DDB "show sysregs": prints IDTR, GDTR, LDTR and TR, the control registers
* CR0/CR2/CR3/CR4, XCR0 when CR4_XSAVE is set in CR4, and a selection of MSRs.
*/
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
/* Memory image stored by sidt/sgdt: 16-bit limit followed by 64-bit base. */
struct {
uint16_t limit;
uint64_t base;
} __packed idtr, gdtr;
uint16_t ldt, tr;
__asm __volatile("sidt %0" : "=m" (idtr));
db_printf("idtr\t0x%016lx/%04x\n",
(u_long)idtr.base, (u_int)idtr.limit);
__asm __volatile("sgdt %0" : "=m" (gdtr));
db_printf("gdtr\t0x%016lx/%04x\n",
(u_long)gdtr.base, (u_int)gdtr.limit);
__asm __volatile("sldt %0" : "=r" (ldt));
db_printf("ldtr\t0x%04x\n", ldt);
__asm __volatile("str %0" : "=r" (tr));
db_printf("tr\t0x%04x\n", tr);
db_printf("cr0\t0x%016lx\n", rcr0());
db_printf("cr2\t0x%016lx\n", rcr2());
db_printf("cr3\t0x%016lx\n", rcr3());
db_printf("cr4\t0x%016lx\n", rcr4());
/* XCR0 is only read when CR4 has the XSAVE bit set. */
if (rcr4() & CR4_XSAVE)
db_printf("xcr0\t0x%016lx\n", rxcr(0));
db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
/* The feature-control MSR is only read when VMX or SMX is advertised. */
if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
db_printf("FEATURES_CTL\t%016lx\n",
rdmsr(MSR_IA32_FEATURE_CONTROL));
db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
/*
* DDB "show dbregs": print the debug registers DR0-DR3, DR6 and DR7 as
* 64-bit hexadecimal values.
*/
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{
db_printf("dr0\t0x%016lx\n", rdr0());
db_printf("dr1\t0x%016lx\n", rdr1());
db_printf("dr2\t0x%016lx\n", rdr2());
db_printf("dr3\t0x%016lx\n", rdr3());
db_printf("dr6\t0x%016lx\n", rdr6());
db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif
/*
 * Unpack the user segment descriptor 'sd' into the software descriptor
 * 'ssd'.
 *
 * The base and limit, which the hardware layout stores in split fields,
 * are reassembled into single values; the type, DPL, present, long,
 * def32 and granularity fields are copied across unchanged.
 */
void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}
/*
 * Pack the software descriptor 'ssd' into the user segment descriptor
 * 'sd'.
 *
 * The base is split into its low 24 bits and the following 8 bits, the
 * limit into its low 16 bits and the following 4 bits.  The remaining
 * attribute fields are copied across unchanged.
 */
void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}
/*
 * Pack the software descriptor 'ssd' into the system segment descriptor
 * 'sd'.
 *
 * Compared with ssdtosd(), the high part of the base is 40 bits wide
 * rather than 8, and the long and def32 fields are not copied.
 */
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}
/*
* Size of "base memory" in units of 1024 bytes.  getmemsize() derives it from
* the physical map and forces it to 640 when no plausible value is found.
*/
u_int basemem;
/*
 * Record the range [base, base + length) in 'physmap', an array of
 * start/end pairs kept sorted by address.  '*physmap_idxp' is the index of
 * the next free slot and is advanced when a new pair is created.
 *
 * A zero-length range is ignored, and a range overlapping an existing
 * pair is dropped.  A range that is directly adjacent to an existing pair
 * is merged into it instead of taking a slot of its own.
 *
 * Returns 1 when the caller may go on adding entries, and 0 when the map
 * has reached PHYS_AVAIL_ENTRIES and the range was not stored.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	const uint64_t limit = base + length;
	int nused, pos, slot;

	nused = *physmap_idxp;
	if (length == 0)
		return (1);

	/*
	 * Look for the first pair that ends above 'base'.  The new range
	 * either fits completely below that pair or collides with it.  If
	 * no such pair exists the range goes to the end.
	 *
	 * NB: 'nused' points to the next free slot.
	 */
	pos = nused;
	for (slot = 0; slot <= nused; slot += 2) {
		if (base >= physmap[slot + 1])
			continue;
		if (limit > physmap[slot]) {
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
		pos = slot;
		break;
	}

	/* The range ends exactly where the following pair starts. */
	if (pos <= nused && limit == physmap[pos]) {
		physmap[pos] = base;
		return (1);
	}

	/* The range starts exactly where the preceding pair ends. */
	if (pos > 0 && base == physmap[pos - 1]) {
		physmap[pos - 1] += length;
		return (1);
	}

	/* A fresh pair is needed; account for it before the limit check. */
	nused += 2;
	*physmap_idxp = nused;
	if (nused == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/* Shift the pairs at and after 'pos' up by one pair. */
	for (slot = nused - 2; slot > pos; slot -= 2) {
		physmap[slot] = physmap[slot - 2];
		physmap[slot + 1] = physmap[slot - 1];
	}

	physmap[pos] = base;
	physmap[pos + 1] = limit;
	return (1);
}
/*
 * Feed a BIOS SMAP table into the physical map.
 *
 * 'smapbase' points at the first entry and 'smapsize' is the table size in
 * bytes.  Each entry is printed when booting verbosely; only entries of
 * type SMAP_TYPE_MEMORY are handed to add_physmap_entry().  Processing
 * stops as soon as add_physmap_entry() reports that the map is full.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *entry, *limit;

	limit = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
	entry = smapbase;
	while (entry < limit) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    entry->type, entry->base, entry->length);

		if (entry->type == SMAP_TYPE_MEMORY &&
		    !add_physmap_entry(entry->base, entry->length, physmap,
		    physmap_idx))
			break;
		entry++;
	}
}
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
int *physmap_idx)
{
struct efi_md *map, *p;
const char *type;
size_t efisz;
int ndesc, i;
static const char *types[] = {
"Reserved",
"LoaderCode",
"LoaderData",
"BootServicesCode",
"BootServicesData",
"RuntimeServicesCode",
"RuntimeServicesData",
"ConventionalMemory",
"UnusableMemory",
"ACPIReclaimMemory",
"ACPIMemoryNVS",
"MemoryMappedIO",
"MemoryMappedIOPortSpace",
"PalCode",
"PersistentMemory"
};
/*
* Memory map data provided by UEFI via the GetMemoryMap
* Boot Services API.
*/
efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
map = (struct efi_md *)((uint8_t *)efihdr + efisz);
if (efihdr->descriptor_size == 0)
return;
ndesc = efihdr->memory_size / efihdr->descriptor_size;
if (boothowto & RB_VERBOSE)
printf("%23s %12s %12s %8s %4s\n",
"Type", "Physical", "Virtual", "#Pages", "Attr");
for (i = 0, p = map; i < ndesc; i++,
p = efi_next_descriptor(p, efihdr->descriptor_size)) {
if (boothowto & RB_VERBOSE) {
if (p->md_type < nitems(types))
type = types[p->md_type];
else
type = "<INVALID>";
printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
p->md_virt, p->md_pages);
if (p->md_attr & EFI_MD_ATTR_UC)
printf("UC ");
if (p->md_attr & EFI_MD_ATTR_WC)
printf("WC ");
if (p->md_attr & EFI_MD_ATTR_WT)
printf("WT ");
if (p->md_attr & EFI_MD_ATTR_WB)
printf("WB ");
if (p->md_attr & EFI_MD_ATTR_UCE)
printf("UCE ");
if (p->md_attr & EFI_MD_ATTR_WP)
printf("WP ");
if (p->md_attr & EFI_MD_ATTR_RP)
printf("RP ");
if (p->md_attr & EFI_MD_ATTR_XP)
printf("XP ");
if (p->md_attr & EFI_MD_ATTR_NV)
printf("NV ");
if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
printf("MORE_RELIABLE ");
if (p->md_attr & EFI_MD_ATTR_RO)
printf("RO ");
if (p->md_attr & EFI_MD_ATTR_RT)
printf("RUNTIME");
printf("\n");
}
switch (p->md_type) {
case EFI_MD_TYPE_CODE:
case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_BS_CODE:
case EFI_MD_TYPE_BS_DATA:
case EFI_MD_TYPE_FREE:
/*
* We're allowed to use any entry with these types.
*/
break;
default:
continue;
}
if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
physmap, physmap_idx))
break;
}
}
/*
 * Native implementation of init_ops.parse_memmap: fill 'physmap' from the
 * memory map supplied by the loader.
 *
 * An EFI map is preferred when the loader provided one; otherwise the BIOS
 * INT 15:E820 map is used.  'bootmethod' is set to "UEFI" or "BIOS" to
 * match.  Panics when the loader supplied neither map.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct efi_map_header *efi_map;
	struct bios_smap *e820;
	u_int32_t e820_len;

	efi_map = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	e820 = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);

	if (efi_map != NULL) {
		add_efi_map_entries(efi_map, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
		return;
	}

	if (e820 == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	e820_len = *((u_int32_t *)e820 - 1);
	bios_add_smap_entries(e820, e820_len, physmap, physmap_idx);
	strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
}
/* Number of pages in one GB; paces the progress dots of the memory test. */
#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * 'kmdp' is the preload metadata pointer handed to init_ops.parse_memmap().
 * 'first' is the first physical address above the kernel and the preloaded
 * data; pages in [kernphys, first) are kept out of phys_avail.
 *
 * Besides phys_avail[] this sets basemem, Maxmem, dump_avail[], physmem and
 * msgbufp, and it calls pmap_bootstrap().
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* From here on physmap_idx is the index of the last pair in use. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail[] also covers pages kept out of phys_avail. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			/* NOTE(review): this leaves the inner (page) loop only. */
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
/*
 * Native implementation of init_ops.parse_preload_data.
 *
 * Locates the loader metadata at 'modulep' + KERNBASE, relocates it, and
 * looks up the kernel's own metadata record.  From that record it sets
 * boothowto, installs the static kernel environment, hands the symbol
 * table bounds to DDB when DDB is configured, and stores the firmware
 * handle in efi_systbl_phys.
 *
 * Returns the kernel's metadata pointer.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kernel_md;
	char *static_env;

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);

	/* The kernel record is registered under one of two type names. */
	kernel_md = preload_search_by_type("elf kernel");
	if (kernel_md == NULL)
		kernel_md = preload_search_by_type("elf64 kernel");

	boothowto = MD_FETCH(kernel_md, MODINFOMD_HOWTO, int);

	/* The environment pointer, if any, is stored without KERNBASE. */
	static_env = MD_FETCH(kernel_md, MODINFOMD_ENVP, char *);
	if (static_env != NULL)
		static_env += KERNBASE;
	init_static_kenv(static_env, 0);

#ifdef DDB
	{
		vm_offset_t symtab_begin, symtab_end;

		symtab_begin = MD_FETCH(kernel_md, MODINFOMD_SSYM, uintptr_t);
		symtab_end = MD_FETCH(kernel_md, MODINFOMD_ESYM, uintptr_t);
		db_fetch_ksymtab(symtab_begin, symtab_end, 0);
	}
#endif

	efi_systbl_phys = MD_FETCH(kernel_md, MODINFOMD_FW_HANDLE, vm_paddr_t);
	return (kernel_md);
}
+static void
+native_clock_source_init(void)
+{
+ i8254_init();
+ tsc_init();
+}
+
/*
* Initialize the kernel debugger framework.  In kernels built with KDB, the
* debugger is entered right away when the RB_KDB boot flag is set.
*/
static void
amd64_kdb_init(void)
{
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
uint64_t msr;
msr = rdmsr(MSR_EFER) | EFER_SCE;
wrmsr(MSR_EFER, msr);
wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
(u_int64_t)IDTVEC(fast_syscall));
wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
wrmsr(MSR_STAR, msr);
wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
/*
* First stage of per-CPU data setup for the boot processor.
*
* Stores 'pc' as the per-CPU private space pointer, makes thread0 the current
* thread, and records pointers to the common TSS and to the GDT slots used
* for the TSS, the user LDT and the 32-bit %fs/%gs descriptors.
*/
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
struct user_segment_descriptor *gdt;
PCPU_SET(prvspace, pc);
/* The GDT is reached through the per-CPU area itself. */
gdt = *PCPU_PTR(gdt);
PCPU_SET(curthread, &thread0);
PCPU_SET(tssp, PCPU_PTR(common_tss));
PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
PCPU_SET(smp_tlb_gen, 1);
}
/*
* Second stage of per-CPU data setup for the boot processor.
*
* Records 'rsp0', sets pti_rsp0 to the end of the per-CPU PTI stack rounded
* down to a 16-byte boundary, and makes thread0's PCB the current PCB.
*/
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{
PCPU_SET(rsp0, rsp0);
PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
PCPU_SET(curpcb, thread0.td_pcb);
}
void
amd64_bsp_ist_init(struct pcpu *pc)
{
struct nmi_pcpu *np;
struct amd64tss *tssp;
tssp = &pc->pc_common_tss;
/* doublefault stack space, runs on ist1 */
np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist1 = (long)np;
/*
* NMI stack, runs on ist2. The pcpu pointer is stored just
* above the start of the ist2 stack.
*/
np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist2 = (long)np;
/*
* MC# stack, runs on ist3. The pcpu pointer is stored just
* above the start of the ist3 stack.
*/
np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist3 = (long)np;
/*
* DB# stack, runs on ist4.
*/
np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist4 = (long)np;
}
/*
 * Early machine-dependent initialization of the boot processor.
 *
 * Derives kernphys from the loader's page tables, identifies the CPU,
 * carves the thread0 kernel stack and the DPCPU area out of the free
 * memory after the kernel, installs the GDT, IDT and TSS, fetches the
 * boot-time tunables, sizes memory, brings up the console and debugger
 * and prepares thread0's PCB.
 *
 * modulep:  passed to init_ops.parse_preload_data() to locate the
 *           preloaded kernel metadata.
 * physfree: start of the usable memory block following the kernel,
 *           modules and metadata; consumed here for early allocations.
 *
 * Returns thread0.td_md.md_stack_base, the kernel stack location used
 * by locore.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t cr3, rsp0;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	/*
	 * Calculate kernphys by inspecting page table created by loader.
	 * The assumptions:
	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
	 *   aligned at 2M, below 4G (the latter is important for AP startup)
	 * - there is a 2M hole at KERNBASE
	 * - kernel is mapped with 2M superpages
	 * - all participating memory, i.e. kernel, modules, metadata,
	 *   page table is accessible by pre-created 1:1 mapping
	 *   (right now loader creates 1:1 mapping for lower 4G, and all
	 *   memory is from there)
	 * - there is a usable memory block right after the end of the
	 *   mapped kernel and all modules/metadata, pointed to by
	 *   physfree, for early allocations
	 */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
	    (vm_offset_t)hammer_time);
	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
	    (vm_offset_t)hammer_time);
	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
	    (vm_offset_t)hammer_time);
	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);

	/* Fix-up for 2M hole */
	physfree += kernphys;
	kernphys += NBPDR;

	kmdp = init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve the thread0 kernel stack out of the free memory block. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	/*
	 * The variable spells this knob "rngds", but the tunable has
	 * historically been fetched under the transposed name "rndgs".
	 * Keep honoring the old spelling for existing loader.conf files
	 * and also accept the correct one, which wins when both are set.
	 */
	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);
	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
/*
 * Machine-dependent part of pcpu initialization.
 *
 * Stores 0xffffffff in pc_acpi_id (presumably an "id not assigned yet"
 * sentinel -- confirm against the ACPI CPU code).  The cpuid and size
 * arguments are not used here.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
/*
 * Sysctl handler that copies out the BIOS system memory map (SMAP)
 * preserved by the loader, one struct bios_smap_xattr per entry.
 *
 * The map is located in the kernel's preload metadata.  When the
 * optional extended-attribute array is absent, xattr is reported as 0.
 *
 * Returns 0 when no SMAP metadata exists or every entry was copied
 * out, otherwise the first error reported by SYSCTL_OUT().
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The 32-bit word just before the data holds its size in bytes. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
		/*
		 * Stop at the first failure so that a later successful
		 * copy cannot hide an earlier error from the caller.
		 */
		if (error != 0)
			break;
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
smap_sysctl_handler, "S,bios_smap_xattr",
"Raw BIOS SMAP data");
/*
 * Sysctl handler that copies out the raw EFI memory map preserved by
 * the loader in the kernel's preload metadata.
 *
 * Returns 0 when no EFI map metadata exists, otherwise the result of
 * SYSCTL_OUT() for the whole map.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *map;
	caddr_t kern;
	uint32_t maplen;

	kern = preload_search_by_type("elf kernel");
	if (kern == NULL)
		kern = preload_search_by_type("elf64 kernel");

	map = (struct efi_map_header *)preload_search_info(kern,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (map != NULL) {
		/* The 32-bit word just before the data holds its size. */
		maplen = ((uint32_t *)map)[-1];
		return (SYSCTL_OUT(req, map, maplen));
	}
	return (0);
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
efi_map_sysctl_handler, "S,efi_map_header",
"Raw EFI Memory Map");
/*
 * Enter a spinlock section for the current thread.
 *
 * On the outermost entry, interrupts are disabled, the previous
 * interrupt state is saved in the thread and a critical section is
 * entered.  Nested entries only bump the per-thread nesting count.
 */
void
spinlock_enter(void)
{
	struct thread *self;
	register_t saved;

	self = curthread;
	if (self->td_md.md_spinlock_count != 0) {
		/* Already inside a spinlock section: just nest. */
		self->td_md.md_spinlock_count++;
		return;
	}

	saved = intr_disable();
	self->td_md.md_spinlock_count = 1;
	self->td_md.md_saved_flags = saved;
	critical_enter();
}
/*
 * Leave a spinlock section for the current thread.
 *
 * Drops one nesting level; when the outermost level is left, the
 * critical section is exited and the interrupt state saved by
 * spinlock_enter() is restored.
 */
void
spinlock_exit(void)
{
	struct thread *self;
	register_t saved;

	self = curthread;
	/* Fetch the saved state before the count is touched. */
	saved = self->td_md.md_saved_flags;
	self->td_md.md_spinlock_count -= 1;
	if (self->td_md.md_spinlock_count != 0)
		return;

	critical_exit();
	intr_restore(saved);
}
/*
 * Build a PCB from a trapframe.  kdb_trap() uses this so that a
 * backtrace can begin at the function that entered the debugger: the
 * context is available in the trapframe, while the trace is driven by
 * the PCB.  The PCB only needs to be good enough for a backtrace, so
 * just the registers below are copied.
 *
 * tf:  trapframe to read from.
 * pcb: PCB to fill in.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
	/* Instruction, stack and frame pointers. */
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
	pcb->pcb_rbp = tf->tf_rbp;

	/* Remaining registers copied into the PCB. */
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
}
/*
 * The pcb_flags is only modified by current thread, or by other threads
 * when current thread is stopped. However, current thread may change it
 * from the interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, compiler may generate
 * code that is not atomic regarding the interrupt handler. If a trap or
 * interrupt happens and any flag is modified from the handler, it can be
 * clobbered with the cached value later. Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 *
 * set_pcb_flags_raw() ORs the bits in flags into pcb->pcb_flags.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{
/* A single "orl" performs the whole read-modify-write on pcb_flags. */
__asm __volatile("orl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
: "cc", "memory");
}
/*
 * With RDFSBASE, WRFSBASE and the matching %gs instructions available,
 * user space may change the segment bases directly, so the kernel has
 * to capture MSR_FSBASE and MSR_{K,}GSBASE into the pcb.  That must
 * happen on context switch or when the return to user mode goes
 * through doreti.
 *
 * Both events are tracked by the PCB_FULL_IRET flag; as a consequence
 * the base MSRs must be saved every time PCB_FULL_IRET becomes set.
 * Interrupts are disabled around the save to synchronize with context
 * switches.
 *
 * pcb:   PCB whose flags are updated.
 * flags: bits to set in pcb->pcb_flags.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t saved_intr;

	/*
	 * Nothing to capture unless this is the current PCB and
	 * PCB_FULL_IRET is about to go from clear to set.
	 */
	if (curpcb != pcb || (flags & PCB_FULL_IRET) == 0 ||
	    (pcb->pcb_flags & PCB_FULL_IRET) != 0) {
		set_pcb_flags_raw(pcb, flags);
		return;
	}

	saved_intr = intr_disable();
	/* Re-check now that interrupts are off. */
	if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		if (rfs() == _ufssel)
			pcb->pcb_fsbase = rdfsbase();
		if (rgs() == _ugssel)
			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
	}
	set_pcb_flags_raw(pcb, flags);
	intr_restore(saved_intr);
}
/*
 * Resolver for set_pcb_flags(): picks the variant that also saves the
 * segment bases when the CPU reports FSGSBASE support, and the plain
 * single-instruction variant otherwise.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{
	if ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) == 0)
		return (set_pcb_flags_raw);
	return (set_pcb_flags_fsgsbase);
}
/*
 * Clear the bits given in flags from pcb->pcb_flags.
 *
 * The complement of flags is ANDed into the field by a single "andl"
 * instruction, so the update cannot be split by a trap or interrupt
 * that also modifies pcb_flags.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{
__asm __volatile("andl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
: "cc", "memory");
}
#ifdef KDB
/*
 * Out-of-line versions of inb() and outb().  The originals are
 * normally available only as inline functions, which the debugger
 * cannot call.
 */

/* Prototypes, present to silence compiler warnings. */
u_char	inb_(u_short);
void	outb_(u_short, u_char);

/* Read one byte from the given I/O port. */
u_char
inb_(u_short port)
{
	u_char value;

	value = inb(port);
	return (value);
}

/* Write one byte to the given I/O port. */
void
outb_(u_short port, u_char data)
{
	outb(port, data);
}
#endif /* KDB */
#undef memset
#undef memmove
#undef memcpy
void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
size_t len);
#ifdef KCSAN
/*
 * With KCSAN these cannot be built as ifuncs, so plain wrappers around
 * the standard implementations are provided instead.
 */
void *
memset(void *buf, int c, size_t len)
{
	void *ret;

	ret = memset_std(buf, c, len);
	return (ret);
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{
	void *ret;

	ret = memmove_std(dst, src, len);
	return (ret);
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{
	void *ret;

	ret = memcpy_std(dst, src, len);
	return (ret);
}
#else
/*
 * Without KCSAN each routine is an ifunc: the ERMS variant is chosen
 * when the CPU reports CPUID_STDEXT_ERMS, the standard one otherwise.
 */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{
	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
		return (memset_erms);
	return (memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{
	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
		return (memmove_erms);
	return (memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{
	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
		return (memcpy_erms);
	return (memcpy_std);
}
#endif
void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);

/*
 * Resolver for pagezero(): the ERMS variant is chosen when the CPU
 * reports CPUID_STDEXT_ERMS, the standard one otherwise.
 */
DEFINE_IFUNC(, void , pagezero, (void *))
{
	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0)
		return (pagezero_erms);
	return (pagezero_std);
}
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index ee6752861c9e..6913c0691fd4 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -1,1868 +1,1877 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2018 The FreeBSD Foundation
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_apic.h"
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <isa/rtc.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/sysarch.h>
#include <machine/trap.h>
#include <x86/ucode.h>
#include <machine/vm86.h>
#include <x86/init.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
#ifdef DEV_APIC
#include <x86/apicvar.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
register_t init386(int first);
void dblfault_handler(void);
void identify_cpu(void);
static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
int _udatasel, _ucodesel;
u_int basemem;
static int above4g_allow = 1;
static int above24g_allow = 0;
int cold = 1;
long Maxmem = 0;
long realmem = 0;
#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
+static void i386_clock_source_init(void);
+
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
extern char start_exceptions[], end_exceptions[];
extern struct sysentvec elf32_freebsd_sysvec;
/* Default init_ops implementation. */
struct init_ops init_ops = {
- .early_clock_source_init = i8254_init,
+ .early_clock_source_init = i386_clock_source_init,
.early_delay = i8254_delay,
};
+static void
+i386_clock_source_init(void)
+{
+ i8254_init();
+ tsc_init();
+}
+
/*
 * CPU startup hook, registered via SYSINIT at SI_SUB_CPU /
 * SI_ORDER_FIRST.
 *
 * Applies the MacBook legacy-USB SMI workaround, starts the RTC clock,
 * prints CPU and memory information, initializes the kernel submaps
 * and the buffer cache, and finally programs the CPU control
 * registers.
 *
 * dummy: SYSINIT argument, not used.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* The value is shifted left by 10 to obtain bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/* Fall back to Maxmem when SMBIOS reports less than what is free. */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();
	cpu_setregs();
}
/*
 * Program CR0 with the bits the kernel relies on and load %gs with
 * the user data selector.
 */
void
cpu_setregs(void)
{
	unsigned int cr0_bits;

	/*
	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
	 *
	 * All ESC (i.e., NPX) instructions and all WAIT instructions
	 * are to be trapped.  CR0_MP has to be set and CR0_TS used to
	 * control the trap, because setting CR0_EM does not make WAIT
	 * instructions trap.  Trapping WAIT matters: otherwise the
	 * "wait" variants of no-wait control instructions would
	 * degenerate to the "no-wait" variants after FP context
	 * switches but work correctly otherwise.  It matters most
	 * when there is no NPX, since the "wait" variants would then
	 * always degenerate.
	 *
	 * CR0_NE is set to get correct error reporting on 486DX's;
	 * setting it should fail or do nothing on lesser processors.
	 *
	 * CR0_WP and CR0_AM are turned on here as well.
	 */
	cr0_bits = rcr0();
	cr0_bits |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0_bits);

	load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
struct mtx dt_lock; /* lock for GDT and LDT */
union descriptor gdt0[NGDT]; /* initial global descriptor table */
union descriptor *gdt = gdt0; /* global descriptor table */
union descriptor *ldt; /* local descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static struct i386tss *dblfault_tss;
static char *dblfault_stack;
static struct i386tss common_tss0;
vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = 0,
.ssd_limit = sizeof(union descriptor) * NLDT - 1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = 0,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = 0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
size_t setidt_disp;
/*
 * Install an IDT entry for a handler given by function pointer.
 *
 * The handler address is displaced by setidt_disp before being handed
 * to setidt_nodisp(); a NULL handler is installed as offset 0 without
 * displacement.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
{
	uintptr_t entry;

	entry = 0;
	if (func != NULL)
		entry = (uintptr_t)func + setidt_disp;
	setidt_nodisp(idx, entry, typ, dpl, selec);
}
/*
 * Fill in IDT entry idx with the given handler offset, gate type,
 * privilege level and code selector, and mark it present.  The offset
 * is used as is, with no displacement applied.
 */
void
setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec)
{
	struct gate_descriptor *gate;

	gate = &idt[idx];

	/* The offset is split: gd_looffset here, gd_hioffset at the end. */
	gate->gd_looffset = off;
	gate->gd_selector = selec;
	gate->gd_stkcpy = 0;
	gate->gd_xx = 0;
	gate->gd_type = typ;
	gate->gd_dpl = dpl;
	gate->gd_p = 1;
	gate->gd_hioffset = ((u_int)off) >> 16 ;
}
/*
 * Interrupt/exception entry points that this file installs into the IDT
 * (see i386_setidt1()).  The dtrace and Xen entries are only declared
 * when the corresponding kernel options are compiled in.
 */
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
IDTVEC(xen_intr_upcall),
#endif
IDTVEC(int0x80_syscall);
#ifdef DDB
/*
 * DDB "show idt": list every IDT vector whose handler is not the default
 * 'rsvd' entry point.  Task gates are printed as <TASK>.  Handlers that
 * live at or above PMAP_TRM_MIN_ADDRESS are translated back by
 * setidt_disp for symbol lookup and the raw trampoline address is shown
 * as well.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *gate;
	uintptr_t handler, tramp_addr;
	bool in_tramp;
	int vec;

	for (vec = 0, gate = idt; vec < NIDT && !db_pager_quit;
	    vec++, gate++) {
		if (gate->gd_type == SDT_SYSTASKGT) {
			db_printf("%3d\t<TASK>\n", vec);
			continue;
		}

		handler = (gate->gd_hioffset << 16 | gate->gd_looffset);
		tramp_addr = 0;
		in_tramp = handler >= PMAP_TRM_MIN_ADDRESS;
		if (in_tramp) {
			/* Undo the relocation so the symbol can be found. */
			tramp_addr = handler;
			handler -= setidt_disp;
		}

		/* Vectors still pointing at the default handler are skipped. */
		if (handler == (uintptr_t)&IDTVEC(rsvd))
			continue;

		db_printf("%3d\t", vec);
		db_printsym(handler, DB_STGY_PROC);
		if (in_tramp)
			db_printf(" (trampoline %#x)", tramp_addr);
		db_printf("\n");
	}
}
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
uint64_t idtr, gdtr;
idtr = ridt();
db_printf("idtr\t0x%08x/%04x\n",
(u_int)(idtr >> 16), (u_int)idtr & 0xffff);
gdtr = rgdt();
db_printf("gdtr\t0x%08x/%04x\n",
(u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
db_printf("ldtr\t0x%04x\n", rldt());
db_printf("tr\t0x%04x\n", rtr());
db_printf("cr0\t0x%08x\n", rcr0());
db_printf("cr2\t0x%08x\n", rcr2());
db_printf("cr3\t0x%08x\n", rcr3());
db_printf("cr4\t0x%08x\n", rcr4());
if (rcr4() & CR4_XSAVE)
db_printf("xcr0\t0x%016llx\n", rxcr(0));
if (amd_feature & (AMDID_NX | AMDID_LM))
db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
db_printf("FEATURES_CTL\t0x%016llx\n",
rdmsr(MSR_IA32_FEATURE_CONTROL));
if (((cpu_vendor_id == CPU_VENDOR_INTEL ||
cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) ||
cpu_vendor_id == CPU_VENDOR_HYGON)
db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
if (cpu_feature & CPUID_PAT)
db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
}
/*
 * DDB "show dbregs": print the current values of debug registers
 * dr0-dr3, dr6 and dr7, one per line.
 */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{
db_printf("dr0\t0x%08x\n", rdr0());
db_printf("dr1\t0x%08x\n", rdr1());
db_printf("dr2\t0x%08x\n", rdr2());
db_printf("dr3\t0x%08x\n", rdr3());
db_printf("dr6\t0x%08x\n", rdr6());
db_printf("dr7\t0x%08x\n", rdr7());
}
/*
 * DDB "show frame [addr]": dump a trapframe.  When an address is given it
 * is interpreted as a struct trapframe pointer; otherwise the current
 * thread's td_frame is used.
 *
 * Output goes through db_printf(), like the other DDB commands in this
 * file, rather than through the regular kernel printf().
 */
DB_SHOW_COMMAND(frame, db_show_frame)
{
	struct trapframe *frame;

	frame = have_addr ? (struct trapframe *)addr : curthread->td_frame;
	db_printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n",
	    frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs,
	    frame->tf_eip);
	db_printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno);
	db_printf("ds %#x es %#x fs %#x\n",
	    frame->tf_ds, frame->tf_es, frame->tf_fs);
	db_printf("eax %#x ecx %#x edx %#x ebx %#x\n",
	    frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx);
	db_printf("ebp %#x esi %#x edi %#x\n",
	    frame->tf_ebp, frame->tf_esi, frame->tf_edi);
}
#endif
/*
 * Convert a hardware segment descriptor into its software form.
 *
 * sd:  hardware descriptor to read.
 * ssd: software descriptor to fill in.  Base and limit are reassembled
 *      from their split hardware fields; type, dpl, present, def32 and
 *      granularity are copied as-is.  The ssd_xx / ssd_xx1 fields are
 *      not written.
 */
void
sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}
/*
 * Insert the region [base, base + length) into the sorted physmap array
 * of base/bound pairs.
 *
 * base, length:  region to add, in bytes.
 * physmap:       array of PHYS_AVAIL_ENTRIES elements holding pairs.
 * physmap_idxp:  in/out index of the base element of the last pair in use.
 *
 * Memory at or above the address limit (4G; 24G or unlimited depending on
 * pae_mode, above4g_allow and above24g_allow) is dropped with a message.
 * Regions that overlap an existing entry are ignored.  A region adjacent
 * to an existing entry is merged into it.
 *
 * Returns 1 when the caller may continue adding entries (including the
 * cases where the region was ignored or merged), and 0 when the array is
 * full.  On a 0 return *physmap_idxp is left unchanged, so it still
 * refers to the last valid pair.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	uint64_t lim, ign;
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	lim = 0x100000000;					/*  4G */
	if (pae_mode && above4g_allow)
		lim = above24g_allow ? -1ULL : 0x600000000;	/* 24G */
	if (base >= lim) {
		printf("%uK of memory above %uGB ignored, pae %d "
		    "above4g_allow %d above24g_allow %d\n",
		    (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode,
		    above4g_allow, above24g_allow);
		return (1);
	}
	if (base + length >= lim) {
		/* Clip the part of the region that extends past the limit. */
		ign = base + length - lim;
		length -= ign;
		printf("%uK of memory above %uGB ignored, pae %d "
		    "above4g_allow %d above24g_allow %d\n",
		    (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode,
		    above4g_allow, above24g_allow);
	}

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/*
	 * A new pair is needed.  Check for room before publishing the new
	 * index: the caller indexes physmap[] with *physmap_idxp after a
	 * failure, so it must never be left pointing past the array.
	 */
	if (physmap_idx + 2 == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
/*
 * Feed one BIOS SMAP record into the physmap.  The record is logged when
 * booting verbosely.  Records whose type is not SMAP_TYPE_MEMORY are
 * skipped and reported as success (1); otherwise the result of
 * add_physmap_entry() is returned.
 */
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{

	if ((boothowto & RB_VERBOSE) != 0) {
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);
	}

	if (smap->type == SMAP_TYPE_MEMORY) {
		return (add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idxp));
	}
	return (1);
}
/*
 * Walk a loader-provided SMAP table (memory map from INT 15:E820) and add
 * every record to the physmap, stopping early if add_smap_entry() reports
 * failure.
 *
 * subr_module.c says:
 * "Consumer may safely assume that size value precedes data."
 * ie: an int32_t holding the table size in bytes immediately precedes
 * the first record.
 */
static void
add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	struct bios_smap *cur, *limit;
	u_int32_t nbytes;

	nbytes = ((u_int32_t *)smapbase)[-1];
	limit = (struct bios_smap *)((uintptr_t)smapbase + nbytes);

	cur = smapbase;
	while (cur < limit) {
		if (add_smap_entry(cur, physmap, physmap_idxp) == 0)
			break;
		cur++;
	}
}
/*
 * Sanity-check the global basemem value (in KB), clamping anything above
 * 640 to 640 with a warning, then hand it to pmap_basemem_setup().
 */
static void
basemem_setup(void)
{

	if (basemem <= 640) {
		pmap_basemem_setup(basemem);
		return;
	}

	printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
	    basemem);
	basemem = 640;
	pmap_basemem_setup(basemem);
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * 'first' is the end of the region, starting at KERNLOAD, that holds the
 * kernel and preloaded data; pages in [KERNLOAD, first) are kept out of
 * phys_avail.  It is also passed on to pmap_bootstrap().
 *
 * Besides phys_avail this routine fills in dump_avail, sets Maxmem,
 * basemem and physmem, and maps the message buffer at the top of the
 * last phys_avail chunk.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(int first)
{
int has_smap, off, physmap_idx, pa_indx, da_indx;
u_long memtest;
vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
quad_t dcons_addr, dcons_size, physmem_tunable;
int hasbrokenint12, i, res;
u_int extmem;
struct vm86frame vmf;
struct vm86context vmc;
vm_paddr_t pa;
struct bios_smap *smap, *smapbase;
caddr_t kmdp;
has_smap = 0;
bzero(&vmf, sizeof(vmf));
bzero(physmap, sizeof(physmap));
basemem = 0;
/*
 * Tell the physical memory allocator about pages used to store
 * the kernel and preloaded data. See kmem_bootstrap_free().
 */
vm_phys_early_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first));
TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow);
TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow);
/*
 * Check if the loader supplied an SMAP memory map. If so,
 * use that and do not make any VM86 calls.
 */
physmap_idx = 0;
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf32 kernel");
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase != NULL) {
add_smap_entries(smapbase, physmap, &physmap_idx);
has_smap = 1;
goto have_smap;
}
/*
 * Some newer BIOSes have a broken INT 12H implementation
 * which causes a kernel panic immediately. In this case, we
 * need to use the SMAP to determine the base memory size.
 */
hasbrokenint12 = 0;
TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
if (hasbrokenint12 == 0) {
/* Use INT12 to determine base memory size. */
vm86_intcall(0x12, &vmf);
basemem = vmf.vmf_ax;
basemem_setup();
}
/*
 * Fetch the memory map with INT 15:E820. Map page 1 R/W into
 * the kernel page table so we can use it as a buffer. The
 * kernel will unmap this page later.
 */
vmc.npages = 0;
smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1));
res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
/* %ebx is the E820 continuation value: 0 starts the walk and ends it. */
vmf.vmf_ebx = 0;
do {
vmf.vmf_eax = 0xE820;
vmf.vmf_edx = SMAP_SIG;
vmf.vmf_ecx = sizeof(struct bios_smap);
i = vm86_datacall(0x15, &vmf, &vmc);
if (i || vmf.vmf_eax != SMAP_SIG)
break;
has_smap = 1;
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
} while (vmf.vmf_ebx != 0);
have_smap:
/*
 * If we didn't fetch the "base memory" size from INT12,
 * figure it out from the SMAP (or just guess).
 */
if (basemem == 0) {
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] == 0x00000000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
/* XXX: If we couldn't find basemem from SMAP, just guess. */
if (basemem == 0)
basemem = 640;
basemem_setup();
}
/* A non-zero bound in the first pair means a memory map was obtained. */
if (physmap[1] != 0)
goto physmap_done;
/*
 * If we failed to find an SMAP, figure out the extended
 * memory size. We will then build a simple memory map with
 * two segments, one for "base memory" and the second for
 * "extended memory". Note that "extended memory" starts at a
 * physical address of 1MB and that both basemem and extmem
 * are in units of 1KB.
 *
 * First, try to fetch the extended memory size via INT 15:E801.
 */
vmf.vmf_ax = 0xE801;
if (vm86_intcall(0x15, &vmf) == 0) {
extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
} else {
/*
 * If INT15:E801 fails, this is our last ditch effort
 * to determine the extended memory size. Currently
 * we prefer the RTC value over INT15:88.
 */
#if 0
vmf.vmf_ah = 0x88;
vm86_intcall(0x15, &vmf);
extmem = vmf.vmf_ax;
#else
extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
}
/*
 * Special hack for chipsets that still remap the 384k hole when
 * there's 16MB of memory - this really confuses people that
 * are trying to use bus mastering ISA controllers with the
 * "16MB limit"; they only have 16MB, but the remapping puts
 * them beyond the limit.
 *
 * If extended memory is between 15-16MB (16-17MB phys address range),
 * chop it to 15MB.
 */
if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
extmem = 15 * 1024;
/* Build the two-segment fallback map: base memory, then extended memory. */
physmap[0] = 0;
physmap[1] = basemem * 1024;
physmap_idx = 2;
physmap[physmap_idx] = 0x100000;
physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
physmap_done:
/*
 * Now, physmap contains a map of physical memory.
 */
#ifdef SMP
/* make hole for AP bootstrap code */
alloc_ap_trampoline(physmap, &physmap_idx);
#endif
/*
 * Maxmem isn't the "maximum memory", it's one larger than the
 * highest page of the physical address space. It should be
 * called something like "Maxphyspage". We may adjust this
 * based on ``hw.physmem'' and the results of the memory test.
 *
 * This is especially confusing when it is much larger than the
 * memory size and is displayed as "realmem".
 */
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 * the amount of memory in the system.
 */
if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);
/*
 * The boot memory test is disabled by default, as it takes a
 * significant amount of time on large-memory systems, and is
 * unfriendly to virtual machines as it unnecessarily touches all
 * pages.
 *
 * A general name is used as the code may be extended to support
 * additional tests beyond the current "page present" test.
 */
memtest = 0;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/*
 * If Maxmem has been increased beyond what the system has detected,
 * extend the last memory segment to the new limit.
 */
if (atop(physmap[physmap_idx + 1]) < Maxmem)
physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(first);
/*
 * Size up each available chunk of physical memory.
 */
physmap[0] = PAGE_SIZE; /* mask off page 0 */
/*
 * pa_indx / da_indx index the "end" element of the chunk currently
 * being grown in phys_avail[] / dump_avail[].
 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
/*
 * Get dcons buffer address
 */
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
/*
 * physmap is in bytes, so when converting to page boundaries,
 * round up the start address and round down the end address.
 */
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr;
/* 'full' is set once phys_avail[] has no room for another chunk. */
full = FALSE;
/*
 * block out kernel memory as not available.
 */
if (pa >= KERNLOAD && pa < first)
goto do_dump_avail;
/*
 * block out dcons buffer
 */
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
 * map page into kernel: valid, read/write,non-cacheable
 */
ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N);
/* Save the first word of the page so the test is non-destructive. */
tmp = *(int *)ptr;
/*
 * Test for alternating 1's and 0's
 */
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
 * Test for alternating 0's and 1's
 */
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
 * Test for all 1's
 */
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
 * Test for all 0's
 */
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
 * Restore original value.
 */
*(int *)ptr = tmp;
skip_memtest:
/*
 * Adjust array of valid/good pages.  A bad page is added to
 * neither phys_avail nor dump_avail.
 */
if (page_bad == TRUE)
continue;
/*
 * If this good page is a continuation of the
 * previous set of good pages, then just increase
 * the end pointer. Otherwise start a new chunk.
 * Note that "end" points one higher than end,
 * making the range >= start and < end.
 * If we're also doing a speculative memory
 * test and we are at or past the end, bump up Maxmem
 * so that we keep going. The first bad page
 * will terminate the loop.
 */
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ENTRIES) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
/* Same extend-or-start-new-chunk logic, applied to dump_avail[]. */
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == PHYS_AVAIL_ENTRIES) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
/* Only the inner (per-page) loop is left when phys_avail[] is full. */
if (full)
break;
}
}
/* Tear down the temporary test mapping. */
pmap_cmap3(0, 0);
/*
 * XXX
 * The last chunk must contain at least one page plus the message
 * buffer to avoid complicating other code (message buffer address
 * calculation, etc.).
 */
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
off);
}
/*
 * Bring up the kernel debugger: hand the boot-time symbol table to DDB
 * (when compiled in), initialize KDB, and drop into the debugger right
 * away if the RB_KDB boot flag is set.
 */
static void
i386_kdb_init(void)
{

#ifdef DDB
	db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab, 0);
#endif
	kdb_init();
#ifdef KDB
	if ((boothowto & RB_KDB) != 0) {
		kdb_enter(KDB_WHY_BOOTFLAGS,
		    "Boot flags requested debugger");
	}
#endif
}
static void
fixup_idt(void)
{
struct gate_descriptor *ip;
uintptr_t off;
int x;
for (x = 0; x < NIDT; x++) {
ip = &idt[x];
if (ip->gd_type != SDT_SYS386IGT &&
ip->gd_type != SDT_SYS386TGT)
continue;
off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16);
KASSERT(off >= (uintptr_t)start_exceptions &&
off < (uintptr_t)end_exceptions,
("IDT[%d] type %d off %#x", x, ip->gd_type, off));
off += setidt_disp;
MPASS(off >= PMAP_TRM_MIN_ADDRESS &&
off < PMAP_TRM_MAX_ADDRESS);
ip->gd_looffset = off;
ip->gd_hioffset = off >> 16;
}
}
/*
 * Build the initial IDT.  Every vector is first pointed at the default
 * 'rsvd' handler; the CPU exception vectors, the int 0x80 system call
 * gate and the optional dtrace / Xen vectors are then overridden.  All
 * gates are interrupt gates in the kernel code segment except the double
 * fault vector, which is a task gate through GPANIC_SEL.  The breakpoint,
 * overflow, system call and dtrace-return gates are given SEL_UPL; the
 * rest use SEL_KPL.
 */
static void
i386_setidt1(void)
{
	const int kcode = GSEL(GCODE_SEL, SEL_KPL);
	int vec;

	/* exceptions */
	for (vec = 0; vec < NIDT; vec++)
		setidt(vec, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, kcode);

	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, kcode);
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, kcode);
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, kcode);
	/* Double fault: task gate, no handler offset. */
	setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GPANIC_SEL, SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386IGT,
	    SEL_UPL, kcode);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386IGT,
	    SEL_UPL, kcode);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT,
	    SEL_KPL, kcode);
#endif
}
/*
 * Re-install the invalid-opcode (IDT_UD) and general-protection (IDT_GP)
 * gates with their regular handlers.
 */
static void
i386_setidt2(void)
{
	const int kcode = GSEL(GCODE_SEL, SEL_KPL);

	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, kcode);
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, kcode);
}
#if defined(DEV_ISA) && !defined(DEV_ATPIC)
/*
 * Point IDT vectors IDT_IO_INTS + 7 and IDT_IO_INTS + 15 at the
 * 'spuriousint' handler.
 */
static void
i386_setidt3(void)
{
	const int kcode = GSEL(GCODE_SEL, SEL_KPL);

	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT,
	    SEL_KPL, kcode);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT,
	    SEL_KPL, kcode);
}
#endif
/*
 * Early machine-dependent initialization of the boot processor.
 *
 * 'first' is the first address past the kernel and preloaded data; it is
 * advanced past a loaded microcode update and past the dynamic per-CPU
 * area before being handed to getmemsize().
 *
 * In order, this sets up thread0/proc0, the loader metadata and static
 * environment, CPU identification, the GDT, per-CPU data, the IDT, the
 * clock, the initial TSS, the PIC, the console and debugger, the memory
 * map (getmemsize()), the FPU and finally thread0's PCB.
 *
 * Returns the address of thread0's PCB, which locore uses as the
 * location of the kernel stack.
 */
register_t
init386(int first)
{
struct region_descriptor r_gdt, r_idt; /* table descriptors */
int gsel_tss, metadata_missing, x, pa;
struct pcpu *pc;
struct xstate_hdr *xhdr;
caddr_t kmdp;
vm_offset_t addend;
size_t ucode_len;
int late_console;
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = TD0_KSTACK_PAGES;
/*
 * This may be done better later if it gets more high level
 * components in it. If so just link td->td_proc here.
 */
proc_linkup0(&proc0, &thread0);
/*
 * Loader metadata and environment pointers below KERNBASE are
 * adjusted by PMAP_MAP_LOW before they are used.
 */
if (bootinfo.bi_modulep) {
metadata_missing = 0;
addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ?
PMAP_MAP_LOW : 0;
preload_metadata = (caddr_t)bootinfo.bi_modulep + addend;
preload_bootstrap_relocate(addend);
} else {
metadata_missing = 1;
}
if (bootinfo.bi_envp != 0) {
addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ?
PMAP_MAP_LOW : 0;
init_static_kenv((char *)bootinfo.bi_envp + addend, 0);
} else {
init_static_kenv(NULL, 0);
}
/*
 * Re-evaluate CPU features if we loaded a microcode update.
 */
ucode_len = ucode_load_bsp(first);
if (ucode_len != 0) {
identify_cpu();
first = roundup2(first + ucode_len, PAGE_SIZE);
}
identify_hypervisor();
/* Init basic tunables, hz etc */
init_param1();
/* Set bootmethod to BIOS: it's the only supported on i386. */
strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
/*
 * Make gdt memory segments. All segments cover the full 4GB
 * of address space and permissions are enforced at page level.
 */
gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0;
/* Convert the software templates into the boot GDT and load it. */
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt0[x].sd);
r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1;
r_gdt.rd_base = (int)gdt0;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
/* Map the dynamic per-CPU area 1:1 at 'first' and claim it. */
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa, pa);
dpcpu_init((void *)first, 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
/* Non-late cninit() and printf() can be moved up to here. */
/*
 * Initialize mutexes.
 *
 * icu_lock: in order to allow an interrupt to occur in a critical
 * section, to set pcpu->ipending (etc...) properly, we
 * must be able to get the icu lock, so it can't be
 * under witness.
 */
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
i386_setidt1();
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
lidt(&r_idt);
finishidentcpu(); /* Final stage of CPU initialization */
/*
 * Initialize the clock before the console so that console
 * initialization can use DELAY().
 */
clock_init();
i386_setidt2();
pmap_set_nx();
initializecpu(); /* Initialize CPU registers */
initializecpucache();
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
/* Initialize the tss (except for the final esp0) early for vm86. */
common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages *
PAGE_SIZE - VM86_STACK_SPACE;
common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
common_tss0.tss_ioopt = sizeof(struct i386tss) << 16;
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
ltr(gsel_tss);
/* Initialize the PIC early for vm86 calls. */
#ifdef DEV_ISA
#ifdef DEV_ATPIC
elcr_probe();
atpic_startup();
#else
/* Reset and mask the atpics and leave them shut down. */
atpic_reset();
/*
 * Point the ICU spurious interrupt vectors at the APIC spurious
 * interrupt handler.
 */
i386_setidt3();
#endif
#endif
/*
 * The console and kdb should be initialized even earlier than here,
 * but some console drivers don't work until after getmemsize().
 * Default to late console initialization to support these drivers.
 * This loses mainly printf()s in getmemsize() and early debugging.
 */
late_console = 1;
TUNABLE_INT_FETCH("debug.late_console", &late_console);
if (!late_console) {
cninit();
i386_kdb_init();
}
/*
 * NOTE(review): unlike getmemsize() and smap_sysctl_handler(), there
 * is no fallback to the "elf32 kernel" type here when the lookup
 * returns NULL - confirm that link_elf_ireloc() copes with that.
 */
kmdp = preload_search_by_type("elf kernel");
link_elf_ireloc(kmdp);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured,and u/iom is accessible */
if (late_console)
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
if (late_console)
i386_kdb_init();
msgbufinit(msgbufp, msgbufsize);
npxinit(true);
/*
 * Set up thread0 pcb after npxinit calculated pcb + fpu save
 * area size. Zero out the extended state header in fpu save
 * area.
 */
thread0.td_pcb = get_pcb_td(&thread0);
thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
if (use_xsave) {
xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1);
xhdr->xstate_bv = xsave_mask;
}
PCPU_SET(curpcb, thread0.td_pcb);
/* Move esp0 in the tss to its final place. */
/*
 * Note: VM86_STACK_SPACE is reserved below the pcb so we can grow the
 * trapframe if we came from vm86 (this note used to say "-16").
 */
common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE;
PCPU_SET(kesp0, common_tss0.tss_esp0);
gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */
ltr(gsel_tss);
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
thread0.td_pcb->pcb_cr3 = pmap_get_kcr3();
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
#ifdef FDT
x86_init_fdt();
#endif
/* Location of kernel stack for locore */
return ((register_t)thread0.td_pcb);
}
/*
 * Move the descriptor tables and the exception entry code into memory
 * obtained from pmap_trm_alloc().  Registered below as a SYSINIT at
 * SI_SUB_VM / SI_ORDER_SECOND.
 *
 * In order: a new GDT sized for mp_ncpus CPUs is seeded from gdt0 and
 * loaded; the TSS array is allocated and common_tss0 copied into its
 * first slot; the code between start_exceptions and end_exceptions is
 * copied and a trampoline stack allocated; the IDT is copied and its
 * handlers relocated by fixup_idt(); the double-fault TSS and stack are
 * built; the LDT is created from ldt_segs; and the copyout buffer is
 * allocated.
 *
 * NOTE(review): none of the M_NOWAIT pmap_trm_alloc() results are checked
 * here, whereas f00f_hack() panics when the allocation returns 0.
 */
static void
machdep_init_trampoline(void)
{
struct region_descriptor r_gdt, r_idt;
struct i386tss *tss;
char *copyout_buf, *trampoline, *tramp_stack_base;
int x;
/* GDT: room for NGDT descriptors per CPU; only CPU 0's are copied here. */
gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus,
M_NOWAIT | M_ZERO);
bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (int)gdt;
lgdt(&r_gdt);
/* TSS: one per CPU, slot 0 seeded from the boot-time TSS. */
tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus,
M_NOWAIT | M_ZERO);
bcopy(&common_tss0, tss, sizeof(struct i386tss));
gdt[GPROC0_SEL].sd.sd_lobase = (int)tss;
gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24;
gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
/* Re-point the per-CPU descriptor pointers at the new GDT. */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
PCPU_SET(common_tssp, tss);
ltr(GSEL(GPROC0_SEL, SEL_KPL));
/* Copy the exception entry code and set up the trampoline stack. */
trampoline = pmap_trm_alloc(end_exceptions - start_exceptions,
M_NOWAIT);
bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions);
tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ -
VM86_STACK_SPACE);
tss[0].tss_esp0 = PCPU_GET(trampstk);
idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO);
bcopy(idt0, idt, sizeof(idt0));
/* Re-initialize new IDT since the handlers were relocated */
setidt_disp = trampoline - start_exceptions;
fixup_idt();
r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
r_idt.rd_base = (int)idt;
lidt(&r_idt);
/* dblfault TSS */
dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO);
dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT);
/* Every stack pointer in the double-fault TSS starts at the stack top. */
dblfault_tss->tss_esp = dblfault_tss->tss_esp0 =
dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 =
(int)dblfault_stack + PAGE_SIZE;
dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 =
dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss->tss_cr3 = pmap_get_kcr3();
dblfault_tss->tss_eip = (int)dblfault_handler;
dblfault_tss->tss_eflags = PSL_KERNEL;
dblfault_tss->tss_ds = dblfault_tss->tss_es =
dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss;
gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24;
/* make ldt memory segments */
ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT,
M_NOWAIT | M_ZERO);
gdt[GLDT_SEL].sd.sd_lobase = (int)ldt;
gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24;
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < nitems(ldt_segs); x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
/* Per-CPU buffer handed to the copyout trampoline code. */
copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
PCPU_SET(copyout_buf, copyout_buf);
copyout_init_tramp();
}
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL);
#ifdef COMPAT_43
static void
i386_setup_lcall_gate(void)
{
struct sysentvec *sv;
struct user_segment_descriptor desc;
u_int lcall_addr;
sv = &elf32_freebsd_sysvec;
lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp;
bzero(&desc, sizeof(desc));
desc.sd_type = SDT_MEMERA;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_def32 = 1;
desc.sd_gran = 1;
desc.sd_lolimit = 0xffff;
desc.sd_hilimit = 0xf;
desc.sd_lobase = lcall_addr;
desc.sd_hibase = lcall_addr >> 24;
bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc));
}
SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL);
#endif
/*
 * Machine-dependent part of per-CPU data initialization: mark the ACPI
 * id of this CPU as not yet known (all ones).  'cpuid' and 'size' are
 * not used here.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
/*
 * Handler for the machdep.smap sysctl: export the loader-supplied BIOS
 * SMAP table as an array of struct bios_smap_xattr.
 *
 * Each record carries the base, length and type of the corresponding SMAP
 * entry plus its extended attribute word, or 0 when the loader supplied
 * no MODINFOMD_SMAP_XATTR metadata.
 *
 * Returns 0 when there is no SMAP metadata or all records were copied
 * out, otherwise the first error reported by SYSCTL_OUT().  Copying stops
 * at the first failure so that a later call cannot hide it.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);

	/* The 32-bit word just before the table holds its size in bytes. */
	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
		if (error != 0)
			break;
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");
/*
 * Enter a spinlock section for the current thread.  The outermost entry
 * disables interrupts, remembers the previous interrupt state in the
 * thread and enters a critical section; nested entries only bump the
 * per-thread nesting count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count != 0) {
		/* Nested entry: nothing but the count changes. */
		td->td_md.md_spinlock_count++;
		return;
	}

	flags = intr_disable();
	td->td_md.md_spinlock_count = 1;
	td->td_md.md_saved_flags = flags;
	critical_enter();
}
/*
 * Leave a spinlock section for the current thread.  When the outermost
 * level is left, the critical section is exited and the interrupt state
 * saved by spinlock_enter() is restored.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	/* Fetch the saved state before the count is dropped. */
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count != 0)
		return;

	critical_exit();
	intr_restore(flags);
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Work around the Pentium F00F bug.  Does nothing unless has_f00f_bug is
 * set.
 *
 * The IDT is copied to a fresh allocation laid out so that its first
 * seven entries occupy the end of one page and the remaining entries
 * start on the next page.  The new table is loaded and becomes 'idt',
 * and the page holding the first seven entries is then made read-only.
 *
 * Panics if the trampoline allocation fails.
 */
static void
f00f_hack(void *unused)
{
	struct region_descriptor r_idt;
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	/* Three pages leave room for rounding up to a page boundary. */
	tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO);
	if (tmp == 0)
		panic("pmap_trm_alloc returned 0");
	tmp = round_page(tmp);

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor *)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	r_idt.rd_limit = sizeof(idt0) - 1;
	lidt(&r_idt);
	/* SMP machines do not need the F00F hack. */
	idt = new_idt;

	/* Write-protect the lower page, which holds entries 0 through 6. */
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
 * Build a PCB from a trapframe.  kdb_trap() calls this so that a
 * backtrace can start at the function that caused entry into the
 * debugger: the context is in the trapframe, but the trace is based on
 * the PCB.  The PCB only needs to be good enough for a backtrace.
 *
 * When ISPL(tf->tf_cs) is non-zero the stack pointer saved in the frame
 * is used; otherwise it is computed as the address just past the
 * trapframe minus 8.  %gs is taken from the live register.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	if (ISPL(tf->tf_cs))
		pcb->pcb_esp = tf->tf_esp;
	else
		pcb->pcb_esp = (int)(tf + 1) - 8;
	pcb->pcb_gs = rgs();
}
#ifdef KDB
/*
 * Out-of-line wrappers around inb() and outb().  Those are normally only
 * available as inline functions and therefore cannot be called from the
 * debugger.
 */

/* Prototypes, to silence compiler warnings. */
u_char inb_(u_short);
void outb_(u_short, u_char);

/* Read one byte from I/O port 'port'. */
u_char
inb_(u_short port)
{

	return (inb(port));
}

/* Write the byte 'data' to I/O port 'port'. */
void
outb_(u_short port, u_char data)
{

	outb(port, data);
}
#endif /* KDB */
diff --git a/sys/x86/include/clock.h b/sys/x86/include/clock.h
index 83c8351ed31c..9aeccadf89aa 100644
--- a/sys/x86/include/clock.h
+++ b/sys/x86/include/clock.h
@@ -1,48 +1,49 @@
/*-
* Kernel interface to machine-dependent clock driver.
* Garrett Wollman, September 1994.
* This file is in the public domain.
*
* $FreeBSD$
*/
#ifndef _MACHINE_CLOCK_H_
#define _MACHINE_CLOCK_H_
#ifdef _KERNEL
/*
* i386 to clock driver interface.
* XXX large parts of the driver and its interface are misplaced.
*/
extern int clkintr_pending;
extern u_int i8254_freq;
extern int i8254_max_count;
extern uint64_t tsc_freq;
extern int tsc_is_invariant;
extern int tsc_perf_stat;
#ifdef SMP
extern int smp_tsc;
#endif
void i8254_init(void);
void i8254_delay(int);
void clock_init(void);
void lapic_calibrate(void);
+void tsc_init(void);
void tsc_calibrate(void);
/*
* Driver to clock driver interface.
*/
void startrtclock(void);
-void init_TSC(void);
+void start_TSC(void);
void resume_TSC(void);
#define HAS_TIMER_SPKR 1
int timer_spkr_acquire(void);
int timer_spkr_release(void);
void timer_spkr_setfreq(int freq);
#endif /* _KERNEL */
#endif /* !_MACHINE_CLOCK_H_ */
diff --git a/sys/x86/isa/clock.c b/sys/x86/isa/clock.c
index 1178d35979c1..f21f847709cd 100644
--- a/sys/x86/isa/clock.c
+++ b/sys/x86/isa/clock.c
@@ -1,659 +1,659 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1990 The Regents of the University of California.
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz and Don Ahn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)clock.c 7.2 (Berkeley) 5/12/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Routines to handle clock hardware.
*/
#include "opt_clock.h"
#include "opt_isa.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/kdb.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/timeet.h>
#include <sys/timetc.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <machine/ppireg.h>
#include <machine/timerreg.h>
#include <x86/apicvar.h>
#include <x86/init.h>
#include <isa/rtc.h>
#ifdef DEV_ISA
#include <isa/isareg.h>
#include <isa/isavar.h>
#endif
int clkintr_pending;
#ifndef TIMER_FREQ
#define TIMER_FREQ 1193182
#endif
u_int i8254_freq = TIMER_FREQ;
TUNABLE_INT("hw.i8254.freq", &i8254_freq);
int i8254_max_count;
static int i8254_timecounter = 1;
static struct mtx clock_lock;
static struct intsrc *i8254_intsrc;
static uint16_t i8254_lastcount;
static uint16_t i8254_offset;
static int (*i8254_pending)(struct intsrc *);
static int i8254_ticked;
/*
* Per-device state of the AT timer (i8254) driver.
*/
struct attimer_softc {
/* Set to 1 by attimer_start() once the interrupt source has been enabled. */
int intr_en;
/* Resource IDs passed to bus_alloc_resource() for the port and IRQ. */
int port_rid, intr_rid;
/* I/O port resource (may be NULL if the mapping failed at attach). */
struct resource *port_res;
/* IRQ resource. */
struct resource *intr_res;
/* Cookie filled in by bus_setup_intr(). */
void *intr_handler;
/* Timecounter registered with tc_init() when i8254_timecounter is set. */
struct timecounter tc;
/* Event timer registered with et_register(). */
struct eventtimer et;
/* Current operating mode, one of the MODE_* values below. */
int mode;
#define MODE_STOP 0
#define MODE_PERIODIC 1
#define MODE_ONESHOT 2
/* Period last requested; handed to set_i8254_freq() together with mode. */
uint32_t period;
};
static struct attimer_softc *attimer_sc = NULL;
static int timer0_period = -2;
static int timer0_mode = 0xffff;
static int timer0_last = 0xffff;
/* Values for timerX_state: */
#define RELEASED 0
#define RELEASE_PENDING 1
#define ACQUIRED 2
#define ACQUIRE_PENDING 3
static u_char timer2_state;
static unsigned i8254_get_timecount(struct timecounter *tc);
static void set_i8254_freq(int mode, uint32_t period);
/*
 * Early clock setup: create the spin mutex protecting the i8254 registers
 * and then bring up the early clock source so that DELAY() becomes usable.
 */
void
clock_init(void)
{
	/* The lock must exist before any clock source touches the timer. */
	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);

	/* Platform hook that prepares the clock used by DELAY(). */
	init_ops.early_clock_source_init();
}
/*
 * Interrupt filter for the i8254.  "arg" is the driver softc.
 *
 * When the i8254 doubles as a timecounter and a period is programmed, the
 * software extension of the 16-bit counter is advanced here, unless
 * i8254_get_timecount() already accounted for this rollover (i8254_ticked).
 * Afterwards the event timer callback is invoked if the timer is active.
 */
static int
clkintr(void *arg)
{
	struct attimer_softc *sc;

	sc = (struct attimer_softc *)arg;

	if (i8254_timecounter && sc->period != 0) {
		mtx_lock_spin(&clock_lock);
		if (!i8254_ticked) {
			i8254_offset += i8254_max_count;
			i8254_lastcount = 0;
		} else {
			/* Rollover was already counted by the reader. */
			i8254_ticked = 0;
		}
		clkintr_pending = 0;
		mtx_unlock_spin(&clock_lock);
	}

	if (sc->et.et_active && sc->mode != MODE_STOP) {
		sc->et.et_event_cb(&sc->et, sc->et.et_arg);
	}

	return (FILTER_HANDLED);
}
/*
 * Claim timer 2 for the speaker.
 *
 * Returns -1 if the timer is not currently released; otherwise marks it
 * acquired, programs it for 16-bit square wave operation, routes its output
 * to the speaker and returns 0.
 */
int
timer_spkr_acquire(void)
{
	int mode;

	if (timer2_state != RELEASED)
		return (-1);
	timer2_state = ACQUIRED;

	mode = TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT;

	/*
	 * The mode register is written with a single port access, which is
	 * as atomic as this can be made without knowing the rate.  Timer 2
	 * is not treated as carefully as timer 0; small glitches here are
	 * considered acceptable.
	 */
	outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f));

	/* Enable counter 2 output to the speaker. */
	ppi_spkr_on();
	return (0);
}
/*
 * Give timer 2 back.
 *
 * Returns -1 unless the timer is currently acquired; otherwise marks it
 * released, rewrites its mode register, disconnects it from the speaker
 * and returns 0.
 */
int
timer_spkr_release(void)
{
	if (timer2_state == ACQUIRED) {
		timer2_state = RELEASED;
		outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
		/* Disable counter 2 output to the speaker. */
		ppi_spkr_off();
		return (0);
	}
	return (-1);
}
/*
 * Load timer 2 (the counter that can drive the speaker) with the divisor
 * i8254_freq / freq.  The 16-bit divisor is written low byte first while
 * holding clock_lock.
 *
 * A frequency of zero cannot be turned into a divisor; such a request is
 * ignored instead of faulting on the division.
 */
void
timer_spkr_setfreq(int freq)
{
	int divisor;

	if (freq == 0)
		return;

	divisor = i8254_freq / freq;
	mtx_lock_spin(&clock_lock);
	outb(TIMER_CNTR2, divisor & 0xff);
	outb(TIMER_CNTR2, divisor >> 8);
	mtx_unlock_spin(&clock_lock);
}
/*
 * Return the current 16-bit value of timer 0.  The counter is latched and
 * both bytes are read back while holding clock_lock.
 */
static int
getit(void)
{
	int lsb, msb;

	mtx_lock_spin(&clock_lock);

	/* Latch timer 0, then read the low byte followed by the high byte. */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
	lsb = inb(TIMER_CNTR0);
	msb = inb(TIMER_CNTR0);

	mtx_unlock_spin(&clock_lock);

	return ((msb << 8) | lsb);
}
/*
* Wait "n" microseconds.
* Relies on timer 1 counting down from (i8254_freq / hz)
* Note: timer had better have been programmed before this is first used!
*
* This is a busy wait: n is converted into a budget of i8254 ticks, then the
* counter is sampled repeatedly and the ticks elapsed between samples are
* subtracted from the budget until nothing is left.  A value of n <= 0
* returns after the initial counter read.
*/
void
i8254_delay(int n)
{
int delta, prev_tick, tick, ticks_left;
#ifdef DELAYDEBUG
int getit_calls = 1;
int n1;
static int state = 0;
/*
* One-time self test: on the very first call, run DELAY() for 1, 10, ...
* 10000000 microseconds; while it runs (state == 1) each call reports
* its own statistics.
*/
if (state == 0) {
state = 1;
for (n1 = 1; n1 <= 10000000; n1 *= 10)
DELAY(n1);
state = 2;
}
if (state == 1)
printf("DELAY(%d)...", n);
#endif
/*
* Read the counter first, so that the rest of the setup overhead is
* counted. Guess the initial overhead is 20 usec (on most systems it
* takes about 1.5 usec for each of the i/o's in getit(). The loop
* takes about 6 usec on a 486/33 and 13 usec on a 386/20. The
* multiplications and divisions to scale the count take a while).
*
* However, if ddb is active then use a fake counter since reading
* the i8254 counter involves acquiring a lock. ddb must not do
* locking for many reasons, but it calls here for at least atkbd
* input.
*/
#ifdef KDB
if (kdb_active)
prev_tick = 1;
else
#endif
prev_tick = getit();
n -= 0; /* XXX actually guess no initial overhead */
/*
* Calculate (n * (i8254_freq / 1e6)) without using floating point
* and without any avoidable overflows.
*/
if (n <= 0)
ticks_left = 0;
else if (n < 256)
/*
* Use fixed point to avoid a slow division by 1000000.
* 39099 = 1193182 * 2^15 / 10^6 rounded to nearest.
* 2^15 is the first power of 2 that gives exact results
* for n between 0 and 256.
*/
ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15;
else
/*
* Don't bother using fixed point, although gcc-2.7.2
* generates particularly poor code for the long long
* division, since even the slow way will complete long
* before the delay is up (unless we're interrupted).
*/
ticks_left = ((u_int)n * (long long)i8254_freq + 999999)
/ 1000000;
/* Spin until the tick budget has been consumed. */
while (ticks_left > 0) {
#ifdef KDB
if (kdb_active) {
/*
* Fake counter: do a port read, then step the count down by
* one, wrapping to i8254_max_count.
*/
inb(0x84);
tick = prev_tick - 1;
if (tick <= 0)
tick = i8254_max_count;
} else
#endif
tick = getit();
#ifdef DELAYDEBUG
++getit_calls;
#endif
/* The counter counts down, so elapsed ticks are previous minus current. */
delta = prev_tick - tick;
prev_tick = tick;
/* A negative difference means the counter was reloaded in between. */
if (delta < 0) {
delta += i8254_max_count;
/*
* Guard against i8254_max_count being wrong.
* This shouldn't happen in normal operation,
* but it may happen if set_i8254_freq() is
* traced.
*/
if (delta < 0)
delta = 0;
}
ticks_left -= delta;
}
#ifdef DELAYDEBUG
if (state == 1)
printf(" %d calls to getit() at %d usec each\n",
getit_calls, (n + 5) / getit_calls);
#endif
}
/*
* Program timer 0 of the i8254 for the requested mode and period.
*
* mode is one of MODE_STOP, MODE_PERIODIC or MODE_ONESHOT; any other value
* panics.  period is turned into a tick count as
* (i8254_freq * period) >> 32, rounded to nearest and clamped to 0x10000,
* i.e. the arithmetic treats period as a 32.32 fixed-point number of
* seconds.  A MODE_STOP request made while the i8254 also serves as the
* timecounter is converted into a periodic request with the maximum count,
* so the counter keeps running.
*
* The whole operation runs under clock_lock.  Nothing is written to the
* hardware when the computed count equals the cached timer0_period.
*/
static void
set_i8254_freq(int mode, uint32_t period)
{
int new_count, new_mode;
mtx_lock_spin(&clock_lock);
if (mode == MODE_STOP) {
if (i8254_timecounter) {
mode = MODE_PERIODIC;
new_count = 0x10000;
} else
new_count = -1;
} else {
new_count = min(((uint64_t)i8254_freq * period +
0x80000000LLU) >> 32, 0x10000);
}
/* Already programmed with this periodic count: nothing to do. */
if (new_count == timer0_period)
goto out;
/* Counts that do not fit in 16 bits are recorded as 0xffff. */
i8254_max_count = ((new_count & ~0xffff) != 0) ? 0xffff : new_count;
/* Only periodic counts are cached; other modes store -1. */
timer0_period = (mode == MODE_PERIODIC) ? new_count : -1;
switch (mode) {
case MODE_STOP:
new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT;
outb(TIMER_MODE, new_mode);
outb(TIMER_CNTR0, 0);
outb(TIMER_CNTR0, 0);
break;
case MODE_PERIODIC:
new_mode = TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT;
outb(TIMER_MODE, new_mode);
outb(TIMER_CNTR0, new_count & 0xff);
outb(TIMER_CNTR0, new_count >> 8);
break;
case MODE_ONESHOT:
/*
* When both this count and the previous one fit in a byte, use the
* LSB-only access mode so a single port write loads the counter.
*/
if (new_count < 256 && timer0_last < 256) {
new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_LSB;
if (new_mode != timer0_mode)
outb(TIMER_MODE, new_mode);
outb(TIMER_CNTR0, new_count & 0xff);
break;
}
/* Otherwise load both bytes; skip the mode write if it is unchanged. */
new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT;
if (new_mode != timer0_mode)
outb(TIMER_MODE, new_mode);
outb(TIMER_CNTR0, new_count & 0xff);
outb(TIMER_CNTR0, new_count >> 8);
break;
default:
panic("set_i8254_freq: unknown operational mode");
}
/* Remember what was programmed for the shortcuts above. */
timer0_mode = new_mode;
timer0_last = new_count;
out:
mtx_unlock_spin(&clock_lock);
}
static void
i8254_restore(void)
{
timer0_period = -2;
timer0_mode = 0xffff;
timer0_last = 0xffff;
if (attimer_sc != NULL)
set_i8254_freq(attimer_sc->mode, attimer_sc->period);
else
set_i8254_freq(MODE_STOP, 0);
}
/*
 * Put timer 0 into its MODE_STOP configuration.  Kept apart from
 * startrtclock() so that it can be called early.
 */
void
i8254_init(void)
{
	set_i8254_freq(MODE_STOP, 0);
}
void
-startrtclock()
+startrtclock(void)
{
- init_TSC();
+ start_TSC();
}
/*
 * Calibrate the TSC and the local APIC timer, then initialize the clocks on
 * the boot processor.  With EARLY_AP_STARTUP the current thread is then
 * bound to every other CPU in turn to run cpu_initclocks_ap() there, and is
 * unbound again at the end.
 */
void
cpu_initclocks(void)
{
#ifdef EARLY_AP_STARTUP
	struct thread *td;
	int i;

	td = curthread;
#endif

	tsc_calibrate();
	lapic_calibrate_timer();
	cpu_initclocks_bsp();

#ifdef EARLY_AP_STARTUP
	CPU_FOREACH(i) {
		/* CPU 0 was handled by cpu_initclocks_bsp() above. */
		if (i != 0) {
			thread_lock(td);
			sched_bind(td, i);
			thread_unlock(td);
			cpu_initclocks_ap();
		}
	}

	thread_lock(td);
	if (sched_is_bound(td))
		sched_unbind(td);
	thread_unlock(td);
#endif
}
static int
sysctl_machdep_i8254_freq(SYSCTL_HANDLER_ARGS)
{
int error;
u_int freq;
/*
* Use `i8254' instead of `timer' in external names because `timer'
* is too generic. Should use it everywhere.
*/
freq = i8254_freq;
error = sysctl_handle_int(oidp, &freq, 0, req);
if (error == 0 && req->newptr != NULL) {
i8254_freq = freq;
if (attimer_sc != NULL) {
set_i8254_freq(attimer_sc->mode, attimer_sc->period);
attimer_sc->tc.tc_frequency = freq;
} else {
set_i8254_freq(MODE_STOP, 0);
}
}
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, i8254_freq,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
0, sizeof(u_int), sysctl_machdep_i8254_freq, "IU",
"i8254 timer frequency");
/*
* Timecounter read method for the i8254.
*
* With no period programmed (sc->period == 0) the result is simply the
* number of ticks counted down from i8254_max_count.  Otherwise the latched
* counter value is converted to an up-count and extended with i8254_offset,
* which accumulates one i8254_max_count per counter rollover.  A rollover
* that is noticed here before clkintr() has run is accounted for at once
* and flagged through i8254_ticked so that clkintr() does not add it again.
*/
static unsigned
i8254_get_timecount(struct timecounter *tc)
{
device_t dev = (device_t)tc->tc_priv;
struct attimer_softc *sc = device_get_softc(dev);
register_t flags;
uint16_t count;
u_int high, low;
if (sc->period == 0)
return (i8254_max_count - getit());
/* Capture the flags register before taking the spin lock. */
#ifdef __amd64__
flags = read_rflags();
#else
flags = read_eflags();
#endif
mtx_lock_spin(&clock_lock);
/* Select timer0 and latch counter value. */
outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
low = inb(TIMER_CNTR0);
high = inb(TIMER_CNTR0);
/* The hardware counts down; turn the reading into ticks since reload. */
count = i8254_max_count - ((high << 8) | low);
/*
* Decide whether the counter rolled over since the last accounting:
* either the count went backwards, or no rollover has been claimed yet
* and an interrupt is pending (flagged in clkintr_pending, or reported
* by the interrupt controller while the count is small, or is in the
* lower half with PSL_I clear in the captured flags).
*/
if (count < i8254_lastcount ||
(!i8254_ticked && (clkintr_pending ||
((count < 20 || (!(flags & PSL_I) &&
count < i8254_max_count / 2u)) &&
i8254_pending != NULL && i8254_pending(i8254_intsrc))))) {
i8254_ticked = 1;
i8254_offset += i8254_max_count;
}
i8254_lastcount = count;
count += i8254_offset;
mtx_unlock_spin(&clock_lock);
return (count);
}
/*
 * Event timer start method.
 *
 * A non-zero "period" selects periodic operation with that period;
 * otherwise the timer is armed as a one-shot using "first".  The interrupt
 * source is enabled on first use, then timer 0 is programmed.  Always
 * returns 0.
 */
static int
attimer_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
	struct attimer_softc *sc;
	device_t dev;

	dev = (device_t)et->et_priv;
	sc = device_get_softc(dev);

	if (period == 0) {
		sc->mode = MODE_ONESHOT;
		sc->period = first;
	} else {
		sc->mode = MODE_PERIODIC;
		sc->period = period;
	}

	/* The source is left disabled at attach time; turn it on once. */
	if (!sc->intr_en) {
		i8254_intsrc->is_pic->pic_enable_source(i8254_intsrc);
		sc->intr_en = 1;
	}

	set_i8254_freq(sc->mode, sc->period);
	return (0);
}
static int
attimer_stop(struct eventtimer *et)
{
device_t dev = (device_t)et->et_priv;
struct attimer_softc *sc = device_get_softc(dev);
sc->mode = MODE_STOP;
sc->period = 0;
set_i8254_freq(sc->mode, sc->period);
return (0);
}
#ifdef DEV_ISA
/*
* Attach to the ISA PnP descriptors for the timer
*/
static struct isa_pnp_id attimer_ids[] = {
{ 0x0001d041 /* PNP0100 */, "AT timer" },
{ 0 }
};
/*
 * Probe method.  Devices matching the PnP ID table get whatever
 * ISA_PNP_PROBE() returns.  ENOENT means the device carries no PnP ID, i.e.
 * it is hinted; it is then described here and claimed at low priority.
 */
static int
attimer_probe(device_t dev)
{
	int result;

	result = ISA_PNP_PROBE(device_get_parent(dev), dev, attimer_ids);
	if (result != ENOENT)
		return (result);

	device_set_desc(dev, "AT timer");
	return (BUS_PROBE_LOW_PRIORITY);
}
/*
* Attach method.
*
* Clears the softc and publishes it in attimer_sc, allocates the I/O port
* range (a failure only produces a warning), looks up interrupt source 0,
* stops timer 0, and registers the i8254 as a timecounter unless the
* "timecounter" hint turned that off.  Unless the "clock" hint is present
* and zero, it also sets up the interrupt filter and registers the event
* timer.  Returns 0 in all cases, including when the interrupt could not
* be allocated or set up.
*/
static int
attimer_attach(device_t dev)
{
struct attimer_softc *sc;
rman_res_t s;
int i;
attimer_sc = sc = device_get_softc(dev);
bzero(sc, sizeof(struct attimer_softc));
if (!(sc->port_res = bus_alloc_resource(dev, SYS_RES_IOPORT,
&sc->port_rid, IO_TIMER1, IO_TIMER1 + 3, 4, RF_ACTIVE)))
device_printf(dev,"Warning: Couldn't map I/O.\n");
i8254_intsrc = intr_lookup_source(0);
if (i8254_intsrc != NULL)
i8254_pending = i8254_intsrc->is_pic->pic_source_pending;
/* The hint, if present, overrides the default of i8254_timecounter. */
resource_int_value(device_get_name(dev), device_get_unit(dev),
"timecounter", &i8254_timecounter);
set_i8254_freq(MODE_STOP, 0);
if (i8254_timecounter) {
sc->tc.tc_get_timecount = i8254_get_timecount;
sc->tc.tc_counter_mask = 0xffff;
sc->tc.tc_frequency = i8254_freq;
sc->tc.tc_name = "i8254";
sc->tc.tc_quality = 0;
sc->tc.tc_priv = dev;
tc_init(&sc->tc);
}
/* Set up the event timer unless the "clock" hint exists and is zero. */
if (resource_int_value(device_get_name(dev), device_get_unit(dev),
"clock", &i) != 0 || i != 0) {
/* Advance intr_rid past the IRQ resources that are already set and non-zero. */
sc->intr_rid = 0;
while (bus_get_resource(dev, SYS_RES_IRQ, sc->intr_rid,
&s, NULL) == 0 && s != 0)
sc->intr_rid++;
if (!(sc->intr_res = bus_alloc_resource(dev, SYS_RES_IRQ,
&sc->intr_rid, 0, 0, 1, RF_ACTIVE))) {
device_printf(dev,"Can't map interrupt.\n");
return (0);
}
/*
* NOTE(review): i8254_intsrc is dereferenced from here on without the
* NULL check applied right after intr_lookup_source() above; confirm
* that a successful IRQ 0 allocation guarantees a non-NULL source.
*/
/* Dirty hack, to make bus_setup_intr to not enable source. */
i8254_intsrc->is_handlers++;
if ((bus_setup_intr(dev, sc->intr_res,
INTR_MPSAFE | INTR_TYPE_CLK,
(driver_filter_t *)clkintr, NULL,
sc, &sc->intr_handler))) {
device_printf(dev, "Can't setup interrupt.\n");
i8254_intsrc->is_handlers--;
return (0);
}
i8254_intsrc->is_handlers--;
i8254_intsrc->is_pic->pic_enable_intr(i8254_intsrc);
sc->et.et_name = "i8254";
sc->et.et_flags = ET_FLAGS_PERIODIC;
/* One-shot mode is only advertised when the i8254 is not the timecounter. */
if (!i8254_timecounter)
sc->et.et_flags |= ET_FLAGS_ONESHOT;
sc->et.et_quality = 100;
sc->et.et_frequency = i8254_freq;
/* Period limits: 0x0002 and 0xfffe ticks, scaled by 2^32 / i8254_freq. */
sc->et.et_min_period = (0x0002LLU << 32) / i8254_freq;
sc->et.et_max_period = (0xfffeLLU << 32) / i8254_freq;
sc->et.et_start = attimer_start;
sc->et.et_stop = attimer_stop;
sc->et.et_priv = dev;
et_register(&sc->et);
}
return(0);
}
/*
 * Resume method: reprogram timer 0 from the saved driver state.  Always
 * returns 0.
 */
static int
attimer_resume(device_t dev)
{
	i8254_restore();

	return (0);
}
static device_method_t attimer_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, attimer_probe),
DEVMETHOD(device_attach, attimer_attach),
DEVMETHOD(device_detach, bus_generic_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, attimer_resume),
{ 0, 0 }
};
static driver_t attimer_driver = {
"attimer",
attimer_methods,
sizeof(struct attimer_softc),
};
static devclass_t attimer_devclass;
DRIVER_MODULE(attimer, isa, attimer_driver, attimer_devclass, 0, 0);
DRIVER_MODULE(attimer, acpi, attimer_driver, attimer_devclass, 0, 0);
ISA_PNP_INFO(attimer_ids);
#endif /* DEV_ISA */
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index 317be8979feb..82ee358b6895 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -1,942 +1,955 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998-2003 Poul-Henning Kamp
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_clock.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/kernel.h>
#include <sys/smp.h>
#include <sys/vdso.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <machine/fpu.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <x86/vmware.h>
#include <dev/acpica/acpi_hpet.h>
#include <contrib/dev/acpica/include/acpi.h>
#include "cpufreq_if.h"
uint64_t tsc_freq;
int tsc_is_invariant;
int tsc_perf_stat;
static int tsc_early_calib_exact;
static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
&tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
#ifdef SMP
int smp_tsc;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
"Indicates whether the TSC is safe to use in SMP mode");
int smp_tsc_adjust = 0;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
&smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
#endif
static int tsc_shift = 1;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
&tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
static int tsc_disabled;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
"Disable x86 Time Stamp Counter");
static int tsc_skip_calibration;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
&tsc_skip_calibration, 0,
"Disable early TSC frequency calibration");
static void tsc_freq_changed(void *arg, const struct cf_level *level,
int status);
static void tsc_freq_changing(void *arg, const struct cf_level *level,
int *status);
static u_int tsc_get_timecount(struct timecounter *tc);
static inline u_int tsc_get_timecount_low(struct timecounter *tc);
static u_int tsc_get_timecount_lfence(struct timecounter *tc);
static u_int tsc_get_timecount_low_lfence(struct timecounter *tc);
static u_int tsc_get_timecount_mfence(struct timecounter *tc);
static u_int tsc_get_timecount_low_mfence(struct timecounter *tc);
static u_int tscp_get_timecount(struct timecounter *tc);
static u_int tscp_get_timecount_low(struct timecounter *tc);
static void tsc_levels_changed(void *arg, int unit);
static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
struct timecounter *tc);
#ifdef COMPAT_FREEBSD32
static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
struct timecounter *tc);
#endif
static struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_counter_mask = ~0u,
.tc_name = "TSC",
.tc_quality = 800, /* adjusted in code */
.tc_fill_vdso_timehands = x86_tsc_vdso_timehands,
#ifdef COMPAT_FREEBSD32
.tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32,
#endif
};
static int
tsc_freq_cpuid_vm(void)
{
u_int regs[4];
if (vm_guest == VM_GUEST_NO)
return (false);
if (hv_high < 0x40000010)
return (false);
do_cpuid(0x40000010, regs);
tsc_freq = (uint64_t)(regs[0]) * 1000;
tsc_early_calib_exact = 1;
return (true);
}
/*
 * Ask the VMware hypervisor for the TSC frequency.  The 64-bit result is
 * assembled from the first two returned registers; a high word of UINT_MAX
 * leaves tsc_freq untouched.  The early calibration is marked exact either
 * way.
 */
static void
tsc_freq_vmware(void)
{
	u_int regs[4];

	vmware_hvcall(VMW_HVCMD_GETHZ, regs);
	if (regs[1] != UINT_MAX) {
		tsc_freq = ((uint64_t)regs[1] << 32) | regs[0];
	}
	tsc_early_calib_exact = 1;
}
/*
 * Derive the TSC frequency from CPUID.
 *
 * Leaf 0x15 ('Time Stamp Counter and Nominal Core Crystal Clock') is tried
 * first.  Where it is not functional, as on Skylake/Kabylake, leaf 0x16
 * ('Processor Frequency Information') is used instead; the SDM describes it
 * as informational only, but the value is usable until late calibration has
 * completed.
 *
 * On success the frequency is stored in *res and true is returned; *res is
 * left untouched on failure.
 */
static bool
tsc_freq_cpuid(uint64_t *res)
{
	u_int regs[4];

	if (cpu_high >= 0x15) {
		do_cpuid(0x15, regs);
		if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
			/* ECX scaled by the EBX/EAX ratio. */
			*res = (uint64_t)regs[2] * regs[1] / regs[0];
			return (true);
		}

		if (cpu_high >= 0x16) {
			do_cpuid(0x16, regs);
			if (regs[0] != 0) {
				/* EAX is scaled by 10^6. */
				*res = (uint64_t)regs[0] * 1000000;
				return (true);
			}
		}
	}
	return (false);
}
/*
 * Parse the nominal frequency out of the processor brand string.
 *
 * The 48-byte brand string is read from CPUID leaves 0x80000002 through
 * 0x80000004.  The last occurrence of "Hz" is located, the character in
 * front of it selects the unit (M, G or T) and the four characters before
 * that hold the number, either as "d.dd" or as "dddd".
 *
 * On success the frequency in Hz is stored in *res and true is returned.
 * False is returned when the leaves are unavailable, when no "Hz" is found,
 * when the unit is not recognized, or when "Hz" appears so early in the
 * string that the number and unit could not precede it.
 */
static bool
tsc_freq_intel_brand(uint64_t *res)
{
	char brand[48];
	u_int regs[4];
	uint64_t freq;
	char *p;
	u_int i;

	/*
	 * Intel Processor Identification and the CPUID Instruction
	 * Application Note 485.
	 * http://www.intel.com/assets/pdf/appnote/241618.pdf
	 */
	if (cpu_exthigh < 0x80000004)
		return (false);

	/* Each leaf contributes 16 bytes of the string. */
	p = brand;
	for (i = 0x80000002; i < 0x80000005; i++) {
		do_cpuid(i, regs);
		memcpy(p, regs, sizeof(regs));
		p += sizeof(regs);
	}

	/* Remember the last "Hz" in the string. */
	p = NULL;
	for (i = 0; i < sizeof(brand) - 1; i++)
		if (brand[i] == 'H' && brand[i + 1] == 'z')
			p = brand + i;

	/*
	 * Four characters of number plus one unit character must fit in
	 * front of "Hz"; otherwise stepping back would leave the buffer.
	 */
	if (p == NULL || p - brand < 5)
		return (false);
	p -= 5;

	/* Unit multiplier, expressed in MHz. */
	switch (p[4]) {
	case 'M':
		i = 1;
		break;
	case 'G':
		i = 1000;
		break;
	case 'T':
		i = 1000000;
		break;
	default:
		return (false);
	}

#define	C2D(c)	((c) - '0')
	if (p[1] == '.') {
		/* "d.dd": thousandths of the unit. */
		freq = C2D(p[0]) * 1000;
		freq += C2D(p[2]) * 100;
		freq += C2D(p[3]) * 10;
		freq *= i * 1000;
	} else {
		/* "dddd": whole units. */
		freq = C2D(p[0]) * 1000;
		freq += C2D(p[1]) * 100;
		freq += C2D(p[2]) * 10;
		freq += C2D(p[3]);
		freq *= i * 1000000;
	}
#undef C2D
	*res = freq;
	return (true);
}
/*
 * Estimate the TSC frequency by timing a 100ms DELAY() with the TSC and
 * store the result in *res.
 *
 * The fixed cost of a DELAY() call is measured first with a few DELAY(0)
 * calls and subtracted from the measurement.  The first sample is thrown
 * away, so the average is taken over the remaining count - 1 samples.
 */
static void
tsc_freq_8254(uint64_t *res)
{
	uint64_t tsc1, tsc2;
	int64_t overhead;
	int count, i;

	overhead = 0;
	for (i = 0, count = 8; i < count; i++) {
		tsc1 = rdtsc_ordered();
		DELAY(0);
		tsc2 = rdtsc_ordered();
		/* Skip the first iteration. */
		if (i > 0)
			overhead += tsc2 - tsc1;
	}
	/* Average over the samples that were actually accumulated. */
	overhead /= count - 1;

	tsc1 = rdtsc_ordered();
	DELAY(100000);
	tsc2 = rdtsc_ordered();
	/* Ticks per 100ms, scaled to ticks per second. */
	*res = (tsc2 - tsc1 - overhead) * 10;
}
static void
probe_tsc_freq(void)
{
- if (cpu_power_ecx & CPUID_PERF_STAT) {
- /*
- * XXX Some emulators expose host CPUID without actual support
- * for these MSRs. We must test whether they really work.
- */
- wrmsr(MSR_MPERF, 0);
- wrmsr(MSR_APERF, 0);
- DELAY(10);
- if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
- tsc_perf_stat = 1;
+#ifdef __i386__
+ /* The TSC is known to be broken on certain CPUs. */
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_AMD:
+ switch (cpu_id & 0xFF0) {
+ case 0x500:
+ /* K5 Model 0 */
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ case CPU_VENDOR_CENTAUR:
+ switch (cpu_id & 0xff0) {
+ case 0x540:
+ /*
+ * http://www.centtech.com/c6_data_sheet.pdf
+ *
+ * I-12 RDTSC may return incoherent values in EDX:EAX
+ * I-13 RDTSC hangs when certain event counters are used
+ */
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ case CPU_VENDOR_NSC:
+ switch (cpu_id & 0xff0) {
+ case 0x540:
+ if ((cpu_id & CPUID_STEPPING) == 0) {
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ }
+ break;
}
+#endif
switch (cpu_vendor_id) {
case CPU_VENDOR_AMD:
case CPU_VENDOR_HYGON:
if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
(vm_guest == VM_GUEST_NO &&
CPUID_TO_FAMILY(cpu_id) >= 0x10))
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_mfence;
}
break;
case CPU_VENDOR_INTEL:
if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
(vm_guest == VM_GUEST_NO &&
((CPUID_TO_FAMILY(cpu_id) == 0x6 &&
CPUID_TO_MODEL(cpu_id) >= 0xe) ||
(CPUID_TO_FAMILY(cpu_id) == 0xf &&
CPUID_TO_MODEL(cpu_id) >= 0x3))))
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_lfence;
}
break;
case CPU_VENDOR_CENTAUR:
if (vm_guest == VM_GUEST_NO &&
CPUID_TO_FAMILY(cpu_id) == 0x6 &&
CPUID_TO_MODEL(cpu_id) >= 0xf &&
(rdmsr(0x1203) & 0x100000000ULL) == 0)
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_lfence;
}
break;
}
- if (tsc_freq_cpuid_vm())
- return;
-
- if (vm_guest == VM_GUEST_VMWARE) {
+ if (tsc_freq_cpuid_vm()) {
+ if (bootverbose)
+ printf(
+ "Early TSC frequency %juHz derived from hypervisor CPUID\n",
+ (uintmax_t)tsc_freq);
+ } else if (vm_guest == VM_GUEST_VMWARE) {
tsc_freq_vmware();
- return;
- }
-
- if (tsc_freq_cpuid(&tsc_freq)) {
+ if (bootverbose)
+ printf(
+ "Early TSC frequency %juHz derived from VMWare hypercall\n",
+ (uintmax_t)tsc_freq);
+ } else if (tsc_freq_cpuid(&tsc_freq)) {
/*
* If possible, use the value obtained from CPUID as the initial
* frequency. This will be refined later during boot but is
* good enough for now. The 8254 PIT is not functional on some
* newer platforms anyway, so don't delay our boot for what
* might be a garbage result. Late calibration is required if
* the initial frequency was obtained from CPUID.16H, as the
* derived value may be off by as much as 1%.
*/
if (bootverbose)
printf("Early TSC frequency %juHz derived from CPUID\n",
(uintmax_t)tsc_freq);
} else if (tsc_skip_calibration) {
/*
* Try to parse the brand string to obtain the nominal TSC
* frequency.
*/
if (cpu_vendor_id == CPU_VENDOR_INTEL &&
tsc_freq_intel_brand(&tsc_freq)) {
if (bootverbose)
printf(
"Early TSC frequency %juHz derived from brand string\n",
(uintmax_t)tsc_freq);
} else {
tsc_disabled = 1;
}
} else {
/*
* Calibrate against the 8254 PIT. This estimate will be
* refined later in tsc_calibrate().
*/
tsc_freq_8254(&tsc_freq);
if (bootverbose)
printf(
"Early TSC frequency %juHz calibrated from 8254 PIT\n",
(uintmax_t)tsc_freq);
}
+
+ if (cpu_power_ecx & CPUID_PERF_STAT) {
+ /*
+ * XXX Some emulators expose host CPUID without actual support
+ * for these MSRs. We must test whether they really work.
+ */
+ wrmsr(MSR_MPERF, 0);
+ wrmsr(MSR_APERF, 0);
+ DELAY(10);
+ if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
+ tsc_perf_stat = 1;
+ }
}
void
-init_TSC(void)
+start_TSC(void)
{
-
if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
return;
-#ifdef __i386__
- /* The TSC is known to be broken on certain CPUs. */
- switch (cpu_vendor_id) {
- case CPU_VENDOR_AMD:
- switch (cpu_id & 0xFF0) {
- case 0x500:
- /* K5 Model 0 */
- return;
- }
- break;
- case CPU_VENDOR_CENTAUR:
- switch (cpu_id & 0xff0) {
- case 0x540:
- /*
- * http://www.centtech.com/c6_data_sheet.pdf
- *
- * I-12 RDTSC may return incoherent values in EDX:EAX
- * I-13 RDTSC hangs when certain event counters are used
- */
- return;
- }
- break;
- case CPU_VENDOR_NSC:
- switch (cpu_id & 0xff0) {
- case 0x540:
- if ((cpu_id & CPUID_STEPPING) == 0)
- return;
- break;
- }
- break;
- }
-#endif
-
- probe_tsc_freq();
-
/*
* Inform CPU accounting about our boot-time clock rate. This will
* be updated if someone loads a cpufreq driver after boot that
* discovers a new max frequency.
*
* The frequency may also be updated after late calibration is complete;
* however, we register the TSC as the ticker now to avoid switching
* counters after much of the kernel has already booted and potentially
* sampled the CPU clock.
*/
if (tsc_freq != 0)
set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
if (tsc_is_invariant)
return;
/* Register to find out about changes in CPU frequency. */
tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST);
tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed,
tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY);
}
#ifdef SMP
/*
* RDTSC is not a serializing instruction, and does not drain
* instruction stream, so we need to drain the stream before executing
* it. It could be fixed by use of RDTSCP, except the instruction is
* not available everywhere.
*
* Use CPUID for draining in the boot-time SMP consistency test. The
* timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel
* and VIA) when SSE2 is present, and nothing on older machines which
* also do not issue RDTSC prematurely. There, testing for SSE2 and
* vendor is too cumbersome, and we learn about TSC presence from CPUID.
*
* Do not use do_cpuid(), since we do not need CPUID results, which
* have to be written into memory with do_cpuid().
*/
#define TSC_READ(x) \
static void \
tsc_read_##x(void *arg) \
{ \
uint64_t *tsc = arg; \
u_int cpu = PCPU_GET(cpuid); \
\
__asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \
tsc[cpu * 3 + x] = rdtsc(); \
}
TSC_READ(0)
TSC_READ(1)
TSC_READ(2)
#undef TSC_READ
#define N 1000
static void
comp_smp_tsc(void *arg)
{
uint64_t *tsc;
int64_t d1, d2;
u_int cpu = PCPU_GET(cpuid);
u_int i, j, size;
size = (mp_maxid + 1) * 3;
for (i = 0, tsc = arg; i < N; i++, tsc += size)
CPU_FOREACH(j) {
if (j == cpu)
continue;
d1 = tsc[cpu * 3 + 1] - tsc[j * 3];
d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1];
if (d1 <= 0 || d2 <= 0) {
smp_tsc = 0;
return;
}
}
}
/*
* Rendezvous action that tries to bring this CPU's counter in line with the
* first CPU's, using the samples recorded in "arg" (N rounds, three samples
* per CPU slot in each round).  The first CPU itself is left alone.
*
* Over all rounds a lower bound (min) and an upper bound (max) on the
* offset to the first CPU are collected.  If the bounds are contradictory
* nothing is changed; otherwise their midpoint is added to MSR 0x10
* (presumably the TSC MSR -- confirm against the architecture manual).
*/
static void
adj_smp_tsc(void *arg)
{
uint64_t *tsc;
int64_t d, min, max;
u_int cpu = PCPU_GET(cpuid);
u_int first, i, size;
first = CPU_FIRST();
if (cpu == first)
return;
/* Start with the widest possible interval and narrow it down. */
min = INT64_MIN;
max = INT64_MAX;
size = (mp_maxid + 1) * 3;
for (i = 0, tsc = arg; i < N; i++, tsc += size) {
/* Differences to a later local sample raise the lower bound. */
d = tsc[first * 3] - tsc[cpu * 3 + 1];
if (d > min)
min = d;
d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2];
if (d > min)
min = d;
/* Differences to an earlier local sample lower the upper bound. */
d = tsc[first * 3 + 1] - tsc[cpu * 3];
if (d < max)
max = d;
d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1];
if (d < max)
max = d;
}
if (min > max)
return;
/* Midpoint, halving each term first so that the sum cannot overflow. */
d = min / 2 + max / 2;
/* Read MSR 0x10, add the 64-bit adjustment with carry, write it back. */
__asm __volatile (
"movl $0x10, %%ecx\n\t"
"rdmsr\n\t"
"addl %%edi, %%eax\n\t"
"adcl %%esi, %%edx\n\t"
"wrmsr\n"
: /* No output */
: "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32))
: "ax", "cx", "dx", "cc"
);
}
/*
 * Run the SMP TSC synchronization test and translate the outcome into a
 * timecounter quality value.
 *
 * adj_max_count is the maximum number of adjustment passes (adj_smp_tsc)
 * attempted when the comparison fails; zero or a negative value means the
 * TSCs are never adjusted.
 *
 * Returns:
 *   -100  if neither smp_tsc nor tsc_is_invariant is set, or if the test
 *         fails or the TSC is not invariant;
 *      0  when running as a VirtualBox guest (the test is skipped);
 *   1000  when the test passes on an invariant TSC and the vendor/family
 *         checks below consider the TSCs synchronized;
 *    800  when the test passes on an invariant TSC otherwise.
 */
static int
test_tsc(int adj_max_count)
{
	uint64_t *data, *tsc;
	u_int i, size;
	int adj;	/* signed, so a negative adj_max_count disables retries */

	if ((!smp_tsc && !tsc_is_invariant))
		return (-100);
	/*
	 * Misbehavior of TSC under VirtualBox has been observed.  In
	 * particular, threads doing small (~1 second) sleeps may miss their
	 * wakeup and hang around in sleep state, causing hangs on shutdown.
	 */
	if (vm_guest == VM_GUEST_VBOX)
		return (0);

	TSENTER();
	/* Three samples per CPU slot, N rounds. */
	size = (mp_maxid + 1) * 3;
	data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK);
	adj = 0;
retry:
	/* Collect N rounds of samples, one rendezvous per round. */
	for (i = 0, tsc = data; i < N; i++, tsc += size)
		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
	/* Assume success; comp_smp_tsc clears the flag on any violation. */
	smp_tsc = 1;	/* XXX */
	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
	    smp_no_rendezvous_barrier, data);
	if (!smp_tsc && adj < adj_max_count) {
		adj++;
		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
		    smp_no_rendezvous_barrier, data);
		goto retry;
	}
	free(data, M_TEMP);
	if (bootverbose)
		printf("SMP: %sed TSC synchronization test%s\n",
		    smp_tsc ? "pass" : "fail",
		    adj > 0 ? " after adjustment" : "");
	TSEXIT();

	if (smp_tsc && tsc_is_invariant) {
		switch (cpu_vendor_id) {
		case CPU_VENDOR_AMD:
		case CPU_VENDOR_HYGON:
			/*
			 * Processor Programming Reference (PPR) for AMD
			 * Family 17h states that the TSC uses a common
			 * reference for all sockets, cores and threads.
			 */
			if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
				return (1000);

			/*
			 * Starting with Family 15h processors, TSC clock
			 * source is in the north bridge.  Check whether
			 * we have a single-socket/multi-core platform.
			 * XXX Need more work for complex cases.
			 */
			if (CPUID_TO_FAMILY(cpu_id) < 0x15 ||
			    (amd_feature2 & AMDID2_CMP) == 0 ||
			    smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1)
				break;
			return (1000);
		case CPU_VENDOR_INTEL:
			/*
			 * XXX Assume Intel platforms have synchronized TSCs.
			 */
			return (1000);
		default:
			break;
		}
		return (800);
	}
	return (-100);
}
#undef N
#endif /* SMP */
/*
 * Configure the TSC timecounter: flags, quality, the number of low bits
 * to discard, the read routine and the frequency.  Does nothing when
 * CPUID does not report a TSC or when the TSC has been disabled.
 * Registration of the timecounter itself happens later, after late
 * calibration.
 */
static void
init_TSC_tc(void)
{
	uint64_t freq_limit;
	int shift;

	if (tsc_disabled || (cpu_feature & CPUID_TSC) == 0)
		return;

	/*
	 * An Intel CPU whose TSC is not C-state invariant may stop the
	 * counter in C2 or C3, so flag the timecounter to keep those states
	 * disabled while it is selected.  Switching to another timecounter
	 * allows C2 and C3 again.
	 *
	 * The TSC remains the cputicker used for thread runtime accounting
	 * whatever timecounter is chosen, so an alternate timecounter with
	 * C2 or C3 enabled can result in incorrect runtimes for kernel idle
	 * threads (but not for any non-idle threads).
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
		if (bootverbose)
			printf("TSC timecounter disables C2 and C3.\n");
	}

	/*
	 * The TSC is only usable in SMP mode when the TSCs of all CPUs are
	 * synchronized.  A user who is sure of that can set the
	 * kern.timecounter.smp_tsc tunable to a non-zero value.  The TSC
	 * seems unreliable in virtualized SMP environments, so it gets a
	 * negative quality there.
	 */
#ifdef SMP
	if (mp_ncpus > 1) {
		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
	} else
#endif /* SMP */
	if (tsc_is_invariant) {
		tsc_timecounter.tc_quality = 1000;
	}

	/*
	 * Cap the timecounter frequency so that it fits in an int and does
	 * not overflow too fast, then find the smallest shift (at most 32)
	 * that brings tsc_freq under the cap.
	 */
	freq_limit = (uint64_t)UINT_MAX >> tsc_shift;
	shift = 0;
	while (shift <= 31 && (tsc_freq >> shift) > freq_limit)
		shift++;

	/*
	 * Pick the read routine, in order of preference:
	 * - RDTSCP when the CPU has it;
	 * - with SSE2 and more than one CPU, a fenced RDTSC: MFENCE on
	 *   AMD and Hygon, LFENCE on everything else;
	 * - plain RDTSC otherwise.
	 * Each flavour has a "low" variant used when bits are discarded.
	 */
	if ((amd_feature & AMDID_RDTSCP) != 0) {
		if (shift > 0)
			tsc_timecounter.tc_get_timecount =
			    tscp_get_timecount_low;
		else
			tsc_timecounter.tc_get_timecount = tscp_get_timecount;
	} else if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) {
		switch (cpu_vendor_id) {
		case CPU_VENDOR_AMD:
		case CPU_VENDOR_HYGON:
			if (shift > 0)
				tsc_timecounter.tc_get_timecount =
				    tsc_get_timecount_low_mfence;
			else
				tsc_timecounter.tc_get_timecount =
				    tsc_get_timecount_mfence;
			break;
		default:
			if (shift > 0)
				tsc_timecounter.tc_get_timecount =
				    tsc_get_timecount_low_lfence;
			else
				tsc_timecounter.tc_get_timecount =
				    tsc_get_timecount_lfence;
			break;
		}
	} else {
		if (shift > 0)
			tsc_timecounter.tc_get_timecount =
			    tsc_get_timecount_low;
		else
			tsc_timecounter.tc_get_timecount = tsc_get_timecount;
	}

	if (shift > 0) {
		tsc_timecounter.tc_name = "TSC-low";
		if (bootverbose)
			printf("TSC timecounter discards lower %d bit(s)\n",
			    shift);
	}

	/* Without a known frequency there is nothing more to record. */
	if (tsc_freq == 0)
		return;

	/*
	 * Record the scaled frequency and stash the shift in tc_priv.
	 * Timecounter registration is deferred until after late
	 * calibration is finished.
	 */
	tsc_timecounter.tc_frequency = tsc_freq >> shift;
	tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
}
SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
/*
 * Publish a new TSC frequency: store it in tsc_freq, then store the same
 * value shifted right by the count held in tsc_timecounter.tc_priv into
 * the timecounter's frequency.  Both stores use release semantics.
 */
static void
tsc_update_freq(uint64_t new_freq)
{
	int shift;

	atomic_store_rel_64(&tsc_freq, new_freq);
	shift = (int)(intptr_t)tsc_timecounter.tc_priv;
	atomic_store_rel_64(&tsc_timecounter.tc_frequency, new_freq >> shift);
}
+void
+tsc_init(void)	/* Probe the TSC frequency unless the TSC is absent or disabled. */
+{
+ if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
+ return;	/* CPUID reports no TSC, or tsc_disabled is set. */
+
+ probe_tsc_freq();	/* Defined outside this view; presumably sets tsc_freq -- confirm. */
+}
+
/*
 * Late calibration of the TSC frequency, run once ACPI-based timecounters
 * are available.  Timehands are not set up at this point, so the
 * highest-quality timecounter is read directly instead of going through
 * (s)binuptime().
 *
 * When the early calibration is already exact the measurement is skipped.
 * In either case the TSC timecounter is registered and the TSC is
 * installed as the cputicker.  Nothing happens if the TSC is disabled.
 */
void
tsc_calibrate(void)
{
	uint64_t measured;

	if (tsc_disabled)
		return;

	if (!tsc_early_calib_exact) {
		/* The calibration runs inside an FPU kernel section. */
		fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
		measured = clockcalib(rdtsc_ordered, "TSC");
		fpu_kern_leave(curthread, NULL);
		tsc_update_freq(measured);
	}

	tc_init(&tsc_timecounter);
	set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
}
/*
 * Re-run the SMP TSC synchronization test on resume and update the
 * timecounter quality if it changed.  A no-op on kernels built without
 * SMP.
 */
void
resume_TSC(void)
{
#ifdef SMP
	int new_quality;

	/*
	 * A TSC that was not good at boot is unlikely to become good now,
	 * and a uniprocessor system has nothing to synchronize.
	 */
	if (tsc_timecounter.tc_quality < 0 || mp_ncpus < 2)
		return;

	/*
	 * A TSC that was good should need at most one synchronization
	 * pass, but honour smp_tsc_adjust if it asks for more.
	 */
	new_quality = test_tsc(MAX(smp_tsc_adjust, 1));
	if (new_quality == tsc_timecounter.tc_quality)
		return;
	printf("TSC timecounter quality changed: %d -> %d\n",
	    tsc_timecounter.tc_quality, new_quality);
	tsc_timecounter.tc_quality = new_quality;
#endif /* SMP */
}
/*
 * Handler for a change in the set of cpufreq levels.  Looks up the (new)
 * maximum frequency and hands it to the cputicker, so that CPU accounting
 * is corrected if it started out with a lower estimate at boot.
 *
 * Only unit 0 is consulted, on the assumption that all CPUs are equal.
 * arg is unused.
 */
static void
tsc_levels_changed(void *arg, int unit)
{
	struct cf_level *lvl;
	device_t dev;
	uint64_t top_freq;
	int nlevels, rc;

	if (unit != 0)
		return;

	/* Locate the cpufreq device instance for this unit. */
	dev = devclass_get_device(devclass_find("cpufreq"), unit);
	if (dev == NULL) {
		printf("tsc_levels_changed() called but no cpufreq device?\n");
		return;
	}

	/* Room for up to 64 levels; give up quietly if memory is short. */
	nlevels = 64;
	lvl = malloc(nlevels * sizeof(*lvl), M_TEMP, M_NOWAIT);
	if (lvl == NULL)
		return;

	rc = CPUFREQ_LEVELS(dev, lvl, &nlevels);
	if (rc != 0 || nlevels == 0) {
		printf("tsc_levels_changed: no max freq found\n");
	} else {
		/* The first entry is taken as the maximum frequency. */
		top_freq = (uint64_t)lvl[0].total_set.freq * 1000000;
		set_cputicker(rdtsc, top_freq, 1);
	}
	free(lvl, M_TEMP);
}
/*
 * Pre-change handler for CPU frequency transitions.  While the TSC is the
 * active timecounter the pending change is vetoed by setting *status to
 * EBUSY.  Handling a dynamically-changing timecounter rate may become
 * possible in the future.  arg and level are unused.
 */
static void
tsc_freq_changing(void *arg, const struct cf_level *level, int *status)
{

	/* Already vetoed or failed: nothing to add. */
	if (*status != 0)
		return;
	/* The TSC is not the timecounter in use, so the change is harmless. */
	if (timecounter != &tsc_timecounter)
		return;

	printf("timecounter TSC must not be in use when "
	    "changing frequencies; change denied\n");
	*status = EBUSY;
}
/*
 * Post-change handler for CPU frequency transitions: adopt the frequency
 * of the new level as the TSC frequency.  Ignored when the TSC is
 * disabled or the transition reported an error.  arg is unused.
 */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
	uint64_t new_freq;

	if (status != 0 || tsc_disabled)
		return;

	/* The level's total setting is in MHz; scale it by 1000000. */
	new_freq = (uint64_t)level->total_set.freq * 1000000;
	tsc_update_freq(new_freq);
}
static int
sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
{
int error;
uint64_t freq;
freq = atomic_load_acq_64(&tsc_freq);
if (freq == 0)
return (EOPNOTSUPP);
error = sysctl_handle_64(oidp, &freq, 0, req);
if (error == 0 && req->newptr != NULL)
tsc_update_freq(freq);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq,
CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
0, 0, sysctl_machdep_tsc_freq, "QU",
"Time Stamp Counter frequency");
/* Timecounter read routine: return rdtsc32() with no fence; tc is unused. */
static u_int
tsc_get_timecount(struct timecounter *tc __unused)
{
	u_int count;

	count = rdtsc32();
	return (count);
}
/* Timecounter read routine: return rdtscp32(); tc is unused. */
static u_int
tscp_get_timecount(struct timecounter *tc __unused)
{
	u_int count;

	count = rdtscp32();
	return (count);
}
/*
 * Timecounter read routine for the case where low TSC bits are discarded.
 * The shift count is taken from tc->tc_priv.
 *
 * rdtsc leaves the counter in EDX:EAX; shrd then shifts EAX right by CL
 * bits, filling the vacated high bits from EDX, so the value returned is
 * the low 32 bits of the counter shifted right by the stored count.
 */
static inline u_int
tsc_get_timecount_low(struct timecounter *tc)
{
uint32_t rv;
/* The "c" input places the shift count in ECX; rdtsc writes EDX, hence the clobber. */
__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
: "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
return (rv);
}
/*
 * RDTSCP flavour of the shifted timecounter read.  The shift count is
 * taken from tc->tc_priv.
 *
 * rdtscp writes ECX as well as EDX:EAX, so the shift count cannot be
 * preloaded into CL; it is loaded from memory with movl after rdtscp has
 * executed, and shrd then shifts EAX right by CL bits, filling from EDX.
 */
static u_int
tscp_get_timecount_low(struct timecounter *tc)
{
uint32_t rv;
/* "=&a" is early-clobber: EAX is written before the "m" input is read. */
__asm __volatile("rdtscp; movl %1, %%ecx; shrd %%cl, %%edx, %0"
: "=&a" (rv) : "m" (tc->tc_priv) : "ecx", "edx");
return (rv);
}
/* Timecounter read routine: lfence() followed by rdtsc32(); tc is unused. */
static u_int
tsc_get_timecount_lfence(struct timecounter *tc __unused)
{
	u_int count;

	lfence();
	count = rdtsc32();
	return (count);
}
static u_int
tsc_get_timecount_low_lfence(struct timecounter *tc)
{
lfence();
return (tsc_get_timecount_low(tc));
}
/* Timecounter read routine: mfence() followed by rdtsc32(); tc is unused. */
static u_int
tsc_get_timecount_mfence(struct timecounter *tc __unused)
{
	u_int count;

	mfence();
	count = rdtsc32();
	return (count);
}
static u_int
tsc_get_timecount_low_mfence(struct timecounter *tc)
{
mfence();
return (tsc_get_timecount_low(tc));
}
/*
 * Fill in the vdso timehands for the TSC timecounter: select the x86 TSC
 * algorithm, export the shift stored in tc->tc_priv, and set the HPET
 * index to 0xffffffff and the pvclock fields and reserved area to zero.
 * Always returns 1.
 */
static uint32_t
x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
{
	int shift;

	shift = (int)(intptr_t)tc->tc_priv;

	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
	vdso_th->th_x86_pvc_stable_mask = 0;
	vdso_th->th_x86_pvc_last_systime = 0;
	vdso_th->th_x86_hpet_idx = 0xffffffff;
	vdso_th->th_x86_shift = shift;
	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
	return (1);
}
#ifdef COMPAT_FREEBSD32
/*
 * Fill in the 32-bit compat vdso timehands for the TSC timecounter:
 * select the x86 TSC algorithm, export the shift stored in tc->tc_priv,
 * and set the HPET index to 0xffffffff and the pvclock fields and
 * reserved area to zero.  Always returns 1.
 */
static uint32_t
x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
    struct timecounter *tc)
{
	int shift;

	shift = (int)(intptr_t)tc->tc_priv;

	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
	vdso_th32->th_x86_pvc_stable_mask = 0;
	vdso_th32->th_x86_pvc_last_systime = 0;
	vdso_th32->th_x86_hpet_idx = 0xffffffff;
	vdso_th32->th_x86_shift = shift;
	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
	return (1);
}
#endif

File Metadata

Mime Type
text/x-diff
Expires
Sun, Mar 29, 1:15 PM (1 d, 16 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28218285
Default Alt Text
(147 KB)

Event Timeline