Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F144456855
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
147 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index c629db566528..6eda6c9c8352 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1,1892 +1,1902 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <net/netisr.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif
#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
/*
* The PTI trampoline stack needs enough space for a hardware trapframe and a
* couple of scratch registers, as well as the trapframe left behind after an
* iret fault.
*/
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
offsetof(struct pti_frame, pti_rip));
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
+/* Probe 8254 PIT and TSC. */
+static void native_clock_source_init(void);
+
/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);
/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
/* Default init_ops implementation. */
struct init_ops init_ops = {
- .parse_preload_data = native_parse_preload_data,
- .early_clock_source_init = i8254_init,
+ .parse_preload_data = native_parse_preload_data,
+ .early_clock_source_init = native_clock_source_init,
.early_delay = i8254_delay,
.parse_memmap = native_parse_memmap,
};
/*
* Physical address of the EFI System Table. Stashed from the metadata hints
* passed into the kernel and used by the EFI code to call runtime services.
*/
vm_paddr_t efi_systbl_phys;
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
int cold = 1;
long Maxmem = 0;
long realmem = 0;
struct kva_md_info kmi;
struct region_descriptor r_idt;
struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
struct mtx dt_lock; /* lock for GDT and LDT */
void (*vmm_resume_p)(void);
bool efi_boot;
/*
 * cpu_startup() - SI_SUB_CPU SYSINIT handler: finish early CPU and VM
 * setup.  Works around a MacBook SMI problem, starts the RTC, prints
 * CPU/memory information, initializes the kernel submap and the buffer
 * cache, and loads the final CR0 value.
 *
 * Converted from the old K&R-style definition to an ANSI prototype to
 * match the forward declaration used by the SYSINIT above.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS value is in KB; shift to bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}
/*
 * Late pass of kernel ifunc relocation, run at SI_ORDER_ANY of
 * SI_SUB_CPU so that CPU feature identification has completed first.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
/*
 * Load the kernel's final CR0 configuration on the current CPU.
 */
void
cpu_setregs(void)
{
	register_t cr0_val;

	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0_val = rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0_val);
}
/*
* Initialize amd64 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
/*
* Software prototypes -- in more palatable form.
*
* Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
* slots as corresponding segments for i386 kernel.
*/
/*
 * Software (unpacked) forms of the GDT entries; converted to hardware
 * descriptors at boot via ssdtosd()/ssdtosyssd().  The slot count must
 * stay in sync with NGDT (asserted below).
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GNULL2_SEL 1 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUFS32_SEL 2 32 bit %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS32_SEL 3 32 bit %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 8 64 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
.ssd_type = SDT_SYSTSS,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Actually, the TSS is a system descriptor which is double size */
/* Slot 10: unused second half of the 16-byte TSS descriptor above. */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 LDT Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 12 LDT Descriptor, double size */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
/*
 * Install an interrupt gate: point IDT slot 'idx' at handler 'func'
 * with the given gate type, privilege level and IST index.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *gd;
	uintptr_t off;

	off = (uintptr_t)func;
	gd = &idt[idx];

	/* Handler address is split across the low/high offset fields. */
	gd->gd_looffset = off;
	gd->gd_hioffset = off >> 16;
	gd->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	gd->gd_ist = ist;
	gd->gd_xx = 0;
	gd->gd_type = typ;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
IDTVEC(div_pti), IDTVEC(bpt_pti),
IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
IDTVEC(fast_syscall_pti);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
/*
 * DDB "show idt": print each IDT slot whose handler differs from the
 * default 'rsvd' entry point, as index plus symbol name.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	/* Stop early if the user quits the ddb pager. */
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from the split fields. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}
/* Show privileged registers. */
/*
 * DDB "show sysregs": dump descriptor-table registers, control
 * registers, and a handful of interesting MSRs for the current CPU.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* Packed to match the 10-byte memory image written by sidt/sgdt. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* xcr0 only exists when XSAVE is enabled in CR4. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* Feature-control MSR is only architectural with VMX/SMX. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
/*
 * DDB "show dbregs": dump the hardware debug registers (dr4/dr5 are
 * aliases of dr6/dr7 and are not shown).
 */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{
	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif
/*
 * sdtossd() - unpack a hardware user segment descriptor (sd) into its
 * software representation (ssd).  Inverse of ssdtosd().
 *
 * Converted from the old K&R-style definition to an ANSI prototype.
 */
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	/* Reassemble base and limit from their split descriptor fields. */
	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}
/*
 * ssdtosd() - pack a software segment descriptor (ssd) into a hardware
 * user segment descriptor (sd).  Inverse of sdtossd().
 *
 * Converted from the old K&R-style definition to an ANSI prototype.
 */
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	/* Split base and limit across the descriptor's lo/hi fields. */
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}
/*
 * ssdtosyssd() - pack a software segment descriptor (ssd) into a
 * hardware system segment descriptor (sd), e.g. for the TSS or LDT.
 * The long/def32 attributes are not copied here.
 *
 * Converted from the old K&R-style definition to an ANSI prototype.
 */
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	/* System descriptors hold base bits 24..63 in the wide hibase. */
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}
u_int basemem;
/*
 * Insert the range [base, base + length) into the sorted physmap[]
 * base/bound pair array, coalescing with adjacent entries where
 * possible.  *physmap_idxp tracks the next free slot and is advanced
 * when a new pair is consumed.
 *
 * Returns 1 on success (including the "ignore overlap" and zero-length
 * cases), 0 when the table is full and the caller should stop.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
int *physmap_idxp)
{
int i, insert_idx, physmap_idx;

physmap_idx = *physmap_idxp;

if (length == 0)
return (1);

/*
 * Find insertion point while checking for overlap.  Start off by
 * assuming the new entry will be added to the end.
 *
 * NB: physmap_idx points to the next free slot.
 */
insert_idx = physmap_idx;
for (i = 0; i <= physmap_idx; i += 2) {
if (base < physmap[i + 1]) {
if (base + length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
/* Overlap: drop the new region rather than merging. */
return (1);
}
}

/* See if we can prepend to the next entry. */
if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
physmap[insert_idx] = base;
return (1);
}

/* See if we can append to the previous entry. */
if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += length;
return (1);
}

physmap_idx += 2;
*physmap_idxp = physmap_idx;
if (physmap_idx == PHYS_AVAIL_ENTRIES) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}

/*
 * Move the last 'N' entries down to make room for the new
 * entry if needed.
 */
for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}

/* Insert the new entry. */
physmap[insert_idx] = base;
physmap[insert_idx + 1] = base + length;
return (1);
}
/*
 * Walk a BIOS INT 15h E820 system memory map ('smapbase', 'smapsize'
 * bytes) and feed each SMAP_TYPE_MEMORY range into the physmap array.
 * Stops early if add_physmap_entry() reports the table is full.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
vm_paddr_t *physmap, int *physmap_idx)
{
struct bios_smap *smap, *smapend;

smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

for (smap = smapbase; smap < smapend; smap++) {
if (boothowto & RB_VERBOSE)
printf("SMAP type=%02x base=%016lx len=%016lx\n",
smap->type, smap->base, smap->length);

/* Only usable RAM ranges go into the physmap. */
if (smap->type != SMAP_TYPE_MEMORY)
continue;

if (!add_physmap_entry(smap->base, smap->length, physmap,
physmap_idx))
break;
}
}
/*
 * Walk the UEFI GetMemoryMap() descriptor array wrapped in 'efihdr'
 * and add usable ranges (loader/boot-services code+data and
 * conventional memory) to the physmap.  With RB_VERBOSE, also print a
 * human-readable table of every descriptor and its attributes.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
int *physmap_idx)
{
struct efi_md *map, *p;
const char *type;
size_t efisz;
int ndesc, i;
/* Names indexed by EFI memory descriptor type. */
static const char *types[] = {
"Reserved",
"LoaderCode",
"LoaderData",
"BootServicesCode",
"BootServicesData",
"RuntimeServicesCode",
"RuntimeServicesData",
"ConventionalMemory",
"UnusableMemory",
"ACPIReclaimMemory",
"ACPIMemoryNVS",
"MemoryMappedIO",
"MemoryMappedIOPortSpace",
"PalCode",
"PersistentMemory"
};

/*
 * Memory map data provided by UEFI via the GetMemoryMap
 * Boot Services API.
 */
/* Descriptors start at the header rounded up to 16 bytes. */
efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
map = (struct efi_md *)((uint8_t *)efihdr + efisz);
if (efihdr->descriptor_size == 0)
return;
ndesc = efihdr->memory_size / efihdr->descriptor_size;

if (boothowto & RB_VERBOSE)
printf("%23s %12s %12s %8s %4s\n",
"Type", "Physical", "Virtual", "#Pages", "Attr");

/*
 * Use efi_next_descriptor(): descriptor_size may exceed
 * sizeof(struct efi_md), so plain pointer arithmetic is wrong.
 */
for (i = 0, p = map; i < ndesc; i++,
p = efi_next_descriptor(p, efihdr->descriptor_size)) {
if (boothowto & RB_VERBOSE) {
if (p->md_type < nitems(types))
type = types[p->md_type];
else
type = "<INVALID>";
printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
p->md_virt, p->md_pages);
if (p->md_attr & EFI_MD_ATTR_UC)
printf("UC ");
if (p->md_attr & EFI_MD_ATTR_WC)
printf("WC ");
if (p->md_attr & EFI_MD_ATTR_WT)
printf("WT ");
if (p->md_attr & EFI_MD_ATTR_WB)
printf("WB ");
if (p->md_attr & EFI_MD_ATTR_UCE)
printf("UCE ");
if (p->md_attr & EFI_MD_ATTR_WP)
printf("WP ");
if (p->md_attr & EFI_MD_ATTR_RP)
printf("RP ");
if (p->md_attr & EFI_MD_ATTR_XP)
printf("XP ");
if (p->md_attr & EFI_MD_ATTR_NV)
printf("NV ");
if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
printf("MORE_RELIABLE ");
if (p->md_attr & EFI_MD_ATTR_RO)
printf("RO ");
if (p->md_attr & EFI_MD_ATTR_RT)
printf("RUNTIME");
printf("\n");
}

switch (p->md_type) {
case EFI_MD_TYPE_CODE:
case EFI_MD_TYPE_DATA:
case EFI_MD_TYPE_BS_CODE:
case EFI_MD_TYPE_BS_DATA:
case EFI_MD_TYPE_FREE:
/*
 * We're allowed to use any entry with these types.
 */
break;
default:
continue;
}

if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
physmap, physmap_idx))
break;
}
}
/*
 * Default init_ops.parse_memmap implementation: locate either the UEFI
 * memory map or the BIOS E820 SMAP in the loader metadata and populate
 * physmap from it.  Also records "UEFI"/"BIOS" in bootmethod.  Panics
 * if the loader supplied neither map.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
struct bios_smap *smap;
struct efi_map_header *efihdr;
u_int32_t size;

/*
 * Memory map from INT 15:E820.
 *
 * subr_module.c says:
 * "Consumer may safely assume that size value precedes data."
 * ie: an int32_t immediately precedes smap.
 */

efihdr = (struct efi_map_header *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_EFI_MAP);
smap = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (efihdr == NULL && smap == NULL)
panic("No BIOS smap or EFI map info from loader!");

/* Prefer the EFI map when both are present. */
if (efihdr != NULL) {
add_efi_map_entries(efihdr, physmap, physmap_idx);
strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
} else {
/* The u_int32_t byte count is stored just before the smap data. */
size = *((u_int32_t *)smap - 1);
bios_add_smap_entries(smap, size, physmap, physmap_idx);
strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
}
}
#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
/*
 * Build physmap[] (base/bound pairs of physical memory) from the
 * loader-provided memory map, apply MAXMEM / hw.physmem limits,
 * optionally run the boot-time memory test, and populate phys_avail[]
 * and dump_avail[] with the actually available memory.  Also bootstraps
 * pmap and maps the kernel message buffer.
 *
 * 'first' is the first free physical address after kernel/preload data.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
int i, physmap_idx, pa_indx, da_indx;
vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
u_long physmem_start, physmem_tunable, memtest;
pt_entry_t *pte;
quad_t dcons_addr, dcons_size;
int page_counter;

/*
 * Tell the physical memory allocator about pages used to store
 * the kernel and preloaded data.  See kmem_bootstrap_free().
 */
vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

bzero(physmap, sizeof(physmap));
physmap_idx = 0;
init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
/* parse_memmap leaves the next-free slot; back up to the last pair. */
physmap_idx -= 2;

/*
 * Find the 'base memory' segment for SMP
 */
basemem = 0;
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] <= 0xA0000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
if (basemem == 0 || basemem > 640) {
if (bootverbose)
printf(
"Memory map doesn't contain a basemem segment, faking it");
basemem = 640;
}

/*
 * Maxmem isn't the "maximum memory", it's one larger than the
 * highest page of the physical address space. It should be
 * called something like "Maxphyspage". We may adjust this
 * based on ``hw.physmem'' and the results of the memory test.
 */
Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
/* Compile-time cap; MAXMEM is in KB, /4 converts to 4K pages. */
Maxmem = MAXMEM / 4;
#endif

if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);

/*
 * The boot memory test is disabled by default, as it takes a
 * significant amount of time on large-memory systems, and is
 * unfriendly to virtual machines as it unnecessarily touches all
 * pages.
 *
 * A general name is used as the code may be extended to support
 * additional tests beyond the current "page present" test.
 */
memtest = 0;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

/*
 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 * in the system.
 */
if (Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);

if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);

/* call pmap initialization to make new kernel address space */
pmap_bootstrap(&first);

/*
 * Size up each available chunk of physical memory.
 *
 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 * By default, mask off the first 16 pages unless we appear to be
 * running in a VM.
 */
physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
if (physmap[0] < physmem_start) {
if (physmem_start < PAGE_SIZE)
physmap[0] = PAGE_SIZE;
else if (physmem_start >= physmap[1])
physmap[0] = round_page(physmap[1] - PAGE_SIZE);
else
physmap[0] = round_page(physmem_start);
}
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;

/*
 * Get dcons buffer address
 */
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;

/*
 * physmap is in bytes, so when converting to page boundaries,
 * round up the start address and round down the end address.
 */
page_counter = 0;
if (memtest != 0)
printf("Testing system memory");
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;

end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;

full = FALSE;
/*
 * block out kernel memory as not available.
 */
if (pa >= (vm_paddr_t)kernphys && pa < first)
goto do_dump_avail;

/*
 * block out dcons buffer
 */
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;

page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;

/*
 * Print a "." every GB to show we're making
 * progress.
 */
page_counter++;
if ((page_counter % PAGES_PER_GB) == 0)
printf(".");

/*
 * map page into kernel: valid, read/write,non-cacheable
 */
*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
invltlb();

tmp = *(int *)ptr;
/*
 * Test for alternating 1's and 0's
 */
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
 * Test for alternating 0's and 1's
 */
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
 * Test for all 1's
 */
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
 * Test for all 0's
 */
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
 * Restore original value.
 */
*(int *)ptr = tmp;

skip_memtest:
/*
 * Adjust array of valid/good pages.
 */
if (page_bad == TRUE)
continue;
/*
 * If this good page is a continuation of the
 * previous set of good pages, then just increase
 * the end pointer. Otherwise start a new chunk.
 * Note that "end" points one higher than end,
 * making the range >= start and < end.
 * If we're also doing a speculative memory
 * test and we at or past the end, bump up Maxmem
 * so that we keep going. The first bad page
 * will terminate the loop.
 */
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ENTRIES) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
/* dump_avail also covers pages excluded from phys_avail. */
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == PHYS_AVAIL_ENTRIES) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
/* Tear down the temporary test mapping. */
*pte = 0;
invltlb();
if (memtest != 0)
printf("\n");

/*
 * XXX
 * The last chunk must contain at least one page plus the message
 * buffer to avoid complicating other code (message buffer address
 * calculation, etc.).
 */
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}

Maxmem = atop(phys_avail[pa_indx]);

/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);

/* Map the message buffer. */
msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
/*
 * Default init_ops.parse_preload_data implementation: locate the
 * loader-provided module metadata at 'modulep', relocate it into KVA,
 * extract boothowto, the static kernel environment, the debugger
 * symbol table, and the EFI system table address.  Returns the kernel
 * module's metadata pointer (kmdp) for later lookups.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
caddr_t kmdp;
char *envp;
#ifdef DDB
vm_offset_t ksym_start;
vm_offset_t ksym_end;
#endif

/* Loader hands us a physical address; convert to KVA and relocate. */
preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
preload_bootstrap_relocate(KERNBASE);
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
if (envp != NULL)
envp += KERNBASE;
init_static_kenv(envp, 0);
#ifdef DDB
ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
/* Stash the EFI system table for later runtime-services use. */
efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
return (kmdp);
}
+static void
+native_clock_source_init(void)
+{
+ i8254_init();
+ tsc_init();
+}
+
/*
 * Initialize the kernel debugger framework and, if the loader passed
 * RB_KDB, drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{

kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
/* Set up the fast syscall stuff */
/*
 * Configure the SYSCALL/SYSRET fast system call MSRs on the current
 * CPU: enable SCE in EFER, point LSTAR/CSTAR at the 64-bit and 32-bit
 * entry points (PTI variant for LSTAR when page-table isolation is
 * on), load the STAR selector pairs, and set the RFLAGS bits that are
 * masked off on kernel entry.
 */
void
amd64_conf_fast_syscall(void)
{
uint64_t msr;

msr = rdmsr(MSR_EFER) | EFER_SCE;
wrmsr(MSR_EFER, msr);
wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
(u_int64_t)IDTVEC(fast_syscall));
wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
/* Kernel CS/SS base in bits 32..47, user base in bits 48..63. */
msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
wrmsr(MSR_STAR, msr);
wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
/*
 * First-stage BSP per-CPU area setup: wire up curthread, the TSS/LDT
 * descriptor pointers into the GDT, the 32-bit %fs/%gs descriptor
 * pointers, and initial PTI/TLB state.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
struct user_segment_descriptor *gdt;

PCPU_SET(prvspace, pc);
gdt = *PCPU_PTR(gdt);
PCPU_SET(curthread, &thread0);
PCPU_SET(tssp, PCPU_PTR(common_tss));
PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
PCPU_SET(smp_tlb_gen, 1);
}
/*
 * Second-stage BSP per-CPU setup, run once thread0's kernel stack is
 * known: record rsp0, derive the 16-byte-aligned top of the PTI
 * trampoline stack, and set curpcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

PCPU_SET(rsp0, rsp0);
PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
PCPU_SET(curpcb, thread0.td_pcb);
}
/*
 * Point the BSP's TSS IST entries at their dedicated exception stacks
 * (ist1=double fault, ist2=NMI, ist3=MC#, ist4=DB#).  A struct
 * nmi_pcpu holding the pcpu pointer is placed at the top of each
 * stack so the handler can recover its per-CPU data.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
struct nmi_pcpu *np;
struct amd64tss *tssp;

tssp = &pc->pc_common_tss;

/* doublefault stack space, runs on ist1 */
np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist1 = (long)np;

/*
 * NMI stack, runs on ist2. The pcpu pointer is stored just
 * above the start of the ist2 stack.
 */
np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist2 = (long)np;

/*
 * MC# stack, runs on ist3. The pcpu pointer is stored just
 * above the start of the ist3 stack.
 */
np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist3 = (long)np;

/*
 * DB# stack, runs on ist4.
 */
np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
np->np_pcpu = (register_t)pc;
tssp->tss_ist4 = (long)np;
}
/*
 * hammer_time() is the amd64 early machine-dependent entry point,
 * called from locore on the BSP, still on the loader-provided page
 * tables and stack.  It performs all early MD initialization and
 * returns the top of thread0's kernel stack for locore to switch onto.
 *
 * modulep:  address (via the pre-created 1:1 mapping) of the preloaded
 *           module metadata.
 * physfree: first physical address past the loader-used memory,
 *           available for early allocations; bumped as this function
 *           carves out stacks and per-CPU areas.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t cr3, rsp0;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	/*
	 * Calculate kernphys by inspecting page table created by loader.
	 * The assumptions:
	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
	 *   aligned at 2M, below 4G (the latter is important for AP startup)
	 * - there is a 2M hole at KERNBASE
	 * - kernel is mapped with 2M superpages
	 * - all participating memory, i.e. kernel, modules, metadata,
	 *   page table is accessible by pre-created 1:1 mapping
	 *   (right now loader creates 1:1 mapping for lower 4G, and all
	 *   memory is from there)
	 * - there is a usable memory block right after the end of the
	 *   mapped kernel and all modules/metadata, pointed to by
	 *   physfree, for early allocations
	 */

	/* Walk the loader's page table at this function's own address. */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
	    (vm_offset_t)hammer_time);
	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
	    (vm_offset_t)hammer_time);
	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
	    (vm_offset_t)hammer_time);
	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
	/* Fix-up for 2M hole */
	physfree += kernphys;
	kernphys += NBPDR;

	kmdp = init_ops.parse_preload_data(modulep);

	/* An EFI memory map in the metadata implies a UEFI boot. */
	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	/* Load CPU microcode; may consume early physical memory. */
	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the early free memory. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early. Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System descriptors (TSS, user LDT) occupy two slots each. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	/* Dynamic per-CPU area for the BSP, also from early memory. */
	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	/* DB#, NMI, DF# and MC# use dedicated IST stacks (4, 2, 1, 3). */
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation tunables (legacy and new names). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
/*
 * MD hook for per-CPU structure initialization.  The ACPI id is filled
 * in later by platform enumeration; mark it invalid until then.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
/*
 * sysctl machdep.smap: export the BIOS SMAP (e820 memory map) recorded
 * by the loader, one struct bios_smap_xattr per entry.  Returns 0 with
 * no output if the loader did not provide a map.
 *
 * Fixes: guard against missing kernel metadata (kmdp == NULL) before
 * passing it to preload_search_info(), and stop copying out entries as
 * soon as SYSCTL_OUT() reports an error instead of continuing the loop.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	if (kmdp == NULL)
		return (0);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	/* Per-entry extended attributes are optional. */
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The metadata record is prefixed with its 32-bit byte size. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
		if (error != 0)
			break;
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
smap_sysctl_handler, "S,bios_smap_xattr",
"Raw BIOS SMAP data");
/*
 * sysctl machdep.efi_map: export the raw EFI memory map passed in by
 * the loader, if any.  Returns 0 with no output when absent.
 *
 * Fix: guard against missing kernel metadata (kmdp == NULL) before
 * passing it to preload_search_info().
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	if (kmdp == NULL)
		return (0);
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	/* The metadata record is prefixed with its 32-bit byte size. */
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
efi_map_sysctl_handler, "S,efi_map_header",
"Raw EFI Memory Map");
/*
 * Enter a spinlock section.  On the outermost entry interrupts are
 * disabled, the previous interrupt state is saved in the thread, and a
 * critical section is entered; nested entries only bump the per-thread
 * count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count != 0) {
		/* Already inside a spinlock section; just nest. */
		td->td_md.md_spinlock_count++;
		return;
	}
	flags = intr_disable();
	td->td_md.md_spinlock_count = 1;
	td->td_md.md_saved_flags = flags;
	critical_enter();
}
/*
 * Leave a spinlock section.  Only when the outermost section is exited
 * is the critical section left and the saved interrupt state restored.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t saved;

	td = curthread;
	saved = td->td_md.md_saved_flags;
	if (--td->td_md.md_spinlock_count != 0)
		return;
	critical_exit();
	intr_restore(saved);
}
/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap()
 * where we want to start a backtrace from the function that caused us
 * to enter the debugger.  The PCB does not have to be perfect, only
 * complete enough for a backtrace: the instruction, stack and frame
 * pointers plus the callee-saved registers.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
}
/*
 * The pcb_flags is only modified by current thread, or by other threads
 * when current thread is stopped.  However, current thread may change it
 * from the interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, compiler may generate
 * code that is not atomic regarding the interrupt handler.  If a trap or
 * interrupt happens and any flag is modified from the handler, it can be
 * clobbered with the cached value later.  Therefore, we implement setting
 * and clearing flags with single-instruction functions, which do not race
 * with possible modification of the flags from the trap or interrupt context,
 * because traps and interrupts are executed only on instruction boundary.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/* One "orl": the RMW cannot be split by a trap or interrupt. */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which have a consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	/* Only the 0 -> 1 transition of PCB_FULL_IRET needs the save. */
	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		/* Re-check with interrupts off: a trap may have set it. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
/*
 * Resolve set_pcb_flags() at boot: use the FSGSBASE-aware variant only
 * when the CPU advertises the RDFSBASE/WRFSBASE instruction family.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}
/*
 * Clear bits in pcb_flags with a single "andl"; see the comment above
 * set_pcb_flags_raw() for why this must be one instruction.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

/* Debugger-callable wrapper: read one byte from an I/O port. */
u_char
inb_(u_short port)
{
	return inb(port);
}

/* Debugger-callable wrapper: write one byte to an I/O port. */
void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */
/*
 * Drop the macro versions so the definitions/resolvers below refer to
 * the real symbols, and declare the plain and ERMS ("rep movsb/stosb")
 * implementations that the ifunc resolvers choose between.
 */
#undef memset
#undef memmove
#undef memcpy

void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
/* Resolve at boot: prefer the ERMS variants when the CPU has them. */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif
/*
 * pagezero(): zero one page, resolved at boot to the ERMS variant when
 * the CPU advertises it, like the mem* resolvers above.
 */
void pagezero_std(void *addr);
void pagezero_erms(void *addr);
DEFINE_IFUNC(, void , pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}
diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c
index ee6752861c9e..6913c0691fd4 100644
--- a/sys/i386/i386/machdep.c
+++ b/sys/i386/i386/machdep.c
@@ -1,1868 +1,1877 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 2018 The FreeBSD Foundation
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_apic.h"
#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <isa/rtc.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/sysarch.h>
#include <machine/trap.h>
#include <x86/ucode.h>
#include <machine/vm86.h>
#include <x86/init.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif
#ifdef DEV_APIC
#include <x86/apicvar.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
register_t init386(int first);
void dblfault_handler(void);
void identify_cpu(void);
static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
int _udatasel, _ucodesel;
u_int basemem;
static int above4g_allow = 1;
static int above24g_allow = 0;
int cold = 1;
long Maxmem = 0;
long realmem = 0;
#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
+static void i386_clock_source_init(void);
+
struct mtx icu_lock;
struct mem_range_softc mem_range_softc;
extern char start_exceptions[], end_exceptions[];
extern struct sysentvec elf32_freebsd_sysvec;
/* Default init_ops implementation. */
struct init_ops init_ops = {
- .early_clock_source_init = i8254_init,
+ .early_clock_source_init = i386_clock_source_init,
.early_delay = i8254_delay,
};
+static void
+i386_clock_source_init(void)
+{
+ i8254_init();
+ tsc_init();
+}
+
/*
 * Late (SI_SUB_CPU) BSP startup: apply the MacBook legacy-USB SMI
 * quirk, start the RTC, print CPU and memory information, initialize
 * the kernel VM submaps and the buffer cache, and load the final
 * control-register state via cpu_setregs().
 *
 * Fix: converted from the obsolescent K&R identifier-list definition
 * ("cpu_startup(dummy) void *dummy;") to an ANSI prototype-style
 * definition, matching the declaration earlier in the file; K&R
 * definitions are removed in C23.
 */
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}
/*
 * Load the final %cr0 state and the kernel's %gs selector.
 *
 * CR0_MP and CR0_TS make both ESC (NPX) and WAIT instructions trap,
 * which is required for lazy FPU context switching; CR0_EM alone would
 * not make WAIT trap, and trapping WAIT matters so the "wait" variants
 * of FP control instructions keep working after FP context switches
 * (and at all when there is no NPX).  CR0_NE selects native FPU error
 * reporting where supported (it should fail or do nothing on lesser
 * processors).  CR0_WP and CR0_AM are set as well.
 */
void
cpu_setregs(void)
{

	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
	load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
struct mtx dt_lock; /* lock for GDT and LDT */
union descriptor gdt0[NGDT]; /* initial global descriptor table */
union descriptor *gdt = gdt0; /* global descriptor table */
union descriptor *ldt; /* local descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static struct i386tss *dblfault_tss;
static char *dblfault_stack;
static struct i386tss common_tss0;
vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = 0,
.ssd_limit = sizeof(union descriptor) * NLDT - 1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = 0,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = 0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
size_t setidt_disp;
/*
 * Install an interrupt gate for vector idx.  Non-NULL handler addresses
 * are biased by setidt_disp, the displacement applied to trampoline
 * entry points; a NULL handler installs an offset of zero.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
{
	uintptr_t off;

	if (func != NULL)
		off = (uintptr_t)func + setidt_disp;
	else
		off = 0;
	setidt_nodisp(idx, off, typ, dpl, selec);
}
/*
 * Fill in IDT slot idx with the raw (already-displaced) handler offset,
 * selector, gate type and privilege level, and mark it present.
 */
void
setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec)
{
	struct gate_descriptor *ip;

	ip = &idt[idx];
	/* The 32-bit offset is split across the low and high halves. */
	ip->gd_looffset = off;
	ip->gd_hioffset = ((u_int)off) >> 16 ;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
IDTVEC(xen_intr_upcall),
#endif
IDTVEC(int0x80_syscall);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func, func_trm;
bool trm;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
if (ip->gd_type == SDT_SYSTASKGT) {
db_printf("%3d\t<TASK>\n", idx);
} else {
func = (ip->gd_hioffset << 16 | ip->gd_looffset);
if (func >= PMAP_TRM_MIN_ADDRESS) {
func_trm = func;
func -= setidt_disp;
trm = true;
} else
trm = false;
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
if (trm)
db_printf(" (trampoline %#x)",
func_trm);
db_printf("\n");
}
}
ip++;
}
}
/*
 * Show privileged registers: descriptor-table registers, task/LDT
 * selectors, control registers, and selected MSRs.  MSRs are only read
 * when the corresponding CPU feature bits indicate they exist.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	/* ridt()/rgdt() return base in the high 32 bits, limit in the low 16. */
	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
	/* %xcr0 is only accessible when XSAVE is enabled in %cr4. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016llx\n", rxcr(0));
	if (amd_feature & (AMDID_NX | AMDID_LM))
		db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016llx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	if (((cpu_vendor_id == CPU_VENDOR_INTEL ||
	    cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
	if (cpu_feature & CPUID_PAT)
		db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
}
/* Show the hardware debug registers (dr0-dr3, dr6 status, dr7 control). */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%08x\n", rdr0());
	db_printf("dr1\t0x%08x\n", rdr1());
	db_printf("dr2\t0x%08x\n", rdr2());
	db_printf("dr3\t0x%08x\n", rdr3());
	db_printf("dr6\t0x%08x\n", rdr6());
	db_printf("dr7\t0x%08x\n", rdr7());
}
/*
 * Display an i386 trapframe.  With an address argument, that address is
 * interpreted as a trapframe pointer; otherwise curthread's frame is
 * shown.
 */
DB_SHOW_COMMAND(frame, db_show_frame)
{
	struct trapframe *frame;

	frame = have_addr ? (struct trapframe *)addr : curthread->td_frame;
	printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n",
	    frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs,
	    frame->tf_eip);
	printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno);
	printf("ds %#x es %#x fs %#x\n",
	    frame->tf_ds, frame->tf_es, frame->tf_fs);
	printf("eax %#x ecx %#x edx %#x ebx %#x\n",
	    frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx);
	printf("ebp %#x esi %#x edi %#x\n",
	    frame->tf_ebp, frame->tf_esi, frame->tf_edi);
}
#endif
/*
 * Unpack a hardware segment descriptor into its software representation.
 * The split hi/lo base and limit fields are reassembled into full
 * values; the attribute bits are copied verbatim.
 *
 * (Converted from the obsolete K&R identifier-list definition, which is
 * removed in C23, to a prototype-style definition; no behavior change.)
 */
void
sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
/*
 * Insert the physical range [base, base + length) into the sorted table
 * of base/end pairs at (physmap, *physmap_idxp).  Memory above the
 * addressing limit (4G without PAE; 24G or unlimited with PAE depending
 * on the above*_allow tunables) is trimmed or dropped.  Adjacent ranges
 * are coalesced.  Returns 0 only when the table is full; returns 1
 * otherwise, including when the entry was ignored.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	uint64_t lim, ign;
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	lim = 0x100000000;					/*  4G */
	if (pae_mode && above4g_allow)
		lim = above24g_allow ? -1ULL : 0x600000000;	/* 24G */
	if (base >= lim) {
		/* Entirely above the limit: drop the whole range. */
		printf("%uK of memory above %uGB ignored, pae %d "
		    "above4g_allow %d above24g_allow %d\n",
		    (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode,
		    above4g_allow, above24g_allow);
		return (1);
	}
	if (base + length >= lim) {
		/* Straddles the limit: trim the part above it. */
		ign = base + length - lim;
		length -= ign;
		printf("%uK of memory above %uGB ignored, pae %d "
		    "above4g_allow %d above24g_allow %d\n",
		    (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode,
		    above4g_allow, above24g_allow);
	}

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
/*
 * Add one BIOS SMAP entry to the physical memory map.  Only plain
 * SMAP_TYPE_MEMORY ranges are inserted; other types are logged (when
 * booting verbose) and skipped.  Returns 0 only when the physmap table
 * overflows in add_physmap_entry().
 */
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
	int ok;

	if (boothowto & RB_VERBOSE)
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);

	ok = 1;
	if (smap->type == SMAP_TYPE_MEMORY)
		ok = add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idxp);
	return (ok);
}
/*
 * Walk a loader-supplied INT 15:E820 memory map and add each entry to
 * the physmap table, stopping early if the table fills up.
 */
static void
add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	struct bios_smap *smap;
	uintptr_t smapend;
	u_int32_t smapsize;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes SMAP.
	 */
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (uintptr_t)smapbase + smapsize;

	for (smap = smapbase; (uintptr_t)smap < smapend; smap++) {
		if (!add_smap_entry(smap, physmap, physmap_idxp))
			break;
	}
}
/*
 * Sanity-check the BIOS-reported base memory size (it can never exceed
 * the 640K real-mode maximum) and hand it to the pmap layer.
 */
static void
basemem_setup(void)
{

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
		    basemem);
		basemem = 640;
	}
	pmap_basemem_setup(basemem);
}
/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long memtest;
	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
	quad_t dcons_addr, dcons_size, physmem_tunable;
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase;
	caddr_t kmdp;

	has_smap = 0;
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first));

	TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow);
	TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow);

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		add_smap_entries(smapbase, physmap, &physmap_idx);
		has_smap = 1;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need use the SMAP to determine the base memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12 == 0) {
		/* Use INT12 to determine base memory size. */
		vm86_intcall(0x12, &vmf);
		basemem = vmf.vmf_ax;
		basemem_setup();
	}

	/*
	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
	 * the kernel page table so we can use it as a buffer.  The
	 * kernel will unmap this page later.
	 */
	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1));
	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));

	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = sizeof(struct bios_smap);
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		has_smap = 1;
		if (!add_smap_entry(smap, physmap, &physmap_idx))
			break;
	} while (vmf.vmf_ebx != 0);	/* ebx == 0 marks the last entry */

have_smap:
	/*
	 * If we didn't fetch the "base memory" size from INT12,
	 * figure it out from the SMAP (or just guess).
	 */
	if (basemem == 0) {
		/* The range starting at physical 0 is the base memory. */
		for (i = 0; i <= physmap_idx; i += 2) {
			if (physmap[i] == 0x00000000) {
				basemem = physmap[i + 1] / 1024;
				break;
			}
		}

		/* XXX: If we couldn't find basemem from SMAP, just guess. */
		if (basemem == 0)
			basemem = 640;
		basemem_setup();
	}

	if (physmap[1] != 0)
		goto physmap_done;

	/*
	 * If we failed to find an SMAP, figure out the extended
	 * memory size.  We will then build a simple memory map with
	 * two segments, one for "base memory" and the second for
	 * "extended memory".  Note that "extended memory" starts at a
	 * physical address of 1MB and that both basemem and extmem
	 * are in units of 1KB.
	 *
	 * First, try to fetch the extended memory size via INT 15:E801.
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		/* cx = KB between 1M-16M, dx = 64K blocks above 16M. */
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
		/*
		 * If INT15:E801 fails, this is our last ditch effort
		 * to determine the extended memory size.  Currently
		 * we prefer the RTC value over INT15:88.
		 */
#if 0
		vmf.vmf_ah = 0x88;
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 * chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

physmap_done:
	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	alloc_ap_trampoline(physmap, &physmap_idx);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 *
	 * This is especially confusing when it is much larger than the
	 * memory size and is displayed as "realmem".
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
	 * the amount of memory in the system.
	 */
	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N);

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	pmap_cmap3(0, 0);

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}
/*
 * Initialize the kernel debugger: load the symbol tables recorded in
 * bootinfo (DDB only), run generic kdb initialization, and enter the
 * debugger immediately when the boot flags request it.
 */
static void
i386_kdb_init(void)
{

#ifdef DDB
	db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab, 0);
#endif
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
/*
 * Relocate every interrupt/trap gate in the IDT by setidt_disp so the
 * gates point at the trampoline copies of the exception stubs.  Asserts
 * that each original offset lies within [start_exceptions,
 * end_exceptions) and each relocated offset within the trampoline
 * region.  Called from machdep_init_trampoline().
 */
static void
fixup_idt(void)
{
	struct gate_descriptor *ip;
	uintptr_t off;
	int x;

	for (x = 0; x < NIDT; x++) {
		ip = &idt[x];
		/* Only 386 interrupt and trap gates carry code offsets. */
		if (ip->gd_type != SDT_SYS386IGT &&
		    ip->gd_type != SDT_SYS386TGT)
			continue;
		off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16);
		KASSERT(off >= (uintptr_t)start_exceptions &&
		    off < (uintptr_t)end_exceptions,
		    ("IDT[%d] type %d off %#x", x, ip->gd_type, off));
		off += setidt_disp;
		MPASS(off >= PMAP_TRM_MIN_ADDRESS &&
		    off < PMAP_TRM_MAX_ADDRESS);
		ip->gd_looffset = off;
		ip->gd_hioffset = off >> 16;
	}
}
/*
 * Install the boot-time IDT: default every vector to the 'rsvd' stub,
 * then fill in the CPU exception vectors and the int 0x80 syscall gate.
 * Gates that user mode must be able to raise directly (breakpoint,
 * overflow, syscall, dtrace return) are DPL 3 (SEL_UPL); the double
 * fault vector is a task gate through the GPANIC_SEL TSS.
 */
static void
i386_setidt1(void)
{
	int x;

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL,
	    SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT,
	    SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall),
	    SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret),
	    SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
}
/*
 * Re-install the #UD and #GP gates.  Called from init386() after
 * finishidentcpu(); NOTE(review): presumably these vectors were
 * temporarily repurposed during CPU identification -- confirm against
 * finishidentcpu().
 */
static void
i386_setidt2(void)
{

	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
}
#if defined(DEV_ISA) && !defined(DEV_ATPIC)
/*
 * Point the ICU spurious interrupt vectors (IRQ 7 and IRQ 15) at the
 * APIC spurious interrupt handler.  Used when the ATPIC driver is not
 * compiled in (see the caller in init386()).
 */
static void
i386_setidt3(void)
{

	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
}
#endif
/*
 * init386(first) - machine-dependent bootstrap, called from locore with
 * 'first' the first free physical page after the kernel and preloaded
 * data.  Sets up thread0's stack and pcb, preload metadata and the
 * static kernel environment, the boot GDT/IDT/TSS, per-CPU data, the
 * PIC, the console/debugger, and sizes physical memory.  Returns the
 * address of thread0's pcb, which locore uses as the kernel stack top.
 */
register_t
init386(int first)
{
	struct region_descriptor r_gdt, r_idt;	/* table descriptors */
	int gsel_tss, metadata_missing, x, pa;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	caddr_t kmdp;
	vm_offset_t addend;
	size_t ucode_len;
	int late_console;

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = TD0_KSTACK_PAGES;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	if (bootinfo.bi_modulep) {
		metadata_missing = 0;
		/* Loader pointers below KERNBASE are physical; relocate. */
		addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ?
		    PMAP_MAP_LOW : 0;
		preload_metadata = (caddr_t)bootinfo.bi_modulep + addend;
		preload_bootstrap_relocate(addend);
	} else {
		metadata_missing = 1;
	}

	if (bootinfo.bi_envp != 0) {
		addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ?
		    PMAP_MAP_LOW : 0;
		init_static_kenv((char *)bootinfo.bi_envp + addend, 0);
	} else {
		init_static_kenv(NULL, 0);
	}

	/*
	 * Re-evaluate CPU features if we loaded a microcode update.
	 */
	ucode_len = ucode_load_bsp(first);
	if (ucode_len != 0) {
		identify_cpu();
		/* The update image stays resident; skip past it. */
		first = roundup2(first + ucode_len, PAGE_SIZE);
	}

	identify_hypervisor();

	/* Init basic tunables, hz etc */
	init_param1();

	/* Set bootmethod to BIOS: it's the only supported on i386. */
	strlcpy(bootmethod, "BIOS", sizeof(bootmethod));

	/*
	 * Make gdt memory segments. All segments cover the full 4GB
	 * of address space and permissions are enforced at page level.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt0[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1;
	r_gdt.rd_base = (int)gdt0;
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
	lgdt(&r_gdt);

	pcpu_init(pc, 0, sizeof(struct pcpu));
	/* Identity-map and hand out the dynamic per-CPU area. */
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa, pa);
	dpcpu_init((void *)first, 0);
	first += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	      section, to set pcpu->ipending (etc...) properly, we
	 *	      must be able to get the icu lock, so it can't be
	 *	      under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	i386_setidt1();

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	finishidentcpu();	/* Final stage of CPU initialization */

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	i386_setidt2();
	pmap_set_nx();
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/* Initialize the tss (except for the final esp0) early for vm86. */
	common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages *
	    PAGE_SIZE - VM86_STACK_SPACE;
	common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	common_tss0.tss_ioopt = sizeof(struct i386tss) << 16;
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	/* Initialize the PIC early for vm86 calls. */
#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	i386_setidt3();
#endif
#endif

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		i386_kdb_init();
	}

	kmdp = preload_search_by_type("elf kernel");
	link_elf_ireloc(kmdp);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

	if (late_console)
		i386_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	npxinit(true);
	/*
	 * Set up thread0 pcb after npxinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	PCPU_SET(curpcb, thread0.td_pcb);
	/* Move esp0 in the tss to its final place. */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE;
	PCPU_SET(kesp0, common_tss0.tss_esp0);
	gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;	/* clear busy bit */
	ltr(gsel_tss);

	/* transfer to user mode */
	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = pmap_get_kcr3();
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;

#ifdef FDT
	x86_init_fdt();
#endif

	/* Location of kernel stack for locore */
	return ((register_t)thread0.td_pcb);
}
/*
 * Once the VM system is up, move the GDT, TSS, IDT, double-fault TSS and
 * LDT into trampoline memory, copy the exception entry points there, and
 * record the relocation displacement in setidt_disp (fixed up into the
 * IDT by fixup_idt()).  Runs at SI_SUB_VM via the SYSINIT below.
 */
static void
machdep_init_trampoline(void)
{
	struct region_descriptor r_gdt, r_idt;
	struct i386tss *tss;
	char *copyout_buf, *trampoline, *tramp_stack_base;
	int x;

	/* Allocate GDT space for all CPUs, seeded from the boot GDT. */
	gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus,
	    M_NOWAIT | M_ZERO);
	bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT);
	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int)gdt;
	lgdt(&r_gdt);

	/* TSS space for all CPUs, seeded from the boot TSS. */
	tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus,
	    M_NOWAIT | M_ZERO);
	bcopy(&common_tss0, tss, sizeof(struct i386tss));
	gdt[GPROC0_SEL].sd.sd_lobase = (int)tss;
	gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24;
	gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;

	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tssp, tss);
	ltr(GSEL(GPROC0_SEL, SEL_KPL));

	/* Copy the exception entry points into trampoline memory. */
	trampoline = pmap_trm_alloc(end_exceptions - start_exceptions,
	    M_NOWAIT);
	bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions);
	tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
	PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ -
	    VM86_STACK_SPACE);
	tss[0].tss_esp0 = PCPU_GET(trampstk);

	idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO);
	bcopy(idt0, idt, sizeof(idt0));

	/* Re-initialize new IDT since the handlers were relocated */
	setidt_disp = trampoline - start_exceptions;
	fixup_idt();

	r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
	r_idt.rd_base = (int)idt;
	lidt(&r_idt);

	/* dblfault TSS */
	dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO);
	dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT);
	dblfault_tss->tss_esp = dblfault_tss->tss_esp0 =
	    dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 =
	    (int)dblfault_stack + PAGE_SIZE;
	dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 =
	    dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss->tss_cr3 = pmap_get_kcr3();
	dblfault_tss->tss_eip = (int)dblfault_handler;
	dblfault_tss->tss_eflags = PSL_KERNEL;
	dblfault_tss->tss_ds = dblfault_tss->tss_es =
	    dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss;
	gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24;

	/* make ldt memory segments */
	ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT,
	    M_NOWAIT | M_ZERO);
	gdt[GLDT_SEL].sd.sd_lobase = (int)ldt;
	gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24;
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < nitems(ldt_segs); x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);
	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* Per-CPU scratch buffer used by the trampoline copyout path. */
	copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
	PCPU_SET(copyout_buf, copyout_buf);
	copyout_init_tramp();
}
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL);
#ifdef COMPAT_43
static void
i386_setup_lcall_gate(void)
{
struct sysentvec *sv;
struct user_segment_descriptor desc;
u_int lcall_addr;
sv = &elf32_freebsd_sysvec;
lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp;
bzero(&desc, sizeof(desc));
desc.sd_type = SDT_MEMERA;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_def32 = 1;
desc.sd_gran = 1;
desc.sd_lolimit = 0xffff;
desc.sd_hilimit = 0xf;
desc.sd_lobase = lcall_addr;
desc.sd_hibase = lcall_addr >> 24;
bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc));
}
SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL);
#endif
/*
 * Machine-dependent per-CPU structure initialization.  The ACPI id is
 * set to an all-ones marker here; NOTE(review): presumably overwritten
 * once platform CPU enumeration runs -- confirm against the ACPI code.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
/*
 * Sysctl handler exporting the raw BIOS SMAP (with optional extended
 * attributes) supplied by the loader as machdep.smap.
 *
 * Returns 0 when the loader provided no SMAP metadata; otherwise the
 * status of the copy-out.  Fix: stop the loop on the first SYSCTL_OUT
 * failure instead of continuing to attempt copy-outs that can no longer
 * succeed.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* subr_module.c: a 32-bit size immediately precedes the SMAP data. */
	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
		if (error != 0)
			break;
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");
/*
 * Enter a spinlock section: on first entry disable interrupts, save the
 * previous interrupt state, and enter a critical section; nested entries
 * only bump the count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count != 0) {
		/* Already in a spinlock section; just nest. */
		td->td_md.md_spinlock_count++;
		return;
	}
	/* Interrupts must be off before the state is recorded. */
	flags = intr_disable();
	td->td_md.md_spinlock_count = 1;
	td->td_md.md_saved_flags = flags;
	critical_enter();
}
/*
 * Leave a spinlock section; the outermost exit leaves the critical
 * section and restores the interrupt state saved by spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t saved;

	td = curthread;
	/* Capture the saved state before the count can reach zero. */
	saved = td->td_md.md_saved_flags;
	if (--td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(saved);
	}
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Pentium F00F workaround: relocate the IDT so that the descriptor for
 * vector 6 (#UD) ends exactly at a page boundary, then map that page
 * read-only.  Applied only when has_f00f_bug was detected.
 */
static void
f00f_hack(void *unused)
{
	struct region_descriptor r_idt;
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	/* Three pages so a page-aligned two-page window is guaranteed. */
	tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO);
	if (tmp == 0)
		panic("kmem_malloc returned 0");
	tmp = round_page(tmp);

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor *)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	r_idt.rd_limit = sizeof(idt0) - 1;
	lidt(&r_idt);
	/* SMP machines do not need the F00F hack. */
	idt = new_idt;
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 *
 * tf  - source trapframe.
 * pcb - destination pcb; callee-saved registers and %eip/%esp/%gs are
 *       filled in, other fields are left untouched.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	/*
	 * Traps from user mode (ISPL != 0) carry %esp in the frame; for
	 * kernel-mode traps compute it from the frame end.  NOTE(review):
	 * the -8 presumably backs over the esp/ss slots that a kernel
	 * trap does not push -- confirm against the trapframe layout.
	 */
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
	pcb->pcb_gs = rgs();
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions, thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

/*
 * Read one byte from I/O port 'port'.  Function wrapper so the debugger
 * can call the normally-inline inb().  (return value parenthesized for
 * consistency with the rest of this file.)
 */
u_char
inb_(u_short port)
{
	return (inb(port));
}

/*
 * Write 'data' to I/O port 'port'.  Function wrapper so the debugger can
 * call the normally-inline outb().
 */
void
outb_(u_short port, u_char data)
{
	outb(port, data);
}
#endif /* KDB */
diff --git a/sys/x86/include/clock.h b/sys/x86/include/clock.h
index 83c8351ed31c..9aeccadf89aa 100644
--- a/sys/x86/include/clock.h
+++ b/sys/x86/include/clock.h
@@ -1,48 +1,49 @@
/*-
* Kernel interface to machine-dependent clock driver.
* Garrett Wollman, September 1994.
* This file is in the public domain.
*
* $FreeBSD$
*/
#ifndef _MACHINE_CLOCK_H_
#define _MACHINE_CLOCK_H_
#ifdef _KERNEL
/*
* i386 to clock driver interface.
* XXX large parts of the driver and its interface are misplaced.
*/
extern int clkintr_pending;
extern u_int i8254_freq;
extern int i8254_max_count;
extern uint64_t tsc_freq;
extern int tsc_is_invariant;
extern int tsc_perf_stat;
#ifdef SMP
extern int smp_tsc;
#endif
void i8254_init(void);
void i8254_delay(int);
void clock_init(void);
void lapic_calibrate(void);
+void tsc_init(void);
void tsc_calibrate(void);
/*
* Driver to clock driver interface.
*/
void startrtclock(void);
-void init_TSC(void);
+void start_TSC(void);
void resume_TSC(void);
#define HAS_TIMER_SPKR 1
int timer_spkr_acquire(void);
int timer_spkr_release(void);
void timer_spkr_setfreq(int freq);
#endif /* _KERNEL */
#endif /* !_MACHINE_CLOCK_H_ */
diff --git a/sys/x86/isa/clock.c b/sys/x86/isa/clock.c
index 1178d35979c1..f21f847709cd 100644
--- a/sys/x86/isa/clock.c
+++ b/sys/x86/isa/clock.c
@@ -1,659 +1,659 @@
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1990 The Regents of the University of California.
* Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz and Don Ahn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)clock.c 7.2 (Berkeley) 5/12/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Routines to handle clock hardware.
*/
#include "opt_clock.h"
#include "opt_isa.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/kdb.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/rman.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/timeet.h>
#include <sys/timetc.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <machine/ppireg.h>
#include <machine/timerreg.h>
#include <x86/apicvar.h>
#include <x86/init.h>
#include <isa/rtc.h>
#ifdef DEV_ISA
#include <isa/isareg.h>
#include <isa/isavar.h>
#endif
int clkintr_pending;
#ifndef TIMER_FREQ
#define TIMER_FREQ 1193182
#endif
u_int i8254_freq = TIMER_FREQ;
TUNABLE_INT("hw.i8254.freq", &i8254_freq);
int i8254_max_count;
static int i8254_timecounter = 1;
static struct mtx clock_lock;
static struct intsrc *i8254_intsrc;
static uint16_t i8254_lastcount;
static uint16_t i8254_offset;
static int (*i8254_pending)(struct intsrc *);
static int i8254_ticked;
struct attimer_softc {
int intr_en;
int port_rid, intr_rid;
struct resource *port_res;
struct resource *intr_res;
void *intr_handler;
struct timecounter tc;
struct eventtimer et;
int mode;
#define MODE_STOP 0
#define MODE_PERIODIC 1
#define MODE_ONESHOT 2
uint32_t period;
};
static struct attimer_softc *attimer_sc = NULL;
static int timer0_period = -2;
static int timer0_mode = 0xffff;
static int timer0_last = 0xffff;
/* Values for timerX_state: */
#define RELEASED 0
#define RELEASE_PENDING 1
#define ACQUIRED 2
#define ACQUIRE_PENDING 3
static u_char timer2_state;
static unsigned i8254_get_timecount(struct timecounter *tc);
static void set_i8254_freq(int mode, uint32_t period);
/*
 * Early clock setup: initialize the spin lock that serializes i8254
 * register access, then bring up the platform's early clock source so
 * DELAY() works for the remainder of boot.
 */
void
clock_init(void)
{
	/* Init the clock lock */
	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE);
	/* Init the clock in order to use DELAY */
	init_ops.early_clock_source_init();
}
/*
 * Timer 0 interrupt filter.  When the i8254 also serves as a periodic
 * timecounter, credit one full rollover to i8254_offset unless
 * i8254_get_timecount() already did (signalled via i8254_ticked), then
 * deliver the event timer callback if the timer is running.
 */
static int
clkintr(void *arg)
{
	struct attimer_softc *sc = (struct attimer_softc *)arg;

	if (i8254_timecounter && sc->period != 0) {
		mtx_lock_spin(&clock_lock);
		if (i8254_ticked)
			i8254_ticked = 0;
		else {
			i8254_offset += i8254_max_count;
			i8254_lastcount = 0;
		}
		clkintr_pending = 0;
		mtx_unlock_spin(&clock_lock);
	}
	if (sc->et.et_active && sc->mode != MODE_STOP)
		sc->et.et_event_cb(&sc->et, sc->et.et_arg);
	return (FILTER_HANDLED);
}
/*
 * Claim timer 2 for speaker tone generation.  Returns 0 on success or
 * -1 if the channel is not currently released.
 */
int
timer_spkr_acquire(void)
{
	int ctrl;

	if (timer2_state != RELEASED)
		return (-1);
	timer2_state = ACQUIRED;
	/*
	 * This access to the timer registers is as atomic as possible
	 * because it is a single instruction.  We could do better if we
	 * knew the rate.  Use of splclock() limits glitches to 10-100us,
	 * and this is probably good enough for timer2, so we aren't as
	 * careful with it as with timer0.
	 */
	ctrl = TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT;
	outb(TIMER_MODE, TIMER_SEL2 | (ctrl & 0x3f));
	ppi_spkr_on();		/* enable counter2 output to speaker */
	return (0);
}
/*
 * Release timer 2: restore the idle square-wave mode and disconnect the
 * counter output from the speaker.  Returns -1 if the channel was not
 * acquired.
 */
int
timer_spkr_release(void)
{
	if (timer2_state != ACQUIRED)
		return (-1);
	timer2_state = RELEASED;
	outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT);
	ppi_spkr_off();		/* disable counter2 output to speaker */
	return (0);
}
/*
 * Program timer 2 (the speaker channel) to tick at "freq" Hz by loading
 * the 16-bit reload value (i8254_freq / freq) LSB first.  The caller is
 * expected to hold the channel via timer_spkr_acquire(), which selects
 * the LSB+MSB access mode the two writes below rely on.
 */
void
timer_spkr_setfreq(int freq)
{

	/*
	 * Guard against division by zero (undefined behavior) and against
	 * negative frequencies, which the unsigned division would turn
	 * into a nonsensical reload value.
	 */
	if (freq <= 0)
		return;
	freq = i8254_freq / freq;
	mtx_lock_spin(&clock_lock);
	outb(TIMER_CNTR2, freq & 0xff);
	outb(TIMER_CNTR2, freq >> 8);
	mtx_unlock_spin(&clock_lock);
}
/*
 * Latch and read the current 16-bit count of timer 0 under clock_lock.
 * The latch command freezes the count so the two successive byte reads
 * (LSB then MSB) are coherent.
 */
static int
getit(void)
{
	int high, low;

	mtx_lock_spin(&clock_lock);
	/* Select timer0 and latch counter value. */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
	low = inb(TIMER_CNTR0);
	high = inb(TIMER_CNTR0);
	mtx_unlock_spin(&clock_lock);
	return ((high << 8) | low);
}
/*
 * Wait "n" microseconds.
 * Relies on timer 1 counting down from (i8254_freq / hz)
 * Note: timer had better have been programmed before this is first used!
 */
void
i8254_delay(int n)
{
	int delta, prev_tick, tick, ticks_left;
#ifdef DELAYDEBUG
	int getit_calls = 1;
	int n1;
	static int state = 0;

	/* One-time self-test: exercise DELAY() over a range of intervals. */
	if (state == 0) {
		state = 1;
		for (n1 = 1; n1 <= 10000000; n1 *= 10)
			DELAY(n1);
		state = 2;
	}
	if (state == 1)
		printf("DELAY(%d)...", n);
#endif
	/*
	 * Read the counter first, so that the rest of the setup overhead is
	 * counted.  Guess the initial overhead is 20 usec (on most systems it
	 * takes about 1.5 usec for each of the i/o's in getit().  The loop
	 * takes about 6 usec on a 486/33 and 13 usec on a 386/20.  The
	 * multiplications and divisions to scale the count take a while).
	 *
	 * However, if ddb is active then use a fake counter since reading
	 * the i8254 counter involves acquiring a lock.  ddb must not do
	 * locking for many reasons, but it calls here for at least atkbd
	 * input.
	 */
#ifdef KDB
	if (kdb_active)
		prev_tick = 1;
	else
#endif
		prev_tick = getit();
	n -= 0;			/* XXX actually guess no initial overhead */
	/*
	 * Calculate (n * (i8254_freq / 1e6)) without using floating point
	 * and without any avoidable overflows.
	 */
	if (n <= 0)
		ticks_left = 0;
	else if (n < 256)
		/*
		 * Use fixed point to avoid a slow division by 1000000.
		 * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest.
		 * 2^15 is the first power of 2 that gives exact results
		 * for n between 0 and 256.
		 */
		ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15;
	else
		/*
		 * Don't bother using fixed point, although gcc-2.7.2
		 * generates particularly poor code for the long long
		 * division, since even the slow way will complete long
		 * before the delay is up (unless we're interrupted).
		 */
		ticks_left = ((u_int)n * (long long)i8254_freq + 999999)
		    / 1000000;
	/* Spin, accumulating elapsed (wrapped) down-counter ticks. */
	while (ticks_left > 0) {
#ifdef KDB
		if (kdb_active) {
			/* Fake ~1 tick per dummy I/O-port read. */
			inb(0x84);
			tick = prev_tick - 1;
			if (tick <= 0)
				tick = i8254_max_count;
		} else
#endif
			tick = getit();
#ifdef DELAYDEBUG
		++getit_calls;
#endif
		delta = prev_tick - tick;
		prev_tick = tick;
		if (delta < 0) {
			delta += i8254_max_count;
			/*
			 * Guard against i8254_max_count being wrong.
			 * This shouldn't happen in normal operation,
			 * but it may happen if set_i8254_freq() is
			 * traced.
			 */
			if (delta < 0)
				delta = 0;
		}
		ticks_left -= delta;
	}
#ifdef DELAYDEBUG
	if (state == 1)
		printf(" %d calls to getit() at %d usec each\n",
		    getit_calls, (n + 5) / getit_calls);
#endif
}
/*
 * Program timer 0 for the given event timer mode.  "period" is in 32.32
 * fixed-point seconds (see the << 32 arithmetic below) and is converted
 * to an i8254 count against i8254_freq.  When stopping while the i8254
 * is in use as a timecounter, the counter is instead kept free-running
 * with a full 0x10000 period.  Cached mode/count state (timer0_*) is
 * used to skip redundant hardware writes.
 */
static void
set_i8254_freq(int mode, uint32_t period)
{
	int new_count, new_mode;

	mtx_lock_spin(&clock_lock);
	if (mode == MODE_STOP) {
		if (i8254_timecounter) {
			/* Keep free-running for the timecounter. */
			mode = MODE_PERIODIC;
			new_count = 0x10000;
		} else
			new_count = -1;
	} else {
		/* Round-to-nearest conversion of 32.32 seconds to ticks. */
		new_count = min(((uint64_t)i8254_freq * period +
		    0x80000000LLU) >> 32, 0x10000);
	}
	if (new_count == timer0_period)
		goto out;
	i8254_max_count = ((new_count & ~0xffff) != 0) ? 0xffff : new_count;
	timer0_period = (mode == MODE_PERIODIC) ? new_count : -1;
	switch (mode) {
	case MODE_STOP:
		new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT;
		outb(TIMER_MODE, new_mode);
		outb(TIMER_CNTR0, 0);
		outb(TIMER_CNTR0, 0);
		break;
	case MODE_PERIODIC:
		new_mode = TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT;
		outb(TIMER_MODE, new_mode);
		outb(TIMER_CNTR0, new_count & 0xff);
		outb(TIMER_CNTR0, new_count >> 8);
		break;
	case MODE_ONESHOT:
		/* Short one-shots can use a single LSB-only write. */
		if (new_count < 256 && timer0_last < 256) {
			new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_LSB;
			if (new_mode != timer0_mode)
				outb(TIMER_MODE, new_mode);
			outb(TIMER_CNTR0, new_count & 0xff);
			break;
		}
		new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT;
		if (new_mode != timer0_mode)
			outb(TIMER_MODE, new_mode);
		outb(TIMER_CNTR0, new_count & 0xff);
		outb(TIMER_CNTR0, new_count >> 8);
		break;
	default:
		panic("set_i8254_freq: unknown operational mode");
	}
	timer0_mode = new_mode;
	timer0_last = new_count;
out:
	mtx_unlock_spin(&clock_lock);
}
static void
i8254_restore(void)
{
timer0_period = -2;
timer0_mode = 0xffff;
timer0_last = 0xffff;
if (attimer_sc != NULL)
set_i8254_freq(attimer_sc->mode, attimer_sc->period);
else
set_i8254_freq(MODE_STOP, 0);
}
/*
 * This is separate from startrtclock() so that it can be called early.
 * Puts timer 0 into the stopped (or timecounter free-running) state.
 */
void
i8254_init(void)
{
	set_i8254_freq(MODE_STOP, 0);
}
void
-startrtclock()
+startrtclock(void)
{
- init_TSC();
+ start_TSC();
}
/*
 * Late clock initialization: perform the accurate TSC and local APIC
 * timer calibrations, then initialize clocks on the BSP.  With
 * EARLY_AP_STARTUP the APs are already running, so also initialize each
 * AP's clocks by temporarily binding the current thread to that CPU.
 */
void
cpu_initclocks(void)
{
#ifdef EARLY_AP_STARTUP
	struct thread *td;
	int i;

	td = curthread;
	tsc_calibrate();
	lapic_calibrate_timer();
	cpu_initclocks_bsp();
	CPU_FOREACH(i) {
		if (i == 0)
			continue;
		/* Run cpu_initclocks_ap() on CPU i by binding to it. */
		thread_lock(td);
		sched_bind(td, i);
		thread_unlock(td);
		cpu_initclocks_ap();
	}
	thread_lock(td);
	if (sched_is_bound(td))
		sched_unbind(td);
	thread_unlock(td);
#else
	tsc_calibrate();
	lapic_calibrate_timer();
	cpu_initclocks_bsp();
#endif
}
/*
 * Sysctl handler for machdep.i8254_freq: report or retune the assumed
 * i8254 input frequency.  On a write, the timer is reprogrammed and the
 * timecounter frequency updated if the driver is attached.
 */
static int
sysctl_machdep_i8254_freq(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_int freq;

	/*
	 * Use `i8254' instead of `timer' in external names because `timer'
	 * is too generic.  Should use it everywhere.
	 */
	freq = i8254_freq;
	error = sysctl_handle_int(oidp, &freq, 0, req);
	if (error == 0 && req->newptr != NULL) {
		i8254_freq = freq;
		if (attimer_sc != NULL) {
			set_i8254_freq(attimer_sc->mode, attimer_sc->period);
			attimer_sc->tc.tc_frequency = freq;
		} else {
			set_i8254_freq(MODE_STOP, 0);
		}
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, i8254_freq,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
0, sizeof(u_int), sysctl_machdep_i8254_freq, "IU",
"i8254 timer frequency");
/*
 * Timecounter read method.  In free-running mode (period == 0) simply
 * invert the down-counter.  In periodic mode, extend the 16-bit counter
 * into a monotonic value: if the count went backwards, or an interrupt
 * is pending near the wrap point, credit a full period to i8254_offset
 * and flag it via i8254_ticked so clkintr() does not count it twice.
 */
static unsigned
i8254_get_timecount(struct timecounter *tc)
{
	device_t dev = (device_t)tc->tc_priv;
	struct attimer_softc *sc = device_get_softc(dev);
	register_t flags;
	uint16_t count;
	u_int high, low;

	if (sc->period == 0)
		return (i8254_max_count - getit());

	/* Snapshot the interrupt-enable flag for the wrap heuristic. */
#ifdef __amd64__
	flags = read_rflags();
#else
	flags = read_eflags();
#endif
	mtx_lock_spin(&clock_lock);

	/* Select timer0 and latch counter value. */
	outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);

	low = inb(TIMER_CNTR0);
	high = inb(TIMER_CNTR0);
	count = i8254_max_count - ((high << 8) | low);
	if (count < i8254_lastcount ||
	    (!i8254_ticked && (clkintr_pending ||
	    ((count < 20 || (!(flags & PSL_I) &&
	    count < i8254_max_count / 2u)) &&
	    i8254_pending != NULL && i8254_pending(i8254_intsrc))))) {
		i8254_ticked = 1;
		i8254_offset += i8254_max_count;
	}
	i8254_lastcount = count;
	count += i8254_offset;
	mtx_unlock_spin(&clock_lock);
	return (count);
}
/*
 * Event timer "start" method: select periodic or one-shot mode, unmask
 * the timer interrupt source on first use, and program the counter.
 */
static int
attimer_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
	device_t dev = (device_t)et->et_priv;
	struct attimer_softc *sc = device_get_softc(dev);

	if (period != 0) {
		sc->mode = MODE_PERIODIC;
		sc->period = period;
	} else {
		sc->mode = MODE_ONESHOT;
		sc->period = first;
	}
	if (!sc->intr_en) {
		i8254_intsrc->is_pic->pic_enable_source(i8254_intsrc);
		sc->intr_en = 1;
	}
	set_i8254_freq(sc->mode, sc->period);
	return (0);
}
static int
attimer_stop(struct eventtimer *et)
{
device_t dev = (device_t)et->et_priv;
struct attimer_softc *sc = device_get_softc(dev);
sc->mode = MODE_STOP;
sc->period = 0;
set_i8254_freq(sc->mode, sc->period);
return (0);
}
#ifdef DEV_ISA
/*
* Attach to the ISA PnP descriptors for the timer
*/
static struct isa_pnp_id attimer_ids[] = {
{ 0x0001d041 /* PNP0100 */, "AT timer" },
{ 0 }
};
/*
 * Probe for the AT timer via ISA PnP.  A hinted device with no PnP ID
 * (ENOENT) is accepted at low priority.
 */
static int
attimer_probe(device_t dev)
{
	int result;

	result = ISA_PNP_PROBE(device_get_parent(dev), dev, attimer_ids);
	/* ENOENT means no PnP-ID, device is hinted. */
	if (result == ENOENT) {
		device_set_desc(dev, "AT timer");
		return (BUS_PROBE_LOW_PRIORITY);
	}
	return (result);
}
/*
 * Attach the AT timer: allocate the I/O port range, optionally register
 * the i8254 as a timecounter (tunable "timecounter") and, unless the
 * "clock" hint disables it, wire up IRQ0 and register the i8254 as an
 * event timer.  Note that failures past resource allocation return 0,
 * leaving the device attached with whatever was set up so far.
 */
static int
attimer_attach(device_t dev)
{
	struct attimer_softc *sc;
	rman_res_t s;
	int i;

	attimer_sc = sc = device_get_softc(dev);
	bzero(sc, sizeof(struct attimer_softc));
	if (!(sc->port_res = bus_alloc_resource(dev, SYS_RES_IOPORT,
	    &sc->port_rid, IO_TIMER1, IO_TIMER1 + 3, 4, RF_ACTIVE)))
		device_printf(dev,"Warning: Couldn't map I/O.\n");
	i8254_intsrc = intr_lookup_source(0);
	if (i8254_intsrc != NULL)
		i8254_pending = i8254_intsrc->is_pic->pic_source_pending;
	resource_int_value(device_get_name(dev), device_get_unit(dev),
	    "timecounter", &i8254_timecounter);
	set_i8254_freq(MODE_STOP, 0);
	if (i8254_timecounter) {
		sc->tc.tc_get_timecount = i8254_get_timecount;
		sc->tc.tc_counter_mask = 0xffff;
		sc->tc.tc_frequency = i8254_freq;
		sc->tc.tc_name = "i8254";
		sc->tc.tc_quality = 0;
		sc->tc.tc_priv = dev;
		tc_init(&sc->tc);
	}
	if (resource_int_value(device_get_name(dev), device_get_unit(dev),
	    "clock", &i) != 0 || i != 0) {
		/* Find the first IRQ resource ID that is not yet set. */
		sc->intr_rid = 0;
		while (bus_get_resource(dev, SYS_RES_IRQ, sc->intr_rid,
		    &s, NULL) == 0 && s != 0)
			sc->intr_rid++;
		if (!(sc->intr_res = bus_alloc_resource(dev, SYS_RES_IRQ,
		    &sc->intr_rid, 0, 0, 1, RF_ACTIVE))) {
			device_printf(dev,"Can't map interrupt.\n");
			return (0);
		}
		/* Dirty hack, to make bus_setup_intr to not enable source. */
		i8254_intsrc->is_handlers++;
		if ((bus_setup_intr(dev, sc->intr_res,
		    INTR_MPSAFE | INTR_TYPE_CLK,
		    (driver_filter_t *)clkintr, NULL,
		    sc, &sc->intr_handler))) {
			device_printf(dev, "Can't setup interrupt.\n");
			i8254_intsrc->is_handlers--;
			return (0);
		}
		i8254_intsrc->is_handlers--;
		i8254_intsrc->is_pic->pic_enable_intr(i8254_intsrc);
		sc->et.et_name = "i8254";
		sc->et.et_flags = ET_FLAGS_PERIODIC;
		if (!i8254_timecounter)
			sc->et.et_flags |= ET_FLAGS_ONESHOT;
		sc->et.et_quality = 100;
		sc->et.et_frequency = i8254_freq;
		/* Min/max period in 32.32 fixed-point seconds. */
		sc->et.et_min_period = (0x0002LLU << 32) / i8254_freq;
		sc->et.et_max_period = (0xfffeLLU << 32) / i8254_freq;
		sc->et.et_start = attimer_start;
		sc->et.et_stop = attimer_stop;
		sc->et.et_priv = dev;
		et_register(&sc->et);
	}
	return(0);
}
/*
 * Power resume method: reprogram timer 0, whose hardware state is lost
 * across suspend.
 */
static int
attimer_resume(device_t dev)
{
	i8254_restore();
	return (0);
}
static device_method_t attimer_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, attimer_probe),
DEVMETHOD(device_attach, attimer_attach),
DEVMETHOD(device_detach, bus_generic_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, attimer_resume),
{ 0, 0 }
};
static driver_t attimer_driver = {
"attimer",
attimer_methods,
sizeof(struct attimer_softc),
};
static devclass_t attimer_devclass;
DRIVER_MODULE(attimer, isa, attimer_driver, attimer_devclass, 0, 0);
DRIVER_MODULE(attimer, acpi, attimer_driver, attimer_devclass, 0, 0);
ISA_PNP_INFO(attimer_ids);
#endif /* DEV_ISA */
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index 317be8979feb..82ee358b6895 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -1,942 +1,955 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 1998-2003 Poul-Henning Kamp
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_clock.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/kernel.h>
#include <sys/smp.h>
#include <sys/vdso.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <machine/fpu.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <x86/vmware.h>
#include <dev/acpica/acpi_hpet.h>
#include <contrib/dev/acpica/include/acpi.h>
#include "cpufreq_if.h"
uint64_t tsc_freq;
int tsc_is_invariant;
int tsc_perf_stat;
static int tsc_early_calib_exact;
static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
&tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
#ifdef SMP
int smp_tsc;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0,
"Indicates whether the TSC is safe to use in SMP mode");
int smp_tsc_adjust = 0;
SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN,
&smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP");
#endif
static int tsc_shift = 1;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN,
&tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency");
static int tsc_disabled;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
"Disable x86 Time Stamp Counter");
static int tsc_skip_calibration;
SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
&tsc_skip_calibration, 0,
"Disable early TSC frequency calibration");
static void tsc_freq_changed(void *arg, const struct cf_level *level,
int status);
static void tsc_freq_changing(void *arg, const struct cf_level *level,
int *status);
static u_int tsc_get_timecount(struct timecounter *tc);
static inline u_int tsc_get_timecount_low(struct timecounter *tc);
static u_int tsc_get_timecount_lfence(struct timecounter *tc);
static u_int tsc_get_timecount_low_lfence(struct timecounter *tc);
static u_int tsc_get_timecount_mfence(struct timecounter *tc);
static u_int tsc_get_timecount_low_mfence(struct timecounter *tc);
static u_int tscp_get_timecount(struct timecounter *tc);
static u_int tscp_get_timecount_low(struct timecounter *tc);
static void tsc_levels_changed(void *arg, int unit);
static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
struct timecounter *tc);
#ifdef COMPAT_FREEBSD32
static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
struct timecounter *tc);
#endif
static struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_counter_mask = ~0u,
.tc_name = "TSC",
.tc_quality = 800, /* adjusted in code */
.tc_fill_vdso_timehands = x86_tsc_vdso_timehands,
#ifdef COMPAT_FREEBSD32
.tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32,
#endif
};
static int
tsc_freq_cpuid_vm(void)
{
u_int regs[4];
if (vm_guest == VM_GUEST_NO)
return (false);
if (hv_high < 0x40000010)
return (false);
do_cpuid(0x40000010, regs);
tsc_freq = (uint64_t)(regs[0]) * 1000;
tsc_early_calib_exact = 1;
return (true);
}
/*
 * Query the TSC frequency through the VMware hypercall interface.
 * regs[1] == UINT_MAX indicates the query is unsupported, in which case
 * tsc_freq is left unchanged; the early calibration is nevertheless
 * marked exact.
 */
static void
tsc_freq_vmware(void)
{
	u_int regs[4];

	vmware_hvcall(VMW_HVCMD_GETHZ, regs);
	if (regs[1] != UINT_MAX)
		tsc_freq = regs[0] | ((uint64_t)regs[1] << 32);
	tsc_early_calib_exact = 1;
}
/*
 * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time
 * Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15 is not
 * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency
 * Information'.  Leaf 0x16 is described in the SDM as informational only, but
 * we can use this value until late calibration is complete.
 *
 * Stores the frequency in Hz via *res and returns true on success.
 */
static bool
tsc_freq_cpuid(uint64_t *res)
{
	u_int regs[4];

	if (cpu_high < 0x15)
		return (false);
	do_cpuid(0x15, regs);
	/* crystal Hz (ECX) * TSC/crystal ratio (EBX/EAX) */
	if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) {
		*res = (uint64_t)regs[2] * regs[1] / regs[0];
		return (true);
	}

	if (cpu_high < 0x16)
		return (false);
	do_cpuid(0x16, regs);
	/* base frequency in MHz (EAX) */
	if (regs[0] != 0) {
		*res = (uint64_t)regs[0] * 1000000;
		return (true);
	}

	return (false);
}
/*
 * Parse the CPUID brand string (leaves 0x80000002-0x80000004) for a
 * trailing "x.yz?Hz" nominal frequency, convert it to Hz, and store it
 * via *res.  Returns true if a frequency was successfully parsed.
 */
static bool
tsc_freq_intel_brand(uint64_t *res)
{
	char brand[48];
	u_int regs[4];
	uint64_t freq;
	char *p;
	u_int i;

	/*
	 * Intel Processor Identification and the CPUID Instruction
	 * Application Note 485.
	 * http://www.intel.com/assets/pdf/appnote/241618.pdf
	 */
	if (cpu_exthigh >= 0x80000004) {
		p = brand;
		for (i = 0x80000002; i < 0x80000005; i++) {
			do_cpuid(i, regs);
			memcpy(p, regs, sizeof(regs));
			p += sizeof(regs);
		}
		/* Find the last "Hz" in the brand string. */
		p = NULL;
		for (i = 0; i < sizeof(brand) - 1; i++)
			if (brand[i] == 'H' && brand[i + 1] == 'z')
				p = brand + i;
		if (p != NULL) {
			/* Back up to the 4 digit/dot chars + SI prefix. */
			p -= 5;
			switch (p[4]) {
			case 'M':
				i = 1;
				break;
			case 'G':
				i = 1000;
				break;
			case 'T':
				i = 1000000;
				break;
			default:
				return (false);
			}
#define	C2D(c)	((c) - '0')
			if (p[1] == '.') {
				/* "d.ddd" form, e.g. "2.667GHz". */
				freq = C2D(p[0]) * 1000;
				freq += C2D(p[2]) * 100;
				freq += C2D(p[3]) * 10;
				freq *= i * 1000;
			} else {
				/* "dddd" form, e.g. "1300MHz". */
				freq = C2D(p[0]) * 1000;
				freq += C2D(p[1]) * 100;
				freq += C2D(p[2]) * 10;
				freq += C2D(p[3]);
				freq *= i * 1000000;
			}
#undef C2D
			*res = freq;
			return (true);
		}
	}
	return (false);
}
/*
 * Estimate the TSC frequency by timing against the 8254 PIT: measure
 * the rdtsc()+DELAY(0) overhead over a few iterations, then time a
 * 100 ms delay and scale the net tick count to Hz.  Store the estimate
 * via *res, matching the contract of the other tsc_freq_*() probes.
 */
static void
tsc_freq_8254(uint64_t *res)
{
	uint64_t tsc1, tsc2;
	int64_t overhead;
	int count, i;

	overhead = 0;
	for (i = 0, count = 8; i < count; i++) {
		tsc1 = rdtsc_ordered();
		DELAY(0);
		tsc2 = rdtsc_ordered();
		if (i > 0)
			overhead += tsc2 - tsc1;
	}
	overhead /= count;

	tsc1 = rdtsc_ordered();
	DELAY(100000);
	tsc2 = rdtsc_ordered();
	/*
	 * Previously the result was written to the tsc_freq global and
	 * the "res" parameter was silently ignored; write through the
	 * pointer instead (the caller passes &tsc_freq).
	 */
	*res = (tsc2 - tsc1 - overhead) * 10;
}
static void
probe_tsc_freq(void)
{
- if (cpu_power_ecx & CPUID_PERF_STAT) {
- /*
- * XXX Some emulators expose host CPUID without actual support
- * for these MSRs. We must test whether they really work.
- */
- wrmsr(MSR_MPERF, 0);
- wrmsr(MSR_APERF, 0);
- DELAY(10);
- if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
- tsc_perf_stat = 1;
+#ifdef __i386__
+ /* The TSC is known to be broken on certain CPUs. */
+ switch (cpu_vendor_id) {
+ case CPU_VENDOR_AMD:
+ switch (cpu_id & 0xFF0) {
+ case 0x500:
+ /* K5 Model 0 */
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ case CPU_VENDOR_CENTAUR:
+ switch (cpu_id & 0xff0) {
+ case 0x540:
+ /*
+ * http://www.centtech.com/c6_data_sheet.pdf
+ *
+ * I-12 RDTSC may return incoherent values in EDX:EAX
+ * I-13 RDTSC hangs when certain event counters are used
+ */
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ case CPU_VENDOR_NSC:
+ switch (cpu_id & 0xff0) {
+ case 0x540:
+ if ((cpu_id & CPUID_STEPPING) == 0) {
+ tsc_disabled = 1;
+ return;
+ }
+ break;
+ }
+ break;
}
+#endif
switch (cpu_vendor_id) {
case CPU_VENDOR_AMD:
case CPU_VENDOR_HYGON:
if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
(vm_guest == VM_GUEST_NO &&
CPUID_TO_FAMILY(cpu_id) >= 0x10))
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_mfence;
}
break;
case CPU_VENDOR_INTEL:
if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 ||
(vm_guest == VM_GUEST_NO &&
((CPUID_TO_FAMILY(cpu_id) == 0x6 &&
CPUID_TO_MODEL(cpu_id) >= 0xe) ||
(CPUID_TO_FAMILY(cpu_id) == 0xf &&
CPUID_TO_MODEL(cpu_id) >= 0x3))))
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_lfence;
}
break;
case CPU_VENDOR_CENTAUR:
if (vm_guest == VM_GUEST_NO &&
CPUID_TO_FAMILY(cpu_id) == 0x6 &&
CPUID_TO_MODEL(cpu_id) >= 0xf &&
(rdmsr(0x1203) & 0x100000000ULL) == 0)
tsc_is_invariant = 1;
if (cpu_feature & CPUID_SSE2) {
tsc_timecounter.tc_get_timecount =
tsc_get_timecount_lfence;
}
break;
}
- if (tsc_freq_cpuid_vm())
- return;
-
- if (vm_guest == VM_GUEST_VMWARE) {
+ if (tsc_freq_cpuid_vm()) {
+ if (bootverbose)
+ printf(
+ "Early TSC frequency %juHz derived from hypervisor CPUID\n",
+ (uintmax_t)tsc_freq);
+ } else if (vm_guest == VM_GUEST_VMWARE) {
tsc_freq_vmware();
- return;
- }
-
- if (tsc_freq_cpuid(&tsc_freq)) {
+ if (bootverbose)
+ printf(
+ "Early TSC frequency %juHz derived from VMWare hypercall\n",
+ (uintmax_t)tsc_freq);
+ } else if (tsc_freq_cpuid(&tsc_freq)) {
/*
* If possible, use the value obtained from CPUID as the initial
* frequency. This will be refined later during boot but is
* good enough for now. The 8254 PIT is not functional on some
* newer platforms anyway, so don't delay our boot for what
* might be a garbage result. Late calibration is required if
* the initial frequency was obtained from CPUID.16H, as the
* derived value may be off by as much as 1%.
*/
if (bootverbose)
printf("Early TSC frequency %juHz derived from CPUID\n",
(uintmax_t)tsc_freq);
} else if (tsc_skip_calibration) {
/*
* Try to parse the brand string to obtain the nominal TSC
* frequency.
*/
if (cpu_vendor_id == CPU_VENDOR_INTEL &&
tsc_freq_intel_brand(&tsc_freq)) {
if (bootverbose)
printf(
"Early TSC frequency %juHz derived from brand string\n",
(uintmax_t)tsc_freq);
} else {
tsc_disabled = 1;
}
} else {
/*
* Calibrate against the 8254 PIT. This estimate will be
* refined later in tsc_calib().
*/
tsc_freq_8254(&tsc_freq);
if (bootverbose)
printf(
"Early TSC frequency %juHz calibrated from 8254 PIT\n",
(uintmax_t)tsc_freq);
}
+
+ if (cpu_power_ecx & CPUID_PERF_STAT) {
+ /*
+ * XXX Some emulators expose host CPUID without actual support
+ * for these MSRs. We must test whether they really work.
+ */
+ wrmsr(MSR_MPERF, 0);
+ wrmsr(MSR_APERF, 0);
+ DELAY(10);
+ if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0)
+ tsc_perf_stat = 1;
+ }
}
void
-init_TSC(void)
+start_TSC(void)
{
-
if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
return;
-#ifdef __i386__
- /* The TSC is known to be broken on certain CPUs. */
- switch (cpu_vendor_id) {
- case CPU_VENDOR_AMD:
- switch (cpu_id & 0xFF0) {
- case 0x500:
- /* K5 Model 0 */
- return;
- }
- break;
- case CPU_VENDOR_CENTAUR:
- switch (cpu_id & 0xff0) {
- case 0x540:
- /*
- * http://www.centtech.com/c6_data_sheet.pdf
- *
- * I-12 RDTSC may return incoherent values in EDX:EAX
- * I-13 RDTSC hangs when certain event counters are used
- */
- return;
- }
- break;
- case CPU_VENDOR_NSC:
- switch (cpu_id & 0xff0) {
- case 0x540:
- if ((cpu_id & CPUID_STEPPING) == 0)
- return;
- break;
- }
- break;
- }
-#endif
-
- probe_tsc_freq();
-
/*
* Inform CPU accounting about our boot-time clock rate. This will
* be updated if someone loads a cpufreq driver after boot that
* discovers a new max frequency.
*
* The frequency may also be updated after late calibration is complete;
* however, we register the TSC as the ticker now to avoid switching
* counters after much of the kernel has already booted and potentially
* sampled the CPU clock.
*/
if (tsc_freq != 0)
set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
if (tsc_is_invariant)
return;
/* Register to find out about changes in CPU frequency. */
tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change,
tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST);
tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change,
tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST);
tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed,
tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY);
}
#ifdef SMP
/*
* RDTSC is not a serializing instruction, and does not drain
* instruction stream, so we need to drain the stream before executing
* it. It could be fixed by use of RDTSCP, except the instruction is
* not available everywhere.
*
* Use CPUID for draining in the boot-time SMP constistency test. The
* timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel
* and VIA) when SSE2 is present, and nothing on older machines which
* also do not issue RDTSC prematurely. There, testing for SSE2 and
* vendor is too cumbersome, and we learn about TSC presence from CPUID.
*
* Do not use do_cpuid(), since we do not need CPUID results, which
* have to be written into memory with do_cpuid().
*/
#define	TSC_READ(x)						\
static void							\
tsc_read_##x(void *arg)						\
{								\
	uint64_t *tsc = arg;					\
	u_int cpu = PCPU_GET(cpuid);				\
								\
	__asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \
	tsc[cpu * 3 + x] = rdtsc();				\
}
/* Generate tsc_read_0/1/2(), one sampler per rendezvous phase. */
TSC_READ(0)
TSC_READ(1)
TSC_READ(2)
#undef TSC_READ
#define N 1000
/*
 * Rendezvous callback: for each of the N sample rounds, verify that
 * this CPU's later TSC reads are strictly ahead of every other CPU's
 * earlier reads.  Any non-positive delta clears smp_tsc, marking the
 * TSCs as unsynchronized.
 */
static void
comp_smp_tsc(void *arg)
{
	uint64_t *tsc;
	int64_t d1, d2;
	u_int cpu = PCPU_GET(cpuid);
	u_int i, j, size;

	/* Three samples per CPU per round (phases 0/1/2). */
	size = (mp_maxid + 1) * 3;
	for (i = 0, tsc = arg; i < N; i++, tsc += size)
		CPU_FOREACH(j) {
			if (j == cpu)
				continue;
			d1 = tsc[cpu * 3 + 1] - tsc[j * 3];
			d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1];
			if (d1 <= 0 || d2 <= 0) {
				smp_tsc = 0;
				return;
			}
		}
}
/*
 * Rendezvous callback: compute the interval [min, max] of TSC offsets
 * consistent with the first CPU's samples and, if the interval is
 * non-empty, add its midpoint to this CPU's TSC via MSR 0x10 to bring
 * it into line.  The first CPU itself is left untouched.
 */
static void
adj_smp_tsc(void *arg)
{
	uint64_t *tsc;
	int64_t d, min, max;
	u_int cpu = PCPU_GET(cpuid);
	u_int first, i, size;

	first = CPU_FIRST();
	if (cpu == first)
		return;
	min = INT64_MIN;
	max = INT64_MAX;
	size = (mp_maxid + 1) * 3;
	for (i = 0, tsc = arg; i < N; i++, tsc += size) {
		/* Tighten the lower bound from the earlier-phase pairs. */
		d = tsc[first * 3] - tsc[cpu * 3 + 1];
		if (d > min)
			min = d;
		d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2];
		if (d > min)
			min = d;
		/* Tighten the upper bound from the later-phase pairs. */
		d = tsc[first * 3 + 1] - tsc[cpu * 3];
		if (d < max)
			max = d;
		d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1];
		if (d < max)
			max = d;
	}
	if (min > max)
		return;
	d = min / 2 + max / 2;
	/* TSC = TSC + d: read MSR 0x10, 64-bit add of d, write it back. */
	__asm __volatile (
		"movl $0x10, %%ecx\n\t"
		"rdmsr\n\t"
		"addl %%edi, %%eax\n\t"
		"adcl %%esi, %%edx\n\t"
		"wrmsr\n"
		: /* No output */
		: "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32))
		: "ax", "cx", "dx", "cc"
	);
}
/*
 * SMP TSC synchronization test, run at boot and on resume.  Records N
 * rounds of per-CPU TSC samples via rendezvous, has every CPU check
 * them for consistent ordering (comp_smp_tsc), and retries up to
 * adj_max_count times after adjusting stragglers (adj_smp_tsc).
 * Returns a timecounter quality: 1000/800 on success depending on
 * vendor/topology, 0 under VirtualBox, -100 otherwise.
 */
static int
test_tsc(int adj_max_count)
{
	uint64_t *data, *tsc;
	u_int i, size, adj;

	if ((!smp_tsc && !tsc_is_invariant))
		return (-100);
	/*
	 * Misbehavior of TSC under VirtualBox has been observed.  In
	 * particular, threads doing small (~1 second) sleeps may miss their
	 * wakeup and hang around in sleep state, causing hangs on shutdown.
	 */
	if (vm_guest == VM_GUEST_VBOX)
		return (0);

	TSENTER();
	/* Three samples per CPU, N rounds. */
	size = (mp_maxid + 1) * 3;
	data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK);
	adj = 0;
retry:
	for (i = 0, tsc = data; i < N; i++, tsc += size)
		smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc);
	/* Assume success; any CPU that sees an inversion clears this. */
	smp_tsc = 1;	/* XXX */
	smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc,
	    smp_no_rendezvous_barrier, data);
	if (!smp_tsc && adj < adj_max_count) {
		adj++;
		smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc,
		    smp_no_rendezvous_barrier, data);
		goto retry;
	}
	free(data, M_TEMP);
	if (bootverbose)
		printf("SMP: %sed TSC synchronization test%s\n",
		    smp_tsc ? "pass" : "fail",
		    adj > 0 ? " after adjustment" : "");
	TSEXIT();
	if (smp_tsc && tsc_is_invariant) {
		switch (cpu_vendor_id) {
		case CPU_VENDOR_AMD:
		case CPU_VENDOR_HYGON:
			/*
			 * Processor Programming Reference (PPR) for AMD
			 * Family 17h states that the TSC uses a common
			 * reference for all sockets, cores and threads.
			 */
			if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
				return (1000);
			/*
			 * Starting with Family 15h processors, TSC clock
			 * source is in the north bridge.  Check whether
			 * we have a single-socket/multi-core platform.
			 * XXX Need more work for complex cases.
			 */
			if (CPUID_TO_FAMILY(cpu_id) < 0x15 ||
			    (amd_feature2 & AMDID2_CMP) == 0 ||
			    smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1)
				break;
			return (1000);
		case CPU_VENDOR_INTEL:
			/*
			 * XXX Assume Intel platforms have synchronized TSCs.
			 */
			return (1000);
		}
		return (800);
	}
	return (-100);
}
#undef N
#endif /* SMP */
/*
 * Configure the TSC timecounter: rate its quality (invariance and, on
 * SMP, the synchronization test), choose a shift so the frequency fits
 * in a u_int and does not wrap too fast, and select the read method
 * (RDTSCP, fenced RDTSC, or plain RDTSC).  Registration itself is
 * deferred to tsc_calibrate().
 */
static void
init_TSC_tc(void)
{
	uint64_t max_freq;
	int shift;

	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
		return;

	/*
	 * Limit timecounter frequency to fit in an int and prevent it from
	 * overflowing too fast.
	 */
	max_freq = UINT_MAX;

	/*
	 * Intel CPUs without a C-state invariant TSC can stop the TSC
	 * in either C2 or C3.  Disable use of C2 and C3 while using
	 * the TSC as the timecounter.  The timecounter can be changed
	 * to enable C2 and C3.
	 *
	 * Note that the TSC is used as the cputicker for computing
	 * thread runtime regardless of the timecounter setting, so
	 * using an alternate timecounter and enabling C2 or C3 can
	 * result incorrect runtimes for kernel idle threads (but not
	 * for any non-idle threads).
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) {
		tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP;
		if (bootverbose)
			printf("TSC timecounter disables C2 and C3.\n");
	}

	/*
	 * We can not use the TSC in SMP mode unless the TSCs on all CPUs
	 * are synchronized.  If the user is sure that the system has
	 * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a
	 * non-zero value.  The TSC seems unreliable in virtualized SMP
	 * environments, so it is set to a negative quality in those cases.
	 */
#ifdef SMP
	if (mp_ncpus > 1)
		tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust);
	else
#endif /* SMP */
	if (tsc_is_invariant)
		tsc_timecounter.tc_quality = 1000;

	/*
	 * Find the smallest shift that brings tsc_freq at or below the
	 * (tsc_shift-reduced) frequency cap.
	 */
	max_freq >>= tsc_shift;
	for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++)
		;

	/*
	 * Timecounter implementation selection, top to bottom:
	 * - If RDTSCP is available, use RDTSCP.
	 * - If fence instructions are provided (SSE2), use LFENCE;RDTSC
	 *   on Intel, and MFENCE;RDTSC on AMD.
	 * - For really old CPUs, just use RDTSC.
	 */
	if ((amd_feature & AMDID_RDTSCP) != 0) {
		tsc_timecounter.tc_get_timecount = shift > 0 ?
		    tscp_get_timecount_low : tscp_get_timecount;
	} else if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) {
		if (cpu_vendor_id == CPU_VENDOR_AMD ||
		    cpu_vendor_id == CPU_VENDOR_HYGON) {
			tsc_timecounter.tc_get_timecount = shift > 0 ?
			    tsc_get_timecount_low_mfence :
			    tsc_get_timecount_mfence;
		} else {
			tsc_timecounter.tc_get_timecount = shift > 0 ?
			    tsc_get_timecount_low_lfence :
			    tsc_get_timecount_lfence;
		}
	} else {
		tsc_timecounter.tc_get_timecount = shift > 0 ?
		    tsc_get_timecount_low : tsc_get_timecount;
	}
	if (shift > 0) {
		tsc_timecounter.tc_name = "TSC-low";
		if (bootverbose)
			printf("TSC timecounter discards lower %d bit(s)\n",
			    shift);
	}
	if (tsc_freq != 0) {
		tsc_timecounter.tc_frequency = tsc_freq >> shift;
		/* The shift is recovered from tc_priv by the readers. */
		tsc_timecounter.tc_priv = (void *)(intptr_t)shift;

		/*
		 * Timecounter registration is deferred until after late
		 * calibration is finished.
		 */
	}
}
SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
/*
 * Publish a new TSC frequency: update both the raw tsc_freq and the
 * timecounter frequency, the latter reduced by the configured shift.
 */
static void
tsc_update_freq(uint64_t new_freq)
{
	int shift;

	shift = (int)(intptr_t)tsc_timecounter.tc_priv;
	atomic_store_rel_64(&tsc_freq, new_freq);
	atomic_store_rel_64(&tsc_timecounter.tc_frequency, new_freq >> shift);
}
+/*
+ * Early TSC initialization: probe the TSC frequency at boot, provided
+ * the CPU advertises a TSC and it has not been disabled by tunable.
+ */
+void
+tsc_init(void)
+{
+	if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled)
+		return;
+
+	probe_tsc_freq();
+}
+
/*
* Perform late calibration of the TSC frequency once ACPI-based timecounters
* are available. At this point timehands are not set up, so we read the
* highest-quality timecounter directly rather than using (s)binuptime().
*/
void
tsc_calibrate(void)
{
	uint64_t measured;

	if (tsc_disabled)
		return;

	/*
	 * Only re-measure when the early calibration was inexact; the
	 * measurement loop uses rdtsc_ordered under an FPU kernel
	 * context, then publishes the new frequency.
	 */
	if (!tsc_early_calib_exact) {
		fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
		measured = clockcalib(rdtsc_ordered, "TSC");
		fpu_kern_leave(curthread, NULL);
		tsc_update_freq(measured);
	}

	/* Register the timecounter and the cputicker in either case. */
	tc_init(&tsc_timecounter);
	set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
}
void
resume_TSC(void)
{
#ifdef SMP
	int new_quality;

	/* A TSC that failed the boot-time test will not improve now. */
	if (tsc_timecounter.tc_quality < 0)
		return;

	/* Uniprocessor: nothing to re-synchronize. */
	if (mp_ncpus < 2)
		return;

	/*
	 * If TSC was good, a single synchronization should be enough,
	 * but honour smp_tsc_adjust if it's set.
	 */
	new_quality = test_tsc(MAX(smp_tsc_adjust, 1));
	if (new_quality != tsc_timecounter.tc_quality) {
		printf("TSC timecounter quality changed: %d -> %d\n",
		    tsc_timecounter.tc_quality, new_quality);
		tsc_timecounter.tc_quality = new_quality;
	}
#endif /* SMP */
}
/*
* When cpufreq levels change, find out about the (new) max frequency. We
* use this to update CPU accounting in case it got a lower estimate at boot.
*/
static void
tsc_levels_changed(void *arg, int unit)
{
device_t cf_dev;
struct cf_level *levels;
int count, error;
uint64_t max_freq;
/* Only use values from the first CPU, assuming all are equal. */
if (unit != 0)
return;
/* Find the appropriate cpufreq device instance. */
cf_dev = devclass_get_device(devclass_find("cpufreq"), unit);
if (cf_dev == NULL) {
printf("tsc_levels_changed() called but no cpufreq device?\n");
return;
}
/* Get settings from the device and find the max frequency. */
count = 64;
levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
if (levels == NULL)
return;
error = CPUFREQ_LEVELS(cf_dev, levels, &count);
if (error == 0 && count != 0) {
max_freq = (uint64_t)levels[0].total_set.freq * 1000000;
set_cputicker(rdtsc, max_freq, 1);
} else
printf("tsc_levels_changed: no max freq found\n");
free(levels, M_TEMP);
}
/*
* If the TSC timecounter is in use, veto the pending change. It may be
* possible in the future to handle a dynamically-changing timecounter rate.
*/
static void
tsc_freq_changing(void *arg, const struct cf_level *level, int *status)
{
	/* Someone else already vetoed the change; nothing to add. */
	if (*status != 0)
		return;
	/* Only veto while the TSC is the active timecounter. */
	if (timecounter != &tsc_timecounter)
		return;
	printf("timecounter TSC must not be in use when "
	    "changing frequencies; change denied\n");
	*status = EBUSY;
}
/* Update TSC freq with the value indicated by the caller. */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
uint64_t freq;
/* If there was an error during the transition, don't do anything. */
if (tsc_disabled || status != 0)
return;
/* Total setting for this level gives the new frequency in MHz. */
freq = (uint64_t)level->total_set.freq * 1000000;
tsc_update_freq(freq);
}
static int
sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
{
int error;
uint64_t freq;
freq = atomic_load_acq_64(&tsc_freq);
if (freq == 0)
return (EOPNOTSUPP);
error = sysctl_handle_64(oidp, &freq, 0, req);
if (error == 0 && req->newptr != NULL)
tsc_update_freq(freq);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq,
CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE,
0, 0, sysctl_machdep_tsc_freq, "QU",
"Time Stamp Counter frequency");
/* Plain reader: low 32 bits of the TSC, no ordering fence. */
static u_int
tsc_get_timecount(struct timecounter *tc __unused)
{
	u_int count;

	count = rdtsc32();
	return (count);
}
/* RDTSCP-based reader: low 32 bits of the TSC. */
static u_int
tscp_get_timecount(struct timecounter *tc __unused)
{
	u_int count;

	count = rdtscp32();
	return (count);
}
/*
 * "TSC-low" reader: SHRD shifts the 64-bit EDX:EAX result of RDTSC
 * right by the per-timecounter shift stored in tc_priv, discarding the
 * low bits.
 */
static inline u_int
tsc_get_timecount_low(struct timecounter *tc)
{
	uint32_t rv;

	__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
	    : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
	return (rv);
}
/*
 * Same as tsc_get_timecount_low(), but reading via RDTSCP.  The shift
 * count is reloaded from memory after the read because RDTSCP itself
 * writes %ecx (hence the "ecx" clobber and "m" constraint).
 */
static u_int
tscp_get_timecount_low(struct timecounter *tc)
{
	uint32_t rv;

	__asm __volatile("rdtscp; movl %1, %%ecx; shrd %%cl, %%edx, %0"
	    : "=&a" (rv) : "m" (tc->tc_priv) : "ecx", "edx");
	return (rv);
}
/* LFENCE;RDTSC reader (Intel/VIA): fence before sampling. */
static u_int
tsc_get_timecount_lfence(struct timecounter *tc __unused)
{
	u_int count;

	lfence();
	count = rdtsc32();
	return (count);
}
/* LFENCE (Intel/VIA) variant of the shifted "TSC-low" reader. */
static u_int
tsc_get_timecount_low_lfence(struct timecounter *tc)
{
	/* Fence must precede the RDTSC inside tsc_get_timecount_low(). */
	lfence();
	return (tsc_get_timecount_low(tc));
}
/* MFENCE;RDTSC reader (AMD): fence before sampling. */
static u_int
tsc_get_timecount_mfence(struct timecounter *tc __unused)
{
	u_int count;

	mfence();
	count = rdtsc32();
	return (count);
}
/* MFENCE (AMD) variant of the shifted "TSC-low" reader. */
static u_int
tsc_get_timecount_low_mfence(struct timecounter *tc)
{
	/* Fence must precede the RDTSC inside tsc_get_timecount_low(). */
	mfence();
	return (tsc_get_timecount_low(tc));
}
/*
 * Fill the vdso timehands with the TSC algorithm parameters so
 * userspace can read time without a syscall.  Returns 1, i.e. the
 * timecounter is usable from the vdso.
 */
static uint32_t
x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
{
	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
	/* Same low-bit discard as the kernel "TSC-low" readers. */
	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
	/* NOTE(review): 0xffffffff presumably means "no HPET" — confirm. */
	vdso_th->th_x86_hpet_idx = 0xffffffff;
	vdso_th->th_x86_pvc_last_systime = 0;
	vdso_th->th_x86_pvc_stable_mask = 0;
	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
	return (1);
}
#ifdef COMPAT_FREEBSD32
/* 32-bit compat counterpart of x86_tsc_vdso_timehands(). */
static uint32_t
x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
    struct timecounter *tc)
{
	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
	/* Same low-bit discard as the kernel "TSC-low" readers. */
	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
	/* NOTE(review): 0xffffffff presumably means "no HPET" — confirm. */
	vdso_th32->th_x86_hpet_idx = 0xffffffff;
	vdso_th32->th_x86_pvc_last_systime = 0;
	vdso_th32->th_x86_pvc_stable_mask = 0;
	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
	return (1);
}
#endif
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Mar 29, 1:15 PM (1 d, 16 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28218285
Default Alt Text
(147 KB)
Attached To
Mode
rG FreeBSD src repository
Attached
Detach File
Event Timeline
Log In to Comment