// This file is part of the Essence operating system.
// It is released under the terms of the MIT license -- see LICENSE.md.
// Written by: nakst.

#include

extern "C" uint64_t ProcessorReadCR3();
extern "C" void gdt_data();
extern "C" void processorGDTR();
extern "C" void ProcessorAPStartup();

struct MSIHandler {
    KIRQHandler callback;
    void *context;
};

struct IRQHandler {
    KIRQHandler callback;
    void *context;
    intptr_t line;
    KPCIDevice *pciDevice;
    const char *cOwnerName;
};

uint8_t pciIRQLines[0x100 /* slots */][4 /* pins */];

MSIHandler msiHandlers[INTERRUPT_VECTOR_MSI_COUNT];
IRQHandler irqHandlers[0x40];
KSpinlock irqHandlersLock; // Also for msiHandlers.

extern volatile uint64_t timeStampCounterSynchronizationValue;

PhysicalMemoryRegion *physicalMemoryRegions;
size_t physicalMemoryRegionsCount;
size_t physicalMemoryRegionsPagesCount;
size_t physicalMemoryOriginalPagesCount;
size_t physicalMemoryRegionsIndex;
uintptr_t physicalMemoryHighest;

uint32_t bootloaderID;
uintptr_t bootloaderInformationOffset;

// Spinlock since some drivers need to access it in IRQs (e.g. ACPICA).
KSpinlock pciConfigSpinlock;

KSpinlock ipiLock;

const char *const exceptionInformation[] = {
    "0x00: Divide Error (Fault)",
    "0x01: Debug Exception (Fault/Trap)",
    "0x02: Non-Maskable External Interrupt (Interrupt)",
    "0x03: Breakpoint (Trap)",
    "0x04: Overflow (Trap)",
    "0x05: BOUND Range Exceeded (Fault)",
    "0x06: Invalid Opcode (Fault)",
    "0x07: x87 Coprocessor Unavailable (Fault)",
    "0x08: Double Fault (Abort)",
    "0x09: x87 Coprocessor Segment Overrun (Fault)",
    "0x0A: Invalid TSS (Fault)",
    "0x0B: Segment Not Present (Fault)",
    "0x0C: Stack Protection (Fault)",
    "0x0D: General Protection (Fault)",
    "0x0E: Page Fault (Fault)",
    "0x0F: Reserved/Unknown",
    "0x10: x87 FPU Floating-Point Error (Fault)",
    "0x11: Alignment Check (Fault)",
    "0x12: Machine Check (Abort)",
    "0x13: SIMD Floating-Point Exception (Fault)",
    "0x14: Virtualization Exception (Fault)",
    "0x15: Reserved/Unknown",
    "0x16: Reserved/Unknown",
    "0x17: Reserved/Unknown",
    "0x18: Reserved/Unknown",
    "0x19: Reserved/Unknown",
    "0x1A: Reserved/Unknown",
    "0x1B: Reserved/Unknown",
    "0x1C: Reserved/Unknown",
    "0x1D: Reserved/Unknown",
    "0x1E: Reserved/Unknown",
    "0x1F: Reserved/Unknown",
};

uint32_t LapicReadRegister(uint32_t reg) {
#ifdef ES_ARCH_X86_64
    return acpi.lapicAddress[reg];
#else
    return ((volatile uint32_t *) LOCAL_APIC_BASE)[reg];
#endif
}

void LapicWriteRegister(uint32_t reg, uint32_t value) {
#ifdef ES_ARCH_X86_64
    acpi.lapicAddress[reg] = value;
#else
    ((volatile uint32_t *) LOCAL_APIC_BASE)[reg] = value;
#endif
}

void LapicNextTimer(size_t ms) {
    LapicWriteRegister(0x320 >> 2, TIMER_INTERRUPT | (1 << 17));
    LapicWriteRegister(0x380 >> 2, acpi.lapicTicksPerMs * ms);
}

void LapicEndOfInterrupt() {
    LapicWriteRegister(0xB0 >> 2, 0);
}
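// Note on the LAPIC accesses above and below: the local APIC registers are memory-mapped, 32 bits wide
// and spaced 16 bytes apart, so the conventional byte offsets (0xB0 end-of-interrupt, 0x300/0x310 interrupt
// command, 0x320 LVT timer, 0x380/0x390 timer initial/current count, 0x3E0 timer divide configuration)
// are shifted right by 2 to index the register page as an array of uint32_t.
// Bit 17 in the LVT timer entry selects periodic mode.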
uintptr_t MMArchEarlyAllocatePage() {
    uintptr_t i = physicalMemoryRegionsIndex;

    while (!physicalMemoryRegions[i].pageCount) {
        i++;

        if (i == physicalMemoryRegionsCount) {
            KernelPanic("MMArchEarlyAllocatePage - Expected more pages in physical regions.\n");
        }
    }

    PhysicalMemoryRegion *region = physicalMemoryRegions + i;
    uintptr_t returnValue = region->baseAddress;

    region->baseAddress += K_PAGE_SIZE;
    region->pageCount--;
    physicalMemoryRegionsPagesCount--;
    physicalMemoryRegionsIndex = i;

    return returnValue;
}

uint64_t MMArchPopulatePageFrameDatabase() {
    uint64_t commitLimit = 0;

    for (uintptr_t i = 0; i < physicalMemoryRegionsCount; i++) {
        uintptr_t base = physicalMemoryRegions[i].baseAddress >> K_PAGE_BITS;
        uintptr_t count = physicalMemoryRegions[i].pageCount;
        commitLimit += count;

        for (uintptr_t j = 0; j < count; j++) {
            MMPhysicalInsertFreePagesNext(base + j);
        }
    }

    physicalMemoryRegionsPagesCount = 0;
    return commitLimit;
}

uintptr_t MMArchGetPhysicalMemoryHighest() {
    return physicalMemoryHighest;
}

void ProcessorOut8Delayed(uint16_t port, uint8_t value) {
    ProcessorOut8(port, value);

    // Read an unused port to get a short delay.
    ProcessorIn8(IO_UNUSED_DELAY);
}

extern "C" void PCSetupCOM1() {
#ifdef COM_OUTPUT
    ProcessorOut8Delayed(IO_COM_1 + 1, 0x00); // Disable UART interrupts.
    ProcessorOut8Delayed(IO_COM_1 + 3, 0x80); // Set the DLAB bit so the divisor can be written.
    ProcessorOut8Delayed(IO_COM_1 + 0, 0x03); // Divisor low byte: 3 (38400 baud).
    ProcessorOut8Delayed(IO_COM_1 + 1, 0x00); // Divisor high byte.
    ProcessorOut8Delayed(IO_COM_1 + 3, 0x03); // 8 data bits, no parity, 1 stop bit.
    ProcessorOut8Delayed(IO_COM_1 + 2, 0xC7); // Enable and clear the FIFOs.
    ProcessorOut8Delayed(IO_COM_1 + 4, 0x0B); // Set DTR, RTS and OUT2.

    // Print a divider line.
    for (uint8_t i = 0; i < 10; i++) ProcessorDebugOutputByte('-');
    ProcessorDebugOutputByte('\r');
    ProcessorDebugOutputByte('\n');
#endif
}

extern "C" void PCDisablePIC() {
    // Remap the ISRs sent by the PIC to 0x20 - 0x2F.
    // Even though we'll mask the PIC to use the APIC,
    // we have to do this so that the spurious interrupts are sent to a reasonable vector range.
    ProcessorOut8Delayed(IO_PIC_1_COMMAND, 0x11);
    ProcessorOut8Delayed(IO_PIC_2_COMMAND, 0x11);
    ProcessorOut8Delayed(IO_PIC_1_DATA, 0x20);
    ProcessorOut8Delayed(IO_PIC_2_DATA, 0x28);
    ProcessorOut8Delayed(IO_PIC_1_DATA, 0x04);
    ProcessorOut8Delayed(IO_PIC_2_DATA, 0x02);
    ProcessorOut8Delayed(IO_PIC_1_DATA, 0x01);
    ProcessorOut8Delayed(IO_PIC_2_DATA, 0x01);

    // Mask all interrupts.
    ProcessorOut8Delayed(IO_PIC_1_DATA, 0xFF);
    ProcessorOut8Delayed(IO_PIC_2_DATA, 0xFF);
}

extern "C" void PCProcessMemoryMap() {
    physicalMemoryRegions = (PhysicalMemoryRegion *) (LOW_MEMORY_MAP_START + 0x60000 + bootloaderInformationOffset);

    for (uintptr_t i = 0; physicalMemoryRegions[i].baseAddress; i++) {
        PhysicalMemoryRegion region = physicalMemoryRegions[i];
        uint64_t end = region.baseAddress + (region.pageCount << K_PAGE_BITS);

#ifdef ES_BITS_32
        if (end > 0x100000000) {
            // Ignore memory above 4GB on 32-bit kernels.
            // (Write through to the stored region, so that later passes also skip it.)
            physicalMemoryRegions[i].pageCount = 0;
            continue;
        }
#endif

        physicalMemoryRegionsPagesCount += region.pageCount;
        if (end > physicalMemoryHighest) physicalMemoryHighest = end;
        physicalMemoryRegionsCount++;
    }

    physicalMemoryOriginalPagesCount = physicalMemoryRegions[physicalMemoryRegionsCount].pageCount;
}

uintptr_t GetBootloaderInformationOffset() {
    return bootloaderInformationOffset;
}
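// KPCIReadConfig and KPCIWriteConfig below use PCI configuration mechanism #1:
// an address word with the enable bit (bit 31), the bus in bits 16-23, the device in bits 11-15,
// the function in bits 8-10 and the register offset in bits 0-7 is written to the IO_PCI_CONFIG port,
// and the selected dword is then transferred through the IO_PCI_DATA port.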
uint32_t KPCIReadConfig(uint8_t bus, uint8_t device, uint8_t function, uint8_t offset, int size) {
    KSpinlockAcquire(&pciConfigSpinlock);
    EsDefer(KSpinlockRelease(&pciConfigSpinlock));
    if (offset & 3) KernelPanic("KPCIReadConfig - offset is not 4-byte aligned.");
    ProcessorOut32(IO_PCI_CONFIG, (uint32_t) (0x80000000 | (bus << 16) | (device << 11) | (function << 8) | offset));
    if (size == 8) return ProcessorIn8(IO_PCI_DATA);
    if (size == 16) return ProcessorIn16(IO_PCI_DATA);
    if (size == 32) return ProcessorIn32(IO_PCI_DATA);
    KernelPanic("PCIController::ReadConfig - Invalid size %d.\n", size);
    return 0;
}

void KPCIWriteConfig(uint8_t bus, uint8_t device, uint8_t function, uint8_t offset, uint32_t value, int size) {
    KSpinlockAcquire(&pciConfigSpinlock);
    EsDefer(KSpinlockRelease(&pciConfigSpinlock));
    if (offset & 3) KernelPanic("KPCIWriteConfig - offset is not 4-byte aligned.");
    ProcessorOut32(IO_PCI_CONFIG, (uint32_t) (0x80000000 | (bus << 16) | (device << 11) | (function << 8) | offset));
    if (size == 8) ProcessorOut8(IO_PCI_DATA, value);
    else if (size == 16) ProcessorOut16(IO_PCI_DATA, value);
    else if (size == 32) ProcessorOut32(IO_PCI_DATA, value);
    else KernelPanic("PCIController::WriteConfig - Invalid size %d.\n", size);
}

void MMArchUnmapPages(MMSpace *space, uintptr_t virtualAddressStart, uintptr_t pageCount,
        unsigned flags, size_t unmapMaximum, uintptr_t *resumePosition) {
    // We can't let anyone use the unmapped pages until they've been invalidated on all processors.
    // This also synchronises modified bit updating.
    KMutexAcquire(&pmm.pageFrameMutex);
    EsDefer(KMutexRelease(&pmm.pageFrameMutex));

    KMutexAcquire(&space->data.mutex);
    EsDefer(KMutexRelease(&space->data.mutex));

#ifdef ES_ARCH_X86_64
    uintptr_t tableBase = virtualAddressStart & 0x0000FFFFFFFFF000;
#else
    uintptr_t tableBase = virtualAddressStart & 0xFFFFF000;
#endif
    uintptr_t start = resumePosition ? *resumePosition : 0;

    // TODO Freeing newly empty page tables.
    //      - What do we need to invalidate when we do this?

    for (uintptr_t i = start; i < pageCount; i++) {
        uintptr_t virtualAddress = (i << K_PAGE_BITS) + tableBase;

#ifdef ES_ARCH_X86_64
        if ((PAGE_TABLE_L4[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 3)] & 1) == 0) {
            i -= (virtualAddress >> K_PAGE_BITS) % (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 3));
            i += (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 3));
            continue;
        }

        if ((PAGE_TABLE_L3[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 2)] & 1) == 0) {
            i -= (virtualAddress >> K_PAGE_BITS) % (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 2));
            i += (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 2));
            continue;
        }
#endif

        if ((PAGE_TABLE_L2[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 1)] & 1) == 0) {
            i -= (virtualAddress >> K_PAGE_BITS) % (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 1));
            i += (1 << (ENTRIES_PER_PAGE_TABLE_BITS * 1));
            continue;
        }

        uintptr_t indexL1 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 0);
        uintptr_t translation = PAGE_TABLE_L1[indexL1];

        if (!(translation & 1)) {
            // The page wasn't mapped.
            continue;
        }

        bool copy = translation & (1 << 9);

        if (copy && (flags & MM_UNMAP_PAGES_BALANCE_FILE) && (~flags & MM_UNMAP_PAGES_FREE_COPIED)) {
            // Ignore copied pages when balancing file mappings.
            continue;
        }

        if ((~translation & (1 << 5)) || (~translation & (1 << 6))) {
            // See MMArchMapPage for a discussion of why these bits must be set.
            KernelPanic("MMArchUnmapPages - Page found without accessed or dirty bit set (virtualAddress: %x, translation: %x).\n",
                    virtualAddress, translation);
        }

        PAGE_TABLE_L1[indexL1] = 0;

#ifdef ES_ARCH_X86_64
        uintptr_t physicalAddress = translation & 0x0000FFFFFFFFF000;
#else
        uintptr_t physicalAddress = translation & 0xFFFFF000;
#endif

        if ((flags & MM_UNMAP_PAGES_FREE) || ((flags & MM_UNMAP_PAGES_FREE_COPIED) && copy)) {
            MMPhysicalFree(physicalAddress, true);
        } else if (flags & MM_UNMAP_PAGES_BALANCE_FILE) {
            // It's safe to do this before page invalidation,
            // because the page fault handler is synchronised with the same mutexes acquired above.

            if (MMUnmapFilePage(physicalAddress >> K_PAGE_BITS)) {
                if (resumePosition) {
                    if (!unmapMaximum--) {
                        *resumePosition = i;
                        break;
                    }
                }
            }
        }
    }

    MMArchInvalidatePages(virtualAddressStart, pageCount);
}

bool MMArchMapPage(MMSpace *space, uintptr_t physicalAddress, uintptr_t virtualAddress, unsigned flags) {
    // TODO Use the no-execute bit.
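    // The virtual address is decomposed into one index per paging level below:
    // on x86-64, K_PAGE_BITS is 12 and ENTRIES_PER_PAGE_TABLE_BITS is 9, so indexL4 takes bits 39-47,
    // indexL3 bits 30-47, indexL2 bits 21-47 and indexL1 bits 12-47 of the (masked) address.
    // PAGE_TABLE_L4 through PAGE_TABLE_L1 are defined elsewhere; they appear to be windows onto the
    // current address space's page tables (e.g. through a recursive mapping), which is why each level
    // can be read and written directly using these indices.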
    if ((physicalAddress | virtualAddress) & (K_PAGE_SIZE - 1)) {
        KernelPanic("MMArchMapPage - Address not page aligned.\n");
    }

    if (pmm.pageFrames && (physicalAddress >> K_PAGE_BITS) < pmm.pageFrameDatabaseCount) {
        if (pmm.pageFrames[physicalAddress >> K_PAGE_BITS].state != MMPageFrame::ACTIVE
                && pmm.pageFrames[physicalAddress >> K_PAGE_BITS].state != MMPageFrame::UNUSABLE) {
            KernelPanic("MMArchMapPage - Physical page frame %x not marked as ACTIVE or UNUSABLE.\n", physicalAddress);
        }
    }

    if (!physicalAddress) {
        KernelPanic("MMArchMapPage - Attempt to map physical page 0.\n");
    } else if (!virtualAddress) {
        KernelPanic("MMArchMapPage - Attempt to map virtual page 0.\n");
#ifdef ES_ARCH_X86_64
    } else if (virtualAddress < 0xFFFF800000000000 && ProcessorReadCR3() != space->data.cr3) {
#else
    } else if (virtualAddress < 0xC0000000 && ProcessorReadCR3() != space->data.cr3) {
#endif
        KernelPanic("MMArchMapPage - Attempt to map page into other address space.\n");
    }

    bool acquireFrameLock = !(flags & (MM_MAP_PAGE_NO_NEW_TABLES | MM_MAP_PAGE_FRAME_LOCK_ACQUIRED));
    if (acquireFrameLock) KMutexAcquire(&pmm.pageFrameMutex);
    EsDefer(if (acquireFrameLock) KMutexRelease(&pmm.pageFrameMutex););

    bool acquireSpaceLock = ~flags & MM_MAP_PAGE_NO_NEW_TABLES;
    if (acquireSpaceLock) KMutexAcquire(&space->data.mutex);
    EsDefer(if (acquireSpaceLock) KMutexRelease(&space->data.mutex));

    // EsPrint("\tMap, %x -> %x\n", virtualAddress, physicalAddress);

    uintptr_t oldVirtualAddress = virtualAddress;

#ifdef ES_ARCH_X86_64
    physicalAddress &= 0xFFFFFFFFFFFFF000;
    virtualAddress &= 0x0000FFFFFFFFF000;
#endif

#ifdef ES_ARCH_X86_64
    uintptr_t indexL4 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 3);
    uintptr_t indexL3 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 2);
#endif
    uintptr_t indexL2 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 1);
    uintptr_t indexL1 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 0);

    if (space != coreMMSpace && space != kernelMMSpace /* Don't check the kernel's space since the bootloader's tables won't be committed. */) {
#ifdef ES_ARCH_X86_64
        if (!(space->data.l3Commit[indexL4 >> 3] & (1 << (indexL4 & 7)))) KernelPanic("MMArchMapPage - Attempt to map using uncommitted L3 page table.\n");
        if (!(space->data.l2Commit[indexL3 >> 3] & (1 << (indexL3 & 7)))) KernelPanic("MMArchMapPage - Attempt to map using uncommitted L2 page table.\n");
#endif
        if (!(space->data.l1Commit[indexL2 >> 3] & (1 << (indexL2 & 7)))) KernelPanic("MMArchMapPage - Attempt to map using uncommitted L1 page table.\n");
    }

#ifdef ES_ARCH_X86_64
    if ((PAGE_TABLE_L4[indexL4] & 1) == 0) {
        if (flags & MM_MAP_PAGE_NO_NEW_TABLES) KernelPanic("MMArchMapPage - NO_NEW_TABLES flag set, but a table was missing.\n");
        PAGE_TABLE_L4[indexL4] = MMPhysicalAllocate(MM_PHYSICAL_ALLOCATE_LOCK_ACQUIRED) | 7;
        ProcessorInvalidatePage((uintptr_t) (PAGE_TABLE_L3 + indexL3)); // Not strictly necessary.
        EsMemoryZero((void *) ((uintptr_t) (PAGE_TABLE_L3 + indexL3) & ~(K_PAGE_SIZE - 1)), K_PAGE_SIZE);
        space->data.pageTablesActive++;
    }

    if ((PAGE_TABLE_L3[indexL3] & 1) == 0) {
        if (flags & MM_MAP_PAGE_NO_NEW_TABLES) KernelPanic("MMArchMapPage - NO_NEW_TABLES flag set, but a table was missing.\n");
        PAGE_TABLE_L3[indexL3] = MMPhysicalAllocate(MM_PHYSICAL_ALLOCATE_LOCK_ACQUIRED) | 7;
        ProcessorInvalidatePage((uintptr_t) (PAGE_TABLE_L2 + indexL2)); // Not strictly necessary.
        EsMemoryZero((void *) ((uintptr_t) (PAGE_TABLE_L2 + indexL2) & ~(K_PAGE_SIZE - 1)), K_PAGE_SIZE);
        space->data.pageTablesActive++;
    }
#endif

    if ((PAGE_TABLE_L2[indexL2] & 1) == 0) {
        if (flags & MM_MAP_PAGE_NO_NEW_TABLES) KernelPanic("MMArchMapPage - NO_NEW_TABLES flag set, but a table was missing.\n");
        PAGE_TABLE_L2[indexL2] = MMPhysicalAllocate(MM_PHYSICAL_ALLOCATE_LOCK_ACQUIRED) | 7;
        ProcessorInvalidatePage((uintptr_t) (PAGE_TABLE_L1 + indexL1)); // Not strictly necessary.
        EsMemoryZero((void *) ((uintptr_t) (PAGE_TABLE_L1 + indexL1) & ~(K_PAGE_SIZE - 1)), K_PAGE_SIZE);
        space->data.pageTablesActive++;
    }

    uintptr_t oldValue = PAGE_TABLE_L1[indexL1];
    uintptr_t value = physicalAddress | 3;

#ifdef ES_ARCH_X86_64
    if (flags & MM_MAP_PAGE_WRITE_COMBINING) value |= 16; // This only works because we modified the PAT in SetupProcessor1.
#else
    if (flags & MM_MAP_PAGE_WRITE_COMBINING) KernelPanic("MMArchMapPage - Write combining is unimplemented.\n"); // TODO.
#endif
    if (flags & MM_MAP_PAGE_NOT_CACHEABLE) value |= 24;
    if (flags & MM_MAP_PAGE_USER) value |= 7;
    else value |= 1 << 8; // Global.
    if (flags & MM_MAP_PAGE_READ_ONLY) value &= ~2;
    if (flags & MM_MAP_PAGE_COPIED) value |= 1 << 9; // Ignored by the CPU.

    // When the CPU accesses or writes to a page,
    // it will modify the table entry to set the accessed or dirty bits respectively,
    // but it uses its TLB entry as the assumed previous value of the entry.
    // When unmapping pages we can't atomically remove an entry and do the TLB shootdown.
    // This creates a race condition:
    // 1. CPU 0 maps a page table entry. The dirty bit is not set.
    // 2. CPU 1 reads from the page. A TLB entry is created with the dirty bit not set.
    // 3. CPU 0 unmaps the entry.
    // 4. CPU 1 writes to the page. As the TLB entry has the dirty bit cleared, it sets the entry to its cached entry ORed with the dirty bit.
    // 5. CPU 0 invalidates the entry.
    // That is, CPU 1 didn't realize the page was unmapped when it wrote out its entry, so the page becomes mapped again.
    // To prevent this, we mark all pages with the dirty and accessed bits when we initially map them.
    // (We don't use these bits for anything, anyway. They're basically useless on SMP systems, as far as I can tell.)
    // That said, a CPU won't overwrite and clear a dirty bit when writing out its accessed flag (tested on Qemu);
    // see here https://stackoverflow.com/questions/69024372/.
    // Tl;dr: if a CPU ever sees an entry without these bits set, it can overwrite the entry with junk whenever it feels like it.
    // TODO Should we be marking page tables as dirty/accessed? (Including those made by the 32-bit AND 64-bit bootloader and MMArchInitialise).
    //      When page table trimming is implemented, we'll probably need to do this.
    value |= (1 << 5) | (1 << 6);

    if ((oldValue & 1) && !(flags & MM_MAP_PAGE_OVERWRITE)) {
        if (flags & MM_MAP_PAGE_IGNORE_IF_MAPPED) {
            return false;
        }

        if ((oldValue & ~(K_PAGE_SIZE - 1)) != physicalAddress) {
            KernelPanic("MMArchMapPage - Attempt to map %x to %x that has already been mapped to %x.\n",
                    virtualAddress, physicalAddress, oldValue & (~(K_PAGE_SIZE - 1)));
        }

        if (oldValue == value) {
            KernelPanic("MMArchMapPage - Attempt to rewrite page translation.\n",
                    physicalAddress, virtualAddress, oldValue & (K_PAGE_SIZE - 1), value & (K_PAGE_SIZE - 1));
        } else if (!(oldValue & 2) && (value & 2)) {
            // The page has become writable.
        } else {
            KernelPanic("MMArchMapPage - Attempt to change flags mapping %x address %x from %x to %x.\n",
                    physicalAddress, virtualAddress, oldValue & (K_PAGE_SIZE - 1), value & (K_PAGE_SIZE - 1));
        }
    }

    PAGE_TABLE_L1[indexL1] = value;

    // We rely on this page being invalidated on this CPU in some places.
    ProcessorInvalidatePage(oldVirtualAddress);

    return true;
}

bool MMArchMakePageWritable(MMSpace *space, uintptr_t virtualAddress) {
    KMutexAcquire(&space->data.mutex);
    EsDefer(KMutexRelease(&space->data.mutex));

#ifdef ES_ARCH_X86_64
    virtualAddress &= 0x0000FFFFFFFFF000;
#else
    virtualAddress &= 0xFFFFF000;
#endif

#ifdef ES_ARCH_X86_64
    uintptr_t indexL4 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 3);
    if ((PAGE_TABLE_L4[indexL4] & 1) == 0) return false;
    uintptr_t indexL3 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 2);
    if ((PAGE_TABLE_L3[indexL3] & 1) == 0) return false;
#endif
    uintptr_t indexL2 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 1);
    if ((PAGE_TABLE_L2[indexL2] & 1) == 0) return false;
    uintptr_t indexL1 = virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 0);
    if ((PAGE_TABLE_L1[indexL1] & 1) == 0) return false;

    PAGE_TABLE_L1[indexL1] |= 2;
    return true;
}

void MMArchInitialise() {
    coreMMSpace->data.cr3 = kernelMMSpace->data.cr3 = ProcessorReadCR3();

    mmCoreRegions[0].baseAddress = MM_CORE_SPACE_START;
    mmCoreRegions[0].pageCount = MM_CORE_SPACE_SIZE / K_PAGE_SIZE;

#ifdef ES_ARCH_X86_64
    // Pre-allocate every L4 entry in the kernel half of the address space,
    // so that kernel mappings made later are visible in every address space that copies these entries.
    for (uintptr_t i = 0x100; i < 0x200; i++) {
        if (PAGE_TABLE_L4[i] == 0) {
            // We don't need to commit anything because the PMM isn't ready yet.
            PAGE_TABLE_L4[i] = MMPhysicalAllocate(ES_FLAGS_DEFAULT) | 3;
            EsMemoryZero((void *) (PAGE_TABLE_L3 + i * 0x200), K_PAGE_SIZE);
        }
    }

    coreMMSpace->data.l1Commit = coreL1Commit;
    KMutexAcquire(&coreMMSpace->reserveMutex);
    kernelMMSpace->data.l1Commit = (uint8_t *) MMReserve(coreMMSpace, L1_COMMIT_SIZE_BYTES,
            MM_REGION_NORMAL | MM_REGION_NO_COMMIT_TRACKING | MM_REGION_FIXED)->baseAddress;
    KMutexRelease(&coreMMSpace->reserveMutex);
#endif
}

uintptr_t MMArchTranslateAddress(MMSpace *, uintptr_t virtualAddress, bool writeAccess) {
    // TODO This mutex will be necessary if we ever remove page tables.
    // space->data.mutex.Acquire();
    // EsDefer(space->data.mutex.Release());

#ifdef ES_ARCH_X86_64
    virtualAddress &= 0x0000FFFFFFFFF000;
    if ((PAGE_TABLE_L4[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 3)] & 1) == 0) return 0;
    if ((PAGE_TABLE_L3[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 2)] & 1) == 0) return 0;
#endif
    if ((PAGE_TABLE_L2[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 1)] & 1) == 0) return 0;

    uintptr_t physicalAddress = PAGE_TABLE_L1[virtualAddress >> (K_PAGE_BITS + ENTRIES_PER_PAGE_TABLE_BITS * 0)];

    if (writeAccess && !(physicalAddress & 2)) return 0;
#ifdef ES_ARCH_X86_64
    return (physicalAddress & 1) ? (physicalAddress & 0x0000FFFFFFFFF000) : 0;
#else
    return (physicalAddress & 1) ? (physicalAddress & 0xFFFFF000) : 0;
#endif
}

uintptr_t ArchFindRootSystemDescriptorPointer() {
    uint64_t uefiRSDP = *((uint64_t *) (LOW_MEMORY_MAP_START + GetBootloaderInformationOffset() + 0x7FE8));

    if (uefiRSDP) {
        return uefiRSDP;
    }

    PhysicalMemoryRegion searchRegions[2];

    searchRegions[0].baseAddress = (uintptr_t) (((uint16_t *) LOW_MEMORY_MAP_START)[0x40E] << 4) + LOW_MEMORY_MAP_START;
    searchRegions[0].pageCount = 0x400;
    searchRegions[1].baseAddress = (uintptr_t) 0xE0000 + LOW_MEMORY_MAP_START;
    searchRegions[1].pageCount = 0x20000;

    for (uintptr_t i = 0; i < 2; i++) {
        for (uintptr_t address = searchRegions[i].baseAddress;
                address < searchRegions[i].baseAddress + searchRegions[i].pageCount;
                address += 16) {
            RootSystemDescriptorPointer *rsdp = (RootSystemDescriptorPointer *) address;

            if (rsdp->signature != SIGNATURE_RSDP) {
                continue;
            }

            if (rsdp->revision == 0) {
                if (EsMemorySumBytes((uint8_t *) rsdp, 20)) {
                    continue;
                }

                return (uintptr_t) rsdp - LOW_MEMORY_MAP_START;
            } else if (rsdp->revision == 2) {
                if (EsMemorySumBytes((uint8_t *) rsdp, sizeof(RootSystemDescriptorPointer))) {
                    continue;
                }

                return (uintptr_t) rsdp - LOW_MEMORY_MAP_START;
            }
        }
    }

    return 0;
}

uint64_t ArchGetTimeFromPITMs() {
    // TODO This isn't working on real hardware, but EarlyDelay1Ms is?

    // NOTE This will only work if called at least once every 50 ms.
    // (The PIT only stores a 16-bit counter, which is depleted every 50 ms.)

    static bool started = false;
    static uint64_t cumulative = 0, last = 0;

    if (!started) {
        ProcessorOut8(IO_PIT_COMMAND, 0x30);
        ProcessorOut8(IO_PIT_DATA, 0xFF);
        ProcessorOut8(IO_PIT_DATA, 0xFF);
        started = true;
        last = 0xFFFF;
        return 0;
    } else {
        ProcessorOut8(IO_PIT_COMMAND, 0x00);
        uint16_t x = ProcessorIn8(IO_PIT_DATA);
        x |= (ProcessorIn8(IO_PIT_DATA)) << 8;
        cumulative += last - x;
        if (x > last) cumulative += 0x10000;
        last = x;
        return cumulative * 1000 / 1193182;
    }
}

void EarlyDelay1Ms() {
    ProcessorOut8(IO_PIT_COMMAND, 0x30);
    ProcessorOut8(IO_PIT_DATA, 0xA9);
    ProcessorOut8(IO_PIT_DATA, 0x04);

    while (true) {
        ProcessorOut8(IO_PIT_COMMAND, 0xE2);

        if (ProcessorIn8(IO_PIT_DATA) & (1 << 7)) {
            break;
        }
    }
}

NewProcessorStorage AllocateNewProcessorStorage(ArchCPU *archCPU) {
    NewProcessorStorage storage = {};
    storage.local = (CPULocalStorage *) EsHeapAllocate(sizeof(CPULocalStorage), true, K_FIXED);
#ifdef ES_ARCH_X86_64
    storage.gdt = (uint32_t *) MMMapPhysical(kernelMMSpace, MMPhysicalAllocate(MM_PHYSICAL_ALLOCATE_COMMIT_NOW), K_PAGE_SIZE, ES_FLAGS_DEFAULT);
#endif
    storage.local->archCPU = archCPU;
    archCPU->local = storage.local;
    scheduler.CreateProcessorThreads(storage.local);
    archCPU->kernelProcessorID = storage.local->processorID;
    return storage;
}

void SetupProcessor2(NewProcessorStorage *storage) {
    // Setup the local interrupts for the current processor.

    for (uintptr_t i = 0; i < acpi.lapicNMICount; i++) {
        if (acpi.lapicNMIs[i].processor == 0xFF
                || acpi.lapicNMIs[i].processor == storage->local->archCPU->processorID) {
            uint32_t registerIndex = (0x350 + (acpi.lapicNMIs[i].lintIndex << 4)) >> 2;
            uint32_t value = 2 | (1 << 10); // NMI exception interrupt vector.
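            // In an LVT entry, bits 8-10 give the delivery mode (0b100 = NMI, hence the 1 << 10 above),
            // bit 13 selects active-low polarity and bit 15 selects level-triggered mode,
            // matching the two MADT flags applied below.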
            if (acpi.lapicNMIs[i].activeLow) value |= 1 << 13;
            if (acpi.lapicNMIs[i].levelTriggered) value |= 1 << 15;
            LapicWriteRegister(registerIndex, value);
        }
    }

    LapicWriteRegister(0x350 >> 2, LapicReadRegister(0x350 >> 2) & ~(1 << 16));
    LapicWriteRegister(0x360 >> 2, LapicReadRegister(0x360 >> 2) & ~(1 << 16));
    LapicWriteRegister(0x080 >> 2, 0);
    if (LapicReadRegister(0x30 >> 2) & 0x80000000) LapicWriteRegister(0x410 >> 2, 0);
    LapicEndOfInterrupt();

    // Configure the LAPIC's timer.
    LapicWriteRegister(0x3E0 >> 2, 2); // Divisor = 16

    // Create the processor's local storage.
    ProcessorSetLocalStorage(storage->local);

    // Setup a GDT and TSS for the processor.
#ifdef ES_ARCH_X86_64
    uint32_t *gdt = storage->gdt;
    void *bootstrapGDT = (void *) (((uint64_t *) ((uint16_t *) processorGDTR + 1))[0]);
    EsMemoryCopy(gdt, bootstrapGDT, 2048);
    uint32_t *tss = (uint32_t *) ((uint8_t *) storage->gdt + 2048);
    storage->local->archCPU->kernelStack = (void **) (tss + 1);
    ProcessorInstallTSS(gdt, tss);
#endif
}

void ArchInitialise() {
    ACPIParseTables();

    uint8_t bootstrapLapicID = (LapicReadRegister(0x20 >> 2) >> 24);

    ArchCPU *currentCPU = nullptr;

    for (uintptr_t i = 0; i < acpi.processorCount; i++) {
        if (acpi.processors[i].apicID == bootstrapLapicID) {
            // That's us!
            currentCPU = acpi.processors + i;
            currentCPU->bootProcessor = true;
            break;
        }
    }

    if (!currentCPU) {
        KernelPanic("ArchInitialise - Could not find the bootstrap processor\n");
    }

    // Calibrate the LAPIC's timer and processor's timestamp counter.
    ProcessorDisableInterrupts();
    uint64_t start = ProcessorReadTimeStamp();
    LapicWriteRegister(0x380 >> 2, (uint32_t) -1);
    for (int i = 0; i < 8; i++) EarlyDelay1Ms(); // Average over 8ms
    acpi.lapicTicksPerMs = ((uint32_t) -1 - LapicReadRegister(0x390 >> 2)) >> 4;
    EsRandomAddEntropy(LapicReadRegister(0x390 >> 2));
    uint64_t end = ProcessorReadTimeStamp();
    timeStampTicksPerMs = (end - start) >> 3;
    ProcessorEnableInterrupts();

    // EsPrint("timeStampTicksPerMs = %d\n", timeStampTicksPerMs);

    // Finish processor initialisation.
    // This sets up interrupts, the timer, CPULocalStorage, the GDT and TSS,
    // and registers the processor with the scheduler.
    NewProcessorStorage storage = AllocateNewProcessorStorage(currentCPU);
    SetupProcessor2(&storage);
}

size_t ProcessorSendIPI(uintptr_t interrupt, bool nmi, int processorID) {
    // It's possible that another CPU is trying to send an IPI at the same time we want to send the panic IPI.
    // TODO What should we do in this case?
    if (interrupt != KERNEL_PANIC_IPI) KSpinlockAssertLocked(&ipiLock);

    // Note: We send IPIs at a special priority that ProcessorDisableInterrupts doesn't mask.

    size_t ignored = 0;

    for (uintptr_t i = 0; i < acpi.processorCount; i++) {
        ArchCPU *processor = acpi.processors + i;

        if (processorID != -1) {
            if (processorID != processor->kernelProcessorID) {
                ignored++;
                continue;
            }
        } else {
            if (processor == GetLocalStorage()->archCPU || !processor->local || !processor->local->schedulerReady) {
                ignored++;
                continue;
            }
        }

        uint32_t destination = acpi.processors[i].apicID << 24;
        uint32_t command = interrupt | (1 << 14) | (nmi ? 0x400 : 0); // Bit 14 is the assert flag; 0x400 selects NMI delivery mode.
        LapicWriteRegister(0x310 >> 2, destination);
        LapicWriteRegister(0x300 >> 2, command);

        // Wait for the interrupt to be sent.
        while (LapicReadRegister(0x300 >> 2) & (1 << 12));
    }

    return ignored;
}

void ProcessorSendYieldIPI(Thread *thread) {
    thread->receivedYieldIPI = false;
    KSpinlockAcquire(&ipiLock);
    ProcessorSendIPI(YIELD_IPI, false);
    KSpinlockRelease(&ipiLock);
    while (!thread->receivedYieldIPI); // Spin until the thread gets the IPI.
}
void ArchNextTimer(size_t ms) {
    while (!scheduler.started);               // Wait until the scheduler is ready.
    GetLocalStorage()->schedulerReady = true; // Make sure this CPU can be scheduled.
    LapicNextTimer(ms);                       // Set the next timer.
}

uint64_t ArchGetTimeMs() {
    // Update the time stamp counter synchronization value.
    timeStampCounterSynchronizationValue = ((timeStampCounterSynchronizationValue & 0x8000000000000000) ^ 0x8000000000000000)
            | ProcessorReadTimeStamp();

#ifdef ES_ARCH_X86_64
    if (acpi.hpetBaseAddress && acpi.hpetPeriod) {
        // The HPET period is reported in femtoseconds; there are 10^12 femtoseconds per millisecond.
        __int128 fsToMs = 1000000000000;
        __int128 reading = acpi.hpetBaseAddress[30]; // Read the main counter value.
        return (uint64_t) (reading * (__int128) acpi.hpetPeriod / fsToMs);
    }
#endif

    return ArchGetTimeFromPITMs();
}

extern "C" bool PostContextSwitch(InterruptContext *context, MMSpace *oldAddressSpace) {
    if (scheduler.dispatchSpinlock.interruptsEnabled) {
        KernelPanic("PostContextSwitch - Interrupts were enabled. (3)\n");
    }

    // We can only free the scheduler's spinlock when we are no longer using the stack
    // from the previous thread. See DoContextSwitch.
    // (Another CPU can KillThread this once it's back in activeThreads.)
    KSpinlockRelease(&scheduler.dispatchSpinlock, true);

    Thread *currentThread = GetCurrentThread();
    CPULocalStorage *local = GetLocalStorage();

#ifdef ES_ARCH_X86_64
    void *kernelStack = (void *) currentThread->kernelStack;
    *local->archCPU->kernelStack = kernelStack;
#endif

    bool newThread = currentThread->cpuTimeSlices == 1;

    LapicEndOfInterrupt();
    ContextSanityCheck(context);
    ProcessorSetThreadStorage(currentThread->tlsAddress);
    MMSpaceCloseReference(oldAddressSpace);

#ifdef ES_ARCH_X86_64
    KernelLog(LOG_VERBOSE, "Arch", "context switch", "Context switch to %zthread %x at %x\n",
            newThread ? "new " : "", currentThread, context->rip);
    currentThread->lastKnownExecutionAddress = context->rip;
#else
    KernelLog(LOG_VERBOSE, "Arch", "context switch", "Context switch to %zthread %x at %x\n",
            newThread ? "new " : "", currentThread, context->eip);
    currentThread->lastKnownExecutionAddress = context->eip;
#endif

    if (ProcessorAreInterruptsEnabled()) {
        KernelPanic("PostContextSwitch - Interrupts were enabled. (2)\n");
    }

    if (local->spinlockCount) {
        KernelPanic("PostContextSwitch - spinlockCount is non-zero (%x).\n", local);
    }

#ifdef ES_ARCH_X86_32
    if (context->fromRing0) {
        // Returning to a kernel thread; we need to fix the stack.
        uint32_t irq = context->esp;
        uint32_t errorCode = context->ss;
        context->ss = context->flags;
        context->esp = context->cs;
        context->flags = context->eip;
        context->cs = context->errorCode;
        context->eip = context->irq;
        context->irq = irq;
        context->errorCode = errorCode;
    }
#endif

    return newThread;
}

bool SetupInterruptRedirectionEntry(uintptr_t _line) {
    KSpinlockAssertLocked(&irqHandlersLock);

    static uint32_t alreadySetup = 0;

    if (alreadySetup & (1 << _line)) {
        return true;
    }

    // Work out which interrupt the IoApic will send to the processor.
    // TODO Use the upper 4 bits for IRQ priority.

    uintptr_t line = _line;
    uintptr_t thisProcessorIRQ = line + IRQ_BASE;

    bool activeLow = false;
    bool levelTriggered = true;

    // If there was an interrupt override entry in the MADT table,
    // then we'll have to use that number instead.
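    // (For example, firmware commonly overrides the PIT's ISA IRQ 0 to GSI 2 when routing through the I/O APIC.)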
    for (uintptr_t i = 0; i < acpi.interruptOverrideCount; i++) {
        ACPIInterruptOverride *interruptOverride = acpi.interruptOverrides + i;

        if (interruptOverride->sourceIRQ == line) {
            line = interruptOverride->gsiNumber;
            activeLow = interruptOverride->activeLow;
            levelTriggered = interruptOverride->levelTriggered;
            break;
        }
    }

    KernelLog(LOG_INFO, "Arch", "IRQ flags", "SetupInterruptRedirectionEntry - IRQ %d is active %z, %z triggered.\n",
            line, activeLow ? "low" : "high", levelTriggered ? "level" : "edge");

    ACPIIoApic *ioApic;
    bool foundIoApic = false;

    // Look for the IoApic to which this interrupt is sent.

    for (uintptr_t i = 0; i < acpi.ioapicCount; i++) {
        ioApic = acpi.ioApics + i;

        if (line >= ioApic->gsiBase
                && line < (ioApic->gsiBase + (0xFF & (ACPIIoApicReadRegister(ioApic, 1) >> 16)))) {
            foundIoApic = true;
            line -= ioApic->gsiBase;
            break;
        }
    }

    // We couldn't find the IoApic that handles this interrupt.

    if (!foundIoApic) {
        KernelLog(LOG_ERROR, "Arch", "no IOAPIC", "SetupInterruptRedirectionEntry - Could not find an IOAPIC handling interrupt line %d.\n", line);
        return false;
    }

    // A normal priority interrupt.

    uintptr_t redirectionTableIndex = line * 2 + 0x10;
    uint32_t redirectionEntry = thisProcessorIRQ;
    if (activeLow) redirectionEntry |= (1 << 13);
    if (levelTriggered) redirectionEntry |= (1 << 15);

    // Send the interrupt to the processor that registered the interrupt.

    ACPIIoApicWriteRegister(ioApic, redirectionTableIndex, 1 << 16); // Mask the interrupt while we modify the entry.
    ACPIIoApicWriteRegister(ioApic, redirectionTableIndex + 1, GetLocalStorage()->archCPU->apicID << 24);
    ACPIIoApicWriteRegister(ioApic, redirectionTableIndex, redirectionEntry);

    alreadySetup |= 1 << _line;
    return true;
}

void KUnregisterMSI(uintptr_t tag) {
    KSpinlockAcquire(&irqHandlersLock);
    EsDefer(KSpinlockRelease(&irqHandlersLock));
    msiHandlers[tag].callback = nullptr;
}

KMSIInformation KRegisterMSI(KIRQHandler handler, void *context, const char *cOwnerName) {
    KSpinlockAcquire(&irqHandlersLock);
    EsDefer(KSpinlockRelease(&irqHandlersLock));

    for (uintptr_t i = 0; i < INTERRUPT_VECTOR_MSI_COUNT; i++) {
        if (msiHandlers[i].callback) continue;

        msiHandlers[i] = { handler, context };

        // TODO Selecting the best target processor.
        //      Currently this sends everything to processor 0.

        KernelLog(LOG_INFO, "Arch", "register MSI", "Register MSI with vector %X for '%z'.\n",
                INTERRUPT_VECTOR_MSI_START + i, cOwnerName);

        return {
            .address = 0xFEE00000,
            .data = INTERRUPT_VECTOR_MSI_START + i,
            .tag = i,
        };
    }

    return {};
}

bool KRegisterIRQ(intptr_t line, KIRQHandler handler, void *context, const char *cOwnerName, KPCIDevice *pciDevice) {
    if (line == -1 && !pciDevice) {
        KernelPanic("KRegisterIRQ - Interrupt line is %d, and pciDevice is %x.\n", line, pciDevice);
    }

    // Save the handler callback and context.
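    // A line of -1 means the PCI interrupt line isn't known up front; in that case the handler is
    // presumably matched against the shared PCI lines hooked further down (IRQs 9, 10 and 11).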
    if (line > 0x20 || line < -1) KernelPanic("KRegisterIRQ - Unexpected IRQ %d\n", line);

    bool found = false;

    KSpinlockAcquire(&irqHandlersLock);

    for (uintptr_t i = 0; i < sizeof(irqHandlers) / sizeof(irqHandlers[0]); i++) {
        if (!irqHandlers[i].callback) {
            found = true;
            irqHandlers[i].callback = handler;
            irqHandlers[i].context = context;
            irqHandlers[i].line = line;
            irqHandlers[i].pciDevice = pciDevice;
            irqHandlers[i].cOwnerName = cOwnerName;
            break;
        }
    }

    bool result = true;

    if (!found) {
        KernelLog(LOG_ERROR, "Arch", "too many IRQ handlers",
                "The limit of IRQ handlers was reached (%d), and the handler for '%z' was not registered.\n",
                sizeof(irqHandlers) / sizeof(irqHandlers[0]), cOwnerName);
        result = false;
    } else {
        KernelLog(LOG_INFO, "Arch", "register IRQ", "KRegisterIRQ - Registered IRQ %d to '%z'.\n", line, cOwnerName);

        if (line != -1) {
            if (!SetupInterruptRedirectionEntry(line)) {
                result = false;
            }
        } else {
            SetupInterruptRedirectionEntry(9);
            SetupInterruptRedirectionEntry(10);
            SetupInterruptRedirectionEntry(11);
        }
    }

    KSpinlockRelease(&irqHandlersLock);

    return result;
}

void ArchStartupApplicationProcessors() {
    // TODO How do we know that this address is usable?
#define AP_TRAMPOLINE 0x10000

    KEvent delay = {};

    uint8_t *startupData = (uint8_t *) (LOW_MEMORY_MAP_START + AP_TRAMPOLINE);

    // Put the trampoline code in memory.
    EsMemoryCopy(startupData, (void *) ProcessorAPStartup, 0x1000); // Assume that the AP trampoline code is at most 4KB.

    // Put the paging table location at AP_TRAMPOLINE + 0xFF0.
    *((uint64_t *) (startupData + 0xFF0)) = ProcessorReadCR3();

    // Put the 64-bit GDTR at AP_TRAMPOLINE + 0xFE0.
    EsMemoryCopy(startupData + 0xFE0, (void *) processorGDTR, 0x10);

    // Put the GDT at AP_TRAMPOLINE + 0x1000.
    EsMemoryCopy(startupData + 0x1000, (void *) gdt_data, 0x1000);

    // Put the startup flag at AP_TRAMPOLINE + 0xFC0.
    uint8_t volatile *startupFlag = (uint8_t *) (LOW_MEMORY_MAP_START + AP_TRAMPOLINE + 0xFC0);

    // Temporarily identity map 2 pages at 0x10000.
    MMArchMapPage(kernelMMSpace, AP_TRAMPOLINE, AP_TRAMPOLINE, MM_MAP_PAGE_COMMIT_TABLES_NOW);
    MMArchMapPage(kernelMMSpace, AP_TRAMPOLINE + 0x1000, AP_TRAMPOLINE + 0x1000, MM_MAP_PAGE_COMMIT_TABLES_NOW);

    for (uintptr_t i = 0; i < acpi.processorCount; i++) {
        ArchCPU *processor = acpi.processors + i;
        if (processor->bootProcessor) continue;

        // Allocate state for the processor.
        NewProcessorStorage storage = AllocateNewProcessorStorage(processor);

        // Clear the startup flag.
        *startupFlag = 0;

        // Put the stack at AP_TRAMPOLINE + 0xFD0, and the address of the NewProcessorStorage at AP_TRAMPOLINE + 0xFB0.
        void *stack = (void *) ((uintptr_t) MMStandardAllocate(kernelMMSpace, 0x1000, MM_REGION_FIXED) + 0x1000);
        *((void **) (startupData + 0xFD0)) = stack;
        *((NewProcessorStorage **) (startupData + 0xFB0)) = &storage;

        KernelLog(LOG_INFO, "ACPI", "starting processor", "Starting processor %d with local storage %x...\n", i, storage.local);

        // Send an INIT IPI.
        ProcessorDisableInterrupts(); // Don't be interrupted between writes...
        LapicWriteRegister(0x310 >> 2, processor->apicID << 24);
        LapicWriteRegister(0x300 >> 2, 0x4500);
        ProcessorEnableInterrupts();
        KEventWait(&delay, 10);

        // Send a startup IPI.
        ProcessorDisableInterrupts();
        LapicWriteRegister(0x310 >> 2, processor->apicID << 24);
        LapicWriteRegister(0x300 >> 2, 0x4600 | (AP_TRAMPOLINE >> K_PAGE_BITS));
        ProcessorEnableInterrupts();
        for (uintptr_t i = 0; i < 100 && *startupFlag == 0; i++) KEventWait(&delay, 1);

        if (*startupFlag) {
            // The processor started correctly.
        } else {
            // Send a startup IPI, again.
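            // The usual INIT-SIPI-SIPI startup sequence allows for up to two startup IPIs after the INIT IPI,
            // so retry once with a longer timeout before giving up on the processor.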
            ProcessorDisableInterrupts();
            LapicWriteRegister(0x310 >> 2, processor->apicID << 24);
            LapicWriteRegister(0x300 >> 2, 0x4600 | (AP_TRAMPOLINE >> K_PAGE_BITS));
            ProcessorEnableInterrupts();
            for (uintptr_t i = 0; i < 1000 && *startupFlag == 0; i++) KEventWait(&delay, 1); // Wait longer this time.

            if (*startupFlag) {
                // The processor started correctly.
            } else {
                // The processor could not be started.
                KernelLog(LOG_ERROR, "ACPI", "processor startup failure",
                        "ACPIInitialise - Could not start processor %d\n", processor->processorID);
                continue;
            }
        }

        // EsPrint("Startup flag 1 reached!\n");

        for (uintptr_t i = 0; i < 10000 && *startupFlag != 2; i++) KEventWait(&delay, 1);

        if (*startupFlag == 2) {
            // The processor started!
        } else {
            // The processor did not report that it completed initialisation, worryingly.
            // Don't let it continue.
            KernelLog(LOG_ERROR, "ACPI", "processor startup failure",
                    "ACPIInitialise - Could not initialise processor %d\n", processor->processorID);

            // TODO Send IPI to stop the processor.
        }
    }

    // Remove the identity pages needed for the trampoline code.
    MMArchUnmapPages(kernelMMSpace, AP_TRAMPOLINE, 2, ES_FLAGS_DEFAULT);
}