; This file is part of the Essence operating system.
; It is released under the terms of the MIT license -- see LICENSE.md.
; Written by: nakst.

[bits 64]

[global ArchSwitchContext]
[global GetCurrentThread]
[global GetLocalStorage]
[global MMArchSafeCopy]
[global ProcessorAPStartup]
[global ProcessorAreInterruptsEnabled]
[global ProcessorDebugOutputByte]
[global ProcessorDisableInterrupts]
[global ProcessorEnableInterrupts]
[global ProcessorFakeTimerInterrupt]
[global ProcessorFlushCodeCache]
[global ProcessorGetRBP]
[global ProcessorGetRSP]
[global ProcessorHalt]
[global ProcessorIn16]
[global ProcessorIn32]
[global ProcessorIn8]
[global ProcessorInstallTSS]
[global ProcessorInvalidateAllPages]
[global ProcessorInvalidatePage]
[global ProcessorOut16]
[global ProcessorOut32]
[global ProcessorOut8]
[global ProcessorReadCR3]
[global ProcessorReadMXCSR]
[global ProcessorReadTimeStamp]
[global ProcessorReset]
[global ProcessorSetAddressSpace]
[global ProcessorSetLocalStorage]
[global ProcessorSetThreadStorage]
[global _KThreadTerminate]
[global _start]
[global gdt_data]
[global pagingNXESupport]
[global pagingPCIDSupport]
[global pagingSMEPSupport]
[global pagingTCESupport]
[global processorGDTR]
[global simdSSE3Support]
[global simdSSSE3Support]
[global timeStampCounterSynchronizationValue]

[extern ArchNextTimer]
[extern InterruptHandler]
[extern KThreadTerminate]
[extern KernelInitialise]
[extern PostContextSwitch]
[extern SetupProcessor2]
[extern Syscall]
[extern installationID]
[extern PCSetupCOM1]
[extern PCDisablePIC]
[extern PCProcessMemoryMap]
[extern bootloaderID]
[extern bootloaderInformationOffset]

[section .bss]

align 16

; Initial kernel stack of the bootstrap processor; _start loads RSP with stack + stack_size.
%define stack_size 16384
stack: resb stack_size

; Storage for the interrupt descriptor table: _start fills in 256 gates of 16 bytes each.
%define idt_size 4096
idt_data: resb idt_size

%define cpu_local_storage_size 8192
; Per-processor storage reached through GS. SetupProcessor1 gives each processor
; a 32-byte slot (four 8-byte values at gs:0 - gs:31) and points its GS base there.
; Within this file, gs:0 is read by GetLocalStorage, gs:8 is the stack pointer
; loaded by SyscallEntry, and gs:16 is read by GetCurrentThread.
cpu_local_storage: resb cpu_local_storage_size

[section .data]

; IDTR image loaded with LIDT in SetupProcessor1: 16-bit limit followed by the 64-bit base.
idt:
	.limit: dw idt_size - 1
	.base:  dq idt_data

; Byte offset into cpu_local_storage of the next unassigned per-processor slot.
; SetupProcessor1 advances it by 32 for every processor it sets up.
cpu_local_storage_index:
	dq 0

; Exported feature flags. Each one starts out as 1 and is updated by
; SetupProcessor1 from the CPUID results of the processor running it.
pagingNXESupport:
	dd 1
pagingPCIDSupport:
	dd 1
pagingSMEPSupport:
	dd 1
pagingTCESupport:
	dd 1
simdSSE3Support:
	dd 1
simdSSSE3Support:
	dd 1

align 16
; 16-byte buffer that _start fills with the bootstrap GDTR using SGDT.
processorGDTR:
	dq 0
	dq 0

[section .text]

; Kernel entry point on the bootstrap processor.
; In:  RDI = bootloader information offset (0 when coming from the MBR bootloader)
;      RSI = bootloader ID
; Does not return: after KernelInitialise it falls through into ProcessorReady.
_start:
	; Load selector 0x63 into FS and GS.
	mov	rax,0x63
	mov	fs,ax
	mov	gs,ax

	; Save the bootloader ID.
	mov	rax,bootloaderID
	mov	[rax],rsi

	; The MBR bootloader does not know the address of the RSDP.
	; When RDI is zero, a zero qword is written to 0x7FE8 (presumably the RSDP slot -- confirm against the bootloader).
	cmp	rdi,0
	jne	.standard_acpi
	mov	[0x7FE8],rdi
	.standard_acpi:

	; Save the bootloader information offset.
	mov	rax,bootloaderInformationOffset
	mov	[rax],rdi

	; Install a stack
	mov	rsp,stack + stack_size

	; Load the installation ID.
	; It is 16 bytes long and is read from RDI + 0x7FF0.
	mov	rbx,installationID
	mov	rax,[rdi + 0x7FF0]
	mov	[rbx],rax
	mov	rax,[rdi + 0x7FF8]
	mov	[rbx + 8],rax

	; Unmap the identity paging the bootloader used
	; Reloading CR3 with its own value afterwards flushes the stale translations.
	mov	rax,0xFFFFFF7FBFDFE000
	mov	qword [rax],0
	mov	rax,cr3
	mov	cr3,rax

	call	PCSetupCOM1
	call	PCDisablePIC
	call	PCProcessMemoryMap

	; Install the interrupt handlers
	; Gate i lives at idt_data + i * 16 and points at the stub InterruptHandler<i>.
%macro INSTALL_INTERRUPT_HANDLER 1
	mov	rbx,(%1 * 16) + idt_data
	mov	rdx,InterruptHandler%1
	call	InstallInterruptHandler
%endmacro
%assign i 0
%rep 256
	INSTALL_INTERRUPT_HANDLER i
%assign i i+1
%endrep

	; Save the location of the bootstrap GDT
	mov	rcx,processorGDTR
	sgdt	[rcx]

	; First stage of processor initialisation
	call	SetupProcessor1

	; Call the KernelInitialise function
	; The stack is forced to 16-byte alignment first.
	and	rsp,~0xF
	call	KernelInitialise

; Common tail of processor bring-up: _start falls through to here once
; KernelInitialise returns, and ProcessorAPStartup jumps here when it is done.
ProcessorReady:
	; Arm the timer with ArchNextTimer(1); writing EDI zero-extends into RDI.
	mov	edi,1
	call	ArchNextTimer
	; From now on this code path is the CPU's idle thread.
	jmp	ProcessorIdle

; First stage of per-processor initialisation. Called by _start on the bootstrap
; processor and by ProcessorAPStartup on every application processor.
; Takes no arguments. Clobbers RAX, RBX, RCX, RDX, RDI and the flags.
; Returns with RFLAGS.IF set (STI is executed once the IDT has been loaded).
SetupProcessor1:
	.enable_cpu_features:
	; Enable no-execute support, if available
	; CPUID 0x80000001 EDX bit 20 -> EFER (MSR 0xC0000080) bit 11.
	mov	eax,0x80000001
	cpuid
	and	edx,1 << 20
	shr	edx,20
	mov	rax,pagingNXESupport
	and	[rax],edx
	cmp	edx,0
	je	.no_paging_nxe_support
	mov	ecx,0xC0000080
	rdmsr
	or	eax,1 << 11
	wrmsr
	.no_paging_nxe_support:

	; x87 FPU
	; The control word is stored inline and jumped over.
	fninit
	mov	rax,.cw
	fldcw	[rax]
	jmp	.cwa
	.cw:	dw 0x037A
	.cwa:

	; Enable SMEP support, if available
	; This prevents the kernel from executing userland pages
	; TODO Test this: neither Bochs nor Qemu seem to support it?
	xor	eax,eax
	cpuid
	; If CPUID leaf 7 does not exist there is no SMEP. EBX = 0 makes the AND
	; below clear pagingSMEPSupport on that path too, instead of leaving it at 1.
	xor	ebx,ebx
	cmp	eax,7
	jb	.record_smep
	mov	eax,7
	xor	ecx,ecx
	cpuid
	and	ebx,1 << 7
	shr	ebx,7
	.record_smep:
	mov	rax,pagingSMEPSupport
	and	[rax],ebx
	cmp	ebx,0
	je	.no_smep_support
	mov	word [rax],2		; the flag reads 2 once SMEP has been switched on
	mov	rax,cr4
	or	rax,1 << 20
	mov	cr4,rax
	.no_smep_support:

	; Enable PCID support, if available
	; CPUID 1 ECX bit 17 -> CR4 bit 17.
	mov	eax,1
	xor	ecx,ecx
	cpuid
	and	ecx,1 << 17
	shr	ecx,17
	mov	rax,pagingPCIDSupport
	and	[rax],ecx
	cmp	ecx,0
	je	.no_pcid_support
	mov	rax,cr4
	or	rax,1 << 17
	mov	cr4,rax
	.no_pcid_support:

	; Enable global pages
	mov	rax,cr4
	or	rax,1 << 7
	mov	cr4,rax

	; Enable TCE support, if available
	; CPUID 0x80000001 ECX bit 17 -> EFER bit 15.
	mov	eax,0x80000001
	xor	ecx,ecx
	cpuid
	and	ecx,1 << 17
	shr	ecx,17
	mov	rax,pagingTCESupport
	and	[rax],ecx
	cmp	ecx,0
	je	.no_tce_support
	mov	ecx,0xC0000080
	rdmsr
	or	eax,1 << 15
	wrmsr
	.no_tce_support:

	; Enable write protect, so copy-on-write works in the kernel, and MMArchSafeCopy will page fault in read-only regions.
	mov	rax,cr0
	or	rax,1 << 16
	mov	cr0,rax

	; Enable MMX, SSE and SSE2
	; These features are all guaranteed to be present on a x86_64 CPU
	; CR0: clear bit 2, set bit 1. CR4: set bits 9 and 10.
	mov	rax,cr0
	mov	rbx,cr4
	and	rax,~4
	or	rax,2
	or	rbx,512 + 1024
	mov	cr0,rax
	mov	cr4,rbx

	; Detect SSE3 and SSSE3, if available.
	; Only the flags are updated; nothing needs to be enabled.
	mov	eax,1
	cpuid
	test	ecx,1 << 0
	jnz	.has_sse3
	mov	rax,simdSSE3Support
	and	byte [rax],0
	.has_sse3:
	test	ecx,1 << 9
	jnz	.has_ssse3
	mov	rax,simdSSSE3Support
	and	byte [rax],0
	.has_ssse3:

	; Enable system-call extensions (SYSCALL and SYSRET).
	; MSR 0xC0000080: set bit 0.
	mov	ecx,0xC0000080
	rdmsr
	or	eax,1
	wrmsr
	; MSR 0xC0000081: keep the low half, set the high half to the selector bases.
	add	ecx,1
	rdmsr
	mov	edx,0x005B0048
	wrmsr
	; MSR 0xC0000082: 64-bit SYSCALL target, split into EDX:EAX.
	add	ecx,1
	mov	rdx,SyscallEntry
	mov	rax,rdx
	shr	rdx,32
	wrmsr
	; MSR 0xC0000084: RFLAGS bits cleared on SYSCALL.
	add	ecx,2
	rdmsr
	mov	eax,(1 << 10) | (1 << 9) ; Clear direction and interrupt flag when we enter ring 0.
	wrmsr

	; Assign PAT2 to WC.
	; Bits 16-18 of MSR 0x277 are replaced with the value 1.
	mov	ecx,0x277
	xor	rax,rax
	xor	rdx,rdx
	rdmsr
	and	eax,0xFFF8FFFF
	or	eax,0x00010000
	wrmsr

	.setup_cpu_local_storage:
	; Point MSR 0xC0000101 at this processor's slot in cpu_local_storage and
	; advance the shared index so the next processor gets a fresh slot.
	mov	ecx,0xC0000101
	mov	rax,cpu_local_storage
	mov	rdx,cpu_local_storage
	shr	rdx,32
	mov	rdi,cpu_local_storage_index
	add	rax,[rdi]
	add	qword [rdi],32 ; Space for 4 8-byte values at gs:0 - gs:31
	wrmsr

	.load_idtr:
	; Load the IDTR
	mov	rax,idt
	lidt	[rax]
	sti

	.enable_apic:
	; Enable the APIC!
	; Since we're on AMD64, we know that the APIC will be present.
	; EDI keeps the page-aligned low 32 bits of MSR 0x1B for the accesses below.
	mov	ecx,0x1B
	rdmsr
	or	eax,0x800
	wrmsr
	and	eax,~0xFFF
	mov	edi,eax

	; Set the spurious interrupt vector to 0xFF
	; Bit 8 is set together with the vector bits.
	mov	rax,0xFFFFFE00000000F0 ; LOW_MEMORY_MAP_START + 0xF0
	add	rax,rdi
	mov	ebx,[rax]
	or	ebx,0x1FF
	mov	[rax],ebx

	; Use the flat processor addressing model
	mov	rax,0xFFFFFE00000000E0 ; LOW_MEMORY_MAP_START + 0xE0
	add	rax,rdi
	mov	dword [rax],0xFFFFFFFF

	; Make sure that no external interrupts are masked
	xor	rax,rax
	mov	cr8,rax

	ret

; SYSCALL target installed by SetupProcessor1.
; On entry RCX holds the return address and R11 the caller's RFLAGS (as left
; by the SYSCALL instruction). RSP is replaced with the value stored at gs:8.
SyscallEntry:
	mov	rsp,[gs:8]
	sti

	; Switch DS and ES to selector 0x50 while in the kernel.
	mov	ax,0x50
	mov	ds,ax
	mov	es,ax

	; Preserve RCX, R11, R12 and RBX.
	; RAX snapshots RSP right after R12 was pushed, so the last push stores
	; the address of the saved R12.
	push	rcx
	push	r11
	push	r12
	mov	rax,rsp
	push	rbx
	push	rax

	; Arguments in RDI, RSI, RDX, R8, R9. (RCX contains return address).
	; Return value in RAX.
	; RBX (callee-saved) carries the unaligned RSP across the call.
	mov	rbx,rsp
	and	rsp,~0xF
	call	Syscall
	mov	rsp,rbx

	; Disable maskable interrupts.
	cli

	; Return to long mode. (Address in RCX).
	; The pushed RSP snapshot is discarded; RAX is parked on the stack while
	; AX is used to reload DS and ES with selector 0x63.
	add	rsp,8
	push	rax
	mov	ax,0x63
	mov	ds,ax
	mov	es,ax
	pop	rax
	pop	rbx
	pop	r12 ; User RSP
	pop	r11
	pop	rcx ; Return address
	db	0x48 ; REX.W prefix, so the SYSRET below returns to 64-bit code
	sysret

; Raises interrupt vector 0x40 in software and returns once it has been handled.
ProcessorFakeTimerInterrupt:
	int	0x40
	ret

; "Disables" interrupts by raising CR8 to 14 instead of clearing RFLAGS.IF;
; IF is in fact set on return. Clobbers RAX.
ProcessorDisableInterrupts:
	; Still allow important IPIs to go through.
	mov	eax,14
	mov	cr8,rax
	; TODO Where is this necessary? Is it a performance issue?
	sti
	ret

; Re-enables interrupts by dropping CR8 back to 0 and setting RFLAGS.IF.
; Clobbers RAX and the arithmetic flags.
ProcessorEnableInterrupts:
	; WARNING: Changing this mechanism also requires update in x86_64.cpp, when deciding if we should re-enable interrupts on exception.
	xor	eax,eax
	mov	cr8,rax
	; TODO Where is this necessary? Is it a performance issue?
	sti
	ret

; Returns RAX = 1 when RFLAGS.IF is set and CR8 is 0, otherwise RAX = 0.
; Clobbers RDX and the flags.
ProcessorAreInterruptsEnabled:
	mov	rdx,cr8

	; RAX = RFLAGS.IF (bit 9) as 0 or 1.
	pushf
	pop	rax
	shr	rax,9
	and	eax,1

	; A non-zero CR8 counts as "disabled" regardless of IF.
	test	rdx,rdx
	jz	.done
	xor	eax,eax
	.done:
	ret

; Stops this processor for good: clears RFLAGS.IF and halts, and repeats both
; steps should execution ever resume after the HLT. Never returns.
ProcessorHalt:
	cli
	hlt
	jmp	ProcessorHalt

; Writes a byte to an I/O port.
; In: DI = port, SIL = value. Clobbers RAX and RDX.
ProcessorOut8:
	mov	edx,edi
	mov	eax,esi
	out	dx,al
	ret

; Reads a byte from an I/O port.
; In: DI = port. Out: RAX = value, zero-extended. Clobbers RDX.
ProcessorIn8:
	xor	eax,eax
	mov	edx,edi
	in	al,dx
	ret

; Writes a 16-bit word to an I/O port.
; In: DI = port, SI = value. Clobbers RAX and RDX.
ProcessorOut16:
	mov	edx,edi
	mov	eax,esi
	out	dx,ax
	ret

; Reads a 16-bit word from an I/O port.
; In: DI = port. Out: RAX = value, zero-extended. Clobbers RDX.
ProcessorIn16:
	xor	eax,eax
	mov	edx,edi
	in	ax,dx
	ret

; Writes a 32-bit doubleword to an I/O port.
; In: DI = port, ESI = value. Clobbers RAX and RDX.
ProcessorOut32:
	mov	edx,edi
	mov	eax,esi
	out	dx,eax
	ret

; Reads a 32-bit doubleword from an I/O port.
; In: DI = port. Out: RAX = value, zero-extended. Clobbers RDX.
ProcessorIn32:
	xor	eax,eax
	mov	edx,edi
	in	eax,dx
	ret

; Invalidates this processor's TLB entry for the page containing the address in RDI.
ProcessorInvalidatePage:
	invlpg	[rdi]
	ret

; Flushes this processor's whole TLB. CR4.PGE (bit 7) is left set on return.
; Clobbers RAX, RDX and the flags.
ProcessorInvalidateAllPages:
	; Toggle CR4.PGE to invalidate all TLB entries, including global entries.
	mov	rdx,cr4
	mov	rax,rdx
	and	rdx,~(1 << 7)		; RDX = CR4 with PGE cleared
	or	rax,1 << 7		; RAX = CR4 with PGE set
	mov	cr4,rdx
	mov	cr4,rax
	ret

; Idle loop: sets RFLAGS.IF, halts until the next interrupt, and repeats. Never returns.
ProcessorIdle:
	sti
	hlt
	jmp	ProcessorIdle

; Returns in RAX the 8-byte value at gs:0 (the one written by ProcessorSetLocalStorage).
GetLocalStorage:
	mov	rax,[gs:0]
	ret

; Returns in RAX the 8-byte value at gs:16, which ArchSwitchContext updates on every switch.
GetCurrentThread:
	mov	rax,[gs:16]
	ret

; Stores RDI at gs:0, where GetLocalStorage reads it back.
ProcessorSetLocalStorage:
	mov	[gs:0],rdi
	ret

; Sets the FS base (MSR 0xC0000100) to the 64-bit value in RDI.
; RCX and RDX are preserved; RAX is left holding the new base.
ProcessorSetThreadStorage:
	push	rdx
	push	rcx

	; WRMSR takes the value split as EDX:EAX.
	mov	rax,rdi
	mov	rdx,rdi
	shr	rdx,32
	mov	ecx,0xC0000100		; set fs base
	wrmsr

	pop	rcx
	pop	rdx
	ret

; Fills in one 16-byte IDT gate.
; In:  RBX = address of the gate, RDX = address of the handler.
; Out: RDX is shifted right by 32 on return; RBX is unchanged.
InstallInterruptHandler:
	mov	[rbx + 0],dx			; handler bits 0-15
	; One store covers two adjacent fields: word 0x0048 at +2 (code selector)
	; and word 0x8E00 at +4.
	mov	dword [rbx + 2],0x8E000048
	shr	rdx,16
	mov	[rbx + 6],dx			; handler bits 16-31
	shr	rdx,16
	mov	[rbx + 8],rdx			; handler bits 32-63; upper dword of the gate becomes zero
	ret

; Entry stubs for all 256 interrupt vectors. Every stub leaves two extra values
; on the stack above the CPU's interrupt frame -- an error code and the vector
; number -- and then jumps to the shared ASMInterruptHandler.

; Stub for a vector where the CPU does not push an error code: a zero is pushed in its place.
%macro INTERRUPT_HANDLER 1
InterruptHandler%1:
	push	dword 0 ; A fake error code
	push	dword %1 ; The interrupt number
	jmp	ASMInterruptHandler
%endmacro

; Stub for a vector where the CPU pushes an error code itself.
%macro INTERRUPT_HANDLER_EC 1
InterruptHandler%1:
	; The CPU already pushed an error code
	push	dword %1 ; The interrupt number
	jmp	ASMInterruptHandler
%endmacro

; Vectors 0-31. The _EC variant is used for 8, 10-14 and 17.
INTERRUPT_HANDLER 0
INTERRUPT_HANDLER 1
INTERRUPT_HANDLER 2
INTERRUPT_HANDLER 3
INTERRUPT_HANDLER 4
INTERRUPT_HANDLER 5
INTERRUPT_HANDLER 6
INTERRUPT_HANDLER 7
INTERRUPT_HANDLER_EC 8
INTERRUPT_HANDLER 9
INTERRUPT_HANDLER_EC 10
INTERRUPT_HANDLER_EC 11
INTERRUPT_HANDLER_EC 12
INTERRUPT_HANDLER_EC 13
INTERRUPT_HANDLER_EC 14
INTERRUPT_HANDLER 15
INTERRUPT_HANDLER 16
INTERRUPT_HANDLER_EC 17
INTERRUPT_HANDLER 18
INTERRUPT_HANDLER 19
INTERRUPT_HANDLER 20
INTERRUPT_HANDLER 21
INTERRUPT_HANDLER 22
INTERRUPT_HANDLER 23
INTERRUPT_HANDLER 24
INTERRUPT_HANDLER 25
INTERRUPT_HANDLER 26
INTERRUPT_HANDLER 27
INTERRUPT_HANDLER 28
INTERRUPT_HANDLER 29
INTERRUPT_HANDLER 30
INTERRUPT_HANDLER 31

; Vectors 32-255 never get a CPU-supplied error code.
%assign i 32
%rep 224
INTERRUPT_HANDLER i
%assign i i+1
%endrep

; Shared interrupt entry, reached from every InterruptHandler<N> stub with the
; vector number and an error code already on the stack.
; Builds the saved context on the stack, calls InterruptHandler with RDI
; pointing at it, then falls through into ReturnFromInterruptHandler with RAX = 0.
; Context layout from the lowest address up: CR2, DS, 512 + 16 bytes holding the
; FXSAVE image, check value, CR8, R15 ... RAX, vector number, error code, CPU frame.
ASMInterruptHandler:
	cld

	; General purpose registers. ReturnFromInterruptHandler pops them in reverse order.
	push	rax
	push	rbx
	push	rcx
	push	rdx
	push	rsi
	push	rdi
	push	rbp
	push	r8
	push	r9
	push	r10
	push	r11
	push	r12
	push	r13
	push	r14
	push	r15

	mov	rax,cr8
	push	rax

	; Check value; ReturnFromInterruptHandler hangs if it does not read it back.
	mov	rax,0x123456789ABCDEF
	push	rax

	; FXSAVE needs a 16-byte aligned destination, so the image is written just
	; below RSP rounded down to 16. The 512 + 16 bytes reserved afterwards cover
	; the image wherever that rounding placed it.
	mov	rbx,rsp
	and	rsp,~0xF
	fxsave	[rsp - 512]
	mov	rsp,rbx
	sub	rsp,512 + 16
	
	; Save DS and switch DS/ES to selector 0x10.
	xor	rax,rax
	mov	ax,ds
	push	rax
	mov	ax,0x10
	mov	ds,ax
	mov	es,ax
	mov	rax,cr2
	push	rax

	; InterruptHandler(context). RBX (callee-saved) keeps the unaligned RSP.
	mov	rdi,rsp
	mov	rbx,rsp
	and	rsp,~0xF
	call	InterruptHandler
	mov	rsp,rbx
	xor	rax,rax ; AL = 0: skip the FNINIT in ReturnFromInterruptHandler

; Restores a context laid out by ASMInterruptHandler and resumes it with IRETQ.
; In: RSP = context (pointing at the saved CR2 slot),
;     AL  = non-zero to reinitialise the x87 FPU after restoring the FXSAVE image.
; Reached by falling through from ASMInterruptHandler (AL = 0) and by a jump
; from ArchSwitchContext (AL = return value of PostContextSwitch).
ReturnFromInterruptHandler:
	; Skip the saved CR2, then restore DS and ES from the saved DS.
	add	rsp,8
	pop	rbx
	mov	ds,bx
	mov	es,bx

	; Same address computation as the FXSAVE in ASMInterruptHandler.
	add	rsp,512 + 16
	mov	rbx,rsp
	and	rbx,~0xF
	fxrstor	[rbx - 512]

	cmp	al,0
	je	.oldThread
	fninit ; New thread - initialise FPU.
	.oldThread:

	; Verify the check value; spin here forever if the context is corrupt.
	pop	rax
	mov	rbx,0x123456789ABCDEF
	cmp	rax,rbx
	jne	$

	cli	; keep interrupts off while CR8 and the registers are restored
	pop	rax
	mov	cr8,rax

	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	r11
	pop	r10
	pop	r9
	pop	r8
	pop	rbp
	pop	rdi
	pop	rsi
	pop	rdx
	pop	rcx
	pop	rbx
	pop	rax

	; Drop the vector number and the error code.
	add	rsp,16
	iretq

; Switches to another address space.
; In: RDI = pointer to a structure whose first 8 bytes are the new CR3 value.
; CR3 is written only when the value differs from the current one.
; Clobbers RAX (the previous CR3), RDI and the flags.
ProcessorSetAddressSpace:
	mov	rdi,[rdi]
	mov	rax,cr3
	cmp	rdi,rax
	jne	.switch
	ret
	.switch:
	mov	cr3,rdi
	ret

; Returns RSP as seen inside this function, i.e. the address holding the caller's return address.
ProcessorGetRSP:
	mov	rax,rsp
	ret

; Returns the caller's RBP (this function does not set up a frame of its own).
ProcessorGetRBP:
	mov	rax,rbp
	ret

; Switches this processor to another saved context. Does not return to its caller.
; In: RDI = context to resume (becomes RSP; also the first argument of PostContextSwitch)
;     RSI = pointer to a structure whose first 8 bytes are the CR3 value to use
;     RDX = value stored at gs:8 (the stack pointer SyscallEntry loads)
;     RCX = value stored at gs:16 (what GetCurrentThread returns)
;     R8  = passed on as the second argument of PostContextSwitch
ArchSwitchContext:
	cli
	mov	[gs:16],rcx
	mov	[gs:8],rdx
	; Reload CR3 only if the address space actually changes.
	mov	rsi,[rsi]
	mov	rax,cr3
	cmp	rax,rsi
	je	.cont
	mov	cr3,rsi
	.cont:
	; From here on we run on the new context's stack.
	mov	rsp,rdi
	mov	rsi,r8
	call	PostContextSwitch
	; AL, as returned by PostContextSwitch, tells ReturnFromInterruptHandler whether to run FNINIT.
	jmp	ReturnFromInterruptHandler

; Returns the current value of CR3 in RAX.
ProcessorReadCR3:
	mov	rax,cr3
	ret

; Sends the byte in DIL to the serial port at I/O base 0x3F8.
; Does nothing unless the file is assembled with COM_OUTPUT defined.
; Clobbers RAX, DX and the flags when enabled.
ProcessorDebugOutputByte:
%ifdef COM_OUTPUT
	; Poll the register at base + 5 until bit 5 (0x20) is set.
	mov	dx,0x3F8 + 5
	.wait_until_ready:
	in	al,dx
	test	al,0x20
	jz	.wait_until_ready
	; Write the byte to the register at base + 0.
	mov	dx,0x3F8 + 0
	mov	rax,rdi
	out	dx,al
%endif
	ret

; Returns the 64-bit time stamp counter in RAX. Clobbers RDX.
ProcessorReadTimeStamp:
	rdtsc
	; RDTSC returns the counter split as EDX:EAX; merge the halves.
	shl	rdx,32
	or	rax,rdx
	ret

; Executes WBINVD, which writes back and invalidates this processor's caches
; (all of them, not only cached code).
ProcessorFlushCodeCache:
	wbinvd
	ret

; Returns the MXCSR register of the calling processor in RAX, zero-extended.
; The value is bounced through a scratch slot on the caller's own stack. A
; single static buffer would be shared by every processor, so two concurrent
; callers could read back each other's value; it would also mean writing into
; the code section. RSP is moved explicitly, so the slot is safe from interrupts.
ProcessorReadMXCSR:
	sub	rsp,8
	stmxcsr	[rsp]			; stores 32 bits
	mov	eax,[rsp]		; 32-bit load zero-extends into RAX
	add	rsp,8
	ret

; Points the TSS descriptor of a GDT at a TSS, then loads that GDT and the task register.
; In: RDI = address of the GDT to patch and load
;     RSI = address of the TSS
; Preserves RBX. Clobbers RAX, RDX, RDI.
; NOTE(review): gdt_data.gdt2 is shared and is patched for the duration of the
; LGDT; confirm that callers never run this on two processors at once.
ProcessorInstallTSS:
	push	rbx

	; Set the location of the TSS in the GDT.
	; The descriptor sits at offset 56 (selector 0x38); its base field is
	; scattered over bytes 2-3, 4, 7 and 8-11. The 8-byte store also zeroes bytes 12-15.
	mov	rax,rdi
	mov	rbx,rsi
	mov	[rax + 56 + 2],bx
	shr	rbx,16
	mov	[rax + 56 + 4],bl
	shr	rbx,8
	mov	[rax + 56 + 7],bl
	shr	rbx,8
	mov	[rax + 56 + 8],rbx

	; Flush the GDT.
	; The base field of the GDTR image at gdt_data.gdt is temporarily replaced
	; with RDI, used for LGDT, and then put back.
	mov	rax,gdt_data.gdt2
	mov	rdx,[rax]
	mov	[rax],rdi
	mov	rdi,gdt_data.gdt
	lgdt	[rdi]
	mov	[rax],rdx

	; Flush the TSS.
	mov	ax,0x38
	ltr	ax

	pop	rbx
	ret

; Copies memory with REP MOVSB while a flag is set in the current thread.
; In:  RDI = destination, RSI = source, RDX = byte count
; Out: AL = 1 if the copy ran to completion, AL = 0 if execution was resumed at .error.
; Clobbers RCX, RSI, RDI, R8.
; NOTE(review): nothing in this file jumps to .error; presumably the page fault
; handler redirects execution there using the address left in R8 -- confirm in the handler.
MMArchSafeCopy:
	call	GetCurrentThread
	; RAX = current thread; it stays valid throughout because REP MOVSB does not touch RAX.
	mov	byte [rax + 0],1 ; see definition of Thread
	mov	rcx,rdx
	mov	r8,.error ; where to jump to if we get a page fault
	rep	movsb
	mov	byte [rax + 0],0
	mov	al,1
	ret
	.error: ; we got a page fault in a user address, return false
	mov	byte [rax + 0],0
	mov	al,0
	ret

; Resets the machine by writing command 0xFE to I/O port 0x64. Never returns.
ProcessorReset:
	; Wait until bit 1 of the status byte read from port 0x64 is clear.
	.wait_until_idle:
	in	al,0x64
	test	al,2
	jnz	.wait_until_idle

	mov	al,0xFE
	out	0x64,al

	; Spin until the reset takes effect.
	.hang:
	jmp	.hang

; Trampoline into KThreadTerminate: lowers RSP by 8 and jumps instead of calling,
; so no return address is pushed.
; NOTE(review): presumably the 8 bytes give KThreadTerminate the stack alignment
; it would have seen after a CALL -- confirm where this address is installed.
_KThreadTerminate:
	sub	rsp,8
	jmp	KThreadTerminate

; Waits for another party to flip bit 63 of timeStampCounterSynchronizationValue,
; then loads the low 63 bits of that variable into this processor's time stamp
; counter (MSR 0x10). Called from ProcessorAPStartup.
; Clobbers RAX, RBX, RCX, RDX and the flags. RBX is not saved.
SynchronizeTimeStampCounter:
	mov	rdx,[timeStampCounterSynchronizationValue]
	mov	rcx,0x8000000000000000
	.loop:
	; Spin while bit 63 of the current value still equals bit 63 of the initial snapshot in RDX.
	mov	rbx,rdx
	mov	rax,[timeStampCounterSynchronizationValue]
	xor	rbx,rax
	test	rbx,rcx
	jz	.loop
	; RCX becomes 0x7FFFFFFFFFFFFFFF: strip the toggle bit from the value.
	sub	rcx,1
	and	rax,rcx
	; WRMSR takes the value split as EDX:EAX.
	mov	ecx,0x10
	mov	rdx,rax
	shr	rdx,32
	wrmsr
	ret
	; Exported. Bit 63 is the toggle polled above; bits 0-62 are the counter value to load.
	timeStampCounterSynchronizationValue: dq 0

; Application processor entry trampoline. It starts in 16-bit mode and is
; written to run from physical address 0x10000: the far jumps rebase their
; targets to 0x10000, and DS = 0x1000 makes [0xNNN] refer to 0x10000 + 0xNNN.
; Fixed locations used by this code:
;   0x10FB0  argument passed to SetupProcessor2
;   0x10FC0  progress byte: 1 = this processor has started, 2 = it is set up
;   0x10FD0  initial RSP
;   0x10FE0  GDTR image loaded once in 64-bit mode
;   0x10FF0  CR3 value
;   0x11000  where the early LGDT expects a copy of gdt_data
[bits 16]
ProcessorAPStartup: ; This function must be less than 4KB in length (see drivers/acpi.cpp)
	mov	ax,0x1000
	mov	ds,ax
	mov	byte [0xFC0],1 ; Indicate we've started.
	mov	eax,[0xFF0]
	mov	cr3,eax
	lgdt	[0x1000 + gdt_data.gdt - gdt_data]
	; Set CR0 bit 0 (protected mode) and jump to the 32-bit code selector 0x08.
	mov	eax,cr0
	or	eax,1
	mov	cr0,eax
	jmp	0x8:dword (.pmode - ProcessorAPStartup + 0x10000)
[bits 32]
	.pmode:
	; CR4 bit 5, then MSR 0xC0000080 bit 8, then CR0 bit 31, in that order.
	mov	eax,cr4
	or	eax,32
	mov	cr4,eax
	mov	ecx,0xC0000080
	rdmsr
	or	eax,256
	wrmsr
	mov	eax,cr0
	or	eax,0x80000000
	mov	cr0,eax
	; Far jump through the 64-bit code selector 0x48.
	jmp	0x48:(.start_64_bit_mode - ProcessorAPStartup + 0x10000)
[bits 64]
	.start_64_bit_mode:
	; Leave the low-memory copy: continue at the address this label was linked at.
	mov	rax,.start_64_bit_mode2
	jmp	rax
	.start_64_bit_mode2:
	mov	rax,0x50
	mov	ds,rax
	mov	es,rax
	mov	ss,rax
	mov	rax,0x63
	mov	fs,rax
	mov	gs,rax
	lgdt	[0x10FE0]
	mov	rsp,[0x10FD0]
	call	SetupProcessor1
	call	SynchronizeTimeStampCounter
	mov	rdi,[0x10FB0]
	call	SetupProcessor2
	mov	byte [0x10FC0],2 ; Indicate the BSP can start the next processor.
	and	rsp,~0xF
	jmp	ProcessorReady

; Global descriptor table template, followed by a GDTR image describing it.
; Every 8-byte entry is written as: dd = limit 0-15 and base 0-15,
; db = base 16-23, dw = access byte (low) and flags/limit 16-19 (high), db = base 24-31.
; The trailing comments give the selector value; those ending in 3 or B include RPL 3.
; The TSS entry is 16 bytes long, so the selectors jump from 0x38 to 0x48.
gdt_data:
	.null_entry:	dq 0
	.code_entry:	dd 0xFFFF	; 0x08
			db 0
			dw 0xCF9A
			db 0
	.data_entry:	dd 0xFFFF	; 0x10
			db 0
			dw 0xCF92
			db 0
	.code_entry_16:	dd 0xFFFF	; 0x18
			db 0
			dw 0x0F9A
			db 0
	.data_entry_16:	dd 0xFFFF	; 0x20
			db 0
			dw 0x0F92
			db 0
	.user_code:	dd 0xFFFF	; 0x2B
			db 0
			dw 0xCFFA
			db 0
	.user_data:	dd 0xFFFF	; 0x33
			db 0
			dw 0xCFF2
			db 0
	; The base of this entry is filled in by ProcessorInstallTSS.
	.tss:		dd 0x68		; 0x38
			db 0
			dw 0xE9
			db 0
			dq 0
	.code_entry64:	dd 0xFFFF	; 0x48
			db 0
			dw 0xAF9A
			db 0
	.data_entry64:	dd 0xFFFF	; 0x50
			db 0
			dw 0xAF92
			db 0
	.user_code64:	dd 0xFFFF	; 0x5B
			db 0
			dw 0xAFFA
			db 0
	.user_data64:	dd 0xFFFF	; 0x63
			db 0
			dw 0xAFF2
			db 0
	.user_code64c:	dd 0xFFFF	; 0x6B
			db 0
			dw 0xAFFA
			db 0
	; GDTR image: 16-bit limit, then the 64-bit base. The default base 0x11000 is
	; the address used by ProcessorAPStartup; ProcessorInstallTSS swaps in another
	; base temporarily.
	.gdt:		dw (gdt_data.gdt - gdt_data - 1)
	.gdt2:		dq 0x11000

; Defines and exports __x86_indirect_thunk_<reg>, a thunk that simply jumps to
; the address held in <reg>.
; NOTE(review): these names match the external thunks a C/C++ compiler
; references when built with -mindirect-branch=thunk-extern -- confirm against the build flags.
%macro CALL_REGISTER_INDIRECT 1
[global __x86_indirect_thunk_%1]
__x86_indirect_thunk_%1:
	jmp	%1
%endmacro

; One thunk per general purpose register, except RSP.
CALL_REGISTER_INDIRECT rax
CALL_REGISTER_INDIRECT rbx
CALL_REGISTER_INDIRECT rcx
CALL_REGISTER_INDIRECT rdx
CALL_REGISTER_INDIRECT rsi
CALL_REGISTER_INDIRECT rdi
CALL_REGISTER_INDIRECT rbp
CALL_REGISTER_INDIRECT r8
CALL_REGISTER_INDIRECT r9
CALL_REGISTER_INDIRECT r10
CALL_REGISTER_INDIRECT r11
CALL_REGISTER_INDIRECT r12
CALL_REGISTER_INDIRECT r13
CALL_REGISTER_INDIRECT r14
CALL_REGISTER_INDIRECT r15