Linux内核内存管理 - 从内核启动过程透视内存管理

这是<Linux内核内存管理>系列的第三篇

第一篇为内核内存管理过程知识点的简单梳理

第二篇介绍了内核的数据结构

前言

以Intel X64 CPU为例，Linux的初始化可大致分为如下几个过程：

Loader跳转到内核后的实模式(Real Mode)
32位保护模式跳转到64位长模式
64位长模式下解压内核
解压内核后，建立新的页表映射，并跳转到Arch（平台）相关C代码
执行平台无关初始化代码

内存管理在以上过程中扮演了重要角色，包括内存布局规划、分段管理、页表配置、内核移动等。

本文使用Qemu模拟，基于Linux v5.13.9版本，按顺序介绍以上过程中的内存管理。

实模式(Real Mode)

使用如下命令启动编译好的64位内核：

1	qemu-system-x86_64 -kernel arch/x86/boot/bzImage -nographic -append "console=ttyS0 nokaslr" -s -S

其中：

内核参数“console=ttyS0 nokaslr”的主要作用为指定内核控制台，以及关闭KASLR功能（主要原因是为了调试方便，开启KASLR后每次开机内核解压的地址都是随机的）。
而-s和-S参数主要是为了GDB调试Qemu之用。

执行上述命令后，便得到如下图的内核地址分布。
实模式内存分布

根据内核文档Linux/x86 Boot Protocol，任何Boot Loader(Grub/Lilo/…)加载X86内核，均要遵守该协议。内核发展至今，该协议版本已经发展到了2.15。图中X为Boot Loader加载内核的起始偏移，在Qemu平台上该偏移为 0x10000。加载后，内核Boot Sector开始执行，执行入口点为 _start。参考Linker Script arch/x86/boot/setup.ld。

OUTPUT_FORMAT("elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(_start)

SECTIONS
{
	. = 0;
	.bstext		: { *(.bstext) }
	.bsdata		: { *(.bsdata) }
....

这里会直接跳转到start_of_setup开始执行。

#arch/x86/boot/header.S
	.globl	_start
_start:
		.byte	0xeb		# short (2-byte) jump
		.byte	start_of_setup-1f

	.section ".entrytext", "ax"
start_of_setup:
# Force %es = %ds
	movw	%ds, %ax
	movw	%ax, %es
	cld

	movw	%ss, %dx
	cmpw	%ax, %dx	# %ds == %ss?
	movw	%sp, %dx
	je	2f		# -> assume %sp is reasonably set

	# Invalid %ss, make up a new stack
	movw	$_end, %dx
	testb	$CAN_USE_HEAP, loadflags
	jz	1f
	movw	heap_end_ptr, %dx
1:	addw	$STACK_SIZE, %dx
	jnc	2f
	xorw	%dx, %dx	# Prevent wraparound

2:	# Now %dx should point to the end of our stack space
	andw	$~3, %dx	# dword align (might as well...)
	jnz	3f
	movw	$0xfffc, %dx	# Make sure we're not zero
3:	movw	%ax, %ss
	movzwl	%dx, %esp	# Clear upper half of %esp
	sti			# Now we should have a working stack

# We will have entered with %cs = %ds+0x20, normalize %cs so it is on par with the other segments.
	pushw	%ds
	pushw	$6f
	lretw
6:
# Check signature at end of setup
	cmpl	$0x5a5aaa55, setup_sig
	jne	setup_bad

# Zero the bss
	movw	$__bss_start, %di
	movw	$_end+3, %cx
	xorl	%eax, %eax
	subw	%di, %cx
	shrw	$2, %cx
	rep; stosl

# Jump to C code (should not return)
	calll	main

以上代码会为实模式代码的执行清除方向标志位（cld），并为C代码的执行分配堆空间和栈空间。接着跳转到标号6处执行，检查内核代码加载的正确性。这里说明一下，lretw及其之前两行汇编语句共同完成一次远返回（far return）：前两行依次将段值（%ds）和返回地址（标号6）压入栈内，lretw再将它们分别弹出到IP和CS，参考<Intel® 64 and IA-32 Architectures Software Developer’s Manual>。如注释所述，使用lret的目的是为了重置CS寄存器的值，确保与其他段寄存器一致。可参考Intel手册中ret指令的说明：

When executing a far return, the processor pops the return instruction pointer from the top of the stack into the EIP
register, then pops the segment selector from the top of the stack into the CS register. The processor then begins
program execution in the new code segment at the new instruction pointer.

接着清空BSS段后跳转到main函数执行。

	/* First, copy the boot header into the "zeropage" */
	copy_boot_params();
	console_init();
	if (cmdline_find_option_bool("debug"))
		puts("early console in setup code\n");
	init_heap();
	if (validate_cpu()) {
		puts("Unable to boot - please use a kernel appropriate "
		     "for your CPU.\n");
		die();
	}
	set_bios_mode();
	detect_memory();
	keyboard_init();
	query_ist();
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
	query_apm_bios();
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
	query_edd();
#endif
	set_video();
	go_to_protected_mode();

main函数的注释比较清楚，我们这里只讲一下copy_boot_params/detect_memory/go_to_protected_mode:

copy_boot_params将内存中boot_params的信息（见图“实模式内存分布”）拷贝到全局变量boot_params内。boot_params存放的就是Linux Boot Protocol定义的那些参数。有些栏位是在编译过程中改写的，有些栏位由Boot Loader填写。boot_params（包括内核cmdline）会贯穿内核初始化的各个子过程。
detect_memory主要是使用e820获取内存的基础布局，存储到boot_params的指定区域（boot_params.e820_table和boot_params.e820_entries）。
go_to_protected_mode主要是打开A20地址线（A20 Gate，使CPU可以访问1MB以上的内存），屏蔽中断，做IDT/GDT表的配置，然后打开保护模式，并跳转到32位代码开始执行。代码如下：

//arch/x86/boot/pm.c
void go_to_protected_mode(void)
{
	realmode_switch_hook();

	/* Enable the A20 gate */
	if (enable_a20()) {
		puts("A20 gate not responding, unable to boot...\n");
		die();
	}

	reset_coprocessor();

	mask_all_interrupts();

	setup_idt();
	setup_gdt();
	protected_mode_jump(boot_params.hdr.code32_start,
			    (u32)&boot_params + (ds() << 4));
}

protected_mode_jump是一段汇编代码，定义在arch/x86/boot/pmjump.S，这里不做过多分析。其主要工作就是置位CR0寄存器的PE（Protection Enable）位，并执行跳转指令跳转到32位代码(.Lin_pm32标号)处执行。

#arch/x86/boot/pmjump.S
/*
 * void protected_mode_jump(u32 entrypoint, u32 bootparams);
 */
SYM_FUNC_START_NOALIGN(protected_mode_jump)
........

	movl	%cr0, %edx
	orb	$X86_CR0_PE, %dl	# Protected mode
	movl	%edx, %cr0

	# Transition to 32-bit mode
	.byte	0x66, 0xea		# ljmpl opcode
2:	.long	.Lin_pm32		# offset
	.word	__BOOT_CS		# segment
SYM_FUNC_END(protected_mode_jump)

SYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32)
	# Set up data segments for flat 32-bit mode
	movl	%ecx, %ds
	movl	%ecx, %es
	movl	%ecx, %fs
	movl	%ecx, %gs
	movl	%ecx, %ss
	# The 32-bit code sets up its own stack, but this way we do have
	# a valid stack if some debugging hack wants to use it.
	addl	%ebx, %esp

	# Set up TR to make Intel VT happy
	ltr	%di

	# Clear registers to allow for future extensions to the
	# 32-bit boot protocol
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	xorl	%ebx, %ebx
	xorl	%ebp, %ebp
	xorl	%edi, %edi

	# Set up LDTR to make Intel VT happy
	lldt	%cx

	jmpl	*%eax			# Jump to the 32-bit entrypoint
SYM_FUNC_END(.Lin_pm32)

32位代码伊始就是将各个段寄存器重新设置为__BOOT_DS。段寄存器的内容为指向GDT某项的段选择子，而__BOOT_DS对应的是GDT中索引为3的表项（GDT_ENTRY_BOOT_DS）。此时GDT的表项可以到arch/x86/boot/pm.c查找，大概定义了Base为0、大小为4G的段，这足以覆盖内核初始化32位代码执行的区域。有关GDT表及段选择子的相关知识，可以查阅<Intel® 64 and IA-32 Architectures Software Developer’s Manual>中Volume 3，CHAPTER 3 PROTECTED-MODE MEMORY MANAGEMENT一章。最后做一些寄存器内容的清理，就跳转到32位内核的起始地址执行。

该起始地址，是protected_mode_jump函数的第一个参数-boot_params.hdr.code32_start。在我们的QEMU环境中这个值为0x100000

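为什么入口地址是存储在eax寄存器呢？这里就需要了解调用约定（calling convention）的知识。需要注意，实模式的setup代码并不是按System V AMD64 ABI编译的（在该ABI下第一个参数通过rdi传递），而是使用-mregparm=3选项编译的（见arch/x86/Makefile中的REALMODE_CFLAGS）：前三个参数依次通过eax、edx、ecx传递，因此protected_mode_jump的第一个参数entrypoint位于eax中，第二个参数bootparams位于edx中。ABI指的是Application Binary Interface，根据程序运行的Arch不同而有不同的定义；64位内核的C代码则遵守System V AMD64 ABI。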

32位保护模式跳转到64位长模式

startup_32

0x100000存放的是32位代码起始地址，具体布局可以参考链接脚本：arch/x86/boot/compressed/vmlinux.lds.S

链接脚本，即Linker Script，这是告诉链接器目标文件该如何链接的脚本。一般GCC编译我们不会指定链接脚本，这是因为其有默认的链接脚本。

#ifdef CONFIG_X86_64
OUTPUT_ARCH(i386:x86-64)
ENTRY(startup_64)
#else
OUTPUT_ARCH(i386)
ENTRY(startup_32)
#endif

SECTIONS
{
	/* Be careful parts of head_64.S assume startup_32 is at
	 * address 0.
	 */
	. = 0;
	.head.text : {
		_head = . ;
		HEAD_TEXT
		_ehead = . ;
	}
	.rodata..compressed : {
		*(.rodata..compressed)
	}
	.text :	{
		_text = .; 	/* Text */
		*(.text)
		*(.text.*)
		_etext = . ;
	}
	.rodata : {
		_rodata = . ;
		*(.rodata)	 /* read-only data */
		*(.rodata.*)
		_erodata = . ;
	}
	.data :	{
		_data = . ;
		*(.data)
		*(.data.*)
		*(.bss.efistub)
		_edata = . ;
	}
	. = ALIGN(L1_CACHE_BYTES);
	.bss : {
		_bss = . ;
		*(.bss)
		*(.bss.*)
		*(COMMON)
		. = ALIGN(8);	/* For convenience during zeroing */
		_ebss = .;
	}
#ifdef CONFIG_X86_64
       . = ALIGN(PAGE_SIZE);
       .pgtable : {
		_pgtable = . ;
		*(.pgtable)
		_epgtable = . ;
	}
#endif
	. = ALIGN(PAGE_SIZE);	/* keep ZO size page aligned */
	_end = .;

经过ld链接、且qemu加载后，得到下图左侧的内存布局。从地址0x100000开始，首先是入口代码（.head.text，包含startup_32/startup_64等），之后摆放了压缩的内核（.rodata..compressed）。其后分别是解压程序自身（arch/x86/boot/compressed）的代码段、只读数据段、数据段、未初始化数据段，以及切换到长模式时使用的早期页表（.pgtable）。
32位下内存分布

从链接脚本可以看出：32位代码的入口地址是startup_32。代码首先清中断，加载新GDT表，同时重置各段寄存器，建立堆栈。

需要注意代码定义了一个宏rva，它的主要作用是计算符号相对于startup_32的偏移（相对地址）。这样无论内核被加载到什么位置，同样的代码都可以正确执行。

#arch/x86/boot/compressed/head_64.S
#define rva(X) ((X) - startup_32)

	.code32
SYM_FUNC_START(startup_32)
	cld
	cli

	leal	(BP_scratch+4)(%esi), %esp
	call	1f
1:	popl	%ebp
	subl	$ rva(1b), %ebp

	leal	rva(gdt)(%ebp), %eax
	movl	%eax, 2(%eax)
	lgdt	(%eax)

	/* Load segment registers with our descriptors */
	movl	$__BOOT_DS, %eax
	movl	%eax, %ds
	movl	%eax, %es
	movl	%eax, %fs
	movl	%eax, %gs
	movl	%eax, %ss

	leal	rva(boot_stack_end)(%ebp), %esp

	pushl	$__KERNEL32_CS
	leal	rva(1f)(%ebp), %eax
	pushl	%eax
	lretl
1:
	call	startup32_load_idt
	call	verify_cpu
	testl	%eax, %eax
	jnz	.Lno_longmode

#ifdef CONFIG_RELOCATABLE
	movl	%ebp, %ebx
......
	movl	BP_kernel_alignment(%esi), %eax
	decl	%eax
	addl	%eax, %ebx
	notl	%eax
	andl	%eax, %ebx
	cmpl	$LOAD_PHYSICAL_ADDR, %ebx
	jae	1f
#endif
	movl	$LOAD_PHYSICAL_ADDR, %ebx
1:

	addl	BP_init_size(%esi), %ebx
	subl	$ rva(_end), %ebx

	/* Enable PAE mode */
	movl	%cr4, %eax
	orl	$X86_CR4_PAE, %eax
	movl	%eax, %cr4

加载IDT并通过verify_cpu校验CPU后，代码会计算出压缩内核要搬移到的位置并存入ebx，用于原地(in-place)解压，随后打开PAE模式。上面代码中BP_kernel_alignment(%esi) 的主要作用是从boot_params对应区域取出对应的值。我们再次打开Linux/x86 Boot Protocol和Boot Protocol附属栏位查看这些栏位的说明:

偏移/所占字节数	参数	描述
0230/4	kernel_alignment	Physical addr alignment required for kernel
0260/4	init_size	Linear memory required during initialization
01E4/4	scratch	Scratch field for the kernel setup code

其中init_size存放的是内核初始化、解压所需要的空间大小，其目的是为压缩内核的原地（In-place）解压预留足够的空间。这部分大小的计算可以参考内核源码arch/x86/boot/header.S的说明（本人也还没吃透,待补充）。
紧接着内核为4GB大小的内存建立每页大小为2MB的内核页表（见图“32位下内存分布”右侧）并加载页表目录地址（pgtable）到CR3寄存器，并开启64位长模式。参考Wiki:

当处于长模式（Long mode）时，64位应用程序（或者是操作系统）可以使用64位指令和寄存器，而32位程序将以一种兼容子模式运行。

4GB大小足以执行内核解压等动作。接着内核将64位地址startup_64压入栈，开启分页，并执行lret指令跳转到startup_64处执行。

此处我们省略了SEV功能的检查，这是AMD CPU的特性。此处不做分析。

startup_64

startup_64 的开始同样会清中断，清理各段寄存器。同时计算压缩内核要移动到的地址，即LOAD_PHYSICAL_ADDR + INIT_SIZE - 压缩内核的长度（rva(_end)）。此处处理与startup_32相同。

可能大家会疑惑，为什么这段代码在startup_32做了，此处还要做一遍。主要原因代码内有描述，内核可能会被64位Loader直接加载并从startup_64处执行。

接着内核加载空的IDT表，检查是否需要开启五级页表，并做对应处理。紧接着清除EFLAGS寄存器后，将压缩内核移动到In-place解压的位置(LOAD_PHYSICAL_ADDR + INIT_SIZE - 压缩内核的长度)，紧接着重新加载移动过位置的GDT表。之后跳转到移动后的 .Lrelocated 地址处开始执行。

.Lrelocated

.Lrelocated 代码最主要的作用有三个：

加载IDT：此时IDT的内容只开启了Page Fault Trap，对应的处理函数是boot_page_fault，其实现在arch/x86/boot/compressed/ident_map_64.c, 主要作用就是做一些基础检查后，为对应缺页的地址建立一致性映射。
创建一致性映射：主要为[_head, _end], bootparam 和 boot cmdline三个区域建立一致性映射。
解压内核：解压内核本文不做分析。提及一点就是如果开启了KASLR，解压内核前，会计算一个随机偏移生成内核真正的解压地址。

解压完内核后跳转到解压后内核的入口地址，即arch/x86/kernel/head_64.S的startup_64标号处。

内核解压后

startup_64 代码如下：

SYM_CODE_START_NOALIGN(startup_64)
	UNWIND_HINT_EMPTY
	leaq	(__end_init_task - SIZEOF_PTREGS)(%rip), %rsp

	leaq	_text(%rip), %rdi
	pushq	%rsi
	call	startup_64_setup_env
	popq	%rsi

	pushq	$__KERNEL_CS
	leaq	.Lon_kernel_cs(%rip), %rax
	pushq	%rax
	lretq

.Lon_kernel_cs:
	UNWIND_HINT_EMPTY

	/* Sanitize CPU configuration */
	call verify_cpu

	leaq	_text(%rip), %rdi
	pushq	%rsi
	call	__startup_64
	popq	%rsi

	addq	$(early_top_pgt - __START_KERNEL_map), %rax
	jmp 1f
SYM_CODE_END(startup_64)

以上代码在配置好栈之后，调用startup_64_setup_env配置Startup GDT和IDT。GDT表的内容如下：

static struct desc_struct startup_gdt[GDT_ENTRIES] = {
	[GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
	[GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
	[GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
};

Startup GDT中的段描述符，都是0地址开始的4GB大小。Startup IDT（也叫bringup IDT）主要处理AMD 架构下VMM Communication异常，该异常与虚拟机有关。
之后内核继续执行到verify_cpu这个汇编函数，其定义在verify_cpu.S,其主要是使用cpuid指令得到CPU对长模式和SSE指令集的支持状况。
检查完后，内核跳转执行 __startup_64,其主要作用是重新建立内核早期4级或者5级页表，此时需要考虑KASLR产生的随机偏移，因此我们可以看到此函数调用了多次fixup_pointer函数进行页表项纠正。
页表定义在arch/x86/kernel/head_64.S，如下：

SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
	.fill	512,8,0
	.fill	PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)

SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
SYM_DATA_END(early_dynamic_pgts)

SYM_DATA(early_recursion_flag, .long 0)

	.data

#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
	.org    init_top_pgt + L4_PAGE_OFFSET*8, 0
	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
	.org    init_top_pgt + L4_START_KERNEL*8, 0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
	.fill	PTI_USER_PGD_FILL,8,0
SYM_DATA_END(init_top_pgt)

SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
	.fill	511, 8, 0
SYM_DATA_END(level3_ident_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
SYM_DATA_END(level2_ident_pgt)
#else
SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
	.fill	512,8,0
	.fill	PTI_USER_PGD_FILL,8,0
SYM_DATA_END(init_top_pgt)
#endif

#ifdef CONFIG_X86_5LEVEL
SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
	.fill	511,8,0
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level4_kernel_pgt)
#endif

SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
	.fill	L3_START_KERNEL,8,0
	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
SYM_DATA_END(level3_kernel_pgt)

SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
SYM_DATA_END(level2_kernel_pgt)

SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
	.fill	(512 - 4 - FIXMAP_PMD_NUM),8,0
	pgtno = 0
	.rept (FIXMAP_PMD_NUM)
	.quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
		+ _PAGE_TABLE_NOENC;
	pgtno = pgtno + 1
	.endr
	/* 6 MB reserved space + a 2MB hole */
	.fill	4,8,0
SYM_DATA_END(level2_fixmap_pgt)

SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
	.rept (FIXMAP_PMD_NUM)
	.fill	512,8,0
	.endr
SYM_DATA_END(level1_fixmap_pgt)

比较难理解，我们用图翻译一下：
内核早期页表

图中为内核代码建立了早期映射，这样，就可以愉快地执行内核代码了。（当然，也并不一定是愉快执行内核代码，后面我们也会看到，内核需要注册IDT表项来处理Page Fault Trap）。

/* Switch to new page-table */
movq	%rax, %cr3

/* Ensure I am executing from virtual addresses */
movq	$1f, %rax
ANNOTATE_RETPOLINE_SAFE
jmp	*%rax

__startup_64执行过后我们跳过一些SEV的处理，便开始使用新的内核页表。此后我们就跳转到__START_KERNEL_map开始的虚拟地址执行了。紧接着重新初始化GDT、设置段寄存器、建立初始化运行时的栈、建立IDT。这中间有一段代码：

	/* Set up %gs.
	 *
	 * The base of %gs always points to fixed_percpu_data. If the
	 * stack protector canary is enabled, it is located at %gs:40.
	 * Note that, on SMP, the boot cpu uses init data section until
	 * the per cpu areas are set up.
	 */
	movl	$MSR_GS_BASE,%ecx
	movl	initial_gs(%rip),%eax
	movl	initial_gs+4(%rip),%edx
	wrmsr
..................
	pushq	$.Lafter_lret	# put return address on stack for unwinder
	xorl	%ebp, %ebp	# clear frame pointer
	movq	initial_code(%rip), %rax
	pushq	$__KERNEL_CS	# set correct cs
	pushq	%rax		# target address in negative space
	lretq

......
SYM_DATA(initial_code,	.quad x86_64_start_kernel)

这段代码的作用是将initial_gs的值（即per CPU数据区的基地址）通过wrmsr写入MSR_GS_BASE这个64位的Model Specific Register（MSR），此后即可通过%gs访问per CPU变量。接着跳转到初始化C代码，即x86_64_start_kernel。

总结

本文重点分析了从内核被Loader加载一直执行到C代码入口的内存管理。一些主要的步骤：

开启保护模式
开启长模式
内核解压同时添加随机偏移
建立内核页表并跳转到虚拟地址执行

系列后续我们将分析执行到C代码入口之后的处理