內核頁表
和用戶態頁表不同,在系統初始化的時候,我們就要創建內核頁表了。我們從內核頁表的根 swapper_pg_dir 開始找線索。
// arch/x86/include/asm/pgtable_64.h extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; // 對應(yīng)直接映射區(qū) extern pmd_t level2_kernel_pgt[512]; // 對應(yīng)內(nèi)核代碼區(qū) extern pmd_t level2_fixmap_pgt[512]; // 對應(yīng)固定映射區(qū) extern pmd_t level2_ident_pgt[512]; extern pte_t level1_fixmap_pgt[512]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt // 指向內(nèi)核最頂級(jí)的目錄 pgd
內核頁表的頂級目錄 init_top_pgt,定義在 __INITDATA 里面。咱們講過 ELF 的格式,也講過虛擬內存空間的布局。它們都有代碼段,還有一些初始化了的全局變量,放在 .init 區域。這些說的就是這個區域。
可以看到,頁表的根其實是全局變量,這就使得我們初始化的時候,甚至內存管理還沒有初始化的時候,很容易就可以定位到。
/* arch/x86/kernel/head_64.S */
	__INITDATA

/*
 * Statically initialised kernel page tables.
 * .quad emits the contents of one entry; .org jumps to the given position.
 */
NEXT_PAGE(init_top_pgt)
	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.org	init_top_pgt + PGD_PAGE_OFFSET*8, 0
	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.org	init_top_pgt + PGD_START_KERNEL*8, 0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(level3_ident_pgt)
	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.fill	511, 8, 0

NEXT_PAGE(level2_ident_pgt)
	/*
	 * Since I easily can, map the first 1G.
	 * Don't set NX because code runs from these pages.
	 */
	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)

NEXT_PAGE(level3_kernel_pgt)
	.fill	L3_START_KERNEL,8,0
	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(level2_kernel_pgt)
	/*
	 * 512 MB kernel mapping. We spend a full page on this pagetable
	 * anyway.
	 *
	 * The kernel code+data+bss must not be bigger than that.
	 *
	 * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
	 *  If you want to increase this then increase MODULES_VADDR
	 *  too.)
	 */
	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)

NEXT_PAGE(level2_fixmap_pgt)
	.fill	506,8,0
	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
	.fill	5,8,0

NEXT_PAGE(level1_fixmap_pgt)
	/* One full page of empty entries: 512 slots of 8 bytes each. */
	.fill	512,8,0
/*
 * __PAGE_OFFSET_BASE: start address of the kernel in the virtual address space
 * __START_KERNEL_map: start address of the kernel text segment in the virtual address space
 *
 * The symbols below are the table indices of those two addresses.
 */
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
如果是用戶態進程頁表,會有 mm_struct 指向進程頂級目錄 pgd,對于內核來講,也定義了一個 mm_struct,指向 swapper_pg_dir。
struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) };
定義完了內核頁表,接下來是初始化內核頁表,在系統啟動的時候 start_kernel 會調用 setup_arch。
在 setup_arch 中,load_cr3(swapper_pg_dir) 說明內核頁表要開始起作用了,并且刷新了 TLB,初始化 init_mm 的成員變量,最重要的就是 init_mem_mapping。最終它會調用 kernel_physical_mapping_init。
在 kernel_physical_mapping_init 里,我們先通過 __va 將物理地址轉換為虛擬地址,然后再創建虛擬地址和物理地址的映射頁表。
/*
 * Excerpt of setup_arch(): switches to swapper_pg_dir, records the kernel
 * image boundaries in init_mm and then calls init_mem_mapping().
 * Lines consisting of "......" mark code elided from this excerpt.
 */
void __init setup_arch(char **cmdline_p)
{
	/*
	 * copy kernel address range established so far and switch
	 * to the proper swapper page table
	 */
	clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			initial_page_table + KERNEL_PGD_BOUNDARY,
			KERNEL_PGD_PTRS);

	load_cr3(swapper_pg_dir);
	__flush_tlb_all();
	......
	init_mm.start_code = (unsigned long) _text;
	init_mm.end_code = (unsigned long) _etext;
	init_mm.end_data = (unsigned long) _edata;
	init_mm.brk = _brk_end;
	......
	init_mem_mapping();
	......
}

/*
 * Create page table mapping for the physical memory for specific physical
 * addresses. The virtual and physical addresses have to be aligned on PMD level
 * down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask)
{
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	/* Work in virtual addresses: translate both ends of the range with __va(). */
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	/* Walk the range one top-level (PGDIR_SIZE) step at a time. */
	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		/* Entry already set: fill in the table it refers to. */
		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask);
			continue;
		}

		/* Entry empty: allocate a page, fill it, then hook it in. */
		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask);

		p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
	}

	__flush_tlb_all();

	return paddr_last;
}
vmalloc 和 kmap_atomic 原理
在用戶態可以通過 malloc 函數分配內存,當然 malloc 在分配比較大的內存的時候,底層調用的是 mmap,當然也可以直接通過 mmap 做內存映射,在內核里面也有相應的函數。
在虛擬地址空間里面,有個 vmalloc 區域,從 VMALLOC_START 開始到 VMALLOC_END,可以用于映射一段物理內存。
/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 *
 * Obtains enough pages from the page level allocator to cover @size
 * and maps them into contiguous kernel virtual space. The request is
 * issued with NUMA_NO_NODE and GFP_KERNEL.
 *
 * Use __vmalloc() instead when tight control over the page level
 * allocator and the protection flags is required.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL);
}

/*
 * Forwards to __vmalloc_node_range(), passing VMALLOC_START and
 * VMALLOC_END as the address range and 0 for the argument that
 * follows @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	return __vmalloc_node_range(size, align,
				    VMALLOC_START, VMALLOC_END,
				    gfp_mask, prot, 0, node, caller);
}
void *kmap_atomic_prot(struct page *page, pgprot_t prot) { ...... // 如果是 64 位沒有高端地址的,就調(diào)用 page_address,里面會(huì)調(diào)用 lowmem_page_address // 其實(shí)低端內(nèi)存的映射,會(huì)直接使用 __va 進(jìn)行臨時(shí)映射 if (!PageHighMem(page)) return page_address(page); ...... // 如果是 32 位有高端地址的,就需要調(diào)用 set_pte 通過內(nèi)核頁表進(jìn)行臨時(shí)映射 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, mk_pte(page, prot)); ...... return (void *)vaddr; } void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)
可以看出,kmap_atomic 和 vmalloc 不同。
kmap_atomic 發現,沒有頁表的時候,就直接創建頁表進行映射了。
而 vmalloc 沒有,它只分配了內核的虛擬地址。所以,訪問它的時候,會產生缺頁異常。
內核態的缺頁異常還是會調用 do_page_fault,但是會走到 vmalloc_fault。這個函數并不復雜,主要用于關聯內核頁表項。
/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 *
 * Returns 0 when the faulting address lies in [VMALLOC_START, VMALLOC_END),
 * vmalloc_sync_one() yields a pmd for it and the pte under that pmd is
 * present; returns -1 otherwise.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3_pa();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}
浙公網安備 33010602011771號