I. Architecture and Memory Model
1. Architecture
- Current multiprocessor systems use one of two architectures:
- Non-Uniform Memory Access (NUMA): memory is divided into multiple memory nodes, and the time needed to access memory depends on the distance between the processor and the memory node (the sketch after this list shows how to read these distances from user space). NUMA is the mainstream architecture for mid-range and high-end servers.
- Symmetric Multi-Processing (SMP), also called Uniform Memory Access (UMA): every processor takes the same time to access memory. All processors have equal status; they are unequal only during kernel initialization: "processor 0 acts as the boot processor and initializes the kernel, while the other processors wait for initialization to finish."
- In practice a hybrid architecture may be used: SMP inside each NUMA node.
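As a concrete illustration of NUMA distance, the following user-space sketch queries the node topology through libnuma. This is a minimal sketch, assuming libnuma and its header numa.h are installed (link with -lnuma); numa_distance() reports the relative access cost between two nodes, where 10 means local.

```c
/* Minimal sketch: print the NUMA distance matrix with libnuma.
 * Build (assumption: libnuma installed): gcc numa_demo.c -o numa_demo -lnuma
 */
#include <stdio.h>
#include <numa.h>

int main(void)
{
	if (numa_available() < 0) {	/* kernel built without NUMA support */
		printf("NUMA is not available on this system\n");
		return 0;
	}

	int max = numa_max_node();	/* highest node number; 0 on a UMA/SMP machine */
	for (int i = 0; i <= max; i++) {
		for (int j = 0; j <= max; j++)
			printf("%4d", numa_distance(i, j)); /* 10 = local access */
		printf("\n");
	}
	return 0;
}
```

On an SMP (UMA) machine the matrix degenerates to the single entry 10, matching the statement above that all processors access memory at the same cost.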
2. Memory Models
- A memory model is the layout of physical memory as seen from the processor's point of view, and the kernel manages each model differently. The memory management subsystem supports three memory models:
- Flat Memory: the physical address space of memory is contiguous, with no holes.
- Discontiguous Memory: the physical address space contains holes; this model can handle the holes efficiently.
- Sparse Memory: the physical address space contains holes; if memory hot-plugging must be supported, sparse memory is the only choice.
- The memory management subsystem describes physical memory with a three-level structure: node, zone, and page, as shown by the sketch below.
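The node/zone/page hierarchy is visible from user space: /proc/zoneinfo starts each section with a line such as "Node 0, zone Normal". The following minimal sketch (standard C, assuming only that procfs is mounted) prints one line per (node, zone) pair.

```c
/* Minimal sketch: list the (node, zone) pairs the kernel reports. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[256];

	if (!f) {
		perror("/proc/zoneinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "Node", 4) == 0)	/* section header per zone */
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```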
II. Physical Memory Organization
1. Memory Nodes
- In a NUMA system, memory nodes are divided according to the distance between processors and memory. In a UMA system with discontiguous memory, a memory node is the level above zones: memory is divided by whether physical addresses are contiguous, and each block of contiguous physical memory is one memory node. The memory layout of a node is described by the pglist_data structure.
- The member node_mem_map points to the page descriptor array; every physical page has exactly one page descriptor. node_mem_map may not point to the first element of the array, because the page descriptor array must be aligned to 2 to the power (MAX_ORDER-1), where (MAX_ORDER-1) is the largest order the page allocator can allocate. The kernel source of pglist_data is analyzed below:
```c
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];		/* memory zone array */
	struct zonelist node_zonelists[MAX_ZONELISTS];	/* fallback zone lists */
	int nr_zones;					/* number of memory zones */
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;	/* page descriptor array (all models except sparse memory) */
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;	/* extended page attributes */
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
	/*
	 * Must be held any time you expect node_start_pfn,
	 * node_present_pages, node_spanned_pages or nr_zones to stay
	 * constant. Use pgdat_resize_lock()/pgdat_resize_unlock() to
	 * manipulate it without checking for CONFIG_MEMORY_HOTPLUG
	 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
	 * Nests above zone->lock and zone->span_seqlock.
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;		/* starting physical page frame number */
	unsigned long node_present_pages;	/* total physical pages (excluding holes) */
	unsigned long node_spanned_pages;	/* total pages spanned (including holes) */
	int node_id;				/* node identifier */
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	struct task_struct *kswapd;		/* protected by mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;
	int kswapd_failures;			/* number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_classzone_idx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
#endif
	/* Per-node reserve of pages that are not available to userspace allocations. */
	unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
	/* Zone reclaim becomes active if more unmapped pages exist. */
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)
	spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
#endif
	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec lruvec;
	unsigned long flags;
	ZONE_PADDING(_pad2_)
	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
```
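To see these pglist_data fields in action, here is a minimal kernel-module sketch, assuming a kernel of roughly the same era as the structure shown above (field names may differ on other versions). It walks every online node via the NODE_DATA() macro.

```c
/* Minimal kernel-module sketch: print per-node layout fields. */
#include <linux/module.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>

static int __init pgdat_demo_init(void)
{
	int nid;

	for_each_online_node(nid) {			/* iterate online node IDs */
		pg_data_t *pgdat = NODE_DATA(nid);	/* node ID -> pglist_data */

		pr_info("node %d: start_pfn=%lu present=%lu spanned=%lu nr_zones=%d\n",
			pgdat->node_id, pgdat->node_start_pfn,
			pgdat->node_present_pages, pgdat->node_spanned_pages,
			pgdat->nr_zones);
	}
	return 0;
}

static void __exit pgdat_demo_exit(void)
{
}

module_init(pgdat_demo_init);
module_exit(pgdat_demo_exit);
MODULE_LICENSE("GPL");
```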
2. Memory Zones
Each memory node is divided into memory zones. The kernel defines the zone types as follows:

```c
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific. Some examples:
	 *
	 *	Architecture			Limit
	 *	---------------------------------------
	 *	parisc, ia64, sparc		<4G
	 *	s390, powerpc			<2G
	 *	arm				various
	 *	alpha				unlimited or 0-16MB
	 *	i386, x86_64 and
	 *	multiple other arches		<16M
	 */
	ZONE_DMA,	/* DMA zone: for direct memory access */
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32-bit devices that
	 * can only do DMA to areas below 4G.
	 */
	ZONE_DMA32,	/* 64-bit systems only */
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,	/* normal zone: the linearly mapped region (ARM processors need page tables for this mapping, MIPS processors do not) */
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. For example, i386
	 * uses it to address memory beyond 900MB through special page
	 * table entries set up for each page the kernel needs to access.
	 */
	ZONE_HIGHMEM,	/* highmem zone: 64-bit systems have a huge kernel virtual address space and no longer need it */
#endif
	ZONE_MOVABLE,	/* movable zone: a pseudo memory zone used to limit memory fragmentation */
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,	/* device zone: supports persistent memory (memory added by hot-plugging) */
#endif
	__MAX_NR_ZONES
};
```
Each memory zone is described by a zone structure; the corresponding kernel source:

```c
struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long _watermark[NR_WMARK];	/* watermarks used by the page allocator */
	unsigned long watermark_boost;

	unsigned long nr_reserved_highatomic;
	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable, so to avoid totally wasting several GB of RAM we must
	 * reserve some of the lower zone memory (otherwise we risk OOM on
	 * the lower zones despite tons of freeable RAM in the higher zones).
	 * This array is recalculated at runtime if the
	 * sysctl_lowmem_reserve_ratio sysctl changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];	/* pages this zone reserves and does not lend to higher zone types */
#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data *zone_pgdat;	/* the pglist_data instance of the owning memory node */
	struct per_cpu_pageset __percpu *pageset;	/* per-CPU page sets */
#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section.
	 */
	unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long zone_start_pfn;
	/*
	 * spanned_pages = zone_end_pfn - zone_start_pfn;
	 * present_pages = spanned_pages - absent_pages(pages in holes);
	 * managed_pages = present_pages - reserved_pages;
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock,
	 * a seqlock because it is read outside of zone->lock in the main
	 * allocator path but written quite infrequently. Write access to
	 * present_pages at runtime must be protected by
	 * mem_hotplug_begin/end().
	 */
	atomic_long_t managed_pages;	/* physical pages managed by the buddy allocator */
	unsigned long spanned_pages;	/* total pages spanned by the zone, including holes */
	unsigned long present_pages;	/* physical pages existing within the zone, excluding holes */

	const char *name;		/* zone name */
#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblocks. Used to solve an incorrect
	 * freepage count caused by racy retrieval of a pageblock's
	 * migratetype. Protected by zone->lock.
	 */
	unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t span_seqlock;
#endif
	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)
	struct free_area free_area[MAX_ORDER];	/* free areas of different orders */
	unsigned long flags;			/* zone flags, see below */
	spinlock_t lock;			/* primarily protects free_area */

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)
	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached.
	 */
	unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long compact_cached_free_pfn;
	/* pfn where async and sync compaction migration scanner should start */
	unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
	int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool compact_blockskip_flush;
#endif
	bool contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
```
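The relationship between spanned_pages, present_pages, and managed_pages can be observed directly. Here is a minimal kernel-module sketch, again assuming a kernel of the same era as the structure above, that prints every populated zone of node 0 using only fields shown there.

```c
/* Minimal kernel-module sketch: print the populated zones of node 0. */
#include <linux/module.h>
#include <linux/mmzone.h>

static int __init zone_demo_init(void)
{
	pg_data_t *pgdat = NODE_DATA(0);
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = &pgdat->node_zones[i];

		if (!populated_zone(zone))	/* skip zones with no pages */
			continue;
		pr_info("zone %-8s start_pfn=%lu spanned=%lu present=%lu managed=%ld\n",
			zone->name, zone->zone_start_pfn,
			zone->spanned_pages, zone->present_pages,
			atomic_long_read(&zone->managed_pages));
	}
	return 0;
}

static void __exit zone_demo_exit(void)
{
}

module_init(zone_demo_init);
module_exit(zone_demo_exit);
MODULE_LICENSE("GPL");
```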
3. Physical Pages
Each physical page is described by a page structure, called the page descriptor. The member node_mem_map of a node's pglist_data instance points to the array of page descriptors for all physical pages contained in that node. The kernel function page_to_nid returns the number of the memory node a physical page belongs to:

```c
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
	struct page *p = (struct page *)page;

	return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
```

page_zonenum returns the type of the memory zone a physical page belongs to:

```c
static inline enum zone_type page_zonenum(const struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
```
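These two helpers combine naturally with pfn_to_page(). The sketch below is a hedged kernel-module fragment, with pfn_valid() guarding against holes; it maps the first PFN of node 0 to its page descriptor and reports the owning node and zone type.

```c
/* Minimal kernel-module sketch: PFN -> page descriptor -> node/zone. */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/mmzone.h>

static int __init page_demo_init(void)
{
	unsigned long pfn = NODE_DATA(0)->node_start_pfn;	/* first PFN of node 0 */

	if (pfn_valid(pfn)) {				/* the PFN could fall in a hole */
		struct page *page = pfn_to_page(pfn);	/* PFN -> page descriptor */

		pr_info("pfn %lu: node %d, zone type %d\n",
			pfn, page_to_nid(page), page_zonenum(page));
	}
	return 0;
}

static void __exit page_demo_exit(void)
{
}

module_init(page_demo_init);
module_exit(page_demo_exit);
MODULE_LICENSE("GPL");
```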
III. The Bootmem/Memblock Allocators
- During Linux kernel initialization, memory must be allocated before the page allocator and the block (slab) allocator exist, so the kernel provides a temporary boot memory allocator. Once those allocators have been initialized, the free physical pages are handed over to the page allocator and the boot allocator is discarded.

```c
/*
 * The member node_bootmem_map of the structure below points to a
 * bitmap in which each physical page corresponds to one bit; the
 * bit is set to 1 when the physical page has been allocated.
 */
struct bootmem_data;
```

- Older kernel versions used the bootmem_data structure; newer kernels keep only the memblock allocator, described by the memblock structure.
```c
/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
struct memblock_type {
	unsigned long cnt;			/* number of regions */
	unsigned long max;			/* size of the allocated regions array */
	phys_addr_t total_size;			/* total size of all regions */
	struct memblock_region *regions;	/* array of memblock regions */
	char *name;				/* symbolic name of this memory type */
};
```
```c
/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 * @physmem: all physical memory
 */
struct memblock {
	bool bottom_up;			/* allocation direction: true = upward from low addresses, false = downward from high addresses */
	phys_addr_t current_limit;	/* maximum physical address that may be allocated */
	struct memblock_type memory;	/* memory type (both allocated and free memory) */
	struct memblock_type reserved;	/* reserved type */
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	struct memblock_type physmem;	/* physical memory type */
#endif
};
```
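To make the roles of memory and reserved concrete, here is a hedged sketch of typical early-boot interaction with memblock. The calls memblock_add(), memblock_reserve(), and memblock_phys_alloc() are kernel APIs of this era, but the addresses and the function name memblock_usage_sketch are made up for illustration; such code runs from architecture setup code before the page allocator exists, not from a loadable module.

```c
/* Illustrative early-boot memblock usage (addresses are made up). */
#include <linux/memblock.h>
#include <linux/sizes.h>

void __init memblock_usage_sketch(void)
{
	phys_addr_t pa;

	/* Report a usable DRAM range: recorded in memblock.memory */
	memblock_add(0x40000000, SZ_1G);

	/* Mark a firmware area as occupied: recorded in memblock.reserved */
	memblock_reserve(0x40000000, SZ_1M);

	/* Allocate boot memory from free space (memory minus reserved) */
	pa = memblock_phys_alloc(SZ_64K, SZ_4K);
	if (!pa)
		pr_err("memblock allocation failed\n");
}
```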
The difference between the physical memory type and the memory type: the memory type is a subset of the physical memory type. When booting the kernel, a kernel parameter can limit the amount of usable memory, so the memory type holds only the usable memory ranges, while the physical memory type always contains all physical memory ranges. Region attributes are defined by memblock_flags:

```c
/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: hotpluggable region
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping
 */
enum memblock_flags {
	MEMBLOCK_NONE		= 0x0,	/* region with no special requirements */
	MEMBLOCK_HOTPLUG	= 0x1,	/* hot-pluggable region */
	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
	MEMBLOCK_NOMAP		= 0x4,	/* not added to the kernel direct mapping (the linear mapping) */
};
```
A memblock region is described by the following data structure:

```c
/**
 * struct memblock_region - represents a memory region
 * @base: physical address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
struct memblock_region {
	phys_addr_t base;		/* starting physical address */
	phys_addr_t size;		/* length */
	enum memblock_flags flags;	/* attribute flags */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	int nid;			/* node identifier */
#endif
};
```
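A hedged sketch of walking the region array: on kernels of this era the for_each_memblock() macro iterates over every region of a given memblock type (later kernels renamed it to for_each_mem_region()).

```c
/* Sketch: dump every region recorded in memblock.memory. */
#include <linux/memblock.h>

static void __init dump_memory_regions(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg)	/* each region of memblock.memory */
		pr_info("region: base=%pa size=%pa flags=%#lx\n",
			&reg->base, &reg->size, (unsigned long)reg->flags);
}
```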
The ARM64 kernel initializes the memblock allocator in two steps:
a. Parse the /memory node of the flattened device tree and add all physical memory ranges to memblock.memory.
b. Call the kernel function arm64_memblock_init to initialize memblock. The source analysis follows:

```c
void __init arm64_memblock_init(void)
{
	const s64 linear_region_size = -(s64)PAGE_OFFSET;

	/* Handle linux,usable-memory-range property */
	fdt_enforce_memory_region();

	/* Remove memory above our supported physical address size */
	memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

	/*
	 * Ensure that the linear region takes up exactly half of the kernel
	 * virtual address space. This way, we can distinguish a linear address
	 * from a kernel/module/vmalloc address by testing a single bit.
	 */
	BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));

	/* Select a suitable value for the base of physical memory. */
	memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

	/*
	 * Remove the memory that we will not be able to cover with the
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
	memblock_remove(max_t(u64, memstart_addr + linear_region_size,
			__pa_symbol(_end)), ULLONG_MAX);
	if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
		/* ensure that memstart_addr remains sufficiently aligned */
		memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
					 ARM64_MEMSTART_ALIGN);
		memblock_remove(0, memstart_addr);
	}

	/*
	 * Apply the memory limit if it was set. Since the kernel may be loaded
	 * high up in memory, add back the kernel region that must be accessible
	 * via the linear mapping.
	 */
	if (memory_limit != PHYS_ADDR_MAX) {
		memblock_mem_limit_remove_map(memory_limit);
		memblock_add(__pa_symbol(_text), (u64)(_end - _text));
	}

	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/*
		 * Add back the memory we just removed if it results in the
		 * initrd to become inaccessible via the linear mapping.
		 * Otherwise, this is a no-op
		 */
		u64 base = phys_initrd_start & PAGE_MASK;
		u64 size = PAGE_ALIGN(phys_initrd_size);

		/*
		 * We can only add back the initrd memory if we don't end up
		 * with more memory than we can address via the linear mapping.
		 * It is up to the bootloader to position the kernel and the
		 * initrd reasonably close to each other (i.e., within 32 GB of
		 * each other) so that all granule/#levels combinations can
		 * always access both.
		 */
		if (WARN(base < memblock_start_of_DRAM() ||
			 base + size > memblock_start_of_DRAM() +
				       linear_region_size,
			"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
			initrd_start = 0;
		} else {
			memblock_remove(base, size); /* clear MEMBLOCK_ flags */
			memblock_add(base, size);
			memblock_reserve(base, size);
		}
	}

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		extern u16 memstart_offset_seed;
		u64 range = linear_region_size -
			    (memblock_end_of_DRAM() - memblock_start_of_DRAM());

		/*
		 * If the size of the linear region exceeds, by a sufficient
		 * margin, the size of the region that the available physical
		 * memory spans, randomize the linear region as well.
		 */
		if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
			range /= ARM64_MEMSTART_ALIGN;
			memstart_addr -= ARM64_MEMSTART_ALIGN *
					 ((range * memstart_offset_seed) >> 16);
		}
	}

	/*
	 * Register the kernel text, kernel data, initrd, and initial
	 * pagetables with memblock.
	 */
	memblock_reserve(__pa_symbol(_text), _end - _text);
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
		/* the generic initrd code expects virtual addresses */
		initrd_start = __phys_to_virt(phys_initrd_start);
		initrd_end = initrd_start + phys_initrd_size;
	}

	early_init_fdt_scan_reserved_mem();

	/* 4GB maximum for 32-bit only capable devices */
	if (IS_ENABLED(CONFIG_ZONE_DMA32))
		arm64_dma_phys_limit = max_zone_dma_phys();
	else
		arm64_dma_phys_limit = PHYS_MASK + 1;

	reserve_crashkernel();

	reserve_elfcorehdr();

	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

	dma_contiguous_reserve(arm64_dma_phys_limit);
}
```
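After boot, the memblock state that this function produced can be inspected from user space when the kernel retains it. This is a minimal sketch assuming debugfs is mounted at /sys/kernel/debug and the kernel keeps its memblock data (e.g. CONFIG_ARCH_KEEP_MEMBLOCK); otherwise these files are absent.

```c
/* Minimal sketch: dump the memblock state kept after boot. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one "N: 0xBASE..0xEND" line per region */
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/debug/memblock/memory");
	dump("/sys/kernel/debug/memblock/reserved");
	return 0;
}
```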