From d903746887cb2b2bec0f9db9ff0eb20f580dfd89 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 16 Mar 2026 20:11:17 +0100 Subject: [PATCH 1/2] Add support for 16k pages With 16384 pages: | size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | | - | - | - | - | - | | 16 | 93.75% | 256 | 16384 | 75.0% | | 32 | 46.88% | 256 | 16384 | 50.0% | | 48 | 31.25% | 256 | 16384 | 25.0% | | 64 | 23.44% | 256 | 16384 | 0.0% | | 80 | 18.75% | 204 | 16384 | 0.390625% | | 96 | 15.62% | 170 | 16384 | 0.390625% | | 112 | 13.39% | 146 | 16384 | 0.1953125% | | 128 | 11.72% | 256 | 32768 | 0.0% | | 160 | 19.38% | 204 | 32768 | 0.390625% | | 192 | 16.15% | 256 | 49152 | 0.0% | | 224 | 13.84% | 219 | 49152 | 0.1953125% | | 256 | 12.11% | 256 | 65536 | 0.0% | | 320 | 19.69% | 256 | 81920 | 0.0% | | 384 | 16.41% | 256 | 98304 | 0.0% | | 448 | 14.06% | 256 | 114688 | 0.0% | | 512 | 12.3% | 256 | 131072 | 0.0% | | 640 | 19.84% | 256 | 163840 | 0.0% | | 768 | 16.54% | 256 | 196608 | 0.0% | | 896 | 14.17% | 256 | 229376 | 0.0% | | 1024 | 12.4% | 256 | 262144 | 0.0% | | 1280 | 19.92% | 192 | 245760 | 0.0% | | 1536 | 16.6% | 160 | 245760 | 0.0% | | 1792 | 14.23% | 128 | 229376 | 0.0% | | 2048 | 12.45% | 128 | 262144 | 0.0% | | 2560 | 19.96% | 96 | 245760 | 0.0% | | 3072 | 16.63% | 80 | 245760 | 0.0% | | 3584 | 14.26% | 64 | 229376 | 0.0% | | 4096 | 12.48% | 64 | 262144 | 0.0% | | 5120 | 19.98% | 48 | 245760 | 0.0% | | 6144 | 16.65% | 40 | 245760 | 0.0% | | 7168 | 14.27% | 32 | 229376 | 0.0% | | 8192 | 12.49% | 32 | 262144 | 0.0% | | 10240 | 19.99% | 24 | 245760 | 0.0% | | 12288 | 16.66% | 20 | 245760 | 0.0% | | 14336 | 14.28% | 16 | 229376 | 0.0% | | 16384 | 12.49% | 16 | 262144 | 0.0% | | 20480 | 20.0% | 12 | 245760 | 0.0% | | 24576 | 16.66% | 10 | 245760 | 0.0% | | 28672 | 14.28% | 8 | 229376 | 0.0% | | 32768 | 12.5% | 8 | 262144 | 0.0% | | 40960 | 20.0% | 6 | 245760 | 0.0% | | 49152 | 16.66% | 5 | 245760 | 0.0% | | 57344 | 14.28% | 4 | 
229376 | 0.0% | | 65536 | 12.5% | 4 | 262144 | 0.0% | | 81920 | 20.0% | 3 | 245760 | 0.0% | | 98304 | 16.67% | 2 | 196608 | 0.0% | | 114688 | 14.28% | 2 | 229376 | 0.0% | | 131072 | 12.5% | 2 | 262144 | 0.0% | maximum bitmap size is 256-bit maximum page span size is 16 (262144) with 4k to compare: | size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | | - | - | - | - | - | | 16 | 93.75% | 256 | 4096 | 0.0% | | 32 | 46.88% | 128 | 4096 | 0.0% | | 48 | 31.25% | 85 | 4096 | 0.390625% | | 64 | 23.44% | 64 | 4096 | 0.0% | | 80 | 18.75% | 51 | 4096 | 0.390625% | | 96 | 15.62% | 42 | 4096 | 1.5625% | | 112 | 13.39% | 36 | 4096 | 1.5625% | | 128 | 11.72% | 64 | 8192 | 0.0% | | 160 | 19.38% | 51 | 8192 | 0.390625% | | 192 | 16.15% | 64 | 12288 | 0.0% | | 224 | 13.84% | 54 | 12288 | 1.5625% | | 256 | 12.11% | 64 | 16384 | 0.0% | | 320 | 19.69% | 64 | 20480 | 0.0% | | 384 | 16.41% | 64 | 24576 | 0.0% | | 448 | 14.06% | 64 | 28672 | 0.0% | | 512 | 12.3% | 64 | 32768 | 0.0% | | 640 | 19.84% | 64 | 40960 | 0.0% | | 768 | 16.54% | 64 | 49152 | 0.0% | | 896 | 14.17% | 64 | 57344 | 0.0% | | 1024 | 12.4% | 64 | 65536 | 0.0% | | 1280 | 19.92% | 16 | 20480 | 0.0% | | 1536 | 16.6% | 16 | 24576 | 0.0% | | 1792 | 14.23% | 16 | 28672 | 0.0% | | 2048 | 12.45% | 16 | 32768 | 0.0% | | 2560 | 19.96% | 8 | 20480 | 0.0% | | 3072 | 16.63% | 8 | 24576 | 0.0% | | 3584 | 14.26% | 8 | 28672 | 0.0% | | 4096 | 12.48% | 8 | 32768 | 0.0% | | 5120 | 19.98% | 8 | 40960 | 0.0% | | 6144 | 16.65% | 8 | 49152 | 0.0% | | 7168 | 14.27% | 8 | 57344 | 0.0% | | 8192 | 12.49% | 8 | 65536 | 0.0% | | 10240 | 19.99% | 6 | 61440 | 0.0% | | 12288 | 16.66% | 5 | 61440 | 0.0% | | 14336 | 14.28% | 4 | 57344 | 0.0% | | 16384 | 12.49% | 4 | 65536 | 0.0% | | 20480 | 20.0% | 2 | 40960 | 0.0% | | 24576 | 16.66% | 2 | 49152 | 0.0% | | 28672 | 14.28% | 2 | 57344 | 0.0% | | 32768 | 12.5% | 2 | 65536 | 0.0% | | 40960 | 20.0% | 1 | 40960 | 0.0% | | 49152 | 16.66% | 1 | 49152 | 
0.0% | | 57344 | 14.28% | 1 | 57344 | 0.0% | | 65536 | 12.5% | 1 | 65536 | 0.0% | | 81920 | 20.0% | 1 | 81920 | 0.0% | | 98304 | 16.67% | 1 | 98304 | 0.0% | | 114688 | 14.28% | 1 | 114688 | 0.0% | | 131072 | 12.5% | 1 | 131072 | 0.0% | maximum bitmap size is 256-bit maximum page span size is 16 (65536) --- Android.bp | 1 + Makefile | 7 ++++- README.md | 31 +++++++++++++-------- calculate-waste | 71 +++++++++++++++++++++++++++++++++++++---------- config/default.mk | 1 + config/light.mk | 1 + h_malloc.c | 20 ++++++++++++- pages.h | 7 +++++ 8 files changed, 111 insertions(+), 28 deletions(-) diff --git a/Android.bp b/Android.bp index f6a7a9c0..b2872033 100644 --- a/Android.bp +++ b/Android.bp @@ -28,6 +28,7 @@ common_cflags = [ "-DN_ARENA=1", "-DCONFIG_STATS=true", "-DCONFIG_SELF_INIT=false", + "-DCONFIG_PAGE_SIZE=4096", ] cc_defaults { diff --git a/Makefile b/Makefile index f33f88ea..d238857d 100644 --- a/Makefile +++ b/Makefile @@ -53,6 +53,10 @@ endif OBJECTS := $(addprefix $(OUT)/,$(OBJECTS)) +ifeq (,$(filter $(CONFIG_PAGE_SIZE),4096 16384)) + $(error CONFIG_PAGE_SIZE must be 4096 or 16384) +endif + ifeq (,$(filter $(CONFIG_SEAL_METADATA),true false)) $(error CONFIG_SEAL_METADATA must be true or false) endif @@ -108,7 +112,8 @@ CPPFLAGS += \ -DCONFIG_CLASS_REGION_SIZE=$(CONFIG_CLASS_REGION_SIZE) \ -DN_ARENA=$(CONFIG_N_ARENA) \ -DCONFIG_STATS=$(CONFIG_STATS) \ - -DCONFIG_SELF_INIT=$(CONFIG_SELF_INIT) + -DCONFIG_SELF_INIT=$(CONFIG_SELF_INIT) \ + -DCONFIG_PAGE_SIZE=$(CONFIG_PAGE_SIZE) $(OUT)/libhardened_malloc$(SUFFIX).so: $(OBJECTS) | $(OUT) $(CC) $(CFLAGS) $(LDFLAGS) -shared $^ $(LDLIBS) -o $@ diff --git a/README.md b/README.md index 089dd1ab..c4610d1b 100644 --- a/README.md +++ b/README.md @@ -180,10 +180,9 @@ large number of guard pages created by hardened\_malloc. As an example, in This is unnecessary if you set `CONFIG_GUARD_SLABS_INTERVAL` to a very large value in the build configuration. 
-On arm64, make sure your kernel is configured to use 4k pages since we haven't -yet added support for 16k and 64k pages. The kernel also has to be configured -to use 4 level page tables for the full 48 bit address space instead of only -having a 39 bit address space for the default hardened\_malloc configuration. +On arm64, the kernel also has to be configured to use 4 level page tables for +the full 48 bit address space instead of only having a 39 bit address space +for the default hardened\_malloc configuration. It's possible to reduce the class region size substantially to make a 39 bit address space workable but the defaults won't work. @@ -334,6 +333,14 @@ The following integer configuration options are available: granularity. See the [section on size classes](#size-classes) below for details. +* `CONFIG_PAGE_SIZE`: `4096` (default) to set the page size used by the + allocator. Supported values are `4096` and `16384`. This must match the page + size of the kernel the library will run on. On arm64, kernels may be + configured for 4k or 16k pages. The allocator verifies at runtime that the + compile-time page size matches the kernel page size and will abort if they + differ. The slab slot counts are tuned per page size to minimize internal + fragmentation for slabs. + There will be more control over enabled features in the future along with control over fairly arbitrarily chosen values like the size of empty slab caches (making them smaller improves security and reduces memory usage while @@ -537,11 +544,11 @@ classes for each doubling in size. The slot counts tied to the size classes are specific to this allocator rather than being taken from jemalloc. Slabs are always a span of pages so the slot -count needs to be tuned to minimize waste due to rounding to the page size. For -now, this allocator is set up only for 4096 byte pages as a small page size is -desirable for finer-grained memory protection and randomization. 
It could be -ported to larger page sizes in the future. The current slot counts are only a -preliminary set of values. +count needs to be tuned to minimize waste due to rounding to the page size. +Tuned slot counts are provided for 4096 and 16384 byte page sizes, selectable +via `CONFIG_PAGE_SIZE`. A smaller page size is desirable for +finer-grained memory protection and randomization. The tables below show the +default slot counts for 4096 byte pages. | size class | worst case internal fragmentation | slab slots | slab size | internal fragmentation for slabs | | - | - | - | - | - | @@ -584,7 +591,7 @@ preliminary set of values. The slab allocation size classes end at 16384 since that's the final size for 2048 byte spacing and the next spacing class matches the page size of 4096 -bytes on the target platforms. This is the minimum set of small size classes +bytes when using 4k pages. This is the minimum set of small size classes required to avoid substantial waste from rounding. The `CONFIG_EXTENDED_SIZE_CLASSES` option extends the size classes up to @@ -620,8 +627,8 @@ the same size class scheme providing 4 size classes for every doubling of size. It increases virtual memory consumption but drastically improves performance where realloc is used without proper growth factors, which is fairly common and destroys performance in some commonly used programs. If large size classes are -disabled, the granularity is instead the page size, which is currently always -4096 bytes on supported platforms. +disabled, the granularity is instead the page size (4096 or 16384 bytes +depending on `CONFIG_PAGE_SIZE`). 
## Scalability diff --git a/calculate-waste b/calculate-waste index ca26d9a5..bedb6465 100755 --- a/calculate-waste +++ b/calculate-waste @@ -1,6 +1,11 @@ #!/usr/bin/env python3 -from sys import argv +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--pagesize", default=4096, type=int) +args = parser.parse_args() +page_size = args.pagesize size_classes = [ 16, 32, 48, 64, 80, 96, 112, 128, @@ -16,7 +21,7 @@ size_classes = [ 81920, 98304, 114688, 131072, ] -size_class_slots = [ +size_class_slots_4k = [ 256, 128, 85, 64, 51, 42, 36, 64, 51, 64, 54, 64, 64, 64, 64, 64, @@ -30,29 +35,67 @@ size_class_slots = [ 1, 1, 1, 1, ] +size_class_slots_16k = [ + 256, 256, 256, 256, + 204, 170, 146, 256, + 204, 256, 219, 256, + 256, 256, 256, 256, + 256, 256, 256, 256, + 192, 160, 128, 128, + 96, 80, 64, 64, + 48, 40, 32, 32, + 24, 20, 16, 16, + 12, 10, 8, 8, + 6, 5, 4, 4, + 3, 2, 2, 2, +] + +if page_size == 16384: + size_class_slots = size_class_slots_16k +else: + size_class_slots = size_class_slots_4k + fragmentation = [100 - 1 / 16 * 100] for i in range(len(size_classes) - 1): size_class = size_classes[i + 1] worst_case = size_classes[i] + 1 used = worst_case / size_class - fragmentation.append(100 - used * 100); + fragmentation.append(100 - used * 100) + def page_align(size): - return (size + 4095) & ~4095 + mask = page_size - 1 + return (size + mask) & ~mask + +print(f"Page size: {page_size}") +print() +print("| ", end="") +print( + "size class", + "worst case internal fragmentation", + "slab slots", + "slab size", + "internal fragmentation for slabs", + sep=" | ", + end=" |\n", +) print("| ", end="") -print("size class", "worst case internal fragmentation", "slab slots", "slab size", "internal fragmentation for slabs", sep=" | ", end=" |\n") -print("| ", end='') print("-", "-", "-", "-", "-", sep=" | ", end=" |\n") for size, slots, fragmentation in zip(size_classes, size_class_slots, fragmentation): used = size * slots real = page_align(used) - 
print("| ", end='') - print(size, f"{fragmentation:.4}%", slots, real, str(100 - used / real * 100) + "%", sep=" | ", end=" |\n") - -if len(argv) < 2: - exit() + print("| ", end="") + print( + size, + f"{fragmentation:.4}%", + slots, + real, + str(100 - used / real * 100) + "%", + sep=" | ", + end=" |\n", + ) max_bits = 256 max_page_span = 16 @@ -60,16 +103,16 @@ max_page_span = 16 print() print("maximum bitmap size is {}-bit".format(max_bits)) -print("maximum page span size is {} ({})".format(max_page_span, max_page_span * 4096)) +print( "maximum page span size is {} ({})".format(max_page_span, max_page_span * page_size)) for size_class in size_classes: choices = [] for bits in range(1, max_bits + 1): used = size_class * bits real = page_align(used) - if real > 65536: + if real > max_page_span * page_size: continue - pages = real / 4096 + pages = real / page_size efficiency = used / real * 100 choices.append((bits, used, real, pages, efficiency)) diff --git a/config/default.mk b/config/default.mk index 71b1cc42..384bea3d 100644 --- a/config/default.mk +++ b/config/default.mk @@ -21,3 +21,4 @@ CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB CONFIG_N_ARENA := 4 CONFIG_STATS := false CONFIG_SELF_INIT := true +CONFIG_PAGE_SIZE := 4096 diff --git a/config/light.mk b/config/light.mk index 88a0e1f5..7bf113d9 100644 --- a/config/light.mk +++ b/config/light.mk @@ -21,3 +21,4 @@ CONFIG_CLASS_REGION_SIZE := 34359738368 # 32GiB CONFIG_N_ARENA := 4 CONFIG_STATS := false CONFIG_SELF_INIT := true +CONFIG_PAGE_SIZE := 4096 diff --git a/h_malloc.c b/h_malloc.c index 4579ca81..704ce6ce 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -178,6 +178,22 @@ static const u32 size_classes[] = { }; static const u16 size_class_slots[] = { +#if CONFIG_PAGE_SIZE == 16384 + /* 0 */ 256, + /* 16 */ 256, 256, 256, 256, 204, 170, 146, 256, + /* 32 */ 204, 256, 219, 256, + /* 64 */ 256, 256, 256, 256, + /* 128 */ 256, 256, 256, 256, + /* 256 */ 192, 160, 128, 128, + /* 512 */ 96, 80, 64, 64, + /* 1024 
*/ 48, 40, 32, 32, + /* 2048 */ 24, 20, 16, 16, +#if CONFIG_EXTENDED_SIZE_CLASSES + /* 4096 */ 12, 10, 8, 8, + /* 8192 */ 6, 5, 4, 4, + /* 16384 */ 3, 2, 2, 2, +#endif +#else /* 4k pages */ /* 0 */ 256, /* 16 */ 256, 128, 85, 64, 51, 42, 36, 64, /* 32 */ 51, 64, 54, 64, @@ -192,6 +208,7 @@ static const u16 size_class_slots[] = { /* 8192 */ 1, 1, 1, 1, /* 16384 */ 1, 1, 1, 1, #endif +#endif }; static size_t get_slots(unsigned class) { @@ -321,7 +338,8 @@ struct __attribute__((aligned(CACHELINE_SIZE))) size_class { #define REAL_CLASS_REGION_SIZE (CLASS_REGION_SIZE * 2) #define ARENA_SIZE (REAL_CLASS_REGION_SIZE * N_SIZE_CLASSES) static const size_t slab_region_size = ARENA_SIZE * N_ARENA; -static_assert(PAGE_SIZE == 4096, "bitmap handling will need adjustment for other page sizes"); +static_assert(PAGE_SIZE == 4096 || PAGE_SIZE == 16384, + "page size must be 4096 or 16384"); static void *get_slab(const struct size_class *c, size_t slab_size, const struct slab_metadata *metadata) { size_t index = metadata - c->slab_info; diff --git a/pages.h b/pages.h index 8795ddc9..cfc86579 100644 --- a/pages.h +++ b/pages.h @@ -7,7 +7,14 @@ #include "util.h" +#ifndef PAGE_SHIFT +#if CONFIG_PAGE_SIZE == 16384 +#define PAGE_SHIFT 14 +#else #define PAGE_SHIFT 12 +#endif +#endif + #ifndef PAGE_SIZE #define PAGE_SIZE ((size_t)1 << PAGE_SHIFT) #endif From 51f86f71ce52926749f3c5297841a363b2df81cc Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 23 Mar 2026 13:29:01 +0100 Subject: [PATCH 2/2] Increase the size of the bitmaps for 16k The for loop can be unrolled or made to use intrinsics later. 
--- calculate-waste | 4 ++-- h_malloc.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/calculate-waste b/calculate-waste index bedb6465..b1309df3 100755 --- a/calculate-waste +++ b/calculate-waste @@ -97,13 +97,13 @@ for size, slots, fragmentation in zip(size_classes, size_class_slots, fragmentat end=" |\n", ) -max_bits = 256 +max_bits = 512 max_page_span = 16 print() print("maximum bitmap size is {}-bit".format(max_bits)) -print( "maximum page span size is {} ({})".format(max_page_span, max_page_span * page_size)) +print("maximum page span size is {} ({})".format(max_page_span, max_page_span * page_size)) for size_class in size_classes: choices = [] diff --git a/h_malloc.c b/h_malloc.c index 704ce6ce..79adbe6a 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -115,7 +115,11 @@ static bool memory_map_fixed_tagged(void *ptr, size_t size) { #define SLAB_METADATA_COUNT struct slab_metadata { +#if CONFIG_PAGE_SIZE == 16384 + u64 bitmap[8]; +#else u64 bitmap[4]; +#endif struct slab_metadata *next; struct slab_metadata *prev; #if SLAB_CANARY @@ -125,8 +129,12 @@ struct slab_metadata { u16 count; #endif #if SLAB_QUARANTINE +#if CONFIG_PAGE_SIZE == 16384 + u64 quarantine_bitmap[8]; +#else u64 quarantine_bitmap[4]; -#endif +#endif /* CONFIG_PAGE_SIZE */ + #endif #ifdef HAS_ARM_MTE // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) // @@ -467,20 +475,14 @@ static bool has_free_slots(size_t slots, const struct slab_metadata *metadata) { #ifdef SLAB_METADATA_COUNT return metadata->count < slots; #else - if (slots <= U64_WIDTH) { - u64 masked = metadata->bitmap[0] | get_mask(slots); - return masked != ~0UL; - } - if (slots <= U64_WIDTH * 2) { - u64 masked = metadata->bitmap[1] | get_mask(slots - U64_WIDTH); - return metadata->bitmap[0] != ~0UL || masked != ~0UL; - } - if (slots <= U64_WIDTH * 3) { - u64 masked = metadata->bitmap[2] | get_mask(slots - U64_WIDTH * 2); - return metadata->bitmap[0] != ~0UL || 
metadata->bitmap[1] != ~0UL || masked != ~0UL; + size_t last = (slots - 1) / U64_WIDTH; + for (size_t i = 0; i < last; i++) { + if (metadata->bitmap[i] != ~0UL) { + return true; + } } - u64 masked = metadata->bitmap[3] | get_mask(slots - U64_WIDTH * 3); - return metadata->bitmap[0] != ~0UL || metadata->bitmap[1] != ~0UL || metadata->bitmap[2] != ~0UL || masked != ~0UL; + u64 masked = metadata->bitmap[last] | get_mask(slots - last * U64_WIDTH); + return masked != ~0UL; #endif } @@ -489,7 +491,12 @@ static bool is_free_slab(const struct slab_metadata *metadata) { return !metadata->count; #else return !metadata->bitmap[0] && !metadata->bitmap[1] && !metadata->bitmap[2] && - !metadata->bitmap[3]; + !metadata->bitmap[3] +#if CONFIG_PAGE_SIZE == 16384 + && !metadata->bitmap[4] && !metadata->bitmap[5] && !metadata->bitmap[6] + && !metadata->bitmap[7] +#endif + ; #endif }