diff --git a/fearless_simd/examples/srgb.rs b/fearless_simd/examples/srgb.rs
index ca40533e1..c54c957c0 100644
--- a/fearless_simd/examples/srgb.rs
+++ b/fearless_simd/examples/srgb.rs
@@ -8,41 +8,41 @@
 
 use fearless_simd::{Level, dispatch, f32x4, prelude::*};
 
-// This block shows how to use safe wrappers for compile-time enforcement
-// of using valid SIMD intrinsics.
-#[cfg(feature = "safe_wrappers")]
-#[inline(always)]
-fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    // #[cfg(target_arch = "x86_64")]
-    // if let Some(avx2) = a.simd.level().as_avx2() {
-    //     return avx2
-    //         .sse4_1
-    //         ._mm_blend_ps::<8>(a.into(), b.into())
-    //         .simd_into(a.simd);
-    // }
-    #[cfg(target_arch = "aarch64")]
-    if let Some(neon) = a.simd.level().as_neon() {
-        return neon
-            .neon
-            .vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-            .simd_into(a.simd);
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__m128, _mm_blend_ps};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__m128, _mm_blend_ps};
+
+#[cfg(target_arch = "aarch64")]
+fearless_simd::neon_kernel! {
+    #[inline]
+    fn copy_alpha_neon(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+        vcopyq_laneq_f32::<3, 3>(a, b)
+    }
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fearless_simd::sse4_2_kernel! {
+    #[inline]
+    fn copy_alpha_sse4_2(a: __m128, b: __m128) -> __m128 {
+        _mm_blend_ps::<8>(a, b)
     }
-    let mut result = a;
-    result[3] = b[3];
-    result
 }
 
-// This block lets the example compile without safe wrappers.
-#[cfg(not(feature = "safe_wrappers"))]
 #[inline(always)]
 fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if let Some(sse4_2) = a.simd.level().as_sse4_2() {
+        return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd);
+    }
+
     #[cfg(target_arch = "aarch64")]
-    if let Some(_neon) = a.simd.level().as_neon() {
-        unsafe {
-            return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-                .simd_into(a.simd);
-        }
+    if let Some(neon) = a.simd.level().as_neon() {
+        return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd);
     }
+
     let mut result = a;
     result[3] = b[3];
     result
diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 9d342539a..00f702ab5 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -47,6 +47,7 @@
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod avx2;
 mod fallback;
+mod kernel_macros;
 #[cfg(target_arch = "aarch64")]
 mod neon;
 mod ops;
diff --git a/fearless_simd/src/generated/kernel_macros.rs b/fearless_simd/src/generated/kernel_macros.rs
new file mode 100644
index 000000000..a8331e1a2
--- /dev/null
+++ b/fearless_simd/src/generated/kernel_macros.rs
@@ -0,0 +1,340 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This file is autogenerated by fearless_simd_gen
+
+#[cfg(target_arch = "aarch64")]
+#[doc = "Creates a context where you can safely call intrinsics"]
+#[doc = "available at the [`Neon`](crate::Neon) SIMD level."]
+#[doc = ""]
+#[doc = "This is useful if the portable abstractions are not enough, and you need to"]
+#[doc = "use platform-specific intrinsics for parts of the computation."]
+#[doc = ""]
+#[doc = "See [`Neon`](crate::Neon) for the target features represented by this SIMD level."]
+#[doc = ""]
+#[doc = "The generated wrapper takes a SIMD token ([`Neon`](crate::Neon)) as its first argument."]
+#[doc = "The macro runs your body inside an inner function annotated with the appropriate"]
+#[doc = "`#[target_feature]` attributes. That makes platform-specific intrinsics from `core::arch` or"]
+#[doc = "`std::arch` safe to call in the body, as long as they do not have safety"]
+#[doc = "requirements beyond those target features."]
+#[doc = ""]
+#[doc = "## Example"]
+#[doc = ""]
+#[doc = "```rust"]
+#[doc = "# #[allow(unused_imports)]"]
+#[doc = "use fearless_simd::{f32x4, prelude::*};"]
+#[doc = "#[cfg(target_arch = \"aarch64\")]"]
+#[doc = "use std::arch::aarch64::{float32x4_t, vaddq_f32};"]
+#[doc = ""]
+#[doc = "#[cfg(target_arch = \"aarch64\")]"]
+#[doc = "fearless_simd::neon_kernel! {"]
+#[doc = "    fn add_f32x4(a: float32x4_t, b: float32x4_t) -> float32x4_t {"]
+#[doc = "        vaddq_f32(a, b)"]
+#[doc = "    }"]
+#[doc = "}"]
+#[doc = ""]
+#[doc = "# fn main() {"]
+#[doc = "#[cfg(target_arch = \"aarch64\")]"]
+#[doc = "if let Some(neon) = fearless_simd::Level::new().as_neon() {"]
+#[doc = "    let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);"]
+#[doc = "    let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);"]
+#[doc = "    let sum: f32x4<_> = add_f32x4(neon, a.into(), b.into()).simd_into(neon);"]
+#[doc = ""]
+#[doc = "    assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);"]
+#[doc = "}"]
+#[doc = "# }"]
+#[doc = "```"]
+#[doc = ""]
+#[doc = "See the [sRGB example] for an end-to-end use of kernel macros."]
+#[doc = ""]
+#[doc = "[sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs"]
+#[doc = ""]
+#[doc = "Kernel macros only accept safe functions."]
+#[doc = ""]
+#[doc = "```compile_fail"]
+#[doc = "fearless_simd::neon_kernel! {"]
+#[doc = "    unsafe fn should_not_compile() {}"]
+#[doc = "}"]
+#[doc = "```"]
+#[macro_export]
+macro_rules! neon_kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $($arg:ident : $arg_ty:ty),* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg(target_arch = "aarch64")]
+        $(#[$meta])*
+        $vis fn $name(
+            _simd: $crate::Neon,
+            $($arg: $arg_ty),*
+        ) $(-> $ret)? {
+            #[inline]
+            #[target_feature(enable = "neon")]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            // SAFETY: the `Neon` token proves that the required target features are available.
+            unsafe { __fearless_simd_kernel($($arg),*) }
+        }
+    };
+}
+
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+#[doc = "Creates a context where you can safely call intrinsics"]
+#[doc = "available at the [`WasmSimd128`](crate::WasmSimd128) SIMD level."]
+#[doc = ""]
+#[doc = "This is useful if the portable abstractions are not enough, and you need to"]
+#[doc = "use platform-specific intrinsics for parts of the computation."]
+#[doc = ""]
+#[doc = "See [`WasmSimd128`](crate::WasmSimd128) for the target features represented by this SIMD level."]
+#[doc = ""]
+#[doc = "The generated wrapper takes a SIMD token ([`WasmSimd128`](crate::WasmSimd128)) as its first argument and is"]
+#[doc = "compiled only when the required target features are enabled. That makes matching"]
+#[doc = "platform-specific intrinsics from `core::arch` or `std::arch` safe to call in the"]
+#[doc = "body, as long as they do not have safety requirements beyond those target features."]
+#[doc = ""]
+#[doc = "## Example"]
+#[doc = ""]
+#[doc = "```rust"]
+#[doc = "# #[allow(unused_imports)]"]
+#[doc = "use fearless_simd::{f32x4, prelude::*};"]
+#[doc = "#[cfg(all(target_arch = \"wasm32\", target_feature = \"simd128\"))]"]
+#[doc = "use std::arch::wasm32::{f32x4_add, v128};"]
+#[doc = ""]
+#[doc = "#[cfg(all(target_arch = \"wasm32\", target_feature = \"simd128\"))]"]
+#[doc = "fearless_simd::wasm_simd128_kernel! {"]
+#[doc = "    fn add_f32x4(a: v128, b: v128) -> v128 {"]
+#[doc = "        f32x4_add(a, b)"]
+#[doc = "    }"]
+#[doc = "}"]
+#[doc = ""]
+#[doc = "# fn main() {"]
+#[doc = "#[cfg(all(target_arch = \"wasm32\", target_feature = \"simd128\"))]"]
+#[doc = "{"]
+#[doc = "    if let Some(wasm) = fearless_simd::Level::new().as_wasm_simd128() {"]
+#[doc = "        let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);"]
+#[doc = "        let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);"]
+#[doc = "        let sum: f32x4<_> = add_f32x4(wasm, a.into(), b.into()).simd_into(wasm);"]
+#[doc = ""]
+#[doc = "        assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);"]
+#[doc = "    }"]
+#[doc = "}"]
+#[doc = "# }"]
+#[doc = "```"]
+#[doc = ""]
+#[doc = "See the [sRGB example] for an end-to-end use of kernel macros."]
+#[doc = ""]
+#[doc = "[sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs"]
+#[doc = ""]
+#[doc = "Kernel macros only accept safe functions."]
+#[doc = ""]
+#[doc = "```compile_fail"]
+#[doc = "fearless_simd::wasm_simd128_kernel! {"]
+#[doc = "    unsafe fn should_not_compile() {}"]
+#[doc = "}"]
+#[doc = "```"]
+#[macro_export]
+macro_rules! wasm_simd128_kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $($arg:ident : $arg_ty:ty),* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+        $(#[$meta])*
+        $vis fn $name(
+            _simd: $crate::WasmSimd128,
+            $($arg: $arg_ty),*
+        ) $(-> $ret)? {
+            #[inline]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            let _ = _simd;
+            __fearless_simd_kernel($($arg),*)
+        }
+    };
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[doc = "Creates a context where you can safely call intrinsics"]
+#[doc = "available at the [`Sse4_2`](crate::Sse4_2) SIMD level."]
+#[doc = ""]
+#[doc = "This is useful if the portable abstractions are not enough, and you need to"]
+#[doc = "use platform-specific intrinsics for parts of the computation."]
+#[doc = ""]
+#[doc = "See [`Sse4_2`](crate::Sse4_2) for the target features represented by this SIMD level."]
+#[doc = ""]
+#[doc = "The generated wrapper takes a SIMD token ([`Sse4_2`](crate::Sse4_2)) as its first argument."]
+#[doc = "The macro runs your body inside an inner function annotated with the appropriate"]
+#[doc = "`#[target_feature]` attributes. That makes platform-specific intrinsics from `core::arch` or"]
+#[doc = "`std::arch` safe to call in the body, as long as they do not have safety"]
+#[doc = "requirements beyond those target features."]
+#[doc = ""]
+#[doc = "## Example"]
+#[doc = ""]
+#[doc = "```rust"]
+#[doc = "# #[allow(unused_imports)]"]
+#[doc = "use fearless_simd::{f32x4, prelude::*};"]
+#[doc = "#[cfg(target_arch = \"x86\")]"]
+#[doc = "use std::arch::x86::{__m128, _mm_add_ps};"]
+#[doc = "#[cfg(target_arch = \"x86_64\")]"]
+#[doc = "use std::arch::x86_64::{__m128, _mm_add_ps};"]
+#[doc = ""]
+#[doc = "#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]"]
+#[doc = "fearless_simd::sse4_2_kernel! {"]
+#[doc = "    fn add_f32x4(a: __m128, b: __m128) -> __m128 {"]
+#[doc = "        _mm_add_ps(a, b)"]
+#[doc = "    }"]
+#[doc = "}"]
+#[doc = ""]
+#[doc = "# fn main() {"]
+#[doc = "#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]"]
+#[doc = "if let Some(sse4_2) = fearless_simd::Level::new().as_sse4_2() {"]
+#[doc = "    let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(sse4_2);"]
+#[doc = "    let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(sse4_2);"]
+#[doc = "    let sum: f32x4<_> = add_f32x4(sse4_2, a.into(), b.into()).simd_into(sse4_2);"]
+#[doc = ""]
+#[doc = "    assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);"]
+#[doc = "}"]
+#[doc = "# }"]
+#[doc = "```"]
+#[doc = ""]
+#[doc = "See the [sRGB example] for an end-to-end use of kernel macros."]
+#[doc = ""]
+#[doc = "[sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs"]
+#[doc = ""]
+#[doc = "Kernel macros only accept safe functions."]
+#[doc = ""]
+#[doc = "```compile_fail"]
+#[doc = "fearless_simd::sse4_2_kernel! {"]
+#[doc = "    unsafe fn should_not_compile() {}"]
+#[doc = "}"]
+#[doc = "```"]
+#[macro_export]
+macro_rules! sse4_2_kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $($arg:ident : $arg_ty:ty),* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        $(#[$meta])*
+        $vis fn $name(
+            _simd: $crate::Sse4_2,
+            $($arg: $arg_ty),*
+        ) $(-> $ret)? {
+            #[inline]
+            #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            // SAFETY: the `Sse4_2` token proves that the required target features are available.
+            unsafe { __fearless_simd_kernel($($arg),*) }
+        }
+    };
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[doc = "Creates a context where you can safely call intrinsics"]
+#[doc = "available at the [`Avx2`](crate::Avx2) SIMD level."]
+#[doc = ""]
+#[doc = "This is useful if the portable abstractions are not enough, and you need to"]
+#[doc = "use platform-specific intrinsics for parts of the computation."]
+#[doc = ""]
+#[doc = "See [`Avx2`](crate::Avx2) for the target features represented by this SIMD level."]
+#[doc = ""]
+#[doc = "The generated wrapper takes a SIMD token ([`Avx2`](crate::Avx2)) as its first argument."]
+#[doc = "The macro runs your body inside an inner function annotated with the appropriate"]
+#[doc = "`#[target_feature]` attributes. That makes platform-specific intrinsics from `core::arch` or"]
+#[doc = "`std::arch` safe to call in the body, as long as they do not have safety"]
+#[doc = "requirements beyond those target features."]
+#[doc = ""]
+#[doc = "## Example"]
+#[doc = ""]
+#[doc = "```rust"]
+#[doc = "# #[allow(unused_imports)]"]
+#[doc = "use fearless_simd::{i32x8, prelude::*};"]
+#[doc = "#[cfg(target_arch = \"x86\")]"]
+#[doc = "use std::arch::x86::{__m256i, _mm256_add_epi32};"]
+#[doc = "#[cfg(target_arch = \"x86_64\")]"]
+#[doc = "use std::arch::x86_64::{__m256i, _mm256_add_epi32};"]
+#[doc = ""]
+#[doc = "#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]"]
+#[doc = "fearless_simd::avx2_kernel! {"]
+#[doc = "    fn add_i32x8(a: __m256i, b: __m256i) -> __m256i {"]
+#[doc = "        _mm256_add_epi32(a, b)"]
+#[doc = "    }"]
+#[doc = "}"]
+#[doc = ""]
+#[doc = "# fn main() {"]
+#[doc = "#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]"]
+#[doc = "if let Some(avx2) = fearless_simd::Level::new().as_avx2() {"]
+#[doc = "    let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);"]
+#[doc = "    let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);"]
+#[doc = "    let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2);"]
+#[doc = ""]
+#[doc = "    assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]);"]
+#[doc = "}"]
+#[doc = "# }"]
+#[doc = "```"]
+#[doc = ""]
+#[doc = "See the [sRGB example] for an end-to-end use of kernel macros."]
+#[doc = ""]
+#[doc = "[sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs"]
+#[doc = ""]
+#[doc = "Kernel macros only accept safe functions."]
+#[doc = ""]
+#[doc = "```compile_fail"]
+#[doc = "fearless_simd::avx2_kernel! {"]
+#[doc = "    unsafe fn should_not_compile() {}"]
+#[doc = "}"]
+#[doc = "```"]
+#[macro_export]
+macro_rules! avx2_kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $($arg:ident : $arg_ty:ty),* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        $(#[$meta])*
+        $vis fn $name(
+            _simd: $crate::Avx2,
+            $($arg: $arg_ty),*
+        ) $(-> $ret)? {
+            #[inline]
+            #[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave")]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            // SAFETY: the `Avx2` token proves that the required target features are available.
+            unsafe { __fearless_simd_kernel($($arg),*) }
+        }
+    };
+}
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 0bfe06234..820c18b50 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -29,6 +29,11 @@ pub(crate) trait Level {
     /// If this SIMD level is not runtime-toggleable (for instance, the fallback implementation or WASM SIMD128),
     /// returns `None`.
     fn enabled_target_features(&self) -> Option<&'static str>;
+    /// The `cfg` expression under which this SIMD level token is available to generated kernel
+    /// macros.
+    fn availability_cfg(&self) -> Option<&'static str> {
+        None
+    }
     /// A function that takes a given vector type and returns the corresponding native vector type. For instance,
     /// `f32x8` would map to `__m256` on `Avx2`, and to `[f32; 8]` on `Fallback`. This will never be passed a vector
     /// type *larger* than [`Level::max_block_size`], since [`VecType::aligned_wrapper_ty`] will split those up into
diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs
index 10efdfd99..b8446dc69 100644
--- a/fearless_simd_gen/src/main.rs
+++ b/fearless_simd_gen/src/main.rs
@@ -6,7 +6,7 @@
     reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
 )]
 
-use std::{fs::File, io::Write, path::Path};
+use std::{fmt, fs::File, io::Write, path::Path};
 
 use clap::{Parser, ValueEnum};
 use proc_macro2::TokenStream;
@@ -17,6 +17,7 @@ mod arch;
 mod generic;
 mod level;
 mod mk_fallback;
+mod mk_kernel_macros;
 mod mk_neon;
 mod mk_ops;
 mod mk_simd_trait;
@@ -30,6 +31,7 @@ mod types;
 enum Module {
     SimdTypes,
     SimdTrait,
+    KernelMacros,
     Ops,
     Neon,
     Wasm,
@@ -56,16 +58,17 @@ struct Cli {
 }
 
 impl Module {
-    fn generate_code(self) -> TokenStream {
+    fn generate_code(self) -> GeneratedCode {
         match self {
-            Self::SimdTypes => mk_simd_types::mk_simd_types(),
-            Self::SimdTrait => mk_simd_trait::mk_simd_trait(),
-            Self::Ops => mk_ops::mk_ops(),
-            Self::Neon => mk_neon::Neon.make_module(),
-            Self::Wasm => mk_wasm::WasmSimd128.make_module(),
-            Self::Fallback => mk_fallback::Fallback.make_module(),
-            Self::Sse4_2 => mk_x86::X86::Sse4_2.make_module(),
-            Self::Avx2 => mk_x86::X86::Avx2.make_module(),
+            Self::SimdTypes => GeneratedCode::Tokens(mk_simd_types::mk_simd_types()),
+            Self::SimdTrait => GeneratedCode::Tokens(mk_simd_trait::mk_simd_trait()),
+            Self::KernelMacros => GeneratedCode::Source(mk_kernel_macros::mk_kernel_macros()),
+            Self::Ops => GeneratedCode::Tokens(mk_ops::mk_ops()),
+            Self::Neon => GeneratedCode::Tokens(mk_neon::Neon.make_module()),
+            Self::Wasm => GeneratedCode::Tokens(mk_wasm::WasmSimd128.make_module()),
+            Self::Fallback => GeneratedCode::Tokens(mk_fallback::Fallback.make_module()),
+            Self::Sse4_2 => GeneratedCode::Tokens(mk_x86::X86::Sse4_2.make_module()),
+            Self::Avx2 => GeneratedCode::Tokens(mk_x86::X86::Avx2.make_module()),
         }
     }
 
@@ -99,6 +102,7 @@ impl Module {
         match self {
             Self::SimdTypes => "simd_types",
             Self::SimdTrait => "simd_trait",
+            Self::KernelMacros => "kernel_macros",
             Self::Ops => "ops",
             Self::Neon => "neon",
             Self::Fallback => "fallback",
@@ -109,9 +113,24 @@ impl Module {
     }
 }
 
+enum GeneratedCode {
+    Tokens(TokenStream),
+    Source(String),
+}
+
+impl fmt::Display for GeneratedCode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Tokens(tokens) => tokens.fmt(f),
+            Self::Source(source) => f.write_str(source),
+        }
+    }
+}
+
 const MODULES: &[Module] = &[
     Module::SimdTypes,
     Module::SimdTrait,
+    Module::KernelMacros,
     Module::Ops,
     Module::Neon,
     Module::Fallback,
diff --git a/fearless_simd_gen/src/mk_kernel_macros.rs b/fearless_simd_gen/src/mk_kernel_macros.rs
new file mode 100644
index 000000000..6564418f8
--- /dev/null
+++ b/fearless_simd_gen/src/mk_kernel_macros.rs
@@ -0,0 +1,305 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use crate::{level::Level, mk_neon::Neon, mk_wasm::WasmSimd128, mk_x86::X86};
+
+/// This emits a String rather than a TokenStream
+/// because rustfmt just gives up formatting macros
+/// and we end up with a completely unreadable token soup
+/// if we don't impose formatting on it manually.
+pub(crate) fn mk_kernel_macros() -> String {
+    [
+        kernel_macro(&Neon),
+        kernel_macro(&WasmSimd128),
+        kernel_macro(&X86::Sse4_2),
+        kernel_macro(&X86::Avx2),
+    ]
+    .join("\n")
+}
+
+fn kernel_macro(level: &dyn Level) -> String {
+    let macro_name = format!("{}_kernel", snake_case(level.name()));
+    let name = level.name();
+    let cfg = level
+        .availability_cfg()
+        .expect("kernel macros should only be generated for cfg-gated SIMD levels");
+    let body = kernel_body(level);
+    let target_feature_doc = target_feature_doc(level);
+    let example_doc = example_doc(level);
+
+    KERNEL_MACRO_TEMPLATE
+        .replace("@MACRO_NAME@", &macro_name)
+        .replace("@LEVEL_NAME@", name)
+        .replace("@CFG@", cfg)
+        .replace("@BODY@", &body)
+        .replace("@TARGET_FEATURE_DOC@", &target_feature_doc)
+        .replace("@EXAMPLE_DOC@", &example_doc)
+}
+
+fn kernel_body(level: &dyn Level) -> String {
+    if let Some(features) = level.enabled_target_features() {
+        KERNEL_BODY_WITH_TARGET_FEATURES
+            .replace("@FEATURES@", features)
+            .replace("@LEVEL_NAME@", level.name())
+    } else {
+        KERNEL_BODY.to_string()
+    }
+}
+
+fn target_feature_doc(level: &dyn Level) -> String {
+    let body = if level.enabled_target_features().is_some() {
+        r#"
+#[doc = "The generated wrapper takes a SIMD token ([`@LEVEL_NAME@`](crate::@LEVEL_NAME@)) as its first argument."]
+#[doc = "The macro runs your body inside an inner function annotated with the appropriate"]
+#[doc = "`#[target_feature]` attributes. That makes platform-specific intrinsics from `core::arch` or"]
+#[doc = "`std::arch` safe to call in the body, as long as they do not have safety"]
+#[doc = "requirements beyond those target features."]
+"#
+    } else {
+        r#"
+#[doc = "The generated wrapper takes a SIMD token ([`@LEVEL_NAME@`](crate::@LEVEL_NAME@)) as its first argument and is"]
+#[doc = "compiled only when the required target features are enabled. That makes matching"]
+#[doc = "platform-specific intrinsics from `core::arch` or `std::arch` safe to call in the"]
+#[doc = "body, as long as they do not have safety requirements beyond those target features."]
+"#
+    };
+
+    body.replace("@LEVEL_NAME@", level.name())
+}
+
+fn example_doc(level: &dyn Level) -> String {
+    let example = match level.name() {
+        "Neon" => {
+            r#"
+## Example
+
+```rust
+# #[allow(unused_imports)]
+use fearless_simd::{f32x4, prelude::*};
+#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::{float32x4_t, vaddq_f32};
+
+#[cfg(target_arch = "aarch64")]
+fearless_simd::neon_kernel! {
+    fn add_f32x4(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+        vaddq_f32(a, b)
+    }
+}
+
+# fn main() {
+#[cfg(target_arch = "aarch64")]
+if let Some(neon) = fearless_simd::Level::new().as_neon() {
+    let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
+    let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
+    let sum: f32x4<_> = add_f32x4(neon, a.into(), b.into()).simd_into(neon);
+
+    assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);
+}
+# }
+```
+"#
+        }
+        "WasmSimd128" => {
+            r#"
+## Example
+
+```rust
+# #[allow(unused_imports)]
+use fearless_simd::{f32x4, prelude::*};
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+use std::arch::wasm32::{f32x4_add, v128};
+
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+fearless_simd::wasm_simd128_kernel! {
+    fn add_f32x4(a: v128, b: v128) -> v128 {
+        f32x4_add(a, b)
+    }
+}
+
+# fn main() {
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+{
+    if let Some(wasm) = fearless_simd::Level::new().as_wasm_simd128() {
+        let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
+        let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
+        let sum: f32x4<_> = add_f32x4(wasm, a.into(), b.into()).simd_into(wasm);
+
+        assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);
+    }
+}
+# }
+```
+"#
+        }
+        "Sse4_2" => {
+            r#"
+## Example
+
+```rust
+# #[allow(unused_imports)]
+use fearless_simd::{f32x4, prelude::*};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::{__m128, _mm_add_ps};
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::{__m128, _mm_add_ps};
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fearless_simd::sse4_2_kernel! {
+    fn add_f32x4(a: __m128, b: __m128) -> __m128 {
+        _mm_add_ps(a, b)
+    }
+}
+
+# fn main() {
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+if let Some(sse4_2) = fearless_simd::Level::new().as_sse4_2() {
+    let a: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(sse4_2);
+    let b: f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(sse4_2);
+    let sum: f32x4<_> = add_f32x4(sse4_2, a.into(), b.into()).simd_into(sse4_2);
+
+    assert_eq!(<[f32; 4]>::from(sum), [11.0, 22.0, 33.0, 44.0]);
+}
+# }
+```
+"#
+        }
+        "Avx2" => {
+            r#"
+## Example
+
+```rust
+# #[allow(unused_imports)]
+use fearless_simd::{i32x8, prelude::*};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::{__m256i, _mm256_add_epi32};
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::{__m256i, _mm256_add_epi32};
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fearless_simd::avx2_kernel! {
+    fn add_i32x8(a: __m256i, b: __m256i) -> __m256i {
+        _mm256_add_epi32(a, b)
+    }
+}
+
+# fn main() {
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+if let Some(avx2) = fearless_simd::Level::new().as_avx2() {
+    let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+    let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+    let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2);
+
+    assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]);
+}
+# }
+```
+"#
+        }
+        _ => unreachable!("kernel macros are only generated for known SIMD levels"),
+    };
+
+    doc_block(example)
+}
+
+fn doc_block(markdown: &str) -> String {
+    markdown
+        .trim_matches('\n')
+        .lines()
+        .map(|line| {
+            format!(
+                r#"#[doc = "{}"]"#,
+                line.replace('\\', "\\\\").replace('"', "\\\"")
+            )
+        })
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+fn snake_case(name: &str) -> String {
+    let mut result = String::new();
+    let mut prev_was_lowercase = false;
+    for ch in name.chars() {
+        if ch == '_' {
+            result.push(ch);
+            prev_was_lowercase = false;
+        } else if ch.is_uppercase() {
+            if prev_was_lowercase {
+                result.push('_');
+            }
+            result.extend(ch.to_lowercase());
+            prev_was_lowercase = false;
+        } else {
+            result.push(ch);
+            prev_was_lowercase = ch.is_lowercase();
+        }
+    }
+    result
+}
+
+const KERNEL_MACRO_TEMPLATE: &str = r#"
+#[cfg(@CFG@)]
+#[doc = "Creates a context where you can safely call intrinsics"]
+#[doc = "available at the [`@LEVEL_NAME@`](crate::@LEVEL_NAME@) SIMD level."]
+#[doc = ""]
+#[doc = "This is useful if the portable abstractions are not enough, and you need to"]
+#[doc = "use platform-specific intrinsics for parts of the computation."]
+#[doc = ""]
+#[doc = "See [`@LEVEL_NAME@`](crate::@LEVEL_NAME@) for the target features represented by this SIMD level."]
+#[doc = ""]
+@TARGET_FEATURE_DOC@
+#[doc = ""]
+@EXAMPLE_DOC@
+#[doc = ""]
+#[doc = "See the [sRGB example] for an end-to-end use of kernel macros."]
+#[doc = ""]
+#[doc = "[sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs"]
+#[doc = ""]
+#[doc = "Kernel macros only accept safe functions."]
+#[doc = ""]
+#[doc = "```compile_fail"]
+#[doc = "fearless_simd::@MACRO_NAME@! {"]
+#[doc = "    unsafe fn should_not_compile() {}"]
+#[doc = "}"]
+#[doc = "```"]
+#[macro_export]
+macro_rules! @MACRO_NAME@ {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $($arg:ident : $arg_ty:ty),* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg(@CFG@)]
+        $(#[$meta])*
+        $vis fn $name(
+            _simd: $crate::@LEVEL_NAME@,
+            $($arg: $arg_ty),*
+        ) $(-> $ret)? {
+@BODY@
+        }
+    };
+}
+"#;
+
+const KERNEL_BODY_WITH_TARGET_FEATURES: &str = r#"            #[inline]
+            #[target_feature(enable = "@FEATURES@")]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            // SAFETY: the `@LEVEL_NAME@` token proves that the required target features are available.
+            unsafe { __fearless_simd_kernel($($arg),*) }"#;
+
+const KERNEL_BODY: &str = r#"            #[inline]
+            fn __fearless_simd_kernel(
+                $($arg: $arg_ty),*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+
+            let _ = _simd;
+            __fearless_simd_kernel($($arg),*)"#;
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index ad356dfac..fc1318378 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -36,6 +36,10 @@ impl Level for Neon {
         Some("neon")
     }
 
+    fn availability_cfg(&self) -> Option<&'static str> {
+        Some(r#"target_arch = "aarch64""#)
+    }
+
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream {
         let scalar = match vec_ty.scalar {
             ScalarType::Float => "float",
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 705777769..f2fd7de38 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -37,6 +37,10 @@ impl Level for WasmSimd128 {
         None
     }
 
+    fn availability_cfg(&self) -> Option<&'static str> {
+        Some(r#"all(target_arch = "wasm32", target_feature = "simd128")"#)
+    }
+
     fn arch_ty(&self, _vec_ty: &VecType) -> TokenStream {
         quote! { v128 }
     }
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index e0c269490..d1abba6d3 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -48,6 +48,10 @@ impl Level for X86 {
         })
     }
 
+    fn availability_cfg(&self) -> Option<&'static str> {
+        Some(r#"any(target_arch = "x86", target_arch = "x86_64")"#)
+    }
+
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream {
         let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
             (ScalarType::Float, 32) => "",