diff --git a/crates/libm-test/src/f8_impl.rs b/crates/libm-test/src/f8_impl.rs index 56ea0b72..0683d839 100644 --- a/crates/libm-test/src/f8_impl.rs +++ b/crates/libm-test/src/f8_impl.rs @@ -78,6 +78,10 @@ impl Float for f8 { libm::generic::copysign(self, other) } + fn fma(self, _y: Self, _z: Self) -> Self { + unimplemented!() + } + fn normalize(_significand: Self::Int) -> (i32, Self::Int) { unimplemented!() } diff --git a/etc/function-definitions.json b/etc/function-definitions.json index 63d9927a..64a775ba 100644 --- a/etc/function-definitions.json +++ b/etc/function-definitions.json @@ -130,8 +130,7 @@ "copysign": { "sources": [ "src/math/copysign.rs", - "src/math/generic/copysign.rs", - "src/math/support/float_traits.rs" + "src/math/generic/copysign.rs" ], "type": "f64" }, @@ -343,22 +342,19 @@ }, "fma": { "sources": [ - "src/math/fma.rs", - "src/math/generic/fma.rs" + "src/math/fma.rs" ], "type": "f64" }, "fmaf": { "sources": [ - "src/math/fmaf.rs", - "src/math/generic/fma.rs" + "src/math/fma_wide.rs" ], "type": "f32" }, "fmaf128": { "sources": [ - "src/math/fmaf128.rs", - "src/math/generic/fma.rs" + "src/math/fma.rs" ], "type": "f128" }, diff --git a/etc/update-api-list.py b/etc/update-api-list.py index c0b6e41d..67d1b050 100755 --- a/etc/update-api-list.py +++ b/etc/update-api-list.py @@ -24,7 +24,7 @@ DIRECTORIES = [".github", "ci", "crates", "etc", "src"] # These files do not trigger a retest. -IGNORED_SOURCES = ["src/libm_helper.rs"] +IGNORED_SOURCES = ["src/libm_helper.rs", "src/math/support/float_traits.rs"] IndexTy: TypeAlias = dict[str, dict[str, Any]] """Type of the `index` item in rustdoc's JSON output""" diff --git a/src/math/cbrt.rs b/src/math/cbrt.rs index 8560d37a..9d3311cd 100644 --- a/src/math/cbrt.rs +++ b/src/math/cbrt.rs @@ -103,11 +103,11 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult { * and rr an approximation of 1/zz. We now perform another iteration of * Newton-Raphson, this time with a linear approximation only. */ y2 = y * y; - let mut y2l: f64 = fmaf64(y, y, -y2); + let mut y2l: f64 = y.fma(y, -y2); /* y2 + y2l = y^2 exactly */ let mut y3: f64 = y2 * y; - let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l; + let mut y3l: f64 = y.fma(y2, -y3) + y * y2l; /* y3 + y3l approximates y^3 with about 106 bits of accuracy */ h = ((y3 - zz) + y3l) * rr; @@ -132,9 +132,9 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult { cold_path(); y2 = y1 * y1; - y2l = fmaf64(y1, y1, -y2); + y2l = y1.fma(y1, -y2); y3 = y2 * y1; - y3l = fmaf64(y1, y2, -y3) + y1 * y2l; + y3l = y1.fma(y2, -y3) + y1 * y2l; h = ((y3 - zz) + y3l) * rr; dy = h * (y1 * u0); y = y1 - dy; @@ -198,18 +198,6 @@ pub fn cbrt_round(x: f64, round: Round) -> FpResult { FpResult::ok(f64::from_bits(cvt3)) } -fn fmaf64(x: f64, y: f64, z: f64) -> f64 { - #[cfg(intrinsics_enabled)] - { - return unsafe { core::intrinsics::fmaf64(x, y, z) }; - } - - #[cfg(not(intrinsics_enabled))] - { - return super::fma(x, y, z); - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/math/fma.rs b/src/math/fma.rs index 69cc3eb6..a54984c9 100644 --- a/src/math/fma.rs +++ b/src/math/fma.rs @@ -1,14 +1,364 @@ +/* SPDX-License-Identifier: MIT */ +/* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */ + +use super::super::support::{DInt, FpResult, HInt, IntTy, Round, Status}; +use super::{CastFrom, CastInto, Float, Int, MinInt}; + /// Fused multiply add (f64) /// /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). 
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] pub fn fma(x: f64, y: f64, z: f64) -> f64 { - return super::generic::fma(x, y, z); + fma_round(x, y, z, Round::Nearest).val +} + +/// Fused multiply add (f128) +/// +/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). +#[cfg(f128_enabled)] +#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] +pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 { + fma_round(x, y, z, Round::Nearest).val +} + +/// Fused multiply-add that works when there is not a larger float size available. Computes +/// `(x * y) + z`. +pub fn fma_round(x: F, y: F, z: F, _round: Round) -> FpResult +where + F: Float, + F: CastFrom, + F: CastFrom, + F::Int: HInt, + u32: CastInto, +{ + let one = IntTy::::ONE; + let zero = IntTy::::ZERO; + + // Normalize such that the top of the mantissa is zero and we have a guard bit. + let nx = Norm::from_float(x); + let ny = Norm::from_float(y); + let nz = Norm::from_float(z); + + if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() { + // Value will overflow, defer to non-fused operations. + return FpResult::ok(x * y + z); + } + + if nz.is_zero_nan_inf() { + if nz.is_zero() { + // Empty add component means we only need to multiply. + return FpResult::ok(x * y); + } + // `z` is NaN or infinity, which sets the result. + return FpResult::ok(z); + } + + // multiply: r = x * y + let zhi: F::Int; + let zlo: F::Int; + let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi(); + + // Exponent result of multiplication + let mut e: i32 = nx.e + ny.e; + // Needed shift to align `z` to the multiplication result + let mut d: i32 = nz.e - e; + let sbits = F::BITS as i32; + + // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz) + if d > 0 { + // The magnitude of `z` is larger than `x * y` + if d < sbits { + // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift + // it into `(zhi, zlo)`. No exponent adjustment necessary. + zlo = nz.m << d; + zhi = nz.m >> (sbits - d); + } else { + // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts + // as a shift by `sbits`). + zlo = zero; + zhi = nz.m; + d -= sbits; + + // `z`'s exponent is large enough that it now needs to be taken into account. + e = nz.e - sbits; + + if d == 0 { + // Exactly `sbits`, nothing to do + } else if d < sbits { + // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y` + rlo = (rhi << (sbits - d)) | (rlo >> d); + // Set the sticky bit + rlo |= IntTy::::from((rlo << (sbits - d)) != zero); + rhi = rhi >> d; + } else { + // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set + // the sticky bit. + rlo = one; + rhi = zero; + } + } + } else { + // `z`'s magnitude once shifted fits entirely within `zlo` + zhi = zero; + d = -d; + if d == 0 { + // No shift needed + zlo = nz.m; + } else if d < sbits { + // Shift s.t. 
`nz.m` fits into `zlo` + let sticky = IntTy::::from((nz.m << (sbits - d)) != zero); + zlo = (nz.m >> d) | sticky; + } else { + // Would be entirely shifted out, only set the sticky bit + zlo = one; + } + } + + /* addition */ + + let mut neg = nx.neg ^ ny.neg; + let samesign: bool = !neg ^ nz.neg; + let mut rhi_nonzero = true; + + if samesign { + // r += z + rlo = rlo.wrapping_add(zlo); + rhi += zhi + IntTy::::from(rlo < zlo); + } else { + // r -= z + let (res, borrow) = rlo.overflowing_sub(zlo); + rlo = res; + rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::::from(borrow))); + if (rhi >> (F::BITS - 1)) != zero { + rlo = rlo.signed().wrapping_neg().unsigned(); + rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::::from(rlo != zero); + neg = !neg; + } + rhi_nonzero = rhi != zero; + } + + /* Construct result */ + + // Shift result into `rhi`, left-aligned. Last bit is sticky + if rhi_nonzero { + // `d` > 0, need to shift both `rhi` and `rlo` into result + e += sbits; + d = rhi.leading_zeros() as i32 - 1; + rhi = (rhi << d) | (rlo >> (sbits - d)); + // Update sticky + rhi |= IntTy::::from((rlo << d) != zero); + } else if rlo != zero { + // `rhi` is zero, `rlo` is the entire result and needs to be shifted + d = rlo.leading_zeros() as i32 - 1; + if d < 0 { + // Shift and set sticky + rhi = (rlo >> 1) | (rlo & one); + } else { + rhi = rlo << d; + } + } else { + // exact +/- 0.0 + return FpResult::ok(x * y + z); + } + + e -= d; + + // Use int->float conversion to populate the significand. + // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1] + let mut i: F::SignedInt = rhi.signed(); + + if neg { + i = -i; + } + + // `|r|` is in `[0x1p62,0x1p63]` for `f64` + let mut r: F = F::cast_from_lossy(i); + + /* Account for subnormal and rounding */ + + // Unbiased exponent for the maximum value of `r` + let max_pow = F::BITS - 1 + F::EXP_BIAS; + + let mut status = Status::OK; + + if e < -(max_pow as i32 - 2) { + // Result is subnormal before rounding + if e == -(max_pow as i32 - 1) { + let mut c = F::from_parts(false, max_pow, zero); + if neg { + c = -c; + } + + if r == c { + // Min normal after rounding, + status.set_underflow(true); + r = F::MIN_POSITIVE_NORMAL.copysign(r); + return FpResult::new(r, status); + } + + if (rhi << (F::SIG_BITS + 1)) != zero { + // Account for truncated bits. One bit will be lost in the `scalbn` call, add + // another top bit to avoid double rounding if inexact. + let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2)); + i = iu.signed(); + + if neg { + i = -i; + } + + r = F::cast_from_lossy(i); + + // Remove the top bit + r = F::cast_from(2i8) * r - c; + status.set_underflow(true); + } + } else { + // Only round once when scaled + d = F::EXP_BITS as i32 - 1; + let sticky = IntTy::::from(rhi << (F::BITS as i32 - d) != zero); + i = (((rhi >> d) | sticky) << d).signed(); + + if neg { + i = -i; + } + + r = F::cast_from_lossy(i); + } + } + + // Use our exponent to scale the final value. + FpResult::new(super::generic::scalbn(r, e), status) +} + +/// Representation of `F` that has handled subnormals. +#[derive(Clone, Copy, Debug)] +struct Norm { + /// Normalized significand with one guard bit, unsigned. + m: F::Int, + /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa + /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`. + e: i32, + neg: bool, +} + +impl Norm { + /// Unbias the exponent and account for the mantissa's precision, including the guard bit. 
+ const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1; + + /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we + /// adjusted the exponent such that it exceeds this threashold. + const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS; + + fn from_float(x: F) -> Self { + let mut ix = x.to_bits(); + let mut e = x.ex() as i32; + let neg = x.is_sign_negative(); + if e == 0 { + // Normalize subnormals by multiplication + let scale_i = F::BITS - 1; + let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO); + let scaled = x * scale_f; + ix = scaled.to_bits(); + e = scaled.ex() as i32; + e = if e == 0 { + // If the exponent is still zero, the input was zero. Artifically set this value + // such that the final `e` will exceed `ZERO_INF_NAN`. + 1 << F::EXP_BITS + } else { + // Otherwise, account for the scaling we just did. + e - scale_i as i32 + }; + } + + e -= Self::EXP_UNBIAS as i32; + + // Absolute value, set the implicit bit, and shift to create a guard bit + ix &= F::SIG_MASK; + ix |= F::IMPLICIT_BIT; + ix <<= 1; + + Self { m: ix, e, neg } + } + + /// True if the value was zero, infinity, or NaN. + fn is_zero_nan_inf(self) -> bool { + self.e >= Self::ZERO_INF_NAN as i32 + } + + /// The only value we have + fn is_zero(self) -> bool { + // The only exponent that strictly exceeds this value is our sentinel value for zero. + self.e > Self::ZERO_INF_NAN as i32 + } } #[cfg(test)] mod tests { use super::*; + + /// Test the generic `fma_round` algorithm for a given float. + fn spec_test() + where + F: Float, + F: CastFrom, + F: CastFrom, + F::Int: HInt, + u32: CastInto, + { + let x = F::from_bits(F::Int::ONE); + let y = F::from_bits(F::Int::ONE); + let z = F::ZERO; + + let fma = |x, y, z| fma_round(x, y, z, Round::Nearest).val; + + // 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of + // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the + // exact result" + assert_biteq!(fma(x, y, z), F::ZERO); + assert_biteq!(fma(x, -y, z), F::NEG_ZERO); + assert_biteq!(fma(-x, y, z), F::NEG_ZERO); + assert_biteq!(fma(-x, -y, z), F::ZERO); + } + + #[test] + fn spec_test_f32() { + spec_test::(); + } + + #[test] + fn spec_test_f64() { + spec_test::(); + + let expect_underflow = [ + ( + hf64!("0x1.0p-1070"), + hf64!("0x1.0p-1070"), + hf64!("0x1.ffffffffffffp-1023"), + hf64!("0x0.ffffffffffff8p-1022"), + ), + ( + // FIXME: we raise underflow but this should only be inexact (based on C and + // `rustc_apfloat`). + hf64!("0x1.0p-1070"), + hf64!("0x1.0p-1070"), + hf64!("-0x1.0p-1022"), + hf64!("-0x1.0p-1022"), + ), + ]; + + for (x, y, z, res) in expect_underflow { + let FpResult { val, status } = fma_round(x, y, z, Round::Nearest); + assert_biteq!(val, res); + assert_eq!(status, Status::UNDERFLOW); + } + } + + #[test] + #[cfg(f128_enabled)] + fn spec_test_f128() { + spec_test::(); + } + #[test] fn fma_segfault() { // These two inputs cause fma to segfault on release due to overflow: diff --git a/src/math/fma_wide.rs b/src/math/fma_wide.rs new file mode 100644 index 00000000..a8c1a548 --- /dev/null +++ b/src/math/fma_wide.rs @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: MIT */ +/* origin: musl src/math/fmaf.c. Ported to generic Rust algorithm in 2025, TG. */ + +use super::super::support::{FpResult, IntTy, Round, Status}; +use super::{CastFrom, CastInto, DFloat, Float, HFloat, MinInt}; + +// Placeholder so we can have `fmaf16` in the `Float` trait. 
+#[allow(unused)] +#[cfg(f16_enabled)] +#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] +pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { + unimplemented!() +} + +/// Floating multiply add (f32) +/// +/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). +#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] +pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { + fma_wide_round(x, y, z, Round::Nearest).val +} + +/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`, +/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding. +pub fn fma_wide_round(x: F, y: F, z: F, round: Round) -> FpResult +where + F: Float + HFloat, + B: Float + DFloat, + B::Int: CastInto, + i32: CastFrom, +{ + let one = IntTy::::ONE; + + let xy: B = x.widen() * y.widen(); + let mut result: B = xy + z.widen(); + let mut ui: B::Int = result.to_bits(); + let re = result.ex(); + let zb: B = z.widen(); + + let prec_diff = B::SIG_BITS - F::SIG_BITS; + let excess_prec = ui & ((one << prec_diff) - one); + let halfway = one << (prec_diff - 1); + + // Common case: the larger precision is fine if... + // This is not a halfway case + if excess_prec != halfway + // Or the result is NaN + || re == B::EXP_SAT + // Or the result is exact + || (result - xy == zb && result - zb == xy) + // Or the mode is something other than round to nearest + || round != Round::Nearest + { + let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32; + let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32; + + let mut status = Status::OK; + + if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() { + // This branch is never hit; requires previous operations to set a status + status.set_inexact(false); + + result = xy + z.widen(); + if status.inexact() { + status.set_underflow(true); + } else { + status.set_inexact(true); + } + } + + return FpResult { val: result.narrow(), status }; + } + + let neg = ui >> (B::BITS - 1) != IntTy::::ZERO; + let err = if neg == (zb > xy) { xy - result + zb } else { zb - result + xy }; + if neg == (err < B::ZERO) { + ui += one; + } else { + ui -= one; + } + + FpResult::ok(B::from_bits(ui).narrow()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn issue_263() { + let a = f32::from_bits(1266679807); + let b = f32::from_bits(1300234242); + let c = f32::from_bits(1115553792); + let expected = f32::from_bits(1501560833); + assert_eq!(fmaf(a, b, c), expected); + } +} diff --git a/src/math/fmaf.rs b/src/math/fmaf.rs deleted file mode 100644 index 40d7f40d..00000000 --- a/src/math/fmaf.rs +++ /dev/null @@ -1,21 +0,0 @@ -/// Floating multiply add (f32) -/// -/// Computes `(x*y)+z`, rounded as one ternary operation: -/// Computes the value (as if) to infinite precision and rounds once to the result format, -/// according to the rounding mode characterized by the value of FLT_ROUNDS. 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] -pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { - super::generic::fma_wide(x, y, z) -} - -#[cfg(test)] -mod tests { - #[test] - fn issue_263() { - let a = f32::from_bits(1266679807); - let b = f32::from_bits(1300234242); - let c = f32::from_bits(1115553792); - let expected = f32::from_bits(1501560833); - assert_eq!(super::fmaf(a, b, c), expected); - } -} diff --git a/src/math/fmaf128.rs b/src/math/fmaf128.rs deleted file mode 100644 index 50f7360d..00000000 --- a/src/math/fmaf128.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// Fused multiply add (f128) -/// -/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). -#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] -pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 { - return super::generic::fma(x, y, z); -} diff --git a/src/math/generic/fma.rs b/src/math/generic/fma.rs deleted file mode 100644 index cb1061cc..00000000 --- a/src/math/generic/fma.rs +++ /dev/null @@ -1,420 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* origin: musl src/math/{fma,fmaf}.c. Ported to generic Rust algorithm in 2025, TG. */ - -use super::super::support::{DInt, FpResult, HInt, IntTy, Round, Status}; -use super::super::{CastFrom, CastInto, DFloat, Float, HFloat, Int, MinInt}; - -/// Fused multiply-add that works when there is not a larger float size available. Currently this -/// is still specialized only for `f64`. Computes `(x * y) + z`. -#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] -pub fn fma(x: F, y: F, z: F) -> F -where - F: Float, - F: CastFrom, - F: CastFrom, - F::Int: HInt, - u32: CastInto, -{ - fma_round(x, y, z, Round::Nearest).val -} - -pub fn fma_round(x: F, y: F, z: F, _round: Round) -> FpResult -where - F: Float, - F: CastFrom, - F: CastFrom, - F::Int: HInt, - u32: CastInto, -{ - let one = IntTy::::ONE; - let zero = IntTy::::ZERO; - - // Normalize such that the top of the mantissa is zero and we have a guard bit. - let nx = Norm::from_float(x); - let ny = Norm::from_float(y); - let nz = Norm::from_float(z); - - if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() { - // Value will overflow, defer to non-fused operations. - return FpResult::ok(x * y + z); - } - - if nz.is_zero_nan_inf() { - if nz.is_zero() { - // Empty add component means we only need to multiply. - return FpResult::ok(x * y); - } - // `z` is NaN or infinity, which sets the result. - return FpResult::ok(z); - } - - // multiply: r = x * y - let zhi: F::Int; - let zlo: F::Int; - let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi(); - - // Exponent result of multiplication - let mut e: i32 = nx.e + ny.e; - // Needed shift to align `z` to the multiplication result - let mut d: i32 = nz.e - e; - let sbits = F::BITS as i32; - - // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz) - if d > 0 { - // The magnitude of `z` is larger than `x * y` - if d < sbits { - // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift - // it into `(zhi, zlo)`. No exponent adjustment necessary. - zlo = nz.m << d; - zhi = nz.m >> (sbits - d); - } else { - // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts - // as a shift by `sbits`). - zlo = zero; - zhi = nz.m; - d -= sbits; - - // `z`'s exponent is large enough that it now needs to be taken into account. - e = nz.e - sbits; - - if d == 0 { - // Exactly `sbits`, nothing to do - } else if d < sbits { - // Remaining shift fits within `sbits`. 
Leave `z` in place, shift `x * y` - rlo = (rhi << (sbits - d)) | (rlo >> d); - // Set the sticky bit - rlo |= IntTy::::from((rlo << (sbits - d)) != zero); - rhi = rhi >> d; - } else { - // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set - // the sticky bit. - rlo = one; - rhi = zero; - } - } - } else { - // `z`'s magnitude once shifted fits entirely within `zlo` - zhi = zero; - d = -d; - if d == 0 { - // No shift needed - zlo = nz.m; - } else if d < sbits { - // Shift s.t. `nz.m` fits into `zlo` - let sticky = IntTy::::from((nz.m << (sbits - d)) != zero); - zlo = (nz.m >> d) | sticky; - } else { - // Would be entirely shifted out, only set the sticky bit - zlo = one; - } - } - - /* addition */ - - let mut neg = nx.neg ^ ny.neg; - let samesign: bool = !neg ^ nz.neg; - let mut rhi_nonzero = true; - - if samesign { - // r += z - rlo = rlo.wrapping_add(zlo); - rhi += zhi + IntTy::::from(rlo < zlo); - } else { - // r -= z - let (res, borrow) = rlo.overflowing_sub(zlo); - rlo = res; - rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::::from(borrow))); - if (rhi >> (F::BITS - 1)) != zero { - rlo = rlo.signed().wrapping_neg().unsigned(); - rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::::from(rlo != zero); - neg = !neg; - } - rhi_nonzero = rhi != zero; - } - - /* Construct result */ - - // Shift result into `rhi`, left-aligned. Last bit is sticky - if rhi_nonzero { - // `d` > 0, need to shift both `rhi` and `rlo` into result - e += sbits; - d = rhi.leading_zeros() as i32 - 1; - rhi = (rhi << d) | (rlo >> (sbits - d)); - // Update sticky - rhi |= IntTy::::from((rlo << d) != zero); - } else if rlo != zero { - // `rhi` is zero, `rlo` is the entire result and needs to be shifted - d = rlo.leading_zeros() as i32 - 1; - if d < 0 { - // Shift and set sticky - rhi = (rlo >> 1) | (rlo & one); - } else { - rhi = rlo << d; - } - } else { - // exact +/- 0.0 - return FpResult::ok(x * y + z); - } - - e -= d; - - // Use int->float conversion to populate the significand. - // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1] - let mut i: F::SignedInt = rhi.signed(); - - if neg { - i = -i; - } - - // `|r|` is in `[0x1p62,0x1p63]` for `f64` - let mut r: F = F::cast_from_lossy(i); - - /* Account for subnormal and rounding */ - - // Unbiased exponent for the maximum value of `r` - let max_pow = F::BITS - 1 + F::EXP_BIAS; - - let mut status = Status::OK; - - if e < -(max_pow as i32 - 2) { - // Result is subnormal before rounding - if e == -(max_pow as i32 - 1) { - let mut c = F::from_parts(false, max_pow, zero); - if neg { - c = -c; - } - - if r == c { - // Min normal after rounding, - status.set_underflow(true); - r = F::MIN_POSITIVE_NORMAL.copysign(r); - return FpResult::new(r, status); - } - - if (rhi << (F::SIG_BITS + 1)) != zero { - // Account for truncated bits. One bit will be lost in the `scalbn` call, add - // another top bit to avoid double rounding if inexact. - let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2)); - i = iu.signed(); - - if neg { - i = -i; - } - - r = F::cast_from_lossy(i); - - // Remove the top bit - r = F::cast_from(2i8) * r - c; - status.set_underflow(true); - } - } else { - // Only round once when scaled - d = F::EXP_BITS as i32 - 1; - let sticky = IntTy::::from(rhi << (F::BITS as i32 - d) != zero); - i = (((rhi >> d) | sticky) << d).signed(); - - if neg { - i = -i; - } - - r = F::cast_from_lossy(i); - } - } - - // Use our exponent to scale the final value. 
- FpResult::new(super::scalbn(r, e), status) -} - -/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`, -/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding. -pub fn fma_wide(x: F, y: F, z: F) -> F -where - F: Float + HFloat, - B: Float + DFloat, - B::Int: CastInto, - i32: CastFrom, -{ - fma_wide_round(x, y, z, Round::Nearest).val -} - -pub fn fma_wide_round(x: F, y: F, z: F, round: Round) -> FpResult -where - F: Float + HFloat, - B: Float + DFloat, - B::Int: CastInto, - i32: CastFrom, -{ - let one = IntTy::::ONE; - - let xy: B = x.widen() * y.widen(); - let mut result: B = xy + z.widen(); - let mut ui: B::Int = result.to_bits(); - let re = result.ex(); - let zb: B = z.widen(); - - let prec_diff = B::SIG_BITS - F::SIG_BITS; - let excess_prec = ui & ((one << prec_diff) - one); - let halfway = one << (prec_diff - 1); - - // Common case: the larger precision is fine if... - // This is not a halfway case - if excess_prec != halfway - // Or the result is NaN - || re == B::EXP_SAT - // Or the result is exact - || (result - xy == zb && result - zb == xy) - // Or the mode is something other than round to nearest - || round != Round::Nearest - { - let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32; - let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32; - - let mut status = Status::OK; - - if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() { - // This branch is never hit; requires previous operations to set a status - status.set_inexact(false); - - result = xy + z.widen(); - if status.inexact() { - status.set_underflow(true); - } else { - status.set_inexact(true); - } - } - - return FpResult { val: result.narrow(), status }; - } - - let neg = ui >> (B::BITS - 1) != IntTy::::ZERO; - let err = if neg == (zb > xy) { xy - result + zb } else { zb - result + xy }; - if neg == (err < B::ZERO) { - ui += one; - } else { - ui -= one; - } - - FpResult::ok(B::from_bits(ui).narrow()) -} - -/// Representation of `F` that has handled subnormals. -#[derive(Clone, Copy, Debug)] -struct Norm { - /// Normalized significand with one guard bit, unsigned. - m: F::Int, - /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa - /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`. - e: i32, - neg: bool, -} - -impl Norm { - /// Unbias the exponent and account for the mantissa's precision, including the guard bit. - const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1; - - /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we - /// adjusted the exponent such that it exceeds this threashold. - const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS; - - fn from_float(x: F) -> Self { - let mut ix = x.to_bits(); - let mut e = x.ex() as i32; - let neg = x.is_sign_negative(); - if e == 0 { - // Normalize subnormals by multiplication - let scale_i = F::BITS - 1; - let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO); - let scaled = x * scale_f; - ix = scaled.to_bits(); - e = scaled.ex() as i32; - e = if e == 0 { - // If the exponent is still zero, the input was zero. Artifically set this value - // such that the final `e` will exceed `ZERO_INF_NAN`. - 1 << F::EXP_BITS - } else { - // Otherwise, account for the scaling we just did. 
- e - scale_i as i32 - }; - } - - e -= Self::EXP_UNBIAS as i32; - - // Absolute value, set the implicit bit, and shift to create a guard bit - ix &= F::SIG_MASK; - ix |= F::IMPLICIT_BIT; - ix <<= 1; - - Self { m: ix, e, neg } - } - - /// True if the value was zero, infinity, or NaN. - fn is_zero_nan_inf(self) -> bool { - self.e >= Self::ZERO_INF_NAN as i32 - } - - /// The only value we have - fn is_zero(self) -> bool { - // The only exponent that strictly exceeds this value is our sentinel value for zero. - self.e > Self::ZERO_INF_NAN as i32 - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn spec_test() - where - F: Float, - F: CastFrom, - F: CastFrom, - F::Int: HInt, - u32: CastInto, - { - let x = F::from_bits(F::Int::ONE); - let y = F::from_bits(F::Int::ONE); - let z = F::ZERO; - - // 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of - // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the - // exact result" - assert_biteq!(fma(x, y, z), F::ZERO); - assert_biteq!(fma(x, -y, z), F::NEG_ZERO); - assert_biteq!(fma(-x, y, z), F::NEG_ZERO); - assert_biteq!(fma(-x, -y, z), F::ZERO); - } - - #[test] - fn spec_test_f64() { - spec_test::(); - - let expect_underflow = [ - ( - hf64!("0x1.0p-1070"), - hf64!("0x1.0p-1070"), - hf64!("0x1.ffffffffffffp-1023"), - hf64!("0x0.ffffffffffff8p-1022"), - ), - ( - // FIXME: we raise underflow but this should only be inexact (based on C and - // `rustc_apfloat`). - hf64!("0x1.0p-1070"), - hf64!("0x1.0p-1070"), - hf64!("-0x1.0p-1022"), - hf64!("-0x1.0p-1022"), - ), - ]; - - for (x, y, z, res) in expect_underflow { - let FpResult { val, status } = fma_round(x, y, z, Round::Nearest); - assert_biteq!(val, res); - assert_eq!(status, Status::UNDERFLOW); - } - } - - #[test] - #[cfg(f128_enabled)] - fn spec_test_f128() { - spec_test::(); - } -} diff --git a/src/math/generic/mod.rs b/src/math/generic/mod.rs index f224eba7..9be185f8 100644 --- a/src/math/generic/mod.rs +++ b/src/math/generic/mod.rs @@ -3,7 +3,6 @@ mod copysign; mod fabs; mod fdim; mod floor; -mod fma; mod fmax; mod fmaximum; mod fmaximum_num; @@ -22,7 +21,6 @@ pub use copysign::copysign; pub use fabs::fabs; pub use fdim::fdim; pub use floor::floor; -pub use fma::{fma, fma_wide}; pub use fmax::fmax; pub use fmaximum::fmaximum; pub use fmaximum_num::fmaximum_num; diff --git a/src/math/mod.rs b/src/math/mod.rs index e58d79ad..5fc8fa0b 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -164,7 +164,7 @@ mod fdimf; mod floor; mod floorf; mod fma; -mod fmaf; +mod fma_wide; mod fmin_fmax; mod fminimum_fmaximum; mod fminimum_fmaximum_num; @@ -271,7 +271,7 @@ pub use self::fdimf::fdimf; pub use self::floor::floor; pub use self::floorf::floorf; pub use self::fma::fma; -pub use self::fmaf::fmaf; +pub use self::fma_wide::fmaf; pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf}; pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf}; pub use self::fminimum_fmaximum_num::{fmaximum_num, fmaximum_numf, fminimum_num, fminimum_numf}; @@ -370,6 +370,9 @@ cfg_if! { pub use self::sqrtf16::sqrtf16; pub use self::truncf16::truncf16; // verify-sorted-end + + #[allow(unused_imports)] + pub(crate) use self::fma_wide::fmaf16; } } @@ -381,7 +384,6 @@ cfg_if! { mod fabsf128; mod fdimf128; mod floorf128; - mod fmaf128; mod fmodf128; mod ldexpf128; mod roundf128; @@ -396,7 +398,7 @@ cfg_if! 
{ pub use self::fabsf128::fabsf128; pub use self::fdimf128::fdimf128; pub use self::floorf128::floorf128; - pub use self::fmaf128::fmaf128; + pub use self::fma::fmaf128; pub use self::fmin_fmax::{fmaxf128, fminf128}; pub use self::fminimum_fmaximum::{fmaximumf128, fminimumf128}; pub use self::fminimum_fmaximum_num::{fmaximum_numf128, fminimum_numf128}; diff --git a/src/math/support/float_traits.rs b/src/math/support/float_traits.rs index 534ca9a0..96c209c8 100644 --- a/src/math/support/float_traits.rs +++ b/src/math/support/float_traits.rs @@ -160,9 +160,11 @@ pub trait Float: fn abs(self) -> Self; /// Returns a number composed of the magnitude of self and the sign of sign. - #[allow(dead_code)] fn copysign(self, other: Self) -> Self; + /// Fused multiply add, rounding once. + fn fma(self, y: Self, z: Self) -> Self; + /// Returns (normalized exponent, normalized significand) #[allow(dead_code)] fn normalize(significand: Self::Int) -> (i32, Self::Int); @@ -184,7 +186,9 @@ macro_rules! float_impl { $sity:ident, $bits:expr, $significand_bits:expr, - $from_bits:path + $from_bits:path, + $fma_fn:ident, + $fma_intrinsic:ident ) => { impl Float for $ty { type Int = $ity; @@ -252,6 +256,16 @@ macro_rules! float_impl { } } } + fn fma(self, y: Self, z: Self) -> Self { + cfg_if! { + // fma is not yet available in `core` + if #[cfg(intrinsics_enabled)] { + unsafe{ core::intrinsics::$fma_intrinsic(self, y, z) } + } else { + super::super::$fma_fn(self, y, z) + } + } + } fn normalize(significand: Self::Int) -> (i32, Self::Int) { let shift = significand.leading_zeros().wrapping_sub(Self::EXP_BITS); (1i32.wrapping_sub(shift as i32), significand << shift as Self::Int) @@ -261,11 +275,11 @@ macro_rules! float_impl { } #[cfg(f16_enabled)] -float_impl!(f16, u16, i16, 16, 10, f16::from_bits); -float_impl!(f32, u32, i32, 32, 23, f32_from_bits); -float_impl!(f64, u64, i64, 64, 52, f64_from_bits); +float_impl!(f16, u16, i16, 16, 10, f16::from_bits, fmaf16, fmaf16); +float_impl!(f32, u32, i32, 32, 23, f32_from_bits, fmaf, fmaf32); +float_impl!(f64, u64, i64, 64, 52, f64_from_bits, fma, fmaf64); #[cfg(f128_enabled)] -float_impl!(f128, u128, i128, 128, 112, f128::from_bits); +float_impl!(f128, u128, i128, 128, 112, f128::from_bits, fmaf128, fmaf128); /* FIXME(msrv): vendor some things that are not const stable at our MSRV */
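Illustration (a minimal sketch, not part of the patch): the refactor keeps the public `fma`, `fmaf`, and `fmaf128` entry points intact while rerouting them internally (`f64`/`f128` through `fma_round`, `f32` through the widening `fma_wide_round`), and `cbrt_round` now reaches the same code through the new `Float::fma` trait method for its exact two-product steps. The snippet below assumes the crate is consumed as a dependency under its published name `libm` and only exercises the public API.

fn main() {
    // Public entry points behave as before; only the internal routing changed.
    assert_eq!(libm::fma(2.0_f64, 3.0, 1.0), 7.0);
    assert_eq!(libm::fmaf(2.0_f32, 3.0, 1.0), 7.0);

    // The two-product step used in `cbrt_round`: an FMA with the negated
    // rounded product recovers the rounding error of `y * y`, so the pair
    // `y2 + y2l` represents `y^2` exactly (in double-double form).
    let y: f64 = 1.0 / 3.0;
    let y2 = y * y;                 // rounded high part
    let y2l = libm::fma(y, y, -y2); // exact low part
    assert!(y2l != 0.0 && y2l.abs() < f64::EPSILON);
}

Routing `f32` through the widening path keeps its hot case to a single `f64` multiply-add plus a halfway-case correction, while `f64` and `f128`, which have no hardware-backed wider type to lean on, share the generic integer-based `fma_round` algorithm.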