Add fast single-precision add/sub/mul for Hazard3 (#1883)

* Add fast single-precision add/sub/mul for Hazard3 * Make test output less noisy. Map -nan to -inf in vector gen. Move random vectors to separate files. * Re-disable USB stdout for pico_float_test by default... * Disable pico/float.h exports on RISC-V as these functions aren't implemented * Add hazard3 instructions to asm_helper. Split hazard3.h to support this. You can still include hazard3.h to get everything. This just allows you to pull in less.
2026-01-28 01:47:21 +01:00 · 2024-08-30 17:36:30 +01:00 · 2024-08-30 17:36:30 +01:00 · d886df6eb0
commit d886df6eb0
parent 876f331033
12 changed files with 2896 additions and 164 deletions
--- a/src/rp2350/pico_platform/include/pico/asm_helper.S
+++ b/src/rp2350/pico_platform/include/pico/asm_helper.S
@ -6,12 +6,17 @@

 #include "pico.h"

+#ifdef __riscv
+// Get macros for convenient use of Hazard3 instructions without binutils support
+#include "hardware/hazard3/instructions.h"
+#endif
+
 #if !PICO_ASSEMBLER_IS_CLANG
 #define apsr_nzcv r15
 #endif
-# note we don't do this by default in this file for backwards comaptibility with user code
-# that may include this file, but not use unified syntax. Note that this macro does equivalent
-# setup to the pico_default_asm macro for inline assembly in C code.
+// note we don't do this by default in this file for backwards compatibility with user code
+// that may include this file, but not use unified syntax. Note that this macro does equivalent
+// setup to the pico_default_asm macro for inline assembly in C code.
 .macro pico_default_asm_setup
 #ifndef __riscv
 .syntax unified
@ -60,28 +65,7 @@ weak_func WRAPPER_FUNC_NAME(\x)
 .word \func + \offset
 .endm

-# backwards compatibility
+// backwards compatibility
 .macro __pre_init func, priority_string1
 __pre_init_with_offset func, 0, priority_string1
 .endm
-
-#ifdef __riscv
-// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
-.macro h3.bextm rd rs1 rs2 nbits
-.if (\nbits < 1) || (\nbits > 8)
-.err
-.endif
-    .insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
-.endm
-
-// rd = (rs1 >> shamt) & ~(-1 << nbits)
-.macro h3.bextmi rd rs1 shamt nbits
-.if (\nbits < 1) || (\nbits > 8)
-.err
-.endif
-.if (\shamt < 0) || (\shamt > 31)
-.err
-.endif
-    .insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
-.endm
-#endif
--- a/src/rp2_common/hardware_hazard3/include/hardware/hazard3.h
+++ b/src/rp2_common/hardware_hazard3/include/hardware/hazard3.h
@ -4,13 +4,18 @@
 * SPDX-License-Identifier: BSD-3-Clause
 */

-#ifndef _HARDWARE_HAZARD3_
-#define _HARDWARE_HAZARD3_
+#ifndef _HARDWARE_HAZARD3_H
+#define _HARDWARE_HAZARD3_H

 #include "pico.h"
 #include "hardware/riscv.h"
+
+// This includes both standard and Hazard3 custom CSRs:
 #include "hardware/regs/rvcsr.h"

+#include "hardware/hazard3/features.h"
+#include "hardware/hazard3/instructions.h"
+
 /** \file hardware/hazard3.h
 *  \defgroup hardware_hazard3 hardware_hazard3
 *
@ -18,87 +23,7 @@
 *
 */

-// Feature detection macros for Hazard3 custom extensions
-#if PICO_RP2350
-#define __hazard3_extension_xh3power
-#define __hazard3_extension_xh3bextm
-#define __hazard3_extension_xh3irq
-#define __hazard3_extension_xh3pmpm
-#endif
-
-#ifdef __ASSEMBLER__
-
-// Assembly language instruction macros for Hazard3 custom instructions
-
-// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
-// first bit indexed by rs2, and bit count configured by an immediate value.
-// R-format instruction. Pseudocode:
-//
-//     rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
-
-.macro h3.bextm rd rs1 rs2 nbits
-.if (\nbits < 1) || (\nbits > 8)
-.err
-.endif
-#ifdef __hazard3_extension_xh3bextm
-    .insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
-#else
-    srl  \rd, \rs1, \rs2
-    andi \rd, \rd, ((1 << \nbits) - 1)
-#endif
-.endm
-
-// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
-// first bit index and the number of bits both configured by immediate
-// values. I-format instruction. Pseudocode:
-//
-//     rd = (rs1 >> shamt) & ~(-1 << nbits)
-
-.macro h3.bextmi rd rs1 shamt nbits
-.if (\nbits < 1) || (\nbits > 8)
-.err
-.endif
-.if (\shamt < 0) || (\shamt > 31)
-.err
-.endif
-#ifdef __hazard3_extension_xh3bextm
-    .insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
-#else
-    srli \rd, \rs1, \shamt
-    andi \rd, \rd, ((1 << \nbits) - 1)
-#endif
-.endm
-
-// h3.block: enter an idle state until another processor in the same
-// multiprocessor complex executes an h3.unblock instruction, or the
-// processor is interrupted. Fall through immediately if an h3.unblock has
-// been received since the last execution of an h3.block on this processor.
-// On RP2350, processors also have their own h3.unblock signals reflected
-// back to them.
-
-.macro h3.block
-#ifdef __hazard3_extension_xh3power
-    slt x0, x0, x0
-#else
-    nop
-#endif
-.endm
-
-// h3.unblock: signal other processors in the same multiprocessor complex to
-// exit the idle state entered by an h3.block instruction. On RP2350, this
-// signal is also reflected back to the processor that executed the
-// h3.unblock, which will cause that processor's next h3.block to fall
-// through immediately.
-
-.macro h3.unblock
-#ifdef __hazard3_extension_xh3power
-    slt x0, x0, x1
-#else
-    nop
-#endif
-.endm
-
-#else // !__ASSEMBLER__
+#ifndef __ASSEMBLER__

 #ifdef __cplusplus
 extern "C" {
@ -128,51 +53,6 @@ extern "C" {
 #define hazard3_irqarray_clear(csr, index, data) static_assert(false, "Not supported: Xh3irq extension")
 #endif

-
-// nbits must be a constant expression
-#ifdef __hazard3_extension_xh3bextm
-#define __hazard3_bextm(nbits, rs1, rs2) ({\
-    uint32_t __h3_bextm_rd; \
-    asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
-        : "=r" (__h3_bextm_rd) \
-        : "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
-    ); \
-    __h3_bextm_rd; \
-})
-#else
-#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
 #endif

-// nbits and shamt must be constant expressions
-#ifdef __hazard3_extension_xh3bextm
-#define __hazard3_bextmi(nbits, rs1, shamt) ({\
-    uint32_t __h3_bextmi_rd; \
-    asm (".insn i 0x0b, 0x4, %0, %1, %2"\
-        : "=r" (__h3_bextmi_rd) \
-        : "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
-    ); \
-    __h3_bextmi_rd; \
-})
-#else
-#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
-#endif
-
-#ifdef __hazard3_extension_xh3power
-#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
-#else
-#define __hazard3_block() do {} while (0)
-#endif
-
-#ifdef __hazard3_extension_xh3power
-#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
-#else
-#define __hazard3_unblock() do {} while (0)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // !__ASSEMBLER__
-
 #endif
--- a/src/rp2_common/hardware_hazard3/include/hardware/hazard3/features.h
+++ b/src/rp2_common/hardware_hazard3/include/hardware/hazard3/features.h
@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef _HARDWARE_HAZARD3_FEATURES_H
+#define _HARDWARE_HAZARD3_FEATURES_H
+
+#include "pico.h"
+
+/** \file hardware/hazard3/features.h
+ *  \addtogroup hardware_hazard3
+ *
+ * \brief Sets macros for supported Hazard3 custom extensions (features) based on PICO_PLATFORM macros
+ *
+ */
+
+// Feature detection macros for Hazard3 custom extensions
+#if PICO_RP2350
+// Version 1.0 of these four extensions
+// (encoded as major * 100 + minor)
+#define __hazard3_extension_xh3power 100
+#define __hazard3_extension_xh3bextm 100
+#define __hazard3_extension_xh3irq 100
+#define __hazard3_extension_xh3pmpm 100
+#endif
+
+#endif
--- a/src/rp2_common/hardware_hazard3/include/hardware/hazard3/instructions.h
+++ b/src/rp2_common/hardware_hazard3/include/hardware/hazard3/instructions.h
@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef _HARDWARE_HAZARD3_INSTRUCTIONS_H
+#define _HARDWARE_HAZARD3_INSTRUCTIONS_H
+
+#include "pico.h"
+
+// Get list of supported extensions based on platform:
+#include "hardware/hazard3/features.h"
+
+/** \file hardware/hazard3/instructions.h
+ *  \addtogroup hardware_hazard3
+ *
+ * \brief Intrinsics and asm macros for Hazard3 custom instructions
+ *
+ * The implementation of these intrinsics depends on the feature macros
+ * defined in hardware/hazard3/features.h. When the relevant feature is not
+ * present, the intrinsics fall back on an RV32I equivalent if possible.
+ *
+ */
+
+#ifdef __ASSEMBLER__
+
+// Assembly language instruction macros for Hazard3 custom instructions
+
+// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
+// first bit indexed by rs2, and bit count configured by an immediate value.
+// R-format instruction. Pseudocode:
+//
+//     rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
+//
+// Encoding: custom-0 opcode (0x0b), funct3 = 0x0, funct7 = (nbits - 1) << 1.
+// funct3 = 0x4 is the immediate form (h3.bextmi): the R and I layouts place
+// the size and rs2/shamt fields in the same bit positions, so using 0x4 here
+// would assemble to h3.bextmi with the *register number* of rs2 as the shift
+// amount. funct3 = 0x0 matches the C intrinsic __hazard3_bextm in this file.
+
+.macro h3.bextm rd rs1 rs2 nbits
+.if (\nbits < 1) || (\nbits > 8)
+.err
+.endif
+#ifdef __hazard3_extension_xh3bextm
+    .insn r 0x0b, 0x0, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
+#else
+    srl  \rd, \rs1, \rs2
+    andi \rd, \rd, ((1 << \nbits) - 1)
+#endif
+.endm
+
+// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
+// first bit index and the number of bits both configured by immediate
+// values. I-format instruction. Pseudocode:
+//
+//     rd = (rs1 >> shamt) & ~(-1 << nbits)
+
+.macro h3.bextmi rd rs1 shamt nbits
+.if (\nbits < 1) || (\nbits > 8)
+.err
+.endif
+.if (\shamt < 0) || (\shamt > 31)
+.err
+.endif
+#ifdef __hazard3_extension_xh3bextm
+    .insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
+#else
+    srli \rd, \rs1, \shamt
+    andi \rd, \rd, ((1 << \nbits) - 1)
+#endif
+.endm
+
+// h3.block: enter an idle state until another processor in the same
+// multiprocessor complex executes an h3.unblock instruction, or the
+// processor is interrupted. Fall through immediately if an h3.unblock has
+// been received since the last execution of an h3.block on this processor.
+// On RP2350, processors also have their own h3.unblock signals reflected
+// back to them.
+
+.macro h3.block
+#ifdef __hazard3_extension_xh3power
+    slt x0, x0, x0
+#else
+    nop
+#endif
+.endm
+
+// h3.unblock: signal other processors in the same multiprocessor complex to
+// exit the idle state entered by an h3.block instruction. On RP2350, this
+// signal is also reflected back to the processor that executed the
+// h3.unblock, which will cause that processor's next h3.block to fall
+// through immediately.
+
+.macro h3.unblock
+#ifdef __hazard3_extension_xh3power
+    slt x0, x0, x1
+#else
+    nop
+#endif
+.endm
+
+#else // !__ASSEMBLER__
+
+// C language instruction macros for Hazard3 custom instructions
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// nbits must be a constant expression
+#ifdef __hazard3_extension_xh3bextm
+#define __hazard3_bextm(nbits, rs1, rs2) ({\
+    uint32_t __h3_bextm_rd; \
+    asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
+        : "=r" (__h3_bextm_rd) \
+        : "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
+    ); \
+    __h3_bextm_rd; \
+})
+#else
+#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
+#endif
+
+// __hazard3_bextmi(nbits, rs1, shamt): extract nbits (1 to 8) consecutive bits
+// of rs1, starting at bit position shamt (0 to 31), and return them
+// right-justified as a uint32_t.
+// nbits and shamt must be constant expressions
+#ifdef __hazard3_extension_xh3bextm
+#define __hazard3_bextmi(nbits, rs1, shamt) ({\
+    uint32_t __h3_bextmi_rd; \
+    asm (".insn i 0x0b, 0x4, %0, %1, %2"\
+        : "=r" (__h3_bextmi_rd) \
+        : "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
+    ); \
+    __h3_bextmi_rd; \
+})
+#else
+// Plain C fallback when the Xh3bextm extension is not available: shift, then
+// mask with a run of nbits ones
+#define __hazard3_bextmi(nbits, rs1, shamt) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
+#endif
+
+#ifdef __hazard3_extension_xh3power
+#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
+#else
+#define __hazard3_block() do {} while (0)
+#endif
+
+#ifdef __hazard3_extension_xh3power
+#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
+#else
+#define __hazard3_unblock() do {} while (0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !__ASSEMBLER__
+
+#endif
--- a/src/rp2_common/pico_float/CMakeLists.txt
+++ b/src/rp2_common/pico_float/CMakeLists.txt
@ -10,11 +10,7 @@

    # add alias "default" which is just pico.
    add_library(pico_float_default INTERFACE)
-    if (PICO_RISCV)
-        target_link_libraries(pico_float_default INTERFACE pico_float_compiler)
-    else()
-        target_link_libraries(pico_float_default INTERFACE pico_float_pico)
-    endif()
+    target_link_libraries(pico_float_default INTERFACE pico_float_pico)

    set(PICO_DEFAULT_FLOAT_IMPL pico_float_default)

@ -128,6 +124,10 @@
        wrap_float_functions(pico_float_pico_vfp NO_WRAP_AEABI)
        target_link_libraries(pico_float_pico INTERFACE
                pico_float_pico_vfp)
+    else()
+        target_sources(pico_float_pico INTERFACE
+            ${CMAKE_CURRENT_LIST_DIR}/float_single_hazard3.S
+            )
    endif()


--- a/src/rp2_common/pico_float/float_single_hazard3.S
+++ b/src/rp2_common/pico_float/float_single_hazard3.S
@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include "pico/asm_helper.S"
+#include "hardware/hazard3.h"
+
+// This file reimplements some common single-precision soft float routines
+// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
+// and is tuned for Hazard3 execution timings.
+
+// Subnormal values are always flushed to zero on both input and output.
+// Rounding is always to nearest (even on tie).
+
+pico_default_asm_setup
+
+// float_section: switch to the section that the following routine is placed
+// in. `name` is wrapped with RAM_SECTION_NAME() when PICO_FLOAT_IN_RAM is set
+// and with SECTION_NAME() otherwise; in both cases the section gets the "ax"
+// flags.
+.macro float_section name
+#if PICO_FLOAT_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+// __subsf3 / __addsf3: single-precision subtract and add.
+// The operands are read as raw bit patterns from a0 (x) and a1 (y) and the
+// result is left in a0. __subsf3 inverts the sign bit of y and then falls
+// straight through into __addsf3, i.e. x - y is computed as x + -y.
+// Register roles set up here for the code that follows:
+//   a2, a3 = exponent fields of x, y
+//   a4, a5 = significands of x, y
+//   t0     = 255 (all-ones exponent)
+//   t1     = 25 (exponent difference threshold tested by addsf_core)
+float_section __addsf3
+.global __subsf3
+.p2align 2
+__subsf3:
+    binvi a1, a1, 31
+.global __addsf3
+__addsf3:
+    // Unpack exponent:
+    h3.bextmi a2, a0, 23, 8
+    h3.bextmi a3, a1, 23, 8
+    // Flush-to-zero => 0 + y = y applies, including nan, with the sole
+    // exception of y being subnormal (which also needs to be flushed)
+    beqz a2, __addsf_return_y_flushed
+    // x + 0 = x. Unlike the 0 + y case above, x needs no flushing here
+    // because we already know its exponent field is nonzero
+    beqz a3, __addsf_return_x
+    // Unpack significand, plus 3 extra zeroes for working space:
+    slli a4, a0, 9
+    slli a5, a1, 9
+    // check nan/inf on input (a4/a5 hold just the fraction bits at this
+    // point, which is what the nan/inf handlers test)
+    li t0, 255
+    beq a2, t0, __addsf_x_nan_inf
+    beq a3, t0, __addsf_y_nan_inf
+    // (finish unpacking significand)
+    srli a4, a4, 6
+    srli a5, a5, 6
+
+    // If we're still on the straight path then we are adding two normal
+    // values. Add implicit one (1.xx...xx000)
+    bseti a4, a4, 23 + 3
+    bseti a5, a5, 23 + 3
+    // Negate if sign bit is set
+    bgez a0, 1f
+    neg a4, a4
+1:
+    // (tuck this 16-bit here to avoid alignment penalty)
+    li t1, 25
+    bgez a1, 1f
+    neg a5, a5
+1:
+
+    // Pick the register assignment whose lhs has the larger exponent
+    bltu a2, a3, __addsf_ye_gt_xe
+
+    // The main body is repeated twice with different register assignments.
+    // lhs is the more-significant addend:
+// addsf_core: add two normal operands once the larger exponent is known.
+//   packed_lhs, packed_rhs: registers holding the packed inputs; both are
+//                           reused as scratch by this macro
+//   sig_lhs, sig_rhs:       two's complement significands (implicit one at
+//                           bit 26, three zero working bits at the bottom)
+//   exp_lhs, exp_rhs:       exponent fields, with exp_lhs >= exp_rhs
+//   rhs_is_x:               1 if rhs is the x operand (a0), 0 if it is y (a1)
+// Expects t0 = 255 and t1 = 25. Every exit path returns with the packed
+// result in a0.
+.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
+    sub \packed_rhs, \exp_lhs, \exp_rhs
+    // If there is a large exponent difference then there is no effect on lhs
+    // (so return lhs: that is y when rhs is x, and x otherwise)
+.if \rhs_is_x
+    bgeu \packed_rhs, t1, __addsf_return_y
+.else
+    bgeu \packed_rhs, t1, __addsf_return_x
+.endif
+    // Shift rhs down to correct relative significance
+    sra \packed_lhs, \sig_rhs, \packed_rhs
+    // Set sticky bit if ones were shifted out
+    sll \packed_rhs, \packed_lhs, \packed_rhs
+    sltu \packed_rhs, \packed_rhs, \sig_rhs
+    or \packed_lhs, \packed_lhs, \packed_rhs
+    // Add significands
+    add \sig_lhs, \sig_lhs, \packed_lhs
+    // Detect exact cancellation (may be beyond max normalisation shift; also
+    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
+    beqz \sig_lhs, __addsf_return_0
+    // Convert two's complement back to sign + magnitude
+    // (exp_rhs is dead as an exponent from here on: it becomes the sign mask)
+    srai \exp_rhs, \sig_lhs, 31
+    xor \sig_lhs, \sig_lhs, \exp_rhs
+    sub \sig_lhs, \sig_lhs, \exp_rhs
+    // Renormalise significand: bit 31 is now implicit one
+    clz \packed_lhs, \sig_lhs
+    sll \sig_lhs, \sig_lhs, \packed_lhs
+    // Adjust exponent
+    addi \packed_lhs, \packed_lhs, -5
+    sub \exp_lhs, \exp_lhs, \packed_lhs
+
+    // Round to nearest, even on tie (bias upward if above odd number)
+    bexti \packed_lhs, \sig_lhs, 8
+    addi \sig_lhs, \sig_lhs, 127
+    add \sig_lhs, \sig_lhs, \packed_lhs
+    // Exponent may increase by one due to rounding up from all-ones; this is
+    // detected by clearing of implicit one (there is a carry-out too)
+    bgez \sig_lhs, 3f
+4:
+    // Detect underflow/overflow. An exponent of exactly zero must also take
+    // the underflow path: packing it would produce a subnormal encoding
+    // (with the wrong value), and subnormal results are always flushed.
+    blez \exp_lhs, 1f
+    bge \exp_lhs, t0, 2f
+
+    // Pack and return
+    packh \exp_lhs, \exp_lhs, \exp_rhs
+    slli \exp_lhs, \exp_lhs, 23
+    slli \sig_lhs, \sig_lhs, 1
+    srli \sig_lhs, \sig_lhs, 9
+    add a0, \sig_lhs, \exp_lhs
+    ret
+1:
+    // Signed zero on underflow
+    slli a0, \exp_rhs, 31
+    ret
+2:
+    // Signed infinity on overflow
+    packh a0, t0, \exp_rhs
+    slli a0, a0, 23
+    ret
+3:
+    // Exponent increase due to rounding (uncommon)
+    srli \sig_lhs, \sig_lhs, 1
+    addi \exp_lhs, \exp_lhs, 1
+    j 4b
+.endm
+
+// Two expansions of the main body: the first with x as lhs (x exponent >= y
+// exponent), the second with y as lhs.
+__addsf_xe_gte_ye:
+    addsf_core a0, a1, a4, a5, a2, a3, 0
+.p2align 2
+__addsf_ye_gt_xe:
+    addsf_core a1, a0, a5, a4, a3, a2, 1
+
+// Special-case handlers. On entry to the nan/inf handlers a4 and a5 still
+// hold x << 9 and y << 9 (the fraction bits only), a2/a3 hold the exponent
+// fields and t0 holds 255.
+__addsf_x_nan_inf:
+    // When at least one operand is nan, we must propagate at least one of
+    // those nan payloads (sign of nan result is unspecified, which we take
+    // advantage of by implementing x - y as x + -y). Check x nan vs inf:
+    bnez a4, __addsf_return_x
+__addsf_x_inf:
+    // If x is +-inf, need to distinguish the following cases:
+    bne  a3, t0, __addsf_return_x // y is neither inf nor nan   -> return x (propagate inf)
+    bnez a5,     __addsf_return_y // y is nan:                  -> return y (propagate nan)
+    xor a5, a0, a1
+    srli a5, a5, 31
+    beqz a5,     __addsf_return_x // y is inf of same sign      -> return either x or y (x is faster)
+    li a0, -1                     // y is inf of different sign -> return nan
+    ret
+
+__addsf_y_nan_inf:
+    // Mirror of __addsf_x_nan_inf
+    bnez a5, __addsf_return_y
+__addsf_y_inf:
+    // NOTE(review): the only visible way to get here is by fall-through from
+    // __addsf_y_nan_inf, which is entered only after x's exponent compared
+    // unequal to 255. If so the bne below is always taken and the
+    // instructions after it are unreachable -- confirm.
+    bne  a2, t0, __addsf_return_y
+    bnez a4,     __addsf_return_x
+    xor a4, a0, a1
+    srli a4, a4, 31
+    beqz a4,     __addsf_return_x
+    li a0, -1
+    ret
+
+__addsf_return_y_flushed:
+    // x has a zero exponent field (zero or subnormal), so the result is y.
+    // If y also has a zero exponent field, clear its fraction bits so that a
+    // subnormal y is returned as a zero carrying y's sign bit.
+    // NOTE(review): this means (+0) + (-0) and (+0) - (+0) return -0 -- check
+    // that this is the intended signed-zero behaviour.
+    bnez a3, 1f
+    srli a1, a1, 23
+    slli a1, a1, 23
+1:
+__addsf_return_y:
+    mv a0, a1
+__addsf_return_x:
+    ret
+__addsf_return_0:
+    li a0, 0
+    ret
+
+
+// __mulsf3: single-precision multiply.
+// The operands are read as raw bit patterns from a0 (x) and a1 (y) and the
+// product is left in a0. Inputs with a zero or all-ones exponent field branch
+// out to the __mulsf_* special-case handlers below; everything else stays on
+// the straight-line path. Registers written on that path: a0-a2, a4-a6, t0.
+float_section __mulsf3
+.global __mulsf3
+.p2align 2
+__mulsf3:
+    // Force y to be positive (by possibly negating x) *before* unpacking.
+    // This allows many special cases to be handled without repacking.
+    // (a1 itself is not modified; only x's sign bit in a0 is inverted.)
+    bgez a1, 1f
+    binvi a0, a0, 31
+1:
+    // Unpack exponent:
+    h3.bextmi a2, a0, 23, 8
+    h3.bextmi a3, a1, 23, 8
+    // Check special cases
+    li t0, 255
+    beqz a2, __mulsf_x_0
+    beqz a3, __mulsf_y_0
+    beq a2, t0, __mulsf_x_nan_inf
+    beq a3, t0, __mulsf_y_nan_inf
+
+    // Finish unpacking sign
+    // (a6 = 0 or -1: sign mask of the result, since x now carries both signs)
+    srai a6, a0, 31
+    // Unpack significand (with implicit one in MSB)
+    slli a4, a0, 8
+    slli a5, a1, 8
+    bseti a4, a4, 31
+    bseti a5, a5, 31
+    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
+    // Going from Q1.23 to Q2.46 (both left-justified)
+    mul a1, a4, a5
+    mulhu a4, a4, a5
+    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
+    // final significand (ignoring rounding)
+    clz a0, a4
+    sll a4, a4, a0
+    sub a2, a2, a0
+    // After normalising we can calculate the final exponent, since rounding
+    // cannot increase the exponent for multiplication (unlike addition)
+    add a2, a2, a3
+    // Subtract redundant bias term (127), add 1 for normalisation correction
+    addi a2, a2, -126
+    // Result exponent <= 0 is flushed to zero; >= 255 becomes infinity
+    blez a2, __mulsf_underflow
+    bge a2, t0, __mulsf_overflow
+
+    // Gather sticky bits from low fraction:
+    snez a1, a1
+    or a4, a4, a1
+    // Round to nearest, even on tie (aka bias upward if odd)
+    bexti a1, a4, 8
+    add a4, a4, a1
+    addi a4, a4, 127
+    // Pack it and ship it
+    packh a2, a2, a6
+    slli a2, a2, 23
+    slli a4, a4, 1
+    srli a4, a4, 9
+    add a0, a4, a2
+    ret
+
+__mulsf_underflow:
+    // Signed zero
+    slli a0, a6, 31
+    ret
+__mulsf_overflow:
+    // Signed inf
+    packh a0, t0, a6
+    slli a0, a0, 23
+    ret
+
+__mulsf_x_0:
+    // 0 times nan    -> propagate nan
+    // 0 times inf    -> generate nan
+    // 0 times others -> 0 (need to flush significand too as we are FTZ)
+    bne a3, t0, __mulsf_return_flushed_x
+    slli a5, a1, 9
+    beqz a5, 1f
+    // Propagate nan from y
+__mulsf_return_y:
+    mv a0, a1
+    ret
+1:
+    // Generate new nan
+    li a0, -1
+    ret
+
+__mulsf_y_0:
+    // Mirror image of x_0 except we still return x for signed 0, since the
+    // signs were already resolved.
+    bne a2, t0, __mulsf_return_flushed_x
+    slli a1, a0, 9
+    bnez a1, 1f
+    li a0, -1
+1:
+    ret
+
+__mulsf_return_flushed_x:
+    // If we don't support subnormals we at least need to flush to a canonical
+    // zero. This is just a sign bit in bit 31.
+    srli a0, a0, 31
+    slli a0, a0, 31
+__mulsf_return_x:
+    ret
+
+__mulsf_x_nan_inf:
+    // We know that y is not zero and is positive. So...
+    //      x is nan    -> return x
+    // else y is nan    -> return y
+    // else y is inf    -> return x
+    // else y is normal -> return x
+    // (the order of the first two clauses is actually our free choice)
+    slli a4, a0, 9
+    bnez a4, __mulsf_return_x
+    bne a3, t0, __mulsf_return_x
+    slli a5, a1, 9
+    bnez a5, __mulsf_return_y
+    ret // return x
+
+__mulsf_y_nan_inf:
+    // We know that x is not zero, nan, nor inf. That just leaves normals.
+    // y is nan -> return y
+    // y is inf -> return inf * sgn(x) (since we already merged the signs)
+    slli a5, a1, 9
+    bnez a5, __mulsf_return_y
+    srai a0, a0, 31
+    packh a0, t0, a0
+    slli a0, a0, 23
+    ret
+
+
+// This is a hack to improve soft float performance for the routines we don't
+// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect.
+// The libgcc count-leading-zeros helper that such code calls is named
+// __clzsi2, so that is the symbol which has to be defined here for the
+// override to take effect. The earlier spelling __clz2si is kept as an alias
+// so that anything referencing it still links.
+float_section __clzsi2
+.global __clzsi2
+.global __clz2si
+__clzsi2:
+__clz2si:
+    clz a0, a0
+    ret
--- a/src/rp2_common/pico_float/include/pico/float.h
+++ b/src/rp2_common/pico_float/include/pico/float.h
@ -21,8 +21,8 @@ extern "C" {
 *
 * \brief Optimized single-precision floating point functions
 *
-* (Replacement) optimized implementations are provided of the following compiler built-ins
-* and math library functions:
+* (Replacement) optimized implementations are provided for the following compiler built-ins
+* and math library functions on Arm:
 *
 * - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
 * - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
@ -34,11 +34,18 @@ extern "C" {
 * - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
 * - exp10f, sincosf, powintf
 *
-* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly
+* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
 *
 * - float2fix64_z, fdiv_fast, fsqrt_fast,
+*
+* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
+*
+* - __addsf3, __subsf3, __mulsf3
 */

+// None of these functions are available on RISC-V:
+#if !defined(__riscv) || PICO_COMBINED_DOCS
+
 float int2float(int32_t f);
 float uint2float(uint32_t f);
 float int642float(int64_t f);
@ -74,6 +81,8 @@ float fdiv_fast(float n, float d);
 float fsqrt_fast(float f);
 #endif

+#endif
+
 #ifdef __cplusplus
 }
 #endif
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@ -1,7 +1,22 @@
 PROJECT(pico_float_test)

-# todo revist this test for
-if (NOT PICO_RISCV)
+
+if (PICO_RISCV)
+
+    # Separate, simpler test: currently we only have a few single-precision
+    # routines for RISC-V soft float (and the other tests are a bit
+    # AEABI-dependent)
+    add_executable(pico_float_test
+        pico_float_test_hazard3.c
+        )
+    target_link_libraries(pico_float_test PRIVATE pico_float pico_stdlib)
+    target_include_directories(pico_float_test PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+    pico_add_extra_outputs(pico_float_test)
+
+    # pico_enable_stdio_usb(pico_float_test 1)
+    # pico_enable_stdio_uart(pico_float_test 0)
+
+else ()
    add_executable(pico_float_test
            pico_float_test.c
            llvm/call_apsr.S
@ -64,4 +79,4 @@ if (NOT PICO_RISCV)
        target_link_libraries(m33 pico_double pico_stdlib)
        pico_add_extra_outputs(m33)
    endif()
-endif()
+endif()
--- a/test/pico_float_test/hazard3_test_gen.c
+++ b/test/pico_float_test/hazard3_test_gen.c
@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <fenv.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// xoshiro256++ pseudorandom number generator.
+// Adapted from: https://prng.di.unimi.it/xoshiro256plusplus.c
+// Original copyright notice:
+
+/*  Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)
+
+To the extent possible under law, the author has dedicated all copyright
+and related and neighboring rights to this software to the public domain
+worldwide. This software is distributed without any warranty.
+
+See <http://creativecommons.org/publicdomain/zero/1.0/>. */
+
+/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
+   It has excellent (sub-ns) speed, a state (256 bits) that is large
+   enough for any parallel application, and it passes all tests we are
+   aware of.
+
+   For generating just floating-point numbers, xoshiro256+ is even faster.
+
+   The state must be seeded so that it is not everywhere zero. If you have
+   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
+   output to fill s. */
+
+// Rotate the 64-bit value x left by k bits. Both shift counts are reduced
+// modulo 64, so k == 0 (or any multiple of 64) returns x unchanged instead of
+// shifting by the full width of the type, which would be undefined behaviour.
+static inline uint64_t xr256_rotl(const uint64_t x, int k) {
+	const unsigned int left = (unsigned int)k & 63u;
+	const unsigned int right = (64u - left) & 63u;
+	return (x << left) | (x >> right);
+}
+
+// Step the generator: returns the next 64-bit output, computed from the
+// state as it is on entry, and then updates the four state words in place.
+uint64_t xr256_next(uint64_t s[4]) {
+	const uint64_t output = xr256_rotl(s[0] + s[3], 23) + s[0];
+	const uint64_t s1_shifted = s[1] << 17;
+
+	// State mixing; each line depends on the ones before it
+	s[2] ^= s[0];
+	s[3] ^= s[1];
+	s[1] ^= s[2];
+	s[0] ^= s[3];
+	s[2] ^= s1_shifted;
+	s[3] = xr256_rotl(s[3], 45);
+
+	return output;
+}
+// Return the bit pattern of a float as a 32-bit unsigned integer.
+uint32_t bitcast_f2u(float x) {
+	// Store through the float member, read back through the integer member
+	typedef union {
+		float f;
+		uint32_t u;
+	} f32_bits_t;
+	return ((f32_bits_t){ .f = x }).u;
+}
+
+// Return the float whose bit pattern is the given 32-bit unsigned integer.
+float bitcast_u2f(uint32_t x) {
+	// Store through the integer member, read back through the float member
+	typedef union {
+		float f;
+		uint32_t u;
+	} f32_bits_t;
+	return ((f32_bits_t){ .u = x }).f;
+}
+
+// True if the bit pattern x has an all-ones exponent field (bits 30:23) and a
+// nonzero fraction field (bits 22:0), i.e. it encodes a nan.
+bool is_nan_u(uint32_t x) {
+	const uint32_t exponent = (x >> 23) & 0xffu;
+	const uint32_t fraction = x & 0x007fffffu;
+	if (exponent != 0xffu) {
+		return false;
+	}
+	return fraction != 0;
+}
+
+// If the exponent field of x is zero (x is a zero or a subnormal), clear the
+// fraction field so that only the sign bit remains. Any other bit pattern is
+// returned unchanged.
+uint32_t flush_to_zero_u(uint32_t x) {
+	const uint32_t exponent_mask = 0xffu << 23;
+	const uint32_t sign_and_exponent_mask = -1u << 23;
+	return (x & exponent_mask) ? x : (x & sign_and_exponent_mask);
+}
+
+// Reference model for x + y on raw bit patterns: inputs are flushed to zero,
+// the sum is computed with the host's own float addition, any nan result is
+// replaced by the all-ones pattern, and the result is flushed to zero.
+uint32_t model_fadd(uint32_t x, uint32_t y) {
+	const float lhs = bitcast_u2f(flush_to_zero_u(x));
+	const float rhs = bitcast_u2f(flush_to_zero_u(y));
+	const uint32_t sum = bitcast_f2u(lhs + rhs);
+	// The all-ones nan has a nonzero exponent, so it needs no flushing
+	return is_nan_u(sum) ? -1u : flush_to_zero_u(sum);
+}
+
+// Reference model for x * y on raw bit patterns: inputs are flushed to zero,
+// the product is computed with the host's own float multiplication, any nan
+// result is replaced by the all-ones pattern, and the result is flushed to
+// zero.
+uint32_t model_fmul(uint32_t x, uint32_t y) {
+	const float lhs = bitcast_u2f(flush_to_zero_u(x));
+	const float rhs = bitcast_u2f(flush_to_zero_u(y));
+	const uint32_t product = bitcast_f2u(lhs * rhs);
+	// The all-ones nan has a nonzero exponent, so it needs no flushing
+	return is_nan_u(product) ? -1u : flush_to_zero_u(product);
+}
+
+// Print 1000 pseudorandom test vectors on stdout, one per line, formatted as
+// C initialisers {x, y, expected}. The generator is seeded with a fixed
+// state, so the output is the same on every run. Input nans are replaced by
+// infinities of the same sign before the expected value is computed.
+int main(void) {
+	// SHA-256 of a rude word
+	uint64_t rand_state[4] = {
+		0x5891b5b522d5df08u,
+		0x6d0ff0b110fbd9d2u,
+		0x1bb4fc7163af34d0u,
+		0x8286a2e846f6be03u
+	};
+	for (int i = 0; i < 1000; ++i) {
+		uint32_t x, y;
+		x = (uint32_t)(xr256_next(rand_state) & 0xffffffffu);
+		y = (uint32_t)(xr256_next(rand_state) & 0xffffffffu);
+		// Map nan to +-inf (input nans should already be well-covered)
+		if (is_nan_u(x)) {
+			x &= -1u << 23;
+		}
+		if (is_nan_u(y)) {
+			y &= -1u << 23;
+		}
+		// Change the 1 to 0 to generate multiplication vectors instead of
+		// addition vectors. PRIx32 is used because the arguments are uint32_t.
+#if 1
+		printf("{0x%08" PRIx32 "u, 0x%08" PRIx32 "u, 0x%08" PRIx32 "u},\n", x, y, model_fadd(x, y));
+#else
+		printf("{0x%08" PRIx32 "u, 0x%08" PRIx32 "u, 0x%08" PRIx32 "u},\n", x, y, model_fmul(x, y));
+#endif
+	}
+	return 0;
+}
--- a/test/pico_float_test/pico_float_test_hazard3.c
+++ b/test/pico_float_test/pico_float_test_hazard3.c
@ -0,0 +1,209 @@
+/**
+ * Copyright (c) 2024 Raspberry Pi Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdio.h>
+#include "pico/stdlib.h"
+
+// This test covers the single-precision functions in:
+//
+//    src/pico_float/float_hazard3_single.S
+//
+// It assumes the canonical generated-NaN value and NaN sign rules used by
+// those functions (which are unspecified by IEEE 754). It does not cover
+// libgcc/libm functions from outside of that source file.
+
+// One test vector. All three fields hold raw 32-bit patterns; the tables in
+// this file fill them with single-precision bit encodings.
+typedef struct {
+    // First operand passed to the function under test
+    uint32_t x;
+    // Second operand passed to the function under test
+    uint32_t y;
+    // Result the function's return value is compared against
+    uint32_t expect;
+} test_t;
+
+// Hand-written addition cases, run against __addsf3 by main().
+// Each entry is {x, y, expected x + y} as raw single-precision bit patterns.
+test_t add_directed_tests[] = {
+    // 1 + 1 = 2
+    {0x3f800000u, 0x3f800000u, 0x40000000u},
+    // 2 + 1 = 3
+    {0x40000000u, 0x3f800000u, 0x40400000u},
+    // 1 + 2 = 3
+    {0x3f800000u, 0x40000000u, 0x40400000u},
+    // 1 + -1 = +0 (exact cancellation)
+    {0x3f800000u, 0xbf800000u, 0x00000000u},
+    // -1 + 1 = +0 (exact cancellation)
+    {0xbf800000u, 0x3f800000u, 0x00000000u},
+    // 1 + <<1 ulp = 1
+    {0x3f800000u, 0x2f800000u, 0x3f800000u},
+    // <<1 ulp + 1 = 1
+    {0x2f800000u, 0x3f800000u, 0x3f800000u},
+    // -1 + 1.25 = 0.25
+    {0xbf800000u, 0x3fa00000u, 0x3e800000u},
+    // max normal + 0.5 ulp = +inf
+    {0x7f7fffffu, 0x73000000u, 0x7f800000u},
+    // max normal + max normal = +inf
+    {0x7f7fffffu, 0x7f7fffffu, 0x7f800000u},
+    // -max normal - 0.5 ulp = -inf
+    {0xff7fffffu, 0xf3000000u, 0xff800000u},
+    // -max normal + -max normal = -inf
+    {0xff7fffffu, 0xff7fffffu, 0xff800000u},
+    // max normal + 0.499... ulp = max normal
+    {0x7f7fffffu, 0x72ffffffu, 0x7f7fffffu},
+    // -max normal - 0.499... ulp = -max normal
+    {0xff7fffffu, 0xf2ffffffu, 0xff7fffffu},
+    // nan + 0 = same nan
+    {0xffff1234u, 0x00000000u, 0xffff1234u},
+    // 0 + nan = same nan
+    {0x00000000u, 0xffff1234u, 0xffff1234u},
+    // nan + 1 = same nan
+    {0xffff1234u, 0x3f800000u, 0xffff1234u},
+    // 1 + nan = same nan
+    {0x3f800000u, 0xffff1234u, 0xffff1234u},
+    // nan + inf = same nan
+    {0xffff1234u, 0x7f800000u, 0xffff1234u},
+    // inf + nan = same nan
+    {0x7f800000u, 0xffff1234u, 0xffff1234u},
+    // inf + inf = inf
+    {0x7f800000u, 0x7f800000u, 0x7f800000u},
+    // -inf + -inf = -inf
+    {0xff800000u, 0xff800000u, 0xff800000u},
+    // inf + -inf = nan (all-ones is our canonical cheap nan)
+    {0x7f800000u, 0xff800000u, 0xffffffffu},
+    // -inf + inf = nan
+    {0xff800000u, 0x7f800000u, 0xffffffffu},
+    // subnormal + subnormal = exactly 0
+    {0x007fffffu, 0x007fffffu, 0x00000000u},
+    // -subnormal + -subnormal = exactly -0
+    {0x807fffffu, 0x807fffffu, 0x80000000u},
+    // Even + 0.5 ulp: round down
+    {0x3f800002u, 0x33800000u, 0x3f800002u},
+    // Even - 0.5 ulp: round up
+    {0x3f800002u, 0xb3800000u, 0x3f800002u},
+    // Odd + 0.5 ulp: round up
+    {0x3f800001u, 0x33800000u, 0x3f800002u},
+    // Odd - 0.5 ulp: round down
+    {0x3f800001u, 0xb3800000u, 0x3f800000u},
+    // All-zeroes significand - 0.5 ulp: no rounding (exact)
+    {0x3f800000u, 0xb3800000u, 0x3f7fffffu},
+    // Very subnormal difference of normals: flushed to zero
+    {0x03800000u, 0x837fffffu, 0x00000000u},
+    // Barely subnormal difference of normals: also flushed (unflushed result is 2^(emin-1))
+    {0x03800000u, 0x837e0000u, 0x00000000u},
+};
+
+// Hand-written multiplication cases, run against __mulsf3 by main().
+// Each entry is {x, y, expected x * y} as raw single-precision bit patterns.
+test_t mul_directed_tests[] = {
+    // -- directed tests --
+    // 1 * 1 = 1
+    {0x3f800000u, 0x3f800000u, 0x3f800000u},
+    // 1 * -1 = -1
+    {0x3f800000u, 0xbf800000u, 0xbf800000u},
+    // -1 * 1 = -1
+    {0xbf800000u, 0x3f800000u, 0xbf800000u},
+    // -1 * -1 = 1
+    {0xbf800000u, 0xbf800000u, 0x3f800000u},
+    // -0 * 0 = -0
+    {0x80000000u, 0x00000000u, 0x80000000u},
+    // 0 * -0 = -0
+    {0x00000000u, 0x80000000u, 0x80000000u},    
+    // 1 * 2 = 2
+    {0x3f800000u, 0x40000000u, 0x40000000u},
+    // 2 * 1 = 2
+    {0x40000000u, 0x3f800000u, 0x40000000u},
+    // inf * inf = inf
+    {0x7f800000u, 0x7f800000u, 0x7f800000u},
+    // inf * -inf = -inf
+    {0x7f800000u, 0xff800000u, 0xff800000u},
+    // inf * 0 = nan
+    {0x7f800000u, 0x00000000u, 0xffffffffu},
+    // 0 * inf = nan
+    {0x00000000u, 0x7f800000u, 0xffffffffu},
+    // 1 * -inf = -inf
+    {0x3f800000u, 0xff800000u, 0xff800000u},
+    // -inf * 1 = -inf
+    {0xff800000u, 0x3f800000u, 0xff800000u},
+    // -1 * inf = -inf
+    {0xbf800000u, 0x7f800000u, 0xff800000u},
+    // inf * -1 = -inf
+    {0x7f800000u, 0xbf800000u, 0xff800000u},
+    // 1 * nonzero subnormal = exactly 0
+    {0x3f800000u, 0x007fffffu, 0x00000000u},
+    // nonzero subnormal * -1 = exactly -0
+    {0x007fffffu, 0xbf800000u, 0x80000000u},
+    // nan * 0 = same nan
+    {0xffff1234u, 0x00000000u, 0xffff1234u},
+    // 0 * nan = same nan
+    {0x00000000u, 0xffff1234u, 0xffff1234u},
+    // nan * 1 = same nan
+    {0xffff1234u, 0x3f800000u, 0xffff1234u},
+    // 1 * nan = same nan
+    {0x3f800000u, 0xffff1234u, 0xffff1234u},
+    // nan * inf = same nan
+    {0xffff1234u, 0x7f800000u, 0xffff1234u},
+    // inf * nan = same nan
+    {0x7f800000u, 0xffff1234u, 0xffff1234u},
+    // (2 - 0.5 ulp) x (2 - 0.5 ulp) = 4 - 1 ulp
+    // (the exact product is 4 - 1 ulp + 2^-46, which rounds down)
+    {0x3fffffffu, 0x3fffffffu, 0x407ffffeu},
+    // -(2 - 0.5 ulp) x (1 + 1 ulp) = exactly -2 after rounding
+    {0xbfffffffu, 0x3f800001u, 0xc0000000u},
+    // 1.666... * 1.333.. = 2.222...
+    {0x3fd55555u, 0x3faaaaaau, 0x400e38e3u},
+    // 1.25 x 2^-63 x 1.25 x 2^-64 = 0
+    // (normal inputs with subnormal output, and we claim to be FTZ)
+    {0x20200000u, 0x1fa00000u, 0x00000000u},
+};
+
+// Number of entries in each of the random test vector tables
+#define N_RANDOM_TESTS 1000
+// Random vector tables; their definitions are at the end of this file
+extern test_t add_random_tests[N_RANDOM_TESTS];
+extern test_t mul_random_tests[N_RANDOM_TESTS];
+
+// The functions under test, declared here as taking and returning raw 32-bit
+// patterns so that results can be compared bit-for-bit with the expected
+// values.
+// NOTE(review): this presumably relies on the target ABI passing float
+// arguments and results the same way as uint32_t -- confirm.
+uint32_t __addsf3(uint32_t x, uint32_t y);
+uint32_t __mulsf3(uint32_t x, uint32_t y);
+
+// Run every vector in a table through func and report mismatches.
+//
+//   tests    table of {x, y, expect} vectors
+//   n_tests  number of entries in the table
+//   op_str   operator symbol, used only in the failure message
+//   func     function under test, called as func(x, y)
+//
+// Prints one line per failing vector followed by a pass-count summary, and
+// returns the number of failures.
+int run_tests(test_t *tests, int n_tests, const char *op_str, uint32_t (*func)(uint32_t, uint32_t)) {
+    int failed = 0;
+    for (int i = 0; i < n_tests; ++i) {
+        uint32_t actual = func(tests[i].x, tests[i].y);
+        if (tests[i].expect != actual) {
+            // uint32_t is not necessarily unsigned int, so convert explicitly
+            // to a type that has a matching printf length modifier
+            printf("%08lx %s %08lx -> %08lx",
+                   (unsigned long)tests[i].x, op_str,
+                   (unsigned long)tests[i].y, (unsigned long)tests[i].expect);
+            printf("  FAIL: got %08lx\n", (unsigned long)actual);
+            ++failed;
+        }
+    }
+    printf("Passed: %d / %d\n", n_tests - failed, n_tests);
+    return failed;
+}
+
+// Test entry point: run the directed tables first, then the random tables
+// only if every directed test passed, and finally idle forever.
+int main() {
+    stdio_init_all();
+    // NOTE(review): the delay presumably gives a host time to attach to
+    // stdout before any results are printed -- confirm
+    sleep_ms(3000);
+
+    int failed = 0;
+    printf("Testing: __addsf3 (directed tests)\n");
+    failed += run_tests(add_directed_tests, count_of(add_directed_tests), "+", __addsf3);
+    printf("Testing: __mulsf3 (directed tests)\n");
+    failed += run_tests(mul_directed_tests, count_of(mul_directed_tests), "*", __mulsf3);
+
+    if (failed != 0) {
+        printf("Skipping random tests due to %d test failures\n", failed);
+    } else {
+        printf("Testing: __addsf3 (random tests)\n");
+        failed += run_tests(add_random_tests, N_RANDOM_TESTS, "+", __addsf3);
+        printf("Testing: __mulsf3 (random tests)\n");
+        failed += run_tests(mul_random_tests, N_RANDOM_TESTS, "*", __mulsf3);
+
+        printf("%d tests failed.\n", failed);
+        if (failed == 0) {
+            printf("Well done, you can relax now\n");
+        }
+    }
+
+    // Never return: idle here instead (keeps USB stdout alive)
+    for (;;) {
+        asm volatile ("wfi\n");
+    }
+    return 0;
+}
+
+// Generated using the FPU on my machine (Zen 4) plus FTZ on inputs/outputs
+// See hazard3_test_gen.c
+//
+// Each table is declared with N_RANDOM_TESTS entries and main() runs that
+// many, so each included file is expected to supply N_RANDOM_TESTS
+// initialisers; any missing trailing entries would be zero-initialised.
+test_t add_random_tests[N_RANDOM_TESTS] = {
+#include "vectors/hazard3_addsf.inc"
+};
+
+test_t mul_random_tests[N_RANDOM_TESTS] = {
+#include "vectors/hazard3_mulsf.inc"
+};
--- a/test/pico_float_test/vectors/hazard3_addsf.inc
+++ b/test/pico_float_test/vectors/hazard3_addsf.inc
--- a/test/pico_float_test/vectors/hazard3_mulsf.inc
+++ b/test/pico_float_test/vectors/hazard3_mulsf.inc