Add fast single-precision add/sub/mul for Hazard3 (#1883)

* Add fast single-precision add/sub/mul for Hazard3

* Make test output less noisy. Map -nan to -inf in vector gen. Move random vectors to separate files.

* Re-disable USB stdout for pico_float_test by default...

* Disable pico/float.h exports on RISC-V as these functions aren't implemented

* Add hazard3 instructions to asm_helper. Split hazard3.h to support this.

You can still include hazard3.h to get everything. This just allows you
to pull in less.
This commit is contained in:
Luke Wren 2024-08-30 17:36:30 +01:00 committed by GitHub
parent 876f331033
commit d886df6eb0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 2896 additions and 164 deletions

View file

@ -6,12 +6,17 @@
#include "pico.h"
#ifdef __riscv
// Get macros for convenient use of Hazard3 instructions without binutils support
#include "hardware/hazard3/instructions.h"
#endif
#if !PICO_ASSEMBLER_IS_CLANG
#define apsr_nzcv r15
#endif
# note we don't do this by default in this file for backwards compatibility with user code
# that may include this file, but not use unified syntax. Note that this macro does equivalent
# setup to the pico_default_asm macro for inline assembly in C code.
// note we don't do this by default in this file for backwards compatibility with user code
// that may include this file, but not use unified syntax. Note that this macro does equivalent
// setup to the pico_default_asm macro for inline assembly in C code.
.macro pico_default_asm_setup
#ifndef __riscv
.syntax unified
@ -60,28 +65,7 @@ weak_func WRAPPER_FUNC_NAME(\x)
.word \func + \offset
.endm
# backwards compatibility
// backwards compatibility
// Wrapper kept for older callers: forwards \func and \priority_string1 to
// __pre_init_with_offset with an offset of zero.
.macro __pre_init func, priority_string1
// Macro arguments are only substituted when written as \name; a bare name
// would be forwarded as a literal token rather than the caller's argument.
__pre_init_with_offset \func, 0, \priority_string1
.endm
#ifdef __riscv
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
.macro h3.bextm rd rs1 rs2 nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
.insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
.endm
// rd = (rs1 >> shamt) & ~(-1 << nbits)
.macro h3.bextmi rd rs1 shamt nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
.if (\shamt < 0) || (\shamt > 31)
.err
.endif
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
.endm
#endif

View file

@ -4,13 +4,18 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
#ifndef _HARDWARE_HAZARD3_
#define _HARDWARE_HAZARD3_
#ifndef _HARDWARE_HAZARD3_H
#define _HARDWARE_HAZARD3_H
#include "pico.h"
#include "hardware/riscv.h"
// This includes both standard and Hazard3 custom CSRs:
#include "hardware/regs/rvcsr.h"
#include "hardware/hazard3/features.h"
#include "hardware/hazard3/instructions.h"
/** \file hardware/hazard3.h
* \defgroup hardware_hazard3 hardware_hazard3
*
@ -18,87 +23,7 @@
*
*/
// Feature detection macros for Hazard3 custom extensions
#if PICO_RP2350
#define __hazard3_extension_xh3power
#define __hazard3_extension_xh3bextm
#define __hazard3_extension_xh3irq
#define __hazard3_extension_xh3pmpm
#endif
#ifdef __ASSEMBLER__
// Assembly language instruction macros for Hazard3 custom instructions
// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
// first bit indexed by rs2, and bit count configured by an immediate value.
// R-format instruction. Pseudocode:
//
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
.macro h3.bextm rd rs1 rs2 nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
#ifdef __hazard3_extension_xh3bextm
.insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
#else
srl \rd, \rs1, \rs2
andi \rd, \rd, ((1 << \nbits) - 1)
#endif
.endm
// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
// first bit index and the number of bits both configured by immediate
// values. I-format instruction. Pseudocode:
//
// rd = (rs1 >> shamt) & ~(-1 << nbits)
.macro h3.bextmi rd rs1 shamt nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
.if (\shamt < 0) || (\shamt > 31)
.err
.endif
#ifdef __hazard3_extension_xh3bextm
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
#else
srli \rd, \rs1, \shamt
andi \rd, \rd, ((1 << \nbits) - 1)
#endif
.endm
// h3.block: enter an idle state until another processor in the same
// multiprocessor complex executes an h3.unblock instruction, or the
// processor is interrupted. Fall through immediately if an h3.unblock has
// been received since the last execution of an h3.block on this processor.
// On RP2350, processors also have their own h3.unblock signals reflected
// back to them.
.macro h3.block
#ifdef __hazard3_extension_xh3power
slt x0, x0, x0
#else
nop
#endif
.endm
// h3.unblock: signal other processors in the same multiprocessor complex to
// exit the idle state entered by an h3.block instruction. On RP2350, this
// signal is also reflected back to the processor that executed the
// h3.unblock, which will cause that processor's next h3.block to fall
// through immediately.
.macro h3.unblock
#ifdef __hazard3_extension_xh3power
slt x0, x0, x1
#else
nop
#endif
.endm
#else // !__ASSEMBLER__
#ifndef __ASSEMBLER__
#ifdef __cplusplus
extern "C" {
@ -128,51 +53,6 @@ extern "C" {
#define hazard3_irqarray_clear(csr, index, data) static_assert(false, "Not supported: Xh3irq extension")
#endif
// nbits must be a constant expression
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextm(nbits, rs1, rs2) ({\
uint32_t __h3_bextm_rd; \
asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
: "=r" (__h3_bextm_rd) \
: "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
); \
__h3_bextm_rd; \
})
#else
#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
// nbits and shamt must be constant expressions
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextmi(nbits, rs1, shamt) ({\
uint32_t __h3_bextmi_rd; \
asm (".insn i 0x0b, 0x4, %0, %1, %2"\
: "=r" (__h3_bextmi_rd) \
: "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
); \
__h3_bextmi_rd; \
})
#else
#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
#ifdef __hazard3_extension_xh3power
#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
#else
#define __hazard3_block() do {} while (0)
#endif
#ifdef __hazard3_extension_xh3power
#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
#else
#define __hazard3_unblock() do {} while (0)
#endif
#ifdef __cplusplus
}
#endif
#endif // !__ASSEMBLER__
#endif

View file

@ -0,0 +1,29 @@
/*
* Copyright (c) 2024 Raspberry Pi Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#ifndef _HARDWARE_HAZARD3_FEATURES_H
#define _HARDWARE_HAZARD3_FEATURES_H
#include "pico.h"
/** \file hardware/hazard3/features.h
* \addtogroup hardware_hazard3
*
* \brief Sets macros for supported Hazard3 custom extensions (features) based on PICO_PLATFORM macros
*
*/
// Feature detection macros for Hazard3 custom extensions
#if PICO_RP2350
// Version 1.0 of these four extensions
// (encoded as major * 100 + minor)
#define __hazard3_extension_xh3power 100
#define __hazard3_extension_xh3bextm 100
#define __hazard3_extension_xh3irq 100
#define __hazard3_extension_xh3pmpm 100
#endif
#endif

View file

@ -0,0 +1,152 @@
/*
* Copyright (c) 2024 Raspberry Pi Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#ifndef _HARDWARE_HAZARD3_INSTRUCTIONS_H
#define _HARDWARE_HAZARD3_INSTRUCTIONS_H
#include "pico.h"
// Get list of supported extensions based on platform:
#include "hardware/hazard3/features.h"
/** \file hardware/hazard3/instructions.h
* \addtogroup hardware_hazard3
*
* \brief Intrinsics and asm macros for Hazard3 custom instructions
*
* The implementation of these intrinsics depends on the feature macros
* defined in hardware/hazard3/features.h. When the relevant feature is not
* present, the intrinsics fall back on an RV32I equivalent if possible.
*
*/
#ifdef __ASSEMBLER__
// Assembly language instruction macros for Hazard3 custom instructions
// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
// first bit indexed by rs2, and bit count configured by an immediate value.
// R-format instruction. Pseudocode:
//
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
//
// nbits must be an assemble-time constant in the range 1 to 8; anything else
// raises an assembler error.
.macro h3.bextm rd rs1 rs2 nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
#ifdef __hazard3_extension_xh3bextm
// custom-0 opcode (0x0b). funct3 = 0 selects the register-indexed form, the
// same encoding the __hazard3_bextm C intrinsic emits; funct3 = 4 is the
// immediate form (h3.bextmi). funct7[3:1] carries nbits - 1.
    .insn r 0x0b, 0x0, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
#else
// Fallback without the extension: plain shift followed by a mask
    srl \rd, \rs1, \rs2
    andi \rd, \rd, ((1 << \nbits) - 1)
#endif
.endm
// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
// first bit index and the number of bits both configured by immediate
// values. I-format instruction. Pseudocode:
//
// rd = (rs1 >> shamt) & ~(-1 << nbits)
//
// Both shamt (0 to 31) and nbits (1 to 8) must be assemble-time constants;
// out-of-range values raise an assembler error.
.macro h3.bextmi rd rs1 shamt nbits
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
.if (\shamt < 0) || (\shamt > 31)
.err
.endif
#ifdef __hazard3_extension_xh3bextm
// The 12-bit immediate packs shamt into bits 4:0 and (nbits - 1) into bits 8:6
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
#else
// Fallback without the extension: immediate shift followed by a mask
srli \rd, \rs1, \shamt
andi \rd, \rd, ((1 << \nbits) - 1)
#endif
.endm
// h3.block: enter an idle state until another processor in the same
// multiprocessor complex executes an h3.unblock instruction, or the
// processor is interrupted. Fall through immediately if an h3.unblock has
// been received since the last execution of an h3.block on this processor.
// On RP2350, processors also have their own h3.unblock signals reflected
// back to them.
//
// Takes no arguments. When the Xh3power feature macro is not defined this
// expands to a plain nop, so callers must not rely on it actually waiting.
.macro h3.block
#ifdef __hazard3_extension_xh3power
// The instruction is spelled as an slt whose destination is x0
slt x0, x0, x0
#else
nop
#endif
.endm
// h3.unblock: signal other processors in the same multiprocessor complex to
// exit the idle state entered by an h3.block instruction. On RP2350, this
// signal is also reflected back to the processor that executed the
// h3.unblock, which will cause that processor's next h3.block to fall
// through immediately.
//
// Takes no arguments. When the Xh3power feature macro is not defined this
// expands to a plain nop.
.macro h3.unblock
#ifdef __hazard3_extension_xh3power
// Same slt-to-x0 form as h3.block, distinguished by the x1 source operand
slt x0, x0, x1
#else
nop
#endif
.endm
#else // !__ASSEMBLER__
// C language instruction macros for Hazard3 custom instructions
#ifdef __cplusplus
extern "C" {
#endif
// __hazard3_bextm(nbits, rs1, rs2): extract nbits consecutive bits of rs1,
// starting at the bit index given by the low five bits of rs2.
// nbits must be a constant expression
// With the Xh3bextm feature macro defined this emits the custom R-format
// instruction (nbits - 1 is passed in the funct7 operand); otherwise it
// evaluates the same shift-and-mask in plain C. Note that only the low three
// bits of (nbits - 1) are used in either branch.
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextm(nbits, rs1, rs2) ({\
uint32_t __h3_bextm_rd; \
asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
: "=r" (__h3_bextm_rd) \
: "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
); \
__h3_bextm_rd; \
})
#else
#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
// __hazard3_bextmi(nbits, rs1, shamt): extract nbits consecutive bits of rs1,
// starting at bit index shamt.
// nbits and shamt must be constant expressions
// With the Xh3bextm feature macro defined this emits the custom I-format
// instruction; otherwise it falls back on an equivalent C shift-and-mask.
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextmi(nbits, rs1, shamt) ({\
    uint32_t __h3_bextmi_rd; \
    asm (".insn i 0x0b, 0x4, %0, %1, %2"\
        : "=r" (__h3_bextmi_rd) \
        : "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
    ); \
    __h3_bextmi_rd; \
})
#else
// The fallback must define __hazard3_bextmi itself (taking shamt), not
// redefine __hazard3_bextm.
#define __hazard3_bextmi(nbits, rs1, shamt) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
// __hazard3_block() / __hazard3_unblock(): C-level forms of the h3.block and
// h3.unblock instructions. Both take no arguments and yield no value. With
// the Xh3power feature macro defined they emit the instruction with a
// "memory" clobber; otherwise they expand to an empty statement.
#ifdef __hazard3_extension_xh3power
#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
#else
#define __hazard3_block() do {} while (0)
#define __hazard3_unblock() do {} while (0)
#endif
#ifdef __cplusplus
}
#endif
#endif // !__ASSEMBLER__
#endif

View file

@ -10,11 +10,7 @@
# add alias "default" which is just pico.
add_library(pico_float_default INTERFACE)
if (PICO_RISCV)
target_link_libraries(pico_float_default INTERFACE pico_float_compiler)
else()
target_link_libraries(pico_float_default INTERFACE pico_float_pico)
endif()
target_link_libraries(pico_float_default INTERFACE pico_float_pico)
set(PICO_DEFAULT_FLOAT_IMPL pico_float_default)
@ -128,6 +124,10 @@
wrap_float_functions(pico_float_pico_vfp NO_WRAP_AEABI)
target_link_libraries(pico_float_pico INTERFACE
pico_float_pico_vfp)
else()
target_sources(pico_float_pico INTERFACE
${CMAKE_CURRENT_LIST_DIR}/float_single_hazard3.S
)
endif()

View file

@ -0,0 +1,318 @@
/*
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include "pico/asm_helper.S"
#include "hardware/hazard3.h"
// This file reimplements some common single-precision soft float routines
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
// and is tuned for Hazard3 execution timings.
// Subnormal values are always flushed to zero on both input and output.
// Rounding is always to nearest (even on tie).
pico_default_asm_setup
// float_section name: switch to the executable ("ax") section for routine
// \name. The flash-resident section name is used unless PICO_FLOAT_IN_RAM is
// set, in which case the RAM section name is used instead.
.macro float_section name
#if !PICO_FLOAT_IN_RAM
.section SECTION_NAME(\name), "ax"
#else
.section RAM_SECTION_NAME(\name), "ax"
#endif
.endm
float_section __addsf3
// __subsf3 / __addsf3: single-precision subtract and add entry points.
// Subtraction is implemented as x + (-y) by flipping bit 31 of y and falling
// through into the add.
//
// Register roles in this routine:
//   a0 = packed x, a1 = packed y (presumably the standard argument registers)
//   a2 = exponent field of x, a3 = exponent field of y
//   a4 = significand of x,    a5 = significand of y (two's complement once
//        the sign has been applied below)
//   t0 = 255 (the nan/inf exponent), t1 = 25 (exponent-difference cutoff
//        used by addsf_core)
.global __subsf3
.p2align 2
__subsf3:
binvi a1, a1, 31
.global __addsf3
__addsf3:
// Unpack exponent:
h3.bextmi a2, a0, 23, 8
h3.bextmi a3, a1, 23, 8
// Flush-to-zero => 0 + y = y applies, including nan, with the sole
// exception of y being subnormal (which also needs to be flushed)
beqz a2, __addsf_return_y_flushed
// x + 0 = x needs no flush of x here, because we already know the exponent
// of x is nonzero
beqz a3, __addsf_return_x
// Unpack significand, plus 3 extra zeroes for working space:
slli a4, a0, 9
slli a5, a1, 9
// check nan/inf on input
// (at this point a4/a5 hold only the fraction bits, left-justified, which is
// what the nan/inf handlers test to tell nan from inf)
li t0, 255
beq a2, t0, __addsf_x_nan_inf
beq a3, t0, __addsf_y_nan_inf
// (finish unpacking significand)
srli a4, a4, 6
srli a5, a5, 6
// If we're still on the straight path then we are adding two normal
// values. Add implicit one (1.xx...xx000)
bseti a4, a4, 23 + 3
bseti a5, a5, 23 + 3
// Negate if sign bit is set
bgez a0, 1f
neg a4, a4
1:
// (tuck this 16-bit here to avoid alignment penalty)
li t1, 25
bgez a1, 1f
neg a5, a5
1:
// Choose which operand is the more-significant addend. Falls through into
// the x-exponent >= y-exponent copy of the main body.
bltu a2, a3, __addsf_ye_gt_xe
// The main body is repeated twice with different register assignments.
// lhs is the more-significant addend (the one with the larger exponent).
//
// Arguments:
//   packed_lhs / packed_rhs: registers holding the packed inputs; both are
//       reused as scratch once the large-difference early-out has been passed
//   sig_lhs / sig_rhs: two's complement significands (implicit one at bit 26)
//   exp_lhs / exp_rhs: exponent fields; exp_rhs is reused for the result sign
//   rhs_is_x: nonzero when rhs is the x operand, so that returning lhs
//       unmodified means returning y
// Expects t0 = 255 and t1 = 25. Writes the packed result to a0 and returns.
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
    sub \packed_rhs, \exp_lhs, \exp_rhs
    // If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
    bgeu \packed_rhs, t1, __addsf_return_y
.else
    bgeu \packed_rhs, t1, __addsf_return_x
.endif
    // Shift rhs down to correct relative significance
    sra \packed_lhs, \sig_rhs, \packed_rhs
    // Set sticky bit if ones were shifted out
    sll \packed_rhs, \packed_lhs, \packed_rhs
    sltu \packed_rhs, \packed_rhs, \sig_rhs
    or \packed_lhs, \packed_lhs, \packed_rhs
    // Add significands
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Detect exact cancellation (may be beyond max normalisation shift; also
    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
    beqz \sig_lhs, __addsf_return_0
    // Convert two's complement back to sign + magnitude
    // (exp_rhs now holds the result sign as an all-zeroes/all-ones mask)
    srai \exp_rhs, \sig_lhs, 31
    xor \sig_lhs, \sig_lhs, \exp_rhs
    sub \sig_lhs, \sig_lhs, \exp_rhs
    // Renormalise significand: bit 31 is now implicit one
    clz \packed_lhs, \sig_lhs
    sll \sig_lhs, \sig_lhs, \packed_lhs
    // Adjust exponent
    addi \packed_lhs, \packed_lhs, -5
    sub \exp_lhs, \exp_lhs, \packed_lhs
    // Round to nearest, even on tie (bias upward if above odd number)
    bexti \packed_lhs, \sig_lhs, 8
    addi \sig_lhs, \sig_lhs, 127
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Exponent may increase by one due to rounding up from all-ones; this is
    // detected by clearing of implicit one (there is a carry-out too)
    bgez \sig_lhs, 3f
4:
    // Detect underflow/overflow. Only exponent fields 1 through 254 encode a
    // normal number. A field of exactly 0 must be flushed as well: packing
    // it would emit a subnormal bit pattern whenever the fraction is nonzero.
    blez \exp_lhs, 1f
    bge \exp_lhs, t0, 2f
    // Pack and return
    packh \exp_lhs, \exp_lhs, \exp_rhs
    slli \exp_lhs, \exp_lhs, 23
    slli \sig_lhs, \sig_lhs, 1
    srli \sig_lhs, \sig_lhs, 9
    add a0, \sig_lhs, \exp_lhs
    ret
1:
    // Signed zero on underflow
    slli a0, \exp_rhs, 31
    ret
2:
    // Signed infinity on overflow
    packh a0, t0, \exp_rhs
    slli a0, a0, 23
    ret
3:
    // Exponent increase due to rounding (uncommon)
    srli \sig_lhs, \sig_lhs, 1
    addi \exp_lhs, \exp_lhs, 1
    j 4b
.endm
// Two copies of the main body: the first treats x as the more-significant
// addend and is reached by fall-through; the second treats y as the
// more-significant addend.
__addsf_xe_gte_ye:
addsf_core a0, a1, a4, a5, a2, a3, 0
.p2align 2
__addsf_ye_gt_xe:
addsf_core a1, a0, a5, a4, a3, a2, 1
// Special-case handlers. On entry to the nan/inf handlers a4/a5 still hold
// the left-justified fraction bits of x/y (nonzero means nan), a2/a3 the
// exponent fields, and t0 = 255.
__addsf_x_nan_inf:
// When at least one operand is nan, we must propagate at least one of
// those nan payloads (sign of nan result is unspecified, which we take
// advantage of by implementing x - y as x + -y). Check x nan vs inf:
bnez a4, __addsf_return_x
__addsf_x_inf:
// If x is +-inf, need to distinguish the following cases:
bne a3, t0, __addsf_return_x // y is neither inf nor nan -> return x (propagate inf)
bnez a5, __addsf_return_y // y is nan: -> return y (propagate nan)
xor a5, a0, a1
srli a5, a5, 31
beqz a5, __addsf_return_x // y is inf of same sign -> return either x or y (x is faster)
li a0, -1 // y is inf of different sign -> return nan
ret
__addsf_y_nan_inf:
// Mirror of __addsf_x_nan_inf
bnez a5, __addsf_return_y
__addsf_y_inf:
bne a2, t0, __addsf_return_y
bnez a4, __addsf_return_x
xor a4, a0, a1
srli a4, a4, 31
beqz a4, __addsf_return_x
li a0, -1
ret
// x has a zero exponent field: the result is y, except that a y with a zero
// exponent field is reduced to just its sign bit (signed zero).
__addsf_return_y_flushed:
bnez a3, 1f
srli a1, a1, 23
slli a1, a1, 23
1:
// The three return labels fall through into each other: return y copies a1
// into a0, return x leaves a0 untouched.
__addsf_return_y:
mv a0, a1
__addsf_return_x:
ret
// Exact cancellation: always +0
__addsf_return_0:
li a0, 0
ret
float_section __mulsf3
// __mulsf3: single-precision multiply.
//
// Register roles in this routine:
//   a0 = packed x (sign already merged with the sign of y), a1 = packed y
//   a2 = exponent field of x, later the result exponent
//   a3 = exponent field of y
//   a4 = significand of x, later the high half of the product
//   a5 = significand of y
//   a6 = result sign as an all-zeroes/all-ones mask
//   t0 = 255 (the nan/inf exponent)
.global __mulsf3
.p2align 2
__mulsf3:
    // Force y to be positive (by possibly negating x) *before* unpacking.
    // This allows many special cases to be handled without repacking.
    bgez a1, 1f
    binvi a0, a0, 31
1:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Check special cases
    li t0, 255
    beqz a2, __mulsf_x_0
    beqz a3, __mulsf_y_0
    beq a2, t0, __mulsf_x_nan_inf
    beq a3, t0, __mulsf_y_nan_inf
    // Finish unpacking sign
    srai a6, a0, 31
    // Unpack significand (with implicit one in MSB)
    slli a4, a0, 8
    slli a5, a1, 8
    bseti a4, a4, 31
    bseti a5, a5, 31
    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
    // Going from Q1.23 to Q2.46 (both left-justified)
    mul a1, a4, a5
    mulhu a4, a4, a5
    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
    // final significand (ignoring rounding)
    clz a0, a4
    sll a4, a4, a0
    sub a2, a2, a0
    // Calculate the exponent of the normalised, not yet rounded, product
    add a2, a2, a3
    // Subtract redundant bias term (127), add 1 for normalisation correction
    addi a2, a2, -126
    blez a2, __mulsf_underflow
    bge a2, t0, __mulsf_overflow
    // Gather sticky bits from low fraction:
    snez a1, a1
    or a4, a4, a1
    // Round to nearest, even on tie (aka bias upward if odd)
    bexti a1, a4, 8
    add a4, a4, a1
    addi a4, a4, 127
    // A product just below a power of two (e.g. 0x3ffffffe * 0x3f800001)
    // rounds up from an all-ones significand. Bit 31 was set before rounding,
    // so finding it clear afterwards means the add carried out and the
    // exponent must be incremented.
    bgez a4, 3f
2:
    // Pack it and ship it
    packh a2, a2, a6
    slli a2, a2, 23
    slli a4, a4, 1
    srli a4, a4, 9
    add a0, a4, a2
    ret
3:
    // Exponent increase due to rounding (uncommon). The wrapped significand
    // has no bits left above bit 7, so the packed fraction is already zero.
    // The larger exponent may now be out of range.
    addi a2, a2, 1
    bge a2, t0, __mulsf_overflow
    j 2b
// Special-case handlers for __mulsf3. On entry a0/a1 hold packed x/y (the
// sign of y already merged into x), a2/a3 the exponent fields, t0 = 255, and
// for the underflow/overflow handlers a6 holds the result sign mask.
__mulsf_underflow:
// Signed zero
slli a0, a6, 31
ret
__mulsf_overflow:
// Signed inf
packh a0, t0, a6
slli a0, a0, 23
ret
__mulsf_x_0:
// 0 times nan -> propagate nan
// 0 times inf -> generate nan
// 0 times others -> 0 (need to flush significand too as we are FTZ)
bne a3, t0, __mulsf_return_flushed_x
// a5 = fraction bits of y, nonzero if y is nan
slli a5, a1, 9
beqz a5, 1f
// Propagate nan from y
__mulsf_return_y:
mv a0, a1
ret
1:
// Generate new nan
li a0, -1
ret
__mulsf_y_0:
// Mirror image of x_0 except we still return x for signed 0, since the
// signs were already resolved.
bne a2, t0, __mulsf_return_flushed_x
// a1 = fraction bits of x: nan x is returned as-is, inf x becomes a new nan
slli a1, a0, 9
bnez a1, 1f
li a0, -1
1:
ret
__mulsf_return_flushed_x:
// If we don't support subnormals we at least need to flush to a canonical
// zero. This is just a sign bit in bit 31.
srli a0, a0, 31
slli a0, a0, 31
__mulsf_return_x:
ret
__mulsf_x_nan_inf:
// We know that y is not zero and is positive. So...
// x is nan -> return x
// else y is nan -> return y
// else y is inf -> return x
// else y is normal -> return x
// (the order of the first two clauses is actually our free choice)
slli a4, a0, 9
bnez a4, __mulsf_return_x
bne a3, t0, __mulsf_return_x
slli a5, a1, 9
bnez a5, __mulsf_return_y
ret // return x
__mulsf_y_nan_inf:
// We know that x is not zero, nan, nor inf. That just leaves normals.
// y is nan -> return y
// y is inf -> return inf * sgn(x) (since we already merged the signs)
slli a5, a1, 9
bnez a5, __mulsf_return_y
// Build a signed infinity from the sign mask of x and the exponent in t0
srai a0, a0, 31
packh a0, t0, a0
slli a0, a0, 23
ret
// This is a hack to improve soft float performance for the routines we don't
// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
// provide the compiler runtime count-leading-zeros helper as a single clz
// instruction. The libgcc name for this helper is __clzsi2; the original
// spelling __clz2si is kept as an alias so existing references still link.
float_section __clzsi2
.global __clzsi2
.global __clz2si
__clzsi2:
__clz2si:
    clz a0, a0
    ret

View file

@ -21,8 +21,8 @@ extern "C" {
*
* \brief Optimized single-precision floating point functions
*
* (Replacement) optimized implementations are provided of the following compiler built-ins
* and math library functions:
* (Replacement) optimized implementations are provided for the following compiler built-ins
* and math library functions on Arm:
*
* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
@ -34,11 +34,18 @@ extern "C" {
* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
* - exp10f, sincosf, powintf
*
* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly
* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
*
* - float2fix64_z, fdiv_fast, fsqrt_fast,
*
* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
*
* - __addsf3, __subsf3, __mulsf3
*/
// None of these functions are available on RISC-V:
#if !defined(__riscv) || PICO_COMBINED_DOCS
float int2float(int32_t f);
float uint2float(uint32_t f);
float int642float(int64_t f);
@ -74,6 +81,8 @@ float fdiv_fast(float n, float d);
float fsqrt_fast(float f);
#endif
#endif
#ifdef __cplusplus
}
#endif

View file

@ -1,7 +1,22 @@
PROJECT(pico_float_test)
# todo revisit this test for
if (NOT PICO_RISCV)
if (PICO_RISCV)
# Separate, simpler test: currently we only have a few single-precision
# routines for RISC-V soft float (and the other tests are a bit
# AEABI-dependent)
add_executable(pico_float_test
pico_float_test_hazard3.c
)
target_link_libraries(pico_float_test PRIVATE pico_float pico_stdlib)
target_include_directories(pico_float_test PRIVATE ${CMAKE_CURRENT_LIST_DIR})
pico_add_extra_outputs(pico_float_test)
# pico_enable_stdio_usb(pico_float_test 1)
# pico_enable_stdio_uart(pico_float_test 0)
else ()
add_executable(pico_float_test
pico_float_test.c
llvm/call_apsr.S
@ -64,4 +79,4 @@ if (NOT PICO_RISCV)
target_link_libraries(m33 pico_double pico_stdlib)
pico_add_extra_outputs(m33)
endif()
endif()
endif()

View file

@ -0,0 +1,136 @@
/*
* Copyright (c) 2024 Raspberry Pi Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <stdio.h>
#include <fenv.h>
#include <stdbool.h>
#include <stdint.h>
// xoshiro256++ pseudorandom number generator.
// Adapted from: https://prng.di.unimi.it/xoshiro256plusplus.c
// Original copyright notice:
/* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)
To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
It has excellent (sub-ns) speed, a state (256 bits) that is large
enough for any parallel application, and it passes all tests we are
aware of.
For generating just floating-point numbers, xoshiro256+ is even faster.
The state must be seeded so that it is not everywhere zero. If you have
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
output to fill s. */
// Rotate the 64-bit value x left by k bit positions. k is expected to be in
// the range 1 to 63 (the generator below uses 23 and 45).
static inline uint64_t xr256_rotl(const uint64_t x, int k) {
    const uint64_t shifted_up = x << k;
    const uint64_t wrapped_around = x >> (64 - k);
    return shifted_up | wrapped_around;
}
// Advance the four-word generator state s in place and return the next
// 64-bit output. The output is computed from the state as it was on entry.
uint64_t xr256_next(uint64_t s[4]) {
    const uint64_t out = xr256_rotl(s[0] + s[3], 23) + s[0];
    const uint64_t shifted_s1 = s[1] << 17;

    // Same update sequence as the reference implementation; each step reads
    // the values produced by the steps before it.
    uint64_t w0 = s[0], w1 = s[1], w2 = s[2], w3 = s[3];
    w2 ^= w0;
    w3 ^= w1;
    w1 ^= w2;
    w0 ^= w3;
    w2 ^= shifted_s1;
    w3 = xr256_rotl(w3, 45);

    s[0] = w0;
    s[1] = w1;
    s[2] = w2;
    s[3] = w3;
    return out;
}
// Return the raw 32-bit pattern of the float x, reinterpreted through a
// union rather than converted numerically.
uint32_t bitcast_f2u(float x) {
    union {
        float f;
        uint32_t u;
    } pun = { .f = x };
    return pun.u;
}
// Return the float whose raw 32-bit pattern is x, reinterpreted through a
// union rather than converted numerically.
float bitcast_u2f(uint32_t x) {
    union {
        float f;
        uint32_t u;
    } pun = { .u = x };
    return pun.f;
}
// True when the single-precision bit pattern x encodes a nan: an all-ones
// exponent field together with a nonzero fraction field.
bool is_nan_u(uint32_t x) {
    const uint32_t exponent_field = (x >> 23) & 0xffu;
    const uint32_t fraction_field = x & 0x007fffffu;
    return exponent_field == 0xffu && fraction_field != 0;
}
// Flush a single-precision bit pattern to zero: when the exponent field is
// zero, clear the fraction so only the sign bit remains. Any other pattern is
// returned unchanged.
uint32_t flush_to_zero_u(uint32_t x) {
    const uint32_t exponent_mask = 0xffu << 23;
    if ((x & exponent_mask) != 0) {
        return x;
    }
    return x & 0xff800000u;
}
// Reference model for addition on raw bit patterns: inputs are flushed to
// zero, the sum is computed with the host's float arithmetic, any nan result
// is replaced by the all-ones pattern, and the result is flushed to zero.
uint32_t model_fadd(uint32_t x, uint32_t y) {
    const float lhs = bitcast_u2f(flush_to_zero_u(x));
    const float rhs = bitcast_u2f(flush_to_zero_u(y));
    // Use local hardware implementation to perform calculation
    const uint32_t sum_bits = bitcast_f2u(lhs + rhs);
    // Use correct canonical generated nan (unaffected by the output flush,
    // since its exponent field is nonzero)
    if (is_nan_u(sum_bits)) {
        return -1u;
    }
    return flush_to_zero_u(sum_bits);
}
// Reference model for multiplication on raw bit patterns: inputs are flushed
// to zero, the product is computed with the host's float arithmetic, any nan
// result is replaced by the all-ones pattern, and the result is flushed to
// zero.
uint32_t model_fmul(uint32_t x, uint32_t y) {
    const float lhs = bitcast_u2f(flush_to_zero_u(x));
    const float rhs = bitcast_u2f(flush_to_zero_u(y));
    // Use local hardware implementation to perform calculation
    const uint32_t product_bits = bitcast_f2u(lhs * rhs);
    // Use correct canonical generated nan (unaffected by the output flush,
    // since its exponent field is nonzero)
    if (is_nan_u(product_bits)) {
        return -1u;
    }
    return flush_to_zero_u(product_bits);
}
// Print 1000 pseudorandom test vectors as C initialiser rows of the form
// {x, y, expected}. The preprocessor switch in the loop selects whether the
// expected value comes from the add model or the multiply model.
int main() {
    // SHA-256 of a rude word
    uint64_t rand_state[4] = {
        0x5891b5b522d5df08u,
        0x6d0ff0b110fbd9d2u,
        0x1bb4fc7163af34d0u,
        0x8286a2e846f6be03u
    };
    const uint32_t sign_and_exponent = -1u << 23;
    int remaining = 1000;
    while (remaining-- > 0) {
        // x is always drawn before y
        uint32_t x = xr256_next(rand_state) & 0xffffffffu;
        uint32_t y = xr256_next(rand_state) & 0xffffffffu;
        // Map nan to +-inf (input nans should already be well-covered)
        if (is_nan_u(x)) {
            x &= sign_and_exponent;
        }
        if (is_nan_u(y)) {
            y &= sign_and_exponent;
        }
#if 1
        printf("{0x%08xu, 0x%08xu, 0x%08xu},\n", x, y, model_fadd(x, y));
#else
        printf("{0x%08xu, 0x%08xu, 0x%08xu},\n", x, y, model_fmul(x, y));
#endif
    }
}

View file

@ -0,0 +1,209 @@
/**
* Copyright (c) 2024 Raspberry Pi Ltd.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <stdio.h>
#include "pico/stdlib.h"
// This test covers the single-precision functions in:
//
// src/pico_float/float_single_hazard3.S
//
// It assumes the canonical generated-NaN value and NaN sign rules used by
// those functions (which are unspecified by IEEE 754). It does not cover
// libgcc/libm functions from outside of that source file.
// One test vector. All three fields are raw single-precision bit patterns.
typedef struct {
// First operand passed to the function under test
uint32_t x;
// Second operand passed to the function under test
uint32_t y;
// Result the function is required to return, compared bit-for-bit
uint32_t expect;
} test_t;
test_t add_directed_tests[] = {
// 1 + 1 = 2
{0x3f800000u, 0x3f800000u, 0x40000000u},
// 2 + 1 = 3
{0x40000000u, 0x3f800000u, 0x40400000u},
// 1 + 2 = 3
{0x3f800000u, 0x40000000u, 0x40400000u},
// 1 + -1 = +0 (exact cancellation)
{0x3f800000u, 0xbf800000u, 0x00000000u},
// -1 + 1 = +0 (exact cancellation)
{0xbf800000u, 0x3f800000u, 0x00000000u},
// 1 + <<1 ulp = 1
{0x3f800000u, 0x2f800000u, 0x3f800000u},
// <<1 ulp + 1 = 1
{0x2f800000u, 0x3f800000u, 0x3f800000u},
// -1 + 1.25 = 0.25
{0xbf800000u, 0x3fa00000u, 0x3e800000u},
// max normal + 0.5 ulp = +inf
{0x7f7fffffu, 0x73000000u, 0x7f800000u},
// max normal + max normal = +inf
{0x7f7fffffu, 0x7f7fffffu, 0x7f800000u},
// most negative normal - 0.5 ulp = -inf
{0xff7fffffu, 0xf3000000u, 0xff800000u},
// most negative normal + most negative normal = -inf
{0xff7fffffu, 0xff7fffffu, 0xff800000u},
// max normal + 0.499... ulp = max normal
{0x7f7fffffu, 0x72ffffffu, 0x7f7fffffu},
// most negative normal - 0.499... ulp = most negative normal
{0xff7fffffu, 0xf2ffffffu, 0xff7fffffu},
// nan + 0 = same nan
{0xffff1234u, 0x00000000u, 0xffff1234u},
// 0 + nan = same nan
{0x00000000u, 0xffff1234u, 0xffff1234u},
// nan + 1 = same nan
{0xffff1234u, 0x3f800000u, 0xffff1234u},
// 1 + nan = same nan
{0x3f800000u, 0xffff1234u, 0xffff1234u},
// nan + inf = same nan
{0xffff1234u, 0x7f800000u, 0xffff1234u},
// inf + nan = same nan
{0x7f800000u, 0xffff1234u, 0xffff1234u},
// inf + inf = inf
{0x7f800000u, 0x7f800000u, 0x7f800000u},
// -inf + -inf = -inf
{0xff800000u, 0xff800000u, 0xff800000u},
// inf + -inf = nan (all-ones is our canonical cheap nan)
{0x7f800000u, 0xff800000u, 0xffffffffu},
// -inf + inf = nan
{0xff800000u, 0x7f800000u, 0xffffffffu},
// subnormal + subnormal = exactly 0
{0x007fffffu, 0x007fffffu, 0x00000000u},
// -subnormal + -subnormal = exactly -0
{0x807fffffu, 0x807fffffu, 0x80000000u},
// Even + 0.5 ulp: round down
{0x3f800002u, 0x33800000u, 0x3f800002u},
// Even - 0.5 ulp: round up
{0x3f800002u, 0xb3800000u, 0x3f800002u},
// Odd + 0.5 ulp: round up
{0x3f800001u, 0x33800000u, 0x3f800002u},
// Odd - 0.5 ulp: round down
{0x3f800001u, 0xb3800000u, 0x3f800000u},
// All-zeroes significand - 0.5 ulp: no rounding (exact)
{0x3f800000u, 0xb3800000u, 0x3f7fffffu},
// Very subnormal difference of normals: flushed to zero
{0x03800000u, 0x837fffffu, 0x00000000u},
// Barely subnormal difference of normals: also flushed (unflushed result is 2^(emin-1))
{0x03800000u, 0x837e0000u, 0x00000000u},
};
test_t mul_directed_tests[] = {
// -- directed tests --
// 1 * 1 = 1
{0x3f800000u, 0x3f800000u, 0x3f800000u},
// 1 * -1 = -1
{0x3f800000u, 0xbf800000u, 0xbf800000u},
// -1 * 1 = -1
{0xbf800000u, 0x3f800000u, 0xbf800000u},
// -1 * -1 = 1
{0xbf800000u, 0xbf800000u, 0x3f800000u},
// -0 * 0 = -0
{0x80000000u, 0x00000000u, 0x80000000u},
// 0 * -0 = - 0
{0x00000000u, 0x80000000u, 0x80000000u},
// 1 * 2 = 2
{0x3f800000u, 0x40000000u, 0x40000000u},
// 2 * 1 = 2
{0x40000000u, 0x3f800000u, 0x40000000u},
// inf * inf = inf
{0x7f800000u, 0x7f800000u, 0x7f800000u},
// inf * -inf = -inf
{0x7f800000u, 0xff800000u, 0xff800000u},
// inf * 0 = nan
{0x7f800000u, 0x00000000u, 0xffffffffu},
// 0 * inf = nan
{0x00000000u, 0x7f800000u, 0xffffffffu},
// 1 * -inf = -inf
{0x3f800000u, 0xff800000u, 0xff800000u},
// -inf * 1 = -inf
{0xff800000u, 0x3f800000u, 0xff800000u},
// -1 * inf = -inf
{0xbf800000u, 0x7f800000u, 0xff800000u},
// inf * -1 = -inf
{0x7f800000u, 0xbf800000u, 0xff800000u},
// 1 * nonzero subnormal = exactly 0
{0x3f800000u, 0x007fffffu, 0x00000000u},
// nonzero subnormal * -1 = exactly -0
{0x007fffffu, 0xbf800000u, 0x80000000u},
// nan * 0 = same nan
{0xffff1234u, 0x00000000u, 0xffff1234u},
// 0 * nan = same nan
{0x00000000u, 0xffff1234u, 0xffff1234u},
// nan * 1 = same nan
{0xffff1234u, 0x3f800000u, 0xffff1234u},
// 1 * nan = same nan
{0x3f800000u, 0xffff1234u, 0xffff1234u},
// nan * inf = same nan
{0xffff1234u, 0x7f800000u, 0xffff1234u},
// inf * nan = same nan
{0x7f800000u, 0xffff1234u, 0xffff1234u},
// (2 - 0.5 ulp) x (2 - 0.5 ulp) = 4 - 0.5 ulp
{0x3fffffffu, 0x3fffffffu, 0x407ffffeu},
// (2 - 0.5 ulp) x (1 + 1 ulp) = 2 exactly
{0xbfffffffu, 0x3f800001u, 0xc0000000u},
// 1.666... * 1.333.. = 2.222...
{0x3fd55555u, 0x3faaaaaau, 0x400e38e3u},
// 1.25 x 2^-63 x 1.25 x 2^-64 = 0
// (normal inputs with subnormal output, and we claim to be FTZ)
{0x20200000u, 0x1fa00000u, 0x00000000u},
};
#define N_RANDOM_TESTS 1000
extern test_t add_random_tests[N_RANDOM_TESTS];
extern test_t mul_random_tests[N_RANDOM_TESTS];
uint32_t __addsf3(uint32_t x, uint32_t y);
uint32_t __mulsf3(uint32_t x, uint32_t y);
// Run func over the first n_tests entries of tests, printing one line for
// every vector whose result differs from its expected value, followed by a
// pass-count summary. op_str is the operator symbol shown in failure lines.
// Returns the number of failing vectors.
int run_tests(test_t *tests, int n_tests, const char *op_str, uint32_t (*func)(uint32_t, uint32_t)) {
    int n_bad = 0;
    int idx = 0;
    while (idx < n_tests) {
        const test_t *vec = &tests[idx++];
        const uint32_t got = func(vec->x, vec->y);
        if (got == vec->expect) {
            continue;
        }
        printf("%08x %s %08x -> %08x", vec->x, op_str, vec->y, vec->expect);
        printf(" FAIL: got %08x\n", got);
        ++n_bad;
    }
    printf("Passed: %d / %d\n", n_tests - n_bad, n_tests);
    return n_bad;
}
// Test entry point: runs the directed vectors for add and multiply, then the
// random vectors only if every directed vector passed, and finally idles
// forever.
int main() {
    stdio_init_all();
    sleep_ms(3000);

    int failed = 0;
    printf("Testing: __addsf3 (directed tests)\n");
    failed += run_tests(add_directed_tests, count_of(add_directed_tests), "+", __addsf3);
    printf("Testing: __mulsf3 (directed tests)\n");
    failed += run_tests(mul_directed_tests, count_of(mul_directed_tests), "*", __mulsf3);

    if (failed) {
        printf("Skipping random tests due to %d test failures\n", failed);
    } else {
        printf("Testing: __addsf3 (random tests)\n");
        failed += run_tests(add_random_tests, N_RANDOM_TESTS, "+", __addsf3);
        printf("Testing: __mulsf3 (random tests)\n");
        failed += run_tests(mul_random_tests, N_RANDOM_TESTS, "*", __mulsf3);
        printf("%d tests failed.\n", failed);
        if (failed == 0) {
            printf("Well done, you can relax now\n");
        }
    }

    for (;;) {asm volatile ("wfi\n");} // keep USB stdout alive
    return 0;
}
// Generated using the FPU on my machine (Zen 4) plus FTZ on inputs/outputs
// See hazard3_test_gen.c
test_t add_random_tests[N_RANDOM_TESTS] = {
#include "vectors/hazard3_addsf.inc"
};
test_t mul_random_tests[N_RANDOM_TESTS] = {
#include "vectors/hazard3_mulsf.inc"
};

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff