mirror of
https://github.com/raspberrypi/pico-sdk.git
synced 2026-01-28 01:47:21 +01:00
Add fast single-precision add/sub/mul for Hazard3 (#1883)
* Add fast single-precision add/sub/mul for Hazard3 * Make test output less noisy. Map -nan to -inf in vector gen. Move random vectors to separate files. * Re-disable USB stdout for pico_float_test by default... * Disable pico/float.h exports on RISC-V as these functions aren't implemented * Add hazard3 instructions to asm_helper. Split hazard3.h to support this. You can still include hazard3.h to get everything. This just allows you to pull in less.
This commit is contained in:
parent
876f331033
commit
d886df6eb0
12 changed files with 2896 additions and 164 deletions
|
|
@ -6,12 +6,17 @@
|
|||
|
||||
#include "pico.h"
|
||||
|
||||
#ifdef __riscv
|
||||
// Get macros for convenient use of Hazard3 instructions without binutils support
|
||||
#include "hardware/hazard3/instructions.h"
|
||||
#endif
|
||||
|
||||
#if !PICO_ASSEMBLER_IS_CLANG
|
||||
#define apsr_nzcv r15
|
||||
#endif
|
||||
# note we don't do this by default in this file for backwards compatibility with user code
|
||||
# that may include this file, but not use unified syntax. Note that this macro does equivalent
|
||||
# setup to the pico_default_asm macro for inline assembly in C code.
|
||||
// note we don't do this by default in this file for backwards compatibility with user code
|
||||
// that may include this file, but not use unified syntax. Note that this macro does equivalent
|
||||
// setup to the pico_default_asm macro for inline assembly in C code.
|
||||
.macro pico_default_asm_setup
|
||||
#ifndef __riscv
|
||||
.syntax unified
|
||||
|
|
@ -60,28 +65,7 @@ weak_func WRAPPER_FUNC_NAME(\x)
|
|||
.word \func + \offset
|
||||
.endm
|
||||
|
||||
# backwards compatibility
|
||||
// backwards compatibility
|
||||
.macro __pre_init func, priority_string1
|
||||
__pre_init_with_offset func, 0, priority_string1
|
||||
.endm
|
||||
|
||||
#ifdef __riscv
|
||||
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
|
||||
.macro h3.bextm rd rs1 rs2 nbits
|
||||
.if (\nbits < 1) || (\nbits > 8)
|
||||
.err
|
||||
.endif
|
||||
.insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
|
||||
.endm
|
||||
|
||||
// rd = (rs1 >> shamt) & ~(-1 << nbits)
|
||||
.macro h3.bextmi rd rs1 shamt nbits
|
||||
.if (\nbits < 1) || (\nbits > 8)
|
||||
.err
|
||||
.endif
|
||||
.if (\shamt < 0) || (\shamt > 31)
|
||||
.err
|
||||
.endif
|
||||
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
|
||||
.endm
|
||||
#endif
|
||||
|
|
@ -4,13 +4,18 @@
|
|||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#ifndef _HARDWARE_HAZARD3_
|
||||
#define _HARDWARE_HAZARD3_
|
||||
#ifndef _HARDWARE_HAZARD3_H
|
||||
#define _HARDWARE_HAZARD3_H
|
||||
|
||||
#include "pico.h"
|
||||
#include "hardware/riscv.h"
|
||||
|
||||
// This includes both standard and Hazard3 custom CSRs:
|
||||
#include "hardware/regs/rvcsr.h"
|
||||
|
||||
#include "hardware/hazard3/features.h"
|
||||
#include "hardware/hazard3/instructions.h"
|
||||
|
||||
/** \file hardware/hazard3.h
|
||||
* \defgroup hardware_hazard3 hardware_hazard3
|
||||
*
|
||||
|
|
@ -18,87 +23,7 @@
|
|||
*
|
||||
*/
|
||||
|
||||
// Feature detection macros for Hazard3 custom extensions
|
||||
#if PICO_RP2350
|
||||
#define __hazard3_extension_xh3power
|
||||
#define __hazard3_extension_xh3bextm
|
||||
#define __hazard3_extension_xh3irq
|
||||
#define __hazard3_extension_xh3pmpm
|
||||
#endif
|
||||
|
||||
#ifdef __ASSEMBLER__
|
||||
|
||||
// Assembly language instruction macros for Hazard3 custom instructions
|
||||
|
||||
// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
|
||||
// first bit indexed by rs2, and bit count configured by an immediate value.
|
||||
// R-format instruction. Pseudocode:
|
||||
//
|
||||
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
|
||||
|
||||
.macro h3.bextm rd rs1 rs2 nbits
|
||||
.if (\nbits < 1) || (\nbits > 8)
|
||||
.err
|
||||
.endif
|
||||
#ifdef __hazard3_extension_xh3bextm
|
||||
.insn r 0x0b, 0x4, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
|
||||
#else
|
||||
srl \rd, \rs1, \rs2
|
||||
andi \rd, \rd, ((1 << \nbits) - 1)
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
|
||||
// first bit index and the number of bits both configured by immediate
|
||||
// values. I-format instruction. Pseudocode:
|
||||
//
|
||||
// rd = (rs1 >> shamt) & ~(-1 << nbits)
|
||||
|
||||
.macro h3.bextmi rd rs1 shamt nbits
|
||||
.if (\nbits < 1) || (\nbits > 8)
|
||||
.err
|
||||
.endif
|
||||
.if (\shamt < 0) || (\shamt > 31)
|
||||
.err
|
||||
.endif
|
||||
#ifdef __hazard3_extension_xh3bextm
|
||||
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
|
||||
#else
|
||||
srli \rd, \rs1, \shamt
|
||||
andi \rd, \rd, ((1 << \nbits) - 1)
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// h3.block: enter an idle state until another processor in the same
|
||||
// multiprocessor complex executes an h3.unblock instruction, or the
|
||||
// processor is interrupted. Fall through immediately if an h3.unblock has
|
||||
// been received since the last execution of an h3.block on this processor.
|
||||
// On RP2350, processors also have their own h3.unblock signals reflected
|
||||
// back to them.
|
||||
|
||||
.macro h3.block
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
slt x0, x0, x0
|
||||
#else
|
||||
nop
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// h3.unblock: signal other processors in the same multiprocessor complex to
|
||||
// exit the idle state entered by an h3.block instruction. On RP2350, this
|
||||
// signal is also reflected back to the processor that executed the
|
||||
// h3.unblock, which will cause that processor's next h3.block to fall
|
||||
// through immediately.
|
||||
|
||||
.macro h3.unblock
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
slt x0, x0, x1
|
||||
#else
|
||||
nop
|
||||
#endif
|
||||
.endm
|
||||
|
||||
#else // !__ASSEMBLER__
|
||||
#ifndef __ASSEMBLER__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -128,51 +53,6 @@ extern "C" {
|
|||
#define hazard3_irqarray_clear(csr, index, data) static_assert(false, "Not supported: Xh3irq extension")
|
||||
#endif
|
||||
|
||||
|
||||
// nbits must be a constant expression
|
||||
#ifdef __hazard3_extension_xh3bextm
|
||||
#define __hazard3_bextm(nbits, rs1, rs2) ({\
|
||||
uint32_t __h3_bextm_rd; \
|
||||
asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
|
||||
: "=r" (__h3_bextm_rd) \
|
||||
: "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
|
||||
); \
|
||||
__h3_bextm_rd; \
|
||||
})
|
||||
#else
|
||||
#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
|
||||
#endif
|
||||
|
||||
// nbits and shamt must be constant expressions
|
||||
// __hazard3_bextmi(nbits, rs1, shamt): extract nbits (1 to 8) consecutive
// bits of rs1, starting at bit position shamt (0 to 31):
//
//     (rs1 >> shamt) & ~(-1 << nbits)
//
// Uses the h3.bextmi custom instruction when the Xh3bextm extension is
// available, and otherwise an equivalent plain C expression.
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextmi(nbits, rs1, shamt) ({\
    uint32_t __h3_bextmi_rd; \
    asm (".insn i 0x0b, 0x4, %0, %1, %2"\
        : "=r" (__h3_bextmi_rd) \
        : "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
    ); \
    __h3_bextmi_rd; \
})
#else
// The fallback must use the same name and parameter list as the intrinsic
// above, so that callers build identically with and without the extension.
#define __hazard3_bextmi(nbits, rs1, shamt) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
|
||||
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
|
||||
#else
|
||||
#define __hazard3_block() do {} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
|
||||
#else
|
||||
#define __hazard3_unblock() do {} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // !__ASSEMBLER__
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#ifndef _HARDWARE_HAZARD3_FEATURES_H
|
||||
#define _HARDWARE_HAZARD3_FEATURES_H
|
||||
|
||||
#include "pico.h"
|
||||
|
||||
/** \file hardware/hazard3/features.h
|
||||
* \addtogroup hardware_hazard3
|
||||
*
|
||||
* \brief Sets macros for supported Hazard3 custom extensions (features) based on PICO_PLATFORM macros
|
||||
*
|
||||
*/
|
||||
|
||||
// Feature detection macros for Hazard3 custom extensions
|
||||
#if PICO_RP2350
|
||||
// Version 1.0 of these four extensions
|
||||
// (encoded as major * 100 + minor)
|
||||
#define __hazard3_extension_xh3power 100
|
||||
#define __hazard3_extension_xh3bextm 100
|
||||
#define __hazard3_extension_xh3irq 100
|
||||
#define __hazard3_extension_xh3pmpm 100
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#ifndef _HARDWARE_HAZARD3_INSTRUCTIONS_H
|
||||
#define _HARDWARE_HAZARD3_INSTRUCTIONS_H
|
||||
|
||||
#include "pico.h"
|
||||
|
||||
// Get list of supported extensions based on platform:
|
||||
#include "hardware/hazard3/features.h"
|
||||
|
||||
/** \file hardware/hazard3/instructions.h
|
||||
* \addtogroup hardware_hazard3
|
||||
*
|
||||
* \brief Intrinsics and asm macros for Hazard3 custom instructions
|
||||
*
|
||||
* The implementation of these intrinsics depends on the feature macros
|
||||
* defined in hardware/hazard3/features.h. When the relevant feature is not
|
||||
* present, the intrinsics fall back on an RV32I equivalent if possible.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef __ASSEMBLER__
|
||||
|
||||
// Assembly language instruction macros for Hazard3 custom instructions
|
||||
|
||||
// h3.bextm: Extract up to 8 consecutive bits from register rs1, with the
|
||||
// first bit indexed by rs2, and bit count configured by an immediate value.
|
||||
// R-format instruction. Pseudocode:
|
||||
//
|
||||
// rd = (rs1 >> rs2[4:0]) & ~(-1 << nbits)
|
||||
|
||||
.macro h3.bextm rd rs1 rs2 nbits
// nbits is encoded as (nbits - 1) in a 3-bit field, so only 1 to 8 is valid
.if (\nbits < 1) || (\nbits > 8)
.err
.endif
#ifdef __hazard3_extension_xh3bextm
    // R-format on the custom-0 opcode (0x0b). funct3 must be 0 here, matching
    // the __hazard3_bextm C intrinsic: funct3 0x4 belongs to h3.bextmi, whose
    // I-format immediate occupies the same bits as funct7 and rs2. Using 0x4
    // would therefore assemble as h3.bextmi with shamt equal to the register
    // number of rs2, rather than the value held in rs2.
    .insn r 0x0b, 0x0, (((\nbits - 1) & 0x7 ) << 1), \rd, \rs1, \rs2
#else
    // RV32I fallback: shift right by rs2[4:0], then mask to nbits
    srl \rd, \rs1, \rs2
    andi \rd, \rd, ((1 << \nbits) - 1)
#endif
.endm
|
||||
|
||||
// h3.bextmi: Extract up to 8 consecutive bits from register rs1, with the
|
||||
// first bit index and the number of bits both configured by immediate
|
||||
// values. I-format instruction. Pseudocode:
|
||||
//
|
||||
// rd = (rs1 >> shamt) & ~(-1 << nbits)
|
||||
|
||||
.macro h3.bextmi rd rs1 shamt nbits
|
||||
.if (\nbits < 1) || (\nbits > 8)
|
||||
.err
|
||||
.endif
|
||||
.if (\shamt < 0) || (\shamt > 31)
|
||||
.err
|
||||
.endif
|
||||
#ifdef __hazard3_extension_xh3bextm
|
||||
.insn i 0x0b, 0x4, \rd, \rs1, (\shamt & 0x1f) | (((\nbits - 1) & 0x7 ) << 6)
|
||||
#else
|
||||
srli \rd, \rs1, \shamt
|
||||
andi \rd, \rd, ((1 << \nbits) - 1)
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// h3.block: enter an idle state until another processor in the same
|
||||
// multiprocessor complex executes an h3.unblock instruction, or the
|
||||
// processor is interrupted. Fall through immediately if an h3.unblock has
|
||||
// been received since the last execution of an h3.block on this processor.
|
||||
// On RP2350, processors also have their own h3.unblock signals reflected
|
||||
// back to them.
|
||||
|
||||
.macro h3.block
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
slt x0, x0, x0
|
||||
#else
|
||||
nop
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// h3.unblock: signal other processors in the same multiprocessor complex to
|
||||
// exit the idle state entered by an h3.block instruction. On RP2350, this
|
||||
// signal is also reflected back to the processor that executed the
|
||||
// h3.unblock, which will cause that processor's next h3.block to fall
|
||||
// through immediately.
|
||||
|
||||
.macro h3.unblock
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
slt x0, x0, x1
|
||||
#else
|
||||
nop
|
||||
#endif
|
||||
.endm
|
||||
|
||||
#else // !__ASSEMBLER__
|
||||
|
||||
// C language instruction macros for Hazard3 custom instructions
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// nbits must be a constant expression
|
||||
#ifdef __hazard3_extension_xh3bextm
|
||||
#define __hazard3_bextm(nbits, rs1, rs2) ({\
|
||||
uint32_t __h3_bextm_rd; \
|
||||
asm (".insn r 0x0b, 0, %3, %0, %1, %2"\
|
||||
: "=r" (__h3_bextm_rd) \
|
||||
: "r" (rs1), "r" (rs2), "i" ((((nbits) - 1) & 0x7) << 1)\
|
||||
); \
|
||||
__h3_bextm_rd; \
|
||||
})
|
||||
#else
|
||||
#define __hazard3_bextm(nbits, rs1, rs2) (((rs1) >> ((rs2) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
|
||||
#endif
|
||||
|
||||
// nbits and shamt must be constant expressions
|
||||
// __hazard3_bextmi(nbits, rs1, shamt): extract nbits (1 to 8) consecutive
// bits of rs1, starting at bit position shamt (0 to 31):
//
//     (rs1 >> shamt) & ~(-1 << nbits)
//
// Uses the h3.bextmi custom instruction when the Xh3bextm extension is
// available, and otherwise an equivalent plain C expression.
#ifdef __hazard3_extension_xh3bextm
#define __hazard3_bextmi(nbits, rs1, shamt) ({\
    uint32_t __h3_bextmi_rd; \
    asm (".insn i 0x0b, 0x4, %0, %1, %2"\
        : "=r" (__h3_bextmi_rd) \
        : "r" (rs1), "i" ((((nbits) - 1) & 0x7) << 6 | ((shamt) & 0x1f)) \
    ); \
    __h3_bextmi_rd; \
})
#else
// The fallback must use the same name and parameter list as the intrinsic
// above, so that callers build identically with and without the extension.
#define __hazard3_bextmi(nbits, rs1, shamt) (((rs1) >> ((shamt) & 0x1f)) & (0xffu >> (7 - (((nbits) - 1) & 0x7))))
#endif
|
||||
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
#define __hazard3_block() asm volatile ("slt x0, x0, x0" : : : "memory")
|
||||
#else
|
||||
#define __hazard3_block() do {} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef __hazard3_extension_xh3power
|
||||
#define __hazard3_unblock() asm volatile ("slt x0, x0, x1" : : : "memory")
|
||||
#else
|
||||
#define __hazard3_unblock() do {} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // !__ASSEMBLER__
|
||||
|
||||
#endif
|
||||
|
|
@ -10,11 +10,7 @@
|
|||
|
||||
# add alias "default" which is just pico.
|
||||
add_library(pico_float_default INTERFACE)
|
||||
if (PICO_RISCV)
|
||||
target_link_libraries(pico_float_default INTERFACE pico_float_compiler)
|
||||
else()
|
||||
target_link_libraries(pico_float_default INTERFACE pico_float_pico)
|
||||
endif()
|
||||
target_link_libraries(pico_float_default INTERFACE pico_float_pico)
|
||||
|
||||
set(PICO_DEFAULT_FLOAT_IMPL pico_float_default)
|
||||
|
||||
|
|
@ -128,6 +124,10 @@
|
|||
wrap_float_functions(pico_float_pico_vfp NO_WRAP_AEABI)
|
||||
target_link_libraries(pico_float_pico INTERFACE
|
||||
pico_float_pico_vfp)
|
||||
else()
|
||||
target_sources(pico_float_pico INTERFACE
|
||||
${CMAKE_CURRENT_LIST_DIR}/float_single_hazard3.S
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
|
|
|
|||
318
src/rp2_common/pico_float/float_single_hazard3.S
Normal file
318
src/rp2_common/pico_float/float_single_hazard3.S
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include "pico/asm_helper.S"
|
||||
#include "hardware/hazard3.h"
|
||||
|
||||
// This file reimplements some common single-precision soft float routines
|
||||
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
|
||||
// and is tuned for Hazard3 execution timings.
|
||||
|
||||
// Subnormal values are always flushed to zero on both input and output.
|
||||
// Rounding is always to nearest (even on tie).
|
||||
|
||||
pico_default_asm_setup
|
||||
|
||||
.macro float_section name
|
||||
#if PICO_FLOAT_IN_RAM
|
||||
.section RAM_SECTION_NAME(\name), "ax"
|
||||
#else
|
||||
.section SECTION_NAME(\name), "ax"
|
||||
#endif
|
||||
.endm
|
||||
|
||||
float_section __addsf3
|
||||
.global __subsf3
|
||||
.p2align 2
|
||||
__subsf3:
|
||||
binvi a1, a1, 31
|
||||
.global __addsf3
|
||||
__addsf3:
|
||||
// Unpack exponent:
|
||||
h3.bextmi a2, a0, 23, 8
|
||||
h3.bextmi a3, a1, 23, 8
|
||||
// Flush-to-zero => 0 + y = y applies, including nan, with the sole
|
||||
// exception of y being subnormal (which also needs to be flushed)
|
||||
beqz a2, __addsf_return_y_flushed
|
||||
// Don't have to handle this case for x + 0 = 0 because we already know x
|
||||
// is nonzero
|
||||
beqz a3, __addsf_return_x
|
||||
// Unpack significand, plus 3 extra zeroes for working space:
|
||||
slli a4, a0, 9
|
||||
slli a5, a1, 9
|
||||
// check nan/inf on input
|
||||
li t0, 255
|
||||
beq a2, t0, __addsf_x_nan_inf
|
||||
beq a3, t0, __addsf_y_nan_inf
|
||||
// (finish unpacking significand)
|
||||
srli a4, a4, 6
|
||||
srli a5, a5, 6
|
||||
|
||||
// If we're still on the straight path then we are adding two normal
|
||||
// values. Add implicit one (1.xx...xx000)
|
||||
bseti a4, a4, 23 + 3
|
||||
bseti a5, a5, 23 + 3
|
||||
// Negate if sign bit is set
|
||||
bgez a0, 1f
|
||||
neg a4, a4
|
||||
1:
|
||||
// (tuck this 16-bit here to avoid alignment penalty)
|
||||
li t1, 25
|
||||
bgez a1, 1f
|
||||
neg a5, a5
|
||||
1:
|
||||
|
||||
bltu a2, a3, __addsf_ye_gt_xe
|
||||
|
||||
// The main body is repeated twice with different register assignments.
|
||||
// lhs is the more-significant addend:
|
||||
// Register roles (the macro is instantiated once per operand ordering):
//   packed_lhs  packed operand with the larger exponent; must stay intact
//               until the early-out below, then reused as scratch
//   packed_rhs  packed operand with the smaller exponent; scratch throughout
//   sig_lhs/sig_rhs  two's complement significands, implicit one at bit 26
//   exp_lhs/exp_rhs  biased exponents; exp_rhs is later reused for the sign
//                    of the result (0 or -1)
//   rhs_is_x    nonzero if rhs is x (a0), zero if rhs is y (a1)
// Expects t0 = 255 and t1 = 25. Result is returned in a0.
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
    sub \packed_rhs, \exp_lhs, \exp_rhs
    // If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
    bgeu \packed_rhs, t1, __addsf_return_y
.else
    bgeu \packed_rhs, t1, __addsf_return_x
.endif
    // Shift rhs down to correct relative significance
    sra \packed_lhs, \sig_rhs, \packed_rhs
    // Set sticky bit if ones were shifted out
    sll \packed_rhs, \packed_lhs, \packed_rhs
    sltu \packed_rhs, \packed_rhs, \sig_rhs
    or \packed_lhs, \packed_lhs, \packed_rhs
    // Add significands
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Detect exact cancellation (may be beyond max normalisation shift; also
    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
    beqz \sig_lhs, __addsf_return_0
    // Convert two's complement back to sign + magnitude
    srai \exp_rhs, \sig_lhs, 31
    xor \sig_lhs, \sig_lhs, \exp_rhs
    sub \sig_lhs, \sig_lhs, \exp_rhs
    // Renormalise significand: bit 31 is now implicit one
    clz \packed_lhs, \sig_lhs
    sll \sig_lhs, \sig_lhs, \packed_lhs
    // Adjust exponent (clz is 5 when the implicit one is still at bit 26)
    addi \packed_lhs, \packed_lhs, -5
    sub \exp_lhs, \exp_lhs, \packed_lhs

    // Round to nearest, even on tie (bias upward if above odd number)
    bexti \packed_lhs, \sig_lhs, 8
    addi \sig_lhs, \sig_lhs, 127
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Exponent may increase by one due to rounding up from all-ones; this is
    // detected by clearing of implicit one (there is a carry-out too)
    bgez \sig_lhs, 3f
4:
    // Detect underflow/overflow: negative exponents compare as large
    // unsigned values, so this catches both exponent < 0 and exponent >= 255
    bgeu \exp_lhs, t0, 1f
    // An exponent of exactly 0 is also an underflow: packing it would emit a
    // subnormal encoding (with the wrong value), and outputs are flushed to
    // zero. This happens when two small normals of opposite sign cancel.
    beqz \exp_lhs, 5f

    // Pack and return
    packh \exp_lhs, \exp_lhs, \exp_rhs
    slli \exp_lhs, \exp_lhs, 23
    slli \sig_lhs, \sig_lhs, 1
    srli \sig_lhs, \sig_lhs, 9
    add a0, \sig_lhs, \exp_lhs
    ret
1:
    bgez \exp_lhs, 2f
5:
    // Signed zero on underflow
    slli a0, \exp_rhs, 31
    ret
2:
    // Signed infinity on overflow
    packh a0, t0, \exp_rhs
    slli a0, a0, 23
    ret
3:
    // Exponent increase due to rounding (uncommon)
    srli \sig_lhs, \sig_lhs, 1
    addi \exp_lhs, \exp_lhs, 1
    j 4b
.endm
|
||||
|
||||
__addsf_xe_gte_ye:
|
||||
addsf_core a0, a1, a4, a5, a2, a3, 0
|
||||
.p2align 2
|
||||
__addsf_ye_gt_xe:
|
||||
addsf_core a1, a0, a5, a4, a3, a2, 1
|
||||
|
||||
__addsf_x_nan_inf:
|
||||
// When at least one operand is nan, we must propagate at least one of
|
||||
// those nan payloads (sign of nan result is unspecified, which we take
|
||||
// advantage of by implementing x - y as x + -y). Check x nan vs inf:
|
||||
bnez a4, __addsf_return_x
|
||||
__addsf_x_inf:
|
||||
// If x is +-inf, need to distinguish the following cases:
|
||||
bne a3, t0, __addsf_return_x // y is neither inf nor nan -> return x (propagate inf)
|
||||
bnez a5, __addsf_return_y // y is nan: -> return y (propagate nan)
|
||||
xor a5, a0, a1
|
||||
srli a5, a5, 31
|
||||
beqz a5, __addsf_return_x // y is inf of same sign -> return either x or y (x is faster)
|
||||
li a0, -1 // y is inf of different sign -> return nan
|
||||
ret
|
||||
|
||||
__addsf_y_nan_inf:
|
||||
// Mirror of __addsf_x_nan_inf
|
||||
bnez a5, __addsf_return_y
|
||||
__addsf_y_inf:
|
||||
bne a2, t0, __addsf_return_y
|
||||
bnez a4, __addsf_return_x
|
||||
xor a4, a0, a1
|
||||
srli a4, a4, 31
|
||||
beqz a4, __addsf_return_x
|
||||
li a0, -1
|
||||
ret
|
||||
|
||||
// Shared return paths for __addsf3/__subsf3. On entry a0 = x and a1 = y
// (packed); __addsf_return_y_flushed additionally expects a3 = exponent of y.
__addsf_return_y_flushed:
    // x has a zero exponent (zero, or a subnormal which is treated as zero).
    // If y has a nonzero exponent then x + y = y, returned unmodified.
    bnez a3, __addsf_return_y
    // Both operands are zero after flushing. The sum of two zeroes is -0
    // only when both are negative: (+0) + (-0) and (-0) + (+0) are +0 when
    // rounding to nearest. So the result is the AND of the two sign bits,
    // with all exponent and significand bits cleared.
    and a0, a0, a1
    srli a0, a0, 31
    slli a0, a0, 31
    ret
__addsf_return_y:
    mv a0, a1
__addsf_return_x:
    ret
__addsf_return_0:
    // Canonical +0, used for exact cancellation
    li a0, 0
    ret
|
||||
|
||||
|
||||
float_section __mulsf3
|
||||
.global __mulsf3
|
||||
.p2align 2
|
||||
// float __mulsf3(float x, float y)
// In:       a0 = x, a1 = y (packed single-precision bit patterns)
// Out:      a0 = x * y, rounded to nearest (even on tie)
// Clobbers: a1-a6, t0
__mulsf3:
    // Force y to be positive (by possibly negating x) *before* unpacking.
    // This allows many special cases to be handled without repacking.
    bgez a1, 1f
    binvi a0, a0, 31
1:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Check special cases
    li t0, 255
    beqz a2, __mulsf_x_0
    beqz a3, __mulsf_y_0
    beq a2, t0, __mulsf_x_nan_inf
    beq a3, t0, __mulsf_y_nan_inf

    // Finish unpacking sign
    srai a6, a0, 31
    // Unpack significand (with implicit one in MSB)
    slli a4, a0, 8
    slli a5, a1, 8
    bseti a4, a4, 31
    bseti a5, a5, 31
    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
    // Going from Q1.23 to Q2.46 (both left-justified)
    mul a1, a4, a5
    mulhu a4, a4, a5
    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
    // final significand (ignoring rounding)
    clz a0, a4
    sll a4, a4, a0
    sub a2, a2, a0
    // After normalising we can calculate the pre-rounding exponent. Rounding
    // may still increase it by one (handled at __mulsf_round_carry)
    add a2, a2, a3
    // Subtract redundant bias term (127), add 1 for normalisation correction
    addi a2, a2, -126
    blez a2, __mulsf_underflow
    bge a2, t0, __mulsf_overflow

    // Gather sticky bits from low fraction:
    snez a1, a1
    or a4, a4, a1
    // Round to nearest, even on tie (aka bias upward if odd)
    bexti a1, a4, 8
    add a4, a4, a1
    addi a4, a4, 127
    // Rounding up from an all-ones significand carries out of bit 31, which
    // shows up as the implicit one being cleared. For example
    // 0x3ffffffe * 0x3f800001 = 2 - 2^-45, which must round to 2.0.
    bgez a4, __mulsf_round_carry
__mulsf_pack:
    // Pack it and ship it
    packh a2, a2, a6
    slli a2, a2, 23
    slli a4, a4, 1
    srli a4, a4, 9
    add a0, a4, a2
    ret

__mulsf_round_carry:
    // Exponent increase due to rounding (uncommon). After the carry-out only
    // bits below bit 7 of a4 can be set, and the packing shifts discard
    // those, so the significand packs as zero without further adjustment.
    // If the exponent reaches 255 here, the packed result is a signed
    // infinity, which is the correct overflow result.
    addi a2, a2, 1
    j __mulsf_pack
|
||||
|
||||
__mulsf_underflow:
|
||||
// Signed zero
|
||||
slli a0, a6, 31
|
||||
ret
|
||||
__mulsf_overflow:
|
||||
// Signed inf
|
||||
packh a0, t0, a6
|
||||
slli a0, a0, 23
|
||||
ret
|
||||
|
||||
__mulsf_x_0:
|
||||
// 0 times nan -> propagate nan
|
||||
// 0 times inf -> generate nan
|
||||
// 0 times others -> 0 (need to flush significand too as we are FTZ)
|
||||
bne a3, t0, __mulsf_return_flushed_x
|
||||
slli a5, a1, 9
|
||||
beqz a5, 1f
|
||||
// Propagate nan from y
|
||||
__mulsf_return_y:
|
||||
mv a0, a1
|
||||
ret
|
||||
1:
|
||||
// Generate new nan
|
||||
li a0, -1
|
||||
ret
|
||||
|
||||
__mulsf_y_0:
|
||||
// Mirror image of x_0 except we still return x for signed 0, since the
|
||||
// signs were already resolved.
|
||||
bne a2, t0, __mulsf_return_flushed_x
|
||||
slli a1, a0, 9
|
||||
bnez a1, 1f
|
||||
li a0, -1
|
||||
1:
|
||||
ret
|
||||
|
||||
__mulsf_return_flushed_x:
|
||||
// If we don't support subnormals we at least need to flush to a canonical
|
||||
// zero. This is just a sign bit in bit 31.
|
||||
srli a0, a0, 31
|
||||
slli a0, a0, 31
|
||||
__mulsf_return_x:
|
||||
ret
|
||||
|
||||
__mulsf_x_nan_inf:
|
||||
// We know that y is not zero and is positive. So...
|
||||
// x is nan -> return x
|
||||
// else y is nan -> return y
|
||||
// else y is inf -> return x
|
||||
// else y is normal -> return x
|
||||
// (the order of the first two clauses is actually our free choice)
|
||||
slli a4, a0, 9
|
||||
bnez a4, __mulsf_return_x
|
||||
bne a3, t0, __mulsf_return_x
|
||||
slli a5, a1, 9
|
||||
bnez a5, __mulsf_return_y
|
||||
ret // return x
|
||||
|
||||
__mulsf_y_nan_inf:
|
||||
// We know that x is not zero, nan, nor inf. That just leaves normals.
|
||||
// y is nan -> return y
|
||||
// y is inf -> return inf * sgn(x) (since we already merged the signs)
|
||||
slli a5, a1, 9
|
||||
bnez a5, __mulsf_return_y
|
||||
srai a0, a0, 31
|
||||
packh a0, t0, a0
|
||||
slli a0, a0, 23
|
||||
ret
|
||||
|
||||
|
||||
// This is a hack to improve soft float performance for the routines we don't
|
||||
// implement (e.g. libm) in libraries built against a non-Zbb ISA dialect:
|
||||
float_section __clz2si
|
||||
.global __clz2si
|
||||
__clz2si:
|
||||
clz a0, a0
|
||||
ret
|
||||
|
|
@ -21,8 +21,8 @@ extern "C" {
|
|||
*
|
||||
* \brief Optimized single-precision floating point functions
|
||||
*
|
||||
* (Replacement) optimized implementations are provided of the following compiler built-ins
|
||||
* and math library functions:
|
||||
* (Replacement) optimized implementations are provided for the following compiler built-ins
|
||||
* and math library functions on Arm:
|
||||
*
|
||||
* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
|
||||
* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
|
||||
|
|
@ -34,11 +34,18 @@ extern "C" {
|
|||
* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
|
||||
* - exp10f, sincosf, powintf
|
||||
*
|
||||
* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly
|
||||
* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
|
||||
*
|
||||
* - float2fix64_z, fdiv_fast, fsqrt_fast,
|
||||
*
|
||||
* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
|
||||
*
|
||||
* - __addsf3, __subsf3, __mulsf3
|
||||
*/
|
||||
|
||||
// None of these functions are available on RISC-V:
|
||||
#if !defined(__riscv) || PICO_COMBINED_DOCS
|
||||
|
||||
float int2float(int32_t f);
|
||||
float uint2float(uint32_t f);
|
||||
float int642float(int64_t f);
|
||||
|
|
@ -74,6 +81,8 @@ float fdiv_fast(float n, float d);
|
|||
float fsqrt_fast(float f);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,7 +1,22 @@
|
|||
PROJECT(pico_float_test)
|
||||
|
||||
# todo revist this test for
|
||||
if (NOT PICO_RISCV)
|
||||
|
||||
if (PICO_RISCV)
|
||||
|
||||
# Separate, simpler test: currently we only have a few single-precision
|
||||
# routines for RISC-V soft float (and the other tests are a bit
|
||||
# AEABI-dependent)
|
||||
add_executable(pico_float_test
|
||||
pico_float_test_hazard3.c
|
||||
)
|
||||
target_link_libraries(pico_float_test PRIVATE pico_float pico_stdlib)
|
||||
target_include_directories(pico_float_test PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
pico_add_extra_outputs(pico_float_test)
|
||||
|
||||
# pico_enable_stdio_usb(pico_float_test 1)
|
||||
# pico_enable_stdio_uart(pico_float_test 0)
|
||||
|
||||
else ()
|
||||
add_executable(pico_float_test
|
||||
pico_float_test.c
|
||||
llvm/call_apsr.S
|
||||
|
|
@ -64,4 +79,4 @@ if (NOT PICO_RISCV)
|
|||
target_link_libraries(m33 pico_double pico_stdlib)
|
||||
pico_add_extra_outputs(m33)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
|||
136
test/pico_float_test/hazard3_test_gen.c
Normal file
136
test/pico_float_test/hazard3_test_gen.c
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
/*
|
||||
* Copyright (c) 2024 Raspberry Pi Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fenv.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// xoshiro256++ pseudorandom number generator.
|
||||
// Adapted from: https://prng.di.unimi.it/xoshiro256plusplus.c
|
||||
// Original copyright notice:
|
||||
|
||||
/* Written in 2019 by David Blackman and Sebastiano Vigna (vigna@acm.org)
|
||||
|
||||
To the extent possible under law, the author has dedicated all copyright
|
||||
and related and neighboring rights to this software to the public domain
|
||||
worldwide. This software is distributed without any warranty.
|
||||
|
||||
See <http://creativecommons.org/publicdomain/zero/1.0/>. */
|
||||
|
||||
/* This is xoshiro256++ 1.0, one of our all-purpose, rock-solid generators.
|
||||
It has excellent (sub-ns) speed, a state (256 bits) that is large
|
||||
enough for any parallel application, and it passes all tests we are
|
||||
aware of.
|
||||
|
||||
For generating just floating-point numbers, xoshiro256+ is even faster.
|
||||
|
||||
The state must be seeded so that it is not everywhere zero. If you have
|
||||
a 64-bit seed, we suggest to seed a splitmix64 generator and use its
|
||||
output to fill s. */
|
||||
|
||||
// Rotate x left by k bits. The rotation count is reduced modulo 64, so a
// count of 0 (or any multiple of 64) returns x unchanged rather than
// shifting by the full width of the type, which is undefined behaviour in C.
static inline uint64_t xr256_rotl(const uint64_t x, int k) {
    const unsigned int left = (unsigned int)k & 63u;
    const unsigned int right = (64u - left) & 63u;
    return (x << left) | (x >> right);
}
|
||||
|
||||
uint64_t xr256_next(uint64_t s[4]) {
|
||||
const uint64_t result = xr256_rotl(s[0] + s[3], 23) + s[0];
|
||||
|
||||
const uint64_t t = s[1] << 17;
|
||||
|
||||
s[2] ^= s[0];
|
||||
s[3] ^= s[1];
|
||||
s[1] ^= s[2];
|
||||
s[0] ^= s[3];
|
||||
|
||||
s[2] ^= t;
|
||||
|
||||
s[3] = xr256_rotl(s[3], 45);
|
||||
|
||||
return result;
|
||||
}
|
||||
uint32_t bitcast_f2u(float x) {
|
||||
// This is UB but then so is every C program
|
||||
union {
|
||||
float f;
|
||||
uint32_t u;
|
||||
} un;
|
||||
un.f = x;
|
||||
return un.u;
|
||||
}
|
||||
|
||||
float bitcast_u2f(uint32_t x) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t u;
|
||||
} un;
|
||||
un.u = x;
|
||||
return un.f;
|
||||
}
|
||||
|
||||
// Returns true if x, taken as a single-precision bit pattern, has an
// all-ones exponent field (bits 30:23) and a nonzero fraction field
// (bits 22:0), i.e. it encodes a NaN rather than an infinity.
bool is_nan_u(uint32_t x) {
    const uint32_t exponent = (x >> 23) & 0xffu;
    const uint32_t fraction = x & 0x007fffffu;
    if (exponent != 0xffu) {
        return false;
    }
    return fraction != 0;
}
|
||||
|
||||
// If the exponent field (bits 30:23) of x is zero, return x with its
// fraction bits cleared, leaving only the sign bit: a zero of the same sign.
// Any other bit pattern is returned unchanged.
uint32_t flush_to_zero_u(uint32_t x) {
    const uint32_t exponent_mask = 0x7f800000u;
    const uint32_t sign_mask = 0x80000000u;
    return (x & exponent_mask) ? x : (x & sign_mask);
}
|
||||
|
||||
uint32_t model_fadd(uint32_t x, uint32_t y) {
|
||||
x = flush_to_zero_u(x);
|
||||
y = flush_to_zero_u(y);
|
||||
// Use local hardware implementation to perform calculation
|
||||
uint32_t result = bitcast_f2u(bitcast_u2f(x) + bitcast_u2f(y));
|
||||
// Use correct canonical generated nan
|
||||
if (is_nan_u(result)) {
|
||||
result = -1u;
|
||||
}
|
||||
result = flush_to_zero_u(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
uint32_t model_fmul(uint32_t x, uint32_t y) {
|
||||
x = flush_to_zero_u(x);
|
||||
y = flush_to_zero_u(y);
|
||||
// Use local hardware implementation to perform calculation
|
||||
uint32_t result = bitcast_f2u(bitcast_u2f(x) * bitcast_u2f(y));
|
||||
// Use correct canonical generated nan
|
||||
if (is_nan_u(result)) {
|
||||
result = -1u;
|
||||
}
|
||||
result = flush_to_zero_u(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * Print 1000 random test vectors, one "{x, y, expect}," initialiser per
 * line, to stdout. The preprocessor switch inside the loop selects whether
 * the expected value comes from model_fadd or model_fmul; the operand
 * sequence is the same either way because the seed is fixed.
 */
int main() {
    // SHA-256 of a rude word
    uint64_t rand_state[4] = {
        0x5891b5b522d5df08u,
        0x6d0ff0b110fbd9d2u,
        0x1bb4fc7163af34d0u,
        0x8286a2e846f6be03u
    };

    int remaining = 1000;
    while (remaining-- > 0) {
        // Low 32 bits of two consecutive generator outputs: x first, then y
        uint32_t x = xr256_next(rand_state) & 0xffffffffu;
        uint32_t y = xr256_next(rand_state) & 0xffffffffu;

        // Map nan to +-inf (input nans should already be well-covered):
        // clearing the significand of a NaN leaves sign + all-ones exponent.
        if (is_nan_u(x)) {
            x &= -1u << 23;
        }
        if (is_nan_u(y)) {
            y &= -1u << 23;
        }

        // Change to "#if 0" to generate multiplication vectors instead
#if 1
        const uint32_t expect = model_fadd(x, y);
#else
        const uint32_t expect = model_fmul(x, y);
#endif
        printf("{0x%08xu, 0x%08xu, 0x%08xu},\n", x, y, expect);
    }
}
|
||||
209
test/pico_float_test/pico_float_test_hazard3.c
Normal file
209
test/pico_float_test/pico_float_test_hazard3.c
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
/**
|
||||
* Copyright (c) 2024 Raspberry Pi Ltd.
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "pico/stdlib.h"
|
||||
|
||||
// This test covers the single-precision functions in:
|
||||
//
|
||||
// src/pico_float/float_hazard3_single.S
|
||||
//
|
||||
// It assumes the canonical generated-NaN value and NaN sign rules used by
|
||||
// those functions (which are unspecified by IEEE 754). It does not cover
|
||||
// libgcc/libm functions from outside of that source file.
|
||||
|
||||
// One test vector: two operands and the expected result, all stored as raw
// single-precision bit patterns.
typedef struct {
    uint32_t x;       // first operand
    uint32_t y;       // second operand
    uint32_t expect;  // result the function under test must return
} test_t;
|
||||
|
||||
// Hand-written addition vectors: {x, y, expected x + y} as raw bit patterns.
test_t add_directed_tests[] = {
    // ---- Simple exact sums ----
    // 1 + 1 = 2
    {0x3f800000u, 0x3f800000u, 0x40000000u},
    // 2 + 1 = 3
    {0x40000000u, 0x3f800000u, 0x40400000u},
    // 1 + 2 = 3
    {0x3f800000u, 0x40000000u, 0x40400000u},
    // 1 + -1 = +0 (exact cancellation)
    {0x3f800000u, 0xbf800000u, 0x00000000u},
    // -1 + 1 = +0 (exact cancellation)
    {0xbf800000u, 0x3f800000u, 0x00000000u},
    // 1 + 2^-32 (far below 1 ulp) = 1
    {0x3f800000u, 0x2f800000u, 0x3f800000u},
    // 2^-32 (far below 1 ulp) + 1 = 1
    {0x2f800000u, 0x3f800000u, 0x3f800000u},
    // -1 + 1.25 = 0.25
    {0xbf800000u, 0x3fa00000u, 0x3e800000u},

    // ---- Overflow boundary ----
    // max normal + 0.5 ulp = +inf
    {0x7f7fffffu, 0x73000000u, 0x7f800000u},
    // max normal + max normal = +inf
    {0x7f7fffffu, 0x7f7fffffu, 0x7f800000u},
    // -max normal - 0.5 ulp = -inf
    {0xff7fffffu, 0xf3000000u, 0xff800000u},
    // -max normal + -max normal = -inf
    {0xff7fffffu, 0xff7fffffu, 0xff800000u},
    // max normal + just under 0.5 ulp = max normal
    {0x7f7fffffu, 0x72ffffffu, 0x7f7fffffu},
    // -max normal - just under 0.5 ulp = -max normal
    {0xff7fffffu, 0xf2ffffffu, 0xff7fffffu},

    // ---- NaN inputs are passed through unchanged ----
    // nan + 0 = same nan
    {0xffff1234u, 0x00000000u, 0xffff1234u},
    // 0 + nan = same nan
    {0x00000000u, 0xffff1234u, 0xffff1234u},
    // nan + 1 = same nan
    {0xffff1234u, 0x3f800000u, 0xffff1234u},
    // 1 + nan = same nan
    {0x3f800000u, 0xffff1234u, 0xffff1234u},
    // nan + inf = same nan
    {0xffff1234u, 0x7f800000u, 0xffff1234u},
    // inf + nan = same nan
    {0x7f800000u, 0xffff1234u, 0xffff1234u},

    // ---- Infinities ----
    // inf + inf = inf
    {0x7f800000u, 0x7f800000u, 0x7f800000u},
    // -inf + -inf = -inf
    {0xff800000u, 0xff800000u, 0xff800000u},
    // inf + -inf = nan (all-ones is our canonical cheap nan)
    {0x7f800000u, 0xff800000u, 0xffffffffu},
    // -inf + inf = nan
    {0xff800000u, 0x7f800000u, 0xffffffffu},

    // ---- Subnormal inputs are treated as zero ----
    // subnormal + subnormal = exactly 0
    {0x007fffffu, 0x007fffffu, 0x00000000u},
    // -subnormal + -subnormal = exactly -0
    {0x807fffffu, 0x807fffffu, 0x80000000u},

    // ---- Ties round to the even significand ----
    // Even + 0.5 ulp: round down
    {0x3f800002u, 0x33800000u, 0x3f800002u},
    // Even - 0.5 ulp: round up
    {0x3f800002u, 0xb3800000u, 0x3f800002u},
    // Odd + 0.5 ulp: round up
    {0x3f800001u, 0x33800000u, 0x3f800002u},
    // Odd - 0.5 ulp: round down
    {0x3f800001u, 0xb3800000u, 0x3f800000u},
    // All-zeroes significand - 0.5 ulp: no rounding (exact)
    {0x3f800000u, 0xb3800000u, 0x3f7fffffu},

    // ---- Subnormal results are flushed to zero ----
    // Very subnormal difference of normals: flushed to zero
    {0x03800000u, 0x837fffffu, 0x00000000u},
    // Barely subnormal difference of normals: also flushed
    // (unflushed result is 2^(emin-1))
    {0x03800000u, 0x837e0000u, 0x00000000u},
};
|
||||
|
||||
// Hand-written multiplication vectors: {x, y, expected x * y} as raw bit
// patterns.
test_t mul_directed_tests[] = {
    // ---- Signs and simple exact products ----
    // 1 * 1 = 1
    {0x3f800000u, 0x3f800000u, 0x3f800000u},
    // 1 * -1 = -1
    {0x3f800000u, 0xbf800000u, 0xbf800000u},
    // -1 * 1 = -1
    {0xbf800000u, 0x3f800000u, 0xbf800000u},
    // -1 * -1 = 1
    {0xbf800000u, 0xbf800000u, 0x3f800000u},
    // -0 * 0 = -0
    {0x80000000u, 0x00000000u, 0x80000000u},
    // 0 * -0 = -0
    {0x00000000u, 0x80000000u, 0x80000000u},
    // 1 * 2 = 2
    {0x3f800000u, 0x40000000u, 0x40000000u},
    // 2 * 1 = 2
    {0x40000000u, 0x3f800000u, 0x40000000u},

    // ---- Infinities ----
    // inf * inf = inf
    {0x7f800000u, 0x7f800000u, 0x7f800000u},
    // inf * -inf = -inf
    {0x7f800000u, 0xff800000u, 0xff800000u},
    // inf * 0 = nan (all-ones pattern)
    {0x7f800000u, 0x00000000u, 0xffffffffu},
    // 0 * inf = nan (all-ones pattern)
    {0x00000000u, 0x7f800000u, 0xffffffffu},
    // 1 * -inf = -inf
    {0x3f800000u, 0xff800000u, 0xff800000u},
    // -inf * 1 = -inf
    {0xff800000u, 0x3f800000u, 0xff800000u},
    // -1 * inf = -inf
    {0xbf800000u, 0x7f800000u, 0xff800000u},
    // inf * -1 = -inf
    {0x7f800000u, 0xbf800000u, 0xff800000u},

    // ---- Subnormal inputs are treated as zero ----
    // 1 * nonzero subnormal = exactly 0
    {0x3f800000u, 0x007fffffu, 0x00000000u},
    // nonzero subnormal * -1 = exactly -0
    {0x007fffffu, 0xbf800000u, 0x80000000u},

    // ---- NaN inputs are passed through unchanged ----
    // nan * 0 = same nan
    {0xffff1234u, 0x00000000u, 0xffff1234u},
    // 0 * nan = same nan
    {0x00000000u, 0xffff1234u, 0xffff1234u},
    // nan * 1 = same nan
    {0xffff1234u, 0x3f800000u, 0xffff1234u},
    // 1 * nan = same nan
    {0x3f800000u, 0xffff1234u, 0xffff1234u},
    // nan * inf = same nan
    {0xffff1234u, 0x7f800000u, 0xffff1234u},
    // inf * nan = same nan
    {0x7f800000u, 0xffff1234u, 0xffff1234u},

    // ---- Rounding of inexact products ----
    // (2 - 2^-23) * (2 - 2^-23) = 4 - 2^-21 + 2^-46, rounds to 4 - 2^-21
    {0x3fffffffu, 0x3fffffffu, 0x407ffffeu},
    // -(2 - 2^-23) * (1 + 2^-23) = -(2 + 2^-23 - 2^-46), rounds to -2
    {0xbfffffffu, 0x3f800001u, 0xc0000000u},
    // 1.666... * 1.333... = 2.222...
    {0x3fd55555u, 0x3faaaaaau, 0x400e38e3u},

    // ---- Subnormal results are flushed to zero ----
    // (1.25 x 2^-63) * (1.25 x 2^-64) = 0
    // (normal inputs with subnormal output, and we claim to be FTZ)
    {0x20200000u, 0x1fa00000u, 0x00000000u},
};
|
||||
|
||||
#define N_RANDOM_TESTS 1000
|
||||
extern test_t add_random_tests[N_RANDOM_TESTS];
|
||||
extern test_t mul_random_tests[N_RANDOM_TESTS];
|
||||
|
||||
uint32_t __addsf3(uint32_t x, uint32_t y);
|
||||
uint32_t __mulsf3(uint32_t x, uint32_t y);
|
||||
|
||||
/*
 * Run a table of test vectors through a two-operand function and report
 * mismatches.
 *
 * tests    array of vectors to run
 * n_tests  number of entries in tests
 * op_str   operator symbol printed between the operands in failure messages
 * func     function under test; takes and returns raw 32-bit patterns
 *
 * Prints one line per failing vector (operands, expected and actual values
 * in hex) followed by a "Passed: a / b" summary. Returns the number of
 * vectors whose result differed from the expected value.
 */
int run_tests(test_t *tests, int n_tests, const char *op_str, uint32_t (*func)(uint32_t, uint32_t)) {
    int failed = 0;
    for (int i = 0; i < n_tests; ++i) {
        const test_t *vec = &tests[i];
        const uint32_t actual = func(vec->x, vec->y);
        if (actual != vec->expect) {
            // %x requires unsigned int, but uint32_t may be a different type
            // (e.g. unsigned long on some 32-bit embedded toolchains), so
            // convert explicitly to keep format and arguments in agreement.
            printf("%08x %s %08x -> %08x FAIL: got %08x\n",
                   (unsigned int)vec->x, op_str, (unsigned int)vec->y,
                   (unsigned int)vec->expect, (unsigned int)actual);
            ++failed;
        }
    }
    printf("Passed: %d / %d\n", n_tests - failed, n_tests);
    return failed;
}
|
||||
|
||||
int main() {
|
||||
stdio_init_all();
|
||||
int failed = 0;
|
||||
sleep_ms(3000);
|
||||
printf("Testing: __addsf3 (directed tests)\n");
|
||||
failed += run_tests(add_directed_tests, count_of(add_directed_tests), "+", __addsf3);
|
||||
printf("Testing: __mulsf3 (directed tests)\n");
|
||||
failed += run_tests(mul_directed_tests, count_of(mul_directed_tests), "*", __mulsf3);
|
||||
if (failed) {
|
||||
printf("Skipping random tests due to %d test failures\n", failed);
|
||||
goto done;
|
||||
}
|
||||
printf("Testing: __addsf3 (random tests)\n");
|
||||
failed += run_tests(add_random_tests, N_RANDOM_TESTS, "+", __addsf3);
|
||||
printf("Testing: __mulsf3 (random tests)\n");
|
||||
failed += run_tests(mul_random_tests, N_RANDOM_TESTS, "*", __mulsf3);
|
||||
|
||||
printf("%d tests failed.\n", failed);
|
||||
if (failed == 0) {
|
||||
printf("Well done, you can relax now\n");
|
||||
}
|
||||
done:
|
||||
while (true) {asm volatile ("wfi\n");} // keep USB stdout alive
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Generated using the FPU on my machine (Zen 4) plus FTZ on inputs/outputs
|
||||
// See hazard3_test_gen.c
|
||||
// Random addition vectors; the initialiser rows are pulled in from a
// separate include file.
test_t add_random_tests[N_RANDOM_TESTS] = {
#include "vectors/hazard3_addsf.inc"
};
|
||||
|
||||
// Random multiplication vectors; the initialiser rows are pulled in from a
// separate include file.
test_t mul_random_tests[N_RANDOM_TESTS] = {
#include "vectors/hazard3_mulsf.inc"
};
|
||||
1000
test/pico_float_test/vectors/hazard3_addsf.inc
Normal file
1000
test/pico_float_test/vectors/hazard3_addsf.inc
Normal file
File diff suppressed because it is too large
Load diff
1000
test/pico_float_test/vectors/hazard3_mulsf.inc
Normal file
1000
test/pico_float_test/vectors/hazard3_mulsf.inc
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue