diff --git a/src/rp2040/pico_platform/include/pico/platform/cpu_regs.h b/src/rp2040/pico_platform/include/pico/platform/cpu_regs.h
index 21ff95af..a51ae84a 100644
--- a/src/rp2040/pico_platform/include/pico/platform/cpu_regs.h
+++ b/src/rp2040/pico_platform/include/pico/platform/cpu_regs.h
@@ -23,6 +23,7 @@
 #define arm_cpu_hw m0plus_hw
 #include "hardware/structs/nvic.h"
 #include "hardware/structs/scb.h"
+#include "hardware/structs/systick.h"
 #endif
 
 #endif
\ No newline at end of file
diff --git a/src/rp2350/pico_platform/include/pico/platform/cpu_regs.h b/src/rp2350/pico_platform/include/pico/platform/cpu_regs.h
index b79127c7..87125b56 100644
--- a/src/rp2350/pico_platform/include/pico/platform/cpu_regs.h
+++ b/src/rp2350/pico_platform/include/pico/platform/cpu_regs.h
@@ -25,6 +25,7 @@
 #define arm_cpu_hw m33_hw
 #include "hardware/structs/nvic.h"
 #include "hardware/structs/scb.h"
+#include "hardware/structs/systick.h"
 #endif
 #endif
 #endif
\ No newline at end of file
diff --git a/src/rp2_common/pico_double/CMakeLists.txt b/src/rp2_common/pico_double/CMakeLists.txt
index c038db86..2d5ac90e 100644
--- a/src/rp2_common/pico_double/CMakeLists.txt
+++ b/src/rp2_common/pico_double/CMakeLists.txt
@@ -23,68 +23,84 @@ if (NOT TARGET pico_double)
             $<IF:$<BOOL:$<TARGET_PROPERTY:PICO_TARGET_DOUBLE_IMPL>>,$<TARGET_PROPERTY:PICO_TARGET_DOUBLE_IMPL>,${PICO_DEFAULT_DOUBLE_IMPL}>)
 
     function(wrap_double_functions TARGET)
-        pico_wrap_function(${TARGET} __aeabi_dadd)
-        pico_wrap_function(${TARGET} __aeabi_ddiv)
-        pico_wrap_function(${TARGET} __aeabi_dmul)
-        pico_wrap_function(${TARGET} __aeabi_drsub)
-        pico_wrap_function(${TARGET} __aeabi_dsub)
-        pico_wrap_function(${TARGET} __aeabi_cdcmpeq)
-        pico_wrap_function(${TARGET} __aeabi_cdrcmple)
-        pico_wrap_function(${TARGET} __aeabi_cdcmple)
-        pico_wrap_function(${TARGET} __aeabi_dcmpeq)
-        pico_wrap_function(${TARGET} __aeabi_dcmplt)
-        pico_wrap_function(${TARGET} __aeabi_dcmple)
-        pico_wrap_function(${TARGET} __aeabi_dcmpge)
-        pico_wrap_function(${TARGET} __aeabi_dcmpgt)
-        pico_wrap_function(${TARGET} __aeabi_dcmpun)
-        pico_wrap_function(${TARGET} __aeabi_i2d)
-        pico_wrap_function(${TARGET} __aeabi_l2d)
-        pico_wrap_function(${TARGET} __aeabi_ui2d)
-        pico_wrap_function(${TARGET} __aeabi_ul2d)
-        pico_wrap_function(${TARGET} __aeabi_d2iz)
-        pico_wrap_function(${TARGET} __aeabi_d2lz)
-        pico_wrap_function(${TARGET} __aeabi_d2uiz)
-        pico_wrap_function(${TARGET} __aeabi_d2ulz)
-        pico_wrap_function(${TARGET} __aeabi_d2f)
-        pico_wrap_function(${TARGET} sqrt)
-        pico_wrap_function(${TARGET} cos)
-        pico_wrap_function(${TARGET} sin)
-        pico_wrap_function(${TARGET} tan)
-        pico_wrap_function(${TARGET} atan2)
-        pico_wrap_function(${TARGET} exp)
-        pico_wrap_function(${TARGET} log)
-
-        pico_wrap_function(${TARGET} ldexp)
-        pico_wrap_function(${TARGET} copysign)
-        pico_wrap_function(${TARGET} trunc)
-        pico_wrap_function(${TARGET} floor)
-        pico_wrap_function(${TARGET} ceil)
-        pico_wrap_function(${TARGET} round)
-        pico_wrap_function(${TARGET} sincos) # gnu
-        pico_wrap_function(${TARGET} asin)
-        pico_wrap_function(${TARGET} acos)
-        pico_wrap_function(${TARGET} atan)
-        pico_wrap_function(${TARGET} sinh)
-        pico_wrap_function(${TARGET} cosh)
-        pico_wrap_function(${TARGET} tanh)
-        pico_wrap_function(${TARGET} asinh)
-        pico_wrap_function(${TARGET} acosh)
-        pico_wrap_function(${TARGET} atanh)
-        pico_wrap_function(${TARGET} exp2)
-        pico_wrap_function(${TARGET} log2)
-        pico_wrap_function(${TARGET} exp10)
-        pico_wrap_function(${TARGET} log10)
-        pico_wrap_function(${TARGET} pow)
-        pico_wrap_function(${TARGET} powint) #gnu
-        pico_wrap_function(${TARGET} hypot)
-        pico_wrap_function(${TARGET} cbrt)
-        pico_wrap_function(${TARGET} fmod)
-        pico_wrap_function(${TARGET} drem)
-        pico_wrap_function(${TARGET} remainder)
-        pico_wrap_function(${TARGET} remquo)
-        pico_wrap_function(${TARGET} expm1)
-        pico_wrap_function(${TARGET} log1p)
-        pico_wrap_function(${TARGET} fma)
+        cmake_parse_arguments(WRAP_DOUBLE "NO_AEABI_ARITHMETIC;NO_AEABI_CMP;NO_AEABI_CONV_32;NO_AEABI_CONV_64;NO_AEABI_CONV_FLOAT;NO_SQRT;NO_SCI;NO_SCI_EXTRA" "" "" ${ARGN} )
+        if (NOT WRAP_DOUBLE_NO_AEABI_ARITHMETIC)
+            pico_wrap_function(${TARGET} __aeabi_dadd)
+            pico_wrap_function(${TARGET} __aeabi_ddiv)
+            pico_wrap_function(${TARGET} __aeabi_dmul)
+            pico_wrap_function(${TARGET} __aeabi_drsub)
+            pico_wrap_function(${TARGET} __aeabi_dsub)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_AEABI_CMP)
+            pico_wrap_function(${TARGET} __aeabi_cdcmpeq)
+            pico_wrap_function(${TARGET} __aeabi_cdrcmple)
+            pico_wrap_function(${TARGET} __aeabi_cdcmple)
+            pico_wrap_function(${TARGET} __aeabi_dcmpeq)
+            pico_wrap_function(${TARGET} __aeabi_dcmplt)
+            pico_wrap_function(${TARGET} __aeabi_dcmple)
+            pico_wrap_function(${TARGET} __aeabi_dcmpge)
+            pico_wrap_function(${TARGET} __aeabi_dcmpgt)
+            pico_wrap_function(${TARGET} __aeabi_dcmpun)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_AEABI_CONV_32)
+            pico_wrap_function(${TARGET} __aeabi_i2d)
+            pico_wrap_function(${TARGET} __aeabi_ui2d)
+            pico_wrap_function(${TARGET} __aeabi_d2iz)
+            pico_wrap_function(${TARGET} __aeabi_d2uiz)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_AEABI_CONV_64)
+            pico_wrap_function(${TARGET} __aeabi_l2d)
+            pico_wrap_function(${TARGET} __aeabi_ul2d)
+            pico_wrap_function(${TARGET} __aeabi_d2lz)
+            pico_wrap_function(${TARGET} __aeabi_d2ulz)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_AEABI_CONV_FLOAT)
+                pico_wrap_function(${TARGET} __aeabi_d2f)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_SQRT)
+            pico_wrap_function(${TARGET} sqrt)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_SCI)
+            pico_wrap_function(${TARGET} cos)
+            pico_wrap_function(${TARGET} sin)
+            pico_wrap_function(${TARGET} tan)
+            pico_wrap_function(${TARGET} atan2)
+            pico_wrap_function(${TARGET} exp)
+            pico_wrap_function(${TARGET} log)
+        endif()
+        if (NOT WRAP_DOUBLE_NO_SCI_EXTRA)
+            pico_wrap_function(${TARGET} ldexp)
+            pico_wrap_function(${TARGET} copysign)
+            pico_wrap_function(${TARGET} trunc)
+            pico_wrap_function(${TARGET} floor)
+            pico_wrap_function(${TARGET} ceil)
+            pico_wrap_function(${TARGET} round)
+            pico_wrap_function(${TARGET} sincos) # gnu
+            pico_wrap_function(${TARGET} asin)
+            pico_wrap_function(${TARGET} acos)
+            pico_wrap_function(${TARGET} atan)
+            pico_wrap_function(${TARGET} sinh)
+            pico_wrap_function(${TARGET} cosh)
+            pico_wrap_function(${TARGET} tanh)
+            pico_wrap_function(${TARGET} asinh)
+            pico_wrap_function(${TARGET} acosh)
+            pico_wrap_function(${TARGET} atanh)
+            pico_wrap_function(${TARGET} exp2)
+            pico_wrap_function(${TARGET} log2)
+            pico_wrap_function(${TARGET} exp10)
+            pico_wrap_function(${TARGET} log10)
+            pico_wrap_function(${TARGET} pow)
+            pico_wrap_function(${TARGET} powint) #gnu
+            pico_wrap_function(${TARGET} hypot)
+            pico_wrap_function(${TARGET} cbrt)
+            pico_wrap_function(${TARGET} fmod)
+            pico_wrap_function(${TARGET} drem)
+            pico_wrap_function(${TARGET} remainder)
+            pico_wrap_function(${TARGET} remquo)
+            pico_wrap_function(${TARGET} expm1)
+            pico_wrap_function(${TARGET} log1p)
+            pico_wrap_function(${TARGET} fma)
+        endif()
     endfunction()
 
     pico_add_library(pico_double_pico)
@@ -96,8 +112,8 @@ if (NOT TARGET pico_double)
                 ${CMAKE_CURRENT_LIST_DIR}/double_v1_rom_shim_rp2040.S
         )
         target_link_libraries(pico_double_pico INTERFACE pico_bootrom pico_double_headers hardware_divider)
-        wrap_double_functions(pico_double_pico)
-    elseif(NOT PICO_RISCV)
+        wrap_double_functions(pico_double_pico) # wrap everything
+    elseif(PICO_RP2350 AND NOT PICO_RISCV)
         pico_add_library(pico_double_pico_dcp)
         target_sources(pico_double_pico_dcp INTERFACE
                 ${CMAKE_CURRENT_LIST_DIR}/double_math.c
@@ -107,11 +123,10 @@ if (NOT TARGET pico_double)
                 ${CMAKE_CURRENT_LIST_DIR}/double_conv_m33.S
                 )
         target_link_libraries(pico_double_pico_dcp INTERFACE pico_double_headers)
-        wrap_double_functions(pico_double_pico_dcp)
+        wrap_double_functions(pico_double_pico_dcp) #wrap everything
         target_link_libraries(pico_double_pico INTERFACE pico_double_pico_dcp)
     endif()
 
-
     pico_add_library(pico_double_none)
     target_sources(pico_double_none INTERFACE
             ${CMAKE_CURRENT_LIST_DIR}/double_none.S
diff --git a/src/rp2_common/pico_double/include/pico/double.h b/src/rp2_common/pico_double/include/pico/double.h
index 188c34f7..9afce8bb 100644
--- a/src/rp2_common/pico_double/include/pico/double.h
+++ b/src/rp2_common/pico_double/include/pico/double.h
@@ -84,7 +84,10 @@ extern "C" {
 *
 * - GNU extensions:
 *
-*   powint, sincos
+*   sincos
+*
+* Additional functions on Arm:
+*   powint
 *
 * On Arm, the following additional optimized functions are also provided when using `pico_double_pico`, all of which
 * saturate to the nearest representable value for too large input when converting from floating point types:
@@ -129,9 +132,33 @@ extern "C" {
 * On RISC-V there is no custom double-precision floating point support, so `pico_double_pico` is equivalent to `pico_double_compiler`
 * \endif
 */
+
+// === we always define these
+#define PICO_DOUBLE_HAS_INT32_TO_DOUBLE_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_INT64_TO_DOUBLE_CONVERSIONS 1
+// rounding towards zero
+#define PICO_DOUBLE_HAS_DOUBLE_TO_INT32_Z_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_DOUBLE_TO_INT64_Z_CONVERSIONS 1
+// ===
+
+// PICO_CONFIG: PICO_DOUBLE_IN_RAM, Force placement of SDK provided double-precision floating point into RAM, type=bool, default=0, group=pico_double
 #if !defined(__riscv) || PICO_COMBINED_DOCS
 
 #if PICO_COMBINED_DOCS || !LIB_PICO_DOUBLE_COMPILER
+#define PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS 1
+// rounding towards zero
+#define PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS 1
+
+// rounding towards negative infinity
+#define PICO_DOUBLE_HAS_DOUBLE_TO_INT32_M_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_DOUBLE_TO_INT64_M_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS 1
+#define PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS 1
+
+#define PICO_DOUBLE_HAS_POWINT 1
+
 double int2double(int32_t i);
 double uint2double(uint32_t i);
 double int642double(int64_t i);
@@ -143,8 +170,8 @@ double ufix642double(uint64_t m, int e);
 
 // These methods round towards 0, which IS the C way
 int32_t double2int_z(double f);
-int64_t double2int64_z(double f);
 int32_t double2uint_z(double f);
+int64_t double2int64_z(double f);
 int64_t double2uint64_z(double f);
 int32_t double2fix_z(double f, int e);
 uint32_t double2ufix_z(double f, int e);
@@ -162,18 +189,31 @@ uint32_t double2ufix(double f, int e);
 int64_t double2fix64(double f, int e);
 uint64_t double2ufix64(double f, int e);
 
+double powint(double x, int y);
 #endif
 
 double exp10(double x);
+#if PICO_C_COMPILER_IS_CLANG && !LIB_PICO_DOUBLE_COMPILER
+// clang unhelpfully splits sincos into explicit calls to sin & cos
+extern void WRAPPER_FUNC(sincos)(double x, double *sinx, double *cosx);
+#define sincos(x, sinx, cosx) WRAPPER_FUNC(sincos)(x, sinx, cosx)
+#else
 void sincos(double x, double *sinx, double *cosx);
-double powint(double x, int y);
+#endif
+
 
 #if PICO_RP2350 || PICO_COMBINED_DOCS
+
+#if LIB_PICO_DOUBLE_PICO_DCP
+#define PICO_DOUBLE_HAS_DDIV_FAST 1
+#define PICO_DOUBLE_HAS_SQRT_FAST 1
+#define PICO_DOUBLE_HAS_FMA_FAST 1
 double ddiv_fast(double n, double d);
 double sqrt_fast(double f);
 double fma_fast(double x, double y, double z); // this is not fused
 double mla(double x, double y, double z); // another name for fma_fast
 #endif
+#endif
 
 #endif
 
@@ -188,6 +228,14 @@ static inline int32_t double2int_z(double d) { return (int32_t)d; }
 static inline int64_t double2int64_z(double d) { return (int64_t)d; }
 static inline int32_t double2uint_z(double d) { return (uint32_t)d; }
 static inline int64_t double2uint64_z(double d) { return (uint64_t)d; }
+
+#if __has_builtin(__builtin_powi) // only provide powint in this branch when the compiler reports the builtin
+#define PICO_DOUBLE_HAS_POWINT 1
+static __force_inline double powint(double d, int32_t p) { // d raised to the integer power p, delegated to the compiler builtin
+    return __builtin_powi(d, p);
+}
+#endif
+
 #endif
 
 #ifdef __cplusplus
diff --git a/src/rp2_common/pico_float/CMakeLists.txt b/src/rp2_common/pico_float/CMakeLists.txt
index 28eba6eb..a2474d22 100644
--- a/src/rp2_common/pico_float/CMakeLists.txt
+++ b/src/rp2_common/pico_float/CMakeLists.txt
@@ -174,6 +174,8 @@
                 #NO_SCI_EXTRA       # todo - are our versions better than what GCC proides?
                 NO_FMAF             # direct VFP instruction support
         )
+        # this allows inlining of sqrtf for example - if you really want errno support, use pico_float_compiler
+        target_compile_options(pico_float_pico_vfp INTERFACE -fno-math-errno)
 
         target_link_libraries(pico_float_pico INTERFACE
                 pico_float_pico_vfp)
diff --git a/src/rp2_common/pico_float/include/pico/float.h b/src/rp2_common/pico_float/include/pico/float.h
index 1f5e3a64..1a01df02 100644
--- a/src/rp2_common/pico_float/include/pico/float.h
+++ b/src/rp2_common/pico_float/include/pico/float.h
@@ -93,7 +93,11 @@ extern "C" {
 *
 * - GNU extensions:
 *
-*   powintf, sincosf
+*   sincosf
+*
+* Additional functions on Arm:
+*
+*   powintf
 *
 * On Arm, the following additional optimized functions are also provided (when using `_pico` variants of `pico_float`), all of which
 * saturate to the nearest representable value for too large input when converting from floating point types:
@@ -150,10 +154,32 @@ extern "C" {
 * \endif
 */
 
+// === we always define these
+#define PICO_FLOAT_HAS_INT32_TO_FLOAT_CONVERSIONS 1
+#define PICO_FLOAT_HAS_INT64_TO_FLOAT_CONVERSIONS 1
+// rounding towards zero
+#define PICO_FLOAT_HAS_FLOAT_TO_INT32_Z_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FLOAT_TO_INT64_Z_CONVERSIONS 1
+// ===
+
 // PICO_CONFIG: PICO_FLOAT_IN_RAM, Force placement of SDK provided single-precision floating point into RAM, type=bool, default=0, group=pico_float
 #if !defined(__riscv) || PICO_COMBINED_DOCS
 
 #if PICO_COMBINED_DOCS || !LIB_PICO_FLOAT_COMPILER
+#define PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS 1
+// rounding towards zero
+#define PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS 1
+
+// rounding towards negative infinity
+#define PICO_FLOAT_HAS_FLOAT_TO_INT32_M_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FLOAT_TO_INT64_M_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS 1
+#define PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS 1
+
+#define PICO_FLOAT_HAS_POWINTF 1
+
 #if LIB_PICO_FLOAT_PICO_VFP
 // note these functions do still exist for assembler use, we would just prefer to let the compiler handle it for C/C++ to avoid a call
 static inline float int2float(int32_t i) { return (float)i; }
@@ -164,6 +190,7 @@ float uint2float(uint32_t i);
 #endif
 float int642float(int64_t i);
 float uint642float(uint64_t i);
+
 float fix2float(int32_t m, int e);
 float ufix2float(uint32_t m, int e);
 float fix642float(int64_t m, int e);
@@ -196,6 +223,8 @@ uint32_t float2ufix(float f, int e);
 int64_t float2fix64(float f, int e);
 uint64_t float2ufix64(float f, int e);
 
+float powintf(float x, int y);
+
 #if LIB_PICO_FLOAT_PICO_VFP
 // a bit of a hack to inline VFP fixed point conversion when exponent is constant and in range 1-32
 #define fix2float(m, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _fix2float_inline(m, e) : fix2 ## float(m, e), fix2 ## float(m, e))
@@ -293,10 +322,17 @@ uint64_t float2ufix64(float f, int e);
 #endif
 
 float exp10f(float x);
+#if PICO_C_COMPILER_IS_CLANG && !LIB_PICO_FLOAT_COMPILER
+// clang unhelpfully splits sincosf into explicit calls to sinf & cosf
+extern void WRAPPER_FUNC(sincosf)(float x, float *sinx, float *cosx);
+#define sincosf(x, sinx, cosx) WRAPPER_FUNC(sincosf)(x, sinx, cosx)
+#else
 void sincosf(float x, float *sinx, float *cosx);
-float powintf(float x, int y);
+#endif
 
-#if PICO_RP2350 || PICO_COMBINED_DOCS
+#if (PICO_RP2350 && LIB_PICO_FLOAT_PICO_DCP) || PICO_COMBINED_DOCS
+#define PICO_FLOAT_HAS_FDIV_FAST 1
+#define PICO_FLOAT_HAS_SQRTF_FAST 1
 float fdiv_fast(float n, float d);
 float sqrtf_fast(float f);
 #endif
@@ -315,6 +351,13 @@ static inline int32_t float2int_z(float f) { return (int32_t)f; }
 static inline int64_t float2int64_z(float f) { return (int64_t)f; }
 static inline int32_t float2uint_z(float f) { return (uint32_t)f; }
 static inline int64_t float2uint64_z(float f) { return (uint64_t)f; }
+
+#if __has_builtin(__builtin_powif) // only provide powintf in this branch when the compiler reports the builtin
+#define PICO_FLOAT_HAS_POWINTF 1
+static __force_inline float powintf(float f, int32_t p) { // f raised to the integer power p, delegated to the compiler builtin
+    return __builtin_powif(f, p);
+}
+#endif
 #endif
 
 #ifdef __cplusplus
diff --git a/test/pico_float_test/CMakeLists.txt b/test/pico_float_test/CMakeLists.txt
index 032497f5..a80dfcc6 100644
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@@ -70,6 +70,16 @@ foreach (FLOAT_TYPE IN LISTS FLOAT_TYPES)
         target_compile_options(${PICO_FLOAT_TEST} PRIVATE -fno-strict-float-cast-overflow)
         target_compile_options(custom_float_funcs_test_${FLOAT_TYPE} PRIVATE -fno-strict-float-cast-overflow)
     endif()
+
+    if (NOT PICO_RISCV) # todo need risc-v support too
+        add_executable(float_benchmark_${FLOAT_TYPE} float_benchmark.c) # one benchmark executable per FLOAT_TYPE of the enclosing foreach
+        pico_set_float_implementation(float_benchmark_${FLOAT_TYPE} ${FLOAT_TYPE}) # benchmark the float implementation named by FLOAT_TYPE
+        target_link_libraries(float_benchmark_${FLOAT_TYPE} PRIVATE pico_stdlib m)
+        pico_add_extra_outputs(float_benchmark_${FLOAT_TYPE})
+        target_compile_definitions(float_benchmark_${FLOAT_TYPE} PRIVATE PICO_FLOAT_IN_RAM=1) # PICO_FLOAT_IN_RAM: place the SDK float code in RAM
+        pico_set_printf_implementation(float_benchmark_${FLOAT_TYPE} compiler) # select the "compiler" printf implementation
+        pico_set_binary_type(float_benchmark_${FLOAT_TYPE} copy_to_ram) # build as a copy_to_ram binary
+    endif()
 endforeach ()
 
 foreach (DOUBLE_TYPE IN LISTS DOUBLE_TYPES)
@@ -116,6 +126,16 @@ foreach (DOUBLE_TYPE IN LISTS DOUBLE_TYPES)
             target_compile_options(custom_double_funcs_test_${DOUBLE_TYPE} PRIVATE -fno-strict-float-cast-overflow)
         endif()
     endif()
+
+    if (NOT PICO_RISCV) # todo need risc-v support too
+        add_executable(double_benchmark_${DOUBLE_TYPE} double_benchmark.c) # one benchmark executable per DOUBLE_TYPE of the enclosing foreach
+        pico_set_double_implementation(double_benchmark_${DOUBLE_TYPE} ${DOUBLE_TYPE}) # benchmark the double implementation named by DOUBLE_TYPE
+        target_link_libraries(double_benchmark_${DOUBLE_TYPE} PRIVATE pico_stdlib m)
+        pico_add_extra_outputs(double_benchmark_${DOUBLE_TYPE})
+        target_compile_definitions(double_benchmark_${DOUBLE_TYPE} PRIVATE PICO_DOUBLE_IN_RAM=1) # PICO_DOUBLE_IN_RAM: place the SDK double code in RAM
+        pico_set_printf_implementation(double_benchmark_${DOUBLE_TYPE} compiler) # select the "compiler" printf implementation
+        pico_set_binary_type(double_benchmark_${DOUBLE_TYPE} copy_to_ram) # build as a copy_to_ram binary
+    endif()
 endforeach ()
 
 if (PICO_RP2350 AND NOT PICO_RISCV)
diff --git a/test/pico_float_test/custom_double_funcs_test.c b/test/pico_float_test/custom_double_funcs_test.c
index 928d87a0..db641e14 100644
--- a/test/pico_float_test/custom_double_funcs_test.c
+++ b/test/pico_float_test/custom_double_funcs_test.c
@@ -473,8 +473,8 @@ int test() {
     test_checki(double2int(-2147483648.1), INT32_MIN, "double2int17");
     test_checki(double2int(-21474836480.1), INT32_MIN, "double2int18");
     test_checki(double2int(make_positive_denormal_double()), 0, "double2int19");
-    double double2int20 = double2int(make_negative_denormal_double());
-    if (double2int20 == -1.0) double2int20 = 0; // -1 is a valid answer depending on flush to zero
+    int double2int20 = double2int(make_negative_denormal_double());
+    if (double2int20 == -1) double2int20 = 0; // -1 is a valid answer depending on flush to zero
     test_checki(double2int20, 0, "double2int20");
 
     printf("double2uint\n");
diff --git a/test/pico_float_test/double_benchmark.c b/test/pico_float_test/double_benchmark.c
new file mode 100644
index 00000000..8ba3f1c8
--- /dev/null
+++ b/test/pico_float_test/double_benchmark.c
@@ -0,0 +1,1344 @@
+#include <stdio.h>
+#include <math.h>
+#include "pico/stdlib.h"
+#include "pico/double.h"
+#include "pico/platform/cpu_regs.h"
+
+#if defined(LLVM_LIBC_COMMON_H) && !defined(__LLVM_LIBC__)
+#define __LLVM_LIBC__ 1
+#endif
+
+static void init_systick() { // configure the SysTick registers that the time_* functions sample as a cycle counter
+    systick_hw->csr = 0; // write 0 to the control/status register before touching the reload value
+    systick_hw->rvr = ARM_CPU_PREFIXED(SYST_RVR_RELOAD_BITS); // reload value with every bit of the RELOAD field set
+    systick_hw->csr = ARM_CPU_PREFIXED(SYST_CSR_CLKSOURCE_BITS) | ARM_CPU_PREFIXED(SYST_CSR_ENABLE_BITS); // set the CLKSOURCE and ENABLE bits
+}
+
+// Stop the compiler from constant-folding a hardware base pointer into the
+// pointers to individual registers, in cases where constant folding has
+// produced redundant 32-bit pointer literals that could have been load/store
+// offsets. The pointer is passed through an empty asm statement as a "+r" operand and the macro evaluates to it.
+// (Note typeof(ptr+0) gives non-const, for +r constraint.) E.g. uart_hw_t *uart0 = __get_opaque_ptr(uart0_hw);
+#define __get_opaque_ptr(ptr) ({ \
+    typeof((ptr)+0) __opaque_ptr = (ptr); \
+    asm ("" : "+r"(__opaque_ptr)); \
+    __opaque_ptr; \
+})
+
+static __force_inline uint32_t systick_value() { // read the SysTick current-value register (cvr) once
+    return systick_hw->cvr;
+}
+
+static __force_inline io_ro_32 *systick_value_ptr() { // address of the SysTick cvr register, routed through __get_opaque_ptr
+    return __get_opaque_ptr(&systick_hw->cvr); // callers keep this pointer in a local and dereference it for each timestamp
+}
+
+static int cycle_diff(uint32_t systick1, uint32_t systick2) { // signed value of (systick1 - systick2) within the CURRENT field width, minus 1; systick1 is the earlier reading
+    static_assert(ARM_CPU_PREFIXED(SYST_CVR_CURRENT_LSB) == 0, ""); // the shifts below rely on the CURRENT field starting at bit 0
+    uint32_t shift = 31 - ARM_CPU_PREFIXED(SYST_CVR_CURRENT_MSB); // unused bits above the CURRENT field: the field is MSB + 1 bits wide, so 32 - (MSB + 1)
+    return (((int32_t)((systick1 << shift) - (systick2 << shift))) >> shift) - 1; // -1 since the second systick read costs one
+}
+
+#define timer_func_def(name) static __noinline int __not_in_flash_func(time_##name) // declarator for a static, non-inlined, int-returning time_<name> function wrapped in __not_in_flash_func
+
+static double d_a[] = {1.3, -200.3, 1.6e15, 1e-2}; // sample double inputs; the other 4-element tables are asserted below to match its length
+static double d_b[] = {-121.3, 50.3, 27.9, 1.7e23}; // sample double inputs, same length as d_a
+static double d_c[] = {20.3, -50.3, -3.9e-3, -4.1e7}; // sample double inputs, same length as d_a
+static double d_m1to1[] = {-0.5, .9999, 0.1, -0.999999}; // values strictly between -1 and 1
+
+static int32_t i_pow[] = {3,6,27,-10}; // integer inputs, same length as d_a; presumably exponents for powint - the use is outside this view
+static double d_positive[] = {0.0, 3.7, 1245325., 1e27}; // non-negative values
+static double d_1plus[] = {1.0, 3.7, 1245325., 1e27}; // values >= 1
+
+static double d_smaller[] = {-1000.3, 200.3, 1.6e15}; // each element is less than the d_bigger element at the same index
+static double d_bigger[] = {-121.3, 5000.3, 1.6e16};
+
+static int32_t i_32[]  = { 0, 3, -200, INT32_MIN, INT32_MAX }; // 32-bit integer inputs including both extremes
+static int64_t i_64[]  = { 0, 3, -200, 0x123456789abcll, -0x123456789abcll, INT64_MIN, INT64_MAX }; // 64-bit integer inputs including both extremes
+
+// bits for fixed point conversions
+static int32_t n_32[]  = { 0, 3, -3, 16, -16 };
+
+static_assert(count_of(d_a) == count_of(d_b), ""); // tables that are indexed together must have equal lengths
+static_assert(count_of(d_a) == count_of(d_c), "");
+static_assert(count_of(d_a) == count_of(i_pow), "");
+static_assert(count_of(d_a) == count_of(d_positive), "");
+
+static_assert(count_of(d_smaller) == count_of(d_bigger), "");
+
+static double time_unary_func(int (*timer)(double), double *d, uint count) { // mean of timer(d[k]) over the first count entries
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(d[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_unary_n_func(int (*timer)(double, int32_t), double *d, int32_t *n, uint count) { // mean of timer(d[k], n[k]) over count pairs
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(d[idx], n[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_binary_func(int (*timer)(double,double), double *d1, double *d2, uint count) { // mean of timer(d1[k], d2[k]) over count pairs
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(d1[idx], d2[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_binary_int_func(int (*timer)(double,int32_t), double *d, int32_t *i32, uint count) { // mean of timer(d[k], i32[k]) over count pairs
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(d[idx], i32[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_ternary_func(int (*timer)(double,double,double), double *d1, double *d2, double *d3, uint count) { // mean of timer(d1[k], d2[k], d3[k]) over count triples
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(d1[idx], d2[idx], d3[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_unary_int32_func(int (*timer)(int32_t), int32_t *i32, uint count) { // mean of timer(i32[k]) over the first count entries
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(i32[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_unary_int32_n_func(int (*timer)(int32_t, int32_t), int32_t *i32, int32_t *n, uint count) { // mean of timer(i32[k], n[k]) over count pairs
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(i32[idx], n[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_unary_int64_func(int (*timer)(int64_t), int64_t *i64, uint count) { // mean of timer(i64[k]) over the first count entries
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(i64[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+static double time_unary_int64_n_func(int (*timer)(int64_t, int32_t), int64_t *i64, int32_t *n, uint count) { // mean of timer(i64[k], n[k]) over count pairs
+    double cycle_sum = 0.0;
+    for (uint idx = 0; idx < count; ++idx) {
+        cycle_sum += (double) timer(i64[idx], n[idx]);
+    }
+    return cycle_sum / count; // average cycles per call
+}
+
+#if !defined(__ARM_FP) || ((__ARM_FP & 8) == 0) // __ARM_FP undefined, or its bit 3 (value 8; the ACLE flag for hardware double precision) is clear
+#  define EMITS_VFP 0
+#else
+#  define EMITS_VFP 1
+#endif
+
+#if defined(__ARM_PCS_VFP) // macro the compiler defines for the VFP (hard-float) procedure call standard
+#  define USING_HARD_FLOAT_ABI 1
+#else
+#  define USING_HARD_FLOAT_ABI 0
+#endif
+
+// #pragma message("__ARM_FP = " __XSTRING(__ARM_FP))
+// #pragma message("__ARM_PCS_VFP = " __XSTRING(__ARM_PCS_VFP))
+// #pragma message("__SOFTFP__ = " __XSTRING(__SOFTFP__))
+// #pragma message("EMITS_VFP = " __XSTRING(EMITS_VFP))
+// #pragma message("USING_HARD_FLOAT_ABI = " __XSTRING(USING_HARD_FLOAT_ABI))
+
+#define LOAD_COST 2 // cycles charged per 32-bit load when correcting measurements (assumed figure)
+#define STORE_COST 2 // cycles charged per 32-bit store when correcting measurements (assumed figure)
+
+#define DOUBLE_INPUT_COST (LOAD_COST * 2) // a double operand is charged as two 32-bit loads
+#define DOUBLE_OUTPUT_COST (STORE_COST * 2) // a double result is charged as two 32-bit stores
+#define INT_INPUT_COST LOAD_COST
+#define INT_OUTPUT_COST STORE_COST
+#define INT64_INPUT_COST (LOAD_COST * 2) // a 64-bit integer operand is charged as two 32-bit loads
+#define INT64_OUTPUT_COST (STORE_COST * 2) // a 64-bit integer result is charged as two 32-bit stores
+#define FLOAT_OUTPUT_COST STORE_COST
+
+timer_func_def(dadd)(volatile double a, volatile double b) { // estimated cycles for one double addition
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the operation
+    volatile double x = a + b; // operation under test; the volatile operands and result are presumably there so it cannot be folded or elided
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the operation
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // remove the charged cost of two double loads and one double store
+}
+
+timer_func_def(dsub)(volatile double a, volatile double b) { // estimated cycles for one double subtraction
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the operation
+    volatile double x = a - b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the operation
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // remove the charged cost of two double loads and one double store
+}
+
+timer_func_def(dmul)(volatile double a, volatile double b) { // estimated cycles for one double multiplication
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the operation
+    volatile double x = a * b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the operation
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // remove the charged cost of two double loads and one double store
+}
+
+timer_func_def(ddiv)(volatile double a, volatile double b) { // estimated cycles for one double division
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the operation
+    volatile double x = a / b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the operation
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // remove the charged cost of two double loads and one double store
+}
+
+timer_func_def(ddiv_fast)(volatile double a, volatile double b) { // estimated cycles for one ddiv_fast() call, or -1 when it is unavailable
+#if PICO_DOUBLE_HAS_DDIV_FAST
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the call
+    volatile double x = ddiv_fast(a,b); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the call
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // remove the charged cost of two double loads and one double store
+#else
+    return -1; // PICO_DOUBLE_HAS_DDIV_FAST is not set for this build
+#endif
+}
+
+timer_func_def(dsqrt)(volatile double a) { // estimated cycles for one sqrt() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the call
+    volatile double x = sqrt(a); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the call
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // remove the charged cost of one double load and one double store
+}
+
+timer_func_def(dsqrt_fast)(volatile double a) { // estimated cycles for one sqrt_fast() call, or -1 when it is unavailable
+#if PICO_DOUBLE_HAS_SQRT_FAST
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the call
+    volatile double x = sqrt_fast(a); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the call
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // remove the charged cost of one double load and one double store
+#else
+    return -1; // PICO_DOUBLE_HAS_SQRT_FAST is not set for this build
+#endif
+}
+
+timer_func_def(dfma)(volatile double a, volatile double b, volatile double c) { // estimated cycles for one fma() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the call
+    volatile double x = fma(a, b, c); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the call
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 3 - DOUBLE_OUTPUT_COST; // remove the charged cost of three double loads and one double store
+}
+
+timer_func_def(dfma_fast)(volatile double a, volatile double b, volatile double c) { // estimated cycles for one fma_fast() call, or -1 when it is unavailable
+#if PICO_DOUBLE_HAS_FMA_FAST
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the call
+    volatile double x = fma_fast(a, b, c); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the call
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 3 - DOUBLE_OUTPUT_COST; // remove the charged cost of three double loads and one double store
+#else
+    return -1; // PICO_DOUBLE_HAS_FMA_FAST is not set for this build
+#endif
+}
+
+
+#define DCMP_OVERHEAD 4 // fixed cycle allowance subtracted from every comparison timing below
+timer_func_def(dcmpeq)(volatile double a, volatile double b) { // estimated cycles for one double == comparison
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the comparison
+    volatile bool v = a == b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the comparison
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(dcmplt)(volatile double a, volatile double b) { // estimated cycles for one double < comparison
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the comparison
+    volatile bool v = a < b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the comparison
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(dcmple)(volatile double a, volatile double b) { // estimated cycles for one double <= comparison
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the comparison
+    volatile bool v = a <= b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the comparison
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(dcmpgt)(volatile double a, volatile double b) { // estimated cycles for one double > comparison
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the comparison
+    volatile bool v = a > b; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the comparison
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(dcmpge)(volatile double a, volatile double b) { // estimated cycles for one double >= comparison
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the comparison
+    volatile bool v = a >= b; // operation under test: must be >= here, plain > is already measured by the dcmpgt timer
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the comparison
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(dcmpun)(volatile double a, volatile double b) { // estimated cycles for one unordered (NaN) check of two doubles
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the check
+    volatile bool v = __builtin_isunordered(a, b); // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the check
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DCMP_OVERHEAD; // remove the charged cost of two double loads and the comparison allowance
+}
+
+timer_func_def(i2d)(volatile int32_t i) { // estimated cycles for a C cast from int32_t to double
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the conversion
+    volatile double x = (double)i; // operation under test
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the conversion
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST; // remove the charged cost of one int load and one double store
+}
+
+timer_func_def(ui2d)(volatile int32_t i) { // estimated cycles for a C cast from uint32_t to double
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // SysTick reading just before the conversion
+    volatile double x = (double)(uint32_t)i; // operation under test; the argument is reinterpreted as unsigned first
+    uint32_t t1 = *systick_ptr; // SysTick reading just after the conversion
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST; // remove the charged cost of one int load and one double store
+}
+
+timer_func_def(int2double)(volatile int32_t i) { // times one int2double() call
+#if PICO_DOUBLE_HAS_INT32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = int2double(i); // one volatile read of i, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_INT32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(uint2double)(volatile int32_t i) { // times one uint2double() call
+#if PICO_DOUBLE_HAS_INT32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = uint2double(i); // i is declared int32_t and passed as-is to the unsigned conversion
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_INT32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(d2i)(volatile double a) { // times one C cast from double to int32_t
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = (int32_t) a; // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(d2ui)(volatile double a) { // times one C cast from double to uint32_t
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = (uint32_t) a; // NOTE(review): a C cast of an out-of-range double is undefined; assumes callers only care about timing
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(double2int_z)(volatile double a) { // times one double2int_z() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2int_z(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2uint_z)(volatile double a) { // times one double2uint_z() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2uint_z(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2int)(volatile double a) { // times one double2int() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2int(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2uint)(volatile double a) { // times one double2uint() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2uint(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(l2d)(volatile int64_t i) { // times one C cast from int64_t to double
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = (double)i; // one volatile read of i, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(ul2d)(volatile int64_t i) { // times one C cast from uint64_t to double
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = (double)(uint64_t)i; // parameter is int64_t; it is cast to uint64_t first so the unsigned conversion is used
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(int642double)(volatile int64_t i) { // times one int642double() call
+#if PICO_DOUBLE_HAS_INT64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = int642double(i); // one volatile read of i, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_INT64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(uint642double)(volatile int64_t i) { // times one uint642double() call
+#if PICO_DOUBLE_HAS_INT64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = uint642double(i); // i is declared int64_t and passed as-is to the unsigned conversion
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_INT64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(d2l)(volatile double a) { // times one C cast from double to int64_t
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = (int64_t) a; // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(d2ul)(volatile double a) { // times one C cast from double to uint64_t
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = (uint64_t) a; // NOTE(review): a C cast of an out-of-range double is undefined; assumes callers only care about timing
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(double2int64_z)(volatile double a) { // times one double2int64_z() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2int64_z(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2uint64_z)(volatile double a) { // times one double2uint64_z() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2uint64_z(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2int64)(volatile double a) { // times one double2int64() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2int64(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2uint64)(volatile double a) { // times one double2uint64() call
+#if PICO_DOUBLE_HAS_DOUBLE_TO_INT64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2uint64(a); // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_INT64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+// ----------------------
+
+timer_func_def(d2f)(volatile double a) { // times one C cast from double to float
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = (float) a; // one volatile read of a, one volatile write of x
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - FLOAT_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+// ----------------------
+
+timer_func_def(fix2double)(volatile int32_t i, volatile int32_t nn) { // times one fix2double(i, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile double x = fix2double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(ufix2double)(volatile int32_t i, volatile int32_t nn) { // times one ufix2double(i, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ufix2double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix_z)(volatile double a, volatile int32_t nn) { // times one double2fix_z(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2fix_z(a, n); // pass the pre-read n: reading volatile nn here would add a load inside the timed window
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix_z)(volatile double a, volatile int32_t nn) { // times one double2ufix_z(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2ufix_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix)(volatile double a, volatile int32_t nn) { // times one double2fix(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2fix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix)(volatile double a, volatile int32_t nn) { // times one double2ufix(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2ufix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(fix642double)(volatile int64_t i, volatile int32_t nn) { // times one fix642double(i, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile double x = fix642double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(ufix642double)(volatile int64_t i, volatile int32_t nn) { // times one ufix642double(i, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ufix642double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix64_z)(volatile double a, volatile int32_t nn) { // times one double2fix64_z(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2fix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix64_z)(volatile double a, volatile int32_t nn) { // times one double2ufix64_z(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2ufix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix64)(volatile double a, volatile int32_t nn) { // times one double2fix64(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2fix64(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix64)(volatile double a, volatile int32_t nn) { // times one double2ufix64(a, n) with a caller-supplied n
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0; the empty asm takes n as an "r" input and clobbers memory
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2ufix64(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+// ----------------------
+
+// ----------------------
+
+timer_func_def(fix2double_c)(volatile int32_t i) { // times one fix2double(i, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile double x = fix2double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(ufix2double_c)(volatile int32_t i) { // times one ufix2double(i, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ufix2double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX32_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix_z_c)(volatile double a) { // times one double2fix_z(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2fix_z(a, n); // use n, like every other *_c timer, rather than repeating the literal
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix_z_c)(volatile double a) { // times one double2ufix_z(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2ufix_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix_c)(volatile double a) { // times one double2fix(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = double2fix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix_c)(volatile double a) { // times one double2ufix(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = double2ufix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX32_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(fix642double_c)(volatile int64_t i) { // times one fix642double(i, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile double x = fix642double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(ufix642double_c)(volatile int64_t i) { // times one ufix642double(i, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ufix642double(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_FIX64_TO_DOUBLE_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix64_z_c)(volatile double a) { // times one double2fix64_z(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2fix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix64_z_c)(volatile double a) { // times one double2ufix64_z(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2ufix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_Z_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2fix64_c)(volatile double a) { // times one double2fix64(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = double2fix64(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+timer_func_def(double2ufix64_c)(volatile double a) { // times one double2ufix64(a, n) with n fixed at 16
+#if PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // second argument is the constant 16 rather than a caller-supplied value
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = double2ufix64(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - INT64_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_DOUBLE_TO_FIX64_M_CONVERSIONS is false: nothing is timed
+#endif
+}
+
+// ----------------------
+
+timer_func_def(dcos)(volatile double a) { // times one cos(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = cos(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dsin)(volatile double a) { // times one sin(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = sin(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dsincos)(volatile double a) { // times one sincos(a, &s, &c) call plus storing both results
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    double s, c; // plain locals that receive the two outputs
+    sincos(a, &s, &c);
+    volatile double x = s; // both outputs are copied to volatiles inside the timed window
+    volatile double y = c;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST * 2; // one input, two outputs
+}
+
+timer_func_def(dtan)(volatile double a) { // times one tan(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = tan(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(datan2)(volatile double a, volatile double b) { // times one atan2(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = atan2(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // two inputs, one output
+}
+
+timer_func_def(dexp)(volatile double a) { // times one exp(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = exp(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dlog)(volatile double a) { // times one log(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = log(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dpowint)(volatile double a, int32_t pow) { // times one powint(a, pow) call; the parameter named pow shadows pow() inside this body
+#if PICO_DOUBLE_HAS_POWINT
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = powint(a, pow); // pow is a plain (non-volatile) parameter, so only a is a volatile read
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#else
+    return -1; // PICO_DOUBLE_HAS_POWINT is false: nothing is timed
+#endif
+}
+
+timer_func_def(dcopysign)(volatile double a, volatile double b) { // times one copysign(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = copysign(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(dtrunc)(volatile double a) { // times one trunc(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = trunc(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dfloor)(volatile double a) { // times one floor(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = floor(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dceil)(volatile double a) { // times one ceil(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ceil(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dround)(volatile double a) { // times one round(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = round(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dfmod)(volatile double a, volatile double b) { // times one fmod(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = fmod(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(ddrem)(volatile double a, volatile double b) { // times one drem(a, b) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably drem is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 23)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = drem(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+#endif
+}
+
+timer_func_def(dremainder)(volatile double a, volatile double b) { // times one remainder(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = remainder(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(dremquo)(volatile double a, volatile double b) { // times one remquo(a, b, &c) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int c; // receives the quotient output of remquo; its value is not used afterwards
+    uint32_t t0 = *systick_ptr;
+    volatile double x = remquo(a, b, &c); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(dexp2)(volatile double a) { // times one exp2(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = exp2(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dlog2)(volatile double a) { // times one log2(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = log2(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dexp10)(volatile double a) { // times one exp10(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = exp10(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dlog10)(volatile double a) { // times one log10(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = log10(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dldexp)(volatile double a, int32_t b) { // times one ldexp(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = ldexp(a, b); // b is a plain (non-volatile) parameter, so only a is a volatile read
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dexpm1)(volatile double a) { // times one expm1(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = expm1(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dlog1p)(volatile double a) { // times one log1p(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = log1p(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dpow)(volatile double a, volatile double b) { // times one pow(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = pow(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(dcbrt)(volatile double a) { // times one cbrt(a) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = cbrt(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST; // cost constants are defined outside this chunk
+}
+
+timer_func_def(dacosh)(volatile double a) { // times one acosh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably acosh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = acosh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(datanh)(volatile double a) { // times one atanh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably atanh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = atanh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dhypot)(volatile double a, volatile double b) { // times one hypot(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = hypot(a, b); // two volatile reads (a, b) and one volatile write (x)
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST; // both operands are read inside the window, so two input costs are removed (as in datan2)
+}
+
+timer_func_def(dasin)(volatile double a) { // times one asin(a) call
+    // Not timed (returns -1) under LLVM libc with clang major version below 23; presumably asin is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) && (__clang_major__ < 23)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = asin(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dacos)(volatile double a) { // times one acos(a) call
+    // Not timed (returns -1) under LLVM libc with clang major version below 23; presumably acos is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) && (__clang_major__ < 23)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = acos(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(datan)(volatile double a) { // times one atan(a) call
+    // Not timed (returns -1) under LLVM libc with clang major version below 23; presumably atan is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) && (__clang_major__ < 23)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = atan(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dsinh)(volatile double a) { // times one sinh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably sinh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = sinh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dcosh)(volatile double a) { // times one cosh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably cosh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = cosh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dtanh)(volatile double a) { // times one tanh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably tanh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = tanh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+timer_func_def(dasinh)(volatile double a) { // times one asinh(a) call
+    // Not timed (returns -1) under LLVM libc with an LLVM compiler; presumably asinh is unavailable there - TODO confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__) // && (__clang_major__ < 21)
+    return -1;
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = asinh(a); // volatile argument and result keep the call from being folded or dropped
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+#endif
+}
+
+int main(void) { // prints the build configuration, then one timing figure per double operation
+    stdio_init_all();
+    init_systick();
+#if PICO_C_COMPILER_IS_CLANG
+    printf("================= Clang - ");
+#else
+    printf("================ GCC - ");
+#endif
+#if LIB_PICO_DOUBLE_COMPILER
+    printf("COMPILER ===\n");
+#elif LIB_PICO_DOUBLE_PICO
+    printf("PICO ===\n");
+#elif LIB_PICO_DOUBLE_PICO_VFP
+    printf("PICO VFP ===\n");
+#elif LIB_PICO_DOUBLE_PICO_DCP
+    printf("PICO DCP ===\n");
+#else
+#error unknown double impl
+#endif
+#if EMITS_VFP
+    printf("hard-double       true\n");
+#else
+    printf("hard-double       false\n");
+#endif
+#if USING_HARD_FLOAT_ABI
+    printf("abi               hard\n");
+#else
+    printf("abi               soft(fp)\n");
+#endif
+#if PICO_RP2040
+    printf("platform          rp2040\n");
+#elif PICO_RP2350
+    printf("platform          rp2350\n");
+#endif
+#if PICO_RISCV
+    printf("arch              risc-v\n");
+#else
+    printf("arch              arm\n");
+#endif
+    printf("----------------- Basic ---\n");
+    printf("add               %g\n", time_binary_func(time_dadd, d_a, d_b, count_of(d_a)));
+    printf("sub               %g\n", time_binary_func(time_dsub, d_a, d_b, count_of(d_a)));
+    printf("mul               %g\n", time_binary_func(time_dmul, d_a, d_b, count_of(d_a)));
+    printf("div               %g\n", time_binary_func(time_ddiv, d_a, d_b, count_of(d_a)));
+    printf("sqrt              %g\n", time_unary_func(time_dsqrt, d_positive, count_of(d_a)));
+    printf("fma               %g\n", time_ternary_func(time_dfma, d_a, d_b, d_c, count_of(d_a)));
+    printf("div_fast          %g\n", time_binary_func(time_ddiv_fast, d_a, d_b, count_of(d_a)));
+    printf("sqrt_fast         %g\n", time_unary_func(time_dsqrt_fast, d_positive, count_of(d_a)));
+    printf("fma_fast          %g\n", time_ternary_func(time_dfma_fast, d_a, d_b, d_c, count_of(d_a)));
+    printf("----------------- Comparison ---\n");
+    // it seems hard to get the compiler to generate calls to these, so they are left disabled
+    // printf("ccmpeq             %g\n", time_binary_func(time_dccmpeq, d_a, d_b, count_of(d_a)));
+    // printf("crcmple            %g\n", time_binary_func(time_dcrcmple, d_a, d_b, count_of(d_a)));
+    // printf("ccmple             %g\n", time_binary_func(time_dccmple, d_a, d_b, count_of(d_a)));
+    printf("cmpeq             %g\n", time_binary_func(time_dcmpeq, d_a, d_a, count_of(d_a)));
+    printf("cmplt             %g\n", time_binary_func(time_dcmplt, d_smaller, d_bigger, count_of(d_smaller)));
+    printf("cmple             %g\n", time_binary_func(time_dcmple, d_smaller, d_bigger, count_of(d_smaller)));
+    printf("cmpge             %g\n", time_binary_func(time_dcmpge, d_bigger, d_smaller, count_of(d_bigger)));
+    printf("cmpgt             %g\n", time_binary_func(time_dcmpgt, d_bigger, d_smaller, count_of(d_bigger)));
+    printf("cmpun             %g\n", time_binary_func(time_dcmpun, d_a, d_a, count_of(d_a)));
+    printf("----------------- 32-bit Conversions ---\n");
+    printf("i2d               %g\n", time_unary_int32_func(time_i2d, i_32, count_of(i_32)));
+    printf("ui2d              %g\n", time_unary_int32_func(time_ui2d, i_32, count_of(i_32)));
+    printf("int2double        %g\n", time_unary_int32_func(time_int2double, i_32, count_of(i_32)));
+    printf("uint2double       %g\n", time_unary_int32_func(time_uint2double, i_32, count_of(i_32)));
+    printf("d2i               %g\n", time_unary_func(time_d2i, d_a, count_of(d_a)));
+    printf("d2ui              %g\n", time_unary_func(time_d2ui, d_a, count_of(d_a)));
+    printf("double2int_z      %g\n", time_unary_func(time_double2int_z, d_a, count_of(d_a)));
+    printf("double2uint_z     %g\n", time_unary_func(time_double2uint_z, d_a, count_of(d_a)));
+    printf("double2int        %g\n", time_unary_func(time_double2int, d_a, count_of(d_a)));
+    printf("double2uint       %g\n", time_unary_func(time_double2uint, d_a, count_of(d_a)));
+    printf("----------------- 64-bit Conversions ---\n");
+    printf("l2d               %g\n", time_unary_int64_func(time_l2d, i_64, count_of(i_64)));
+    printf("ul2d              %g\n", time_unary_int64_func(time_ul2d, i_64, count_of(i_64)));
+    printf("int642double      %g\n", time_unary_int64_func(time_int642double, i_64, count_of(i_64)));
+    printf("uint642double     %g\n", time_unary_int64_func(time_uint642double, i_64, count_of(i_64)));
+    printf("d2l               %g\n", time_unary_func(time_d2l, d_a, count_of(d_a)));
+    printf("d2ul              %g\n", time_unary_func(time_d2ul, d_a, count_of(d_a)));
+    printf("double2int64_z    %g\n", time_unary_func(time_double2int64_z, d_a, count_of(d_a)));
+    printf("double2uint64_z   %g\n", time_unary_func(time_double2uint64_z, d_a, count_of(d_a)));
+    printf("double2int64      %g\n", time_unary_func(time_double2int64, d_a, count_of(d_a)));
+    printf("double2uint64     %g\n", time_unary_func(time_double2uint64, d_a, count_of(d_a)));
+    printf("d2f               %g\n", time_unary_func(time_d2f, d_a, count_of(d_a)));
+    printf("----------------- Fixed-point (Constant Point) 32-bit Conversions ---\n");
+    printf("fix2double_c      %g\n", time_unary_int32_func(time_fix2double_c, i_32, count_of(i_32)));
+    printf("ufix2double_c     %g\n", time_unary_int32_func(time_ufix2double_c, i_32, count_of(i_32)));
+    printf("double2fix_z_c    %g\n", time_unary_func(time_double2fix_z_c, d_a, count_of(d_a)));
+    printf("double2ufix_z_c   %g\n", time_unary_func(time_double2ufix_z_c, d_a, count_of(d_a)));
+    printf("double2fix_c      %g\n", time_unary_func(time_double2fix_c, d_a, count_of(d_a)));
+    printf("double2ufix_c     %g\n", time_unary_func(time_double2ufix_c, d_a, count_of(d_a)));
+    printf("----------------- Fixed-point (Constant Point) 64-bit Conversions ---\n");
+    printf("fix642double_c    %g\n", time_unary_int64_func(time_fix642double_c, i_64, count_of(i_64)));
+    printf("ufix642double_c   %g\n", time_unary_int64_func(time_ufix642double_c, i_64, count_of(i_64)));
+    printf("double2fix64_z_c  %g\n", time_unary_func(time_double2fix64_z_c, d_a, count_of(d_a)));
+    printf("double2ufix64_z_c %g\n", time_unary_func(time_double2ufix64_z_c, d_a, count_of(d_a)));
+    printf("double2fix64_c    %g\n", time_unary_func(time_double2fix64_c, d_a, count_of(d_a)));
+    printf("double2ufix64_c   %g\n", time_unary_func(time_double2ufix64_c, d_a, count_of(d_a)));
+    printf("----------------- Fixed-point (Dynamic Point) 32-bit Conversions ---\n");
+    printf("fix2double        %g\n", time_unary_int32_n_func(time_fix2double, i_32, n_32, count_of(i_32)));
+    printf("ufix2double       %g\n", time_unary_int32_n_func(time_ufix2double, i_32, n_32, count_of(i_32)));
+    printf("double2fix_z      %g\n", time_unary_n_func(time_double2fix_z, d_a, n_32, count_of(d_a)));
+    printf("double2ufix_z     %g\n", time_unary_n_func(time_double2ufix_z, d_a, n_32, count_of(d_a)));
+    printf("double2fix        %g\n", time_unary_n_func(time_double2fix, d_a, n_32, count_of(d_a)));
+    printf("double2ufix       %g\n", time_unary_n_func(time_double2ufix, d_a, n_32, count_of(d_a)));
+    printf("----------------- Fixed-point (Dynamic Point) 64-bit Conversions ---\n"); // i_64/n_32 pairs below are clamped to the shorter table so neither is read out of bounds
+    printf("fix642double      %g\n", time_unary_int64_n_func(time_fix642double, i_64, n_32, count_of(i_64) < count_of(n_32) ? count_of(i_64) : count_of(n_32)));
+    printf("ufix642double     %g\n", time_unary_int64_n_func(time_ufix642double, i_64, n_32, count_of(i_64) < count_of(n_32) ? count_of(i_64) : count_of(n_32)));
+    printf("double2fix64_z    %g\n", time_unary_n_func(time_double2fix64_z, d_a, n_32, count_of(d_a)));
+    printf("double2ufix64_z   %g\n", time_unary_n_func(time_double2ufix64_z, d_a, n_32, count_of(d_a)));
+    printf("double2fix64      %g\n", time_unary_n_func(time_double2fix64, d_a, n_32, count_of(d_a)));
+    printf("double2ufix64     %g\n", time_unary_n_func(time_double2ufix64, d_a, n_32, count_of(d_a)));
+    // d_positive is walked using count_of(d_a) below; this assumes d_positive has at least as many entries as d_a
+    printf("----------------- Trig (basic) ---\n");
+    printf("cos               %g\n", time_unary_func(time_dcos, d_positive, count_of(d_a)));
+    printf("sin               %g\n", time_unary_func(time_dsin, d_positive, count_of(d_a)));
+    printf("tan               %g\n", time_unary_func(time_dtan, d_positive, count_of(d_a)));
+    printf("atan2             %g\n", time_binary_func(time_datan2, d_a, d_b, count_of(d_a)));
+    printf("sincos            %g\n", time_unary_func(time_dsincos, d_positive, count_of(d_a)));
+    printf("----------------- Sci (basic) ---\n");
+    printf("dexp              %g\n", time_unary_func(time_dexp, d_a, count_of(d_a)));
+    printf("dlog              %g\n", time_unary_func(time_dlog, d_positive, count_of(d_positive)));
+    printf("----------------- Misc ---\n");
+    printf("dcopysign         %g\n", time_binary_func(time_dcopysign, d_a, d_b, count_of(d_a)));
+    printf("dtrunc            %g\n", time_unary_func(time_dtrunc, d_a, count_of(d_a)));
+    printf("dfloor            %g\n", time_unary_func(time_dfloor, d_a, count_of(d_a)));
+    printf("dceil             %g\n", time_unary_func(time_dceil, d_a, count_of(d_a)));
+    printf("dround            %g\n", time_unary_func(time_dround, d_a, count_of(d_a)));
+    printf("dfmod             %g\n", time_binary_func(time_dfmod, d_a, d_b, count_of(d_a)));
+    printf("ddrem             %g\n", time_binary_func(time_ddrem, d_a, d_b, count_of(d_a)));
+    printf("dremainder        %g\n", time_binary_func(time_dremainder, d_a, d_b, count_of(d_a)));
+    printf("dremquo           %g\n", time_binary_func(time_dremquo, d_a, d_b, count_of(d_a)));
+    printf("----------------- Sci (extra) ---\n");
+    printf("dexp2             %g\n", time_unary_func(time_dexp2, d_a, count_of(d_a)));
+    printf("dlog2             %g\n", time_unary_func(time_dlog2, d_positive, count_of(d_positive)));
+    printf("dexp10            %g\n", time_unary_func(time_dexp10, d_a, count_of(d_a)));
+    printf("dlog10            %g\n", time_unary_func(time_dlog10, d_positive, count_of(d_positive)));
+    printf("dldexp            %g\n", time_binary_int_func(time_dldexp, d_a, i_pow, count_of(d_a)));
+    printf("dexpm1            %g\n", time_unary_func(time_dexpm1, d_a, count_of(d_a)));
+    printf("dlog1p            %g\n", time_unary_func(time_dlog1p, d_positive, count_of(d_positive)));
+    printf("dpowint           %g\n", time_binary_int_func(time_dpowint, d_a, i_32, count_of(d_a)));
+    printf("dpow              %g\n", time_binary_func(time_dpow, d_a, d_b, count_of(d_a)));
+    printf("dcbrt             %g\n", time_unary_func(time_dcbrt, d_a, count_of(d_a)));
+    printf("----------------- Trig (extra) ---\n");
+    printf("dacos             %g\n", time_unary_func(time_dacos, d_m1to1, count_of(d_m1to1)));
+    printf("dasin             %g\n", time_unary_func(time_dasin, d_m1to1, count_of(d_m1to1)));
+    printf("datan             %g\n", time_unary_func(time_datan, d_a, count_of(d_a)));
+    printf("dcosh             %g\n", time_unary_func(time_dcosh, d_a, count_of(d_a)));
+    printf("dsinh             %g\n", time_unary_func(time_dsinh, d_a, count_of(d_a)));
+    printf("dtanh             %g\n", time_unary_func(time_dtanh, d_a, count_of(d_a)));
+    printf("dacosh            %g\n", time_unary_func(time_dacosh, d_1plus, count_of(d_1plus)));
+    printf("dasinh            %g\n", time_unary_func(time_dasinh, d_1plus, count_of(d_1plus)));
+    printf("datanh            %g\n", time_unary_func(time_datanh, d_m1to1, count_of(d_m1to1)));
+    printf("dhypot            %g\n", time_binary_func(time_dhypot, d_a, d_b, count_of(d_a)));
+    // "PASSED" only marks that the run reached the end; none of the figures printed above is checked
+    printf("PASSED\n");
+    return 0;
+}
\ No newline at end of file
diff --git a/test/pico_float_test/float_benchmark.c b/test/pico_float_test/float_benchmark.c
new file mode 100644
index 00000000..d6e157a6
--- /dev/null
+++ b/test/pico_float_test/float_benchmark.c
@@ -0,0 +1,1286 @@
+#include <stdio.h>
+#include <math.h>
+#include "pico/stdlib.h"
+#include "pico/float.h"
+#include "pico/platform/cpu_regs.h"
+
+#if defined(LLVM_LIBC_COMMON_H) && !defined(__LLVM_LIBC__) // LLVM_LIBC_COMMON_H seen but __LLVM_LIBC__ not yet set
+#define __LLVM_LIBC__ 1 // presumably lets later code test one macro for LLVM libc — confirm against its users
+#endif
+
+static void init_systick() { // (re)starts SysTick so its current-value register can be used as a cycle stamp
+    systick_hw->csr = 0; // clear the control register first, which also clears the enable bit
+    systick_hw->rvr = ARM_CPU_PREFIXED(SYST_RVR_RELOAD_BITS); // every reload bit set: the largest reload value
+    systick_hw->csr = ARM_CPU_PREFIXED(SYST_CSR_CLKSOURCE_BITS) | ARM_CPU_PREFIXED(SYST_CSR_ENABLE_BITS); // enable with the CLKSOURCE bit set
+}
+
+// Pass a pointer through an empty asm statement so the compiler can no longer
+// see its constant value. This stops a hardware base address from being
+// folded into a separate 32-bit pointer literal per register where load/store
+// offsets from one base would do. typeof((ptr)+0) gives a non-const type, as
+// the "+r" constraint needs. E.g. uart_hw_t *uart0 = __get_opaque_ptr(uart0_hw);
+#define __get_opaque_ptr(ptr) ({ \
+    typeof((ptr)+0) __opaque_ptr = (ptr); \
+    asm ("" : "+r"(__opaque_ptr)); \
+    __opaque_ptr; \
+})
+
+static __force_inline uint32_t systick_value() { // returns one read of the SysTick current-value register
+    return systick_hw->cvr;
+}
+
+static __force_inline io_ro_32 *systick_value_ptr() { // returns the address of the SysTick current-value register
+    return __get_opaque_ptr(&systick_hw->cvr); // made opaque so the address is not re-materialised as a literal at each read
+}
+
+static int cycle_diff(uint32_t systick1, uint32_t systick2) { // signed (systick1 - systick2) over the counter field, minus 1; systick1 is the earlier read
+    static_assert(ARM_CPU_PREFIXED(SYST_CVR_CURRENT_LSB) == 0, ""); // the shifts below rely on the field starting at bit 0
+    uint32_t shift = 31 - ARM_CPU_PREFIXED(SYST_CVR_CURRENT_MSB); // MSB is a bit index, so the field is MSB+1 bits wide and 31-MSB bits lie above it
+    return (((int32_t)((systick1 << shift) - (systick2 << shift))) >> shift) - 1; // sign-extend the field-wide difference; -1 since the second systick read costs one
+}
+// Header for one timer: a static, non-inlined function time_<name> returning int, wrapped in __not_in_flash_func.
+#define timer_func_def(name) static __noinline int __not_in_flash_func(time_##name)
+// Input tables for the timers; the averaging helpers below call a timer once per table entry.
+static float f_a[] = {1.3f, -200.3f, 1.6e15f, 1e-2f};
+static float f_b[] = {-121.3f, 50.3f, 27.9f, 1.7e23f};
+static float f_c[] = {20.3f, -50.3f, -3.9e-3f, -4.1e7f};
+static float f_m1to1[] = {-0.5f, .9999f, 0.1f, -0.999999f};
+// integer exponents, then floats that are all >= 0 and all >= 1 respectively
+static int32_t i_pow[] = {3,6,27,-10};
+static float f_positive[] = {0.0f, 3.7f, 1245325.f, 1e27f};
+static float f_1plus[] = {1.0f, 3.7f, 1245325.f, 1e27f};
+// element-wise, f_smaller[k] < f_bigger[k]
+static float f_smaller[] = {-1000.3f, 200.3f, 1.6e15f};
+static float f_bigger[] = {-121.3f, 5000.3f, 1.6e16f};
+// integer inputs: i_32 has 5 entries, i_64 has 7
+static int32_t i_32[]  = { 0, 3, -200, INT32_MIN, INT32_MAX };
+static int64_t i_64[]  = { 0, 3, -200, 0x123456789abcll, -0x123456789abcll, INT64_MIN, INT64_MAX };
+// NOTE(review): n_32 has 5 entries; pairing it with i_64 over count_of(i_64) would read past its end — confirm callers
+// bits for fixed point conversions (some entries are negative)
+static int32_t n_32[]  = { 0, 3, -3, 16, -16 };
+// tables that are walked in lockstep must have equal lengths
+static_assert(count_of(f_a) == count_of(f_b), "");
+static_assert(count_of(f_a) == count_of(f_c), "");
+static_assert(count_of(f_a) == count_of(i_pow), "");
+static_assert(count_of(f_a) == count_of(f_positive), "");
+
+static_assert(count_of(f_smaller) == count_of(f_bigger), "");
+
+static float time_unary_func(int (*timer)(float), float *f, uint count) {
+    // Calls timer on each of the first count entries of f, in order, and
+    // returns the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(f[k]); }
+    return cycle_sum / count;
+}
+
+static float time_unary_n_func(int (*timer)(float, int32_t), float *f, int32_t *n, uint count) {
+    // Calls timer(f[k], n[k]) for k = 0 .. count-1, in order, and returns the
+    // arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(f[k], n[k]); }
+    return cycle_sum / count;
+}
+
+static float time_binary_func(int (*timer)(float,float), float *f1, float *f2, uint count) {
+    // Calls timer(f1[k], f2[k]) for k = 0 .. count-1, in order, and returns
+    // the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(f1[k], f2[k]); }
+    return cycle_sum / count;
+}
+
+static float time_binary_int_func(int (*timer)(float,int32_t), float *f, int32_t *i32, uint count) {
+    // Calls timer(f[k], i32[k]) for k = 0 .. count-1, in order, and returns
+    // the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(f[k], i32[k]); }
+    return cycle_sum / count;
+}
+
+static float time_ternary_func(int (*timer)(float,float,float), float *f1, float *f2, float *f3, uint count) {
+    // Calls timer(f1[k], f2[k], f3[k]) for k = 0 .. count-1, in order, and
+    // returns the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(f1[k], f2[k], f3[k]); }
+    return cycle_sum / count;
+}
+
+static float time_unary_int32_func(int (*timer)(int32_t), int32_t *i32, uint count) {
+    // Calls timer on each of the first count entries of i32, in order, and
+    // returns the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(i32[k]); }
+    return cycle_sum / count;
+}
+
+static float time_unary_int32_n_func(int (*timer)(int32_t, int32_t), int32_t *i32, int32_t *n, uint count) {
+    // Calls timer(i32[k], n[k]) for k = 0 .. count-1, in order, and returns
+    // the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(i32[k], n[k]); }
+    return cycle_sum / count;
+}
+
+static float time_unary_int64_func(int (*timer)(int64_t), int64_t *i64, uint count) {
+    // Calls timer on each of the first count entries of i64, in order, and
+    // returns the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(i64[k]); }
+    return cycle_sum / count;
+}
+
+static float time_unary_int64_n_func(int (*timer)(int64_t, int32_t), int64_t *i64, int32_t *n, uint count) {
+    // Calls timer(i64[k], n[k]) for k = 0 .. count-1, in order, and returns
+    // the arithmetic mean of the int results as a float.
+    float cycle_sum = 0.f;
+    for (uint k = 0; k != count; ++k) { cycle_sum += (float)timer(i64[k], n[k]); }
+    return cycle_sum / count;
+}
+
+#if defined(__ARM_FP) && ((__ARM_FP & 4) != 0) // __ARM_FP is defined and has bit 2 (value 4) set
+#  define EMITS_VFP 1
+#else
+#  define EMITS_VFP 0
+#endif
+
+#if !defined(__ARM_PCS_VFP) // __ARM_PCS_VFP absent: report the soft(fp) ABI
+#  define USING_HARD_FLOAT_ABI 0
+#else
+#  define USING_HARD_FLOAT_ABI 1
+#endif
+
+// #pragma message("__ARM_FP = " __XSTRING(__ARM_FP))
+// #pragma message("__ARM_PCS_VFP = " __XSTRING(__ARM_PCS_VFP))
+// #pragma message("__SOFTFP__ = " __XSTRING(__SOFTFP__))
+// #pragma message("EMITS_VFP = " __XSTRING(EMITS_VFP))
+// #pragma message("USING_HARD_FLOAT_ABI = " __XSTRING(USING_HARD_FLOAT_ABI))
+// Fixed allowances the timers subtract for moving volatile operands and results (presumably in cycles — confirm).
+#define LOAD_COST 2
+#define STORE_COST 2
+// Per-type allowances; 64-bit values (int64 inputs/outputs, double outputs) are charged as two transfers.
+#define FLOAT_INPUT_COST LOAD_COST
+#define FLOAT_OUTPUT_COST STORE_COST
+#define INT_INPUT_COST LOAD_COST
+#define INT_OUTPUT_COST STORE_COST
+#define INT64_INPUT_COST (LOAD_COST * 2)
+#define INT64_OUTPUT_COST (STORE_COST * 2)
+#define DOUBLE_OUTPUT_COST (STORE_COST * 2)
+
+timer_func_def(fadd)(volatile float a, volatile float b) { // time_fadd: SysTick delta across one a + b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = a + b; // volatile operands and result keep the loads, the add and the store between the two reads
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // discount two operand loads and the result store
+}
+
+timer_func_def(fsub)(volatile float a, volatile float b) { // time_fsub: SysTick delta across one a - b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = a - b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // discount two operand loads and the result store
+}
+
+timer_func_def(fmul)(volatile float a, volatile float b) { // time_fmul: SysTick delta across one a * b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = a * b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // discount two operand loads and the result store
+}
+
+timer_func_def(fdiv)(volatile float a, volatile float b) { // time_fdiv: SysTick delta across one a / b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = a / b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // discount two operand loads and the result store
+}
+
+timer_func_def(fdiv_fast)(volatile float a, volatile float b) { // time_fdiv_fast: SysTick delta across one fdiv_fast(a, b)
+#if PICO_FLOAT_HAS_FDIV_FAST
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = fdiv_fast(a,b);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // discount two operand loads and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FDIV_FAST is 0 or undefined
+#endif
+}
+
+timer_func_def(fsqrt)(volatile float a) { // time_fsqrt: SysTick delta across one sqrtf(a)
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = sqrtf(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+}
+
+timer_func_def(fsqrt_fast)(volatile float a) { // time_fsqrt_fast: SysTick delta across one sqrtf_fast(a)
+#if PICO_FLOAT_HAS_SQRTF_FAST
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = sqrtf_fast(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_SQRTF_FAST is 0 or undefined
+#endif
+}
+
+timer_func_def(ffma)(volatile float a, volatile float b, volatile float c) { // time_ffma: SysTick delta across one fmaf(a, b, c)
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = fmaf(a, b, c);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 3 - FLOAT_OUTPUT_COST; // discount three operand loads and the result store
+}
+
+#define DCMP_OVERHEAD 4 // fixed allowance subtracted by the comparison timers (what the 4 covers is not shown here — confirm)
+timer_func_def(fcmpeq)(volatile float a, volatile float b) { // time_fcmpeq: SysTick delta across one a == b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = a == b; // volatile bool keeps the comparison result from being discarded
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(fcmplt)(volatile float a, volatile float b) { // time_fcmplt: SysTick delta across one a < b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = a < b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(fcmple)(volatile float a, volatile float b) { // time_fcmple: SysTick delta across one a <= b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = a <= b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(fcmpgt)(volatile float a, volatile float b) { // time_fcmpgt: SysTick delta across one a > b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = a > b;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(fcmpge)(volatile float a, volatile float b) { // time_fcmpge: SysTick delta across one a >= b
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = a >= b; // must be >= so this measures greater-or-equal rather than repeating time_fcmpgt
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(fcmpun)(volatile float a, volatile float b) { // time_fcmpun: SysTick delta across one __builtin_isunordered(a, b)
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile bool v = __builtin_isunordered(a, b);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - DCMP_OVERHEAD; // discount two operand loads and the comparison allowance
+}
+
+timer_func_def(i2f)(volatile int32_t i) { // time_i2f: SysTick delta across one int32_t -> float cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = (float)i;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+}
+
+timer_func_def(ui2f)(volatile int32_t i) { // time_ui2f: SysTick delta across one uint32_t -> float cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = (float)(uint32_t)i; // the signed input is reinterpreted as unsigned before conversion
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+}
+
+timer_func_def(int2float)(volatile int32_t i) { // time_int2float: SysTick delta across one int2float(i)
+#if PICO_FLOAT_HAS_INT32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = int2float(i);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_INT32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(uint2float)(volatile int32_t i) { // time_uint2float: SysTick delta across one uint2float(i)
+#if PICO_FLOAT_HAS_INT32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = uint2float(i);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_INT32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(f2i)(volatile float a) { // time_f2i: SysTick delta across one float -> int32_t cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = (int32_t) a;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+}
+
+timer_func_def(f2ui)(volatile float a) { // time_f2ui: SysTick delta across one float -> uint32_t cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = (uint32_t) a;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+}
+
+timer_func_def(float2int_z)(volatile float a) { // time_float2int_z: SysTick delta across one float2int_z(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = float2int_z(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2uint_z)(volatile float a) { // time_float2uint_z: SysTick delta across one float2uint_z(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = float2uint_z(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2int)(volatile float a) { // time_float2int: SysTick delta across one float2int(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = float2int(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2uint)(volatile float a) { // time_float2uint: SysTick delta across one float2uint(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = float2uint(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // discount the operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(l2f)(volatile int64_t i) { // time_l2f: SysTick delta across one int64_t -> float cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = (float)i;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // discount the 64-bit operand load and the result store
+}
+
+timer_func_def(ul2f)(volatile int64_t i) { // time_ul2f: SysTick delta across one uint64_t -> float cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = (float)(uint64_t)i; // the signed input is reinterpreted as unsigned before conversion
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // discount the 64-bit operand load and the result store
+}
+
+timer_func_def(int642float)(volatile int64_t i) { // time_int642float: SysTick delta across one int642float(i)
+#if PICO_FLOAT_HAS_INT64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = int642float(i);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // discount the 64-bit operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_INT64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(uint642float)(volatile int64_t i) { // time_uint642float: SysTick delta across one uint642float(i)
+#if PICO_FLOAT_HAS_INT64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile float x = uint642float(i);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // discount the 64-bit operand load and the result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_INT64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(f2l)(volatile float a) { // time_f2l: SysTick delta across one float -> int64_t cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = (int64_t) a;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+}
+
+timer_func_def(f2ul)(volatile float a) { // time_f2ul: SysTick delta across one float -> uint64_t cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = (uint64_t) a;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+}
+
+timer_func_def(float2int64_z)(volatile float a) { // time_float2int64_z: SysTick delta across one float2int64_z(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = float2int64_z(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2uint64_z)(volatile float a) { // time_float2uint64_z: SysTick delta across one float2uint64_z(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = float2uint64_z(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2int64)(volatile float a) { // time_float2int64: SysTick delta across one float2int64(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = float2int64(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2uint64)(volatile float a) { // time_float2uint64: SysTick delta across one float2uint64(a)
+#if PICO_FLOAT_HAS_FLOAT_TO_INT64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = float2uint64(a);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // discount the operand load and the 64-bit result store
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_INT64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+// ----------------------
+
+timer_func_def(f2d)(volatile float a) { // time_f2d: SysTick delta across one float -> double cast
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr;
+    volatile double x = (double) a;
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - DOUBLE_OUTPUT_COST; // discount the operand load and the double result store
+}
+
+// ----------------------
+
+timer_func_def(fix2float)(volatile int32_t i, volatile int32_t nn) { // time_fix2float: SysTick delta across one fix2float(i, n)
+#if PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile float x = fix2float(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // only i is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(ufix2float)(volatile int32_t i, volatile int32_t nn) { // time_ufix2float: SysTick delta across one ufix2float(i, n)
+#if PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile float x = ufix2float(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // only i is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix_z)(volatile float a, volatile int32_t nn) { // time_float2fix_z: SysTick delta across one float2fix_z(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = float2fix_z(a, n); // use the pre-loaded n: reading volatile nn here would put an extra load inside the timed region
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix_z)(volatile float a, volatile int32_t nn) { // time_float2ufix_z: SysTick delta across one float2ufix_z(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = float2ufix_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix)(volatile float a, volatile int32_t nn) { // time_float2fix: SysTick delta across one float2fix(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile int32_t x = float2fix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix)(volatile float a, volatile int32_t nn) { // time_float2ufix: SysTick delta across one float2ufix(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile uint32_t x = float2ufix(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(fix642float)(volatile int64_t i, volatile int32_t nn) { // time_fix642float: SysTick delta across one fix642float(i, n)
+#if PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile float x = fix642float(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // only i is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(ufix642float)(volatile int64_t i, volatile int32_t nn) { // time_ufix642float: SysTick delta across one ufix642float(i, n)
+#if PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile float x = ufix642float(i, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // only i is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix64_z)(volatile float a, volatile int32_t nn) { // time_float2fix64_z: SysTick delta across one float2fix64_z(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = float2fix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix64_z)(volatile float a, volatile int32_t nn) { // time_float2ufix64_z: SysTick delta across one float2ufix64_z(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile uint64_t x = float2ufix64_z(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix64)(volatile float a, volatile int32_t nn) { // time_float2fix64: SysTick delta across one float2fix64(a, n)
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // read nn into a register before timing starts
+    uint32_t t0 = *systick_ptr;
+    volatile int64_t x = float2fix64(a, n);
+    uint32_t t1 = *systick_ptr;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // only a is loaded inside the timed region
+#else
+    return -1; // nothing timed: PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix64)(volatile float a, volatile int32_t nn) { // time a single float2ufix64(a, n) call
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int32_t n = nn; pico_default_asm_volatile( "" : : "r" (n) : "memory"); // nn is read into n before t0, so that volatile load is outside the timed window
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile uint64_t x = float2ufix64(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+// ----------------------
+
+// ----------------------
+
+timer_func_def(fix2float_c)(volatile int32_t i) { // time a single fix2float(i, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = fix2float(i, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(ufix2float_c)(volatile int32_t i) { // time a single ufix2float(i, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = ufix2float(i, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - INT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FIX32_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix_z_c)(volatile float a) { // time a single float2fix_z(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile int32_t x = float2fix_z(a, n); // pass n (not a bare 16) so this matches the other *_c timers
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix_z_c)(volatile float a) { // time a single float2ufix_z(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile uint32_t x = float2ufix_z(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX32_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix_c)(volatile float a) { // time a single float2fix(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile int32_t x = float2fix(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix_c)(volatile float a) { // time a single float2ufix(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile uint32_t x = float2ufix(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX32_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(fix642float_c)(volatile int64_t i) { // time a single fix642float(i, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = fix642float(i, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(ufix642float_c)(volatile int64_t i) { // time a single ufix642float(i, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = ufix642float(i, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - INT64_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FIX64_TO_FLOAT_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix64_z_c)(volatile float a) { // time a single float2fix64_z(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile int64_t x = float2fix64_z(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix64_z_c)(volatile float a) { // time a single float2ufix64_z(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile uint64_t x = float2ufix64_z(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX64_Z_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2fix64_c)(volatile float a) { // time a single float2fix64(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile int64_t x = float2fix64(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+timer_func_def(float2ufix64_c)(volatile float a) { // time a single float2ufix64(a, n) call with n fixed at 16
+#if PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    const int n = 16; pico_default_asm_volatile("" : : "r" (n) : "memory"); // constant n is passed through an empty asm with a memory clobber before timing starts
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile uint64_t x = float2ufix64(a, n);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - INT64_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+#else
+    return -1; // reported when PICO_FLOAT_HAS_FLOAT_TO_FIX64_M_CONVERSIONS is 0 or undefined
+#endif
+}
+
+// ----------------------
+
+timer_func_def(fcos)(volatile float a) { // time a single cosf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = cosf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fsin)(volatile float a) { // time a single sinf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = sinf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fsincos)(volatile float a) { // time a single sincosf() call, including storing both results
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    float s, c; // plain locals that receive the two results
+    sincosf(a, &s, &c);
+    volatile float x = s; // both results are copied to volatiles inside the timed window
+    volatile float y = c;
+    uint32_t t1 = *systick_ptr; // sample taken after both volatile stores
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST * 2; // one input cost, two output costs to match the two volatile stores
+}
+
+timer_func_def(ftan)(volatile float a) { // time a single tanf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = tanf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fatan2)(volatile float a, volatile float b) { // time a single atan2f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = atan2f(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two input costs for the two volatile arguments, one output cost
+}
+
+timer_func_def(fexp)(volatile float a) { // time a single expf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = expf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(flog)(volatile float a) { // time a single logf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = logf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fpowint)(volatile float a, int32_t pow) { // time a single powintf(a, pow) call
+#if PICO_FLOAT_HAS_POWINTF
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = powintf(a, pow);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // pow is not volatile, so only a's input cost is subtracted
+#else
+    return -1; // reported when PICO_FLOAT_HAS_POWINTF is 0 or undefined
+#endif
+}
+
+timer_func_def(fcopysign)(volatile float a, volatile float b) { // time a single copysignf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = copysignf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(ftrunc)(volatile float a) { // time a single truncf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = truncf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(ffloor)(volatile float a) { // time a single floorf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = floorf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fceil)(volatile float a) { // time a single ceilf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = ceilf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fround)(volatile float a) { // time a single roundf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = roundf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(ffmod)(volatile float a, volatile float b) { // time a single fmodf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = fmodf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(fdrem)(volatile float a, volatile float b) { // time a single dremf() call where it can be built
+    // NOTE(review): dremf is not timed on LLVM compilers using LLVM libc; presumably it is missing or broken there -- confirm
+#if defined(__LLVM_LIBC__) && defined(__llvm__)// not sure when this is fixed && (__clang_major__ < 21)
+    return -1; // same "no timing available" value the feature-gated timers return
+#else
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = dremf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+#endif
+}
+
+timer_func_def(fremainder)(volatile float a, volatile float b) { // time a single remainderf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = remainderf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(fremquo)(volatile float a, volatile float b) { // time a single remquof() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    int c; // receives the quotient output of remquof; its value is not used afterwards
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = remquof(a, b, &c);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(fexp2)(volatile float a) { // time a single exp2f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = exp2f(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(flog2)(volatile float a) { // time a single log2f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = log2f(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fexp10)(volatile float a) { // time a single exp10f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = exp10f(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(flog10)(volatile float a) { // time a single log10f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = log10f(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fldexp)(volatile float a, int32_t b) { // time a single ldexpf(a, b) call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = ldexpf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // b is not volatile, so only a's input cost is subtracted
+}
+
+timer_func_def(fexpm1)(volatile float a) { // time a single expm1f() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = expm1f(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(flog1p)(volatile float a) { // time a single log1pf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = log1pf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fpow)(volatile float a, volatile float b) { // time a single powf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = powf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(fcbrt)(volatile float a) { // time a single cbrtf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = cbrtf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(facosh)(volatile float a) { // time a single acoshf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = acoshf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fatanh)(volatile float a) { // time a single atanhf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = atanhf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fhypot)(volatile float a, volatile float b) { // time a single hypotf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = hypotf(a, b);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST; // two volatile inputs are read in the timed window, so two input costs (as in the atan2f timer)
+}
+
+timer_func_def(fasin)(volatile float a) { // time a single asinf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = asinf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(facos)(volatile float a) { // time a single acosf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = acosf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fatan)(volatile float a) { // time a single atanf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = atanf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fsinh)(volatile float a) { // time a single sinhf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = sinhf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fcosh)(volatile float a) { // time a single coshf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = coshf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(ftanh)(volatile float a) { // time a single tanhf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = tanhf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+timer_func_def(fasinh)(volatile float a) { // time a single asinhf() call
+    register io_ro_32 *systick_ptr = systick_value_ptr();
+    uint32_t t0 = *systick_ptr; // sample taken just before the call
+    volatile float x = asinhf(a);
+    uint32_t t1 = *systick_ptr; // sample taken after the result is stored to volatile x
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST; // raw difference minus the cost constants (presumably the volatile load/store overhead -- confirm)
+}
+
+
+int main() { // benchmark driver: prints the build configuration, then one result line per timed operation
+    stdio_init_all();
+    init_systick();
+#if PICO_C_COMPILER_IS_CLANG
+    printf("================= Clang - ");
+#else
+    printf("================ GCC - ");
+#endif
+#if LIB_PICO_FLOAT_COMPILER
+    printf("COMPILER ===\n");
+#elif LIB_PICO_FLOAT_PICO
+    printf("PICO ===\n");
+#elif LIB_PICO_FLOAT_PICO_VFP
+    printf("PICO VFP ===\n");
+#elif LIB_PICO_FLOAT_PICO_DCP
+    printf("PICO DCP ===\n");
+#else
+#error unknown float impl
+#endif
+#if EMITS_VFP
+    printf("hard-float        true\n");
+#else
+    printf("hard-float        false\n");
+#endif
+#if USING_HARD_FLOAT_ABI
+    printf("abi               hard\n");
+#else
+    printf("abi               soft(fp)\n");
+#endif
+#if PICO_RP2040
+    printf("platform          rp2040\n");
+#elif PICO_RP2350
+    printf("platform          rp2350\n");
+#endif
+#if PICO_RISCV
+    printf("arch              risc-v\n");
+#else
+    printf("arch              arm\n");
+#endif
+    printf("----------------- Basic ---\n");
+    printf("fadd              %g\n", time_binary_func(time_fadd, f_a, f_b, count_of(f_a)));
+    printf("fsub              %g\n", time_binary_func(time_fsub, f_a, f_b, count_of(f_a)));
+    printf("fmul              %g\n", time_binary_func(time_fmul, f_a, f_b, count_of(f_a)));
+    printf("fdiv              %g\n", time_binary_func(time_fdiv, f_a, f_b, count_of(f_a)));
+    printf("fsqrt             %g\n", time_unary_func(time_fsqrt, f_positive, count_of(f_positive))); // count taken from the array actually passed
+    printf("ffma              %g\n", time_ternary_func(time_ffma, f_a, f_b, f_c, count_of(f_a)));
+    printf("fdiv_fast         %g\n", time_binary_func(time_fdiv_fast, f_a, f_b, count_of(f_a)));
+    printf("fsqrt_fast        %g\n", time_unary_func(time_fsqrt_fast, f_positive, count_of(f_positive))); // count taken from the array actually passed
+    printf("----------------- Comparison ---\n");
+    // these are hard to make the compiler generate it seems
+    // printf("fccmpeq               %g\n", time_binary_func(time_fccmpeq, f_a, f_b, count_of(f_a)));
+    // printf("fcrcmple              %g\n", time_binary_func(time_fcrcmple, f_a, f_b, count_of(f_a)));
+    // printf("fccmple               %g\n", time_binary_func(time_fccmple, f_a, f_b, count_of(f_a)));
+    printf("fcmpeq            %g\n", time_binary_func(time_fcmpeq, f_a, f_a, count_of(f_a)));
+    printf("fcmplt            %g\n", time_binary_func(time_fcmplt, f_smaller, f_bigger, count_of(f_smaller)));
+    printf("fcmple            %g\n", time_binary_func(time_fcmple, f_smaller, f_bigger, count_of(f_smaller)));
+    printf("fcmpge            %g\n", time_binary_func(time_fcmpge, f_bigger, f_smaller, count_of(f_bigger)));
+    printf("fcmpgt            %g\n", time_binary_func(time_fcmpgt, f_bigger, f_smaller, count_of(f_bigger)));
+    printf("fcmpun            %g\n", time_binary_func(time_fcmpun, f_a, f_a, count_of(f_a)));
+    printf("----------------- 32-bit Conversions ---\n");
+    printf("i2f               %g\n", time_unary_int32_func(time_i2f, i_32, count_of(i_32)));
+    printf("ui2f              %g\n", time_unary_int32_func(time_ui2f, i_32, count_of(i_32)));
+    printf("int2float         %g\n", time_unary_int32_func(time_int2float, i_32, count_of(i_32)));
+    printf("uint2float        %g\n", time_unary_int32_func(time_uint2float, i_32, count_of(i_32)));
+    printf("f2i               %g\n", time_unary_func(time_f2i, f_a, count_of(f_a)));
+    printf("f2ui              %g\n", time_unary_func(time_f2ui, f_a, count_of(f_a)));
+    printf("float2int_z       %g\n", time_unary_func(time_float2int_z, f_a, count_of(f_a)));
+    printf("float2uint_z      %g\n", time_unary_func(time_float2uint_z, f_a, count_of(f_a)));
+    printf("float2int         %g\n", time_unary_func(time_float2int, f_a, count_of(f_a)));
+    printf("float2uint        %g\n", time_unary_func(time_float2uint, f_a, count_of(f_a)));
+    printf("----------------- 64-bit Conversions ---\n");
+    printf("l2f               %g\n", time_unary_int64_func(time_l2f, i_64, count_of(i_64)));
+    printf("ul2f              %g\n", time_unary_int64_func(time_ul2f, i_64, count_of(i_64)));
+    printf("int642float       %g\n", time_unary_int64_func(time_int642float, i_64, count_of(i_64)));
+    printf("uint642float      %g\n", time_unary_int64_func(time_uint642float, i_64, count_of(i_64)));
+    printf("f2l               %g\n", time_unary_func(time_f2l, f_a, count_of(f_a)));
+    printf("f2ul              %g\n", time_unary_func(time_f2ul, f_a, count_of(f_a)));
+    printf("float2int64_z     %g\n", time_unary_func(time_float2int64_z, f_a, count_of(f_a)));
+    printf("float2uint64_z    %g\n", time_unary_func(time_float2uint64_z, f_a, count_of(f_a)));
+    printf("float2int64       %g\n", time_unary_func(time_float2int64, f_a, count_of(f_a)));
+    printf("float2uint64      %g\n", time_unary_func(time_float2uint64, f_a, count_of(f_a)));
+    printf("f2d               %g\n", time_unary_func(time_f2d, f_a, count_of(f_a)));
+    printf("----------------- Fixed-point (Constant Point) 32-bit Conversions ---\n");
+    printf("fix2float_c       %g\n", time_unary_int32_func(time_fix2float_c, i_32, count_of(i_32)));
+    printf("ufix2float_c      %g\n", time_unary_int32_func(time_ufix2float_c, i_32, count_of(i_32)));
+    printf("float2fix_z_c     %g\n", time_unary_func(time_float2fix_z_c, f_a, count_of(f_a)));
+    printf("float2ufix_z_c    %g\n", time_unary_func(time_float2ufix_z_c, f_a, count_of(f_a)));
+    printf("float2fix_c       %g\n", time_unary_func(time_float2fix_c, f_a, count_of(f_a)));
+    printf("float2ufix_c      %g\n", time_unary_func(time_float2ufix_c, f_a, count_of(f_a)));
+    printf("----------------- Fixed-point (Constant Point) 64-bit Conversions ---\n");
+    printf("fix642float_c     %g\n", time_unary_int64_func(time_fix642float_c, i_64, count_of(i_64)));
+    printf("ufix642float_c    %g\n", time_unary_int64_func(time_ufix642float_c, i_64, count_of(i_64)));
+    printf("float2fix64_z_c   %g\n", time_unary_func(time_float2fix64_z_c, f_a, count_of(f_a)));
+    printf("float2ufix64_z_c  %g\n", time_unary_func(time_float2ufix64_z_c, f_a, count_of(f_a)));
+    printf("float2fix64_c     %g\n", time_unary_func(time_float2fix64_c, f_a, count_of(f_a)));
+    printf("float2ufix64_c    %g\n", time_unary_func(time_float2ufix64_c, f_a, count_of(f_a)));
+    printf("----------------- Fixed-point (Dynamic Point) 32-bit Conversions ---\n");
+    printf("fix2float         %g\n", time_unary_int32_n_func(time_fix2float, i_32, n_32, count_of(i_32)));
+    printf("ufix2float        %g\n", time_unary_int32_n_func(time_ufix2float, i_32, n_32, count_of(i_32)));
+    printf("float2fix_z       %g\n", time_unary_n_func(time_float2fix_z, f_a, n_32, count_of(f_a)));
+    printf("float2ufix_z      %g\n", time_unary_n_func(time_float2ufix_z, f_a, n_32, count_of(f_a)));
+    printf("float2fix         %g\n", time_unary_n_func(time_float2fix, f_a, n_32, count_of(f_a)));
+    printf("float2ufix        %g\n", time_unary_n_func(time_float2ufix, f_a, n_32, count_of(f_a)));
+    printf("----------------- Fixed-point (Dynamic Point) 64-bit Conversions ---\n");
+    printf("fix642float       %g\n", time_unary_int64_n_func(time_fix642float, i_64, n_32, count_of(i_64)));
+    printf("ufix642float      %g\n", time_unary_int64_n_func(time_ufix642float, i_64, n_32, count_of(i_64)));
+    printf("float2fix64_z     %g\n", time_unary_n_func(time_float2fix64_z, f_a, n_32, count_of(f_a)));
+    printf("float2ufix64_z    %g\n", time_unary_n_func(time_float2ufix64_z, f_a, n_32, count_of(f_a)));
+    printf("float2fix64       %g\n", time_unary_n_func(time_float2fix64, f_a, n_32, count_of(f_a)));
+    printf("float2ufix64      %g\n", time_unary_n_func(time_float2ufix64, f_a, n_32, count_of(f_a)));
+
+    printf("----------------- Trig (basic) ---\n");
+    printf("fcos              %g\n", time_unary_func(time_fcos, f_a, count_of(f_a)));
+    printf("fsin              %g\n", time_unary_func(time_fsin, f_a, count_of(f_a)));
+    printf("ftan              %g\n", time_unary_func(time_ftan, f_a, count_of(f_a)));
+    printf("fatan2            %g\n", time_binary_func(time_fatan2, f_a, f_b, count_of(f_a)));
+    printf("fsincos           %g\n", time_unary_func(time_fsincos, f_a, count_of(f_a)));
+    printf("----------------- Sci (basic) ---\n");
+    printf("fexp              %g\n", time_unary_func(time_fexp, f_a, count_of(f_a)));
+    printf("flog              %g\n", time_unary_func(time_flog, f_positive, count_of(f_positive)));
+    printf("----------------- Misc ---\n");
+    printf("fcopysign         %g\n", time_binary_func(time_fcopysign, f_a, f_b, count_of(f_a)));
+    printf("ftrunc            %g\n", time_unary_func(time_ftrunc, f_a, count_of(f_a)));
+    printf("ffloor            %g\n", time_unary_func(time_ffloor, f_a, count_of(f_a)));
+    printf("fceil             %g\n", time_unary_func(time_fceil, f_a, count_of(f_a)));
+    printf("fround            %g\n", time_unary_func(time_fround, f_a, count_of(f_a)));
+    printf("ffmod             %g\n", time_binary_func(time_ffmod, f_a, f_b, count_of(f_a)));
+    printf("fdrem             %g\n", time_binary_func(time_fdrem, f_a, f_b, count_of(f_a)));
+    printf("fremainder        %g\n", time_binary_func(time_fremainder, f_a, f_b, count_of(f_a)));
+    printf("fremquo           %g\n", time_binary_func(time_fremquo, f_a, f_b, count_of(f_a)));
+    printf("----------------- Sci (extra) ---\n");
+    printf("fexp2             %g\n", time_unary_func(time_fexp2, f_a, count_of(f_a)));
+    printf("flog2             %g\n", time_unary_func(time_flog2, f_positive, count_of(f_positive)));
+    printf("fexp10            %g\n", time_unary_func(time_fexp10, f_a, count_of(f_a)));
+    printf("flog10            %g\n", time_unary_func(time_flog10, f_positive, count_of(f_positive)));
+    printf("fldexp            %g\n", time_binary_int_func(time_fldexp, f_a, i_pow, count_of(f_a)));
+    printf("fexpm1            %g\n", time_unary_func(time_fexpm1, f_a, count_of(f_a)));
+    printf("flog1p            %g\n", time_unary_func(time_flog1p, f_positive, count_of(f_positive)));
+    printf("fpowint           %g\n", time_binary_int_func(time_fpowint, f_a, i_32, count_of(f_a)));
+    printf("fpow              %g\n", time_binary_func(time_fpow, f_a, f_b, count_of(f_a)));
+    printf("fcbrt             %g\n", time_unary_func(time_fcbrt, f_a, count_of(f_a)));
+    printf("----------------- Trig (extra) ---\n");
+    printf("facos             %g\n", time_unary_func(time_facos, f_m1to1, count_of(f_m1to1)));
+    printf("fasin             %g\n", time_unary_func(time_fasin, f_m1to1, count_of(f_m1to1)));
+    printf("fatan             %g\n", time_unary_func(time_fatan, f_a, count_of(f_a)));
+    printf("fcosh             %g\n", time_unary_func(time_fcosh, f_a, count_of(f_a)));
+    printf("fsinh             %g\n", time_unary_func(time_fsinh, f_a, count_of(f_a)));
+    printf("ftanh             %g\n", time_unary_func(time_ftanh, f_a, count_of(f_a)));
+    printf("facosh            %g\n", time_unary_func(time_facosh, f_1plus, count_of(f_1plus)));
+    printf("fasinh            %g\n", time_unary_func(time_fasinh, f_1plus, count_of(f_1plus)));
+    printf("fatanh            %g\n", time_unary_func(time_fatanh, f_m1to1, count_of(f_m1to1)));
+    printf("fhypot            %g\n", time_binary_func(time_fhypot, f_a, f_b, count_of(f_a)));
+
+    printf("PASSED\n");
+    return 0;
+}
\ No newline at end of file
diff --git a/test/pico_float_test/pico_double_test.c b/test/pico_float_test/pico_double_test.c
index a0959fe5..459a4bda 100644
--- a/test/pico_float_test/pico_double_test.c
+++ b/test/pico_float_test/pico_double_test.c
@@ -329,7 +329,7 @@ double __real_fma(double, double, double);
 
 #define FRAC ((double)(1ull << 50))
 #define allowed_range(a) (fabs(a) / FRAC)
-#define assert_close(a, b) test_assert((fabs(a - b) <= allowed_range(a) || ({ printf("  error: %f != %f\n", a, b); 0; })) || (isinf(a) && isinf(b) && (a < 0) == (b < 0)))
+#define assert_close(a, b) test_assert((fabs((a) - (b)) <= allowed_range(a) || ({ printf("  error: %f != %f\n", a, b); 0; })) || (isinf(a) && isinf(b) && ((a) < 0) == ((b) < 0)))
 #define check1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); test_assert(r == r2); r; })
 #define check2(func,p0,p1) ({ typeof(p0) r = func(p0,p1), r2 = __CONCAT(__real_, func)(p0,p1); test_assert(r == r2); r; })
 #define check_close1(func,p0) ({ typeof(p0) r = func(p0), r2 = __CONCAT(__real_, func)(p0); if (isnan(p0)) assert_nan(r); else assert_close(r, r2); r; })
@@ -367,7 +367,7 @@ int main() {
     for (double x = 0; x < 3; x++) {
         printf("\n ----- %g\n", x);
         printf("SQRT %10.18g\n", check_close1(sqrt, x));
-#if PICO_RP2350 && !LIB_PICO_DOUBLE_COMPILER
+#if PICO_DOUBLE_HAS_SQRT_FAST
         printf("SQRT_FAST %10.18g\n", check_close1(sqrt_fast, x));
 #endif
         printf("COS %10.18g\n", check_close1(cos, x));
@@ -442,10 +442,9 @@ int main() {
     for (double a = -100.0; a < 100.0; a += 53.103) {
         for (double b = -2000000.0; b < 1000000.0; b += 397243.5) {
             for (double c = -700.0; c < 1000.0; c += 287.4) {
-                printf("fma %f %f %f\n", a, b, c);
-                check_close3(fma, a, b, c);
-#if PICO_RP2350 && !LIB_PICO_DOUBLE_COMPILER
-                check_close3(fma_fast, a, b, c);
+                printf("FMA %f\n", check_close3(fma, a, b, c));
+#if PICO_DOUBLE_HAS_FMA_FAST
+                printf("FMAFAST %f\n", check_close3(fma_fast, a, b, c)); // exercise fma_fast here; plain fma is already checked just before this #if
 #endif
             }
         }
@@ -531,7 +530,7 @@ int main() {
                x - 0.377777777777777777777777777777, g, 123456789.0 / x);
         check2(__aeabi_dmul, x, x);
         check2(__aeabi_ddiv, 1.0, x);
-#if PICO_RP2350 && !LIB_PICO_DOUBLE_COMPILER
+#if PICO_DOUBLE_HAS_DDIV_FAST
         check2(ddiv_fast, 1.0, x);
 #endif
     }
diff --git a/test/pico_float_test/pico_float_test.c b/test/pico_float_test/pico_float_test.c
index 5296a793..c38cec97 100644
--- a/test/pico_float_test/pico_float_test.c
+++ b/test/pico_float_test/pico_float_test.c
@@ -311,7 +311,7 @@ int32_t __attribute__((pcs("aapcs"))) __aeabi_f2iz(float);
 int64_t __attribute__((pcs("aapcs"))) __aeabi_f2lz(float);
 float __attribute__((pcs("aapcs"))) __aeabi_fmul(float, float);
 float __attribute__((pcs("aapcs"))) __aeabi_fdiv(float, float);
-#if LIB_PICO_FLOAT_PICO
+#if !LIB_PICO_FLOAT_COMPILER
 #if !LIB_PICO_FLOAT_PICO_VFP
 float __attribute__((pcs("aapcs"))) __real___aeabi_i2f(int);
 float __attribute__((pcs("aapcs"))) __real___aeabi_ui2f(int);
@@ -321,9 +321,9 @@ float __attribute__((pcs("aapcs"))) __real___aeabi_fmul(float, float);
 float __attribute__((pcs("aapcs"))) __real___aeabi_fdiv(float, float);
 int32_t __attribute__((pcs("aapcs"))) __real___aeabi_f2iz(float);
 int64_t __attribute__((pcs("aapcs"))) __real___aeabi_f2lz(float);
+#endif
 float __real_sqrtf(float);
 float __real_fmaf(float, float, float);
-#endif
 float __real_cosf(float);
 float __real_sinf(float);
 float __real_tanf(float);
@@ -421,6 +421,7 @@ int main() {
 #if 1
     for (float x = 0; x < 3; x++) {
         printf("\n ----- %f\n", x);
+        // not replaced in this version
 #if !LIB_PICO_FLOAT_PICO_VFP
         printf("FSQRT %10.18f\n", check_close1(sqrtf, x));
 #endif
@@ -435,9 +436,8 @@ int main() {
         printf("FEXP %10.18f\n", check_close1(expf, x));
         printf("FLN %10.18f\n", check_close1(logf, x));
         printf("POWF %10.18f\n", check_close2(powf, x, x));
-        // todo clang why does this not compile?
-#ifndef __clang__
-        printf("TRUNCF %10.18f\n", check_close1(truncf, x));
+#if !(__clang__ && __PICOLIBC__) // seems to be a bug with wrapping the extern inline trunc
+        printf("TRUNCF %10.18f\n", check1(truncf, x));
 #endif
         printf("LDEXPF %10.18f\n", check_close2(ldexpf, x, x));
         printf("FMODF %10.18f\n", check_close2(fmodf, x, 3.0f));
@@ -561,17 +561,17 @@ int main() {
     }
     for(float x = 4294967296.f * 4294967296.f * 2.f; x>=0.5f; x/=2.f) {
         printf("f2i64 %f->%lld\n", x, (int64_t)x);
-#if PICO_RP2040
         if ((double)x >= (double)INT64_MAX) {
 #if TEST_SATURATION
             test_assert(__aeabi_f2lz(x) == INT64_MAX);
 #endif
         } else {
+#if PICO_RP2040
             check1(__aeabi_f2lz, x);
-        }
 #else
-        check1_vfp_unwrapped(__aeabi_f2lz, x);
+            check1_vfp_unwrapped(__aeabi_f2lz, x);
 #endif
+        }
     }
     for(float x = -4294967296.f * 4294967296.f; x<=-0.5f; x/=2.f) {
         printf("f2i32 %f->%d\n", x, (int32_t)x);
@@ -579,17 +579,17 @@ int main() {
     }
     for(float x = 4294967296.f * 4294967296.f; x>=0.5f; x/=2.f) {
         printf("f2i32 %f->%d\n", x, (int32_t)x);
-#if PICO_RP2040
         if ((double)x >= (double)INT32_MAX) {
 #if TEST_SATURATION
             test_assert(__aeabi_f2iz(x) == INT32_MAX);
 #endif
         } else {
+#if PICO_RP2040
             check1(__aeabi_f2iz, x);
-        }
 #else
-        check1_vfp_unwrapped(__aeabi_f2iz, x);
+            check1_vfp_unwrapped(__aeabi_f2iz, x);
 #endif
+        }
     }
 
     for (float x = 1; x < 11; x += 2) {