diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5411361..477f415 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,29 @@ elseif(MSVC14 OR MSVC14)
 ADD_DEFINITIONS(-D_TIMESPEC_DEFINED)
 endif()
 
+# Enable SIMD instructions on x86
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i386|i686")
+    add_compile_options(-mpclmul -msse4.1)
+endif()
+
+########################################################################
+# CRC SIMD specific setup
+########################################################################
+# Enable crypto extensions on ARM64
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    # Check if using GCC or Clang
+    if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+        # GCC requires explicit crypto extension flag on all ARM64 platforms
+        add_compile_options(-march=armv8-a+crypto)
+    elseif(APPLE)
+        # AppleClang on Apple Silicon - use native CPU features
+        add_compile_options(-mcpu=apple-m1)
+    else()
+        # Other Clang on ARM64 Linux
+        add_compile_options(-march=armv8-a+crypto)
+    endif()
+endif()
+
 ########################################################################
 # Find build dependencies
 ########################################################################
diff --git a/include/crc.h b/include/crc.h
deleted file mode 100644
index bb85571..0000000
--- a/include/crc.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __CRC_H
-#define __CRC_H
-
-/* CRC16 CCIT 'false' table, similar to what can be found in the Kernel */
-static const uint16_t ccitt_hash[] = {
-	0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7,
-	0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef,
-	0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6,
-	0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de,
-	0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485,
-	0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d,
-	0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4,
-	0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc,
-	0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823,
-	0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b,
-	0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12,
-	0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a,
-	0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41,
-	0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49,
-	0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70,
-	0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78,
-	0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f,
-	0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067,
-	0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e,
-	0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256,
-	0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d,
-	0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405,
-	0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c,
-	0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634,
-	0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab,
-	0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3,
-	0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a,
-	0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92,
-	0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9,
-	0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1,
-	0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8,
-	0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0,
-};
-
-uint16_t crc16_ccitt(const uint8_t* buffer, size_t size)
-{
-	uint16_t crc = 0xffff;
-	while (size-- > 0)
-		crc = (crc << 8) ^ ccitt_hash[((crc >> 8) ^ *(buffer++)) & 0x00ff];
-
-	return crc;
-}
-#endif
diff --git a/include/crc16speed.h b/include/crc16speed.h
new file mode 100644
index 0000000..49db646
--- /dev/null
+++ b/include/crc16speed.h
@@ -0,0 +1,20 @@
+#ifndef CRC16SPEED_H
+#define CRC16SPEED_H
+#include "crcspeed.h"
+#include "stdbool.h"
+
+/* Does not require init */
+uint16_t crc16(uint16_t crc, const void *data, const uint64_t len);
+void crc16speed_cache_table(void);
+
+/* All other crc functions here require _init() before usage. */
+bool crc16speed_init(void);
+uint16_t crc16_lookup(uint16_t crc, const void *in_data, const uint64_t len);
+uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+
+bool crc16speed_init_big(void);
+uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l);
+
+bool crc16speed_init_native(void);
+uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l);
+#endif
diff --git a/include/crc_simd.h b/include/crc_simd.h
new file mode 100644
index 0000000..629497d
--- /dev/null
+++ b/include/crc_simd.h
@@ -0,0 +1,28 @@
+/* CRC SIMD Acceleration Header
+ * Copyright (c) 2024
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * SIMD-accelerated CRC computation using:
+ * - PCLMULQDQ on x86/x64 (Intel/AMD)
+ * - PMULL on ARM64/NEON (Apple Silicon, ARM servers)
+ */
+
+#ifndef CRC_SIMD_H
+#define CRC_SIMD_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+/* Feature detection */
+bool crc_simd_available(void);
+
+/* CRC64 SIMD functions - use Jones polynomial 0xad93d23594c935a9 */
+void crc64_simd_init(void);
+uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len);
+
+/* CRC16 SIMD functions - use CRC-16-CCITT polynomial 0x1021 */
+void crc16_simd_init(void);
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len);
+
+#endif /* CRC_SIMD_H */
diff --git a/include/crcspeed.h b/include/crcspeed.h
new file mode 100644
index 0000000..d7ee95e
--- /dev/null
+++ b/include/crcspeed.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#ifndef CRCSPEED_H
+#define CRCSPEED_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
+typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
+
+/* CRC-64 */
+void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
+void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);
+void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]);
+
+uint64_t crcspeed64little(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len);
+uint64_t crcspeed64big(uint64_t table[8][256], uint64_t crc, void *buf,
+                       size_t len);
+uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len);
+
+/* CRC-16 */
+void crcspeed16little_init(crcfn16 fn, uint16_t table[8][256]);
+void crcspeed16big_init(crcfn16 fn, uint16_t table[8][256]);
+void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]);
+
+uint16_t crcspeed16little(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len);
+uint16_t crcspeed16big(uint16_t table[8][256], uint16_t crc, void *buf,
+                       size_t len);
+uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len);
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e92737c..77d2704 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,7 +18,7 @@
 ########################################################################
 # Setup shared library variant
 ########################################################################
-add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c)
+add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c crcspeed.c crc16speed.c crc_simd.c)
 target_link_libraries(hsdaoh ${LIBUSB_LIBRARIES} ${LIBUVC_LIBRARIES} ${THREADS_PTHREADS_LIBRARY})
 target_include_directories(hsdaoh PUBLIC
   $
@@ -86,7 +86,6 @@ set(INSTALL_TARGETS hsdaoh hsdaoh_static hsdaoh_file hsdaoh_tcp hsdaoh_test)
 
 target_link_libraries(hsdaoh_file hsdaoh
     ${LIBFLAC_LIBRARIES}
-#    ${LIBUSB_LIBRARIES}
     ${CMAKE_THREAD_LIBS_INIT}
 )
 target_link_libraries(hsdaoh_tcp
diff --git a/src/crc16speed.c b/src/crc16speed.c
new file mode 100644
index 0000000..0516a5c
--- /dev/null
+++ b/src/crc16speed.c
@@ -0,0 +1,213 @@
+/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include "crc16speed.h"
+
+/* If CRC16SPEED_DUAL is defined, we allow calls to
+ * both _little and _big CRC.
+ * By default, we only allow one endianness to be used
+ * and the first call to either _init function will set the
+ * lookup table endianness for the life of this module.
+ * We don't enable dual lookups by default because
+ * each 8x256 lookup table is 4k. */
+#ifndef CRC16SPEED_DUAL
+static uint16_t crc16_table[8][256] = {{0}};
+static void *crc16_table_little = NULL, *crc16_table_big = NULL;
+static const bool dual = false;
+#else
+static uint16_t crc16_table_little[8][256] = {{0}};
+static uint16_t crc16_table_big[8][256] = {{0}};
+static void *crc16_table = NULL;
+static const bool dual = true;
+#endif
+
+/* value of crc16_table[0][1], architecture dependent. */
+#define LITTLE1 UINT16_C(0x1021)
+#define BIG1 UINT16_C(0x2110)
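+
+/* Sentinel rationale: crc16_table[0][1] holds the CRC of the single byte
+ * 0x01, which for the CRC-16-CCITT polynomial is the polynomial value 0x1021
+ * itself; BIG1 is the byte-swapped counterpart expected in a big-endian
+ * table.  should_init()/check_init() below look at that one entry to decide
+ * whether a table has already been filled in, and with which endianness. */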
+
+/* Define CRC16SPEED_SAFE if you want runtime checks to stop
+ * CRCs from being calculated by uninitialized tables (and also stop tables
+ * from being initialized more than once). */
+#ifdef CRC16SPEED_SAFE
+#define should_init(table, val)                                                \
+    do {                                                                       \
+        if ((table)[0][1] == (val))                                            \
+            return false;                                                      \
+    } while (0)
+#define check_init(table, val)                                                 \
+    do {                                                                       \
+        if ((table)[0][1] != (val))                                            \
+            return false;                                                      \
+    } while (0)
+#else
+#define should_init(a, b)
+#define check_init(a, b)
+#endif
+
+/* This is CRC-16-CCITT (non-reflected poly, non-inverted input/output).
+ * crc16() is only used to bootstrap an initial 256-entry lookup table. */
+#define POLY 0x1021
+uint16_t crc16(uint16_t crc, const void *in_data, uint64_t len) {
+    const uint8_t *data = in_data;
+    for (uint64_t i = 0; i < len; i++) {
+        crc = crc ^ (data[i] << 8);
+        for (int j = 0; j < 8; j++) {
+            if (crc & 0x8000) {
+                crc = (crc << 1) ^ POLY;
+            } else {
+                crc = (crc << 1);
+            }
+        }
+    }
+
+    return crc;
+}
+
+/* Only for testing; doesn't support DUAL */
+uint16_t crc16_lookup(uint16_t crc, const void *in_data, uint64_t len) {
+    const uint8_t *data = in_data;
+    for (uint64_t i = 0; i < len; i++) {
+        crc = (crc << 8) ^ crc16_table[0][((crc >> 8) ^ data[i]) & 0x00ff];
+    }
+
+    return crc;
+}
+
+/* Returns false if CRC16SPEED_SAFE and table already initialized. */
+bool crc16speed_init(void) {
+#ifndef CRC16SPEED_DUAL
+    should_init(crc16_table, LITTLE1);
+#else
+    should_init(crc16_table_little, LITTLE1);
+#endif
+    crcspeed16little_init(crc16, dual ? crc16_table_little : crc16_table);
+    return true;
+}
+
+/* Returns false if CRC16SPEED_SAFE and table already initialized. */
+bool crc16speed_init_big(void) {
+#ifndef CRC16SPEED_DUAL
+    should_init(crc16_table, BIG1);
+#else
+    should_init(crc16_table_big, BIG1);
+#endif
+    crcspeed16big_init(crc16, dual ? crc16_table_big : crc16_table);
+    return true;
+}
+
+uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l) {
+/* Quickly check if CRC table is initialized to little endian correctly. */
+#ifndef CRC16SPEED_DUAL
+    check_init(crc16_table, LITTLE1);
+#else
+    check_init(crc16_table_little, LITTLE1);
+#endif
+    return crcspeed16little(dual ? crc16_table_little : crc16_table, crc,
+                            (void *)s, l);
+}
+
+uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l) {
+/* Quickly check if CRC table is initialized to big endian correctly. */
+#ifndef CRC16SPEED_DUAL
+    check_init(crc16_table, BIG1);
+#else
+    check_init(crc16_table_big, BIG1);
+#endif
+    return crcspeed16big(dual ? crc16_table_big : crc16_table, crc, (void *)s,
+                         l);
+}
+
+bool crc16speed_init_native(void) {
+    const uint64_t n = 1;
+    return *(char *)&n ? crc16speed_init() : crc16speed_init_big();
+}
+
+/* If you are on a platform where endianness can change at runtime, this
+ * will break unless you compile with CRC16SPEED_DUAL and manually run
+ * _init() and _init_big() instead of using _init_native() */
+uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l) {
+    const uint64_t n = 1;
+    return *(char *)&n ? crc16speed(crc, s, l) : crc16speed_big(crc, s, l);
+}
+
+/* Iterate over table to fully load it into a cache near the CPU. */
+void crc16speed_cache_table(void) {
+    volatile uint16_t m;
+    for (int i = 0; i < 8; ++i) {
+        for (int j = 0; j < 256; ++j) {
+#ifndef CRC16SPEED_DUAL
+            m = crc16_table[i][j];
+#else
+            m = crc16_table_little[i][j];
+            m += crc16_table_big[i][j];
+#endif
+        }
+    }
+    (void)m; /* Suppress unused variable warning */
+}
+
+/* Test main */
+#if defined(CRCSPEED_TEST) || defined(CRCSPEED_TEST_MAIN)
+#include <inttypes.h>
+#include <stdio.h>
+
+#define UNUSED(x) (void)(x)
+int crc16Test(int argc, char *argv[]) {
+    UNUSED(argc);
+    UNUSED(argv);
+    crc16speed_init();
+    printf("[calcula]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16(0, "123456789", 9));
+    printf("[lookupt]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16_lookup(0, "123456789", 9));
+    printf("[16speed]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16speed(0, "123456789", 9));
+    char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
+                "do eiusmod tempor incididunt ut labore et dolore magna "
+                "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
+                "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
+                "aute irure dolor in reprehenderit in voluptate velit esse "
+                "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
+                "occaecat cupidatat non proident, sunt in culpa qui officia "
+                "deserunt mollit anim id est laborum.";
+    printf("[calcula]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16(0, li, sizeof(li)));
+    printf("[lookupt]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16_lookup(0, li, sizeof(li)));
+    printf("[16speed]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16speed(0, li, sizeof(li)));
+    return 0;
+}
+
+#endif
+
+#ifdef CRCSPEED_TEST_MAIN
+int main(int argc, char *argv[]) {
+    return crc16Test(argc, argv);
+}
+
+#endif
diff --git a/src/crc_simd.c b/src/crc_simd.c
new file mode 100644
index 0000000..83182b3
--- /dev/null
+++ b/src/crc_simd.c
@@ -0,0 +1,456 @@
+/* CRC SIMD Acceleration Implementation
+ * Copyright (c) 2024
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * SIMD-accelerated CRC computation using PCLMULQDQ (x86) or PMULL (ARM64)
+ * Based on Intel's "Fast CRC Computation for Generic Polynomials Using
+ * PCLMULQDQ Instruction" whitepaper and Intel ISA-L implementation.
+ *
+ * Supports:
+ * - CRC64-Jones: polynomial 0xad93d23594c935a9 (reflected)
+ * - CRC16-CCITT: polynomial 0x1021 (non-reflected)
+ *
+ * References:
+ * - Intel whitepaper: Fast CRC Computation for Generic Polynomials
+ * - Intel ISA-L: https://github.com/intel/isa-l
+ */
+
+#include "crc_simd.h"
+#include <string.h>
+
+/* ============================================================================
+ * Architecture Detection and Feature Testing
+ * ============================================================================
+ */
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+    defined(_M_IX86)
+#define ARCH_X86 1
+#if defined(__GNUC__) || defined(__clang__)
+#include <cpuid.h>
+#endif
+#include <wmmintrin.h>
+#include <smmintrin.h>
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define ARCH_ARM64 1
+#include <arm_neon.h>
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__APPLE__)
+#define HAS_PMULL 1
+#include <arm_acle.h>
+#endif
+#endif
+
+static bool g_simd_checked = false;
+static bool g_simd_available = false;
+
+bool crc_simd_available(void) {
+    if (g_simd_checked) {
+        return g_simd_available;
+    }
+    g_simd_checked = true;
+
+#if defined(ARCH_X86)
+#if defined(__GNUC__) || defined(__clang__)
+    unsigned int eax, ebx, ecx, edx;
+    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+        /* Check for PCLMULQDQ (bit 1 of ECX) and SSE4.1 (bit 19 of ECX) */
+        g_simd_available = (ecx & (1 << 1)) && (ecx & (1 << 19));
+    }
+#endif
+#elif defined(ARCH_ARM64)
+#if defined(__APPLE__)
+    /* Apple Silicon always has PMULL */
+    g_simd_available = true;
+#elif defined(HAS_PMULL)
+    g_simd_available = true;
+#else
+    g_simd_available = false;
+#endif
+#else
+    g_simd_available = false;
+#endif
+
+    return g_simd_available;
+}
+
+/* ============================================================================
+ * CRC64 Jones Polynomial: 0xad93d23594c935a9 (reflected)
+ *
+ * Constants from Intel ISA-L crc64_jones_refl_by8.asm
+ * ============================================================================
+ */
+
+/* 16-byte folding constants */
+#define CRC64_RK1 UINT64_C(0x381d0015c96f4444)
+#define CRC64_RK2 UINT64_C(0xd9d7be7d505da32c)
+
+/* 128-byte folding constants */
+#define CRC64_RK3 UINT64_C(0x768361524d29ed0b)
+#define CRC64_RK4 UINT64_C(0xcc26fa7c57f8054c)
+
+/* 8-way reduction constants */
+#define CRC64_RK9 UINT64_C(0x5bc94ba8e2087636)
+#define CRC64_RK10 UINT64_C(0x6cf09c8f37710b75)
+#define CRC64_RK11 UINT64_C(0x3885fd59e440d95a)
+#define CRC64_RK12 UINT64_C(0xbccba3936411fb7e)
+#define CRC64_RK13 UINT64_C(0xe4dd0d81cbfce585)
+#define CRC64_RK14 UINT64_C(0xb715e37b96ed8633)
+#define CRC64_RK15 UINT64_C(0xf49784a634f014e4)
+#define CRC64_RK16 UINT64_C(0xaf86efb16d9ab4fb)
+#define CRC64_RK17 UINT64_C(0x7b3211a760160db8)
+#define CRC64_RK18 UINT64_C(0xa062b2319d66692f)
+#define CRC64_RK19 UINT64_C(0xef3d1d18ed889ed2)
+#define CRC64_RK20 UINT64_C(0x6ba4d760ab38201e)
+
+/* ============================================================================
+ * CRC16-CCITT Polynomial: 0x1021 (non-reflected)
+ *
+ * Constants computed by gen_constants.c
+ * ============================================================================
+ */
+
+/* 16-byte folding constants */
+#define CRC16_K1 UINT64_C(0x10e2) /* x^144 mod P */
+#define CRC16_K2 UINT64_C(0xaefc) /* x^128 mod P */
+
+/* 64-byte folding constants */
+#define CRC16_K3 UINT64_C(0x78b3) /* x^528 mod P */
+#define CRC16_K4 UINT64_C(0x13fc) /* x^512 mod P */
+
+/* 8-way reduction constants */
+#define CRC16_RK9 UINT64_C(0x4347)  /* x^912 mod P */
+#define CRC16_RK10 UINT64_C(0xcbc5) /* x^896 mod P */
+#define CRC16_RK11 UINT64_C(0x9e3a) /* x^784 mod P */
+#define CRC16_RK12 UINT64_C(0x106f) /* x^768 mod P */
+#define CRC16_RK13 UINT64_C(0x9c1a) /* x^656 mod P */
+#define CRC16_RK14 UINT64_C(0xda35) /* x^640 mod P */
+#define CRC16_RK15 UINT64_C(0x78b3) /* x^528 mod P */
+#define CRC16_RK16 UINT64_C(0x13fc) /* x^512 mod P */
+#define CRC16_RK17 UINT64_C(0xbd64) /* x^400 mod P */
+#define CRC16_RK18 UINT64_C(0xcde2) /* x^384 mod P */
+#define CRC16_RK19 UINT64_C(0x8ddc) /* x^272 mod P */
+#define CRC16_RK20 UINT64_C(0x8e29) /* x^256 mod P */
+
+/* Barrett reduction constants */
+#define CRC16_MU UINT64_C(0x0000f0d3) /* floor(x^32 / P) */
+#define CRC16_POLY UINT64_C(0x1021)
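+
+/* Background on the constants above: in GF(2), appending N zero bits to a
+ * message multiplies its CRC remainder by x^N, so a chunk of the running
+ * remainder can be folded forward over N message bits with one carry-less
+ * multiply by (x^N mod P) plus an XOR into the data it lands on.  The K/RK
+ * values encode those factors for the various folding distances, and
+ * MU = floor(x^32 / P) is the Barrett quotient used to reduce a final 32-bit
+ * remainder to the 16-bit CRC. */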
+
+/* ============================================================================
+ * X86/X64 PCLMULQDQ Implementation
+ * ============================================================================
+ */
+
+#if defined(ARCH_X86)
+
+/* Helper: fold one 128-bit block for CRC64 (reflected) */
+static inline __m128i fold_128(__m128i acc, __m128i data, __m128i k1k2) {
+    __m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x00);
+    __m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x11);
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+/* Helper: byte-swap a 128-bit register (used for CRC16 non-reflected) */
+__attribute__((unused)) static inline __m128i bswap_128(__m128i x) {
+    const __m128i mask =
+        _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    return _mm_shuffle_epi8(x, mask);
+}
+
+/* Helper: fold one 128-bit block for CRC16 (non-reflected)
+ * For non-reflected CRC, we use different PCLMULQDQ immediates
+ * and the constants are positioned in the high 16 bits.
+ * Note: Currently unused; crc16_simd() below folds with crc16_fold_x86() instead.
+ */
+__attribute__((unused)) static inline __m128i
+fold_128_16(__m128i acc, __m128i data, __m128i k1k2) {
+    /* For non-reflected CRC16:
+     * - k1 is in high 64 bits, k2 in low 64 bits
+     * - We use 0x10 and 0x01 immediates (cross-multiply)
+     */
+    __m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x10); /* acc_lo * k1 */
+    __m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x01); /* acc_hi * k2 */
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+void crc64_simd_init(void) {
+    crc_simd_available();
+}
+
+/* ============================================================================
+ * CRC16 SIMD Implementation (x86/x64 with PCLMULQDQ)
+ *
+ * CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
+ *
+ * Non-reflected CRCs process data MSB-first, which requires:
+ * 1. Byte-swapping data to get correct byte order for PCLMULQDQ
+ * 2. Positioning CRC in high bits
+ * 3. Different folding constant interpretation
+ *
+ * This implementation uses the marzooqy/crc-clmul approach:
+ * - Data is byte-swapped (reversed within 128-bit blocks)
+ * - CRC goes in high 64 bits
+ * - For non-reflected: PCLMULQDQ multiplies acc_hi*k_hi and acc_lo*k_lo
+ * - Final reduction via table lookup
+ *
+ * Folding constants for CRC16-CCITT polynomial 0x1021:
+ * - x^192 mod P = 0x650b (for high 64-bit lane)
+ * - x^128 mod P = 0xaefc (for low 64-bit lane)
+ * ============================================================================
+ */
+
+/* CRC16-CCITT folding constants for 128-bit blocks */
+/* x^192 mod P (for high 64-bit lane) */
+#define CRC16_K_HI_X86 UINT64_C(0x650b)
+
+/* x^128 mod P (for low 64-bit lane) */
+#define CRC16_K_LO_X86 UINT64_C(0xaefc)
+
+/* Byte-swap 128-bit register: reverse all 16 bytes */
+static inline __m128i crc16_bswap_128_x86(__m128i x) {
+    const __m128i mask =
+        _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm_shuffle_epi8(x, mask);
+}
+
+/* Fold helper for non-reflected CRC16 on x86
+ * For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
+ * Constants packed as: k_hi in high 64 bits, k_lo in low 64 bits
+ */
+static inline __m128i crc16_fold_x86(__m128i acc, __m128i data, __m128i k) {
+    /* Non-reflected folding:
+     * - acc_hi (high 64) gets multiplied by k_hi (high 64) -> use 0x11
+     * - acc_lo (low 64) gets multiplied by k_lo (low 64) -> use 0x00
+     */
+    __m128i t1 = _mm_clmulepi64_si128(acc, k, 0x11); /* acc_hi * k_hi */
+    __m128i t2 = _mm_clmulepi64_si128(acc, k, 0x00); /* acc_lo * k_lo */
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    const uint8_t *buf = (const uint8_t *)data;
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+
+    /* Use table-based for small buffers - SIMD overhead not worth it */
+    if (!crc_simd_available() || len < 64) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Process 16-byte blocks using SIMD folding */
+    uint64_t blocks = len / 16;
+    if (blocks < 2) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Constants: k_hi in high 64 bits, k_lo in low 64 bits */
+    const __m128i k = _mm_set_epi64x(CRC16_K_HI_X86, CRC16_K_LO_X86);
+
+    /* Load and byte-swap first 16 bytes */
+    __m128i acc = crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
+
+    /* XOR initial CRC into high 16 bits of high 64-bit lane
+     * For non-reflected CRC, the CRC value is conceptually at the MSB end
+     */
+    uint64_t crc_shifted = (uint64_t)crc << 48;
+    __m128i crc_vec = _mm_set_epi64x(crc_shifted, 0);
+    acc = _mm_xor_si128(acc, crc_vec);
+
+    buf += 16;
+    blocks--;
+
+    /* Main folding loop */
+    while (blocks > 0) {
+        /* Load and byte-swap next 16 bytes */
+        __m128i data_vec =
+            crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
+
+        /* Fold accumulator with new data */
+        acc = crc16_fold_x86(acc, data_vec, k);
+
+        buf += 16;
+        blocks--;
+    }
+
+    /* Store folded result (byte-swap back to original order) */
+    __m128i acc_swapped = crc16_bswap_128_x86(acc);
+
+    uint8_t final_buf[32];
+    _mm_storeu_si128((__m128i *)final_buf, acc_swapped);
+
+    /* Process remaining bytes through table lookup */
+    uint64_t remaining = len % 16;
+    if (remaining > 0) {
+        memcpy(final_buf + 16, buf, remaining);
+        return crc16speed(0, final_buf, 16 + remaining);
+    } else {
+        return crc16speed(0, final_buf, 16);
+    }
+}
+
+void crc16_simd_init(void) {
+    crc_simd_available();
+}
+
+#elif defined(ARCH_ARM64) && defined(HAS_PMULL)
+
+/* ============================================================================
+ * ARM64 NEON/PMULL Implementation
+ * ============================================================================
+ */
+
+static inline uint64x2_t fold_128_neon(uint64x2_t acc, uint64x2_t data,
+                                       uint64_t k1, uint64_t k2) {
+    poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 0), k2);
+    poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 1), k1);
+    uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
+    uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
+    return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
+}
+
+/* Byte-swap for NEON - used for non-reflected CRCs (currently unused, kept for
+ * reference) */
+#if 0
+static inline uint8x16_t bswap_128_neon(uint8x16_t x) {
+    return vrev64q_u8(x);
+}
+#endif
+
+void crc64_simd_init(void) {
+    crc_simd_available();
+}
+
+/* ============================================================================
+ * CRC16 SIMD Implementation for ARM64
+ *
+ * CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
+ *
+ * Non-reflected CRCs process data MSB-first, which requires:
+ * 1. Byte-swapping data to get correct byte order for PMULL
+ * 2. Positioning CRC in high bits
+ * 3. Different folding constant interpretation
+ *
+ * This implementation uses the marzooqy/crc-clmul approach:
+ * - Data is byte-swapped (reversed within 128-bit blocks)
+ * - CRC goes in high 64 bits: SET(crc << 48, 0)
+ * - For non-reflected: PMULL immediates swap acc_hi*k1 and acc_lo*k2
+ * - Final reduction via table lookup
+ *
+ * Folding constants for CRC16-CCITT polynomial 0x1021:
+ * - x^192 mod P = 0x650b (for high 64-bit lane)
+ * - x^128 mod P = 0xaefc (for low 64-bit lane)
+ * ============================================================================
+ */
+
+/* CRC16-CCITT folding constants for 128-bit blocks */
+#define CRC16_K_HI UINT64_C(0x650b) /* x^192 mod P (for high 64-bit lane) */
+#define CRC16_K_LO UINT64_C(0xaefc) /* x^128 mod P (for low 64-bit lane) */
+
+/* Byte-swap 128 bits: reverse all 16 bytes */
+static inline uint8x16_t crc16_bswap_128(uint8x16_t x) {
+    /* Reverse bytes within each 64-bit lane, then swap lanes */
+    uint8x16_t rev = vrev64q_u8(x);
+    return vcombine_u8(vget_high_u8(rev), vget_low_u8(rev));
+}
+
+/* Fold helper for non-reflected CRC16
+ * For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
+ * This is different from reflected where we use 0x00 and 0x11 immediates
+ */
+static inline uint64x2_t crc16_fold_neon(uint64x2_t acc, uint64x2_t data,
+                                         uint64_t k_hi, uint64_t k_lo) {
+    /* Non-reflected folding:
+     * - acc_hi (lane 1) gets multiplied by k_hi
+     * - acc_lo (lane 0) gets multiplied by k_lo
+     */
+    poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 1), k_hi); /* acc_hi * k_hi */
+    poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 0), k_lo); /* acc_lo * k_lo */
+    uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
+    uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
+    return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+    const uint8_t *buf = (const uint8_t *)data;
+
+    /* Use table-based for small buffers - SIMD overhead not worth it */
+    if (!crc_simd_available() || len < 64) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Process 16-byte blocks using SIMD folding */
+    uint64_t blocks = len / 16;
+    if (blocks < 2) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Load and byte-swap first 16 bytes */
+    uint8x16_t data8 = vld1q_u8(buf);
+    data8 = crc16_bswap_128(data8);
+    uint64x2_t acc = vreinterpretq_u64_u8(data8);
+
+    /* XOR initial CRC into high 16 bits of high 64-bit lane
+     * For non-reflected CRC, the CRC value is conceptually at the MSB end
+     */
+    uint64_t crc_shifted = (uint64_t)crc << 48;
+    uint64x2_t crc_vec = vsetq_lane_u64(crc_shifted, vdupq_n_u64(0), 1);
+    acc = veorq_u64(acc, crc_vec);
+
+    buf += 16;
+    blocks--;
+
+    /* Main folding loop */
+    while (blocks > 0) {
+        /* Load and byte-swap next 16 bytes */
+        data8 = vld1q_u8(buf);
+        data8 = crc16_bswap_128(data8);
+        uint64x2_t data_vec = vreinterpretq_u64_u8(data8);
+
+        /* Fold accumulator with new data */
+        acc = crc16_fold_neon(acc, data_vec, CRC16_K_HI, CRC16_K_LO);
+
+        buf += 16;
+        blocks--;
+    }
+
+    /* Store folded result (byte-swap back to original order) */
+    uint8x16_t acc8 = vreinterpretq_u8_u64(acc);
+    acc8 = crc16_bswap_128(acc8);
+
+    uint8_t final_buf[32];
+    vst1q_u8(final_buf, acc8);
+
+    /* Process remaining bytes through table lookup */
+    uint64_t remaining = len % 16;
+    if (remaining > 0) {
+        memcpy(final_buf + 16, buf, remaining);
+        return crc16speed(0, final_buf, 16 + remaining);
+    } else {
+        return crc16speed(0, final_buf, 16);
+    }
+}
+
+void crc16_simd_init(void) {
+    crc_simd_available();
+}
+
+#else /* No SIMD support */
+
+bool crc_simd_available(void);
+
+uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len) {
+    extern uint64_t crc64speed(uint64_t crc, const void *s, const uint64_t l);
+    return crc64speed(crc, data, len);
+}
+
+void crc64_simd_init(void) {
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+    return crc16speed(crc, data, len);
+}
+
+void crc16_simd_init(void) {
+}
+
+#endif /* Architecture selection */
diff --git a/src/crcspeed.c b/src/crcspeed.c
new file mode 100644
index 0000000..ea80583
--- /dev/null
+++ b/src/crcspeed.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2013 Mark Adler
+ * Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
+ * Modifications by Matt Stancliff <matt@genges.com>:
+ *   - removed CRC64-specific behavior
+ *   - added generation of lookup tables by parameters
+ *   - removed inversion of CRC input/result
+ *   - removed automatic initialization in favor of explicit initialization
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the author be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Mark Adler
+  madler@alumni.caltech.edu
+ */
+
+#include "crcspeed.h"
+
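+/* Table layout used throughout this file (slice-by-8): table[0] is the plain
+ * one-byte CRC table, and table[k][n] holds the CRC of byte n followed by k
+ * zero bytes, so an aligned 8-byte word can be processed with eight
+ * independent lookups XORed together. */
+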
+/* Fill in a CRC constants table. */
+void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
+    uint64_t crc;
+
+    /* generate CRCs for all single byte sequences */
+    for (int n = 0; n < 256; n++) {
+        table[0][n] = crcfn(0, &n, 1);
+    }
+
+    /* generate nested CRC table for future slice-by-8 lookup */
+    for (int n = 0; n < 256; n++) {
+        crc = table[0][n];
+        for (int k = 1; k < 8; k++) {
+            crc = table[0][crc & 0xff] ^ (crc >> 8);
+            table[k][n] = crc;
+        }
+    }
+}
+
+void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
+    uint16_t crc;
+
+    /* generate CRCs for all single byte sequences */
+    for (int n = 0; n < 256; n++) {
+        table[0][n] = crcfn(0, &n, 1);
+    }
+
+    /* generate nested CRC table for future slice-by-8 lookup */
+    for (int n = 0; n < 256; n++) {
+        crc = table[0][n];
+        for (int k = 1; k < 8; k++) {
+            crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8);
+            table[k][n] = crc;
+        }
+    }
+}
+
+/* Reverse the bytes in a 64-bit word. */
+static inline uint64_t rev8(uint64_t a) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap64(a);
+#else
+    uint64_t m;
+
+    m = UINT64_C(0xff00ff00ff00ff);
+    a = ((a >> 8) & m) | (a & m) << 8;
+    m = UINT64_C(0xffff0000ffff);
+    a = ((a >> 16) & m) | (a & m) << 16;
+    return a >> 32 | a << 32;
+#endif
+}
+
+/* This function is called once to initialize the CRC table for use on a
+   big-endian architecture. */
+void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) {
+    /* Create the little endian table then reverse all the entries. */
+    crcspeed64little_init(fn, big_table);
+    for (int k = 0; k < 8; k++) {
+        for (int n = 0; n < 256; n++) {
+            big_table[k][n] = rev8(big_table[k][n]);
+        }
+    }
+}
+
+void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
+    /* Create the little endian table then reverse all the entries. */
+    crcspeed16little_init(fn, big_table);
+    for (int k = 0; k < 8; k++) {
+        for (int n = 0; n < 256; n++) {
+            big_table[k][n] = rev8(big_table[k][n]);
+        }
+    }
+}
+
+/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
+ * architecture. If you need inverted CRC, invert *before* calling and invert
+ * *after* calling.
+ * 64 bit crc = process 8 bytes at once;
+ */
+uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
+                          void *buf, size_t len) {
+    unsigned char *next = buf;
+
+    /* process individual bytes until we reach an 8-byte aligned pointer */
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    /* fast middle processing, 8 bytes (aligned!) per loop */
+    /* clang-format off */
+    while (len >= 8) {
+        crc ^= *(uint64_t *)next;
+        crc = little_table[7][crc & 0xff] ^
+              little_table[6][(crc >> 8) & 0xff] ^
+              little_table[5][(crc >> 16) & 0xff] ^
+              little_table[4][(crc >> 24) & 0xff] ^
+              little_table[3][(crc >> 32) & 0xff] ^
+              little_table[2][(crc >> 40) & 0xff] ^
+              little_table[1][(crc >> 48) & 0xff] ^
+              little_table[0][crc >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    /* process remaining bytes (can't be larger than 8) */
+    while (len) {
+        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    return crc;
+}
+
+uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
+                          void *buf, size_t len) {
+    unsigned char *next = buf;
+
+    /* process individual bytes until we reach an 8-byte aligned pointer */
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
+        len--;
+    }
+
+    /* fast middle processing, 8 bytes (aligned!) per loop */
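+    /* The CRC register is only 16 bits wide here, so it is folded into the
+     * first two bytes of each 8-byte word; the remaining six bytes are plain
+     * table lookups. */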
+    /* clang-format off */
+    while (len >= 8) {
+        uint64_t n = *(uint64_t *)next;
+        crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^
+              little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
+              little_table[5][(n >> 16) & 0xff] ^
+              little_table[4][(n >> 24) & 0xff] ^
+              little_table[3][(n >> 32) & 0xff] ^
+              little_table[2][(n >> 40) & 0xff] ^
+              little_table[1][(n >> 48) & 0xff] ^
+              little_table[0][n >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    /* process remaining bytes (can't be larger than 8) */
+    while (len) {
+        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
+        len--;
+    }
+
+    return crc;
+}
+
+/* Calculate a non-inverted CRC eight bytes at a time on a big-endian
+ * architecture.
+ */
+uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
+                       size_t len) {
+    unsigned char *next = buf;
+
+    crc = rev8(crc);
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
+        len--;
+    }
+
+    /* clang-format off */
+    while (len >= 8) {
+        crc ^= *(uint64_t *)next;
+        crc = big_table[0][crc & 0xff] ^
+              big_table[1][(crc >> 8) & 0xff] ^
+              big_table[2][(crc >> 16) & 0xff] ^
+              big_table[3][(crc >> 24) & 0xff] ^
+              big_table[4][(crc >> 32) & 0xff] ^
+              big_table[5][(crc >> 40) & 0xff] ^
+              big_table[6][(crc >> 48) & 0xff] ^
+              big_table[7][crc >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    while (len) {
+        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
+        len--;
+    }
+
+    return rev8(crc);
+}
+
+/* WARNING: Completely untested on big endian architecture. Possibly broken. */
+uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf,
+                       size_t len) {
+    unsigned char *next = buf;
+    uint64_t crc = crc_in;
+
+    crc = rev8(crc);
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    /* clang-format off */
+    while (len >= 8) {
+        uint64_t n = *(uint64_t *)next;
+        crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^
+              big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
+              big_table[2][(n >> 16) & 0xff] ^
+              big_table[3][(n >> 24) & 0xff] ^
+              big_table[4][(n >> 32) & 0xff] ^
+              big_table[5][(n >> 40) & 0xff] ^
+              big_table[6][(n >> 48) & 0xff] ^
+              big_table[7][n >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    while (len) {
+        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    return rev8(crc);
+}
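+
+/* The *native variants below pick the little- or big-endian routine at run
+ * time: storing 1 in a 64-bit integer and reading its first byte yields 1 on
+ * a little-endian machine and 0 on a big-endian one. */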
+
+/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes
+   at a time using passed-in lookup table.
+   This selects one of two routines depending on the endianness of
+   the architecture. */
+uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len) {
+    uint64_t n = 1;
+
+    return *(char *)&n ? crcspeed64little(table, crc, buf, len)
+                       : crcspeed64big(table, crc, buf, len);
+}
+
+uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len) {
+    uint64_t n = 1;
+
+    return *(char *)&n ? crcspeed16little(table, crc, buf, len)
+                       : crcspeed16big(table, crc, buf, len);
+}
+
+/* Initialize CRC lookup table in architecture-dependent manner. */
+void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) {
+    uint64_t n = 1;
+
+    *(char *)&n ? crcspeed64little_init(fn, table)
+                : crcspeed64big_init(fn, table);
+}
+
+void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) {
+    uint64_t n = 1;
+
+    *(char *)&n ? crcspeed16little_init(fn, table)
+                : crcspeed16big_init(fn, table);
+}
diff --git a/src/libhsdaoh.c b/src/libhsdaoh.c
index 03ce1c7..2cbae0c 100644
--- a/src/libhsdaoh.c
+++ b/src/libhsdaoh.c
@@ -47,7 +47,8 @@
 #include
 #include
 #include
-#include <crc.h>
+#include <crc16speed.h>
+#include <crc_simd.h>
 
 #define DEFAULT_BUFFERS 96
 
@@ -493,6 +494,9 @@ int hsdaoh_open(hsdaoh_dev_t **out_dev, uint32_t index)
 	dev->cnv_f1 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);
 	dev->cnv_f2 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);
 
+	crc16speed_init();
+	crc16_simd_init();
+
 found:
 	*out_dev = dev;
 
@@ -771,7 +775,7 @@ void hsdaoh_process_frame(hsdaoh_dev_t *dev, uint8_t *data, int size)
 				frame_errors++;
 
 			dev->last_crc[1] = dev->last_crc[0];
-			dev->last_crc[0] = crc16_ccitt(line_dat, dev->width * sizeof(uint16_t));
+			dev->last_crc[0] = crc16_simd(0xffff, line_dat, dev->width * sizeof(uint16_t));
 		}
 
 		if ((payload_len > 0) && dev->stream_synced) {
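
A minimal build-and-check sketch (not part of the patch above): it assumes only the crc16speed.h and crc_simd.h interfaces introduced here, links against crcspeed.c, crc16speed.c and crc_simd.c, and compares the bit-by-bit, slice-by-8 and SIMD paths on the same buffer with the 0xffff seed that hsdaoh_process_frame() uses. The file name check_crc16.c is hypothetical.

/* check_crc16.c - hypothetical standalone check for the new CRC paths. */
#include <stdint.h>
#include <stdio.h>
#include "crc16speed.h"
#include "crc_simd.h"

int main(void)
{
	uint8_t buf[1024];
	for (unsigned i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 31 + 7);

	crc16speed_init();  /* fill the slice-by-8 table (little-endian layout) */
	crc16_simd_init();  /* probe once for PCLMULQDQ/PMULL support */

	uint16_t ref  = crc16(0xffff, buf, sizeof(buf));      /* bit-by-bit reference */
	uint16_t fast = crc16speed(0xffff, buf, sizeof(buf)); /* slice-by-8 */
	uint16_t simd = crc16_simd(0xffff, buf, sizeof(buf)); /* folded, if supported */

	printf("simd available: %d\n", (int)crc_simd_available());
	printf("ref=%04x fast=%04x simd=%04x\n", ref, fast, simd);
	return (ref == fast && ref == simd) ? 0 : 1;
}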