lib: switch to accelerated CRC16 implementation

Use faster SIMD implementation from
https://github.com/mattsta/crcspeed
This commit is contained in:
Steve Markgraf 2025-12-22 20:46:16 +01:00
parent 5867fdb15c
commit 359f141789
10 changed files with 1096 additions and 52 deletions

View file

@ -62,6 +62,29 @@ elseif(MSVC14 OR MSVC14)
ADD_DEFINITIONS(-D_TIMESPEC_DEFINED)
endif()
# Enable SIMD instructions on x86
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i386|i686")
add_compile_options(-mpclmul -msse4.1)
endif()
########################################################################
# CRC SIMD specific setup
########################################################################
# Enable crypto extensions on ARM64
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
# Check if using GCC or Clang
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
# GCC requires explicit crypto extension flag on all ARM64 platforms
add_compile_options(-march=armv8-a+crypto)
elseif(APPLE)
# AppleClang on Apple Silicon - use native CPU features
add_compile_options(-mcpu=apple-m1)
else()
# Other Clang on ARM64 Linux
add_compile_options(-march=armv8-a+crypto)
endif()
endif()
########################################################################
# Find build dependencies
########################################################################

View file

@ -1,48 +0,0 @@
#ifndef __CRC_H
#define __CRC_H
/* CRC16 CCIT 'false' table, similar to what can be found in the Kernel */
static const uint16_t ccitt_hash[] = {
0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7,
0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef,
0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6,
0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de,
0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485,
0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d,
0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4,
0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc,
0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823,
0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b,
0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12,
0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a,
0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41,
0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49,
0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70,
0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78,
0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f,
0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067,
0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e,
0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256,
0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d,
0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405,
0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c,
0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634,
0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab,
0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3,
0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a,
0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92,
0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9,
0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1,
0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8,
0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0,
};
uint16_t crc16_ccitt(const uint8_t* buffer, size_t size)
{
uint16_t crc = 0xffff;
while (size-- > 0)
crc = (crc << 8) ^ ccitt_hash[((crc >> 8) ^ *(buffer++)) & 0x00ff];
return crc;
}
#endif

20
include/crc16speed.h Normal file
View file

@ -0,0 +1,20 @@
#ifndef CRC16SPEED_H
#define CRC16SPEED_H
#include "crcspeed.h"
#include "stdbool.h"
/* Does not require init */
uint16_t crc16(uint16_t crc, const void *data, const uint64_t len);
void crc16speed_cache_table(void);
/* All other crc functions here require _init() before usage. */
bool crc16speed_init(void);
uint16_t crc16_lookup(uint16_t crc, const void *in_data, const uint64_t len);
uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
bool crc16speed_init_big(void);
uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l);
bool crc16speed_init_native(void);
uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l);
#endif

28
include/crc_simd.h Normal file
View file

@ -0,0 +1,28 @@
/* CRC SIMD Acceleration Header
* Copyright (c) 2024
* SPDX-License-Identifier: BSD-3-Clause
*
* SIMD-accelerated CRC computation using:
* - PCLMULQDQ on x86/x64 (Intel/AMD)
* - PMULL on ARM64/NEON (Apple Silicon, ARM servers)
*/
#ifndef CRC_SIMD_H
#define CRC_SIMD_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
/* Feature detection */
bool crc_simd_available(void);
/* CRC64 SIMD functions - use Jones polynomial 0xad93d23594c935a9 */
void crc64_simd_init(void);
uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len);
/* CRC16 SIMD functions - use CRC-16-CCITT polynomial 0x1021 */
void crc16_simd_init(void);
uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len);
#endif /* CRC_SIMD_H */

60
include/crcspeed.h Normal file
View file

@ -0,0 +1,60 @@
/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. */
#ifndef CRCSPEED_H
#define CRCSPEED_H
#include <inttypes.h>
#include <stdio.h>
typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
/* CRC-64 */
void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);
void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]);
uint64_t crcspeed64little(uint64_t table[8][256], uint64_t crc, void *buf,
size_t len);
uint64_t crcspeed64big(uint64_t table[8][256], uint64_t crc, void *buf,
size_t len);
uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
size_t len);
/* CRC-16 */
void crcspeed16little_init(crcfn16 fn, uint16_t table[8][256]);
void crcspeed16big_init(crcfn16 fn, uint16_t table[8][256]);
void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]);
uint16_t crcspeed16little(uint16_t table[8][256], uint16_t crc, void *buf,
size_t len);
uint16_t crcspeed16big(uint16_t table[8][256], uint16_t crc, void *buf,
size_t len);
uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
size_t len);
#endif

View file

@ -18,7 +18,7 @@
########################################################################
# Setup shared library variant
########################################################################
add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c)
add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c crcspeed.c crc16speed.c crc_simd.c)
target_link_libraries(hsdaoh ${LIBUSB_LIBRARIES} ${LIBUVC_LIBRARIES} ${THREADS_PTHREADS_LIBRARY})
target_include_directories(hsdaoh PUBLIC
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
@ -86,7 +86,6 @@ set(INSTALL_TARGETS hsdaoh hsdaoh_static hsdaoh_file hsdaoh_tcp hsdaoh_test)
target_link_libraries(hsdaoh_file hsdaoh
${LIBFLAC_LIBRARIES}
# ${LIBUSB_LIBRARIES}
${CMAKE_THREAD_LIBS_INIT}
)
target_link_libraries(hsdaoh_tcp hsdaoh

213
src/crc16speed.c Normal file
View file

@ -0,0 +1,213 @@
/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. */
#include "crc16speed.h"
/* If CRCSPEED16_DUAL is defined, we allow calls to
* both _little and _big CRC.
* By default, we only allow one endianness to be used
* and the first call to either _init function will set the
* lookup table endianness for the life of this module.
* We don't enable dual lookups by default because
* each 8x256 lookup table is 4k. */
#ifndef CRC16SPEED_DUAL
static uint16_t crc16_table[8][256] = {{0}};
static void *crc16_table_little = NULL, *crc16_table_big = NULL;
static const bool dual = false;
#else
static uint16_t crc16_table_little[8][256] = {{0}};
static uint16_t crc16_table_big[8][256] = {{0}};
static void *crc16_table = NULL;
static const bool dual = true;
#endif
/* value of crc16_table[0][1], architecture dependent. */
#define LITTLE1 UINT16_C(0x1021)
#define BIG1 UINT16_C(0x2110)
/* Define CRC16SPEED_SAFE if you want runtime checks to stop
* CRCs from being calculated by uninitialized tables (and also stop tables
* from being initialized more than once). */
#ifdef CRC16SPEED_SAFE
#define should_init(table, val) \
do { \
if ((table)[0][1] == (val)) \
return false; \
} while (0)
#define check_init(table, val) \
do { \
if ((table)[0][1] != (val)) \
return false; \
} while (0)
#else
#define should_init(a, b)
#define check_init(a, b)
#endif
/* This is CRC-16-CCITT (non-reflected poly, non-inverted input/output).
* crc16() is only used to bootstrap an initial 256-entry lookup table. */
#define POLY 0x1021
uint16_t crc16(uint16_t crc, const void *in_data, uint64_t len) {
const uint8_t *data = in_data;
for (uint64_t i = 0; i < len; i++) {
crc = crc ^ (data[i] << 8);
for (int j = 0; j < 8; j++) {
if (crc & 0x8000) {
crc = (crc << 1) ^ POLY;
} else {
crc = (crc << 1);
}
}
}
return crc;
}
/* Only for testing; doesn't support DUAL */
uint16_t crc16_lookup(uint16_t crc, const void *in_data, uint64_t len) {
const uint8_t *data = in_data;
for (uint64_t i = 0; i < len; i++) {
crc = (crc << 8) ^ crc16_table[0][((crc >> 8) ^ data[i]) & 0x00ff];
}
return crc;
}
/* Returns false if CRC16SPEED_SAFE and table already initialized. */
bool crc16speed_init(void) {
#ifndef CRC16SPEED_DUAL
should_init(crc16_table, LITTLE1);
#else
should_init(crc16_table_little, LITTLE1);
#endif
crcspeed16little_init(crc16, dual ? crc16_table_little : crc16_table);
return true;
}
/* Returns false if CRC16SPEED_SAFE and table already initialized. */
bool crc16speed_init_big(void) {
#ifndef CRC16SPEED_DUAL
should_init(crc16_table, BIG1);
#else
should_init(crc16_table_big, BIG1);
#endif
crcspeed16big_init(crc16, dual ? crc16_table_big : crc16_table);
return true;
}
uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l) {
/* Quickly check if CRC table is initialized to little endian correctly. */
#ifndef CRC16SPEED_DUAL
check_init(crc16_table, LITTLE1);
#else
check_init(crc16_table_little, LITTLE1);
#endif
return crcspeed16little(dual ? crc16_table_little : crc16_table, crc,
(void *)s, l);
}
uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l) {
/* Quickly check if CRC table is initialized to big endian correctly. */
#ifndef CRC16SPEED_DUAL
check_init(crc16_table, BIG1);
#else
check_init(crc16_table_big, BIG1);
#endif
return crcspeed16big(dual ? crc16_table_big : crc16_table, crc, (void *)s,
l);
}
bool crc16speed_init_native(void) {
const uint64_t n = 1;
return *(char *)&n ? crc16speed_init() : crc16speed_init_big();
}
/* If you are on a platform where endianness can change at runtime, this
* will break unless you compile with CRC16SPEED_DUAL and manually run
* _init() and _init_big() instead of using _init_native() */
uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l) {
const uint64_t n = 1;
return *(char *)&n ? crc16speed(crc, s, l) : crc16speed_big(crc, s, l);
}
/* Iterate over table to fully load it into a cache near the CPU. */
void crc16speed_cache_table(void) {
volatile uint16_t m;
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 256; ++j) {
#ifndef CRC16SPEED_DUAL
m = crc16_table[i][j];
#else
m = crc16_table_little[i][j];
m += crc16_table_big[i][j];
#endif
}
}
(void)m; /* Suppress unused variable warning */
}
/* Test main */
#if defined(CRCSPEED_TEST) || defined(CRCSPEED_TEST_MAIN)
#include <stdio.h>
#include <stdlib.h>
#define UNUSED(x) (void)(x)
int crc16Test(int argc, char *argv[]) {
UNUSED(argc);
UNUSED(argv);
crc16speed_init();
printf("[calcula]: 31c3 == %04" PRIx64 "\n",
(uint64_t)crc16(0, "123456789", 9));
printf("[lookupt]: 31c3 == %04" PRIx64 "\n",
(uint64_t)crc16_lookup(0, "123456789", 9));
printf("[16speed]: 31c3 == %04" PRIx64 "\n",
(uint64_t)crc16speed(0, "123456789", 9));
char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
"do eiusmod tempor incididunt ut labore et dolore magna "
"aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
"ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
"aute irure dolor in reprehenderit in voluptate velit esse "
"cillum dolore eu fugiat nulla pariatur. Excepteur sint "
"occaecat cupidatat non proident, sunt in culpa qui officia "
"deserunt mollit anim id est laborum.";
printf("[calcula]: 4b20 == %04" PRIx64 "\n",
(uint64_t)crc16(0, li, sizeof(li)));
printf("[lookupt]: 4b20 == %04" PRIx64 "\n",
(uint64_t)crc16_lookup(0, li, sizeof(li)));
printf("[16speed]: 4b20 == %04" PRIx64 "\n",
(uint64_t)crc16speed(0, li, sizeof(li)));
return 0;
}
#endif
#ifdef CRCSPEED_TEST_MAIN
int main(int argc, char *argv[]) {
return crc16Test(argc, argv);
}
#endif

456
src/crc_simd.c Normal file
View file

@ -0,0 +1,456 @@
/* CRC SIMD Acceleration Implementation
* Copyright (c) 2024
* SPDX-License-Identifier: BSD-3-Clause
*
* SIMD-accelerated CRC computation using PCLMULQDQ (x86) or PMULL (ARM64)
* Based on Intel's "Fast CRC Computation for Generic Polynomials Using
* PCLMULQDQ Instruction" whitepaper and Intel ISA-L implementation.
*
* Supports:
* - CRC64-Jones: polynomial 0xad93d23594c935a9 (reflected)
* - CRC16-CCITT: polynomial 0x1021 (non-reflected)
*
* References:
* - Intel whitepaper: Fast CRC Computation for Generic Polynomials
* - Intel ISA-L: https://github.com/intel/isa-l
*/
#include "crc_simd.h"
#include <string.h>
/* ============================================================================
* Architecture Detection and Feature Testing
* ============================================================================
*/
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
#define ARCH_X86 1
#if defined(__GNUC__) || defined(__clang__)
#include <cpuid.h>
#endif
#include <immintrin.h>
#include <wmmintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#define ARCH_ARM64 1
#include <arm_neon.h>
#if defined(__ARM_FEATURE_CRYPTO) || defined(__APPLE__)
#define HAS_PMULL 1
#include <arm_acle.h>
#endif
#endif
static bool g_simd_checked = false;
static bool g_simd_available = false;
bool crc_simd_available(void) {
if (g_simd_checked) {
return g_simd_available;
}
g_simd_checked = true;
#if defined(ARCH_X86)
#if defined(__GNUC__) || defined(__clang__)
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
/* Check for PCLMULQDQ (bit 1 of ECX) and SSE4.1 (bit 19 of ECX) */
g_simd_available = (ecx & (1 << 1)) && (ecx & (1 << 19));
}
#endif
#elif defined(ARCH_ARM64)
#if defined(__APPLE__)
/* Apple Silicon always has PMULL */
g_simd_available = true;
#elif defined(HAS_PMULL)
g_simd_available = true;
#else
g_simd_available = false;
#endif
#else
g_simd_available = false;
#endif
return g_simd_available;
}
/* ============================================================================
* CRC64 Jones Polynomial: 0xad93d23594c935a9 (reflected)
*
* Constants from Intel ISA-L crc64_jones_refl_by8.asm
* ============================================================================
*/
/* 16-byte folding constants */
#define CRC64_RK1 UINT64_C(0x381d0015c96f4444)
#define CRC64_RK2 UINT64_C(0xd9d7be7d505da32c)
/* 128-byte folding constants */
#define CRC64_RK3 UINT64_C(0x768361524d29ed0b)
#define CRC64_RK4 UINT64_C(0xcc26fa7c57f8054c)
/* 8-way reduction constants */
#define CRC64_RK9 UINT64_C(0x5bc94ba8e2087636)
#define CRC64_RK10 UINT64_C(0x6cf09c8f37710b75)
#define CRC64_RK11 UINT64_C(0x3885fd59e440d95a)
#define CRC64_RK12 UINT64_C(0xbccba3936411fb7e)
#define CRC64_RK13 UINT64_C(0xe4dd0d81cbfce585)
#define CRC64_RK14 UINT64_C(0xb715e37b96ed8633)
#define CRC64_RK15 UINT64_C(0xf49784a634f014e4)
#define CRC64_RK16 UINT64_C(0xaf86efb16d9ab4fb)
#define CRC64_RK17 UINT64_C(0x7b3211a760160db8)
#define CRC64_RK18 UINT64_C(0xa062b2319d66692f)
#define CRC64_RK19 UINT64_C(0xef3d1d18ed889ed2)
#define CRC64_RK20 UINT64_C(0x6ba4d760ab38201e)
/* ============================================================================
* CRC16-CCITT Polynomial: 0x1021 (non-reflected)
*
* Constants computed by gen_constants.c
* ============================================================================
*/
/* 16-byte folding constants */
#define CRC16_K1 UINT64_C(0x10e2) /* x^144 mod P */
#define CRC16_K2 UINT64_C(0xaefc) /* x^128 mod P */
/* 64-byte folding constants */
#define CRC16_K3 UINT64_C(0x78b3) /* x^528 mod P */
#define CRC16_K4 UINT64_C(0x13fc) /* x^512 mod P */
/* 8-way reduction constants */
#define CRC16_RK9 UINT64_C(0x4347) /* x^912 mod P */
#define CRC16_RK10 UINT64_C(0xcbc5) /* x^896 mod P */
#define CRC16_RK11 UINT64_C(0x9e3a) /* x^784 mod P */
#define CRC16_RK12 UINT64_C(0x106f) /* x^768 mod P */
#define CRC16_RK13 UINT64_C(0x9c1a) /* x^656 mod P */
#define CRC16_RK14 UINT64_C(0xda35) /* x^640 mod P */
#define CRC16_RK15 UINT64_C(0x78b3) /* x^528 mod P */
#define CRC16_RK16 UINT64_C(0x13fc) /* x^512 mod P */
#define CRC16_RK17 UINT64_C(0xbd64) /* x^400 mod P */
#define CRC16_RK18 UINT64_C(0xcde2) /* x^384 mod P */
#define CRC16_RK19 UINT64_C(0x8ddc) /* x^272 mod P */
#define CRC16_RK20 UINT64_C(0x8e29) /* x^256 mod P */
/* Barrett reduction constants */
#define CRC16_MU UINT64_C(0x0000f0d3) /* floor(x^32 / P) */
#define CRC16_POLY UINT64_C(0x1021)
/* ============================================================================
* X86/X64 PCLMULQDQ Implementation
* ============================================================================
*/
#if defined(ARCH_X86)
/* Helper: fold one 128-bit block for CRC64 (reflected) */
static inline __m128i fold_128(__m128i acc, __m128i data, __m128i k1k2) {
__m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x00);
__m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x11);
return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
}
/* Helper: byte-swap a 128-bit register (used for CRC16 non-reflected) */
__attribute__((unused)) static inline __m128i bswap_128(__m128i x) {
const __m128i mask =
_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
return _mm_shuffle_epi8(x, mask);
}
/* Helper: fold one 128-bit block for CRC16 (non-reflected)
* For non-reflected CRC, we use different PCLMULQDQ immediates
* and the constants are positioned in the high 16 bits.
* Note: Currently unused as we fall back to crc16speed for CRC16 SIMD on x86.
*/
__attribute__((unused)) static inline __m128i
fold_128_16(__m128i acc, __m128i data, __m128i k1k2) {
/* For non-reflected CRC16:
* - k1 is in high 64 bits, k2 in low 64 bits
* - We use 0x10 and 0x01 immediates (cross-multiply)
*/
__m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x10); /* acc_lo * k1 */
__m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x01); /* acc_hi * k2 */
return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
}
void crc64_simd_init(void) {
crc_simd_available();
}
/* ============================================================================
* CRC16 SIMD Implementation (x86/x64 with PCLMULQDQ)
*
* CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
*
* Non-reflected CRCs process data MSB-first, which requires:
* 1. Byte-swapping data to get correct byte order for PCLMULQDQ
* 2. Positioning CRC in high bits
* 3. Different folding constant interpretation
*
* This implementation uses the marzooqy/crc-clmul approach:
* - Data is byte-swapped (reversed within 128-bit blocks)
* - CRC goes in high 64 bits
* - For non-reflected: PCLMULQDQ multiplies acc_hi*k_hi and acc_lo*k_lo
* - Final reduction via table lookup
*
* Folding constants for CRC16-CCITT polynomial 0x1021:
* - x^192 mod P = 0x650b (for high 64-bit lane)
* - x^128 mod P = 0xaefc (for low 64-bit lane)
* ============================================================================
*/
/* CRC16-CCITT folding constants for 128-bit blocks */
/* x^192 mod P (for high 64-bit lane) */
#define CRC16_K_HI_X86 UINT64_C(0x650b)
/* x^128 mod P (for low 64-bit lane) */
#define CRC16_K_LO_X86 UINT64_C(0xaefc)
/* Byte-swap 128-bit register: reverse all 16 bytes */
static inline __m128i crc16_bswap_128_x86(__m128i x) {
const __m128i mask =
_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return _mm_shuffle_epi8(x, mask);
}
/* Fold helper for non-reflected CRC16 on x86
* For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
* Constants packed as: k_hi in high 64 bits, k_lo in low 64 bits
*/
static inline __m128i crc16_fold_x86(__m128i acc, __m128i data, __m128i k) {
/* Non-reflected folding:
* - acc_hi (high 64) gets multiplied by k_hi (high 64) -> use 0x11
* - acc_lo (low 64) gets multiplied by k_lo (low 64) -> use 0x00
*/
__m128i t1 = _mm_clmulepi64_si128(acc, k, 0x11); /* acc_hi * k_hi */
__m128i t2 = _mm_clmulepi64_si128(acc, k, 0x00); /* acc_lo * k_lo */
return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
}
uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
const uint8_t *buf = (const uint8_t *)data;
extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
/* Use table-based for small buffers - SIMD overhead not worth it */
if (!crc_simd_available() || len < 64) {
return crc16speed(crc, data, len);
}
/* Process 16-byte blocks using SIMD folding */
uint64_t blocks = len / 16;
if (blocks < 2) {
return crc16speed(crc, data, len);
}
/* Constants: k_hi in high 64 bits, k_lo in low 64 bits */
const __m128i k = _mm_set_epi64x(CRC16_K_HI_X86, CRC16_K_LO_X86);
/* Load and byte-swap first 16 bytes */
__m128i acc = crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
/* XOR initial CRC into high 16 bits of high 64-bit lane
* For non-reflected CRC, the CRC value is conceptually at the MSB end
*/
uint64_t crc_shifted = (uint64_t)crc << 48;
__m128i crc_vec = _mm_set_epi64x(crc_shifted, 0);
acc = _mm_xor_si128(acc, crc_vec);
buf += 16;
blocks--;
/* Main folding loop */
while (blocks > 0) {
/* Load and byte-swap next 16 bytes */
__m128i data_vec =
crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
/* Fold accumulator with new data */
acc = crc16_fold_x86(acc, data_vec, k);
buf += 16;
blocks--;
}
/* Store folded result (byte-swap back to original order) */
__m128i acc_swapped = crc16_bswap_128_x86(acc);
uint8_t final_buf[32];
_mm_storeu_si128((__m128i *)final_buf, acc_swapped);
/* Process remaining bytes through table lookup */
uint64_t remaining = len % 16;
if (remaining > 0) {
memcpy(final_buf + 16, buf, remaining);
return crc16speed(0, final_buf, 16 + remaining);
} else {
return crc16speed(0, final_buf, 16);
}
}
void crc16_simd_init(void) {
crc_simd_available();
}
#elif defined(ARCH_ARM64) && defined(HAS_PMULL)
/* ============================================================================
* ARM64 NEON/PMULL Implementation
* ============================================================================
*/
static inline uint64x2_t fold_128_neon(uint64x2_t acc, uint64x2_t data,
uint64_t k1, uint64_t k2) {
poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 0), k2);
poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 1), k1);
uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
}
/* Byte-swap for NEON - used for non-reflected CRCs (currently unused, kept for
* reference) */
#if 0
static inline uint8x16_t bswap_128_neon(uint8x16_t x) {
return vrev64q_u8(x);
}
#endif
void crc64_simd_init(void) {
crc_simd_available();
}
/* ============================================================================
* CRC16 SIMD Implementation for ARM64
*
* CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
*
* Non-reflected CRCs process data MSB-first, which requires:
* 1. Byte-swapping data to get correct byte order for PMULL
* 2. Positioning CRC in high bits
* 3. Different folding constant interpretation
*
* This implementation uses the marzooqy/crc-clmul approach:
* - Data is byte-swapped (reversed within 128-bit blocks)
* - CRC goes in high 64 bits: SET(crc << 48, 0)
* - For non-reflected: PMULL immediates swap acc_hi*k1 and acc_lo*k2
* - Final reduction via table lookup
*
* Folding constants for CRC16-CCITT polynomial 0x1021:
* - x^192 mod P = 0x650b (for high 64-bit lane)
* - x^128 mod P = 0xaefc (for low 64-bit lane)
* ============================================================================
*/
/* CRC16-CCITT folding constants for 128-bit blocks */
#define CRC16_K_HI UINT64_C(0x650b) /* x^192 mod P (for high 64-bit lane) */
#define CRC16_K_LO UINT64_C(0xaefc) /* x^128 mod P (for low 64-bit lane) */
/* Byte-swap 128 bits: reverse all 16 bytes */
static inline uint8x16_t crc16_bswap_128(uint8x16_t x) {
/* Reverse bytes within each 64-bit lane, then swap lanes */
uint8x16_t rev = vrev64q_u8(x);
return vcombine_u8(vget_high_u8(rev), vget_low_u8(rev));
}
/* Fold helper for non-reflected CRC16
* For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
* This is different from reflected where we use 0x00 and 0x11 immediates
*/
static inline uint64x2_t crc16_fold_neon(uint64x2_t acc, uint64x2_t data,
uint64_t k_hi, uint64_t k_lo) {
/* Non-reflected folding:
* - acc_hi (lane 1) gets multiplied by k_hi
* - acc_lo (lane 0) gets multiplied by k_lo
*/
poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 1), k_hi); /* acc_hi * k_hi */
poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 0), k_lo); /* acc_lo * k_lo */
uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
}
uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
const uint8_t *buf = (const uint8_t *)data;
/* Use table-based for small buffers - SIMD overhead not worth it */
if (!crc_simd_available() || len < 64) {
return crc16speed(crc, data, len);
}
/* Process 16-byte blocks using SIMD folding */
uint64_t blocks = len / 16;
if (blocks < 2) {
return crc16speed(crc, data, len);
}
/* Load and byte-swap first 16 bytes */
uint8x16_t data8 = vld1q_u8(buf);
data8 = crc16_bswap_128(data8);
uint64x2_t acc = vreinterpretq_u64_u8(data8);
/* XOR initial CRC into high 16 bits of high 64-bit lane
* For non-reflected CRC, the CRC value is conceptually at the MSB end
*/
uint64_t crc_shifted = (uint64_t)crc << 48;
uint64x2_t crc_vec = vsetq_lane_u64(crc_shifted, vdupq_n_u64(0), 1);
acc = veorq_u64(acc, crc_vec);
buf += 16;
blocks--;
/* Main folding loop */
while (blocks > 0) {
/* Load and byte-swap next 16 bytes */
data8 = vld1q_u8(buf);
data8 = crc16_bswap_128(data8);
uint64x2_t data_vec = vreinterpretq_u64_u8(data8);
/* Fold accumulator with new data */
acc = crc16_fold_neon(acc, data_vec, CRC16_K_HI, CRC16_K_LO);
buf += 16;
blocks--;
}
/* Store folded result (byte-swap back to original order) */
uint8x16_t acc8 = vreinterpretq_u8_u64(acc);
acc8 = crc16_bswap_128(acc8);
uint8_t final_buf[32];
vst1q_u8(final_buf, acc8);
/* Process remaining bytes through table lookup */
uint64_t remaining = len % 16;
if (remaining > 0) {
memcpy(final_buf + 16, buf, remaining);
return crc16speed(0, final_buf, 16 + remaining);
} else {
return crc16speed(0, final_buf, 16);
}
}
void crc16_simd_init(void) {
crc_simd_available();
}
#else /* No SIMD support */
bool crc_simd_available(void);
uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len) {
extern uint64_t crc64speed(uint64_t crc, const void *s, const uint64_t l);
return crc64speed(crc, data, len);
}
void crc64_simd_init(void) {
}
uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
return crc16speed(crc, data, len);
}
void crc16_simd_init(void) {
}
#endif /* Architecture selection */

289
src/crcspeed.c Normal file
View file

@ -0,0 +1,289 @@
/*
* Copyright (C) 2013 Mark Adler
* Originally by: crc64.c Version 1.4 16 Dec 2013 Mark Adler
* Modifications by Matt Stancliff <matt@genges.com>:
* - removed CRC64-specific behavior
* - added generation of lookup tables by parameters
* - removed inversion of CRC input/result
* - removed automatic initialization in favor of explicit initialization
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
#include "crcspeed.h"
/* Fill in a CRC constants table. */
void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
uint64_t crc;
/* generate CRCs for all single byte sequences */
for (int n = 0; n < 256; n++) {
table[0][n] = crcfn(0, &n, 1);
}
/* generate nested CRC table for future slice-by-8 lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
crc = table[0][crc & 0xff] ^ (crc >> 8);
table[k][n] = crc;
}
}
}
void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
uint16_t crc;
/* generate CRCs for all single byte sequences */
for (int n = 0; n < 256; n++) {
table[0][n] = crcfn(0, &n, 1);
}
/* generate nested CRC table for future slice-by-8 lookup */
for (int n = 0; n < 256; n++) {
crc = table[0][n];
for (int k = 1; k < 8; k++) {
crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8);
table[k][n] = crc;
}
}
}
/* Reverse the bytes in a 64-bit word. */
static inline uint64_t rev8(uint64_t a) {
#if defined(__GNUC__) || defined(__clang__)
return __builtin_bswap64(a);
#else
uint64_t m;
m = UINT64_C(0xff00ff00ff00ff);
a = ((a >> 8) & m) | (a & m) << 8;
m = UINT64_C(0xffff0000ffff);
a = ((a >> 16) & m) | (a & m) << 16;
return a >> 32 | a << 32;
#endif
}
/* This function is called once to initialize the CRC table for use on a
big-endian architecture. */
void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) {
/* Create the little endian table then reverse all the entires. */
crcspeed64little_init(fn, big_table);
for (int k = 0; k < 8; k++) {
for (int n = 0; n < 256; n++) {
big_table[k][n] = rev8(big_table[k][n]);
}
}
}
void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
/* Create the little endian table then reverse all the entires. */
crcspeed16little_init(fn, big_table);
for (int k = 0; k < 8; k++) {
for (int n = 0; n < 256; n++) {
big_table[k][n] = rev8(big_table[k][n]);
}
}
}
/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
* architecture. If you need inverted CRC, invert *before* calling and invert
* *after* calling.
* 64 bit crc = process 8 bytes at once;
*/
uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
void *buf, size_t len) {
unsigned char *next = buf;
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next & 7) != 0) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
/* fast middle processing, 8 bytes (aligned!) per loop */
/* clang-format off */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = little_table[7][crc & 0xff] ^
little_table[6][(crc >> 8) & 0xff] ^
little_table[5][(crc >> 16) & 0xff] ^
little_table[4][(crc >> 24) & 0xff] ^
little_table[3][(crc >> 32) & 0xff] ^
little_table[2][(crc >> 40) & 0xff] ^
little_table[1][(crc >> 48) & 0xff] ^
little_table[0][crc >> 56];
next += 8;
len -= 8;
}
/* clang-format on */
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
return crc;
}
uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
void *buf, size_t len) {
unsigned char *next = buf;
/* process individual bytes until we reach an 8-byte aligned pointer */
while (len && ((uintptr_t)next & 7) != 0) {
crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
len--;
}
/* fast middle processing, 8 bytes (aligned!) per loop */
/* clang-format off */
while (len >= 8) {
uint64_t n = *(uint64_t *)next;
crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^
little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
little_table[5][(n >> 16) & 0xff] ^
little_table[4][(n >> 24) & 0xff] ^
little_table[3][(n >> 32) & 0xff] ^
little_table[2][(n >> 40) & 0xff] ^
little_table[1][(n >> 48) & 0xff] ^
little_table[0][n >> 56];
next += 8;
len -= 8;
}
/* clang-format on */
/* process remaining bytes (can't be larger than 8) */
while (len) {
crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
len--;
}
return crc;
}
/* Calculate a non-inverted CRC eight bytes at a time on a big-endian
* architecture.
*/
uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
size_t len) {
unsigned char *next = buf;
crc = rev8(crc);
while (len && ((uintptr_t)next & 7) != 0) {
crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
len--;
}
/* clang-format off */
while (len >= 8) {
crc ^= *(uint64_t *)next;
crc = big_table[0][crc & 0xff] ^
big_table[1][(crc >> 8) & 0xff] ^
big_table[2][(crc >> 16) & 0xff] ^
big_table[3][(crc >> 24) & 0xff] ^
big_table[4][(crc >> 32) & 0xff] ^
big_table[5][(crc >> 40) & 0xff] ^
big_table[6][(crc >> 48) & 0xff] ^
big_table[7][crc >> 56];
next += 8;
len -= 8;
}
/* clang-format on */
while (len) {
crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
len--;
}
return rev8(crc);
}
/* WARNING: Completely untested on big endian architecture. Possibly broken. */
uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf,
size_t len) {
unsigned char *next = buf;
uint64_t crc = crc_in;
crc = rev8(crc);
while (len && ((uintptr_t)next & 7) != 0) {
crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
/* clang-format off */
while (len >= 8) {
uint64_t n = *(uint64_t *)next;
crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^
big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
big_table[2][(n >> 16) & 0xff] ^
big_table[3][(n >> 24) & 0xff] ^
big_table[4][(n >> 32) & 0xff] ^
big_table[5][(n >> 40) & 0xff] ^
big_table[6][(n >> 48) & 0xff] ^
big_table[7][n >> 56];
next += 8;
len -= 8;
}
/* clang-format on */
while (len) {
crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
len--;
}
return rev8(crc);
}
/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes
at a time using passed-in lookup table.
This selects one of two routines depending on the endianess of
the architecture. */
uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
size_t len) {
uint64_t n = 1;
return *(char *)&n ? crcspeed64little(table, crc, buf, len)
: crcspeed64big(table, crc, buf, len);
}
uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
size_t len) {
uint64_t n = 1;
return *(char *)&n ? crcspeed16little(table, crc, buf, len)
: crcspeed16big(table, crc, buf, len);
}
/* Initialize CRC lookup table in architecture-dependent manner. */
void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) {
uint64_t n = 1;
*(char *)&n ? crcspeed64little_init(fn, table)
: crcspeed64big_init(fn, table);
}
void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) {
uint64_t n = 1;
*(char *)&n ? crcspeed16little_init(fn, table)
: crcspeed16big_init(fn, table);
}

View file

@ -47,7 +47,8 @@
#include <hsdaoh.h>
#include <hsdaoh_private.h>
#include <format_convert.h>
#include <crc.h>
#include <crc16speed.h>
#include <crc_simd.h>
#define DEFAULT_BUFFERS 96
@ -493,6 +494,9 @@ int hsdaoh_open(hsdaoh_dev_t **out_dev, uint32_t index)
dev->cnv_f1 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);
dev->cnv_f2 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);
crc16speed_init();
crc16_simd_init();
found:
*out_dev = dev;
@ -771,7 +775,7 @@ void hsdaoh_process_frame(hsdaoh_dev_t *dev, uint8_t *data, int size)
frame_errors++;
dev->last_crc[1] = dev->last_crc[0];
dev->last_crc[0] = crc16_ccitt(line_dat, dev->width * sizeof(uint16_t));
dev->last_crc[0] = crc16_simd(0xffff, line_dat, dev->width * sizeof(uint16_t));
}
if ((payload_len > 0) && dev->stream_synced) {