lib: switch to accelerated CRC16 implementation

Use faster SIMD implementation from https://github.com/mattsta/crcspeed
2026-01-27 16:57:17 +01:00 · 2025-12-22 20:46:16 +01:00 · 2025-12-22 20:46:16 +01:00 · 359f141789
commit 359f141789
parent 5867fdb15c
10 changed files with 1096 additions and 52 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -62,6 +62,29 @@ elseif(MSVC14 OR MSVC14)
    ADD_DEFINITIONS(-D_TIMESPEC_DEFINED)
 endif()

+# Enable SIMD instructions on x86
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i386|i686")
+    add_compile_options(-mpclmul -msse4.1)
+endif()
+
+########################################################################
+# CRC SIMD specific setup
+########################################################################
+# Enable crypto extensions on ARM64
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    # Check if using GCC or Clang
+    if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+        # GCC requires explicit crypto extension flag on all ARM64 platforms
+        add_compile_options(-march=armv8-a+crypto)
+    elseif(APPLE)
+        # AppleClang on Apple Silicon - use native CPU features
+        add_compile_options(-mcpu=apple-m1)
+    else()
+        # Other Clang on ARM64 Linux
+        add_compile_options(-march=armv8-a+crypto)
+    endif()
+endif()
+
 ########################################################################
 # Find build dependencies
 ########################################################################
--- a/include/crc.h
+++ b/include/crc.h
@ -1,48 +0,0 @@
-#ifndef __CRC_H
-#define __CRC_H
-
-/* CRC16 CCIT 'false' table, similar to what can be found in the Kernel */
-static const uint16_t ccitt_hash[] = {
-	0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7,
-	0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef,
-	0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6,
-	0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de,
-	0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485,
-	0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d,
-	0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4,
-	0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc,
-	0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823,
-	0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b,
-	0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12,
-	0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a,
-	0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41,
-	0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49,
-	0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70,
-	0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78,
-	0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f,
-	0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067,
-	0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e,
-	0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256,
-	0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d,
-	0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405,
-	0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c,
-	0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634,
-	0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab,
-	0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3,
-	0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a,
-	0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92,
-	0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9,
-	0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1,
-	0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8,
-	0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0,
-};
-
-uint16_t crc16_ccitt(const uint8_t* buffer, size_t size)
-{
-	uint16_t crc = 0xffff;
-	while (size-- > 0)
-		crc = (crc << 8) ^ ccitt_hash[((crc >> 8) ^ *(buffer++)) & 0x00ff];
-
-	return crc;
-}
-#endif
--- a/include/crc16speed.h
+++ b/include/crc16speed.h
@ -0,0 +1,20 @@
+#ifndef CRC16SPEED_H
+#define CRC16SPEED_H
+#include "crcspeed.h"
+#include "stdbool.h"
+
+/* Does not require init */
+uint16_t crc16(uint16_t crc, const void *data, const uint64_t len);
+void crc16speed_cache_table(void);
+
+/* All other crc functions here require _init() before usage. */
+bool crc16speed_init(void);
+uint16_t crc16_lookup(uint16_t crc, const void *in_data, const uint64_t len);
+uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+
+bool crc16speed_init_big(void);
+uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l);
+
+bool crc16speed_init_native(void);
+uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l);
+#endif
--- a/include/crc_simd.h
+++ b/include/crc_simd.h
@ -0,0 +1,28 @@
+/* CRC SIMD Acceleration Header
+ * Copyright (c) 2024
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * SIMD-accelerated CRC computation using:
+ * - PCLMULQDQ on x86/x64 (Intel/AMD)
+ * - PMULL on ARM64/NEON (Apple Silicon, ARM servers)
+ */
+
+#ifndef CRC_SIMD_H
+#define CRC_SIMD_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Feature detection */
+bool crc_simd_available(void);
+
+/* CRC64 SIMD functions - use Jones polynomial 0xad93d23594c935a9 */
+void crc64_simd_init(void);
+uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len);
+
+/* CRC16 SIMD functions - use CRC-16-CCITT polynomial 0x1021 */
+void crc16_simd_init(void);
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len);
+
+#endif /* CRC_SIMD_H */
--- a/include/crcspeed.h
+++ b/include/crcspeed.h
@ -0,0 +1,60 @@
+/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#ifndef CRCSPEED_H
+#define CRCSPEED_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+typedef uint64_t (*crcfn64)(uint64_t, const void *, const uint64_t);
+typedef uint16_t (*crcfn16)(uint16_t, const void *, const uint64_t);
+
+/* CRC-64 */
+void crcspeed64little_init(crcfn64 fn, uint64_t table[8][256]);
+void crcspeed64big_init(crcfn64 fn, uint64_t table[8][256]);
+void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]);
+
+uint64_t crcspeed64little(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len);
+uint64_t crcspeed64big(uint64_t table[8][256], uint64_t crc, void *buf,
+                       size_t len);
+uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len);
+
+/* CRC-16 */
+void crcspeed16little_init(crcfn16 fn, uint16_t table[8][256]);
+void crcspeed16big_init(crcfn16 fn, uint16_t table[8][256]);
+void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]);
+
+uint16_t crcspeed16little(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len);
+uint16_t crcspeed16big(uint16_t table[8][256], uint16_t crc, void *buf,
+                       size_t len);
+uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len);
+#endif
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -18,7 +18,7 @@
 ########################################################################
 # Setup shared library variant
 ########################################################################
-add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c)
+add_library(hsdaoh SHARED libhsdaoh.c format_convert.c iqconverter_float.c crcspeed.c crc16speed.c crc_simd.c)
 target_link_libraries(hsdaoh ${LIBUSB_LIBRARIES} ${LIBUVC_LIBRARIES} ${THREADS_PTHREADS_LIBRARY})
 target_include_directories(hsdaoh PUBLIC
  $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
@ -86,7 +86,6 @@ set(INSTALL_TARGETS hsdaoh hsdaoh_static hsdaoh_file hsdaoh_tcp hsdaoh_test)

 target_link_libraries(hsdaoh_file hsdaoh
    ${LIBFLAC_LIBRARIES}
-#    ${LIBUSB_LIBRARIES}
    ${CMAKE_THREAD_LIBS_INIT}
 )
 target_link_libraries(hsdaoh_tcp hsdaoh
--- a/src/crc16speed.c
+++ b/src/crc16speed.c
@ -0,0 +1,213 @@
+/* Copyright (c) 2014, Matt Stancliff <matt@genges.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of Redis nor the names of its contributors may be used
+ *     to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include "crc16speed.h"
+
+/* If CRCSPEED16_DUAL is defined, we allow calls to
+ * both _little and _big CRC.
+ * By default, we only allow one endianness to be used
+ * and the first call to either _init function will set the
+ * lookup table endianness for the life of this module.
+ * We don't enable dual lookups by default because
+ * each 8x256 lookup table is 4k. */
+#ifndef CRC16SPEED_DUAL
+static uint16_t crc16_table[8][256] = {{0}};
+static void *crc16_table_little = NULL, *crc16_table_big = NULL;
+static const bool dual = false;
+#else
+static uint16_t crc16_table_little[8][256] = {{0}};
+static uint16_t crc16_table_big[8][256] = {{0}};
+static void *crc16_table = NULL;
+static const bool dual = true;
+#endif
+
+/* value of crc16_table[0][1], architecture dependent. */
+#define LITTLE1 UINT16_C(0x1021)
+#define BIG1 UINT16_C(0x2110)
+
+/* Define CRC16SPEED_SAFE if you want runtime checks to stop
+ * CRCs from being calculated by uninitialized tables (and also stop tables
+ * from being initialized more than once). */
+#ifdef CRC16SPEED_SAFE
+#define should_init(table, val)                                                \
+    do {                                                                       \
+        if ((table)[0][1] == (val))                                            \
+            return false;                                                      \
+    } while (0)
+#define check_init(table, val)                                                 \
+    do {                                                                       \
+        if ((table)[0][1] != (val))                                            \
+            return false;                                                      \
+    } while (0)
+#else
+#define should_init(a, b)
+#define check_init(a, b)
+#endif
+
+/* This is CRC-16-CCITT (non-reflected poly, non-inverted input/output).
+ * crc16() is only used to bootstrap an initial 256-entry lookup table. */
+#define POLY 0x1021
+uint16_t crc16(uint16_t crc, const void *in_data, uint64_t len) {
+    const uint8_t *data = in_data;
+    for (uint64_t i = 0; i < len; i++) {
+        crc = crc ^ (data[i] << 8);
+        for (int j = 0; j < 8; j++) {
+            if (crc & 0x8000) {
+                crc = (crc << 1) ^ POLY;
+            } else {
+                crc = (crc << 1);
+            }
+        }
+    }
+
+    return crc;
+}
+
+/* Only for testing; doesn't support DUAL */
+uint16_t crc16_lookup(uint16_t crc, const void *in_data, uint64_t len) {
+    const uint8_t *data = in_data;
+    for (uint64_t i = 0; i < len; i++) {
+        crc = (crc << 8) ^ crc16_table[0][((crc >> 8) ^ data[i]) & 0x00ff];
+    }
+
+    return crc;
+}
+
+/* Returns false if CRC16SPEED_SAFE and table already initialized. */
+bool crc16speed_init(void) {
+#ifndef CRC16SPEED_DUAL
+    should_init(crc16_table, LITTLE1);
+#else
+    should_init(crc16_table_little, LITTLE1);
+#endif
+    crcspeed16little_init(crc16, dual ? crc16_table_little : crc16_table);
+    return true;
+}
+
+/* Returns false if CRC16SPEED_SAFE and table already initialized. */
+bool crc16speed_init_big(void) {
+#ifndef CRC16SPEED_DUAL
+    should_init(crc16_table, BIG1);
+#else
+    should_init(crc16_table_big, BIG1);
+#endif
+    crcspeed16big_init(crc16, dual ? crc16_table_big : crc16_table);
+    return true;
+}
+
+uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l) {
+/* Quickly check if CRC table is initialized to little endian correctly. */
+#ifndef CRC16SPEED_DUAL
+    check_init(crc16_table, LITTLE1);
+#else
+    check_init(crc16_table_little, LITTLE1);
+#endif
+    return crcspeed16little(dual ? crc16_table_little : crc16_table, crc,
+                            (void *)s, l);
+}
+
+uint16_t crc16speed_big(uint16_t crc, const void *s, const uint64_t l) {
+/* Quickly check if CRC table is initialized to big endian correctly. */
+#ifndef CRC16SPEED_DUAL
+    check_init(crc16_table, BIG1);
+#else
+    check_init(crc16_table_big, BIG1);
+#endif
+    return crcspeed16big(dual ? crc16_table_big : crc16_table, crc, (void *)s,
+                         l);
+}
+
+bool crc16speed_init_native(void) {
+    const uint64_t n = 1;
+    return *(char *)&n ? crc16speed_init() : crc16speed_init_big();
+}
+
+/* If you are on a platform where endianness can change at runtime, this
+ * will break unless you compile with CRC16SPEED_DUAL and manually run
+ * _init() and _init_big() instead of using _init_native() */
+uint16_t crc16speed_native(uint16_t crc, const void *s, const uint64_t l) {
+    const uint64_t n = 1;
+    return *(char *)&n ? crc16speed(crc, s, l) : crc16speed_big(crc, s, l);
+}
+
+/* Iterate over table to fully load it into a cache near the CPU. */
+void crc16speed_cache_table(void) {
+    volatile uint16_t m;
+    for (int i = 0; i < 8; ++i) {
+        for (int j = 0; j < 256; ++j) {
+#ifndef CRC16SPEED_DUAL
+            m = crc16_table[i][j];
+#else
+            m = crc16_table_little[i][j];
+            m += crc16_table_big[i][j];
+#endif
+        }
+    }
+    (void)m; /* Suppress unused variable warning */
+}
+
+/* Test main */
+#if defined(CRCSPEED_TEST) || defined(CRCSPEED_TEST_MAIN)
+#include <stdio.h>
+#include <stdlib.h>
+
+#define UNUSED(x) (void)(x)
+int crc16Test(int argc, char *argv[]) {
+    UNUSED(argc);
+    UNUSED(argv);
+    crc16speed_init();
+    printf("[calcula]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16(0, "123456789", 9));
+    printf("[lookupt]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16_lookup(0, "123456789", 9));
+    printf("[16speed]: 31c3 == %04" PRIx64 "\n",
+           (uint64_t)crc16speed(0, "123456789", 9));
+    char li[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed "
+                "do eiusmod tempor incididunt ut labore et dolore magna "
+                "aliqua. Ut enim ad minim veniam, quis nostrud exercitation "
+                "ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis "
+                "aute irure dolor in reprehenderit in voluptate velit esse "
+                "cillum dolore eu fugiat nulla pariatur. Excepteur sint "
+                "occaecat cupidatat non proident, sunt in culpa qui officia "
+                "deserunt mollit anim id est laborum.";
+    printf("[calcula]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16(0, li, sizeof(li)));
+    printf("[lookupt]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16_lookup(0, li, sizeof(li)));
+    printf("[16speed]: 4b20 == %04" PRIx64 "\n",
+           (uint64_t)crc16speed(0, li, sizeof(li)));
+    return 0;
+}
+
+#endif
+
+#ifdef CRCSPEED_TEST_MAIN
+int main(int argc, char *argv[]) {
+    return crc16Test(argc, argv);
+}
+
+#endif
--- a/src/crc_simd.c
+++ b/src/crc_simd.c
@ -0,0 +1,456 @@
+/* CRC SIMD Acceleration Implementation
+ * Copyright (c) 2024
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * SIMD-accelerated CRC computation using PCLMULQDQ (x86) or PMULL (ARM64)
+ * Based on Intel's "Fast CRC Computation for Generic Polynomials Using
+ * PCLMULQDQ Instruction" whitepaper and Intel ISA-L implementation.
+ *
+ * Supports:
+ * - CRC64-Jones: polynomial 0xad93d23594c935a9 (reflected)
+ * - CRC16-CCITT: polynomial 0x1021 (non-reflected)
+ *
+ * References:
+ * - Intel whitepaper: Fast CRC Computation for Generic Polynomials
+ * - Intel ISA-L: https://github.com/intel/isa-l
+ */
+
+#include "crc_simd.h"
+#include <string.h>
+
+/* ============================================================================
+ * Architecture Detection and Feature Testing
+ * ============================================================================
+ */
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||             \
+    defined(_M_IX86)
+#define ARCH_X86 1
+#if defined(__GNUC__) || defined(__clang__)
+#include <cpuid.h>
+#endif
+#include <immintrin.h>
+#include <wmmintrin.h>
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define ARCH_ARM64 1
+#include <arm_neon.h>
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__APPLE__)
+#define HAS_PMULL 1
+#include <arm_acle.h>
+#endif
+#endif
+
+static bool g_simd_checked = false;
+static bool g_simd_available = false;
+
+bool crc_simd_available(void) {
+    if (g_simd_checked) {
+        return g_simd_available;
+    }
+    g_simd_checked = true;
+
+#if defined(ARCH_X86)
+#if defined(__GNUC__) || defined(__clang__)
+    unsigned int eax, ebx, ecx, edx;
+    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+        /* Check for PCLMULQDQ (bit 1 of ECX) and SSE4.1 (bit 19 of ECX) */
+        g_simd_available = (ecx & (1 << 1)) && (ecx & (1 << 19));
+    }
+#endif
+#elif defined(ARCH_ARM64)
+#if defined(__APPLE__)
+    /* Apple Silicon always has PMULL */
+    g_simd_available = true;
+#elif defined(HAS_PMULL)
+    g_simd_available = true;
+#else
+    g_simd_available = false;
+#endif
+#else
+    g_simd_available = false;
+#endif
+
+    return g_simd_available;
+}
+
+/* ============================================================================
+ * CRC64 Jones Polynomial: 0xad93d23594c935a9 (reflected)
+ *
+ * Constants from Intel ISA-L crc64_jones_refl_by8.asm
+ * ============================================================================
+ */
+
+/* 16-byte folding constants */
+#define CRC64_RK1 UINT64_C(0x381d0015c96f4444)
+#define CRC64_RK2 UINT64_C(0xd9d7be7d505da32c)
+
+/* 128-byte folding constants */
+#define CRC64_RK3 UINT64_C(0x768361524d29ed0b)
+#define CRC64_RK4 UINT64_C(0xcc26fa7c57f8054c)
+
+/* 8-way reduction constants */
+#define CRC64_RK9 UINT64_C(0x5bc94ba8e2087636)
+#define CRC64_RK10 UINT64_C(0x6cf09c8f37710b75)
+#define CRC64_RK11 UINT64_C(0x3885fd59e440d95a)
+#define CRC64_RK12 UINT64_C(0xbccba3936411fb7e)
+#define CRC64_RK13 UINT64_C(0xe4dd0d81cbfce585)
+#define CRC64_RK14 UINT64_C(0xb715e37b96ed8633)
+#define CRC64_RK15 UINT64_C(0xf49784a634f014e4)
+#define CRC64_RK16 UINT64_C(0xaf86efb16d9ab4fb)
+#define CRC64_RK17 UINT64_C(0x7b3211a760160db8)
+#define CRC64_RK18 UINT64_C(0xa062b2319d66692f)
+#define CRC64_RK19 UINT64_C(0xef3d1d18ed889ed2)
+#define CRC64_RK20 UINT64_C(0x6ba4d760ab38201e)
+
+/* ============================================================================
+ * CRC16-CCITT Polynomial: 0x1021 (non-reflected)
+ *
+ * Constants computed by gen_constants.c
+ * ============================================================================
+ */
+
+/* 16-byte folding constants */
+#define CRC16_K1 UINT64_C(0x10e2) /* x^144 mod P */
+#define CRC16_K2 UINT64_C(0xaefc) /* x^128 mod P */
+
+/* 64-byte folding constants */
+#define CRC16_K3 UINT64_C(0x78b3) /* x^528 mod P */
+#define CRC16_K4 UINT64_C(0x13fc) /* x^512 mod P */
+
+/* 8-way reduction constants */
+#define CRC16_RK9 UINT64_C(0x4347)  /* x^912 mod P */
+#define CRC16_RK10 UINT64_C(0xcbc5) /* x^896 mod P */
+#define CRC16_RK11 UINT64_C(0x9e3a) /* x^784 mod P */
+#define CRC16_RK12 UINT64_C(0x106f) /* x^768 mod P */
+#define CRC16_RK13 UINT64_C(0x9c1a) /* x^656 mod P */
+#define CRC16_RK14 UINT64_C(0xda35) /* x^640 mod P */
+#define CRC16_RK15 UINT64_C(0x78b3) /* x^528 mod P */
+#define CRC16_RK16 UINT64_C(0x13fc) /* x^512 mod P */
+#define CRC16_RK17 UINT64_C(0xbd64) /* x^400 mod P */
+#define CRC16_RK18 UINT64_C(0xcde2) /* x^384 mod P */
+#define CRC16_RK19 UINT64_C(0x8ddc) /* x^272 mod P */
+#define CRC16_RK20 UINT64_C(0x8e29) /* x^256 mod P */
+
+/* Barrett reduction constants */
+#define CRC16_MU UINT64_C(0x0000f0d3) /* floor(x^32 / P) */
+#define CRC16_POLY UINT64_C(0x1021)
+
+/* ============================================================================
+ * X86/X64 PCLMULQDQ Implementation
+ * ============================================================================
+ */
+
+#if defined(ARCH_X86)
+
+/* Helper: fold one 128-bit block for CRC64 (reflected) */
+static inline __m128i fold_128(__m128i acc, __m128i data, __m128i k1k2) {
+    __m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x00);
+    __m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x11);
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+/* Helper: byte-swap a 128-bit register (used for CRC16 non-reflected) */
+__attribute__((unused)) static inline __m128i bswap_128(__m128i x) {
+    const __m128i mask =
+        _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    return _mm_shuffle_epi8(x, mask);
+}
+
+/* Helper: fold one 128-bit block for CRC16 (non-reflected)
+ * For non-reflected CRC, we use different PCLMULQDQ immediates
+ * and the constants are positioned in the high 16 bits.
+ * Note: Currently unused as we fall back to crc16speed for CRC16 SIMD on x86.
+ */
+__attribute__((unused)) static inline __m128i
+fold_128_16(__m128i acc, __m128i data, __m128i k1k2) {
+    /* For non-reflected CRC16:
+     * - k1 is in high 64 bits, k2 in low 64 bits
+     * - We use 0x10 and 0x01 immediates (cross-multiply)
+     */
+    __m128i t1 = _mm_clmulepi64_si128(acc, k1k2, 0x10); /* acc_lo * k1 */
+    __m128i t2 = _mm_clmulepi64_si128(acc, k1k2, 0x01); /* acc_hi * k2 */
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+void crc64_simd_init(void) {
+    crc_simd_available();
+}
+
+/* ============================================================================
+ * CRC16 SIMD Implementation (x86/x64 with PCLMULQDQ)
+ *
+ * CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
+ *
+ * Non-reflected CRCs process data MSB-first, which requires:
+ * 1. Byte-swapping data to get correct byte order for PCLMULQDQ
+ * 2. Positioning CRC in high bits
+ * 3. Different folding constant interpretation
+ *
+ * This implementation uses the marzooqy/crc-clmul approach:
+ * - Data is byte-swapped (reversed within 128-bit blocks)
+ * - CRC goes in high 64 bits
+ * - For non-reflected: PCLMULQDQ multiplies acc_hi*k_hi and acc_lo*k_lo
+ * - Final reduction via table lookup
+ *
+ * Folding constants for CRC16-CCITT polynomial 0x1021:
+ * - x^192 mod P = 0x650b (for high 64-bit lane)
+ * - x^128 mod P = 0xaefc (for low 64-bit lane)
+ * ============================================================================
+ */
+
+/* CRC16-CCITT folding constants for 128-bit blocks */
+/* x^192 mod P (for high 64-bit lane) */
+#define CRC16_K_HI_X86 UINT64_C(0x650b)
+
+/* x^128 mod P (for low 64-bit lane) */
+#define CRC16_K_LO_X86 UINT64_C(0xaefc)
+
+/* Byte-swap 128-bit register: reverse all 16 bytes */
+static inline __m128i crc16_bswap_128_x86(__m128i x) {
+    const __m128i mask =
+        _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm_shuffle_epi8(x, mask);
+}
+
+/* Fold helper for non-reflected CRC16 on x86
+ * For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
+ * Constants packed as: k_hi in high 64 bits, k_lo in low 64 bits
+ */
+static inline __m128i crc16_fold_x86(__m128i acc, __m128i data, __m128i k) {
+    /* Non-reflected folding:
+     * - acc_hi (high 64) gets multiplied by k_hi (high 64) -> use 0x11
+     * - acc_lo (low 64) gets multiplied by k_lo (low 64) -> use 0x00
+     */
+    __m128i t1 = _mm_clmulepi64_si128(acc, k, 0x11); /* acc_hi * k_hi */
+    __m128i t2 = _mm_clmulepi64_si128(acc, k, 0x00); /* acc_lo * k_lo */
+    return _mm_xor_si128(_mm_xor_si128(t1, t2), data);
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    const uint8_t *buf = (const uint8_t *)data;
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+
+    /* Use table-based for small buffers - SIMD overhead not worth it */
+    if (!crc_simd_available() || len < 64) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Process 16-byte blocks using SIMD folding */
+    uint64_t blocks = len / 16;
+    if (blocks < 2) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Constants: k_hi in high 64 bits, k_lo in low 64 bits */
+    const __m128i k = _mm_set_epi64x(CRC16_K_HI_X86, CRC16_K_LO_X86);
+
+    /* Load and byte-swap first 16 bytes */
+    __m128i acc = crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
+
+    /* XOR initial CRC into high 16 bits of high 64-bit lane
+     * For non-reflected CRC, the CRC value is conceptually at the MSB end
+     */
+    uint64_t crc_shifted = (uint64_t)crc << 48;
+    __m128i crc_vec = _mm_set_epi64x(crc_shifted, 0);
+    acc = _mm_xor_si128(acc, crc_vec);
+
+    buf += 16;
+    blocks--;
+
+    /* Main folding loop */
+    while (blocks > 0) {
+        /* Load and byte-swap next 16 bytes */
+        __m128i data_vec =
+            crc16_bswap_128_x86(_mm_loadu_si128((const __m128i *)buf));
+
+        /* Fold accumulator with new data */
+        acc = crc16_fold_x86(acc, data_vec, k);
+
+        buf += 16;
+        blocks--;
+    }
+
+    /* Store folded result (byte-swap back to original order) */
+    __m128i acc_swapped = crc16_bswap_128_x86(acc);
+
+    uint8_t final_buf[32];
+    _mm_storeu_si128((__m128i *)final_buf, acc_swapped);
+
+    /* Process remaining bytes through table lookup */
+    uint64_t remaining = len % 16;
+    if (remaining > 0) {
+        memcpy(final_buf + 16, buf, remaining);
+        return crc16speed(0, final_buf, 16 + remaining);
+    } else {
+        return crc16speed(0, final_buf, 16);
+    }
+}
+
+void crc16_simd_init(void) {
+    crc_simd_available();
+}
+
+#elif defined(ARCH_ARM64) && defined(HAS_PMULL)
+
+/* ============================================================================
+ * ARM64 NEON/PMULL Implementation
+ * ============================================================================
+ */
+
+static inline uint64x2_t fold_128_neon(uint64x2_t acc, uint64x2_t data,
+                                       uint64_t k1, uint64_t k2) {
+    poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 0), k2);
+    poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 1), k1);
+    uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
+    uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
+    return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
+}
+
+/* Byte-swap for NEON - used for non-reflected CRCs (currently unused, kept for
+ * reference) */
+#if 0
+static inline uint8x16_t bswap_128_neon(uint8x16_t x) {
+    return vrev64q_u8(x);
+}
+#endif
+
+void crc64_simd_init(void) {
+    crc_simd_available();
+}
+
+/* ============================================================================
+ * CRC16 SIMD Implementation for ARM64
+ *
+ * CRC16-CCITT (polynomial 0x1021) is a non-reflected CRC.
+ *
+ * Non-reflected CRCs process data MSB-first, which requires:
+ * 1. Byte-swapping data to get correct byte order for PMULL
+ * 2. Positioning CRC in high bits
+ * 3. Different folding constant interpretation
+ *
+ * This implementation uses the marzooqy/crc-clmul approach:
+ * - Data is byte-swapped (reversed within 128-bit blocks)
+ * - CRC goes in high 64 bits: SET(crc << 48, 0)
+ * - For non-reflected: PMULL immediates swap acc_hi*k1 and acc_lo*k2
+ * - Final reduction via table lookup
+ *
+ * Folding constants for CRC16-CCITT polynomial 0x1021:
+ * - x^192 mod P = 0x650b (for high 64-bit lane)
+ * - x^128 mod P = 0xaefc (for low 64-bit lane)
+ * ============================================================================
+ */
+
+/* CRC16-CCITT folding constants for 128-bit blocks */
+#define CRC16_K_HI UINT64_C(0x650b) /* x^192 mod P (for high 64-bit lane) */
+#define CRC16_K_LO UINT64_C(0xaefc) /* x^128 mod P (for low 64-bit lane) */
+
+/* Byte-swap 128 bits: reverse all 16 bytes */
+static inline uint8x16_t crc16_bswap_128(uint8x16_t x) {
+    /* Reverse bytes within each 64-bit lane, then swap lanes */
+    uint8x16_t rev = vrev64q_u8(x);
+    return vcombine_u8(vget_high_u8(rev), vget_low_u8(rev));
+}
+
+/* Fold helper for non-reflected CRC16
+ * For non-reflected: we multiply acc_hi by k_hi, acc_lo by k_lo
+ * This is different from reflected where we use 0x00 and 0x11 immediates
+ */
+static inline uint64x2_t crc16_fold_neon(uint64x2_t acc, uint64x2_t data,
+                                         uint64_t k_hi, uint64_t k_lo) {
+    /* Non-reflected folding:
+     * - acc_hi (lane 1) gets multiplied by k_hi
+     * - acc_lo (lane 0) gets multiplied by k_lo
+     */
+    poly128_t t1 = vmull_p64(vgetq_lane_u64(acc, 1), k_hi); /* acc_hi * k_hi */
+    poly128_t t2 = vmull_p64(vgetq_lane_u64(acc, 0), k_lo); /* acc_lo * k_lo */
+    uint64x2_t t1_vec = vreinterpretq_u64_p128(t1);
+    uint64x2_t t2_vec = vreinterpretq_u64_p128(t2);
+    return veorq_u64(veorq_u64(t1_vec, t2_vec), data);
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+    const uint8_t *buf = (const uint8_t *)data;
+
+    /* Use table-based for small buffers - SIMD overhead not worth it */
+    if (!crc_simd_available() || len < 64) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Process 16-byte blocks using SIMD folding */
+    uint64_t blocks = len / 16;
+    if (blocks < 2) {
+        return crc16speed(crc, data, len);
+    }
+
+    /* Load and byte-swap first 16 bytes */
+    uint8x16_t data8 = vld1q_u8(buf);
+    data8 = crc16_bswap_128(data8);
+    uint64x2_t acc = vreinterpretq_u64_u8(data8);
+
+    /* XOR initial CRC into high 16 bits of high 64-bit lane
+     * For non-reflected CRC, the CRC value is conceptually at the MSB end
+     */
+    uint64_t crc_shifted = (uint64_t)crc << 48;
+    uint64x2_t crc_vec = vsetq_lane_u64(crc_shifted, vdupq_n_u64(0), 1);
+    acc = veorq_u64(acc, crc_vec);
+
+    buf += 16;
+    blocks--;
+
+    /* Main folding loop */
+    while (blocks > 0) {
+        /* Load and byte-swap next 16 bytes */
+        data8 = vld1q_u8(buf);
+        data8 = crc16_bswap_128(data8);
+        uint64x2_t data_vec = vreinterpretq_u64_u8(data8);
+
+        /* Fold accumulator with new data */
+        acc = crc16_fold_neon(acc, data_vec, CRC16_K_HI, CRC16_K_LO);
+
+        buf += 16;
+        blocks--;
+    }
+
+    /* Store folded result (byte-swap back to original order) */
+    uint8x16_t acc8 = vreinterpretq_u8_u64(acc);
+    acc8 = crc16_bswap_128(acc8);
+
+    uint8_t final_buf[32];
+    vst1q_u8(final_buf, acc8);
+
+    /* Process remaining bytes through table lookup */
+    uint64_t remaining = len % 16;
+    if (remaining > 0) {
+        memcpy(final_buf + 16, buf, remaining);
+        return crc16speed(0, final_buf, 16 + remaining);
+    } else {
+        return crc16speed(0, final_buf, 16);
+    }
+}
+
+void crc16_simd_init(void) {
+    crc_simd_available();
+}
+
+#else /* No SIMD support */
+
+bool crc_simd_available(void);
+
+uint64_t crc64_simd(uint64_t crc, const void *data, uint64_t len) {
+    extern uint64_t crc64speed(uint64_t crc, const void *s, const uint64_t l);
+    return crc64speed(crc, data, len);
+}
+
+void crc64_simd_init(void) {
+}
+
+uint16_t crc16_simd(uint16_t crc, const void *data, uint64_t len) {
+    extern uint16_t crc16speed(uint16_t crc, const void *s, const uint64_t l);
+    return crc16speed(crc, data, len);
+}
+
+void crc16_simd_init(void) {
+}
+
+#endif /* Architecture selection */
--- a/src/crcspeed.c
+++ b/src/crcspeed.c
@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2013 Mark Adler
+ * Originally by: crc64.c Version 1.4  16 Dec 2013  Mark Adler
+ * Modifications by Matt Stancliff <matt@genges.com>:
+ *   - removed CRC64-specific behavior
+ *   - added generation of lookup tables by parameters
+ *   - removed inversion of CRC input/result
+ *   - removed automatic initialization in favor of explicit initialization
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the author be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Mark Adler
+  madler@alumni.caltech.edu
+ */
+
+#include "crcspeed.h"
+
+/* Fill in a CRC constants table. */
+void crcspeed64little_init(crcfn64 crcfn, uint64_t table[8][256]) {
+    uint64_t crc;
+
+    /* generate CRCs for all single byte sequences */
+    for (int n = 0; n < 256; n++) {
+        table[0][n] = crcfn(0, &n, 1);
+    }
+
+    /* generate nested CRC table for future slice-by-8 lookup */
+    for (int n = 0; n < 256; n++) {
+        crc = table[0][n];
+        for (int k = 1; k < 8; k++) {
+            crc = table[0][crc & 0xff] ^ (crc >> 8);
+            table[k][n] = crc;
+        }
+    }
+}
+
+void crcspeed16little_init(crcfn16 crcfn, uint16_t table[8][256]) {
+    uint16_t crc;
+
+    /* generate CRCs for all single byte sequences */
+    for (int n = 0; n < 256; n++) {
+        table[0][n] = crcfn(0, &n, 1);
+    }
+
+    /* generate nested CRC table for future slice-by-8 lookup */
+    for (int n = 0; n < 256; n++) {
+        crc = table[0][n];
+        for (int k = 1; k < 8; k++) {
+            crc = table[0][(crc >> 8) & 0xff] ^ (crc << 8);
+            table[k][n] = crc;
+        }
+    }
+}
+
+/* Reverse the bytes in a 64-bit word. */
+static inline uint64_t rev8(uint64_t a) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap64(a);
+#else
+    uint64_t m;
+
+    m = UINT64_C(0xff00ff00ff00ff);
+    a = ((a >> 8) & m) | (a & m) << 8;
+    m = UINT64_C(0xffff0000ffff);
+    a = ((a >> 16) & m) | (a & m) << 16;
+    return a >> 32 | a << 32;
+#endif
+}
+
+/* This function is called once to initialize the CRC table for use on a
+   big-endian architecture. */
+void crcspeed64big_init(crcfn64 fn, uint64_t big_table[8][256]) {
+    /* Create the little endian table then reverse all the entires. */
+    crcspeed64little_init(fn, big_table);
+    for (int k = 0; k < 8; k++) {
+        for (int n = 0; n < 256; n++) {
+            big_table[k][n] = rev8(big_table[k][n]);
+        }
+    }
+}
+
+void crcspeed16big_init(crcfn16 fn, uint16_t big_table[8][256]) {
+    /* Create the little endian table then reverse all the entires. */
+    crcspeed16little_init(fn, big_table);
+    for (int k = 0; k < 8; k++) {
+        for (int n = 0; n < 256; n++) {
+            big_table[k][n] = rev8(big_table[k][n]);
+        }
+    }
+}
+
+/* Calculate a non-inverted CRC multiple bytes at a time on a little-endian
+ * architecture. If you need inverted CRC, invert *before* calling and invert
+ * *after* calling.
+ * 64 bit crc = process 8 bytes at once;
+ */
+uint64_t crcspeed64little(uint64_t little_table[8][256], uint64_t crc,
+                          void *buf, size_t len) {
+    unsigned char *next = buf;
+
+    /* process individual bytes until we reach an 8-byte aligned pointer */
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    /* fast middle processing, 8 bytes (aligned!) per loop */
+    /* clang-format off */
+    while (len >= 8) {
+        crc ^= *(uint64_t *)next;
+        crc = little_table[7][crc & 0xff] ^
+              little_table[6][(crc >> 8) & 0xff] ^
+              little_table[5][(crc >> 16) & 0xff] ^
+              little_table[4][(crc >> 24) & 0xff] ^
+              little_table[3][(crc >> 32) & 0xff] ^
+              little_table[2][(crc >> 40) & 0xff] ^
+              little_table[1][(crc >> 48) & 0xff] ^
+              little_table[0][crc >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    /* process remaining bytes (can't be larger than 8) */
+    while (len) {
+        crc = little_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    return crc;
+}
+
+uint16_t crcspeed16little(uint16_t little_table[8][256], uint16_t crc,
+                          void *buf, size_t len) {
+    unsigned char *next = buf;
+
+    /* process individual bytes until we reach an 8-byte aligned pointer */
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
+        len--;
+    }
+
+    /* fast middle processing, 8 bytes (aligned!) per loop */
+    /* clang-format off */
+    while (len >= 8) {
+        uint64_t n = *(uint64_t *)next;
+        crc = little_table[7][(n & 0xff) ^ ((crc >> 8) & 0xff)] ^
+              little_table[6][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
+              little_table[5][(n >> 16) & 0xff] ^
+              little_table[4][(n >> 24) & 0xff] ^
+              little_table[3][(n >> 32) & 0xff] ^
+              little_table[2][(n >> 40) & 0xff] ^
+              little_table[1][(n >> 48) & 0xff] ^
+              little_table[0][n >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    /* process remaining bytes (can't be larger than 8) */
+    while (len) {
+        crc = little_table[0][((crc >> 8) ^ *next++) & 0xff] ^ (crc << 8);
+        len--;
+    }
+
+    return crc;
+}
+
+/* Calculate a non-inverted CRC eight bytes at a time on a big-endian
+ * architecture.
+ */
+uint64_t crcspeed64big(uint64_t big_table[8][256], uint64_t crc, void *buf,
+                       size_t len) {
+    unsigned char *next = buf;
+
+    crc = rev8(crc);
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
+        len--;
+    }
+
+    /* clang-format off */
+    while (len >= 8) {
+        crc ^= *(uint64_t *)next;
+        crc = big_table[0][crc & 0xff] ^
+              big_table[1][(crc >> 8) & 0xff] ^
+              big_table[2][(crc >> 16) & 0xff] ^
+              big_table[3][(crc >> 24) & 0xff] ^
+              big_table[4][(crc >> 32) & 0xff] ^
+              big_table[5][(crc >> 40) & 0xff] ^
+              big_table[6][(crc >> 48) & 0xff] ^
+              big_table[7][crc >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    while (len) {
+        crc = big_table[0][(crc >> 56) ^ *next++] ^ (crc << 8);
+        len--;
+    }
+
+    return rev8(crc);
+}
+
+/* WARNING: Completely untested on big endian architecture.  Possibly broken. */
+uint16_t crcspeed16big(uint16_t big_table[8][256], uint16_t crc_in, void *buf,
+                       size_t len) {
+    unsigned char *next = buf;
+    uint64_t crc = crc_in;
+
+    crc = rev8(crc);
+    while (len && ((uintptr_t)next & 7) != 0) {
+        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    /* clang-format off */
+    while (len >= 8) {
+        uint64_t n = *(uint64_t *)next;
+        crc = big_table[0][(n & 0xff) ^ ((crc >> (56 - 8)) & 0xff)] ^
+              big_table[1][((n >> 8) & 0xff) ^ (crc & 0xff)] ^
+              big_table[2][(n >> 16) & 0xff] ^
+              big_table[3][(n >> 24) & 0xff] ^
+              big_table[4][(n >> 32) & 0xff] ^
+              big_table[5][(n >> 40) & 0xff] ^
+              big_table[6][(n >> 48) & 0xff] ^
+              big_table[7][n >> 56];
+        next += 8;
+        len -= 8;
+    }
+    /* clang-format on */
+
+    while (len) {
+        crc = big_table[0][((crc >> (56 - 8)) ^ *next++) & 0xff] ^ (crc >> 8);
+        len--;
+    }
+
+    return rev8(crc);
+}
+
+/* Return the CRC of buf[0..len-1] with initial crc, processing eight bytes
+   at a time using passed-in lookup table.
+   This selects one of two routines depending on the endianess of
+   the architecture. */
+uint64_t crcspeed64native(uint64_t table[8][256], uint64_t crc, void *buf,
+                          size_t len) {
+    uint64_t n = 1;
+
+    return *(char *)&n ? crcspeed64little(table, crc, buf, len)
+                       : crcspeed64big(table, crc, buf, len);
+}
+
+uint16_t crcspeed16native(uint16_t table[8][256], uint16_t crc, void *buf,
+                          size_t len) {
+    uint64_t n = 1;
+
+    return *(char *)&n ? crcspeed16little(table, crc, buf, len)
+                       : crcspeed16big(table, crc, buf, len);
+}
+
+/* Initialize CRC lookup table in architecture-dependent manner. */
+void crcspeed64native_init(crcfn64 fn, uint64_t table[8][256]) {
+    uint64_t n = 1;
+
+    *(char *)&n ? crcspeed64little_init(fn, table)
+                : crcspeed64big_init(fn, table);
+}
+
+void crcspeed16native_init(crcfn16 fn, uint16_t table[8][256]) {
+    uint64_t n = 1;
+
+    *(char *)&n ? crcspeed16little_init(fn, table)
+                : crcspeed16big_init(fn, table);
+}
--- a/src/libhsdaoh.c
+++ b/src/libhsdaoh.c
@ -47,7 +47,8 @@
 #include <hsdaoh.h>
 #include <hsdaoh_private.h>
 #include <format_convert.h>
-#include <crc.h>
+#include <crc16speed.h>
+#include <crc_simd.h>

 #define DEFAULT_BUFFERS 96

@ -493,6 +494,9 @@ int hsdaoh_open(hsdaoh_dev_t **out_dev, uint32_t index)
 	dev->cnv_f1 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);
 	dev->cnv_f2 = iqconverter_float_create(HB_KERNEL_FLOAT, HB_KERNEL_FLOAT_LEN);

+	crc16speed_init();
+	crc16_simd_init();
+
 found:
 	*out_dev = dev;

@ -771,7 +775,7 @@ void hsdaoh_process_frame(hsdaoh_dev_t *dev, uint8_t *data, int size)
 				frame_errors++;

 			dev->last_crc[1] = dev->last_crc[0];
-			dev->last_crc[0] = crc16_ccitt(line_dat, dev->width * sizeof(uint16_t));
+			dev->last_crc[0] = crc16_simd(0xffff, line_dat, dev->width * sizeof(uint16_t));
 		}

 		if ((payload_len > 0) && dev->stream_synced) {