title:ARM64 assembly crc32 keywords:assembly,m1,arm64,macos,linux # ARM64 assembly crc32 ## Intro Latest ARM64 processors support hardware crc32 instruction, since architecture version __ARMv8.1-A__ . Raspbery pi have architecture version __ARMv8-A__ version and Apple M1 __ARMv8.4-A__. Both of those have hardware accelerated crc32 instruction. In example is shown how to use software and hardware implementation of that functionality for macos and linux. ## Verify architecture Before running any of this code, check that archirecture is correct in case of Apple M1 its easiest to do. With Raspberry Pi 4 need to run it with 64bit linux. ### Raspberry Pi 4 Run command ```bash uname -a Linux raspberrypi 5.4.42-v8+ #1319 SMP PREEMPT Wed May 20 14:18:56 BST 2020 aarch64 GNU/Linux ``` There is substring "aarch64" that indicates that os supports 64 bits To check if CPU supports crc32 instructions run ```bash cat /proc/cpuinfo | grep crc ``` search for "crc32" substring ### Apple M1 If you have Apple M1 then you already know it. If not sure run one of those commands to verify archirecture ```bash uname -a ``` or ```bash arch ``` in output string should be "arm64" ## ARM64 crc32 instructions Wiki have table with all different kind of implementation of crc32 https://en.wikipedia.org/wiki/Cyclic_redundancy_check arm64 crc32 implementation reffered in wiki table as __CRC-32__. There is support also for __CRC-32C__ variant. There is 4 types of crc32 instruction for 1,2,4,8 byte of data that are marked with postfix b,h,w,x. Wraped up examples of instructions: ```c //crc32 #define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) //crc32c #define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) ``` https://android.googlesource.com/platform/external/linux-kselftest/+/d97034ccdf0a13ad86f00945df245bbaf0780478/arch/arm64/crypto/crc32-arm64.c ## Software implementation Found crc32 software implementation as its helps verify HW instruction results. It verify that algorithm used is correct against CRC32 table in wiki and that there is no other quirks for using crc32. Short implementation of crc32 with initial table generation http://home.thep.lu.se/~bjorn/crc/ There is implementation from BSD kernel where crc32 value tables are precalculated. https://web.mit.edu/freebsd/head/sys/libkern/crc32.c ```c uint32_t crc32_for_byte(uint32_t r) { for(int j = 0; j < 8; ++j) r = (r & 1? 0: (uint32_t)0xEDB88320L) ^ r >> 1; return r ^ (uint32_t)0xFF000000L; } void crc32(const void *data, size_t n_bytes, uint32_t* crc) { static uint32_t table[0x100]; if(!*table) for(size_t i = 0; i < 0x100; ++i) table[i] = crc32_for_byte(i); for(size_t i = 0; i < n_bytes; ++i) *crc = table[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8; } ``` ## HW and SW implementation differences when using assembly instruction please note that initial value of crc is __crc=0xffffffff__ and final result should be inverted __~crc__ to match software implementation where initial values is __crc=0x0__ and final result not need to be preprocessed. ## Source code Here is complete source code of software and hardware crc32, that runs over some array of data and compares results of 2 software implementations and hardware. ```c #include #include #include #include /* https://android.googlesource.com/platform/external/linux-kselftest/+/d97034ccdf0a13ad86f00945df245bbaf0780478/arch/arm64/crypto/crc32-arm64.c */ #define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) #define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) /* http://home.thep.lu.se/~bjorn/crc/ */ uint32_t crc32_for_byte(uint32_t r) { for(int j = 0; j < 8; ++j) r = (r & 1? 0: (uint32_t)0xEDB88320L) ^ r >> 1; return r ^ (uint32_t)0xFF000000L; } static uint32_t crc32_tab_ver1[0x100]; void crc32_ver1(const void *data, size_t n_bytes, uint32_t* crc) { static uint32_t table[0x100]; if(!*table) for(size_t i = 0; i < 0x100; ++i) crc32_tab_ver1[i] = crc32_for_byte(i); for(size_t i = 0; i < n_bytes; ++i) *crc = crc32_tab_ver1[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8; } /* https://web.mit.edu/freebsd/head/sys/libkern/crc32.c */ const uint32_t crc32_tab_ver2[] = { 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d }; uint32_t crc32_ver2(const void *buf, size_t size) { const uint8_t *p = buf; uint32_t crc; crc = ~0U; while (size--) crc = crc32_tab_ver2[(crc ^ *p++) & 0xFF] ^ (crc >> 8); return crc ^ ~0U; } static uint8_t test_data[] = {0xaa,0xbb,0xcc,0xdd,0xee,0xff,0x00,0x11}; int main() { size_t i=0; uint32_t crc_1=0x00000000, crc_2=0x00000000, crc_3=0xffffffff; printf("Initial values: SW1:%08x SW2:%08x HW:%08x\n",crc_1,crc_2,crc_3); for (i=0;i