Reputation: 13
I'm trying to implement a CRC-CCITT calculator in VHDL. I was able to initially do that; however, I recently found out that data is delivered starting at the least-significant byte. In my code, data is transmitted 7 bytes at a time through a frame. So let's say we have the following data: 123456789
in ASCII or 313233343536373839
in hex. The data would be transmitted as such (with the following CRC):
-- First frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second/last frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31", -- MSB
3 => xx, -- "xx" means irrelevant data, not part of CRC calculation.
4 => xx, -- This occurs only in the last frame, when it specified in
5 => xx, -- byte 0 which bytes contain data
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x31C3
Another example with data 0x4376669A1CFC048321313233343536373839
and its correct CRC is shown below:
-- First incoming frame of data
RxFrame.Data <= (
1 => x"39", -- LSB
2 => x"38",
3 => x"37",
4 => x"36",
5 => x"35",
6 => x"34",
7 => x"33"
);
-- Second incoming frame of data
RxFrame.Data <= (
1 => x"32",
2 => x"31",
3 => x"21",
4 => x"83",
5 => x"04",
6 => x"FC",
7 => x"1C"
);
-- Third/last incoming frame of data
RxFrame.Data <= (
1 => x"9A",
2 => x"66",
3 => x"76",
4 => x"43", -- MSB
5 => xx, -- Irrelevant data, specified in byte 0
6 => xx,
7 => xx
);
-- Calculated CRC should be 0x2848
Is there a concept I'm missing? Is there a way to calculate the CRC with the data being received in reverse order? I am implementing this for CANopen SDO block protocols. Thanks!
CRC calculation algorithm to verify SDO block transfer from CANopen standard
Upvotes: 1
Views: 843
Reputation: 28818
Example code to generate a CRC16 with the bytes read in reverse (last byte first), using a function to do a carryless multiply modulo the CRC polynomial. An explanation follows.
#include <stdio.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#define POLY (0x1021u)
/* carryless multiply modulo crc polynomial */
uint16_t MpyModPoly(uint16_t a, uint16_t b) /* (a*b)%poly */
{
uint16_t pd = 0;
uint16_t i;
for(i = 0; i < 16; i++){
/* assumes twos complement */
pd = (pd<<1)^((0-(pd>>15))&POLY);
pd ^= (0-(b>>15))&a;
b <<= 1;
}
return pd;
}
/* generate crc in reverse byte order */
uint16_t Crc16R(uint8_t * b, size_t sz)
{
uint8_t *e = b + sz; /* end of bfr ptr */
uint16_t crc = 0u; /* crc */
uint16_t pdm = 0x100u; /* padding multiplier */
while(e > b){ /* generate crc */
pdm = MpyModPoly(0x100, pdm);
crc ^= MpyModPoly( *--e, pdm);
}
return(crc);
}
/* msg will be processed in reverse order */
static uint8_t msg[] = {0x43,0x76,0x66,0x9A,0x1C,0xFC,0x04,0x83,
0x21,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
0x38,0x39};
int main()
{
uint16_t crc;
crc = Crc16R(msg, sizeof(msg));
printf("%04x\n", crc);
return 0;
}
Example code using X86 xmm pclmulqdq and psrlq, to emulate a 16 bit by 16 bit hardware (VHDL) carryless multiply:
/* __m128i is an intrinsic for X86 128 bit xmm register */
static __m128i poly = {.m128i_u32[0] = 0x00011021u}; /* poly */
static __m128i invpoly = {.m128i_u32[0] = 0x00008898u}; /* 2^31 / poly */
/* carryless multiply modulo crc polynomial */
/* using xmm pclmulqdq and psrlq */
uint16_t MpyModPoly(uint16_t a, uint16_t b)
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* mp = a*b */
mt = _mm_srli_epi64(mp, 16); /* mt = mp>>16 */
mt = _mm_clmulepi64_si128(mt, invpoly, 0x00); /* mt = mt*ipoly */
mt = _mm_srli_epi64(mt, 15); /* mt = mt>>15 = (a*b)/poly */
mt = _mm_clmulepi64_si128(mt, poly, 0x00); /* mt = mt*poly */
return mp.m128i_u16[0] ^ mt.m128i_u16[0]; /* ret mp^mt */
}
/* external code to generate invpoly */
#define POLY (0x11021u)
static __m128i invpoly; /* 2^31 / poly */
void GenMPoly(void) /* generate __m12i8 invpoly */
{
uint32_t N = 0x10000u; /* numerator = x^16 */
uint32_t Q = 0; /* quotient = 0 */
for(size_t i = 0; i <= 15; i++){ /* 31 - 16 = 15 */
Q <<= 1;
if(N&0x10000u){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
invpoly.m128i_u16[0] = Q;
}
Explanation: consider the data as separate strings of ever increasing length, padded with zeroes at the end. For the first few bytes of your example, the logic would calculate
CRC = CRC16({39})
CRC ^= CRC16({38 00})
CRC ^= CRC16({37 00 00})
CRC ^= CRC16({36 00 00 00})
...
To speed up this calculation, rather than actually pad with n zero bytes, you can do a carryless multiply of a CRC by 2^{n·8} modulo POLY, where POLY is the 17 bit polynomial used for CRC16:
CRC = CRC16({39})
CRC ^= (CRC16({38}) · (2^08 % POLY)) % POLY
CRC ^= (CRC16({37}) · (2^10 % POLY)) % POLY
CRC ^= (CRC16({36}) · (2^18 % POLY)) % POLY
...
A carryless multiply modulo POLY is equivalent to what CRC16 does, so this translates into pseudo code (all values in hex, 2^8 = 100)
CRC = 0
PDM = 100 ;padding multiplier
PDM = (100 · PDM) % POLY ;main loop (2 lines per byte)
CRC ^= ( 39 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 38 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 37 · PDM) % POLY
PDM = (100 · PDM) % POLY
CRC ^= ( 36 · PDM) % POLY
...
Implementing (A · B) % POLY is based on binary math:
(A · B) % POLY = (A · B) ^ (((A · B) / POLY) · POLY)
Where multiply is carryless (XOR instead of add) and divide is borrowless (XOR instead of subtract). Since the divide is borrowless, and most significant term of POLY is x^16, the quotient
Q = (A · B) / POLY
only depends on the upper 16 bits of (A · B). Dividing by POLY uses multiplication by the 16 bit constant IPOLY = (2^31)/POLY followed by a right shift:
Q = (A · B) / POLY = (((A · B) >> 16) · IPOLY) >> 15
The process uses a 16 bit by 16 bit carryless multiply, producing a 31 bit product.
POLY = 0x11021u ; CRC polynomial (17 bit)
IPOLY = 0x08898u ; 2^31 / POLY
; generated by external software
MpyModPoly(A, B)
{
MP = A · B ; MP = A · B
MT = MP >> 16 ; MT = MP >> 16
MT = MT · IPOLY ; MT = MT · IPOLY
MT = MT >> 15 ; MT = (A · B) / POLY
MT = MT · POLY ; MT = ((A · B) / POLY) * POLY
return MP xor MT ; (A·B) ^ (((A · B) / POLY) · POLY)
}
A hardware based carryless multiply would look something like this 4 bit · 4 bit example.
p[] = [a3 a2 a1 a0] · [b3 b2 b1 b0]
p[] is a 7 bit product generated with 7 parallel circuits.
The time for multiply would be worst case propagation time for p3.
p6 = a3&b3
p5 = a3&b2 ^ a2&b3
p4 = a3&b1 ^ a2&b2 ^ a1&b3
p3 = a3&b0 ^ a2&b1 ^ a1&b2 ^ a0&b3
p2 = a2&b0 ^ a1&b1 ^ a0&b2
p1 = a1&b0 ^ a0&b1
p0 = a0&b0
If the xor gates available only have 2 bit inputs, the logic can
be split up. For example:
p3 = (a3&b0 ^ a2&b1) ^ (a1&b2 ^ a0&b3)
I don't know if your VHDL toolset includes a library for carryless multiply. For a 16 bit by 16 bit multiply resulting in a 31 bit product (p30 to p00), p15 has 16 outputs from the 16 ands (in parallel), which could be xor'ed using a tree like structure, 8 xors in parallel feeding into 4 xors in parallel feeding into 2 xor's in parallel into a single xor. So the propagation time would be 1 and and 4 xor propagation times.
Upvotes: 1
Reputation: 112209
Here is an example in C that you can adapt. Since you mentioned VHDL, this is a bit-wise implementation suitable for casting into gates and flip-flops. However, if cycles are more precious to you than memory and gates, then there is also a byte-wise table-driven version that would run in 1/8 the number of cycles.
What this does is the inverse of what is done in a normal CRC calculation. It then applies the same size input in zeros with a normal CRC to get what the normal CRC would have been on that input. Running the zeros through takes the same number of cycles as the inverse CRC, i.e. O(n) where n is the size of the input. If that latency is too large, that can be reduced to O(log n) cycles, with some investment in gates.
#include <stddef.h>
// Update crc with the CRC-16/XMODEM of n zero bytes. (This can be done in
// O(log n) time or cycles instead of O(n), with a little more effort.)
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
for (size_t i = 0; i < n; i++)
for (int k = 0; k < 8; k++)
crc = crc & 0x8000 ? (crc << 1) ^ 0x1021 : crc << 1;
return crc & 0xffff;
}
// Update crc with the CRC-16/XMODEM of the len bytes at mem in reverse. If mem
// is NULL, then return the initial value for the CRC. When done,
// crc16x_zeros_bit() must be used to apply the total length of zero bytes, in
// order to get what the CRC would have been if it were calculated on the bytes
// fed in the opposite order.
static unsigned crc16x_inverse_bit(unsigned crc, void const *mem, size_t len) {
unsigned char const *data = mem;
if (data == NULL)
return 0;
crc &= 0xffff;
for (size_t i = 0; i < len; i++) {
for (int k = 0; k < 8; k++)
crc = crc & 1 ? (crc >> 1) ^ 0x8810 : crc >> 1;
crc ^= (unsigned)data[i] << 8;
}
return crc;
}
#include <stdio.h>
int main(void) {
// Do framed example.
unsigned crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21", 2);
crc = crc16x_zeros_bit(crc, 9);
printf("%04x\n", crc);
// Do another one.
crc = crc16x_inverse_bit(0, NULL, 0);
crc = crc16x_inverse_bit(crc, (void const *)"9876543", 7);
crc = crc16x_inverse_bit(crc, (void const *)"21!\x83\x04\xfc\x1c", 7);
crc = crc16x_inverse_bit(crc, (void const *)"\x9a" "fvC", 4);
crc = crc16x_zeros_bit(crc, 18);
printf("%04x\n", crc);
return 0;
}
Here is the O(log n) version of crc16x_zeros_bit()
:
// Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC
// polynomial. For speed, a cannot be zero.
static inline unsigned multmodp(unsigned a, unsigned b) {
unsigned p = 0;
for (;;) {
if (a & 1) {
p ^= b;
if (a == 1)
break;
}
a >>= 1;
b = b & 0x8000 ? (b << 1) ^ 0x1021 : b << 1;
}
return p & 0xffff;
}
// Return x^(8n) modulo p(x).
static unsigned x2nmodp(size_t n) {
unsigned p = 1; // x^0 == 1
unsigned q = 0x10; // x^2^2
while (n) {
q = multmodp(q, q); // x^2^k mod p(x), k = 3,4,...
if (n & 1)
p = multmodp(q, p);
n >>= 1;
}
return p;
}
// Update crc with the CRC-16/XMODEM of n zero bytes.
static unsigned crc16x_zeros_bit(unsigned crc, size_t n) {
return multmodp(x2nmodp(n), crc);
}
Upvotes: 1