Reputation: 59
I'm implementing Poly1305 on an ARM Cortex-M4 microcontroller, and I initially used a radix 2828 representation. I modified the implementation to use radix 226226 while keeping the same principles. However, after the change, my implementation fails test cases and does not produce the correct authentication tag. My Approach
The accumulator (h) is stored as five 26-bit integers.
I use carry propagation after additions and multiplications to keep values within 26-bit constraints.
The multiplication step (mulmod()) follows the standard modulo 2130−52130−5 reduction rules.
The finalization step (freeze()) ensures values remain within the correct range.
The Issue
The function completes execution, but the generated Poly1305 tag does not match expected outputs.
I suspect an issue with carry propagation, modular reduction, or key parsing, but I haven't pinpointed it.
I have checked bitwise operations, carry handling, and reductions, but something still seems off.
Here is my implementation:
#include "poly1305.h"
static void add(unsigned int h[5], const unsigned int c[5]) {
unsigned long long u = 0;
for (unsigned int j = 0; j < 5; ++j) {
u += (unsigned long long)h[j] + c[j];
h[j] = (unsigned int)(u & 0x3FFFFFF); // Mask for 26 bits
u >>= 26;
}
}
static void squeeze(unsigned int h[5]) {
unsigned long long u = 0;
for (unsigned int j = 0; j < 4; ++j) {
u += h[j];
h[j] = (unsigned int)(u & 0x3FFFFFF);
u >>= 26;
}
u += h[4];
h[4] = (unsigned int)(u & 0x3FFFFFF);
u >>= 26;
u *= 5;
for (unsigned int j = 0; j < 4; ++j) {
u += h[j];
h[j] = (unsigned int)(u & 0x3FFFFFF);
u >>= 26;
}
h[4] += (unsigned int)u;
}
static const unsigned int minusp[5] = {5, 0, 0, 0, 0x3FFFFFF};
static void freeze(unsigned int h[5]) {
unsigned int horig[5];
unsigned int j;
unsigned int negative;
for (j = 0; j < 5; ++j) horig[j] = h[j];
add(h, minusp);
negative = -(h[4] >> 25);
for (j = 0; j < 5; ++j) h[j] ^= negative & (horig[j] ^ h[j]);
}
static void mulmod(unsigned int h[5], const unsigned int r[5]) {
unsigned long long hr[5] = {0};
unsigned int i, j;
for (i = 0; i < 5; ++i) {
for (j = 0; j <= i; ++j) {
hr[i] += (unsigned long long)h[j] * r[i - j];
}
for (j = i + 1; j < 5; ++j) {
hr[i] += 5 * (unsigned long long)h[j] * r[i + 5 - j];
}
}
for (i = 0; i < 5; ++i) h[i] = (unsigned int)(hr[i] & 0x3FFFFFF);
for (i = 0; i < 4; ++i) h[i + 1] += (unsigned int)(hr[i] >> 26);
squeeze(h);
}
int crypto_onetimeauth_poly1305(unsigned char *out, const unsigned char *in, unsigned long long inlen, const unsigned char *k) {
unsigned int r[5], h[5] = {0}, c[5];
unsigned int j;
// Parse r from the key, ensuring the correct masking and shifting
r[0] = (k[0] | (k[1] << 8) | (k[2] << 16)) & 0x3FFFFFF;
r[1] = ((k[2] >> 2) | (k[3] << 6) | (k[4] << 14)) & 0x3FFFFFF;
r[2] = ((k[5] >> 4) | (k[6] << 4) | (k[7] << 12)) & 0x3FFFFFF;
r[3] = ((k[8] >> 6) | (k[9] << 2) | (k[10] << 10)) & 0x3FFFFFF;
r[4] = (k[11] | (k[12] << 8) | (k[13] << 16)) & 0x3FFFFFF;
// Process the input
while (inlen > 0) {
for (j = 0; j < 5; ++j) c[j] = 0;
for (j = 0; (j < 16) && (j < inlen); ++j) {
c[j / 4] |= (unsigned int)in[j] << (8 * (j % 4));
}
c[j / 4] |= 1 << ((j % 4) * 8);
in += j; inlen -= j;
add(h, c);
mulmod(h, r);
}
// Finalize
freeze(h);
// Add key
for (j = 0; j < 4; ++j) {
c[j] = k[16 + j];
}
c[4] = 0;
add(h, c);
// Write output tag
for (j = 0; j < 4; ++j) {
out[j * 4 + 0] = (unsigned char)(h[j] & 0xFF);
out[j * 4 + 1] = (unsigned char)((h[j] >> 8) & 0xFF);
out[j * 4 + 2] = (unsigned char)((h[j] >> 16) & 0xFF);
out[j * 4 + 3] = (unsigned char)((h[j] >> 24) & 0xFF);
}
out[15] = (unsigned char)(h[4] & 0x03);
return 0;
}
I suspect the issue might be in one of the following areas:
Carry propagation – The squeeze() function ensures values fit within 26 bits, but I may have missed an edge case.
Multiplication step (mulmod) – The handling of partial products and carries could be incorrect.
Finalization (freeze) – The conditional subtraction of p (modulo 2130−52130−5) might not be working correctly.
Key parsing – The way I extract r from the key and mask it might introduce issues.
Debugging Attempts
I tried printing intermediate values of h and r, but they don’t always match expected results.
The issue may lie in mulmod(), freeze(), or the final addition of the key.
Compared against reference implementations, but my version produces different results.
Questions
Could my carry handling be incorrect?
Is the modular reduction in squeeze() or freeze() wrong?
Are there known pitfalls when implementing Poly1305 on ARM Cortex-M4?
Any guidance on what might be wrong or how to debug this further would be greatly appreciated!
I modified the code from it's initial implementation which uses radix 2^8 and when I changed the radix to 2^26 using thesame principles, I could get a pass on the test
Upvotes: 1
Views: 48