Reputation: 949
The code for x86 does this (n can only be 1 through 4, unknown at compile time):
static const uint32_t wordmask[] = {
    0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
};

static inline uint32_t get_unaligned_le_x86(const void *p, uint32_t n) {
    uint32_t ret = *(const uint32_t *)p & wordmask[n];
    return ret;
}
For architectures that don't have unaligned 32-bit little-endian loads I have two variants:
static uint32_t get_unaligned_le_v1(const void *p, uint32_t n) {
    const uint8_t *b = (const uint8_t *)p;
    uint32_t ret;
    ret = b[0];
    if (n > 1) {
        ret |= b[1] << 8;
        if (n > 2) {
            ret |= b[2] << 16;
            if (n > 3) {
                ret |= b[3] << 24;
            }
        }
    }
    return ret;
}
static uint32_t get_unaligned_le_v2(const void *p, uint32_t n) {
    const uint8_t *b = (const uint8_t *)p;
    uint32_t ret = b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24);
    ret &= wordmask[n];
    return ret;
}
Which would be better on real hardware (I'm using qemu for development), and can you suggest a faster alternative? If it's much faster, I'm game for using assembly.
Upvotes: 0
Views: 517
Reputation: 8725
Conditional execution on the ARM is your best bet for improved performance. Table lookups (masks) will definitely be slower on ARM. Here is my ARMv5 implementation:
// When called from C, r0 = first parameter, r1 = second parameter
// r0-r3 and r12 can get trashed by C functions
unaligned_read:
    ldrb   r2,[r0],#1         ; byte 0 is always read (n=1..4)
    cmp    r1,#2
    ldrgeb r3,[r0],#1         ; byte 1, n >= 2
    ldrgtb r12,[r0],#1        ; byte 2, n > 2
    orrge  r2,r2,r3,LSL #8
    orrgt  r2,r2,r12,LSL #16
    cmp    r1,#4
    ldreqb r3,[r0],#1         ; byte 3, n == 4
    movne  r0,r2              ; recoup wasted cycle
    orreq  r0,r2,r3,LSL #24
    mov    pc,lr              ; or "bx lr" for thumb compatibility
Update: fixed ldreqb to be ldrgeb
Update 2: shaved off another cycle by inserting an instruction between last ldr/orr
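To call it from C, a declaration along these lines should work (a sketch on my part; the prototype assumes the standard ARM calling convention, with p in r0 and n in r1 as noted above):
#include <stdint.h>

/* Assumed prototype for the assembly routine above:
   p arrives in r0, n in r1, and the result comes back in r0. */
extern uint32_t unaligned_read(const void *p, uint32_t n);

uint32_t example(const uint8_t *buf) {
    return unaligned_read(buf, 3);  /* reads buf[0..2] as little-endian */
}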
Upvotes: 4
Reputation: 8411
You'll need to test those out. It's hard to say, as it depends not only on the processor architecture but also on the compiler, compilation flags, and target system.
Here's another idea/trick that can eliminate the branches and table lookups (untested code):
char mask1 = -(n>1); // 0 if n<=1, 0xFF otherwise
char mask2 = -(n>2);
char mask3 = -(n>3);
ret = b[0];
ret |= (b[1] & mask1) << 8;
ret |= (b[2] & mask2) << 16;
ret |= (b[3] & mask3) << 24;
Note that this, like your second function, can read past the end of your input, which may or may not be a problem.
I get this code generated, which doesn't look that bad (15 instructions, no branches, no table lookups):
    cmp   r1, #2
    ldrb  r2, [r0, #2]    @ zero_extendqisi2
    ldrb  r4, [r0, #1]    @ zero_extendqisi2
    movls r2, #0
    cmp   r1, #1
    ldrb  ip, [r0, #0]    @ zero_extendqisi2
    movls r4, #0
    mov   r3, r2, asl #16
    ldrb  r2, [r0, #3]    @ zero_extendqisi2
    cmp   r1, #3
    orr   r0, r3, r4, asl #8
    orr   r3, r0, ip
    movhi r1, r2
    movls r1, #0
    orr   r0, r3, r1, asl #24
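Wrapped up as a self-contained function (an untested sketch; the name is mine, and I've made the masks uint32_t so the shifts stay in unsigned arithmetic), the trick looks like this:
static uint32_t get_unaligned_le_masked(const void *p, uint32_t n) {
    const uint8_t *b = (const uint8_t *)p;
    uint32_t mask1 = -(uint32_t)(n > 1);  /* 0 if n <= 1, 0xFFFFFFFF otherwise */
    uint32_t mask2 = -(uint32_t)(n > 2);
    uint32_t mask3 = -(uint32_t)(n > 3);
    uint32_t ret = b[0];
    ret |= (b[1] & mask1) << 8;
    ret |= (b[2] & mask2) << 16;
    ret |= (b[3] & mask3) << 24;
    return ret;
}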
Another thing I would try is to rewrite your second function like this:
if (n > 1) {
    ret |= b[1] << 8;
}
if (n > 2) {
    ret |= b[2] << 16;
}
if (n > 3) {
    ret |= b[3] << 24;
}
That way there's a better chance the compiler will be able to use conditional execution, which is faster than conditional branches.
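Spelled out as a complete function (again just a sketch; the name and the added ret = b[0] line are mine), that would be:
static uint32_t get_unaligned_le_v3(const void *p, uint32_t n) {
    const uint8_t *b = (const uint8_t *)p;
    uint32_t ret = b[0];  /* n is always at least 1 */
    if (n > 1) {
        ret |= b[1] << 8;
    }
    if (n > 2) {
        ret |= b[2] << 16;
    }
    if (n > 3) {
        ret |= b[3] << 24;
    }
    return ret;
}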
If it's really critical that this runs as fast as it can I'd consider writing it in ARM assembly.
Upvotes: 0