Reputation: 167
I have a piece of code that sometimes takes 0.1 ms but sometimes takes 10 ms. Could anybody give me some advice on it?
// Fragment from the body of an enclosing loop that is not shown (see the
// `continue` statements below). It inspects 32 rows of 32 consecutive
// elements each, starting at dataOff, with consecutive rows `stp` elements
// apart.
// NOTE(review): assumes dataOff/data are byte pointers, so each row is 32
// bytes and the whole tile is 32x32 = 1024 bytes -- confirm against the
// declarations, which are outside this snippet.
// NOTE(review): maxValue/minValue (uint8x16_t) must already hold suitable
// starting values (presumably all-0 and all-255) -- their initialisation is
// not visible here.
//
// Pass 1: per-lane running maximum and minimum over the whole tile.
for (uint32_t k = 0; k < 32; k++, dataOff += stp)
{
// Two 16-lane loads cover one 32-element row.
uint8x16_t d0 = vld1q_u8((const uint8_t *)dataOff);
uint8x16_t d1 = vld1q_u8((const uint8_t *)(dataOff + 16));
maxValue = vmaxq_u8(maxValue, vmaxq_u8(d0, d1));
minValue = vminq_u8(minValue, vminq_u8(d0, d1));
}
// Horizontal reduction, 16 lanes -> 8 lanes: combine the high and low halves.
maxValue1 = vmax_u8(vget_high_u8(maxValue), vget_low_u8(maxValue));
minValue1 = vmin_u8(vget_high_u8(minValue), vget_low_u8(minValue));
// Fold with the vector rotated by 4 lanes: lanes 0..3 now cover all 8 lanes.
maxValue1 = vmax_u8(maxValue1, vext_u8(maxValue1, maxValue1, 4));
minValue1 = vmin_u8(minValue1, vext_u8(minValue1, minValue1, 4));
// Fold with the vector rotated by 2 lanes: lanes 0..1 now cover all 8 lanes.
maxValue1 = vmax_u8(maxValue1, vext_u8(maxValue1, maxValue1, 2));
minValue1 = vmin_u8(minValue1, vext_u8(minValue1, minValue1, 2));
// Final step in scalar code: combine lanes 0 and 1 to get the tile-wide
// maximum and minimum. __max/__min are not standard C; they are presumably
// the MSVC-style macros or project-provided equivalents -- confirm.
maxValueUchar = __max(vget_lane_u8(maxValue1, 0), vget_lane_u8(maxValue1, 1));
minValueUchar = __min(vget_lane_u8(minValue1, 0), vget_lane_u8(minValue1, 1));
// Early out: skip this tile (continue the enclosing loop) when its value
// range is below hist_th_grid, or when even its brightest element is below
// hist_th_grid. The "64" in the original comment is presumably the value of
// hist_th_grid -- confirm.
if (maxValueUchar - minValueUchar < hist_th_grid)continue;//bright delta < 64, NO
else if (maxValueUchar < hist_th_grid)continue;//all dark, NO
// Pass 2: rewind to `data` and walk the rows again to sum every element.
// NOTE(review): presumably `data` is where dataOff pointed before pass 1.
dataOff = data;
uint16x8_t sum = vdupq_n_u16(0);
uint32x4_t sum32;
uint8_t sumInt;
for (uint32_t k = 0; k < 32; k++, dataOff += stp)
{
uint8x16_t d0 = vld1q_u8((const uint8_t *)dataOff);
uint8x16_t d1 = vld1q_u8((const uint8_t *)(dataOff + 16));
// vpaddlq_u8 adds adjacent byte pairs into 16-bit lanes. Each 16-bit lane
// gains at most 4 * 255 = 1020 per row, i.e. at most 32640 over 32 rows,
// so the 16-bit accumulators cannot overflow.
sum = vaddq_u16(sum, vaddq_u16(vpaddlq_u8(d0), vpaddlq_u8(d1)));
}
// Widen to four 32-bit partial sums, then fold them so lane 0 holds the total.
sum32 = vpaddlq_u16(sum);
sum32 = vaddq_u32(sum32, vextq_u32(sum32, sum32, 2));
sum32 = vaddq_u32(sum32, vextq_u32(sum32, sum32, 1));
// >> 10 divides the total by 1024 (the 32x32 element count), giving the
// tile mean; brt_th_grid is added and the result is clamped to 255 before
// being stored in a uint8_t.
sumInt = __min((vgetq_lane_u32(sum32, 0) >> 10) + brt_th_grid,255u);
What's more, I found that this code runs at a more stable speed on the Samsung Galaxy S6 than on the Samsung Galaxy S7. Could anybody tell me why?
Upvotes: 0
Views: 101
Reputation: 12149
All modern smartphones use dynamic frequency scaling to balance CPU and memory system performance against battery life, and most high-end devices (including those built on Samsung chipsets) also use heterogeneous SMP with two different CPU designs (ARM "big.LITTLE").
If you are running short code snippets there will be a large difference in performance between "big @ max frequency" and "LITTLE @ min frequency"; especially because the NEON performance on LITTLE cores is designed for efficiency rather than max performance so will be measurably slower than the big cores.
In general what this means is that when benchmarking you ideally need a steady-state sustained workload for a few seconds to warm up the device in order to let the frequency and CPU selection stabilize before you run the code you actually want to measure.
Upvotes: 3