I have rewritten the function like this:
for (i = (len >> 2); i != 0; i--)
{
hash = (hash << 5) ^ (hash >> 27) ^ (*(src++));
hash = (hash << 5) ^ (hash >> 27) ^ (*(src++));
hash = (hash << 5) ^ (hash >> 27) ^ (*(src++));
hash = (hash << 5) ^ (hash >> 27) ^ (*(src++));
}
for (i = (len & 3); i != 0; i--)
{
hash = (hash << 5) ^ (hash >> 27) ^ (*(src++));
}
and in assembly this way (for simplicity i assume that src has len >>
2 == 0):
uint32 hash(const uint16* src, int32 len) {
{
uint32 hash = 0;
asm volatile (
"vld1.32 {d0}, [%2] \n\t" //d0 = len
"vld1.32 {d1}, [%2] \n\t" //d1 = len
"1: \n\t"
"subs %2, %2, #4 \n\t"
"vld1.16 d2, [%1]! \n\t"
"vaddw.u32 q2, q2, d2 \n\t" //widening
"vshl.u32 d0, d0, #5 \n\t" //d0 = d0 << 5
"vshr.u32 d1, d1, #27 \n\t" //d1 = d1 >> 27
"veor d0, d0, d1 \n\t" //d0 = d0 ^ d1
"vdup.32 d6, d4[0] \n\t"
"veor d0, d0, d6 \n\t" //d0 = d0 ^ (*src++), d0 = hash
"vmov.i32 d1, d0 \n\t"
"vshl.u32 d0, d0, #5 \n\t"
"vshr.u32 d1, d1, #27 \n\t"
"veor d0, d0, d1 \n\t"
"vdup.32 d7, d4[1] \n\t"
"veor d0, d0, d7 \n\t"
"vmov.i32 d1, d0 \n\t"
"vshl.u32 d0, d0, #5 \n\t"
"vshr.u32 d1, d1, #27 \n\t"
"veor d0, d0, d1 \n\t"
"vdup.32 d8, d5[0] \n\t"
"veor d0, d0, d8 \n\t"
"vmov.i32 d1, d0 \n\t"
"vshl.u32 d0, d0, #5 \n\t"
"vshr.u32 d1, d1, #27 \n\t"
"veor d0, d0, d1 \n\t"
"vdup.32 d9, d5[2] \n\t"
"veor d0, d0, d9 \n\t"
"vmov.i32 d1, d0 \n\t"
"bgt 1b \n\t"
"vst1.32 d1, %0 \n\t"
: "=r"(hash), "+r"(str), "+r"(len)
:: "d0", "d1", "d2", "d3", "d4", "d5", "memory");
return hash;
}
This code gives me segmentation fault if i compile with GCC and run
under Linux. Why?
But problem remains, how could i read eficiently 4 bytes in a NEON
register as 4x32bit values?
BTW, it is my first neon code.
Modac.