When testing what TI’s terrible C compiler for PRU makes of various multiplication use-cases I noticed a little discrepancy between unsigned and signed 32x32→64-bit multiply:
#include <stdint.h>
typedef uint32_t u32;
typedef uint64_t u64;
typedef int32_t s32;
typedef int64_t s64;
// 6 cycles — unsigned 32x32→32-bit multiply
u32 mul( u32 x, u32 y ) { return x * y; }
// 6 cycles — unsigned 32-bit multiply-accumulate
u32 mla( u32 add, u32 x, u32 y ) { return add + x * y; }
// 7 cycles — unsigned 32x32→64-bit multiply
u64 umull( u32 x, u32 y ) { return (u64)x * (u64)y; }
// 7 cycles — unsigned 32x32→64-bit multiply-accumulate
u64 umlal( u64 add, u32 x, u32 y ) { return add + (u64)x * (u64)y; }
// 459 - 1265 cycles (operand-dependent) — signed 32x32→64-bit multiply
s64 smull( s32 x, s32 y ) { return (s64)x * (s64)y; }
// 464 - 1270 cycles (operand-dependent) — signed 32x32→64-bit multiply-accumulate
s64 smlal( s64 add, s32 x, s32 y ) { return add + (s64)x * (s64)y; }
(All timings with -v3 -O4 --hardware_mac=on using the ARM version of clpru 2.3.3)
Now, to be clear PRU has no native support for signed integers and thus somewhat reduced performance is expected, and I’m used to the C compiler (clpru) doing really dumb things, but this is a new record.
What’s happening here is a combination of many facepalms:
- While the compiler directly generates instructions for unsigned 32x32→64-bit multiplication, it apparently has no such ability when one or both operands are signed, even though handling those cases would require only 2 additional instructions per signed operand. Instead it sign-extends the operands to 64-bit (using a braindead 5-instruction sequence each) and then calls a library function for 64x64→64-bit multiplication (
__pruabi_mpyll), which in general is a much more expensive operation than the 32x32→64-bit multiplication that was needed. - This library function is not PRU-specific but a generic implementation of 64-bit multiplication using 32-bit multiplication. Since it doesn’t take advantage of PRU’s ability to do 32x32→64-bit multiplication, it has to split each operand into four 16-bit pieces and do ten 16x16→32-bit multiplications. Also, for some reason the compiler feels the need to use a call to the
memset library function to zeroize an 8-byte uint16_t[4] local variable. - Now the cherry on top: the runtime library (rtspruv3_le.lib) containing all of these functions is compiled without hardware multiplier support enabled, which means each of those 32-bit multiplications becomes a call to another library function (
__pruabi_mpyi) which implements it by repeated shifting and adding.
So, how to fix this:
Recompiling the runtime library with hardware multiplier support using the provided mklib tool (in pru-cgt’s lib directory):
~/.../ti-cgt-pru_2.3.3/lib$ ./mklib --index=libc.a --pattern=rtspruv3_le.lib --extra_options="--hardware_mac=on" --log=build.log --install_to=.
reduces the cost of __pruabi_mpyll from 445-1245 cycles (operand-dependent) to fixed 397 cycles, which is better but still terrible.
So next we just replace this function by one that’s implemented using three 32x32→64-bit multiplications:
#ifdef __cplusplus
extern "C"
#endif
// Drop-in replacement for the runtime library's 64x64→64-bit multiply.
// Computes the low 64 bits of the full 128-bit product using three
// 32x32→64-bit multiplies:
//   x*y mod 2^64 = x_lo*y_lo + ((x_lo*y_hi + x_hi*y_lo) << 32)
u64 __pruabi_mpyll( u64 x, u64 y ) {
// Cross terms: only their low 32 bits survive the final << 32,
// so plain 32-bit multiplies and adds suffice here.
u32 tmp = (u32)x * (u32)(y >> 32) + (u32)(x >> 32) * (u32)y;
// Full 64-bit low*low product.
x = (u64)(u32)x * (u64)(u32)y;
x += (u64)tmp << 32;
return x;
}
The code clpru generates for this is pretty dumb, but nevertheless the impact is huge: we’re now down to 17 cycles!
Manually implementing it in assembly further reduces it to 13 cycles:
; u64 __pruabi_mpyll( u64 x, u64 y ) — low 64 bits of x * y
; In:  r14:r15 = x, r16:r17 = y      Out: r14:r15 = x * y (mod 2^64)
; Uses the PRU MAC: operands go in r28/r29, the product is read back
; from r26:r27 via xin.  Only the low word (xin ..., 4) of the two
; cross products is needed; the low*low product is read in full
; (xin ..., 8).
 .sect ".text:__pruabi_mpyll"
 .clink
 .global __pruabi_mpyll
__pruabi_mpyll:
 mov r28, r15 ; ( x_hi )
 mov r29, r16 ; ( y_lo )
 nop ; NOTE(review): presumably covers MAC result latency — confirm
 xin 0, &r26, 4 ; ( x_hi * y_lo ), low word only
 mov r28, r14 ; ( x_lo )
 mov r15, r26 ; result_hi = x_hi * y_lo
 xin 0, &r26, 8 ; ( x_lo * y_lo ), full 64-bit product
 mov r29, r17 ; ( y_hi )
 mov r14, r26 ; result_lo = lsw( x_lo * y_lo )
 add r15, r15, r27 ; result_hi += msw( x_lo * y_lo )
 xin 0, &r26, 4 ; ( x_lo * y_hi ), low word only
 add r15, r15, r26 ; result_hi += x_lo * y_hi
 jmp r3.w2 ; return (r3.w2 holds the return address)
It should be noted that a 64-bit multiply-accumulate can be implemented in the same number of cycles by changing two moves to additions:
; u64 mla64( u64 add, u64 x, u64 y ) { return add + x * y; }
; In:  r14:r15 = add, r16:r17 = x, r18:r19 = y   Out: r14:r15
; Same schedule as __pruabi_mpyll, but the two result moves become
; additions into the accumulator, so accumulation costs no extra cycles.
 .sect ".text:mla64"
 .clink
 .global mla64
mla64:
 mov r28, r17 ; ( x_hi )
 mov r29, r18 ; ( y_lo )
 nop ; NOTE(review): presumably covers MAC result latency — confirm
 xin 0, &r26, 4 ; ( x_hi * y_lo ), low word only
 mov r28, r16 ; ( x_lo )
 add r15, r15, r26 ; add_hi += x_hi * y_lo
 xin 0, &r26, 8 ; ( x_lo * y_lo ), full 64-bit product
 mov r29, r19 ; ( y_hi )
 add r14, r14, r26 ; result = add + x_lo * y_lo (part 1: low word)
 adc r15, r15, r27 ; result = add + x_lo * y_lo (part 2: high word + carry)
 xin 0, &r26, 4 ; ( x_lo * y_hi ), low word only
 add r15, r15, r26 ; result_hi += x_lo * y_hi
 jmp r3.w2 ; return
Finally, going back to the signed 32x32→64-bit multiplication that started all this, the first try to make it in C using unsigned multiplication + fixup:
// Signed 32x32→64-bit multiply via unsigned multiply + sign fixup:
//   (s64)x*(s64)y == (u64)(u32)x*(u64)(u32)y
//                    - ((x<0 ? (u64)(u32)y : 0) << 32)
//                    - ((y<0 ? (u64)(u32)x : 0) << 32)   (mod 2^64)
s64 smull( s32 x, s32 y ) {
u64 z = (u64)(u32)x * (u64)(u32)y;
if( x < 0 )
z -= (u64)(u32)y << 32;
if( y < 0 )
z -= (u64)(u32)x << 32;
return z;
}
disappointingly takes 12-18 cycles due to bafflingly terrible code output for the sign-tests.
Changing the sign-tests to equivalent bit-tests reduces the cost to 9-15 cycles:
// Same fixup as above, but with the sign tests written as explicit
// tests of bit 31 — semantically identical for 32-bit two's-complement
// values, yet clpru generates cheaper code for this form.
s64 smull( s32 x, s32 y ) {
u64 z = (u64)(u32)x * (u64)(u32)y;
if( x & 0x80000000 )
z -= (u64)(u32)y << 32;
if( y & 0x80000000 )
z -= (u64)(u32)x << 32;
return z;
}
This branch-free version is 14 cycles:
// Branch-free variant of the same fixup.
s64 smull( s32 x, s32 y ) {
// -((u32)v >> 31) is an all-ones mask when v is negative, zero
// otherwise, so t = (y<0 ? x : 0) + (x<0 ? y : 0).
u32 t = x & -((u32)y >> 31);
t += y & -((u32)x >> 31);
u64 z = (u64)(u32)x * (u64)(u32)y;
// Subtract the combined fixup from the high word.
z -= (u64)t << 32;
return z;
}
Finally, manually writing it in assembly reduces the cost to 8-10 cycles:
; s64 smull( s32 x, s32 y ) — signed 32x32→64-bit multiply
; In:  r14 = x, r15 = y      Out: r14:r15 = (s64)x * (s64)y
; Identity used (mod 2^64):
;   (s64)x*(s64)y = (u64)(u32)x*(u64)(u32)y
;                   - ((x<0 ? y : 0) << 32) - ((y<0 ? x : 0) << 32)
 .sect ".text:smull"
 .clink
 .global smull
smull:
 mov r28, r14 ; MAC operand = x
 mov r29, r15 ; MAC operand = y
 qbbs $1, r28, 31 ; x < 0? keep r15 = y as the high-word fixup term
 ldi r15, 0 ; x >= 0: no fixup for this operand
$1:
 xin 0, &r26, 8 ; r26:r27 = (u64)(u32)x * (u64)(u32)y
 qbbc $2, r29, 31 ; y >= 0? skip fixup
 sub r27, r27, r28 ; y < 0: product_hi -= x
$2:
 mov r14, r26 ; result_lo = product_lo
 sub r15, r27, r15 ; result_hi = product_hi - (x<0 ? y : 0)
 jmp r3.w2 ; return
and once again multiply-accumulate can be done in the same number of cycles:
; s64 smlal( s64 add, s32 x, s32 y ) — add + (s64)x * (s64)y
; In:  r14:r15 = add, r16 = x, r17 = y   Out: r14:r15
; Same sign fixup as smull, but both corrections are folded into the
; accumulator's high word before the unsigned product arrives.
 .sect ".text:smlal"
 .clink
 .global smlal
smlal:
 mov r28, r16 ; MAC operand = x
 mov r29, r17 ; MAC operand = y
 qbbc $1, r28, 31 ; x >= 0? skip fixup
 sub r15, r15, r29 ; x < 0: add_hi -= y
$1:
 qbbc $2, r29, 31 ; y >= 0? skip fixup
 sub r15, r15, r28 ; y < 0: add_hi -= x
$2:
 xin 0, &r26, 8 ; r26:r27 = (u64)(u32)x * (u64)(u32)y
 add r14, r14, r26 ; 64-bit add of product into accumulator...
 adc r15, r15, r27 ; ...with carry into the high word
 jmp r3.w2 ; return
And as a final fun note, somehow the x86-64 linux version of clpru 2.3.3 generates worse output than the arm-linux version of the exact same compiler version. ¯\_(ツ)_/¯