C语言版本的双线性插值实现:
/**
 * Scalar bilinear interpolation between four corner samples.
 *
 * The value is first interpolated along y inside each column pair
 * (v11 -> v12 and v21 -> v22) and the two results are then interpolated
 * along x.
 *
 * @param x   horizontal weight; 0 selects the v11/v12 side, 1 the v21/v22 side
 * @param y   vertical weight; 0 selects v11/v21, 1 selects v12/v22
 * @param v11 sample at (x = 0, y = 0)
 * @param v12 sample at (x = 0, y = 1)
 * @param v21 sample at (x = 1, y = 0)
 * @param v22 sample at (x = 1, y = 1)
 * @return the interpolated value
 *
 * Declared `static inline` so the definition links in C as well as C++
 * (a plain C `inline` definition does not emit an external symbol).
 */
static inline double bilinear_interp(double x, double y, double v11, double v12,
                                     double v21, double v22)
{
    return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x;
}
用ARM NEON优化的双线性插值版本:
inline uint 8x8_ tbilinear _ interp _ neon (doublex,double y,uint8x8_t v11,uint8x8_t v12,
uint8x8_t v21,uint8x8_t v22 )
{
uint16x8_tV11_16=vmovl_U8(V11 );
uint16x8_tv12_16=vmovl_u8(v12;
uint16x8_tV21_16=vmovl_U8(V21;
uint16x8_tv22_16=vmovl_u8(v22;
///convert v11 to two float32x4
uint 16x4_ TV _ 16 _ low=vget _ low _ u16 (v11 _ 16 );
uint 16x4_ TV _ 16 _ high=vget _ high _ u16 (v11 _ 16 );
uint 32 x4 _ TV _ 32 _ low=v movl _ u16 (v _ 16 _ low );
uint 32 x4 _ TV _ 32 _ high=v movl _ u16 (v _ 16 _ high );
float 32 x4 _ tv11 _ 32f _ low=vcv TQ _ f32 _ u32 (v _ 32 _ low );
float 32 x4 _ tv11 _ 32f _ high=vcv TQ _ f32 _ u32 (v _ 32 _ high );
//v12
v_16_low=vget_low_u16(v12_16 );
v_16_high=vget_high_u16(v12_16 );
v_32_low=vmovl_u16(v_16_low );
v_32_high=vmovl_u16(v_16_high );
float 32 x4 _ tv12 _ 32f _ low=vcv TQ _ f32 _ u32 (v _ 32 _ low );
float 32 x4 _ tv12 _ 32f _ high=vcv TQ _ f32 _ u32 (v _ 32 _ high );
//v21
v_16_low=vget_low_u16(V21_16 );
v_16_high=vget_high_u16(V21_16 );
v_32_low=vmovl_u16(v_16_low );
v_32_high=vmovl_u16(v_16_high );
float 32 x4 _ tv21 _ 32f _ low=vcv TQ _ f32 _ u32 (v _ 32 _ low );
float 32 x4 _ tv21 _ 32f _ high=vcv TQ _ f32 _ u32 (v _ 32 _ high );
//v22
v_16_low=vget_low_u16(v22_16 );
v_16_high=vget_high_u16(v22_16 );
v_32_low=vmovl_u16(v_16_low );
v_32_high=vmovl_u16(v_16_high );
float 32 x4 _ tv22 _ 32f _ low=vcv TQ _ f32 _ u32 (v _ 32 _ low );
float 32 x4 _ tv22 _ 32f _ high=vcv TQ _ f32 _ u32 (v _ 32 _ high );
float32_t fx=(float32_t ) x;
float32_t fy=(float32_t ) y;
float32_t one_fx=1-fx;
float32_t one_fy=1-fy;
float32x4_t tmp1、tmp2、tmp3、tmp4、tmp5、tmp;
uint32x4_t result_32_low,result_32_high;
uint16x4_t result_16_low,result_16_high;
//for low 32x4
tmp1=vmulq_n_f32(V11_32f_low,one_fy );
tmp2=vmulq_n_f32(v12_32f_low,fy );
TMP3=VADdq_F32(tmp1,tmp2);
tMP4=vmulq_n_F32(tMP3,one_fx );
tmp1=vmulq_n_f3
2(v21_32f_low, one_fy);tmp2 = vmulq_n_f32(v22_32f_low, fy);
tmp3 = vaddq_f32(tmp1, tmp2);
tmp5 = vmulq_n_f32(tmp3, fx);
tmp = vaddq_f32(tmp4, tmp5);
result_32_low = vcvtq_u32_f32(tmp);
result_16_low = vqmovn_u32(result_32_low);
//for high 32x4
tmp1 = vmulq_n_f32(v11_32f_high, one_fy);
tmp2 = vmulq_n_f32(v12_32f_high, fy);
tmp3 = vaddq_f32(tmp1, tmp2);
tmp4 = vmulq_n_f32(tmp3, one_fx);
tmp1 = vmulq_n_f32(v21_32f_high, one_fy);
tmp2 = vmulq_n_f32(v22_32f_high, fy);
tmp3 = vaddq_f32(tmp1, tmp2);
tmp5 = vmulq_n_f32(tmp3, fx);
tmp = vaddq_f32(tmp4, tmp5);
result_32_high = vcvtq_u32_f32(tmp);
result_16_high = vqmovn_u32(result_32_high);
uint16x8_t result_16 = vcombine_u16(result_16_low,result_16_high);
uint8x8_t result_8 = vqmovn_u16(result_16);
return result_8;
}
使用ARM NEON之后,一次可以处理8个像素,成倍提高了运行的速度。实践中需要特别注意对边界的处理(行的开始和结尾处)。
到这里我们还不能满足。要追求更快!!!
注意上面的代码虽然用了ARM NEON,但在NEON指令的运算中仍然用到了浮点操作。所以,还可以继续采用浮点数定点化的方式进行优化,优化后的代码如下:
inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11,uint8x8_t v12,
uint8x8_t v21,uint8x8_t v22)
{
uint16x8_t v11_16 = vmovl_u8(v11);
uint16x8_t v12_16 = vmovl_u8(v12);
uint16x8_t v21_16 = vmovl_u8(v21);
uint16x8_t v22_16 = vmovl_u8(v22);
uint16x4_t v_16_low = vget_low_u16(v11_16);
uint16x4_t v_16_high = vget_high_u16(v11_16);
uint32x4_t v11_32_low = vmovl_u16(v_16_low);
uint32x4_t v11_32_high = vmovl_u16(v_16_high);
v_16_low = vget_low_u16(v12_16);
v_16_high = vget_high_u16(v12_16);
uint32x4_t v12_32_low = vmovl_u16(v_16_low);
uint32x4_t v12_32_high = vmovl_u16(v_16_high);
v_16_low = vget_low_u16(v21_16);
v_16_high = vget_high_u16(v21_16);
uint32x4_t v21_32_low = vmovl_u16(v_16_low);
uint32x4_t v21_32_high = vmovl_u16(v_16_high);
v_16_low = vget_low_u16(v22_16);
v_16_high = vget_high_u16(v22_16);
uint32x4_t v22_32_low = vmovl_u16(v_16_low);
uint32x4_t v22_32_high = vmovl_u16(v_16_high);
unsigned int intX = x*4096;
unsigned int intY = y*4096;
unsigned int one_x = 4096-intX;
unsigned int one_y = 4096-intY;
uint32_t intX_32 = (uint32_t) intX;
uint32_t intY_32 = (uint32_t) intY;
uint32_t oneX_32 = (uint32_t) one_x;
uint32_t oneY_32 = (uint32_t) one_y;
uint32x4_t tmp1,tmp2,tmp3,tmp4,tmp5,tmp;
uint16x4_t result_16_low, result_16_high;
//for low 4 numbers
tmp1 = vmulq_n_u32(v11_32_low,oneY_32);
tmp2 = vmulq_n_u32(v12_32_low, intY_32);
tmp3 = vaddq_u32(tmp1, tmp2);
tmp4 = vmulq_n_u32(tmp3, oneX_32);
tmp1 = vmulq_n_u32(v21_32_low, oneY_32);
tmp2 = vmulq_n_u32(v22_32_low, intY_32);
tmp3 = vaddq_u32(tmp1, tmp2);
tmp5 = vmulq_n_u32(tmp3, intX_32);
tmp = vaddq_u32(tmp4, tmp5);
result_16_low = vshrn_n_u32(tmp,16); //shift right 16 bytes
result_16_low = vrshr_n_u16(result_16_low,8); //shift right 8 bytes, totally 24 bytes
//for high 4 numbers
tmp1 = vmulq_n_u32(v11_32_high,oneY_32);
tmp2 = vmulq_n_u32(v12_32_high, intY_32);
tmp3 = vaddq_u32(tmp1, tmp2);
tmp4 = vmulq_n_u32(tmp3, oneX_32);
tmp1 = vmulq_n_u32(v21_32_high, oneY_32);
tmp2 = vmulq_n_u32(v22_32_high, intY_32);
tmp3 = vaddq_u32(tmp1, tmp2);
tmp5 = vmulq_n_u32(tmp3, intX_32);
tmp = vaddq_u32(tmp4, tmp5);
result_16_high = vshrn_n_u32(tmp,16); //shift right 16 bytes
result_16_high = vrshr_n_u16(result_16_high,8); //shift right 8 bytes, totally 24 bytes
uint16x8_t result_16 = vcombine_u16(result_16_low,result_16_high);
uint8x8_t result_8 = vqmovn_u16(result_16);
return result_8;
}
加入浮点数定点化的优化之后,运行速度能进一步提升一倍左右。