E.3.2. 乘法

以下内在函数提供包含乘法的运算。

向量乘法:vmul -> Vr[i] := Va[i] * Vb[i]


int8x8_t    vmul_s8(int8x8_t a, int8x8_t b);         // VMUL.I8 d0,d0,d0 

int16x4_t   vmul_s16(int16x4_t a, int16x4_t b);      // VMUL.I16 d0,d0,d0

int32x2_t   vmul_s32(int32x2_t a, int32x2_t b);      // VMUL.I32 d0,d0,d0

float32x2_t vmul_f32(float32x2_t a, float32x2_t b);  // VMUL.F32 d0,d0,d0

uint8x8_t   vmul_u8(uint8x8_t a, uint8x8_t b);       // VMUL.I8 d0,d0,d0 

uint16x4_t  vmul_u16(uint16x4_t a, uint16x4_t b);    // VMUL.I16 d0,d0,d0

uint32x2_t  vmul_u32(uint32x2_t a, uint32x2_t b);    // VMUL.I32 d0,d0,d0

poly8x8_t   vmul_p8(poly8x8_t a, poly8x8_t b);       // VMUL.P8 d0,d0,d0 

int8x16_t   vmulq_s8(int8x16_t a, int8x16_t b);      // VMUL.I8 q0,q0,q0 

int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);     // VMUL.I16 q0,q0,q0

int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);     // VMUL.I32 q0,q0,q0

float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0

uint8x16_t  vmulq_u8(uint8x16_t a, uint8x16_t b);    // VMUL.I8 q0,q0,q0 

uint16x8_t  vmulq_u16(uint16x8_t a, uint16x8_t b);   // VMUL.I16 q0,q0,q0

uint32x4_t  vmulq_u32(uint32x4_t a, uint32x4_t b);   // VMUL.I32 q0,q0,q0

poly8x16_t  vmulq_p8(poly8x16_t a, poly8x16_t b);    // VMUL.P8 q0,q0,q0 

向量乘加:vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]


int8x8_t    vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c);            // VMLA.I8 d0,d0,d0 

int16x4_t   vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c);        // VMLA.I16 d0,d0,d0

int32x2_t   vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c);        // VMLA.I32 d0,d0,d0

float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c);  // VMLA.F32 d0,d0,d0

uint8x8_t   vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);         // VMLA.I8 d0,d0,d0 

uint16x4_t  vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c);     // VMLA.I16 d0,d0,d0

uint32x2_t  vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c);     // VMLA.I32 d0,d0,d0

int8x16_t   vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);        // VMLA.I8 q0,q0,q0 

int16x8_t   vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);       // VMLA.I16 q0,q0,q0

int32x4_t   vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);       // VMLA.I32 q0,q0,q0

float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0

uint8x16_t  vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);     // VMLA.I8 q0,q0,q0 

uint16x8_t  vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);    // VMLA.I16 q0,q0,q0

uint32x4_t  vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);    // VMLA.I32 q0,q0,q0

向量长型乘加:vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]


int16x8_t  vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLAL.S8 q0,d0,d0 

int32x4_t  vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLAL.S16 q0,d0,d0

int64x2_t  vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLAL.S32 q0,d0,d0

uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLAL.U8 q0,d0,d0 

uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0

uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0

向量乘减:vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]


int8x8_t    vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c);            // VMLS.I8 d0,d0,d0 

int16x4_t   vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c);        // VMLS.I16 d0,d0,d0

int32x2_t   vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c);        // VMLS.I32 d0,d0,d0

float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c);  // VMLS.F32 d0,d0,d0

uint8x8_t   vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);         // VMLS.I8 d0,d0,d0 

uint16x4_t  vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c);     // VMLS.I16 d0,d0,d0

uint32x2_t  vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c);     // VMLS.I32 d0,d0,d0

int8x16_t   vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);        // VMLS.I8 q0,q0,q0 

int16x8_t   vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);       // VMLS.I16 q0,q0,q0

int32x4_t   vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);       // VMLS.I32 q0,q0,q0

float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0

uint8x16_t  vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);     // VMLS.I8 q0,q0,q0 

uint16x8_t  vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);    // VMLS.I16 q0,q0,q0

uint32x4_t  vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);    // VMLS.I32 q0,q0,q0

向量长型乘减:vmlsl -> Vr[i] := Va[i] - Vb[i] * Vc[i]


int16x8_t  vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLSL.S8 q0,d0,d0 

int32x4_t  vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLSL.S16 q0,d0,d0

int64x2_t  vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLSL.S32 q0,d0,d0

uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLSL.U8 q0,d0,d0 

uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0

uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0

向量高位饱和加倍乘法


int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b);  // VQDMULH.S16 d0,d0,d0

int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b);  // VQDMULH.S32 d0,d0,d0

int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0

int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0

向量高位饱和舍入加倍乘法


int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b);  // VQRDMULH.S16 d0,d0,d0

int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b);  // VQRDMULH.S32 d0,d0,d0

int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0

int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0

向量长型饱和加倍乘加


int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0

int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0

向量长型饱和加倍乘减


int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0

int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0

向量长型乘法


int16x8_t  vmull_s8(int8x8_t a, int8x8_t b);      // VMULL.S8 q0,d0,d0 

int32x4_t  vmull_s16(int16x4_t a, int16x4_t b);   // VMULL.S16 q0,d0,d0

int64x2_t  vmull_s32(int32x2_t a, int32x2_t b);   // VMULL.S32 q0,d0,d0

uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b);    // VMULL.U8 q0,d0,d0 

uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0

uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0

poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b);    // VMULL.P8 q0,d0,d0 

向量长型饱和加倍乘法


int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0

int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0

Copyright © 2007 ARM Limited. All rights reserved. ARM DUI 0348AC
Non-Confidential