E.3.2. Multiplication

These intrinsics provide operations including multiplication.

Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]

int8x8_t    vmul_s8(int8x8_t a, int8x8_t b);         // VMUL.I8 d0,d0,d0 
int16x4_t   vmul_s16(int16x4_t a, int16x4_t b);      // VMUL.I16 d0,d0,d0
int32x2_t   vmul_s32(int32x2_t a, int32x2_t b);      // VMUL.I32 d0,d0,d0
float32x2_t vmul_f32(float32x2_t a, float32x2_t b);  // VMUL.F32 d0,d0,d0
uint8x8_t   vmul_u8(uint8x8_t a, uint8x8_t b);       // VMUL.I8 d0,d0,d0 
uint16x4_t  vmul_u16(uint16x4_t a, uint16x4_t b);    // VMUL.I16 d0,d0,d0
uint32x2_t  vmul_u32(uint32x2_t a, uint32x2_t b);    // VMUL.I32 d0,d0,d0
poly8x8_t   vmul_p8(poly8x8_t a, poly8x8_t b);       // VMUL.P8 d0,d0,d0 
int8x16_t   vmulq_s8(int8x16_t a, int8x16_t b);      // VMUL.I8 q0,q0,q0 
int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);     // VMUL.I16 q0,q0,q0
int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);     // VMUL.I32 q0,q0,q0
float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
uint8x16_t  vmulq_u8(uint8x16_t a, uint8x16_t b);    // VMUL.I8 q0,q0,q0 
uint16x8_t  vmulq_u16(uint16x8_t a, uint16x8_t b);   // VMUL.I16 q0,q0,q0
uint32x4_t  vmulq_u32(uint32x4_t a, uint32x4_t b);   // VMUL.I32 q0,q0,q0
poly8x16_t  vmulq_p8(poly8x16_t a, poly8x16_t b);    // VMUL.P8 q0,q0,q0 

Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]

int8x8_t    vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c);            // VMLA.I8 d0,d0,d0 
int16x4_t   vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c);        // VMLA.I16 d0,d0,d0
int32x2_t   vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c);        // VMLA.I32 d0,d0,d0
float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c);  // VMLA.F32 d0,d0,d0
uint8x8_t   vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);         // VMLA.I8 d0,d0,d0 
uint16x4_t  vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c);     // VMLA.I16 d0,d0,d0
uint32x2_t  vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c);     // VMLA.I32 d0,d0,d0
int8x16_t   vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);        // VMLA.I8 q0,q0,q0 
int16x8_t   vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);       // VMLA.I16 q0,q0,q0
int32x4_t   vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);       // VMLA.I32 q0,q0,q0
float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
uint8x16_t  vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);     // VMLA.I8 q0,q0,q0 
uint16x8_t  vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);    // VMLA.I16 q0,q0,q0
uint32x4_t  vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);    // VMLA.I32 q0,q0,q0

Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]

int16x8_t  vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLAL.S8 q0,d0,d0 
int32x4_t  vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLAL.S16 q0,d0,d0
int64x2_t  vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLAL.S32 q0,d0,d0
uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLAL.U8 q0,d0,d0 
uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0

Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]

int8x8_t    vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c);            // VMLS.I8 d0,d0,d0 
int16x4_t   vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c);        // VMLS.I16 d0,d0,d0
int32x2_t   vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c);        // VMLS.I32 d0,d0,d0
float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c);  // VMLS.F32 d0,d0,d0
uint8x8_t   vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);         // VMLS.I8 d0,d0,d0 
uint16x4_t  vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c);     // VMLS.I16 d0,d0,d0
uint32x2_t  vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c);     // VMLS.I32 d0,d0,d0
int8x16_t   vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);        // VMLS.I8 q0,q0,q0 
int16x8_t   vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);       // VMLS.I16 q0,q0,q0
int32x4_t   vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);       // VMLS.I32 q0,q0,q0
float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
uint8x16_t  vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);     // VMLS.I8 q0,q0,q0 
uint16x8_t  vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);    // VMLS.I16 q0,q0,q0
uint32x4_t  vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);    // VMLS.I32 q0,q0,q0

Vector multiply subtract long: vmlsl -> Vr[i] := Va[i] - Vb[i] * Vc[i]

int16x8_t  vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLSL.S8 q0,d0,d0 
int32x4_t  vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLSL.S16 q0,d0,d0
int64x2_t  vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLSL.S32 q0,d0,d0
uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLSL.U8 q0,d0,d0 
uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0

Vector saturating doubling multiply high

int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b);  // VQDMULH.S16 d0,d0,d0
int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b);  // VQDMULH.S32 d0,d0,d0
int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0

Vector saturating rounding doubling multiply high

int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b);  // VQRDMULH.S16 d0,d0,d0
int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b);  // VQRDMULH.S32 d0,d0,d0
int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0

Vector saturating doubling multiply accumulate long

int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0

Vector saturating doubling multiply subtract long

int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0

Vector long multiply

int16x8_t  vmull_s8(int8x8_t a, int8x8_t b);      // VMULL.S8 q0,d0,d0 
int32x4_t  vmull_s16(int16x4_t a, int16x4_t b);   // VMULL.S16 q0,d0,d0
int64x2_t  vmull_s32(int32x2_t a, int32x2_t b);   // VMULL.S32 q0,d0,d0
uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b);    // VMULL.U8 q0,d0,d0 
uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b);    // VMULL.P8 q0,d0,d0 

Vector saturating doubling long multiply

int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
Copyright © 2007, 2010 ARM Limited. All rights reserved. ARM DUI 0348A
Non-Confidential