E.3.25. Operations with a scalar value

Efficient code generation for these intrinsics is only guaranteed when the scalar argument is either a constant or the result of one of the vget_lane intrinsics.
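
The following sketch is illustrative only (the helper function names are invented for this example); it shows both forms of scalar argument that satisfy this requirement, using the vmul_n_s16 and vget_lane_s16 intrinsics (vmul_n_s16 is listed under Vector multiply by scalar below):

#include <arm_neon.h>

/* A compile-time constant scalar lets the compiler place the value in a
   D register and use the by-element form VMUL.I16 Dd, Dn, Dm[x]. */
int16x4_t scale_by_three(int16x4_t v)
{
    return vmul_n_s16(v, 3);               /* constant scalar */
}

/* A scalar obtained with vget_lane_s16 is equally efficient, because the
   value can remain in a lane of the source D register. */
int16x4_t scale_by_first_coeff(int16x4_t v, int16x4_t coeffs)
{
    int16_t c = vget_lane_s16(coeffs, 0);  /* scalar taken from a lane */
    return vmul_n_s16(v, c);
}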

Vector multiply accumulate with scalar

int16x4_t   vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLA.I16 d0, d0, d0[0]
int32x2_t   vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLA.I32 d0, d0, d0[0]
uint16x4_t  vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLA.I16 d0, d0, d0[0]
uint32x2_t  vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLA.I32 d0, d0, d0[0]
float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l);
                                                // VMLA.F32 d0, d0, d0[0]
int16x8_t   vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLA.I16 q0, q0, d0[0]
int32x4_t   vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLA.I32 q0, q0, d0[0]
uint16x8_t  vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLA.I16 q0, q0, d0[0]
uint32x4_t  vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLA.I32 q0, q0, d0[0]
float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l);
                                                // VMLA.F32 q0, q0, d0[0]
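
As an illustrative sketch (the function name is not from this listing), a lane-indexed multiply accumulate adds each element of b multiplied by a single lane of v to the corresponding element of a:

#include <arm_neon.h>

/* acc[i] = acc[i] + x[i] * coeffs[1] for all four lanes. The lane index
   must be a compile-time constant within the __constrange bounds
   (0..1 for a float32x2_t vector). */
float32x4_t mla_by_coeff1(float32x4_t acc, float32x4_t x, float32x2_t coeffs)
{
    return vmlaq_lane_f32(acc, x, coeffs, 1);   /* VMLA.F32 Qd, Qn, Dm[1] */
}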

Vector widening multiply accumulate with scalar

int32x4_t   vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLAL.S16 q0, d0, d0[0]
int64x2_t   vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLAL.S32 q0, d0, d0[0]
uint32x4_t  vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLAL.U16 q0, d0, d0[0]
uint64x2_t  vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLAL.U32 q0, d0, d0[0]

Vector widening saturating doubling multiply accumulate with scalar

int32x4_t   vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VQDMLAL.S16 q0, d0, d0[0]
int64x2_t   vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VQDMLAL.S32 q0, d0, d0[0]
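
As a hedged illustration (the Q15/Q31 fixed-point interpretation and the function name are assumptions, not part of the listing), the saturating doubling form is typically used for fixed-point accumulation:

#include <arm_neon.h>

/* acc[i] = sat(acc[i] + sat(2 * samples[i] * taps[0])); each 16-bit product
   is widened to 32 bits, and doubling turns a Q15 x Q15 product into Q31. */
int32x4_t q15_mac_tap0(int32x4_t acc, int16x4_t samples, int16x4_t taps)
{
    return vqdmlal_lane_s16(acc, samples, taps, 0);   /* VQDMLAL.S16 Qd, Dn, Dm[0] */
}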

Vector multiply subtract with scalar

int16x4_t   vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLS.I16 d0, d0, d0[0]
int32x2_t   vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLS.I32 d0, d0, d0[0]
uint16x4_t  vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLS.I16 d0, d0, d0[0]
uint32x2_t  vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLS.I32 d0, d0, d0[0]
float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l);
                                                // VMLS.F32 d0, d0, d0[0]
int16x8_t   vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLS.I16 q0, q0, d0[0]
int32x4_t   vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLS.I32 q0, q0, d0[0]
uint16x8_t  vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLS.I16 q0, q0, d0[0]
uint32x4_t  vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLS.I32 q0, q0, d0[0]
float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l);
                                                // VMLS.F32 q0, q0, d0[0]

Vector widening multiply subtract with scalar

int32x4_t   vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VMLSL.S16 q0, d0, d0[0]
int64x2_t   vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VMLSL.S32 q0, d0, d0[0]
uint32x4_t  vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l);
                                                // VMLSL.U16 q0, d0, d0[0]
uint64x2_t  vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l);
                                                // VMLSL.U32 q0, d0, d0[0]

Vector widening saturating doubling multiply subtract with scalar

int32x4_t   vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l);
                                                // VQDMLSL.S16 q0, d0, d0[0]
int64x2_t   vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l);
                                                // VQDMLSL.S32 q0, d0, d0[0]

Vector multiply by scalar

int16x4_t   vmul_n_s16(int16x4_t a, int16_t b);      // VMUL.I16 d0,d0,d0[0]
int32x2_t   vmul_n_s32(int32x2_t a, int32_t b);      // VMUL.I32 d0,d0,d0[0]
float32x2_t vmul_n_f32(float32x2_t a, float32_t b);  // VMUL.F32 d0,d0,d0[0]
uint16x4_t  vmul_n_u16(uint16x4_t a, uint16_t b);    // VMUL.I16 d0,d0,d0[0]
uint32x2_t  vmul_n_u32(uint32x2_t a, uint32_t b);    // VMUL.I32 d0,d0,d0[0]
int16x8_t   vmulq_n_s16(int16x8_t a, int16_t b);     // VMUL.I16 q0,q0,d0[0]
int32x4_t   vmulq_n_s32(int32x4_t a, int32_t b);     // VMUL.I32 q0,q0,d0[0]
float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
uint16x8_t  vmulq_n_u16(uint16x8_t a, uint16_t b);   // VMUL.I16 q0,q0,d0[0]
uint32x4_t  vmulq_n_u32(uint32x4_t a, uint32_t b);   // VMUL.I32 q0,q0,d0[0]

Vector long multiply with scalar

int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2);    // VMULL.S16 q0,d0,d0[0]
int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2);    // VMULL.S32 q0,d0,d0[0]
uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]

Vector long multiply by scalar

int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                                          // VMULL.S16 q0,d0,d0[0]
int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                                          // VMULL.S32 q0,d0,d0[0]
uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3);
                                                          // VMULL.U16 q0,d0,d0[0]
uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3);
                                                          // VMULL.U32 q0,d0,d0[0]
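
A minimal sketch (function name invented for illustration) of the lane form, which returns full-width products and therefore cannot overflow the intermediate result:

#include <arm_neon.h>

/* result[i] = (int32_t)x[i] * (int32_t)coeffs[2] for all four lanes. */
int32x4_t widening_mul_by_coeff2(int16x4_t x, int16x4_t coeffs)
{
    return vmull_lane_s16(x, coeffs, 2);   /* VMULL.S16 Qd, Dn, Dm[2] */
}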

Vector saturating doubling long multiply with scalar

int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2);    // VQDMULL.S16 q0,d0,d0[0]
int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2);    // VQDMULL.S32 q0,d0,d0[0]

Vector saturating doubling long multiply by scalar

int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                                // VQDMULL.S16 q0,d0,d0[0]
int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                                // VQDMULL.S32 q0,d0,d0[0]

Vector saturating doubling multiply high with scalar

int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2);     // VQDMULH.S16 d0,d0,d0[0]
int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2);     // VQDMULH.S32 d0,d0,d0[0]
int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);    // VQDMULH.S16 q0,q0,d0[0]
int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);    // VQDMULH.S32 q0,q0,d0[0]

Vector saturating doubling multiply high by scalar

int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                                // VQDMULH.S16 d0,d0,d0[0]
int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                                // VQDMULH.S32 d0,d0,d0[0]
int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                                // VQDMULH.S16 q0,q0,d0[0]
int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                               // VQDMULH.S32 q0,q0,d0[0]

Vector saturating rounding doubling multiply high with scalar

int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2);     // VQRDMULH.S16 d0,d0,d0[0]
int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2);     // VQRDMULH.S32 d0,d0,d0[0]
int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);    // VQRDMULH.S16 q0,q0,d0[0]
int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);    // VQRDMULH.S32 q0,q0,d0[0]
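
As a hedged example (the Q15 interpretation and function name are assumptions), this form performs the usual rounded fixed-point multiply, returning the high half of the doubled product with rounding and saturation:

#include <arm_neon.h>

/* result[i] = sat((2 * samples[i] * gain + 0x8000) >> 16), i.e. a rounded,
   saturated Q15 x Q15 -> Q15 multiply of every lane by gain. */
int16x8_t apply_gain_q15(int16x8_t samples, int16_t gain)
{
    return vqrdmulhq_n_s16(samples, gain);   /* VQRDMULH.S16 Qd, Qn, Dm[0] */
}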

Vector saturating rounding doubling multiply high by scalar

int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                               // VQRDMULH.S16 d0,d0,d0[0]
int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                               // VQRDMULH.S32 d0,d0,d0[0]
int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3);
                                               // VQRDMULH.S16 q0,q0,d0[0]
int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3);
                                               // VQRDMULH.S32 q0,q0,d0[0]

Vector multiply accumulate with scalar

int16x4_t   vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c);           // VMLA.I16 d0, d0, d0[0]  
int32x2_t   vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c);           // VMLA.I32 d0, d0, d0[0]  
uint16x4_t  vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c);        // VMLA.I16 d0, d0, d0[0]  
uint32x2_t  vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c);        // VMLA.I32 d0, d0, d0[0]  
float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c);     // VMLA.F32 d0, d0, d0[0]  
int16x8_t   vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);          // VMLA.I16 q0, q0, d0[0]  
int32x4_t   vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);          // VMLA.I32 q0, q0, d0[0]  
uint16x8_t  vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);       // VMLA.I16 q0, q0, d0[0]  
uint32x4_t  vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);       // VMLA.I32 q0, q0, d0[0]  
float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);    // VMLA.F32 q0, q0, d0[0]  

Vector widening multiply accumulate with scalar

int32x4_t   vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c);       // VMLAL.S16 q0, d0, d0[0] 
int64x2_t   vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c);       // VMLAL.S32 q0, d0, d0[0] 
uint32x4_t  vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c);    // VMLAL.U16 q0, d0, d0[0] 
uint64x2_t  vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c);    // VMLAL.U32 q0, d0, d0[0] 

Vector widening saturating doubling multiply accumulate with scalar

int32x4_t   vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c);     // VQDMLAL.S16 q0, d0, d0[0]  
int64x2_t   vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c);     // VQDMLAL.S32 q0, d0, d0[0]  

Vector multiply subtract with scalar

int16x4_t   vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c);        // VMLS.I16 d0, d0, d0[0]  
int32x2_t   vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c);        // VMLS.I32 d0, d0, d0[0]  
uint16x4_t  vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c);     // VMLS.I16 d0, d0, d0[0]  
uint32x2_t  vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c);     // VMLS.I32 d0, d0, d0[0]  
float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c);  // VMLS.F32 d0, d0, d0[0]  
int16x8_t   vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);       // VMLS.I16 q0, q0, d0[0]  
int32x4_t   vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);       // VMLS.I32 q0, q0, d0[0]  
uint16x8_t  vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);    // VMLS.I16 q0, q0, d0[0]  
uint32x4_t  vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);    // VMLS.I32 q0, q0, d0[0]  
float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]  

Vector widening multiply subtract with scalar

int32x4_t   vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c);       // VMLSL.S16 q0, d0, d0[0] 
int64x2_t   vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c);       // VMLSL.S32 q0, d0, d0[0] 
uint32x4_t  vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c);    // VMLSL.U16 q0, d0, d0[0] 
uint64x2_t  vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c);    // VMLSL.U32 q0, d0, d0[0] 

Vector widening saturating doubling multiply subtract with scalar

int32x4_t   vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c);   // VQDMLSL.S16 q0, d0, d0[0]  
int64x2_t   vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c);   // VQDMLSL.S32 q0, d0, d0[0]  