## 18.5 NEON intrinsics for multiplication

These intrinsics provide operations including multiplication.

### Note

This topic describes the semantics of the intrinsics, rather than the semantics of the corresponding instructions.

For example, `Vr[i] := Va[i] + Vb[i] * Vc[i]` describes the semantics of the `vmla{q}_<type>` intrinsic, rather than the `VMLA` instruction.

The `VMLA` instruction uses three registers: it multiplies the values in the two operand registers, adds the value in the destination register, and places the final result in the destination register. That is: `Va[i] := Va[i] + Vb[i] * Vc[i]`.

For the corresponding intrinsic, however, the result vector `Vr` need not be the same entity as `Va`. In the following example, the result is held in a separate variable `r`:

```c
int8x8_t f(int8x8_t a, int8x8_t b, int8x8_t c)
{
    int8x8_t r = vmla_s8(a, b, c);
    return r;
}
```

## Vector multiply: vmul{q}_<type>. Vr[i] := Va[i] * Vb[i]

```c
int8x8_t    vmul_s8(int8x8_t a, int8x8_t b);         // VMUL.I8 d0,d0,d0
int16x4_t   vmul_s16(int16x4_t a, int16x4_t b);      // VMUL.I16 d0,d0,d0
int32x2_t   vmul_s32(int32x2_t a, int32x2_t b);      // VMUL.I32 d0,d0,d0
float32x2_t vmul_f32(float32x2_t a, float32x2_t b);  // VMUL.F32 d0,d0,d0
uint8x8_t   vmul_u8(uint8x8_t a, uint8x8_t b);       // VMUL.I8 d0,d0,d0
uint16x4_t  vmul_u16(uint16x4_t a, uint16x4_t b);    // VMUL.I16 d0,d0,d0
uint32x2_t  vmul_u32(uint32x2_t a, uint32x2_t b);    // VMUL.I32 d0,d0,d0
poly8x8_t   vmul_p8(poly8x8_t a, poly8x8_t b);       // VMUL.P8 d0,d0,d0
int8x16_t   vmulq_s8(int8x16_t a, int8x16_t b);      // VMUL.I8 q0,q0,q0
int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);     // VMUL.I16 q0,q0,q0
int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);     // VMUL.I32 q0,q0,q0
float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
uint8x16_t  vmulq_u8(uint8x16_t a, uint8x16_t b);    // VMUL.I8 q0,q0,q0
uint16x8_t  vmulq_u16(uint16x8_t a, uint16x8_t b);   // VMUL.I16 q0,q0,q0
uint32x4_t  vmulq_u32(uint32x4_t a, uint32x4_t b);   // VMUL.I32 q0,q0,q0
poly8x16_t  vmulq_p8(poly8x16_t a, poly8x16_t b);    // VMUL.P8 q0,q0,q0
```

## Vector multiply accumulate: vmla{q}_<type>. Vr[i] := Va[i] + Vb[i] * Vc[i]

```c
int8x8_t    vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c);        // VMLA.I8 d0,d0,d0
int16x4_t   vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c);    // VMLA.I16 d0,d0,d0
int32x2_t   vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c);    // VMLA.I32 d0,d0,d0
float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c);
// VMLA.F32 d0,d0,d0
uint8x8_t   vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);     // VMLA.I8 d0,d0,d0
uint16x4_t  vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
uint32x2_t  vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
int8x16_t   vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);    // VMLA.I8 q0,q0,q0
int16x8_t   vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);   // VMLA.I16 q0,q0,q0
int32x4_t   vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);   // VMLA.I32 q0,q0,q0
float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);
// VMLA.F32 q0,q0,q0
uint8x16_t  vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
uint16x8_t  vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);// VMLA.I16 q0,q0,q0
uint32x4_t  vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);// VMLA.I32 q0,q0,q0
```

## Vector multiply accumulate long: vmlal_<type>. Vr[i] := Va[i] + Vb[i] * Vc[i]

```c
int16x8_t  vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLAL.S8 q0,d0,d0
int32x4_t  vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLAL.S16 q0,d0,d0
int64x2_t  vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLAL.S32 q0,d0,d0
uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLAL.U8 q0,d0,d0
uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
```

## Vector multiply subtract: vmls{q}_<type>. Vr[i] := Va[i] - Vb[i] * Vc[i]

```c
int8x8_t    vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c);        // VMLS.I8 d0,d0,d0
int16x4_t   vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c);    // VMLS.I16 d0,d0,d0
int32x2_t   vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c);    // VMLS.I32 d0,d0,d0
float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c);
// VMLS.F32 d0,d0,d0
uint8x8_t   vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c);     // VMLS.I8 d0,d0,d0
uint16x4_t  vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
uint32x2_t  vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
int8x16_t   vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);    // VMLS.I8 q0,q0,q0
int16x8_t   vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);   // VMLS.I16 q0,q0,q0
int32x4_t   vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);   // VMLS.I32 q0,q0,q0
float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);
// VMLS.F32 q0,q0,q0
uint8x16_t  vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
uint16x8_t  vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);// VMLS.I16 q0,q0,q0
uint32x4_t  vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);// VMLS.I32 q0,q0,q0
```

## Vector multiply subtract long: vmlsl_<type>. Vr[i] := Va[i] - Vb[i] * Vc[i]

```c
int16x8_t  vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c);       // VMLSL.S8 q0,d0,d0
int32x4_t  vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c);    // VMLSL.S16 q0,d0,d0
int64x2_t  vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c);    // VMLSL.S32 q0,d0,d0
uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c);    // VMLSL.U8 q0,d0,d0
uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
```

## Vector saturating doubling multiply high: vqdmulh{q}_<type>

```c
int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b);  // VQDMULH.S16 d0,d0,d0
int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b);  // VQDMULH.S32 d0,d0,d0
int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
```

## Vector saturating rounding doubling multiply high: vqrdmulh{q}_<type>

```c
int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b);  // VQRDMULH.S16 d0,d0,d0
int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b);  // VQRDMULH.S32 d0,d0,d0
int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
```

## Vector saturating doubling multiply accumulate long: vqdmlal_<type>

```c
int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
```

## Vector saturating doubling multiply subtract long: vqdmlsl_<type>

```c
int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
```

## Vector long multiply: vmull_<type>. Vr[i] := Va[i] * Vb[i]

```c
int16x8_t  vmull_s8(int8x8_t a, int8x8_t b);      // VMULL.S8 q0,d0,d0
int32x4_t  vmull_s16(int16x4_t a, int16x4_t b);   // VMULL.S16 q0,d0,d0
int64x2_t  vmull_s32(int32x2_t a, int32x2_t b);   // VMULL.S32 q0,d0,d0
uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b);    // VMULL.U8 q0,d0,d0
uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b);    // VMULL.P8 q0,d0,d0
```

## Vector saturating doubling long multiply: vqdmull_<type>

```c
int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
```