| |||
| Home > Using the NEON Vectorizing Compiler > Vectorizable code example | |||
Example 19 shows a
complete vectorizable code example. The options required to build
this example are listed within the introductory source code comments.
The --cpu=option
must name a processor that has NEON technology, for example, Cortex-A8,
Cortex-A9, or Cortex-A15.name
--diag_warning=optimizations can be used
to view where vectorization optimization is applied.
The use of __promise enables the compiler
to generate smaller and faster code. The code still works and vectorizes
without these promises, but is then larger and slower.
Example 19. Vectorizable code
/*
* Vectorizable example code.
* Copyright 2006 ARM. All rights reserved.
*
* Includes embedded assembly to initialize cpu; link using '--entry=init_cpu'.
*
* Build using:
* armcc --vectorize -c vector_example.c --cpu Cortex-A8 -Otime -O3 -DNDEBUG
* armlink -o vector_example.axf vector_example.o --entry=init_cpu
*/
#include <stdio.h>
#include <assert.h> /* for __promise() */
void fir(short *__restrict y, const short *x, const short *h, int n_out, int n_coefs)
{
int n;
/* I promise ‘n_out is always a positive multiple of 8’ */
__promise(0 < n_out && (n_out % 8) == 0);
for (n = 0; n < n_out; n++)
{
int k, sum = 0;
/* I promise ‘n_coefs is always a positive multiple of 4’ */
__promise(0 < n_coefs && (n_coefs % 4) == 0);
for (k = 0; k < n_coefs; k++)
{
sum += h[k] * x[n - n_coefs + 1 + k];
}
y[n] = ((sum>>15) + 1) >> 1;
}
}
int main()
{
static const short x[128] =
{
0x0000, 0x0647, 0x0c8b, 0x12c8, 0x18f8, 0x1f19, 0x2528, 0x2b1f,
0x30fb, 0x36ba, 0x3c56, 0x41ce, 0x471c, 0x4c3f, 0x5133, 0x55f5,
0x5a82, 0x5ed7, 0x62f2, 0x66cf, 0x6a6d, 0x6dca, 0x70e2, 0x73b5,
0x7641, 0x7884, 0x7a7d, 0x7c29, 0x7d8a, 0x7e9d, 0x7f62, 0x7fd8,
0x8000, 0x7fd8, 0x7f62, 0x7e9d, 0x7d8a, 0x7c29, 0x7a7d, 0x7884,
0x7641, 0x73b5, 0x70e2, 0x6dca, 0x6a6d, 0x66cf, 0x62f2, 0x5ed7,
0x5a82, 0x55f5, 0x5133, 0x4c3f, 0x471c, 0x41ce, 0x3c56, 0x36ba,
0x30fb, 0x2b1f, 0x2528, 0x1f19, 0x18f8, 0x12c8, 0x0c8b, 0x0647,
0x0000, 0xf9b9, 0xf375, 0xed38, 0xe708, 0xe0e7, 0xdad8, 0xd4e1,
0xcf05, 0xc946, 0xc3aa, 0xbe32, 0xb8e4, 0xb3c1, 0xaecd, 0xaa0b,
0xa57e, 0xa129, 0x9d0e, 0x9931, 0x9593, 0x9236, 0x8f1e, 0x8c4b,
0x89bf, 0x877c, 0x8583, 0x83d7, 0x8276, 0x8163, 0x809e, 0x8028,
0x8000, 0x8028, 0x809e, 0x8163, 0x8276, 0x83d7, 0x8583, 0x877c,
0x89bf, 0x8c4b, 0x8f1e, 0x9236, 0x9593, 0x9931, 0x9d0e, 0xa129,
0xa57e, 0xaa0b, 0xaecd, 0xb3c1, 0xb8e4, 0xbe32, 0xc3aa, 0xc946,
0xcf05, 0xd4e1, 0xdad8, 0xe0e7, 0xe708, 0xed38, 0xf375, 0xf9b9,
};
static const short coeffs[8] =
{
0x0800, 0x1000, 0x2000, 0x4000,
0x4000, 0x2000, 0x1000, 0x0800
};
int i, ok = 1;
short y[128];
static const short expected[128] =
{
0x1474, 0x1a37, 0x1fe9, 0x2588, 0x2b10, 0x307d, 0x35cc, 0x3afa,
0x4003, 0x44e5, 0x499d, 0x4e27, 0x5281, 0x56a9, 0x5a9a, 0x5e54,
0x61d4, 0x6517, 0x681c, 0x6ae1, 0x6d63, 0x6fa3, 0x719d, 0x7352,
0x74bf, 0x6de5, 0x66c1, 0x5755, 0x379e, 0x379e, 0x5755, 0x66c1,
0x6de5, 0x74bf, 0x7352, 0x719d, 0x6fa3, 0x6d63, 0x6ae1, 0x681c,
0x6517, 0x61d4, 0x5e54, 0x5a9a, 0x56a9, 0x5281, 0x4e27, 0x499d,
0x44e5, 0x4003, 0x3afa, 0x35cc, 0x307d, 0x2b10, 0x2588, 0x1fe9,
0x1a37, 0x1474, 0x0ea5, 0x08cd, 0x02f0, 0xfd10, 0xf733, 0xf15b,
0xeb8c, 0xe5c9, 0xe017, 0xda78, 0xd4f0, 0xcf83, 0xca34, 0xc506,
0xbffd, 0xbb1b, 0xb663, 0xb1d9, 0xad7f, 0xa957, 0xa566, 0xa1ac,
0x9e2c, 0x9ae9, 0x97e4, 0x951f, 0x929d, 0x905d, 0x8e63, 0x8cae,
0x8b41, 0x8a1b, 0x893f, 0x88ab, 0x8862, 0x8862, 0x88ab, 0x893f,
0x8a1b, 0x8b41, 0x8cae, 0x8e63, 0x905d, 0x929d, 0x951f, 0x97e4,
0x9ae9, 0x9e2c, 0xa1ac, 0xa566, 0xa957, 0xad7f, 0xb1d9, 0xb663,
0xbb1b, 0xbffd, 0xc506, 0xca34, 0xcf83, 0xd4f0, 0xda78, 0xe017,
0xe5c9, 0xebcc, 0xf229, 0xf96a, 0x02e9, 0x0dd8, 0x1937, 0x24ce,
};
fir(y, x + 7, coeffs, 128, 8);
for (i = 0; i < sizeof(y)/sizeof(*y); ++i)
{
if (y[i] != expected[i])
{
printf("mismatch: y[%d] = 0x%04x; expected[%d] = 0x%04x\n", i, y[i], i, expected[i]);
ok = 0;
break;
}
}
if (ok) printf("** TEST PASSED OK **\n");
return ok ? 0 : 1;
}
#ifdef __TARGET_ARCH_7_A
__asm void init_cpu() {
// Set up CPU state
MRC p15,0,r4,c1,c0,0
ORR r4,r4,#0x00400000 // enable unaligned mode (U=1)
BIC r4,r4,#0x00000002 // disable alignment faults (A=0)
// MMU not enabled: no page tables
MCR p15,0,r4,c1,c0,0
#ifdef __BIG_ENDIAN
SETEND BE
#endif
MRC p15,0,r4,c1,c0,2 // Enable VFP access in the CAR -
ORR r4,r4,#0x00f00000 // must be done before any VFP instructions
MCR p15,0,r4,c1,c0,2
MOV r4,#0x40000000 // Set EN bit in FPEXC
MSR FPEXC,r4
IMPORT __main
B __main
}
#endif