| |||
| Home > Using the NEON Vectorizing Compiler > Vectorizable loop iteration counts | |||
If a loop has a fixed iteration count, automatic vectorization is possible. The iteration count must be known at the start of the loop.
In the vectorizable loop in Table 6, the iteration count is n.
The value of n does not change throughout the
course of the loop, so this loop can be automatically vectorized.
If a loop does not have a fixed iteration count, automatic vectorization is not possible.
In the nonvectorizable loop in Table 6, i is advanced by a[i], a value
that is computed inside the loop body, so the number of iterations cannot be
determined at the start of the loop and this loop cannot be automatically
vectorized.
Table 6. Vectorizable and nonvectorizable loops
| Vectorizable loop | Nonvectorizable loop |
|---|---|
/* myprog1.c */
int a[99], b[99], c[99], i, n;
...
for (i = 0; i < n; i++)
{
a[i] = b[i] + c[i];
}
|
/* myprog2.c */
int a[99], b[99], c[99], i, n;
...
while (i < n)
{
a[i] = b[i] + c[i];
i += a[i];
};
|
armcc --cpu=Cortex-A8 -O3 -Otime --vectorize
myprog1.c -o- | armcc --cpu=Cortex-A8 -O3 -Otime --vectorize
myprog2.c |
ARM REQUIRE8 PRESERVE8 | ARM REQUIRE8 PRESERVE8 |
AREA ||.text||, CODE, READONLY, ALIGN=2 | AREA ||.text||, CODE, READONLY, ALIGN=2 |
f PROC
PUSH {r4-r6}
MOV r0,#0
LDR r4,|L1.160|
STR r0,[r4,#0] ; i
LDR r12,[r4,#4] ; n
CMP r12,#0
BLE |L1.152|
ASR r0,r12,#31
LDR r1,|L1.164|
ADD r0,r12,r0,LSR #30
ADD r2,r1,#0x18c
ASRS r0,r0,#2
SUB r3,r2,#0x318
BEQ |L1.80|
|
||foo|| PROC
LDR r3,|L1.76|
LDR r0,[r3,#0] ; i, n
LDR r2,[r3,#4]
CMP r0,r2
BXGE lr
PUSH {r4-r6}
LDR r12,|L1.80|
ADD r4,r12,#0x18c
SUB r5,r12,#0x18c
|
|L1.56|
VLD1.32 {d0,d1},[r1]!
SUBS r0,r0,#1
VLD1.32 {d2,d3},[r2]!
VADD.I32 q0,q0,q1
VST1.32 {d0,d1},[r3]!
BNE |L1.56|
|
|L1.36|
LDR r1,[r12,r0,LSL #2]
LDR r6,[r4,r0,LSL #2]
ADD r1,r1,r6
STR r1,[r5,r0,LSL #2]
ADD r0,r0,r1
CMP r0,r2
STR r0,[r3,#0] ; i
BLT |L1.36|
POP {r4-r6}
BX lr
ENDP
|
|L1.80|
AND r0,r12,#3
CMP r0,#0
BLE |L1.144|
SUB r0,r12,r0
CMP r0,r12
BGE |L1.144|
LDR r1,|L1.164|
ADD r2,r1,#0x18c
SUB r3,r2,#0x318
| |
|L1.116|
LDR r5,[r1,r0,LSL #2]
LDR r6,[r2,r0,LSL #2]
ADD r5,r5,r6
STR r5,[r3,r0,LSL #2]
ADD r0,r0,#1
CMP r0,r12
BLT |L1.116|
| |
|L1.144|
LDR r0,[r4,#4] ; n
STR r0,[r4,#0] ; i
| |
|L1.152|
POP {r4-r6}
BX lr
ENDP
|
Compiler Reference: