Professional Documents
Culture Documents
Report Template PDF
Report Template PDF
root@beaglebone:~# cat /proc/cpuinfo
processor       : 0
model name      : ARMv7 Processor rev 2 (v7l)
BogoMIPS        : 990.68
Features        : swp half thumb fastmult vfp edsp thumbee neon vfpv3 tls
CPU implementer : 0x41
CPU architecture: 7
CPU variant     : 0x3
CPU part        : 0xc08
CPU revision    : 2
Hardware        : Generic AM33XX (Flattened Device Tree)
Revision        : 0000
Serial          : 0000000000000000
Optimizations
-ftree-vectorize (Optimization #1)
This flag enables vectorization; it speeds up the code by vectorizing all the loops that can be vectorized. In this case the memory accesses are predictable and there are no conditional branches. The compiler recognizes that and auto-vectorizes wherever possible.
Performance Impact
There was a considerable speedup due to this optimization.
Summary
Overall performance improvement:
Initial Cycles: 59040566 cycles
Final Cycles: 9601671 cycles
Speedup: (59596 - 9692)/59596 * 100 = 83.74%
Which single optimization gave the largest improvement?
The optimization that gave the largest improvement was the inclusion of the vectorization flags. It enabled the loop to be vectorized and allowed the NEON unit to process 4 words simultaneously. Using the intrinsic instructions wasn't that useful, as the code was already being vectorized automatically; hence that optimization was abandoned. The code was finally sped up by 83.74%.
Makefile
PROJ_NAME = project1
CC = gcc
VECTFLAGS = -ftree-vectorize -ffast-math -fsingle-precision-constant -ftree-vectorizer-verbose=2 -mvectorize-with-neon-quad -mfloat-abi=softfp -mfpu=neon
CFLAGS = -Wall -O3 -march=armv7-a -mtune=cortex-a8 $(VECTFLAGS) -funroll-loops
LIBS = -lm -lrt
OBJFILES := $(patsubst %.c,%.o,$(wildcard *.c))

$(PROJ_NAME): $(OBJFILES)
#	echo $(OBJFILES)
	$(CC) -o $(PROJ_NAME) $(OBJFILES) $(LIBS)

%.o: %.c
	$(CC) $(CFLAGS) -c -o $@ $<

%.lst: %.c
	$(CC) $(CFLAGS) -Wa,-adhln $(LIBS) $< > $@

clean:
	rm -f *.o *.lst
void enable_runfast() { static const unsigned int x = 0x04086060; static const unsigned int y = 0x03000000; int r; asm volatile ( "fmrx %0, fpscr "and "orr %0, %0, %1 %0, %0, %2 \n\t" \n\t" \n\t" \n\t" //r0 = FPSCR //r0 = r0 & 0x04086060 //r0 = r0 | 0x03000000 //FPSCR = r0
/*
 * Entry point: blends a foreground image over a background image and reports
 * how long the blend took.
 *
 * Usage: <prog> foreground background outFile
 *
 * Both inputs are read as 512 rows of 512 ints into backImage / foreImage,
 * alphaBlend_c() is timed with gettimeofday(), and newImage is written to
 * outFile in the same layout.
 *
 * Exit status:
 *   0  success
 *   1  wrong number of arguments
 *   2  one of the three files could not be opened
 *   3  short read of the background image
 *   4  short read of the foreground image
 *   5  short write of the output image
 */
int main(int argc, char **argv)
{
    FILE *fgFile, *bgFile, *outFile;
    struct timeval oldTv, newTv;
    long elapsedUs;
    int status = 0;

    //enable_runfast();

    if (argc != 4) {
        fprintf(stderr, "Usage:%s foreground background outFile\n", argv[0]);
        return 1;
    }

    fgFile = fopen(argv[1], "rb");
    bgFile = fopen(argv[2], "rb");
    outFile = fopen(argv[3], "wb");
    if (!fgFile || !bgFile || !outFile) {
        fprintf(stderr, "Problem opening a file\n");
        status = 2;
        goto cleanup;
    }

    /* Each fread item is one full row, so a complete image yields 512 items. */
    if (fread(backImage, 512 * sizeof(int), 512, bgFile) != 512) {
        fprintf(stderr, "Error with backImage\n");
        status = 3;
        goto cleanup;
    }
    if (fread(foreImage, 512 * sizeof(int), 512, fgFile) != 512) {
        fprintf(stderr, "Error with foreImage\n");
        status = 4;
        goto cleanup;
    }

    gettimeofday(&oldTv, NULL);
    alphaBlend_c(&foreImage[0], &backImage[0], &newImage[0]);
    gettimeofday(&newTv, NULL);

    /*
     * Include the seconds field: tv_usec alone wraps every second, which
     * gives a wrong (possibly negative) duration when the interval crosses
     * a second boundary.
     */
    elapsedUs = (long)(newTv.tv_sec - oldTv.tv_sec) * 1000000L
              + (long)(newTv.tv_usec - oldTv.tv_usec);
    fprintf(stdout, "Routine took %ld microseconds\n", elapsedUs);

    if (fwrite(newImage, 512 * sizeof(int), 512, outFile) != 512) {
        fprintf(stderr, "Error writing output image\n");
        status = 5;
    }

cleanup:
    /* fclose(NULL) is undefined, so only close the streams that opened. */
    if (fgFile) {
        fclose(fgFile);
    }
    if (bgFile) {
        fclose(bgFile);
    }
    if (outFile) {
        fclose(outFile);
    }
    return status;
}
/* Channel extractors for a pixel packed as 0xAARRGGBB in one 32-bit word. */
#define A(x) (((x) & 0xff000000) >> 24)
#define R(x) (((x) & 0x00ff0000) >> 16)
#define G(x) (((x) & 0x0000ff00) >> 8)
#define B(x) ((x) & 0x000000ff)

/*
 * Alpha-blends a 512x512 foreground image over a background image.
 *
 * fgImage  - foreground pixels (0xAARRGGBB); the alpha channel of each pixel
 *            is the blend weight.
 * bgImage  - background pixels; their alpha channel is ignored.
 * dstImage - receives the blended pixels; output alpha is always 0xff.
 *
 * All three buffers must hold 512*512 ints.
 *
 * Per channel: dst = (fg * a + bg * (255 - a)) >> 8. Dividing by 256 instead
 * of 255 is a deliberate approximation, so a fully opaque 0xff channel comes
 * out as 0xfe.
 */
void alphaBlend_c(int *fgImage, int *bgImage, int* __restrict dstImage)
{
    enum { DIM = 512 };
    int x, y;

    for (y = 0; y < DIM; y++) {
        for (x = 0; x < DIM; x++) {
            int pos = (y * DIM) + x;
            int a_fg = A(fgImage[pos]);

            /*
             * Red and green stay as unshifted 16-bit products; their ">> 8"
             * is folded into the masks when the pixel is packed below.
             */
            int dst_r = (R(fgImage[pos]) * a_fg) + (R(bgImage[pos]) * (255 - a_fg));
            int dst_g = (G(fgImage[pos]) * a_fg) + (G(bgImage[pos]) * (255 - a_fg));
            int dst_b = ((B(fgImage[pos]) * a_fg) + (B(bgImage[pos]) * (255 - a_fg))) >> 8;

            /*
             * The high byte of dst_r lands in bits 16-23 after << 8, the high
             * byte of dst_g already sits in bits 8-15, and dst_b is in bits 0-7.
             */
            dstImage[pos] = 0xff000000
                          | (0x00ff0000 & (dst_r << 8))
                          | (0x0000ff00 & dst_g)
                          | (0x000000ff & dst_b);
        }
    }
}