Dear all,
I recently started using Xeon Phi cards for parallel programming, so I am still a newbie in this field.
I wrote this code as a simple example to start understanding this fascinating world, but I got surprised when I looked at the time of executions.
When I run the code on the host, execution time is 0,08 s. When I run the code adding the pragma offload and pragma omp parallel for, execution time increase up to 9s!
When I compiled the codes, I used -O3 optimization for both of them.
Is there something I am missing?
Thanks for your help,
Flavio
#include<stdio.h> #ifdef _OPENMP #include<omp.h> #endif #define ALLOC alloc_if(1) free_if(0) #define RETAIN alloc_if(0) free_if(0) #define FREE alloc_if(0) free_if(1) #define LD long double #define MAX 100000 main(int argc, char **argv) { int i, j; LD *M = NULL; __declspec(target(mic))int cycles = 240; printf("array lenght: %d\n", cycles); //start time char fmt[64], buf1[64], buf2[64]; struct timeval tv; struct tm *tm; gettimeofday(&tv, NULL); if((tm = localtime(&tv.tv_sec)) != NULL){ strftime(fmt, sizeof fmt, "((%H*1440)+(%M*60)+%S,%%06u)", tm); snprintf(buf1, sizeof buf1, fmt, tv.tv_usec); } //array creation M = (LD*)calloc(cycles, sizeof(LD)); //allocating space on MIC #pragma offload target(mic) in(M:length(cycles) ALLOC) {} for (i=0; i<MAX; i++){ #pragma offload target(mic) inout(M:length(cycles) RETAIN) \ in(cycles) { #pragma omp parallel for private(j) #pragma ivdep for (j=0; j<cycles; j++) M[j] += 1; } //offload } //for //freeing space on MIC #pragma offload target(mic) nocopy(M:length(0) FREE) {} printf("number of cycles: %LG\n", M[0]); //tempo finale gettimeofday(&tv, NULL); if((tm = localtime(&tv.tv_sec)) != NULL){ strftime(fmt, sizeof fmt, "=((%H*1440)+(%M*60)+%S,%%06u)", tm); snprintf(buf2, sizeof buf2, fmt, tv.tv_usec); printf("%s-%s\n", buf2, buf1); } return 0; } //main