I am creating a simple matrix multiplication procedure, operating on the Intel Xeon Phi architecture.
I am using, aligned data. However, if the matrices are allocated using dynamic memory (posix_memalign), the computation incurs in a severe slow down, i.e. for TYPE=float and 512x512 matrices takes ~0.55s in the dynamic case while in the other case ~0.07s.
On a different architecture (Intel Xeon E5-2650 @ 2.00GHz), the problem changes because the static allocated case doesn't calculate the matrix (it gives me all zeros when i print a random position of C, I think because the #pragma simd. Anyway, the dynamic allocating case takes about 0.08s.
Here is the code, i also attached the optimization reports of static & dynamic cases:
#define ROW 512 #define COLWIDTH 512 #define REPEATNTIMES 512 #include <sys/time.h> #include <stdio.h> #include <math.h> #include <stdlib.h> #define FTYPE float #define ALIGNMENT 128 double clock_it(void) { double duration = 0.0; struct timeval start; gettimeofday(&start, NULL); duration = (double)(start.tv_sec + start.tv_usec/1000000.0); return duration; } int main() { double execTime = 0.0; double startTime, endTime; int k, size1, size2, i, j; #ifdef STACK printf("Using Stack!\n"); FTYPE a[ROW][COLWIDTH]; FTYPE b[ROW][COLWIDTH]; FTYPE c[ROW][COLWIDTH]; for(i=0; i<ROW; i++){ for(j=0; j<COLWIDTH; j++){ a[i][j] = 1.0f; b[i][j] = 1.0f; c[i][j] = 0.0f; } } #else printf("Using Heap!\n"); FTYPE **a; posix_memalign((void **) &a, ALIGNMENT, sizeof(FTYPE*)*ROW); FTYPE **b; posix_memalign((void **) &b, ALIGNMENT, sizeof(FTYPE*)*ROW); FTYPE **c; posix_memalign((void **) &c, ALIGNMENT, sizeof(FTYPE*)*ROW); for(i=0; i<ROW; i++){ posix_memalign((void **) &a[i], ALIGNMENT, sizeof(FTYPE)*COLWIDTH); posix_memalign((void **) &b[i], ALIGNMENT, sizeof(FTYPE)*COLWIDTH); posix_memalign((void **) &c[i], ALIGNMENT, sizeof(FTYPE)*COLWIDTH); for(j=0; j<COLWIDTH; j++){ a[i][j] = 1.0f; b[i][j] = 1.0f; c[i][j] = 0.0f; } } #endif size1 = ROW; size2 = COLWIDTH; printf("\nROW:%d COL: %d\n",ROW,COLWIDTH); //start timing the matrix multiply code startTime = clock_it(); #ifndef STACK __assume_aligned(a, ALIGNMENT); __assume_aligned(b, ALIGNMENT); __assume_aligned(c, ALIGNMENT); #endif #pragma vector aligned for (i = 0; i < REPEATNTIMES; i++) { #pragma vector aligned for (k = 0; k < size1; k++) { #pragma simd #pragma vector aligned for (j = 0;j < size2; j++) { #ifndef STACK __assume_aligned(a[i], ALIGNMENT); __assume_aligned(b[k], ALIGNMENT); __assume_aligned(c[i], ALIGNMENT); #endif c[i][j] += a[i][k] * b[k][j]; } } } endTime = clock_it(); execTime = endTime - startTime; printf("Execution time is %2.3f seconds\n", execTime); printf("GigaFlops = %f\n", (((double)REPEATNTIMES * (double)COLWIDTH * (double)ROW * 2.0) / (double)(execTime))/1000000000.0); printf("Random c_i,j %f\n", c[rand()%512][rand()%512]); return 0; }
Any help is appreciated!