Quantcast
Channel: Intel® Many Integrated Core Architecture
Viewing all articles
Browse latest Browse all 1347

Performance degradation if loop count is not known at compile time

$
0
0

I am creating a simple matrix multiplication procedure, operating on the Intel Xeon Phi architecture.

After many attempts to get better performance from autovectorization, I had to resort to Intel intrinsics.

Until now, the matrix size was given by a #define in the source code, but when I try to give it at run time, I have a huge performance degradation.

The source code is the following:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <stddef.h>
#include <chrono>
#include <ctime>
#include <mmintrin.h>
#include <xmmintrin.h>  // SSE
#include <pmmintrin.h>  // SSE3
#include <emmintrin.h>  // SSE2
#include <immintrin.h>
#include <zmmintrin.h>

/** Byte alignment applied to the matrix buffer in main and asserted to the compiler in the kernel. */
#define ALIGNMENT 64
/** Default matrix dimension; main uses SIZE only when SIZES is also defined. */
#ifndef SIZE
#define SIZE 960
#endif

/** Assigns an all-zero 512-bit double vector to the variable c. */
#define vZero(c) {(c) = _mm512_setzero_pd();}

/**
 * Starts a timing region by declaring a local variable named `start` holding
 * the current high_resolution_clock time. Because it declares a variable, it
 * can be used only once per scope. Pair it with elapsed_time in the same scope.
 */
#define start_time() \
	auto start = std::chrono::high_resolution_clock::now();
/**
 * Ends the timing region opened by start_time (which must appear earlier in
 * the same scope). Declares the locals `elapsed` and `microseconds`, then
 * prints "<STRING>:<microseconds>" followed by a newline. `microseconds`
 * stays in scope for the caller to use after the macro.
 */
#define elapsed_time(STRING) \
	auto elapsed = std::chrono::high_resolution_clock::now() - start; \
	long long microseconds = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count(); \
	printf(#STRING":%lld\n", microseconds);

/**
 * Writes the transpose of an n-by-k row-major block into aT.
 *
 * The block is halved along its longer dimension and each half is handled
 * recursively until it holds at most 128 elements; such a small block is
 * then copied element by element.
 *
 * @param a    source block, element (i, j) at a[i*lda + j]
 * @param aT   destination block, element (j, i) at aT[j*ldat + i]
 * @param n    number of rows of the source block
 * @param k    number of columns of the source block
 * @param lda  row stride (in elements) of the source
 * @param ldat row stride (in elements) of the destination
 */
void recTranspose(double *__restrict__ a, double *__restrict__ aT, const int n, const int k, const int lda, const int ldat){
	if (n*k > 128) {
		if (k > n) {
			// Wider than tall: split the columns; the right half lands
			// `leftCols` rows further down in the destination.
			const int leftCols = (k+1)/2;
			recTranspose(a, aT, n, leftCols, lda, ldat);
			recTranspose(a + leftCols, aT + leftCols*ldat, n, k - leftCols, lda, ldat);
		} else {
			// Otherwise split the rows; the bottom half lands `topRows`
			// columns further right in the destination.
			const int topRows = (n+1)/2;
			recTranspose(a, aT, topRows, k, lda, ldat);
			recTranspose(a + topRows*lda, aT + topRows, n - topRows, k, lda, ldat);
		}
		return;
	}

	// Small block: direct element-wise transpose.
	for (int row = 0; row < n; ++row) {
		const double *srcRow = a + row*lda;
		double *dstCol = aT + row;
		for (int col = 0; col < k; ++col) {
			dstCol[col*ldat] = srcRow[col];
		}
	}
}
/**
 * Computes a tile of 30 rows by 8 columns of c.
 *
 * For r in [0, 30) and lane l in [0, 8):
 *     c[r*size + l] = sum over i in [0, size) of a[i*size + r] * b[i*size + l]
 *
 * @param a    pointer to the first of 30 consecutive doubles that are read in
 *             every row i (stride `size`); main passes a pointer into the
 *             transposed A matrix
 * @param b    pointer to the first of 8 consecutive doubles that are read in
 *             every row i (stride `size`); must be 64-byte aligned
 * @param c    pointer to the top-left element of the output tile (row stride
 *             `size`); must be 64-byte aligned
 * @param size row stride of all three matrices and length of the reduction;
 *             asserted to the compiler to be a multiple of 16
 */
inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double * __restrict__ c, const int size) {
	// One 8-lane accumulator per output row of the tile.
	__m512d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
	__m512d c10, c11, c12, c13, c14, c15, c16, c17, c18, c19;
	__m512d c20, c21, c22, c23, c24, c25, c26, c27, c28, c29;


	vZero(c0);	vZero(c1);	vZero(c2);	vZero(c3);	vZero(c4);	vZero(c5);
	vZero(c6);	vZero(c7);	vZero(c8);	vZero(c9);	vZero(c10);	vZero(c11);
	vZero(c12);	vZero(c13);	vZero(c14);	vZero(c15);	vZero(c16);	vZero(c17);
	vZero(c18);	vZero(c19);	vZero(c20);	vZero(c21);	vZero(c22);	vZero(c23);
	vZero(c24);	vZero(c25);	vZero(c26);	vZero(c27); vZero(c28);	vZero(c29);

	// No alignment is asserted for `a`: main calls this with &aT[i] where i
	// advances by 30 doubles (240 bytes), which is not a multiple of
	// ALIGNMENT, so such an assertion would be false. `a` is only read one
	// double at a time through the broadcast loads below.
	__assume_aligned(b, ALIGNMENT);
	__assume_aligned(c, ALIGNMENT);
	__assume(size%16==0);
	for(int i = 0; i < size; i++) {
		// 8 consecutive doubles of row i of b, shared by all 30 updates.
		const __m512d bv = _mm512_load_pd(b+i*size);
		// Each line broadcasts one scalar a[i*size + r] to all 8 lanes and
		// accumulates (broadcast * bv) into row r's accumulator.
		c0 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c0);
		c1 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c1);
		c2 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c2);
		c3 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c3);
		c4 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c4);
		c5 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c5);
		c6 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c6);
		c7 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c7);
		c8 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c8);
		c9 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c9);
		c10 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c10);
		c11 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c11);
		c12 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c12);
		c13 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c13);
		c14 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+14, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c14);
		c15 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+15, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c15);
		c16 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+16, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c16);
		c17 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+17, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c17);
		c18 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+18, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c18);
		c19 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+19, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c19);
		c20 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+20, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c20);
		c21 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+21, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c21);
		c22 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+22, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c22);
		c23 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+23, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c23);
		c24 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+24, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c24);
		c25 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+25, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c25);
		c26 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+26, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c26);
		c27 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+27, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c27);
		c28 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+28, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c28);
		c29 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+29, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c29);
	}

	// Write accumulator r to row r of the output tile. The previous contents
	// of c are overwritten, not accumulated into.
	_mm512_storenr_pd(c+0*size, c0);
	_mm512_storenr_pd(c+1*size, c1);
	_mm512_storenr_pd(c+2*size, c2);
	_mm512_storenr_pd(c+3*size, c3);
	_mm512_storenr_pd(c+4*size, c4);
	_mm512_storenr_pd(c+5*size, c5);
	_mm512_storenr_pd(c+6*size, c6);
	_mm512_storenr_pd(c+7*size, c7);
	_mm512_storenr_pd(c+8*size, c8);
	_mm512_storenr_pd(c+9*size, c9);

	_mm512_storenr_pd(c+10*size, c10);
	_mm512_storenr_pd(c+11*size, c11);
	_mm512_storenr_pd(c+12*size, c12);
	_mm512_storenr_pd(c+13*size, c13);
	_mm512_storenr_pd(c+14*size, c14);
	_mm512_storenr_pd(c+15*size, c15);

	_mm512_storenr_pd(c+16*size, c16);
	_mm512_storenr_pd(c+17*size, c17);
	_mm512_storenr_pd(c+18*size, c18);
	_mm512_storenr_pd(c+19*size, c19);
	_mm512_storenr_pd(c+20*size, c20);
	_mm512_storenr_pd(c+21*size, c21);
	_mm512_storenr_pd(c+22*size, c22);
	_mm512_storenr_pd(c+23*size, c23);

	_mm512_storenr_pd(c+24*size, c24);
	_mm512_storenr_pd(c+25*size, c25);
	_mm512_storenr_pd(c+26*size, c26);
	_mm512_storenr_pd(c+27*size, c27);
	_mm512_storenr_pd(c+28*size, c28);
	_mm512_storenr_pd(c+29*size, c29);
}



/**
 * Benchmark driver for the 30x8-tiled matrix multiplication.
 *
 * The matrix dimension is SIZE when the program is built with SIZES defined,
 * otherwise it is read from argv[1]. The program fills A and B with random
 * integers in [0, 20), times "transpose A + tiled multiply", prints the
 * elapsed microseconds and the Gflops rate, and then checks the result
 * against a plain triple-loop product.
 *
 * @return 0 when the result matches the reference, 1 on bad usage,
 *         allocation failure or result mismatch.
 */
int main(int argc, const char ** argv) {
#ifdef SIZES
	(void)argc;
	(void)argv;
	const int size = SIZE;

#else
	if (argc < 2) {
		fprintf(stderr, "Usage: %s <size>\n", argc > 0 ? argv[0] : "mmul");
		return 1;
	}
	const int size = atoi(argv[1]);
#endif
	// The multiply walks the matrix in 30-row by 8-column tiles and the
	// kernel tells the compiler that size%16==0, so any other size would read
	// and write outside the matrices.
	if (size <= 0 || size % 30 != 0 || size % 16 != 0) {
		fprintf(stderr, "size must be a positive multiple of 240 (got %d)\n", size);
		return 1;
	}

	// One allocation holds the five size*size matrices a, aT, b, c, d back to
	// back; the extra ALIGNMENT-1 bytes leave room to round the start up.
	const size_t elems = (size_t)size * (size_t)size;
	void* p = malloc((sizeof(double)*5*elems) + ALIGNMENT-1);
	if (p == NULL) {
		fprintf(stderr, "Out of memory allocating matrices of size %d\n", size);
		return 1;
	}
	double *__restrict__ a = (double*)(((size_t)p + ALIGNMENT-1) / ALIGNMENT * ALIGNMENT);
	double *__restrict__ aT = (double*) a+elems;
	double *__restrict__ b = aT+elems;
	double *__restrict__ c = b+elems;
	double *__restrict__ d = c+elems;
	srand(time(NULL));

	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			a[i*size+j] = (double) (rand()%20);
		}
		for(int j2=0; j2<size; j2++){
			c[i*size+j2] = 0.0;
		}
	}
	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			b[i*size+j] = (double) (rand()%20);
		}
	}


	// Timed region: transpose A, then compute C tile by tile.
	start_time();
	recTranspose(a, aT, size, size, size, size);
	for(int i = 0; i < size; i+=30) {
		for(int j = 0; j < size; j+=8) {
			eightbythirty(&aT[i], &b[j], &c[i*size+j], size);
		}
	}
	elapsed_time();
	// 2*size^3 floating-point operations per microsecond, scaled by 1e-3,
	// gives Gflops.
	double gflops = 2.0*size*size*size*1.0e-03/(microseconds);
	printf("Gflops: %f\n", gflops);

	// Reference product computed with the plain triple loop.
	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			double s = 0;
			for(int u = 0; u < size; u++) {
				s += a[i*size+u] * b[u*size+j];
			}
			d[i*size+j] = s;
		}
	}

	// Compare against the reference; stop reporting after 17 mismatches.
	int error = 0;
	for(int i = 0; i < size && error <= 16; i++) {
		for(int j = 0; j < size; j++) {
			// fabs, not abs: abs may resolve to the int overload and
			// truncate the difference.
			if(fabs(c[i*size+j] - d[i*size+j]) > 1) {
				printf("Error at %d %d , %f instead of %f\n", i, j, c[i*size+j], d[i*size+j]);
				error++;
				if(error > 16) break;
			}
		}
	}

	free(p);

	// Report success only when no mismatch was found.
	if (error != 0) {
		return 1;
	}
	printf("OK\n");
	return 0;
}

So, for example, with size 960 (for now it works only with sizes that are multiples of 30*8 = 240):

  • if I compile with compile time given size: icc -mmic -O3 -restrict -std=c++11 -DSIZES -DSIZE=960 mmul.cpp -o mmul.o

    Elapsed time: 0.460745s
    Gflops: 3.840458

  • if I compile with runtime given size: icc -mmic -O3 -restrict -std=c++11 mmul.cpp -o mmul.o

    Elapsed time: 2.204564s
    Gflops: 0.802640

I think it could be a prefetching issue: icc may be unable to recognize the memory access pattern when the size is not known at compile time. Looking at the generated asm source, the number of vprefetch instructions is much higher in the "compile time" version.

Fun fact: the check for the correct result of the multiplication (the two for loops at the end of the code, rows 178-197) is much slower in the compile-time version!

Any thoughts? I tried #pragma loop_count, but it seems to have no effect; manual prefetching with intrinsics doesn't seem to be very effective either.

Thanks in advance for any answer.

Regards,
Luca


Viewing all articles
Browse latest Browse all 1347

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>