Performance degradation if loop count is not known at compile time

I am creating a simple matrix multiplication procedure, operating on the Intel Xeon Phi architecture.

After many attempts at getting better performance through autovectorization, I had to resort to Intel intrinsics.

Until now, the matrix size was given by a #define in the source code, but when I try to give it at run time, I have a huge performance degradation.

The source code is the following:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <stddef.h>
#include <chrono>
#include <ctime>
#include <mmintrin.h>
#include <xmmintrin.h>  // SSE
#include <pmmintrin.h>  // SSE3
#include <emmintrin.h>  // SSE2
#include <immintrin.h>
#include <zmmintrin.h>

#define ALIGNMENT 64
#ifndef SIZE
#define SIZE 960
#endif

/** Sets the 512-bit double-precision vector `c` to all zeros. */
#define vZero(c) {(c) = _mm512_setzero_pd();}

/**
 * Records the current high_resolution_clock time in a new local named `start`.
 * Usage: put start_time(); before the code to measure and elapsed_time(label);
 * after it, in the same scope. Both macros declare locals, so each can be used
 * at most once per scope.
 */
#define start_time() \
	auto start = std::chrono::high_resolution_clock::now();
/**
 * Prints "<STRING>:<microseconds>" for the time elapsed since start_time().
 * STRING is stringized, so it is passed without quotes (it may be empty).
 * Declares the locals `elapsed` and `microseconds`; `microseconds` remains
 * visible to the code that follows the macro.
 */
#define elapsed_time(STRING) \
	auto elapsed = std::chrono::high_resolution_clock::now() - start; \
	long long microseconds = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count(); \
	printf(#STRING":%lld\n", microseconds);

/**
 * Out-of-place recursive transpose: writes the transpose of the n-by-k
 * row-major matrix `a` (row stride `lda`) into `aT` (row stride `ldat`),
 * so that aT[j*ldat+i] == a[i*lda+j] for 0 <= i < n, 0 <= j < k.
 *
 * The larger dimension is halved (rounding the first half up) until a tile
 * holds at most 128 elements, which is then copied element by element.
 * `a` and `aT` must not overlap.
 */
void recTranspose(double *__restrict__ a, double *__restrict__ aT, const int n, const int k, const int lda, const int ldat){
	// Base case: the tile is small enough to transpose directly.
	if (n*k <= 128) {
		for (int row = 0; row < n; ++row) {
			for (int col = 0; col < k; ++col) {
				aT[col*ldat + row] = a[row*lda + col];
			}
		}
		return;
	}

	// Wider than tall: split the columns of `a` (which are the rows of `aT`).
	if (k > n) {
		const int leftCols = (k+1)/2;
		recTranspose(a, aT, n, leftCols, lda, ldat);
		recTranspose(a + leftCols, aT + leftCols*ldat, n, k - leftCols, lda, ldat);
		return;
	}

	// Otherwise split the rows of `a` (which are the columns of `aT`).
	const int topRows = (n+1)/2;
	recTranspose(a, aT, topRows, k, lda, ldat);
	recTranspose(a + topRows*lda, aT + topRows, n - topRows, k, lda, ldat);
}
/**
 * Calculates 8 cols and 30 rows of c.
 *
 * For r in 0..29 this computes
 *   c[r*size + 0..7] = sum over i in [0, size) of a[i*size + r] * b[i*size + 0..7]
 * so `a` is read column-wise (main passes a pointer into the transposed
 * matrix), `b` is read as 8 consecutive doubles per row, and the previous
 * contents of `c` are overwritten, not accumulated into.
 *
 * @param a    start of the 30 consecutive values read per step i (stride `size`)
 * @param b    start of the 8 consecutive values read per step i (stride `size`)
 * @param c    top-left element of the 30x8 output tile (row stride `size`)
 * @param size row stride of all three matrices and the length of the reduction
 *
 * All 30 partial results are kept in separate vector variables for the whole
 * loop; the tile is only written to memory once, after the loop.
 */
inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double * __restrict__ c, const int size) {
	// One 8-wide accumulator per output row of the tile.
	__m512d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
	__m512d c10, c11, c12, c13, c14, c15, c16, c17, c18, c19;
	__m512d c20, c21, c22, c23, c24, c25, c26, c27, c28, c29;


	vZero(c0);	vZero(c1);	vZero(c2);	vZero(c3);	vZero(c4);	vZero(c5);
	vZero(c6);	vZero(c7);	vZero(c8);	vZero(c9);	vZero(c10);	vZero(c11);
	vZero(c12);	vZero(c13);	vZero(c14);	vZero(c15);	vZero(c16);	vZero(c17);
	vZero(c18);	vZero(c19);	vZero(c20);	vZero(c21);	vZero(c22);	vZero(c23);
	vZero(c24);	vZero(c25);	vZero(c26);	vZero(c27); vZero(c28);	vZero(c29);

	// Compiler hints (Intel compiler builtins): promise 64-byte alignment of the
	// three pointers and that size is a multiple of 16. These are not checked at
	// run time; the caller must guarantee them.
	__assume_aligned(a, ALIGNMENT);
	__assume_aligned(b, ALIGNMENT);
	__assume_aligned(c, ALIGNMENT);
	__assume(size%16==0);
	for(int i = 0; i < size; i++) {
		// 8 consecutive doubles of b's row i (aligned load).
		const __m512d bv = _mm512_load_pd(b+i*size);
		// For each of the 30 rows: broadcast the single double a[i*size+r] to all
		// 8 lanes (_MM_BROADCAST_1X8, no up-conversion) and fused-multiply-add it
		// with bv into that row's accumulator.
		c0 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c0);
		c1 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c1);
		c2 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c2);
		c3 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c3);
		c4 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c4);
		c5 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c5);
		c6 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c6);
		c7 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c7);
		c8 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c8);
		c9 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c9);
		c10 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c10);
		c11 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0),bv, c11);
		c12 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c12);
		c13 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c13);
		c14 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+14, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c14);
		c15 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+15, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c15);
		c16 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+16, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c16);
		c17 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+17, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c17);
		c18 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+18, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c18);
		c19 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+19, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c19);
		c20 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+20, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c20);
		c21 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+21, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c21);
		c22 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+22, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c22);
		c23 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+23, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c23);
		c24 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+24, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c24);
		c25 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+25, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c25);
		c26 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+26, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c26);
		c27 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+27, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c27);
		c28 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+28, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c28);
		c29 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+29, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c29);
	}

	// Write the finished tile, one 8-wide row per store, row stride `size`.
	// NOTE(review): _mm512_storenr_pd is presumably the KNC "no-read hint"
	// store (the destination line is not fetched first) — confirm against the
	// Intel intrinsics reference.
	_mm512_storenr_pd(c+0*size, c0);
	_mm512_storenr_pd(c+1*size, c1);
	_mm512_storenr_pd(c+2*size, c2);
	_mm512_storenr_pd(c+3*size, c3);
	_mm512_storenr_pd(c+4*size, c4);
	_mm512_storenr_pd(c+5*size, c5);
	_mm512_storenr_pd(c+6*size, c6);
	_mm512_storenr_pd(c+7*size, c7);
	_mm512_storenr_pd(c+8*size, c8);
	_mm512_storenr_pd(c+9*size, c9);

	_mm512_storenr_pd(c+10*size, c10);
	_mm512_storenr_pd(c+11*size, c11);
	_mm512_storenr_pd(c+12*size, c12);
	_mm512_storenr_pd(c+13*size, c13);
	_mm512_storenr_pd(c+14*size, c14);
	_mm512_storenr_pd(c+15*size, c15);

	_mm512_storenr_pd(c+16*size, c16);
	_mm512_storenr_pd(c+17*size, c17);
	_mm512_storenr_pd(c+18*size, c18);
	_mm512_storenr_pd(c+19*size, c19);
	_mm512_storenr_pd(c+20*size, c20);
	_mm512_storenr_pd(c+21*size, c21);
	_mm512_storenr_pd(c+22*size, c22);
	_mm512_storenr_pd(c+23*size, c23);

	_mm512_storenr_pd(c+24*size, c24);
	_mm512_storenr_pd(c+25*size, c25);
	_mm512_storenr_pd(c+26*size, c26);
	_mm512_storenr_pd(c+27*size, c27);
	_mm512_storenr_pd(c+28*size, c28);
	_mm512_storenr_pd(c+29*size, c29);
}



/**
 * Benchmark driver for the 30x8-tiled matrix multiplication.
 *
 * The matrix size is SIZE when built with -DSIZES, otherwise it is read from
 * argv[1]. The program fills two size-by-size matrices with random integers in
 * [0, 20), times "transpose A, then multiply tile by tile", prints the elapsed
 * microseconds and the Gflop/s rate, and finally checks the result against a
 * plain triple-loop multiplication.
 *
 * @return 0 if the result matches the reference, EXIT_FAILURE on bad usage,
 *         unsupported size, allocation failure, or mismatching results.
 */
int main(int argc, const char ** argv) {
#ifdef SIZES
	(void)argc;
	(void)argv;
	const int size = SIZE;
#else
	// atoi(NULL) is undefined behaviour, so insist on the argument being there.
	if (argc < 2) {
		fprintf(stderr, "usage: %s <size>\n", argc > 0 ? argv[0] : "mmul");
		return EXIT_FAILURE;
	}
	const int size = atoi(argv[1]);
#endif
	// The kernel writes whole 30x8 tiles and assumes size%16==0; any other size
	// would write outside the matrices. lcm(30, 16) == 240.
	if (size <= 0 || size % 240 != 0) {
		fprintf(stderr, "size must be a positive multiple of 240 (got %d)\n", size);
		return EXIT_FAILURE;
	}

	// One allocation holds the five matrices a, aT, b, c, d back to back; the
	// extra ALIGNMENT-1 bytes leave room to round the start up to 64 bytes.
	const size_t elems = (size_t)size * (size_t)size;
	void* p = malloc((sizeof(double)*5*elems) + ALIGNMENT-1);
	if (p == NULL) {
		fprintf(stderr, "out of memory\n");
		return EXIT_FAILURE;
	}
	double *__restrict__ a = (double*)(((size_t)p + ALIGNMENT-1) / ALIGNMENT * ALIGNMENT);
	double *__restrict__ aT = a+elems;   // transpose of a
	double *__restrict__ b = aT+elems;
	double *__restrict__ c = b+elems;    // result of the tiled kernel
	double *__restrict__ d = c+elems;    // reference result
	srand(time(NULL));

	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			a[i*size+j] = (double) (rand()%20);
		}
		for(int j2=0; j2<size; j2++){
			c[i*size+j2] = 0.0;
		}
	}
	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			b[i*size+j] = (double) (rand()%20);
		}
	}


	// Timed region: transpose a, then compute c one 30-row by 8-column tile at a time.
	start_time();
	recTranspose(a, aT, size, size, size, size);
	for(int i = 0; i < size; i+=30) {
		for(int j = 0; j < size; j+=8) {
			eightbythirty(&aT[i], &b[j], &c[i*size+j], size);
		}
	}
	elapsed_time();
	// `microseconds` is declared by elapsed_time(); flop per microsecond * 1e-3 == Gflop/s.
	double gflops = 2.0*size*size*size*1.0e-03/(microseconds);
	printf("Gflops: %f\n", gflops);

	// Reference result with the textbook triple loop.
	for(int i = 0; i < size; i++) {
		for(int j = 0; j < size; j++) {
			double s = 0;
			for(int u = 0; u < size; u++) {
				s += a[i*size+u] * b[u*size+j];
			}
			d[i*size+j] = s;
		}
	}

	// Compare against the reference; stop reporting after 17 mismatches.
	// fabs (not abs) so the difference is never truncated to an integer.
	int error = 0;
	for(int i = 0; i < size && error <= 16; i++) {
		for(int j = 0; j < size; j++) {
			if(fabs(c[i*size+j] - d[i*size+j]) > 1) {
				printf("Error at %d %d , %f instead of %f\n", i, j, c[i*size+j], d[i*size+j]);
				error++;
				if(error > 16) break;
			}
		}
	}

	free(p);

	// Only claim success (and exit with 0) when nothing mismatched.
	if(error != 0) return EXIT_FAILURE;

	printf("OK\n");
	return 0;
}

So, for example, with size 960 (for now it only works with sizes that are multiples of 30*8):

if I compile with compile time given size: icc -mmic -O3 -restrict -std=c++11 -DSIZES -DSIZE=960 mmul.cpp -o mmul.o
Elapsed time: 0.460745s
Gflops: 3.840458
if I compile with runtime given size: icc -mmic -O3 -restrict -std=c++11 mmul.cpp -o mmul.o
Elapsed time: 2.204564s
Gflops: 0.802640

I think it could be a prefetching issue: icc may be unable to recognize the memory access pattern. Looking at the generated asm source, the number of vprefetch instructions is much higher in the "compile time" version.

Funny fact: the check for the correct result of the multiplication (the two for loops at the end of the code, rows 178-197) is much slower in the compile-time version!

Any thoughts? I tried the #pragma loop_count but it seems it's useless, also doing manual intrinsic prefetching doesn't seem to be very effective.

Thanks in advance for any answer.

Regards,
Luca

Performance degradation if loop count is not known at compile time

Trending Articles

Kalank - Malayalam (1CD ) - subtitles

मतलबी दोस्त स्टेट्स | Matlabi Dost Status in Hindi – Selfish Friends Status

Mp3 Download: Mr Raw - Hallelujah Ft. J Martins

[GET] AI Traffic Goldmine

IWAN – Thanks and Praise ( Throw Back Thursday )

NCERT Solutions for Class 9th Sanskrit Chapter 2 अविवेकः परमापदां पदम्

99 God Status for Whatsapp, Facebook

A/L Technology Stream – Subject combinations, Syllabuses and Teacher guides

STR W6553A SMPS Schematic TDA11106 Schematic Pin Voltages and Functions

Download: Stuf G ft B1 & Trice – Puzya Mami (Prod-j Stunner)

MACLEAN, Allan

Black Angus Grilled Artichokes

Practice Sheet of Right form of verbs for HSC Students

Gauhati University TDC 2nd-4th-6th Result 2017 BA B.Com B.Sc

[E² Plugin] HDF-Radio

Brunei reaffirms healthcare commitment

Police confirm man stabbed to death in Selsdon was Andrew David Else of Croydon

Telangana TS New Food Security Card/ Telangana Ration card Application Form...

Notorious Naushad of Ippa gang nabbed

Inthalo ennenni vinthalo ( male ) lyrics and translation | Karthikeya (2014)