Hello,
I wrote a simple application for the CPU, and I am using offload pragmas for the pieces I want to run on the coprocessors.
Since I am compiling on the CPU and using offloads, I set:
<code>export MIC_ENV_PREFIX=MIC export MIC_OMP_NUM_THREADS=120</code>
in order to specify the number of threads.
My problems:
1) Running the code always shows 40 threads being used.
2) Running the code again and again without recompiling, I get different timing results.
In order to compile:
<code>icc -std=c99 -DOFFLOAD -openmp -qopt-report -O3 xeon.c -o xeon</code>
<code>#include <stdio.h> #include <stdlib.h> #include <string.h> #include <omp.h> #include <sys/time.h> #include <cilk/cilk.h> #include <cilk/reducer_opadd.h> typedef CILK_C_DECLARE_REDUCER(float) reducer; double dtime() { double tseconds = 0.0; struct timeval mytime; gettimeofday(&mytime,(struct timezone*)0); tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6); return( tseconds * 1000 ); } float openMPIntegration( int N, float * const ioA ) { float res = 0; #if DOFFLOAD #pragma offload target (mic) { #endif #pragma omp parallel for reduction(+:res) for ( int i = 0; i < N; i++ ) { res += ioA[ i ]; } #if DOFFLOAD } #endif return res; } float CilkIntegration( int N , float * const ioA ) { float res = 0; #if DOFFLOAD #pragma offload target (mic) { #endif CILK_C_REDUCER_OPADD( sum, float , 0); CILK_C_REGISTER_REDUCER(sum); cilk_for ( int i = 0; i < N; i++ ) { REDUCER_VIEW(sum) += ioA[ i ]; } res = sum.value; CILK_C_UNREGISTER_REDUCER(sum); #if DOFFLOAD } #endif return res; } int main() { int NbOfThreads; double tstart, tstop, ttime; int N = 1000000; float * A = (float*) _mm_malloc( N * sizeof(*A) , 32 ); //fill A for ( int i = 0; i < N; i++ ) A[ i ] = i; #if DOFFLOAD #pragma offload target (mic) #endif #pragma omp parallel #pragma omp master NbOfThreads = omp_get_num_threads(); printf("\nUsing %d threads\r\n",NbOfThreads); tstart = dtime(); float openMPRes = openMPIntegration( N , A ); tstop = dtime(); ttime = tstop - tstart; printf("\nopenMP integration = %10.3lf msecs \t value = %10.3f", ttime ,openMPRes); tstart = dtime(); float CilkRes = CilkIntegration( N , A ); tstop = dtime(); ttime = tstop - tstart; printf("\nCilk integration = %10.3lf msecs \t value = %10.3f", ttime,CilkRes); printf("\n"); _mm_free( A ); return 0; }</code>
(I have also posted this at https://stackoverflow.com/questions/29346580/thread-numbers-and-time-results-consistency )
Thanks!