Hi, I'm new here and I have a question right from the start.
What I'm trying to do is double buffering on the Xeon Phi. The program seems to run fine if I remove signal/wait, that is, when I'm not doing any asynchronous copying. But if I do use them (as in the program below), it fails with "Segmentation fault". Does anyone know what the problem might be?
The whole offload report is attached as a txt file.
Compile:
icc -openmp double_offload.c -o test.mic
While I'm at it: can the Xeon Phi copy Host->Device and Device->Host simultaneously? My goal is to implement triple buffering, if that is possible.
#include <stdio.h> #include <stdlib.h> #include <omp.h> #include <time.h> // aligment of memory; 64 because of size of cacheline #define ALLOC alloc_if(1) #define REUSE alloc_if(0) #define FREE free_if(1) #define RETAIN free_if(0) #define ALIGN 64 __attribute__ (( target (mic))) void PHI_OFF(float *input, float *output, float *coeff, int nChunks, int chunk_size) { int bl,i; printf("nChunks: %d chunk_size: %d\n",nChunks,chunk_size); #pragma omp parallel for private(bl) shared(input,output,coeff) for(bl=0; bl<nChunks; bl++) { for(i=0;i<chunk_size;i++){ output[bl*chunk_size+i]=coeff[i]*input[bl*chunk_size+i]; //output[bl*chunk_size:chunk_size]=coeff[0:chunk_size]*input[bl*chunk_size:chunk_size]; } } } int main() { int nChunks=150000; int chunk_size=512; int input_size=nChunks*chunk_size; int half_input_size=input_size/2; int f; //allocate memmory // this works on both device and host float *input; float *output; __attribute__((target(mic))) float *coeff; __attribute__((target(mic))) float *I1; __attribute__((target(mic))) float *I2; __attribute__((target(mic))) float *S1; __attribute__((target(mic))) float *S2; input = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//input data on host output = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//output data on host coeff = (float*)_mm_malloc( chunk_size*sizeof(float) ,ALIGN);//coefficients on host and device I1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device I2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device S1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device S2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device //initialize arrays srand (time(NULL)); for(f=0;f<input_size;f++){ input[f]=(float) (rand() % 10000 + 1.0)/1000.0; output[f]=0; } for(f=0;f<chunk_size;f++){ coeff[f]=(float) (rand() % 10000 + 1.0)/1000.0; } // Allocate memory on 
the card #pragma offload target(mic:0) \ nocopy(I1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\ nocopy(I2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\ nocopy(S1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\ nocopy(S2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\ nocopy(coeff[0:chunk_size]:align(ALIGN) RETAIN ALLOC ) {} //Copy coefficients #pragma offload_transfer target(mic:0) in(coeff:length(chunk_size) RETAIN REUSE align(ALIGN)) //in(nChunks) in(chunk_size) {} //-------------> double buffering #pragma offload_transfer target(mic:0) \ in( input[0:half_input_size]:into(I1[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I1) #pragma offload_transfer target(mic:0) \ in( input[half_input_size:half_input_size]:into(I2[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I2) #pragma offload target(mic:0) \ nocopy(coeff:length(chunk_size) RETAIN REUSE) \ nocopy(I1:length(half_input_size) RETAIN REUSE) \ out(S1[0:half_input_size]:into(output[0:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I1) { PHI_OFF(I1,S1,coeff,nChunks/2,chunk_size); } #pragma offload target(mic:0) \ nocopy(coeff:length(chunk_size) RETAIN REUSE) \ nocopy(I2:length(half_input_size) RETAIN REUSE) \ out(S2[0:half_input_size]:into(output[half_input_size:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I2) { PHI_OFF(I2,S2,coeff,nChunks/2,chunk_size); } // Deallocate memory on the card #pragma offload target(mic:0) \ nocopy(I1, I2:length(half_input_size) REUSE FREE)\ nocopy(S1, S2:length(half_input_size) REUSE FREE)\ nocopy(coeff:length(chunk_size) REUSE FREE ) {} //free the host system memory _mm_free(input); _mm_free(output); _mm_free(coeff); _mm_free(I1); _mm_free(I2); _mm_free(S1); _mm_free(S2); return 0; }