Quantcast
Channel: Intel® Many Integrated Core Architecture
Viewing all articles
Browse latest Browse all 1347

Double buffering

$
0
0

Hi, I'm new here and I have a question right from the start.

What I'm trying to do is double buffering on the Xeon Phi. The program seems to run fine if I remove signal/wait, that is, when I'm not doing any asynchronous copying. But when I do use them (as in the program below), it fails with "Segmentation fault". Does anyone know what the problem might be?
The whole offload report is in the attached txt file.
Compile:
icc -openmp double_offload.c -o test.mic

While I'm at it: can the Xeon Phi copy Host->Device and Device->Host simultaneously? My goal is to implement triple buffering, if that is possible.

 

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>

// Shorthand for the modifiers used in the offload pragmas in this file.
// ALLOC: expands to alloc_if(1); used below where memory is allocated on the card.
	#define ALLOC alloc_if(1)
// REUSE: expands to alloc_if(0); used below on buffers that were already allocated on the card.
	#define REUSE alloc_if(0)
// FREE: expands to free_if(1); used below where memory on the card is deallocated.
	#define FREE  free_if(1)
// RETAIN: expands to free_if(0); used below on buffers that must stay allocated on the card.
	#define RETAIN free_if(0)
// ALIGN: alignment of memory in bytes; 64 because of the size of a cache line.
	#define ALIGN 64
	


/*
 * Element-wise scaling kernel, compiled for the coprocessor via target(mic).
 *
 * input and output are treated as nChunks consecutive chunks of chunk_size
 * floats each. For every chunk bl and every position i inside the chunk:
 *
 *     output[bl*chunk_size + i] = coeff[i] * input[bl*chunk_size + i]
 *
 * Parameters:
 *   input      - source data, at least nChunks*chunk_size floats
 *   output     - destination, at least nChunks*chunk_size floats
 *   coeff      - per-position coefficients, at least chunk_size floats
 *   nChunks    - number of chunks to process
 *   chunk_size - number of floats per chunk
 *
 * The chunks are distributed over threads by the OpenMP parallel for.
 * Also prints nChunks and chunk_size to stdout once per call.
 */
__attribute__ (( target (mic))) void PHI_OFF(float *input, float *output, float *coeff, int nChunks, int chunk_size) {
	int bl;
	printf("nChunks: %d chunk_size: %d\n",nChunks,chunk_size);
	
	#pragma omp parallel for private(bl) shared(input,output,coeff)
	for(bl=0; bl<nChunks; bl++) {
		/* The inner index is declared inside the parallel loop body so that
		 * every thread has its own copy. Declared outside and not listed in
		 * private(), it would be shared and the threads would race on it. */
		int i;
		/* Start of this chunk, computed once and in size_t so that the
		 * product cannot overflow int for large buffers. */
		size_t base = (size_t)bl * (size_t)chunk_size;

		for(i=0;i<chunk_size;i++){
			output[base+i]=coeff[i]*input[base+i];
		}
	}
}


/*
 * Double-buffering driver.
 *
 * Builds a random input array of nChunks*chunk_size floats and a random
 * coefficient array of chunk_size floats on the host, then processes the
 * input in two halves on coprocessor mic:0:
 *
 *   1. allocate the buffers I1, I2 (input halves), S1, S2 (output halves)
 *      and coeff on the card;
 *   2. copy coeff to the card;
 *   3. start one offload_transfer per input half, tagged signal(I1) and
 *      signal(I2);
 *   4. run PHI_OFF on each half in an offload that carries the matching
 *      wait(I1) / wait(I2), copying the result half back into output;
 *   5. free the card buffers and then the host buffers.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 *
 * NOTE(review): the offload pragmas are kept exactly as in the original
 * post. This revision only adds host-side allocation checks; it does not
 * claim to resolve the segmentation fault reported with signal/wait.
 */
int main(void)
{
	int nChunks=150000;
	int chunk_size=512;

	int input_size=nChunks*chunk_size;
	int half_input_size=input_size/2;

	int f;

	//allocate memory
	// this works on both device and host
	float  *input;
	float  *output;
	__attribute__((target(mic))) float  *coeff;
	__attribute__((target(mic))) float  *I1;
	__attribute__((target(mic))) float  *I2;
	__attribute__((target(mic))) float  *S1;
	__attribute__((target(mic))) float  *S2;
	
	input = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//input data on host
	output = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//output data on host
	coeff = (float*)_mm_malloc( chunk_size*sizeof(float) ,ALIGN);//coefficients on host and device
	I1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device
	I2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device
	S1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device
	S2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device

	// The host allocations add up to roughly 1.2 GB, so any of them can fail.
	// Stop here rather than writing through a NULL pointer in the loops below.
	if (!input || !output || !coeff || !I1 || !I2 || !S1 || !S2) {
		float *host_bufs[] = { input, output, coeff, I1, I2, S1, S2 };
		size_t k;

		fprintf(stderr, "Host memory allocation failed\n");
		for (k = 0; k < sizeof host_bufs / sizeof host_bufs[0]; k++) {
			// Only hand back the buffers that were actually obtained.
			if (host_bufs[k]) {
				_mm_free(host_bufs[k]);
			}
		}
		return EXIT_FAILURE;
	}
	
	
	//initialize arrays with pseudo-random values in (0, 10]
	srand ((unsigned)time(NULL));
	for(f=0;f<input_size;f++){
		input[f]=(float) (rand() % 10000 + 1.0)/1000.0;
		output[f]=0;
	}
	for(f=0;f<chunk_size;f++){
		coeff[f]=(float) (rand() % 10000 + 1.0)/1000.0;
	}
	
	
    // Allocate memory on the card (no data is copied; buffers are retained)
	#pragma offload target(mic:0) \
		nocopy(I1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(I2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(S1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(S2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(coeff[0:chunk_size]:align(ALIGN) RETAIN ALLOC )
	{}

	
	//Copy coefficients into the already allocated card buffer
	#pragma offload_transfer target(mic:0) in(coeff:length(chunk_size) RETAIN REUSE align(ALIGN)) //in(nChunks) in(chunk_size)
	{}

	//-------------> double buffering 
	
	// Start the transfer of the first input half into I1; tagged with signal(I1)
	#pragma offload_transfer target(mic:0) \
		in( input[0:half_input_size]:into(I1[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I1)

	
	// Start the transfer of the second input half into I2; tagged with signal(I2)
	#pragma offload_transfer target(mic:0) \
		in( input[half_input_size:half_input_size]:into(I2[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I2)	
		
	// Process the first half once wait(I1) is satisfied; S1 is copied back into
	// the first half of output
	#pragma offload target(mic:0) \
		nocopy(coeff:length(chunk_size) RETAIN REUSE) \
		nocopy(I1:length(half_input_size) RETAIN REUSE) \
		out(S1[0:half_input_size]:into(output[0:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I1)
	{
		PHI_OFF(I1,S1,coeff,nChunks/2,chunk_size);
	}	
	
	// Process the second half once wait(I2) is satisfied; S2 is copied back into
	// the second half of output
	#pragma offload target(mic:0) \
		nocopy(coeff:length(chunk_size) RETAIN REUSE) \
		nocopy(I2:length(half_input_size) RETAIN REUSE) \
		out(S2[0:half_input_size]:into(output[half_input_size:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I2)
	{
		PHI_OFF(I2,S2,coeff,nChunks/2,chunk_size);
	}
	
    // Deallocate memory on the card
	
	#pragma offload target(mic:0) \
		nocopy(I1, I2:length(half_input_size) REUSE FREE)\
		nocopy(S1, S2:length(half_input_size) REUSE FREE)\
		nocopy(coeff:length(chunk_size) REUSE FREE )
	{}

	
	
	
	//free the host system memory
	_mm_free(input);
	_mm_free(output);
	_mm_free(coeff);
	_mm_free(I1);
	_mm_free(I2);
	_mm_free(S1);
	_mm_free(S2);

  return 0;
}

 

Allegato	Dimensione
Scarica forum_report.txt	9.02 KB

Viewing all articles
Browse latest Browse all 1347

Trending Articles