Double buffering

Hi, I'm new here and I have a question right from the start.

What I'm trying to do is double buffering on a Xeon Phi. This program seems to run fine if I remove signal/wait, that is, when I'm not doing any asynchronous copying. But if I do use them (as in the program below), it fails with "Segmentation fault". Does anyone know what the problem might be here?
The whole offload report is attached as a txt file.
Compile:
icc -openmp double_offload.c -o test.mic

While I'm at it: can the Xeon Phi copy Host->Device and Device->Host simultaneously? My goal is to implement triple buffering, if that is possible.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>

// Shorthand for the modifiers used in the offload pragmas of this file:
//   ALLOC / REUSE  -> alloc_if(1) / alloc_if(0)
//   FREE  / RETAIN -> free_if(1)  / free_if(0)
// ALIGN is the alignment of memory in bytes; 64 because of the size of a cache line.
	#define ALLOC alloc_if(1)
	#define REUSE alloc_if(0)
	#define FREE  free_if(1)
	#define RETAIN free_if(0)
	#define ALIGN 64
	


/*
 * PHI_OFF - multiply every chunk of `input` element-wise by `coeff`.
 *
 * input      : nChunks * chunk_size floats, chunks stored back to back
 * output     : nChunks * chunk_size floats, receives the products
 * coeff      : chunk_size floats, applied to each chunk
 * nChunks    : number of chunks to process
 * chunk_size : number of elements per chunk
 *
 * For every chunk bl and element i:
 *     output[bl*chunk_size + i] = coeff[i] * input[bl*chunk_size + i]
 *
 * Chunks are distributed over OpenMP threads. The inner index is declared
 * inside the parallel loop body so each thread has its own copy; when it was
 * declared at function scope it was shared between threads, which is a data
 * race that can drive the index past chunk_size and out of bounds.
 *
 * The target(mic) attribute is only emitted when the Intel offload compiler
 * (or a MIC target compile) is in use, so the kernel can also be built and
 * tested as an ordinary host function by compilers that reject the attribute.
 */
#if defined(__INTEL_OFFLOAD) || defined(__MIC__)
__attribute__ (( target (mic)))
#endif
void PHI_OFF(float *input, float *output, float *coeff, int nChunks, int chunk_size) {
	int bl;
	printf("nChunks: %d chunk_size: %d\n",nChunks,chunk_size);
	
	#pragma omp parallel for private(bl) shared(input,output,coeff)
	for(bl=0; bl<nChunks; bl++) {
		/* Offset of this chunk, computed in size_t so that large
		 * nChunks*chunk_size products do not overflow int. */
		size_t base = (size_t)bl * (size_t)chunk_size;
		int i; /* block-local, therefore private to each thread */
		for(i=0;i<chunk_size;i++){
			output[base+i]=coeff[i]*input[base+i];
		}
		/* Author's alternative using array-section notation, left disabled:
		 * output[bl*chunk_size:chunk_size]=coeff[0:chunk_size]*input[bl*chunk_size:chunk_size];
		 */
	}
}


/*
 * Double-buffering driver.
 *
 * Allocates host buffers, fills `input` and `coeff` with pseudo-random
 * values, allocates matching buffers on coprocessor mic:0, starts two
 * transfers of the two halves of `input` (into I1 and I2, each tagged with
 * signal), runs PHI_OFF on each half once its wait tag is satisfied, copies
 * the results from S1/S2 back into the two halves of `output`, and finally
 * releases the coprocessor and host buffers.
 *
 * Returns 0 on success, EXIT_FAILURE if any host allocation fails.
 *
 * NOTE(review): this change only adds the missing allocation checks; the
 * offload pragmas are unchanged. Whether their clause combination is the
 * cause of the reported segmentation fault has not been verified here.
 */
int main()
{
	int nChunks=150000;
	int chunk_size=512;

	int input_size=nChunks*chunk_size;
	int half_input_size=input_size/2;

	int f;

	//allocate memory
	// this works on both device and host
	float  *input;
	float  *output;
	__attribute__((target(mic))) float  *coeff;
	__attribute__((target(mic))) float  *I1;
	__attribute__((target(mic))) float  *I2;
	__attribute__((target(mic))) float  *S1;
	__attribute__((target(mic))) float  *S2;
	
	input = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//input data on host
	output = (float*)_mm_malloc( input_size*sizeof(float) ,ALIGN);//output data on host
	coeff = (float*)_mm_malloc( chunk_size*sizeof(float) ,ALIGN);//coefficients on host and device
	I1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device
	I2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the input data on device
	S1=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device
	S2=(float*)_mm_malloc(half_input_size*sizeof(float) ,ALIGN);//half of the output data on device
	
	// The buffers are large, so an allocation can fail; the loops below
	// write through these pointers unconditionally, so bail out first.
	if(input==NULL || output==NULL || coeff==NULL ||
	   I1==NULL || I2==NULL || S1==NULL || S2==NULL){
		fprintf(stderr,"Host memory allocation failed\n");
		if(input!=NULL) _mm_free(input);
		if(output!=NULL) _mm_free(output);
		if(coeff!=NULL) _mm_free(coeff);
		if(I1!=NULL) _mm_free(I1);
		if(I2!=NULL) _mm_free(I2);
		if(S1!=NULL) _mm_free(S1);
		if(S2!=NULL) _mm_free(S2);
		return EXIT_FAILURE;
	}
	
	//initialize arrays
	srand (time(NULL));
	for(f=0;f<input_size;f++){
		input[f]=(float) (rand() % 10000 + 1.0)/1000.0;
		output[f]=0;
	}
	for(f=0;f<chunk_size;f++){
		coeff[f]=(float) (rand() % 10000 + 1.0)/1000.0;
	}
	
	
    // Allocate memory on the card (no data is copied; buffers are retained)
	#pragma offload target(mic:0) \
		nocopy(I1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(I2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(S1[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(S2[0:half_input_size]:align(ALIGN) RETAIN ALLOC )\
		nocopy(coeff[0:chunk_size]:align(ALIGN) RETAIN ALLOC )
	{}

	
	//Copy coefficients
	#pragma offload_transfer target(mic:0) in(coeff:length(chunk_size) RETAIN REUSE align(ALIGN)) //in(nChunks) in(chunk_size)
	{}

	//-------------> double buffering 
	
	// Start the transfer of the first half of the input into I1, tagged I1
	#pragma offload_transfer target(mic:0) \
		in( input[0:half_input_size]:into(I1[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I1)

	
	// Start the transfer of the second half of the input into I2, tagged I2
	#pragma offload_transfer target(mic:0) \
		in( input[half_input_size:half_input_size]:into(I2[0:half_input_size]) RETAIN REUSE align(ALIGN) ) signal(I2)	
		
	// Process the first half once tag I1 is satisfied; S1 goes back into output[0..half)
	#pragma offload target(mic:0) \
		nocopy(coeff:length(chunk_size) RETAIN REUSE) \
		nocopy(I1:length(half_input_size) RETAIN REUSE) \
		out(S1[0:half_input_size]:into(output[0:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I1)
	{
		PHI_OFF(I1,S1,coeff,nChunks/2,chunk_size);
	}	
	
	// Process the second half once tag I2 is satisfied; S2 goes back into output[half..end)
	#pragma offload target(mic:0) \
		nocopy(coeff:length(chunk_size) RETAIN REUSE) \
		nocopy(I2:length(half_input_size) RETAIN REUSE) \
		out(S2[0:half_input_size]:into(output[half_input_size:half_input_size]) RETAIN REUSE align(ALIGN) ) wait(I2)
	{
		PHI_OFF(I2,S2,coeff,nChunks/2,chunk_size);
	}
	
    // Deallocate memory on the card
	
	#pragma offload target(mic:0) \
		nocopy(I1, I2:length(half_input_size) REUSE FREE)\
		nocopy(S1, S2:length(half_input_size) REUSE FREE)\
		nocopy(coeff:length(chunk_size) REUSE FREE )
	{}

	
	
	
	//free the host system memory
	_mm_free(input);
	_mm_free(output);
	_mm_free(coeff);
	_mm_free(I1);
	_mm_free(I2);
	_mm_free(S1);
	_mm_free(S2);

  return 0;
}

Allegato	Dimensione
Scarica forum_report.txt	9.02 KB

Double buffering

Trending Articles

Bath man appears in court charged with attempted murder of a man...

MACLEAN, Allan

Black Angus Grilled Artichokes

Practice Sheet of Right form of verbs for HSC Students

Police blotter for Jan. 12

99 God Status for Whatsapp, Facebook

Rajasthan Board 12th Science Result 2018 name wise- RBSE 12th commerce result...

Notorious Naushad of Ippa gang nabbed

Child Kidnapping: Amy McNeil was kidnapped on her way to school by 5 adults;...

Sonible Smartlimit v1.1.5-R2R

NCERT Solutions for Class 9th Sanskrit Chapter 3 पाथेयम्

मतलबी दोस्त स्टेट्स | Matlabi Dost Status in Hindi – Selfish Friends Status

Arrow Flash 2 – Sinhala Dubbed – Episode 23 – 20th March 2016

[GET] AI Traffic Goldmine

[E² Plugin] HDF-Radio

Universal Multi-Patch v1.3 By RADIXX11

IWAN – Thanks and Praise ( Throw Back Thursday )

RONALD P SONDERGAARD Arrested by Miami-Dade County Corrections on Mar 03, 2017

मुख मैथुन से उठाएं सेक्स का भरपूर मज़ा, जानें क्या है इसका सही तरीकामुख मैथुन...

HSSC Excise & Taxation Inspector Result 2017 Scorecard/ Category Wise Merit List