I conducted an experiment to test the data transfer speed.
#pragma offload target(mic) out(outImage:length(width*height) alloc_if(1) free_if(1)) in(img1,img2:length(width*height) alloc_if(1) free_if(1)) { //int a = 0; //omp_set_num_threads(192); //const size_t iCPUNum = omp_get_max_threads(); //printf("Get number = %d\n", iCPUNum); //fflush(0); const unsigned int ySegment = height/iCPUNum; #pragma omp parallel for for (unsigned int n = 0; n < iCPUNum; n++) { const unsigned int starty = n * width; unsigned int endy = starty + ySegment; if(n = (iCPUNum -1)) endy = height; unsigned char tmpArray1[width]; unsigned char tmpArray2[width]; unsigned char tmpArrayout[width]; for (size_t y = starty; y < endy; y++) { memcpy(tmpArray1, &img1[y*width], width*sizeof(char)); memcpy(tmpArray2, &img2[y*width], width*sizeof(char)); for (unsigned int nn = 0; nn < LOOPNUM; nn++) { for (unsigned int x = 0; x < width; x++) { tmpArrayout[ x] = tmpArray1[x]*0.5f + tmpArray2[x]*0.5f; } } memcpy( &outImage[y*width], tmpArrayout,width*sizeof(char)); } }//end of n<iCPUNum }//end of pragma
[Offload] [MIC 0] [File] imageAdd.cpp
[Offload] [MIC 0] [Line] 144
[Offload] [MIC 0] [Tag] Tag 2
[Offload] [HOST] [Tag 2] [CPU Time] 0.946899(seconds)
[Offload] [MIC 0] [Tag 2] [CPU->MIC Data] 18874384 (bytes)
[Offload] [MIC 0] [Tag 2] [MIC Time] 0.856914(seconds)
[Offload] [MIC 0] [Tag 2] [MIC->CPU Data] 9437200 (bytes)
The transfer time is t = 0.946899 - 0.856914 = 0.089985 seconds, so the data transfer speed is ~300 MB per second. That seems far lower than the value mentioned in some documents. Could you tell me why?
Thanks a lot!
Xin