c++ - Efficient way to copy strided data (to and from a CUDA Device)? -

September 15, 2013

Is there a way to copy data that is strided by a constant (or even non-constant) value to and from the CUDA device efficiently?

I want to diagonalize a large symmetric matrix.

Using the Jacobi algorithm, there are a bunch of operations that use two rows and two columns within each iteration.

Since the matrix itself is too big to be copied to the device entirely, I am looking for a way to copy just the two rows and two columns to the device.

It would be nice to use the triangular matrix form to store the data, but then additional downsides like

a non-constant row length [not that much of a problem] and a non-constant stride of the column values [the stride increases by 1 with each row]

arise. [Edit: even using the triangular form, it is still impossible to store the whole matrix on the GPU.]

I looked at some timings and recognized that copying strided values one by one is very slow (synchronous as well as asynchronous).

// Edit: removed the solution from the question and added it as an answer.

Thanks to Robert Crovella for giving the right hint to use cudaMemcpy2D. I'll append my test code to give everyone the possibility to comprehend it...

If someone comes up with suggestions for solving the copy problem using row-major-ordered triangular matrices, please feel free to write another answer.

// Test program: copies one row and one (strided) column of a row-major host
// matrix to the device, overwrites them there, and copies them back into a
// second host matrix so the round trip can be checked by eye.
//
// Build with nvcc (which pulls in the CUDA runtime declarations implicitly).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Abort with a readable message if a CUDA runtime call does not succeed.
#define CUDA_CHECK(call)                                                 \
  do {                                                                   \
    cudaError_t cuda_err_ = (call);                                      \
    if (cuda_err_ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(cuda_err_));                            \
      exit(EXIT_FAILURE);                                                \
    }                                                                    \
  } while (0)

// Writes `value` into arr[blockIdx.x]; launch with one block per element.
__global__ void setvalues(double *arr, double value)
{
  arr[blockIdx.x] = value;
}

int main(void)
{
  // define consts: r rows, c columns, rc elements in total
  static size_t const r = 10, c = 10, rc = r * c;

  // create matrices and initialize
  double *matrix = (double *)malloc(rc * sizeof(double));
  double *final_matrix = (double *)malloc(rc * sizeof(double));
  // host-side staging buffers used to verify what arrived on the device
  double *h_row = (double *)malloc(c * sizeof(double));
  double *h_col = (double *)malloc(r * sizeof(double));
  if (!matrix || !final_matrix || !h_row || !h_col) {
    fprintf(stderr, "host allocation failed\n");
    free(matrix);
    free(final_matrix);
    free(h_row);
    free(h_col);
    return EXIT_FAILURE;
  }

  for (size_t i = 0; i < rc; ++i) {
    matrix[i] = rand() % r + 10;
  }
  memcpy(final_matrix, matrix, rc * sizeof(double));

  // create vectors on the device
  double *dev_col = NULL;
  double *dev_row = NULL;
  CUDA_CHECK(cudaMalloc((void **)&dev_row, c * sizeof(double)));
  CUDA_CHECK(cudaMalloc((void **)&dev_col, r * sizeof(double)));

  // choose the row / column to copy
  size_t selected_row = 7, selected_col = 3;

  // The matrix is in row-major order, so a row is contiguous and can be
  // copied in one piece.
  CUDA_CHECK(cudaMemcpy(dev_row, &matrix[selected_row * c],
                        c * sizeof(double), cudaMemcpyHostToDevice));

  // A column is strided, so it is copied with cudaMemcpy2D as r "rows" of
  // one double each: the source pitch is the host row size (c doubles), the
  // destination pitch is a single double, which packs the column densely.
  CUDA_CHECK(cudaMemcpy2D(dev_col, sizeof(double), &matrix[selected_col],
                          c * sizeof(double), sizeof(double), r,
                          cudaMemcpyHostToDevice));

  // copy back to the host to check whether we got the right column and row
  CUDA_CHECK(cudaMemcpy(h_row, dev_row, c * sizeof(double),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_col, dev_col, r * sizeof(double),
                        cudaMemcpyDeviceToHost));

  // change the values on the device to evaluate the back copy
  setvalues<<<(unsigned int)r, 1>>>(dev_col, 88.0); // column should be 88
  CUDA_CHECK(cudaGetLastError());
  setvalues<<<(unsigned int)c, 1>>>(dev_row, 99.0); // row should be 99
  CUDA_CHECK(cudaGetLastError());

  // back copy: the row is written first, then the column, so the element
  // where they cross ends up holding the column value
  CUDA_CHECK(cudaMemcpy(&final_matrix[selected_row * c], dev_row,
                        c * sizeof(double), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy2D(&final_matrix[selected_col], c * sizeof(double),
                          dev_col, sizeof(double), sizeof(double), r,
                          cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaDeviceSynchronize());

  // output for checking functionality
  printf("initial matrix:\n");
  for (size_t i = 0; i < r; ++i) {
    for (size_t j = 0; j < c; ++j) {
      printf(" %lf", matrix[i * c + j]);
    }
    printf("\n");
  }

  // %zu is the matching conversion for size_t (the snippet used %u)
  printf("\nrow %zu values: ", selected_row);
  for (size_t i = 0; i < c; ++i) {
    printf(" %lf", h_row[i]);
  }
  printf("\ncol %zu values: ", selected_col);
  for (size_t i = 0; i < r; ++i) {
    printf(" %lf", h_col[i]);
  }
  printf("\n\n");

  printf("final matrix:\n");
  for (size_t i = 0; i < r; ++i) {
    for (size_t j = 0; j < c; ++j) {
      printf(" %lf", final_matrix[i * c + j]);
    }
    printf("\n");
  }

  CUDA_CHECK(cudaFree(dev_col));
  CUDA_CHECK(cudaFree(dev_row));
  free(matrix);
  free(final_matrix);
  free(h_row);
  free(h_col);
  CUDA_CHECK(cudaDeviceReset());

  return 0;
}

c++ c matrix cuda memcpy

Search This Blog

Pages Vivanta

c++ - Efficient way to copy strided data (to and from a CUDA Device)? -

Comments

Post a Comment

Popular posts from this blog

web services - java.lang.NoClassDefFoundError: Could not initialize class net.sf.cglib.proxy.Enhancer -

Accessing MATLAB's unicode strings from C -

javascript - mongodb won't find my schema method in nested container -