//
// GPU_check.cu
// get and test GPU information
// (c) MIT CBA Neil Gershenfeld 7/12/20
//
#include <cstdio>
#include <iostream>
#include <chrono>

using namespace std;

//
// kernel: each thread adds 1.0 to each element of its slice of dbuf0, npts times
//
__global__ void add_array(float* dbuf0,int npts) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int nindex = gridDim.x*blockDim.x;
    int start = (index/float(nindex))*npts;
    int stop = ((index+1)/float(nindex))*npts;
    for (int i = start; i < stop; ++i)
        for (int j = 0; j < npts; ++j)
            dbuf0[i] += 1.0;
}
//
// kernel: as above, but each GPU works on its 1/ngpus share of dbuf0
//
__global__ void add_arrays(float* dbuf0,int npts,int gpu,int ngpus) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int nindex = gridDim.x*blockDim.x;
    int start = (gpu+index/float(nindex))*(npts/float(ngpus));
    int stop = (gpu+(index+1)/float(nindex))*(npts/float(ngpus));
    for (int i = start; i < stop; ++i)
        for (int j = 0; j < npts; ++j)
            dbuf0[i] += 1.0;
}
//
// empty kernel (not launched in this test)
//
__global__ void nop(float* dbuf0,int npts,int gpu,int ngpus) {
    return;
}

int main(int argc, char** argv) {
    int ngpus;
    int npts = 5e6;
    int grid = 1024;
    int block = 1024;
    //
    // check peers
    //
    cudaGetDeviceCount(&ngpus);
    if (ngpus > 1)
        printf("peer access:\n");
    for (int i = 0; i < ngpus; ++i) {
        for (int j = 0; j < ngpus; ++j) {
            int result;
            if (j != i) {
                cudaDeviceCanAccessPeer(&result,i,j);
                printf(" from %d to %d: ",i,j);
                if (result == 1)
                    printf("yes\n");
                else
                    printf("no\n");
            }
        }
    }
    //
    // list GPUs
    //
    if (ngpus > 1)
        printf("GPUs:\n");
    else
        printf("GPU:\n");
    for (int i = 0; i < ngpus; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop,i);
        printf(" number: %d\n",i);
        printf(" name: %s\n",prop.name);
        printf(" global memory: %lu\n",prop.totalGlobalMem);
        printf(" max grid size: %d\n",prop.maxGridSize[0]);
        printf(" max threads per block: %d\n",prop.maxThreadsPerBlock);
        printf(" max threads dimension: %d\n",prop.maxThreadsDim[0]);
        printf(" multiprocessor count: %d\n",prop.multiProcessorCount);
        printf(" max threads per multiprocessor: %d\n",prop.maxThreadsPerMultiProcessor);
    }
    //
    // time CPU to GPU
    //
    printf("copy %d floats from CPU to GPU\n",npts);
    cudaSetDevice(0);
    float *cbuf,*cbufp,*dbuf0;
    cbuf = new float[npts];
    cudaMallocHost(&cbufp,npts*sizeof(float));
    cudaMalloc(&dbuf0,npts*sizeof(float));
    auto t0 = chrono::high_resolution_clock::now();
    cudaMemcpy(dbuf0,cbuf,npts*sizeof(float),cudaMemcpyDefault);
    cudaDeviceSynchronize();
    auto t1 = chrono::high_resolution_clock::now();
    float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
    t0 = chrono::high_resolution_clock::now();
    cudaMemcpy(dbuf0,cbufp,npts*sizeof(float),cudaMemcpyDefault);
    cudaDeviceSynchronize();
    t1 = chrono::high_resolution_clock::now();
    float dt1 = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
    printf(" %f us, %g B/s\n",dt,1e6*npts*sizeof(float)/dt);
    printf(" %f us, %g B/s pinned\n",dt1,1e6*npts*sizeof(float)/dt1);
    //
    // time GPU to GPU
    //
    if (ngpus > 1) {
        float* dbufs[ngpus];
        int result;
        printf("copy %d floats from GPU to GPU 0:\n",npts);
        for (int i = 1; i <= (ngpus-1); ++i) {
            cudaSetDevice(i);
            cudaMalloc(&dbufs[i],npts*sizeof(float));
            cudaDeviceCanAccessPeer(&result,i,0);
            if (result != 0)
                cudaDeviceEnablePeerAccess(0,0);
            auto t0 = chrono::high_resolution_clock::now();
            cudaMemcpy(dbuf0,dbufs[i],npts*sizeof(float),cudaMemcpyDefault);
            cudaDeviceSynchronize();
            auto t1 = chrono::high_resolution_clock::now();
            float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
            printf(" GPU %d: %f us, %g B/s\n",i,dt,1e6*npts*sizeof(float)/dt);
        }
    }
    //
    // time Flops
    //
    printf("add %dx%d floats:\n",npts,npts);
    cudaSetDevice(0);
    t0 = chrono::high_resolution_clock::now();
    add_array<<<grid,block>>>(dbuf0,npts);
    cudaDeviceSynchronize();
    t1 = chrono::high_resolution_clock::now();
    dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
    printf(" %f s, %f G/s\n",dt/1e6,npts*(npts/dt)/1000.0);
    //
    // time peer Flops
    //
    if (ngpus > 1) {
        int result;
        cudaMemset(dbuf0,0,npts*sizeof(float));
        printf("peer add %dx%d GPU 0 floats:\n",npts,npts);
        for (int i = 1; i < ngpus; ++i) {
            cudaDeviceCanAccessPeer(&result,i,0);
            if (result != 0) {
                cudaSetDevice(i);
                t0 = chrono::high_resolution_clock::now();
                add_array<<<grid,block>>>(dbuf0,npts);
                cudaDeviceSynchronize();
                t1 = chrono::high_resolution_clock::now();
                dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
                printf(" GPU %d: %f s, %f G/s\n",i,dt/1e6,npts*(npts/dt)/1000.0);
            }
        }
    }
    //
    // time parallel peer Flops
    //
    if (ngpus > 1) {
        int result;
        int count = 1;
        cudaMemset(dbuf0,0,npts*sizeof(float));
        printf("parallel peer add %dx%d GPU 0 floats:\n",npts,npts);
        t0 = chrono::high_resolution_clock::now();
        cudaSetDevice(0);
        add_arrays<<<grid,block>>>(dbuf0,npts,0,ngpus);
        for (int i = 1; i < ngpus; ++i) {
            cudaDeviceCanAccessPeer(&result,i,0);
            if (result != 0) {
                ++count;
                cudaSetDevice(i);
                add_arrays<<<grid,block>>>(dbuf0,npts,i,ngpus);
            }
        }
        cudaSetDevice(0);
        cudaDeviceSynchronize();
        for (int i = 1; i < ngpus; ++i) {
            cudaDeviceCanAccessPeer(&result,i,0);
            if (result != 0) {
                cudaSetDevice(i);
                cudaDeviceSynchronize();
            }
        }
        t1 = chrono::high_resolution_clock::now();
        dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
        printf(" %d GPUs: %f s, %f G/s\n",count,dt/1e6,(count*npts/float(ngpus))*(npts/dt)/1000.0);
    }
    //
    // check for errors
    //
    cudaError err;
    err = cudaGetLastError();
    if (cudaSuccess != err)
        printf("error: %s\n",cudaGetErrorString(err));
    //
    // return
    //
    return 0;
}
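
//
// example build/run (a sketch, not from the original file; the appropriate
// nvcc architecture flag, e.g. -arch=..., depends on the target GPU):
//    nvcc -O3 GPU_check.cu -o GPU_check
//    ./GPU_check
//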