Skip to content
Snippets Groups Projects
GPU_check.cu 6.34 KiB
Newer Older
  • Learn to ignore specific revisions
  • //
    // GPU_check.cu
    //    get and test GPU information
    //    (c) MIT CBA Neil Gershenfeld 7/12/20
    //
    #include <iostream>
    #include <chrono>
    using namespace std;
    __global__ void add_array(float* dbuf0,int npts) {
       int index = blockIdx.x*blockDim.x+threadIdx.x;
       int nindex = gridDim.x*blockDim.x;
       int start = (index/float(nindex))*npts;
       int stop = ((index+1)/float(nindex))*npts;
       for (int i = start; i < stop; ++i)
          for (int j = 0; j < npts; ++j)
             dbuf0[i] += 1.0;
       }
    __global__ void add_arrays(float* dbuf0,int npts,int gpu,int ngpus) {
       int index = blockIdx.x*blockDim.x+threadIdx.x;
       int nindex = gridDim.x*blockDim.x;
       int start = (gpu+index/float(nindex))*(npts/float(ngpus));
       int stop = (gpu+(index+1)/float(nindex))*(npts/float(ngpus));
       for (int i = start; i < stop; ++i)
          for (int j = 0; j < npts; ++j)
             dbuf0[i] += 1.0;
       }
    __global__ void nop(float* dbuf0,int npts,int gpu,int ngpus) {
       return;
       }
    int main(int argc, char** argv) {
       int ngpus;
       int npts = 5e6;
       int grid = 1024;
       int block = 1024;
       //
       // check peers
       //
       cudaGetDeviceCount(&ngpus);
       if (ngpus > 1)
          printf("peer access:\n");
       for (int i = 0; i < ngpus; ++i) {
          for (int j = 0; j < ngpus; ++j) {
             int result;
             if (j != i) {
                cudaDeviceCanAccessPeer(&result,i,j);
                printf("   from %d to %d: ",i,j);
                if (result == 1)
                   printf("yes\n");
                else
                   printf("no\n");
                }
             }
          }
       //
       // list GPUs
       //
       if (ngpus > 1)
          printf("GPUs:\n");
       else
          printf("GPU:\n");
       for (int i = 0; i < ngpus; ++i) {
          cudaDeviceProp prop;
          cudaGetDeviceProperties(&prop,i);
          printf("   number: %d\n",i);
          printf("      name: %s\n",prop.name);
          printf("      global memory: %lu\n",prop.totalGlobalMem);
          printf("      max grid size: %d\n",prop.maxGridSize[0]);
          printf("      max threads per block: %d\n",prop.maxThreadsPerBlock);
          printf("      max threads dimension: %d\n",prop.maxThreadsDim[0]);
          printf("      multiprocessor count: %d\n",prop.multiProcessorCount);
          printf("      max threads per multiprocessor: %d\n",prop.maxThreadsPerMultiProcessor);
          }
       //
       // time CPU to GPU
       //
       printf("copy %d floats from CPU to GPU\n",npts);
       cudaSetDevice(0);
       float *cbuf,*cbufp,*dbuf0;
       cbuf = new float[npts];
       cudaMallocHost(&cbufp,npts*sizeof(float));
       cudaMalloc(&dbuf0,npts*sizeof(float));
       auto t0 = chrono::high_resolution_clock::now();        
       cudaMemcpy(dbuf0,cbuf,npts*sizeof(float),cudaMemcpyDefault);
       cudaDeviceSynchronize();
       auto t1 = chrono::high_resolution_clock::now();        
       float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
       t0 = chrono::high_resolution_clock::now();        
       cudaMemcpy(dbuf0,cbufp,npts*sizeof(float),cudaMemcpyDefault);
       cudaDeviceSynchronize();
       t1 = chrono::high_resolution_clock::now();        
       float dt1 = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
       printf("   %f us, %g B/s\n",dt,1e6*npts*sizeof(float)/dt);
       printf("   %f us, %g B/s pinned\n",dt1,1e6*npts*sizeof(float)/dt1);
       //
       // time GPU to GPU
       //
       if (ngpus > 1) {
          float* dbufs[ngpus];
          int result;
          printf("copy %d floats from GPU to GPU 0:\n",npts);
          for (int i = 1; i <= (ngpus-1); ++i) {
             cudaSetDevice(i);
             cudaMalloc(&dbufs[i],npts*sizeof(float));
             cudaDeviceCanAccessPeer(&result,i,0);
             if (result != 0)
                cudaDeviceEnablePeerAccess(0,0);
             auto t0 = chrono::high_resolution_clock::now();        
             cudaMemcpy(dbuf0,dbufs[i],npts*sizeof(float),cudaMemcpyDefault);
             cudaDeviceSynchronize();
             auto t1 = chrono::high_resolution_clock::now();        
             float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
             printf("   GPU %d: %f us, %g B/s\n",i,dt,1e6*npts*sizeof(float)/dt);
             }
          }
       //
       // time Flops
       //
       printf("add %dx%d floats:\n",npts,npts);
       cudaSetDevice(0);
       t0 = chrono::high_resolution_clock::now();        
       add_array<<<grid,block>>>(dbuf0,npts);
       cudaDeviceSynchronize();
       t1 = chrono::high_resolution_clock::now();        
       dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
       printf("   %f s, %f G/s\n",dt/1e6,npts*(npts/dt)/1000.0);
       //
       // time peer Flops
       //
       if (ngpus > 1) {
          int result;
          cudaMemset(dbuf0,0,npts*sizeof(float));
          printf("peer add %dx%d GPU 0 floats:\n",npts,npts);
          for (int i = 1; i < ngpus; ++i) {
             cudaDeviceCanAccessPeer(&result,i,0);
             if (result != 0) {
                cudaSetDevice(i);
                t0 = chrono::high_resolution_clock::now();        
                add_array<<<grid,block>>>(dbuf0,npts);
                cudaDeviceSynchronize();
                t1 = chrono::high_resolution_clock::now();        
                dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
                printf("   GPU %d: %f s, %f G/s\n",i,dt/1e6,npts*(npts/dt)/1000.0);
                }
             }
          }
       //
       // time parallel peer Flops
       //
       if (ngpus > 1) {
          int result;
          int count = 1;
          cudaMemset(dbuf0,0,npts*sizeof(float));
          printf("parallel peer add %dx%d GPU 0 floats:\n",npts,npts);
          t0 = chrono::high_resolution_clock::now();        
          cudaSetDevice(0);
          add_arrays<<<grid,block>>>(dbuf0,npts,0,ngpus);
          for (int i = 1; i < ngpus; ++i) {
             cudaDeviceCanAccessPeer(&result,i,0);
             if (result != 0) {
                ++count;
                cudaSetDevice(i);
                add_arrays<<<grid,block>>>(dbuf0,npts,i,ngpus);
                }
             }
          cudaSetDevice(0);
          cudaDeviceSynchronize();
          for (int i = 1; i < ngpus; ++i) {
             cudaDeviceCanAccessPeer(&result,i,0);
             if (result != 0) {
                cudaSetDevice(i);
                cudaDeviceSynchronize();
                }
             }
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          printf("   %d GPUS: %f s, %f G/s\n",count,dt/1e6,(count*npts/float(ngpus))*(npts/dt)/1000.0);
          }
       //
       // check for errors
       //
       cudaError err;
       err = cudaGetLastError();
       if (cudaSuccess != err)
          printf("error: %s\n",cudaGetErrorString(err));
       //
       // return
       //
       return 0;
       }