//
// GPU_check.cu
//    get and test GPU information
//    (c) MIT CBA Neil Gershenfeld 7/12/20
//
#include <iostream>
#include <chrono>
using namespace std;
__global__ void add_array(float* dbuf0,int npts) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*npts;
   int stop = ((index+1)/float(nindex))*npts;
   for (int i = start; i < stop; ++i)
      for (int j = 0; j < npts; ++j)
         dbuf0[i] += 1.0;
   }
__global__ void add_arrays(float* dbuf0,int npts,int gpu,int ngpus) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (gpu+index/float(nindex))*(npts/float(ngpus));
   int stop = (gpu+(index+1)/float(nindex))*(npts/float(ngpus));
   for (int i = start; i < stop; ++i)
      for (int j = 0; j < npts; ++j)
         dbuf0[i] += 1.0;
   }
__global__ void nop(float* dbuf0,int npts,int gpu,int ngpus) {
   return;
   }
int main(int argc, char** argv) {
   int ngpus;
   int npts = 5e6;
   int grid = 1024;
   int block = 1024;
   //
   // check peers
   //
   cudaGetDeviceCount(&ngpus);
   if (ngpus > 1)
      printf("peer access:\n");
   for (int i = 0; i < ngpus; ++i) {
      for (int j = 0; j < ngpus; ++j) {
         int result;
         if (j != i) {
            cudaDeviceCanAccessPeer(&result,i,j);
            printf("   from %d to %d: ",i,j);
            if (result == 1)
               printf("yes\n");
            else
               printf("no\n");
            }
         }
      }
   //
   // list GPUs
   //
   if (ngpus > 1)
      printf("GPUs:\n");
   else
      printf("GPU:\n");
   for (int i = 0; i < ngpus; ++i) {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop,i);
      printf("   number: %d\n",i);
      printf("      name: %s\n",prop.name);
      printf("      global memory: %lu\n",prop.totalGlobalMem);
      printf("      max grid size: %d\n",prop.maxGridSize[0]);
      printf("      max threads per block: %d\n",prop.maxThreadsPerBlock);
      printf("      max threads dimension: %d\n",prop.maxThreadsDim[0]);
      printf("      multiprocessor count: %d\n",prop.multiProcessorCount);
      printf("      max threads per multiprocessor: %d\n",prop.maxThreadsPerMultiProcessor);
      }
   //
   // time CPU to GPU
   //
   printf("copy %d floats from CPU to GPU\n",npts);
   cudaSetDevice(0);
   float *cbuf,*cbufp,*dbuf0;
   cbuf = new float[npts];
   cudaMallocHost(&cbufp,npts*sizeof(float));
   cudaMalloc(&dbuf0,npts*sizeof(float));
   auto t0 = chrono::high_resolution_clock::now();        
   cudaMemcpy(dbuf0,cbuf,npts*sizeof(float),cudaMemcpyDefault);
   cudaDeviceSynchronize();
   auto t1 = chrono::high_resolution_clock::now();        
   float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
   t0 = chrono::high_resolution_clock::now();        
   cudaMemcpy(dbuf0,cbufp,npts*sizeof(float),cudaMemcpyDefault);
   cudaDeviceSynchronize();
   t1 = chrono::high_resolution_clock::now();        
   float dt1 = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
   printf("   %f us, %g B/s\n",dt,1e6*npts*sizeof(float)/dt);
   printf("   %f us, %g B/s pinned\n",dt1,1e6*npts*sizeof(float)/dt1);
   //
   // time GPU to GPU
   //
   if (ngpus > 1) {
      float* dbufs[ngpus];
      int result;
      printf("copy %d floats from GPU to GPU 0:\n",npts);
      for (int i = 1; i <= (ngpus-1); ++i) {
         cudaSetDevice(i);
         cudaMalloc(&dbufs[i],npts*sizeof(float));
         cudaDeviceCanAccessPeer(&result,i,0);
         if (result != 0)
            cudaDeviceEnablePeerAccess(0,0);
         auto t0 = chrono::high_resolution_clock::now();        
         cudaMemcpy(dbuf0,dbufs[i],npts*sizeof(float),cudaMemcpyDefault);
         cudaDeviceSynchronize();
         auto t1 = chrono::high_resolution_clock::now();        
         float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
         printf("   GPU %d: %f us, %g B/s\n",i,dt,1e6*npts*sizeof(float)/dt);
         }
      }
   //
   // time Flops
   //
   printf("add %dx%d floats:\n",npts,npts);
   cudaSetDevice(0);
   t0 = chrono::high_resolution_clock::now();        
   add_array<<<grid,block>>>(dbuf0,npts);
   cudaDeviceSynchronize();
   t1 = chrono::high_resolution_clock::now();        
   dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
   printf("   %f s, %f G/s\n",dt/1e6,npts*(npts/dt)/1000.0);
   //
   // time peer Flops
   //
   if (ngpus > 1) {
      int result;
      cudaMemset(dbuf0,0,npts*sizeof(float));
      printf("peer add %dx%d GPU 0 floats:\n",npts,npts);
      for (int i = 1; i < ngpus; ++i) {
         cudaDeviceCanAccessPeer(&result,i,0);
         if (result != 0) {
            cudaSetDevice(i);
            t0 = chrono::high_resolution_clock::now();        
            add_array<<<grid,block>>>(dbuf0,npts);
            cudaDeviceSynchronize();
            t1 = chrono::high_resolution_clock::now();        
            dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
            printf("   GPU %d: %f s, %f G/s\n",i,dt/1e6,npts*(npts/dt)/1000.0);
            }
         }
      }
   //
   // time parallel peer Flops
   //
   if (ngpus > 1) {
      int result;
      int count = 1;
      cudaMemset(dbuf0,0,npts*sizeof(float));
      printf("parallel peer add %dx%d GPU 0 floats:\n",npts,npts);
      t0 = chrono::high_resolution_clock::now();        
      cudaSetDevice(0);
      add_arrays<<<grid,block>>>(dbuf0,npts,0,ngpus);
      for (int i = 1; i < ngpus; ++i) {
         cudaDeviceCanAccessPeer(&result,i,0);
         if (result != 0) {
            ++count;
            cudaSetDevice(i);
            add_arrays<<<grid,block>>>(dbuf0,npts,i,ngpus);
            }
         }
      cudaSetDevice(0);
      cudaDeviceSynchronize();
      for (int i = 1; i < ngpus; ++i) {
         cudaDeviceCanAccessPeer(&result,i,0);
         if (result != 0) {
            cudaSetDevice(i);
            cudaDeviceSynchronize();
            }
         }
      t1 = chrono::high_resolution_clock::now();        
      dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      printf("   %d GPUS: %f s, %f G/s\n",count,dt/1e6,(count*npts/float(ngpus))*(npts/dt)/1000.0);
      }
   //
   // check for errors
   //
   cudaError err;
   err = cudaGetLastError();
   if (cudaSuccess != err)
      printf("error: %s\n",cudaGetErrorString(err));
   //
   // return
   //
   return 0;
   }