//
// particles_bonds.cu
//    input particles, output bonds
//    (c) MIT CBA Neil Gershenfeld 6/17/20
//
// includes
//
#include <iostream>
#include <string>
#include <cstring>
using namespace std;
//
// variables
//
class vars {
   public:
      float *px,*py,*pz; // CPU particles
      float *dpx,*dpy,*dpz; // GPU particles
      float d,ds; // particle pitch, sort pitch
      float xmin,xmax,ymin,ymax,zmin,zmax; // limits
      int grid,block; // GPU
      int lpp; // links per particle
      int ms = 10; // maxsort
      int np,nx,ny,nz; // number of particles, sort
      int *dsort,*dnsort; // sort
      int *links,*nlinks,*dlinks,*dnlinks; // links
   };
vars v;
//
// sort particles
//
__global__ void sort_particles(vars v) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*v.np;
   int stop = ((index+1)/float(nindex))*v.np;
   for (int i = start; i < stop; ++i) {
      int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin);
      int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin);
      int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin);
      int isort = v.nz*v.ny*ix+v.nz*iy+iz;
      v.dsort[v.ms*isort
         +atomicAdd(&v.dnsort[isort],1)]
         = i;
      }
   }
//
// link particles
//
__global__ void link_particles(vars v) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*v.np;
   int stop = ((index+1)/float(nindex))*v.np;
   for (int i = start; i < stop; ++i) {
      int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin);
      int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin);
      int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin);
      for (int dx = -1; dx <= 1; ++dx) {
         if (((ix+dx) < 0) || ((ix+dx) > (v.nx-1))) continue;
         for (int dy = -1; dy <= 1; ++dy) {
            if (((iy+dy) < 0) || ((iy+dy) > (v.ny-1))) continue;
            for (int dz = -1; dz <= 1; ++dz) {
               if (((iz+dz) < 0) || ((iz+dz) > (v.nz-1))) continue;
               int isort = v.nz*v.ny*(ix+dx)+v.nz*(iy+dy)+(iz+dz);
               for (int s = 0; s < v.dnsort[isort]; ++s) {
                  int j = v.dsort[v.ms*isort+s];
                  if (i != j) {
                     float d = sqrtf(
                         pow((v.dpx[i]-v.dpx[j]),2)
                        +pow((v.dpy[i]-v.dpy[j]),2)
                        +pow((v.dpz[i]-v.dpz[j]),2));
                     if (d < 1.1*v.d) {
                        v.dlinks[v.lpp*i+v.dnlinks[i]] = j;
                        v.dnlinks[i] += 1;
                        }
                     }
                  }
               }
            }
         }
      }
   }
//
// CUDA error check
//
void cudaCheck(string msg) {
   cudaError err;
   err = cudaGetLastError();
   if (cudaSuccess != err)
   cerr << msg << ": " << cudaGetErrorString(err) << endl;
   }
//
// main
//
int main(int argc, char** argv) {
   if (argc == 1) {
      cerr << "command line: particles source | particles_bonds [arguments] | bonds sink" << endl;
      cerr << "   grid=(grid size)" << endl;
      cerr << "   block=(block size)" << endl;
      return 1;
      }
   for (int i = 1; i < argc; ++i) {
      if (strstr(argv[i],"grid=") != nullptr)
         v.grid= stoi(argv[i]+5);
      else if (strstr(argv[i],"block=") != nullptr)
         v.block = stoi(argv[i]+6);
      else {
         cerr << "particles_bonds argument not recognized" << endl;
         return 1;
         }
      }
   //
   // read and parse inputs
   //
   string s;
   while (1) {
      getline(cin,s);
      if (s.compare("xmin:") == 0) {
         getline(cin,s);
         v.xmin = stof(s);
         }
      else if (s.compare("xmax:") == 0) {
         getline(cin,s);
         v.xmax = stof(s);
         }
      else if (s.compare("ymin:") == 0) {
         getline(cin,s);
         v.ymin = stof(s);
         }
      else if (s.compare("ymax:") == 0) {
         getline(cin,s);
         v.ymax = stof(s);
         }
      else if (s.compare("zmin:") == 0) {
         getline(cin,s);
         v.zmin = stof(s);
         }
      else if (s.compare("zmax:") == 0) {
         getline(cin,s);
         v.zmax = stof(s);
         }
      else if (s.compare("links per particle:") == 0) {
         getline(cin,s);
         v.lpp = stof(s);
         }
      else if (s.compare("number:") == 0) {
         getline(cin,s);
         v.np = stoi(s); // number of particles
         v.px = new float[v.np]; // CPU
         v.py = new float[v.np];
         v.pz = new float[v.np];
         v.ms = 10; // max particles per sort bin
         cudaMalloc(&v.dpx,v.np*sizeof(float)); // GPU
         cudaMalloc(&v.dpy,v.np*sizeof(float));
         cudaMalloc(&v.dpz,v.np*sizeof(float));
         v.links = new int[v.lpp*v.np];
         v.nlinks = new int[v.np];
         cudaMalloc(&v.dlinks,v.lpp*v.np*sizeof(int)); // links
         cudaMalloc(&v.dnlinks,v.np*sizeof(int));
         cudaMemset(v.dnlinks,0,v.np*sizeof(int));
         }
      else if (s.compare("pitch:") == 0) {
         getline(cin,s);
         v.d = stof(s);
         v.ds = 1.1*v.d; // bigger to avoid roundoff at edges
         v.nx = (v.xmax-v.xmin)/v.ds;
         v.ny = (v.ymax-v.ymin)/v.ds;
         v.nz = (v.zmax-v.zmin)/v.ds;
         cudaMalloc(&v.dsort,v.nz*v.ny*v.nx*v.ms*sizeof(int));
         cudaMalloc(&v.dnsort,v.nz*v.ny*v.nx*sizeof(int));
         cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
         }
      else if (s.compare("particles:") == 0) {
         for (int i = 0; i < v.np ; ++i) {
            cin.read((char *)&v.px[i],4);
            cin.read((char *)&v.py[i],4);
            cin.read((char *)&v.pz[i],4);
            }
         }
      else if (s.compare("end:") == 0)
         break;
      }
   cerr << "particles_bonds:" << endl;
   cerr << "   input " << v.np << " particles" << endl;
   //
   // copy to GPU
   //
   cudaMemcpy(v.dpx,v.px,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dpy,v.py,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dpz,v.pz,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   //
   // sort
   //
   sort_particles<<<v.grid,v.block>>>(v);
   cudaCheck("sort");
   //
   // link
   //
   link_particles<<<v.grid,v.block>>>(v);
   cudaCheck("link");
   //
   // synchronize
   //
   cudaDeviceSynchronize();
   //
   // copy to CPU
   //
   cudaMemcpy(v.nlinks,v.dnlinks,v.np*sizeof(int),
      cudaMemcpyDeviceToHost);
   cudaMemcpy(v.links,v.dlinks,v.lpp*v.np*sizeof(int),
      cudaMemcpyDeviceToHost);
   //
   // write bonds
   //
   cout << "type:" << endl;
   cout << "bonds" << endl;
   cout << "format:" << endl;
   cout << "xyz float32" << endl;
   cout << "xmin:" << endl;
   cout << v.xmin << endl;
   cout << "xmax:" << endl;
   cout << v.xmax << endl;
   cout << "ymin:" << endl;
   cout << v.ymin << endl;
   cout << "ymax:" << endl;
   cout << v.ymax << endl;
   cout << "zmin:" << endl;
   cout << v.zmin << endl;
   cout << "zmax:" << endl;
   cout << v.zmax << endl;
   cout << "pitch:" << endl;
   cout << v.d << endl;
   cout << "links per particle:" << endl;
   cout << v.lpp << endl;
   cout << "number:" << endl;
   cout << v.np << endl;
   cout << "particles:" << endl;
   for (int i = 0; i < v.np; ++i) {
      cout.write((char*)&v.px[i],4);
      cout.write((char*)&v.py[i],4);
      cout.write((char*)&v.pz[i],4);
      }
   cout << endl << "nlinks:" << endl;
   for (int i = 0; i < v.np; ++i)
      cout.write((char*)&v.nlinks[i],4);
   cout << endl << "links:" << endl;
   for (int i = 0; i < v.lpp*v.np; ++i)
      cout.write((char*)&v.links[i],4);
   cout << endl << "end:" << endl;
   }