//
// bonds_stress_strain.cpp: incomplete, just for timing tests
//    input bonds, apply stress, output strain
//    (c) MIT CBA Neil Gershenfeld 8/7/20
//    reads from stdin, writes to stdout
//    bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fraction step bond
//
// includes
//
#include <iostream>
#include <string>
#include <chrono>
#include <thread>
using namespace std;
//
// variables
//
class vars {
   public:
      float *px,*py,*pz; // host particle positions
      float *fz; // host forces
      float *dpx,*dpy,*dpz; // device particle positions
      float *dvx,*dvy,*dvz; // device velocities
      float *dfx,*dfy,*dfz; // device forces
      float *s,*ds; // strain
      float d,dmax,sd; // particle pitch, max, sort
      float fraction; // fixed top and bottom particle fraction
      float step; // strain step size
      float bond; // bond break multiplier
      float stress,strain; // stress, strain accumulators
      int lpp; // links per particle
      float xmin,xmax,ymin,ymax,zmin,zmax,top,bot; // limits
      float spring,mass,diss,dt; // integration
      int nloop,grid,block; // update loop, GPU threads
      int ms = 10; // maxsort
      int np,nx,ny,nz; // number of particles, sort
      int *dsort,*dnsort; // sort
      int *links,*nlinks,*dlinks,*dnlinks; // links
      int nthreads;
      thread *threads;
   };
vars v;
//
// sort particles
//
/*
__global__ void sort_particles(vars v) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*v.np;
   int stop = ((index+1)/float(nindex))*v.np;
   for (int i = start; i < stop; ++i) {
      int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin);
         if ((ix < 0) || (ix > v.nx-1)) continue;
      int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin);
         if ((iy < 0) || (iy > v.ny-1)) continue;
      int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin);
         if ((iz < 0) || (iz > v.nz-1)) continue;
      int isort = v.nz*v.ny*ix+v.nz*iy+iz;
      v.dsort[v.ms*isort+atomicAdd(&v.dnsort[isort],1)] = i;
      }
   }
*/
//
// test for link
//
/*
__device__ bool linked(int i,int j,vars v) {
   for (int l = 0; l < v.dnlinks[i]; ++l) {
      if (j == v.dlinks[v.lpp*i+l])
         return true;
      }
   return false;
   }
*/
//
// find force
//
/*
__device__ void find_force(int i,int j,float x,float y,float z,vars v) {
   if (linked(i,j,v)) {
      float xl = v.dpx[j];
      float yl = v.dpy[j];
      float zl = v.dpz[j];
      float dx = x-xl;
      float dy = y-yl;
      float dz = z-zl;
      float d = sqrtf(dx*dx+dy*dy+dz*dz);
      float nx = dx/d;
      float ny = dy/d;
      float nz = dz/d;
      float f = -v.spring*(d-v.d)/v.d;
      float dvx = v.dvx[i]-v.dvx[j];
      float dvy = v.dvy[i]-v.dvy[j];
      float dvz = v.dvz[i]-v.dvz[j];
      f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d;
      v.dfx[i] += f*nx;
      v.dfy[i] += f*ny;
      v.dfz[i] += f*nz;
      }
   else {
      float xl = v.dpx[j];
      float yl = v.dpy[j];
      float zl = v.dpz[j];
      float dx = x-xl;
      float dy = y-yl;
      float dz = z-zl;
      float d = sqrtf(dx*dx+dy*dy+dz*dz);
      if (d < v.d) {
         float nx = dx/d;
         float ny = dy/d;
         float nz = dz/d;
         float f = -v.spring*(d-v.d)/v.d;
         float dvx = v.dvx[i]-v.dvx[j];
         float dvy = v.dvy[i]-v.dvy[j];
         float dvz = v.dvz[i]-v.dvz[j];
         f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d;
         v.dfx[i] += f*nx;
         v.dfy[i] += f*ny;
         v.dfz[i] += f*nz;
         }
      }
   }
*/
//
// calculate forces
//
/*
__global__ void calculate_forces(vars v) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*v.np;
   int stop = ((index+1)/float(nindex))*v.np;
   for (int i = start; i < stop; ++i) {
      float x = v.dpx[i];
      float y = v.dpy[i];
      float z = v.dpz[i];
      int ix = (v.nx-1)*(v.dpx[i]-v.xmin)/(v.xmax-v.xmin);
      int iy = (v.ny-1)*(v.dpy[i]-v.ymin)/(v.ymax-v.ymin);
      int iz = (v.nz-1)*(v.dpz[i]-v.zmin)/(v.zmax-v.zmin);
      //
      // loop over links to find strain
      //
      v.ds[i] = 0;
      int norm = 0;
      for (int l = 0; l < v.dnlinks[i]; ++l) {
         int j = v.dlinks[v.lpp*i+l];
         if (j == -1)
            //
            // ignore deleted link
            //
            continue;
         float dx = x-v.dpx[j];
         float dy = y-v.dpy[j];
         float dz = z-v.dpz[j];
         float d = sqrtf(dx*dx+dy*dy+dz*dz);
         //
         // accumulate
         //
         v.ds[i] += abs(d-v.d)/v.d;
         norm += 1;
         //
         // check for broken links
         //
         if (d > v.dmax)
            //
            // yes, mark for deletion
            //
            v.dlinks[v.lpp*i+l] = -1;
         }
      v.ds[i] = v.ds[i]/norm;
      //
      // loop over sort to find forces
      //
      v.dfx[i] = v.dfy[i] = v.dfz[i] = 0;
      for (int dx = -1; dx <= 1; ++dx) {
         if (((ix+dx) < 0) || ((ix+dx) > (v.nx-1))) continue;
         for (int dy = -1; dy <= 1; ++dy) {
            if (((iy+dy) < 0) || ((iy+dy) > (v.ny-1))) continue;
            for (int dz = -1; dz <= 1; ++dz) {
               if (((iz+dz) < 0) || ((iz+dz) > (v.nz-1))) continue;
               int isort = v.nz*v.ny*(ix+dx)+v.nz*(iy+dy)+(iz+dz);
               for (int s = 0; s < v.dnsort[isort]; ++s) {
                  int j = v.dsort[v.ms*isort+s];
                  if (i != j)
                     find_force(i,j,x,y,z,v);
                  }
               }
            }
         }
      }
   }
*/
//
// integrate positions
//
void integrate_thread(int index) {
   int start = (index/float(v.nthreads))*v.np;
   int stop = ((index+1)/float(v.nthreads))*v.np;
   float ax,ay,az;
   for (int i = start; i < stop; ++i) {
      if (i < v.np*v.fraction)
         v.dpz[i] -= v.step;
      else if (i > v.np*(1-v.fraction))
         v.dpz[i] += v.step;
      else {
         ax = v.dfx[i]/v.mass;
         ay = v.dfy[i]/v.mass;
         az = v.dfz[i]/v.mass;
         v.dvx[i] += v.dt*ax;
         v.dvy[i] += v.dt*ay;
         v.dvz[i] += v.dt*az;
         v.dpx[i] += v.dt*v.dvx[i];
         v.dpy[i] += v.dt*v.dvy[i];
         v.dpz[i] += v.dt*v.dvz[i];
         }
      }
   }
void integrate_positions() {
   for (int i = 0; i < v.nthreads; ++i)
      v.threads[i] = thread(integrate_thread,i);
   for (int i = 0; i < v.nthreads; ++i)
      v.threads[i].join();
   }
/*
__global__ void integrate_positions(vars v) {
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   int nindex = gridDim.x*blockDim.x;
   int start = (index/float(nindex))*v.np;
   int stop = ((index+1)/float(nindex))*v.np;
   float ax,ay,az;
   for (int i = start; i < stop; ++i) {
      if (i < v.np*v.fraction)
         v.dpz[i] -= v.step;
      else if (i > v.np*(1-v.fraction))
         v.dpz[i] += v.step;
      else {
         ax = v.dfx[i]/v.mass;
         ay = v.dfy[i]/v.mass;
         az = v.dfz[i]/v.mass;
         v.dvx[i] += v.dt*ax;
         v.dvy[i] += v.dt*ay;
         v.dvz[i] += v.dt*az;
         v.dpx[i] += v.dt*v.dvx[i];
         v.dpy[i] += v.dt*v.dvy[i];
         v.dpz[i] += v.dt*v.dvz[i];
         }
      }
   }
*/
//
// CUDA error check
//
/*
void cudaCheck(string msg) {
   cudaError err;
   err = cudaGetLastError();
   if (cudaSuccess != err)
   cerr << msg << ": " << cudaGetErrorString(err) << endl;
   }
*/
//
// main
//
int main(int argc, char** argv) {
   v.nthreads = thread::hardware_concurrency();
   v.threads = new thread[v.nthreads];
   v.np = 1e6;
   v.fraction = .01;
   v.step = .01;
   v.mass = 1;
   v.dt = 1;
   v.dfx = new float[v.np];
   v.dfy = new float[v.np];
   v.dfz = new float[v.np];
   v.dvx = new float[v.np];
   v.dvy = new float[v.np];
   v.dvz = new float[v.np];
   v.dpx = new float[v.np];
   v.dpy = new float[v.np];
   v.dpz = new float[v.np]; 
   /*
   if (argc != 11) {
      cerr << "command line: bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fraction step bond";
      return 1;
      }
   v.grid = stoi(argv[1]);
   v.block = stoi(argv[2]);
   v.spring = stof(argv[3]);
   v.mass = stof(argv[4]);
   v.diss = stof(argv[5]);
   v.dt = stof(argv[6]);
   v.nloop = stoi(argv[7]);
   v.fraction = stof(argv[8]);
   v.step = stof(argv[9]);
   v.bond = stof(argv[10]);
   */
   //
   // read and parse inputs
   //
   /*
   string s;
   while (1) {
      getline(cin,s);
      if (s.compare("xmin:") == 0) {
         getline(cin,s);
         v.xmin = stof(s);
         }
      else if (s.compare("xmax:") == 0) {
         getline(cin,s);
         v.xmax = stof(s);
         }
      else if (s.compare("ymin:") == 0) {
         getline(cin,s);
         v.ymin = stof(s);
         }
      else if (s.compare("ymax:") == 0) {
         getline(cin,s);
         v.ymax = stof(s);
         }
      else if (s.compare("zmin:") == 0) {
         getline(cin,s);
         v.zmin = stof(s);
         }
      else if (s.compare("zmax:") == 0) {
         getline(cin,s);
         v.zmax = stof(s);
         }
      else if (s.compare("pitch:") == 0) {
         getline(cin,s);
         v.d = stof(s);
         v.dmax = v.bond*v.d; // bond breaking
         v.sd = 1.1*v.d; // sort bins bigger to avoid roundoff at edges
         float xmid = (v.xmax+v.xmin)/2.0;
         float ymid = (v.ymax+v.ymin)/2.0;
         float zmid = (v.zmax+v.zmin)/2.0;
         v.xmin = xmid+1.25*(v.xmin-xmid); // enlarge sort volume
         v.xmax = xmid+1.25*(v.xmax-xmid);
         v.ymin = ymid+1.25*(v.ymin-ymid);
         v.ymax = ymid+1.25*(v.ymax-ymid);
         v.zmin = zmid+1.25*(v.zmin-zmid);
         v.zmax = zmid+1.25*(v.zmax-zmid);
         v.nx = (v.xmax-v.xmin)/v.sd;
         v.ny = (v.ymax-v.ymin)/v.sd;
         v.nz = (v.zmax-v.zmin)/v.sd;
         //cudaMalloc(&v.dsort,v.nz*v.ny*v.nx*v.ms*sizeof(int));
         //cudaMalloc(&v.dnsort,v.nz*v.ny*v.nx*sizeof(int));
         //cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
         //cudaCheck("malloc pitch");
         }
      else if (s.compare("links per particle:") == 0) {
         getline(cin,s);
         v.lpp = stoi(s);
         }
      else if (s.compare("number:") == 0) {
         getline(cin,s);
         v.np = stoi(s); // number of particles
         cudaMallocHost(&v.px,v.np*sizeof(float)); // CPU arrays
         cudaMallocHost(&v.py,v.np*sizeof(float));
         cudaMallocHost(&v.pz,v.np*sizeof(float));
         cudaMallocHost(&v.fz,v.np*sizeof(float));
         cudaMallocHost(&v.s,v.np*sizeof(float));
         v.links = new int[v.lpp*v.np];
         v.nlinks = new int[v.np];
         cudaMalloc(&v.dpx,v.np*sizeof(float)); // GPU arrays
         cudaMalloc(&v.dpy,v.np*sizeof(float));
         cudaMalloc(&v.dpz,v.np*sizeof(float));
         cudaMalloc(&v.dvx,v.np*sizeof(float));
         cudaMalloc(&v.dvy,v.np*sizeof(float));
         cudaMalloc(&v.dvz,v.np*sizeof(float));
         cudaMalloc(&v.dfx,v.np*sizeof(float));
         cudaMalloc(&v.dfy,v.np*sizeof(float));
         cudaMalloc(&v.dfz,v.np*sizeof(float));
         cudaMalloc(&v.ds,v.np*sizeof(float));
         cudaMalloc(&v.dlinks,v.lpp*v.np*sizeof(int));
         cudaMalloc(&v.dnlinks,v.np*sizeof(int));
         cudaMemset(v.dnlinks,0,v.np*sizeof(int));
         cudaCheck("malloc number");
         }
      else if (s.compare("particles:") == 0) {
         for (int i = 0; i < v.np ; ++i) {
            cin.read((char *)&v.px[i],4);
            cin.read((char *)&v.py[i],4);
            cin.read((char *)&v.pz[i],4);
            }
         }
      else if (s.compare("nlinks:") == 0) {
         for (int i = 0; i < v.np ; ++i)
            cin.read((char *)&v.nlinks[i],4);
         }
      else if (s.compare("links:") == 0) {
         for (int i = 0; i < v.lpp*v.np ; ++i)
            cin.read((char *)&v.links[i],4);
         }
      else if (s.compare("end:") == 0)
         break;
      }
   cerr << "bonds_stress_strain:" << endl;
   cerr << "   input " << v.np << " particles" << endl;
   */
   //
   // copy data to GPU
   //
   /*
   cudaMemcpy(v.dpx,v.px,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dpy,v.py,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dpz,v.pz,v.np*sizeof(float),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dnlinks,v.nlinks,v.np*sizeof(int),
      cudaMemcpyHostToDevice);
   cudaMemcpy(v.dlinks,v.links,v.lpp*v.np*sizeof(int),
      cudaMemcpyHostToDevice);
   cudaCheck("copy to GPU");
   */
   //
   // start output
   //
   /*
   cout << "type:" << endl;
   cout << "bonds" << endl;
   cout << "format:" << endl;
   cout << "xyz float32" << endl;
   cout << "pitch:" << endl;
   cout << v.d << endl;
   cout << "links per particle:" << endl;
   cout << v.lpp << endl;
   cout << "number:" << endl;
   cout << v.np << endl;
   cout << "fraction:" << endl;
   cout << v.fraction << endl;
   cout << "xmin:" << endl;
   cout << v.xmin << endl;
   cout << "xmax:" << endl;
   cout << v.xmax << endl;
   cout << "ymin:" << endl;
   cout << v.ymin << endl;
   cout << "ymax:" << endl;
   cout << v.ymax << endl;
   cout << "zmin:" << endl;
   cout << v.zmin << endl;
   cout << "zmax:" << endl;
   cout << v.zmax << endl;
   cout << "particles:" << endl;
   for (int i = 0; i < v.np; ++i) {
      cout.write((char*)&v.px[i],4);
      cout.write((char*)&v.py[i],4);
      cout.write((char*)&v.pz[i],4);
      }
   cout << endl << "continue:" << endl;
   */
   //
   // start main loop
   //
   while (1) {
      //
      // update loop
      //
      //cudaDeviceSynchronize();
      //auto t0 = chrono::high_resolution_clock::now();        
      //cout << "outer loop" << endl;
      for (int i = 0; i < v.nloop; ++i) {
         //cout << "inner loop" << endl;
         //
         // sort
         //
         //cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
         //sort_particles<<<v.grid,v.block>>>(v);
         //cudaCheck("sort");
         //
         // forces
         //
         //calculate_forces<<<v.grid,v.block>>>(v);
         //cudaCheck("forces");
         //
         // integrate
         //
         //integrate_positions<<<v.grid,v.block>>>(v);
         //cudaCheck("integrate");
         }
      /*
      cudaDeviceSynchronize();
      auto t1 = chrono::high_resolution_clock::now();        
      float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      fprintf(stderr,"\r   mps: %.0f",(v.nloop*v.np)/dt);
      */
      //
      // measure timing
      //
      /*
      t0 = chrono::high_resolution_clock::now();        
      //cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
      //sort_particles<<<v.grid,v.block>>>(v);
      //cudaCheck("time sort");
      //cudaDeviceSynchronize();
      t1 = chrono::high_resolution_clock::now();        
      dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      cerr << " srt: " << dt;
      */
      //
      /*
      auto t0 = chrono::high_resolution_clock::now();        
      //calculate_forces<<<v.grid,v.block>>>(v);
      //cudaCheck("time forces");
      //cudaDeviceSynchronize();
      auto t1 = chrono::high_resolution_clock::now();        
      float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      cerr << " frc: " << dt;
      */
      //
      auto t0 = chrono::high_resolution_clock::now();        
      integrate_positions();
      //integrate_positions<<<v.grid,v.block>>>(v);
      //cudaCheck("time integrate");
      //cudaDeviceSynchronize();
      auto t1 = chrono::high_resolution_clock::now();        
      float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      cerr << " int: " << dt << "     \r" << flush;
      //
      // copy data to CPU
      //
      /*
      t0 = chrono::high_resolution_clock::now();        
      cudaMemcpy(v.px,v.dpx,v.np*sizeof(int),cudaMemcpyDeviceToHost);
      cudaMemcpy(v.py,v.dpy,v.np*sizeof(int),cudaMemcpyDeviceToHost);
      cudaMemcpy(v.pz,v.dpz,v.np*sizeof(int),cudaMemcpyDeviceToHost);
      cudaMemcpy(v.fz,v.dfz,v.np*sizeof(int),cudaMemcpyDeviceToHost);
      cudaMemcpy(v.s,v.ds,v.np*sizeof(int),cudaMemcpyDeviceToHost);
      cudaDeviceSynchronize();
      cudaCheck("copy to CPU");
      t1 = chrono::high_resolution_clock::now();        
      dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      cerr << " cpy: " << dt;
      */
      //
      // stress
      //
      /*
      v.stress = 0;
      for (int i = 0; i < v.np; ++i) {
         if (i < v.np*v.fraction)
            v.stress -= v.fz[i];
         else if (i > v.np*(1-v.fraction))
            v.stress += v.fz[i];
         }
      //cerr << endl << v.stress << endl;
      //
      //
      // output
      //
      t0 = chrono::high_resolution_clock::now();        
      cout << "particles:" << endl;
      for (int i = 0; i < v.np; ++i) {
         cout.write((char*)&v.px[i],4);
         cout.write((char*)&v.py[i],4);
         cout.write((char*)&v.pz[i],4);
         }
      cout << endl << "strain:" << endl;
      for (int i = 0; i < v.np; ++i)
         cout.write((char*)&v.s[i],4);
      cout << endl << "continue:" << endl;
      t1 = chrono::high_resolution_clock::now();        
      dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
      fprintf(stderr," dpy: %.3g ",dt);
      //
      cerr << "(us)" << flush;
      */
      }
   }