Skip to content
Snippets Groups Projects
bonds_stress_strain1.cu 15 KiB
Newer Older
  • Learn to ignore specific revisions
  • Amira Abdel-Rahman's avatar
    Amira Abdel-Rahman committed
    //
    // bonds_stress_strain.cu
    //    input bonds, apply stress, output strain
    //    (c) MIT CBA Neil Gershenfeld 6/30/20
    //    reads from stdin, writes to stdout
    //    bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fraction step bond
    //
    // includes
    //
    #include <iostream>
    #include <string>
    #include <chrono>
    using namespace std;
    //
    // variables
    //
    class vars {
       public:
          float3 *hp,*hf; // host position, force
          float3 *dp,*dv,*df; // device position, velocity, force
          float *s,*ds; // strain
          float d,dmax,sd; // particle pitch, max, sort
          float fractionTop; // fixed top particle fraction
          float fractionBottom; // fixed bottom particle fraction
          float step; // strain step size
          float bond; // bond break multiplier
          float stress,strain; // stress, strain accumulators
          int lpp; // links per particle
          float xmin,xmax,ymin,ymax,zmin,zmax,top,bot; // limits
          float spring,mass,diss,dt; // integration
          int nloop,grid,block; // update loop, GPU threads
          int ms = 10; // maxsort
          int np,nx,ny,nz; // number of particles, sort
          int *dsort,*dnsort; // sort
          int *links,*nlinks,*dlinks,*dnlinks; // links
       };
    vars v;
    //
    // sort particles
    //
    __global__ void sort_particles(vars v) {
       int index = blockIdx.x*blockDim.x+threadIdx.x;
       int nindex = gridDim.x*blockDim.x;
       int start = (index/float(nindex))*v.np;
       int stop = ((index+1)/float(nindex))*v.np;
       for (int i = start; i < stop; ++i) {
          int ix = (v.nx-1)*(v.dp[i].x-v.xmin)/(v.xmax-v.xmin);
             if ((ix < 0) || (ix > v.nx-1)) continue;
          int iy = (v.ny-1)*(v.dp[i].y-v.ymin)/(v.ymax-v.ymin);
             if ((iy < 0) || (iy > v.ny-1)) continue;
          int iz = (v.nz-1)*(v.dp[i].z-v.zmin)/(v.zmax-v.zmin);
             if ((iz < 0) || (iz > v.nz-1)) continue;
          int isort = v.nz*v.ny*ix+v.nz*iy+iz;
          v.dsort[v.ms*isort+atomicAdd(&v.dnsort[isort],1)] = i;
          }
       }
    //
    // test for link
    //
    __device__ bool linked(int i,int j,vars v) {
       for (int l = 0; l < v.dnlinks[i]; ++l) {
          if (j == v.dlinks[v.lpp*i+l])
             return true;
          }
       return false;
       }
    //
    // find force
    //
    __device__ void find_force(int i,int j,vars v) {
       float dx = v.dp[i].x-v.dp[j].x;
       float dy = v.dp[i].y-v.dp[j].y;
       float dz = v.dp[i].z-v.dp[j].z;
       if (linked(i,j,v)) {
          float d = sqrt(dx*dx+dy*dy+dz*dz);
          float nx = dx/d;
          float ny = dy/d;
          float nz = dz/d;
          float f = -v.spring*(d-v.d)/v.d;
          float dvx = v.dv[i].x-v.dv[j].x;
          float dvy = v.dv[i].y-v.dv[j].y;
          float dvz = v.dv[i].z-v.dv[j].z;
          f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d;
          atomicAdd(&(v.df[i].x),f*nx);
          atomicAdd(&(v.df[i].y),f*ny);
          atomicAdd(&(v.df[i].z),f*nz);
          atomicAdd(&(v.df[j].x),-f*nx);
          atomicAdd(&(v.df[j].y),-f*ny);
          atomicAdd(&(v.df[j].z),-f*nz);
          }
       else {
          float d = sqrt(dx*dx+dy*dy+dz*dz);
          if (d < v.d) {
             float nx = dx/d;
             float ny = dy/d;
             float nz = dz/d;
             float f = -v.spring*(d-v.d)/v.d;
             float dvx = v.dv[i].x-v.dv[j].x;
             float dvy = v.dv[i].y-v.dv[j].y;
             float dvz = v.dv[i].z-v.dv[j].z;
             f -= v.diss*(dvx*dx+dvy*dy+dvz*dz)/d;
             atomicAdd(&(v.df[i].x),f*nx);
             atomicAdd(&(v.df[i].y),f*ny);
             atomicAdd(&(v.df[i].z),f*nz);
             atomicAdd(&(v.df[j].x),-f*nx);
             atomicAdd(&(v.df[j].y),-f*ny);
             atomicAdd(&(v.df[j].z),-f*nz);
             }
          }
       }
    //
    // calculate forces
    //
    __global__ void calculate_forces(vars v) {
       int index = blockIdx.x*blockDim.x+threadIdx.x;
       int nindex = gridDim.x*blockDim.x;
       int start = (index/float(nindex))*v.np;
       int stop = ((index+1)/float(nindex))*v.np;
       for (int i = start; i < stop; ++i) {
          float x = v.dp[i].x;
          float y = v.dp[i].y;
          float z = v.dp[i].z;
          int ix = (v.nx-1)*(v.dp[i].x-v.xmin)/(v.xmax-v.xmin);
          int iy = (v.ny-1)*(v.dp[i].y-v.ymin)/(v.ymax-v.ymin);
          int iz = (v.nz-1)*(v.dp[i].z-v.zmin)/(v.zmax-v.zmin);
          //
          // loop over links to find strain
          //
          v.ds[i] = 0;
          int norm = 0;
          for (int l = 0; l < v.dnlinks[i]; ++l) {
             int j = v.dlinks[v.lpp*i+l];
             if (j == -1)
                //
                // ignore deleted link
                //
                continue;
             float dx = x-v.dp[j].x;
             float dy = y-v.dp[j].y;
             float dz = z-v.dp[j].z;
             float d = sqrtf(dx*dx+dy*dy+dz*dz);
             //
             // accumulate
             //
             v.ds[i] += abs(d-v.d)/v.d;
             norm += 1;
             //
             // check for broken links
             //
             if (d > v.dmax)
                //
                // yes, mark for deletion
                //
                v.dlinks[v.lpp*i+l] = -1;
             }
          v.ds[i] = v.ds[i]/norm;
          //
          // loop over sort to find forces
          //
          for (int dx = -1; dx <= 1; ++dx) {
             if (((ix+dx) < 0) || ((ix+dx) > (v.nx-1))) continue;
             for (int dy = -1; dy <= 1; ++dy) {
                if (((iy+dy) < 0) || ((iy+dy) > (v.ny-1))) continue;
                for (int dz = -1; dz <= 1; ++dz) {
                   if (((iz+dz) < 0) || ((iz+dz) > (v.nz-1))) continue;
                   int isort = v.nz*v.ny*(ix+dx)+v.nz*(iy+dy)+(iz+dz);
                   for (int s = 0; s < v.dnsort[isort]; ++s) {
                      int j = v.dsort[v.ms*isort+s];
                      if (i < j)
                         find_force(i,j,v);
                      }
                   }
                }
             }
          }
       }
    //
    // integrate positions
    //
    __global__ void integrate_positions(vars v) {
       int index = blockIdx.x*blockDim.x+threadIdx.x;
       int nindex = gridDim.x*blockDim.x;
       int start = (index/float(nindex))*v.np;
       int stop = ((index+1)/float(nindex))*v.np;
       float ax,ay,az;
       for (int i = start; i < stop; ++i) {
          if (i < v.np*v.fractionTop)
             v.dp[i].z -= v.step;
          else if (i > v.np*(1-v.fractionBottom))
             v.dp[i].z += v.step;
          else {
             ax = v.df[i].x/v.mass;
             ay = v.df[i].y/v.mass;
             az = v.df[i].z/v.mass;
             v.dv[i].x += v.dt*ax;
             v.dv[i].y += v.dt*ay;
             v.dv[i].z += v.dt*az;
             v.dp[i].x += v.dt*v.dv[i].x;
             v.dp[i].y += v.dt*v.dv[i].y;
             v.dp[i].z += v.dt*v.dv[i].z;
             }
          }
       }
    //
    // CUDA error check
    //
    void cudaCheck(string msg) {
       cudaError err;
       err = cudaGetLastError();
       if (cudaSuccess != err)
       cerr << msg << ": " << cudaGetErrorString(err) << endl;
       }
    //
    // main
    //
    int main(int argc, char** argv) {
    
    Amira Abdel-Rahman (admin)'s avatar
    Amira Abdel-Rahman (admin) committed
       if (argc != 12) {
    
    Amira Abdel-Rahman's avatar
    Amira Abdel-Rahman committed
          cerr << "command line: bonds_source | bonds_stress_strain grid block spring mass dissipation dt nloop fractionTop fractionBottom step bond";
          return 1;
          }
       v.grid = stoi(argv[1]);
       v.block = stoi(argv[2]);
       v.spring = stof(argv[3]);
       v.mass = stof(argv[4]);
       v.diss = stof(argv[5]);
       v.dt = stof(argv[6]);
       v.nloop = stoi(argv[7]);
       v.fractionTop = stof(argv[8]);
       v.fractionBottom = stof(argv[9]);
       v.step = stof(argv[10]);
       v.bond = stof(argv[11]);
       //
       // read and parse inputs
       //
       string s;
       while (1) {
          getline(cin,s);
          if (s.compare("xmin:") == 0) {
             getline(cin,s);
             v.xmin = stof(s);
             }
          else if (s.compare("xmax:") == 0) {
             getline(cin,s);
             v.xmax = stof(s);
             }
          else if (s.compare("ymin:") == 0) {
             getline(cin,s);
             v.ymin = stof(s);
             }
          else if (s.compare("ymax:") == 0) {
             getline(cin,s);
             v.ymax = stof(s);
             }
          else if (s.compare("zmin:") == 0) {
             getline(cin,s);
             v.zmin = stof(s);
             }
          else if (s.compare("zmax:") == 0) {
             getline(cin,s);
             v.zmax = stof(s);
             }
          else if (s.compare("pitch:") == 0) {
             getline(cin,s);
             v.d = stof(s);
             v.dmax = v.bond*v.d; // bond breaking
             v.sd = 1.01*v.d; // sort bins bigger to avoid roundoff at edges
             float xmid = (v.xmax+v.xmin)/2.0;
             float ymid = (v.ymax+v.ymin)/2.0;
             float zmid = (v.zmax+v.zmin)/2.0;
             v.xmin = xmid+1.25*(v.xmin-xmid); // enlarge sort volume
             v.xmax = xmid+1.25*(v.xmax-xmid);
             v.ymin = ymid+1.25*(v.ymin-ymid);
             v.ymax = ymid+1.25*(v.ymax-ymid);
             v.zmin = zmid+1.25*(v.zmin-zmid);
             v.zmax = zmid+1.25*(v.zmax-zmid);
             v.nx = (v.xmax-v.xmin)/v.sd;
             v.ny = (v.ymax-v.ymin)/v.sd;
             v.nz = (v.zmax-v.zmin)/v.sd;
             cudaMalloc(&v.dsort,v.nz*v.ny*v.nx*v.ms*sizeof(int));
             cudaMalloc(&v.dnsort,v.nz*v.ny*v.nx*sizeof(int));
             cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
             cudaCheck("malloc pitch");
             }
          else if (s.compare("links per particle:") == 0) {
             getline(cin,s);
             v.lpp = stoi(s);
             }
          else if (s.compare("number:") == 0) {
             getline(cin,s);
             v.np = stoi(s); // number of particles
             cudaMallocHost(&v.hp,v.np*sizeof(float3)); // host arrays
             cudaMallocHost(&v.hf,v.np*sizeof(float3));
             cudaMallocHost(&v.s,v.np*sizeof(float));
             v.links = new int[v.lpp*v.np];
             v.nlinks = new int[v.np];
             cudaMalloc(&v.dp,v.np*sizeof(float3)); // device arrays
             cudaMalloc(&v.dv,v.np*sizeof(float3));
             cudaMalloc(&v.df,v.np*sizeof(float3));
             cudaMalloc(&v.ds,v.np*sizeof(float));
             cudaMalloc(&v.dlinks,v.lpp*v.np*sizeof(int));
             cudaMalloc(&v.dnlinks,v.np*sizeof(int));
             cudaMemset(v.dnlinks,0,v.np*sizeof(int));
             cudaCheck("malloc number");
             }
          else if (s.compare("particles:") == 0) {
             for (int i = 0; i < v.np ; ++i) {
                cin.read((char *)&v.hp[i].x,4);
                cin.read((char *)&v.hp[i].y,4);
                cin.read((char *)&v.hp[i].z,4);
                }
             }
          else if (s.compare("nlinks:") == 0) {
             for (int i = 0; i < v.np ; ++i)
                cin.read((char *)&v.nlinks[i],4);
             }
          else if (s.compare("links:") == 0) {
             for (int i = 0; i < v.lpp*v.np ; ++i)
                cin.read((char *)&v.links[i],4);
             }
          else if (s.compare("end:") == 0)
             break;
          }
       cerr << "bonds_stress_strain:" << endl;
       cerr << "   input " << v.np << " particles" << endl;
       //
       // copy data to GPU
       //
       cudaMemcpy(v.dp,v.hp,v.np*sizeof(float3),
          cudaMemcpyHostToDevice);
       cudaMemcpy(v.dnlinks,v.nlinks,v.np*sizeof(int),
          cudaMemcpyHostToDevice);
       cudaMemcpy(v.dlinks,v.links,v.lpp*v.np*sizeof(int),
          cudaMemcpyHostToDevice);
       cudaCheck("copy to GPU");
       //
       // start output
       //
       cout << "type:" << endl;
       cout << "bonds" << endl;
       cout << "format:" << endl;
       cout << "xyz float32" << endl;
       cout << "pitch:" << endl;
       cout << v.d << endl;
       cout << "links per particle:" << endl;
       cout << v.lpp << endl;
       cout << "number:" << endl;
       cout << v.np << endl;
       cout << "fractionTop:" << endl;
       cout << v.fractionTop << endl;
       cout << "fractionBottom:" << endl;
       cout << v.fractionBottom << endl;
       cout << "xmin:" << endl;
       cout << v.xmin << endl;
       cout << "xmax:" << endl;
       cout << v.xmax << endl;
       cout << "ymin:" << endl;
       cout << v.ymin << endl;
       cout << "ymax:" << endl;
       cout << v.ymax << endl;
       cout << "zmin:" << endl;
       cout << v.zmin << endl;
       cout << "zmax:" << endl;
       cout << v.zmax << endl;
       cout << "particles:" << endl;
       for (int i = 0; i < v.np; ++i) {
          cout.write((char*)&v.hp[i].y,4);
          cout.write((char*)&v.hp[i].y,4);
          cout.write((char*)&v.hp[i].y,4);
          }
       cout << endl << "continue:" << endl;
       //
       // start main loop
       //
       while (1) {
          //
          // update loop
          //
          cudaDeviceSynchronize();
          auto t0 = chrono::high_resolution_clock::now();        
          cout << "outer loop" << endl;
          for (int i = 0; i < v.nloop; ++i) {
             cout << "inner loop" << endl;
             //
             // sort
             //
             cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
             sort_particles<<<v.grid,v.block>>>(v);
             cudaCheck("sort");
             //
             // forces
             //
             cudaMemset(v.df,0,v.np*sizeof(float3));
             calculate_forces<<<v.grid,v.block>>>(v);
             cudaCheck("forces");
             //
             // integrate
             //
             integrate_positions<<<v.grid,v.block>>>(v);
             cudaCheck("integrate");
             }
          cudaDeviceSynchronize();
          auto t1 = chrono::high_resolution_clock::now();        
          float dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          fprintf(stderr,"\r   mps: %.0f",(v.nloop*v.np)/dt);
          //
          // measure timing
          //
          t0 = chrono::high_resolution_clock::now();        
          cudaMemset(v.dnsort,0,v.nz*v.ny*v.nx*sizeof(int));
          sort_particles<<<v.grid,v.block>>>(v);
          cudaCheck("time sort");
          cudaDeviceSynchronize();
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          cerr << " srt: " << dt;
          //
          t0 = chrono::high_resolution_clock::now();        
          cudaMemset(v.df,0,v.np*sizeof(float3));
          calculate_forces<<<v.grid,v.block>>>(v);
          cudaCheck("time forces");
          cudaDeviceSynchronize();
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          cerr << " frc: " << dt;
          //
          t0 = chrono::high_resolution_clock::now();        
          integrate_positions<<<v.grid,v.block>>>(v);
          cudaCheck("time integrate");
          cudaDeviceSynchronize();
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          cerr << " int: " << dt;
          //
          // copy data to CPU
          //
          t0 = chrono::high_resolution_clock::now();        
          cudaMemcpy(v.hp,v.dp,v.np*sizeof(float3),cudaMemcpyDeviceToHost);
          cudaMemcpy(v.hf,v.df,v.np*sizeof(float3),cudaMemcpyDeviceToHost);
          cudaMemcpy(v.s,v.ds,v.np*sizeof(int),cudaMemcpyDeviceToHost);
          cudaDeviceSynchronize();
          cudaCheck("copy to CPU");
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          cerr << " cpy: " << dt;
          //
          // stress
          //
          v.stress = 0;
          for (int i = 0; i < v.np; ++i) {
             if (i < v.np*v.fractionTop)
                v.stress -= v.hf[i].z;
             else if (i > v.np*(1-v.fractionBottom))
                v.stress += v.hf[i].z;
             }
          //cerr << endl << v.stress << endl;
          //
          // output
          //
          t0 = chrono::high_resolution_clock::now();        
          cout << "particles:" << endl;
          for (int i = 0; i < v.np; ++i) {
             cout.write((char*)&v.hp[i].x,4);
             cout.write((char*)&v.hp[i].y,4);
             cout.write((char*)&v.hp[i].z,4);
             }
          cout << endl << "strain:" << endl;
          for (int i = 0; i < v.np; ++i)
             cout.write((char*)&v.s[i],4);
          cout << endl << "continue:" << endl;
          t1 = chrono::high_resolution_clock::now();        
          dt = chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
          fprintf(stderr," out: %.3g ",dt);
          //
          cerr << "(us)" << flush;
          }
       }